commit e874999bd11fb1f23234eb4cecb977292bd8f53a Author: linyqh Date: Mon Aug 12 21:35:06 2024 +0800 first commit diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..df67e1a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,24 @@ +# Exclude common Python files and directories +venv/ +__pycache__/ +*.pyc +*.pyo +*.pyd +*.pyz +*.pyw +*.pyi +*.egg-info/ + +# Exclude development and local files +.env +.env.* +*.log +*.db + +# Exclude version control system files +.git/ +.gitignore +.svn/ + +storage/ +config.toml diff --git a/.github/workflows/codeReview.yml b/.github/workflows/codeReview.yml new file mode 100644 index 0000000..88b6c2b --- /dev/null +++ b/.github/workflows/codeReview.yml @@ -0,0 +1,24 @@ +name: Code Review + +permissions: + contents: read + pull-requests: write + +on: + # 在提合并请求的时候触发 + pull_request: + types: [opened, reopened] + workflow_dispatch: + +jobs: + codeReview: + runs-on: ubuntu-latest + steps: + - name: GPT代码逻辑检查 + uses: anc95/ChatGPT-CodeReview@main + env: + GITHUB_TOKEN: ${{ secrets.GIT_TOKEN }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + OPENAI_API_ENDPOINT: https://api.groq.com/openai/v1 + MODEL: llama-3.1-70b-versatile + LANGUAGE: Chinese diff --git a/.github/workflows/dockerImageBuild.yml.bak b/.github/workflows/dockerImageBuild.yml.bak new file mode 100644 index 0000000..3fc14bd --- /dev/null +++ b/.github/workflows/dockerImageBuild.yml.bak @@ -0,0 +1,35 @@ +name: build_docker + +on: + release: + types: [created] # 表示在创建新的 Release 时触发 + +jobs: + build_docker: + name: Build docker + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push + id: docker_build + uses: docker/build-push-action@v6 + with: + context: . + file: ./Dockerfile + push: true + platforms: linux/amd64,linux/arm64 + tags: | + ${{ secrets.DOCKERHUB_USERNAME }}/${{ GITHUB_REPOSITORY_NAME_PART }}:${{ github.ref_name }} + ${{ secrets.DOCKERHUB_USERNAME }}/${{ GITHUB_REPOSITORY_NAME_PART }}:latest diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c51c4e8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +.DS_Store +/config.toml +/storage/ +/.idea/ +/app/services/__pycache__ +/app/__pycache__/ +/app/config/__pycache__/ +/app/models/__pycache__/ +/app/utils/__pycache__/ +/*/__pycache__/* +.vscode +/**/.streamlit +__pycache__ +logs/ + +node_modules +# VuePress 默认临时文件目录 +/sites/docs/.vuepress/.temp +# VuePress 默认缓存目录 +/sites/docs/.vuepress/.cache +# VuePress 默认构建生成的静态文件目录 +/sites/docs/.vuepress/dist +# 模型目录 +/models/ +./models/* +resource/scripts/* +resource/videos/* +resource/songs/* +resource/fonts/* diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8cf53f0 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,45 @@ +# Use an official Python runtime as a parent image +FROM python:3.10-slim-bullseye + +# Set the working directory in the container +WORKDIR /NarratoAI + +# 设置/NarratoAI目录权限为777 +RUN chmod 777 /NarratoAI + +ENV PYTHONPATH="/NarratoAI" + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + git \ + imagemagick \ + ffmpeg \ + wget \ + && rm -rf /var/lib/apt/lists/* + +# Fix security policy for ImageMagick +RUN sed -i '/ +

NarratoAI 😎

+

All-in-One AI-Powered Video Narration + Automated Editing Tool🎬

+ +

📖 Simplified Chinese | English

+
+ harry0703%2FNarratoAI | Trendshift +
+
+NarratoAI is an automated video narration tool that provides an all-in-one solution for script writing, automated video editing, voice-over, and subtitle generation, powered by LLM to enhance efficient content creation. +
+ +[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/linyqh/NarratoAI) +[![GitHub license](https://img.shields.io/github/license/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/blob/main/LICENSE) +[![GitHub issues](https://img.shields.io/github/issues/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/issues) +[![GitHub stars](https://img.shields.io/github/stars/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/stargazers) +[![Discord](https://img.shields.io/discord/1134848537704804432?style=for-the-badge)](https://dsc.gg/fuji-community) + +

Home

+ +![](docs/index.png) + +

Video Review Interface

+
+![](docs/check.png)
+
+## System Requirements 📦
+
+- Recommended minimum: a CPU with 4 or more cores, 8 GB or more of RAM; a GPU is not required
+- Windows 10 or macOS 11.0 or above
+
+## Quick Start 🚀
+### Apply for a Google AI Studio Account
+1. Visit https://aistudio.google.com/app/prompts/new_chat to apply for an account.
+2. Click `Get API Key` to request an API Key.
+3. Enter the obtained API Key into the `gemini_api_key` setting in the `config.example.toml` file.
+
+### Configure Proxy VPN
+> The method of configuring the VPN is not restricted, as long as you can access Google's network. Here, `clash` is used as an example.
+1. Note the port of the clash service, usually `http://127.0.0.1:7890`.
+2. If the port is not `7890`, modify the `VPN_PROXY_URL` in the `docker-compose.yml` file to your proxy address.
+   ```yaml
+   environment:
+     - "VPN_PROXY_URL=http://host.docker.internal:7890" # Change to your proxy port; host.docker.internal represents the IP of the physical machine.
+   ```
+3. (Optional) Or modify the `proxy` settings in the `config.example.toml` file.
+   ```toml
+   [proxy]
+   ### Use a proxy to access the Pexels API
+   ### Format: "http://<username>:<password>@<proxy>:<port>"
+   ### Example: "http://user:pass@proxy:1234"
+   ### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
+
+   http = "http://xx.xx.xx.xx:7890"
+   https = "http://xx.xx.xx.xx:7890"
+   ```
+
+### Docker Deployment 🐳
+#### ① Clone the Project and Start Docker
+```shell
+git clone https://github.com/linyqh/NarratoAI.git
+cd NarratoAI
+docker-compose up
+```
+#### ② Access the Web Interface
+
+Open your browser and go to http://127.0.0.1:8501
+
+#### ③ Access the API Documentation
+
+Open your browser and go to http://127.0.0.1:8080/docs or http://127.0.0.1:8080/redoc
+
+## Usage
+#### 1. Basic Configuration: Select a Model and Enter Its API Key
+> Currently, only the `Gemini` model is supported; other models will be added in future updates. Contributions are welcome via [PR](https://github.com/linyqh/NarratoAI/pulls) to join in the development 🎉🎉🎉
+ 001 +
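As a rough sketch of what this step wires up: the key names below are the ones read by `app/config/config.py` and `app/services/llm.py` later in this commit, while the exact layout of your `config.toml` may differ.

```python
# Minimal sketch of how the step-1 settings are consumed at runtime.
# Key names come from app/config/config.py and app/services/llm.py in this
# commit; the values shown are placeholders.
import toml

app_cfg = toml.load("config.toml").get("app", {})

llm_provider = app_cfg.get("llm_provider", "openai")      # set to "gemini" for this workflow
gemini_api_key = app_cfg.get("gemini_api_key", "")        # the key obtained from Google AI Studio
gemini_model_name = app_cfg.get("gemini_model_name", "")  # the Gemini model identifier
print(llm_provider, bool(gemini_api_key), gemini_model_name)
```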
+ +#### 2. Select the Video for Narration and Click to Generate Video Script +> A demo video is included in the platform. To use your own video, place the mp4 file in the `resource/videos` directory and refresh your browser. +> Note: The filename can be anything, but it must not contain Chinese characters, special characters, spaces, backslashes, etc. +
+ 002 +
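The generated script is saved as JSON. A hypothetical example of one entry follows; the three key names (`picture`, `timestamp`, `narration`) come from the prompt in `gemini_video2json` in `app/services/llm.py`, while the timestamp format shown here is an assumption.

```python
# Hypothetical example of one generated script entry; only the key names
# (picture, timestamp, narration) are documented in this commit.
import json

script = [
    {
        "picture": "The lead walks into an empty room and looks around",  # scene description
        "timestamp": "00:12-00:18",  # assumed time-range format
        "narration": "He has no idea that the real test is only beginning",
    }
]
print(json.dumps(script, ensure_ascii=False, indent=2))
```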
+ +#### 3. Save the Script and Start Editing +> After saving the script, refresh the browser, and the newly generated `.json` script file will appear in the script file dropdown. Select the json file and video to start editing. +
+ 003 +
+ +#### 4. Review the Video; if there are segments that don't meet the rules, click to regenerate or manually edit them. +
+ 003 +
+ +#### 5. Configure Basic Video Parameters +
+ 003 +
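Under the hood, the step-5 form roughly corresponds to `VideoClipParams` in `app/models/schema.py` (included later in this commit). A sketch using a few of its fields; the two file paths are hypothetical examples.

```python
# Sketch only: field names and defaults come from VideoClipParams in
# app/models/schema.py; the two paths are hypothetical examples.
from app.models.schema import VideoClipParams

params = VideoClipParams(
    video_clip_json="resource/scripts/demo.json",  # hypothetical script saved in step 3
    video_origin_path="resource/videos/demo.mp4",  # hypothetical source video
    voice_name="zh-CN-YunjianNeural",              # default narration voice
    voice_rate=1.0,
    subtitle_enabled=True,
    font_name="STHeitiMedium.ttc",
)
print(params.model_dump())
```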
+ +#### 6. Start Generating +
+ 003 +
+ +#### 7. Video Generation Complete +
+ 003 +
+
+## Development 💻
+1. Install Dependencies
+```shell
+conda create -n narratoai python=3.10
+conda activate narratoai
+cd narratoai
+pip install -r requirements.txt
+```
+2. Install ImageMagick
+###### Windows:
+
+- Download https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-36-Q16-x64-static.exe
+- Install the downloaded ImageMagick, ensuring you do not change the installation path
+- Update `imagemagick_path` in the `config.toml` file to your actual installation path (typically `C:\Program Files\ImageMagick-7.1.1-Q16\magick.exe`)
+
+###### macOS:
+
+```shell
+brew install imagemagick
+```
+
+###### Ubuntu
+
+```shell
+sudo apt-get install imagemagick
+```
+
+###### CentOS
+
+```shell
+sudo yum install ImageMagick
+```
+
+3. Start the WebUI
+```shell
+streamlit run ./webui/Main.py --browser.serverAddress=127.0.0.1 --server.enableCORS=True --browser.gatherUsageStats=False
+```
+4. Access http://127.0.0.1:8501
+
+## Feedback & Suggestions 📢
+
+### 👏👏👏 You can submit [issues](https://github.com/linyqh/NarratoAI/issues) or [pull requests](https://github.com/linyqh/NarratoAI/pulls) 🎉🎉🎉
+
+## Reference Projects 📚
+- https://github.com/FujiwaraChoki/MoneyPrinter
+- https://github.com/harry0703/MoneyPrinterTurbo
+
+This project was refactored based on the above projects, with the addition of video narration features. Thanks to the original authors for their open-source spirit 🥳🥳🥳
+
+## License 📝
+
+Click to view the [`LICENSE`](LICENSE) file
+
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=linyqh/NarratoAI&type=Date)](https://star-history.com/#linyqh/NarratoAI&Date)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..547fed4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,175 @@
+
+

NarratoAI 😎📽️

+

一站式 AI 影视解说+自动化剪辑工具🎬🎞️

+ + +

📖 简体中文 | English

+
+ +[//]: # ( harry0703%2FNarratoAI | Trendshift) +
+
+NarratoAI 是一个自动化影视解说工具,基于LLM实现文案撰写、自动化视频剪辑、配音和字幕生成的一站式流程,助力高效内容创作。 +
+ +[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/linyqh/NarratoAI) +[![GitHub license](https://img.shields.io/github/license/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/blob/main/LICENSE) +[![GitHub issues](https://img.shields.io/github/issues/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/issues) +[![GitHub stars](https://img.shields.io/github/stars/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/stargazers) +[![Discord](https://img.shields.io/discord/1134848537704804432?style=for-the-badge)](https://discord.gg/WBKChhmZ) + + +

首页

+ +![](docs/index.png) + +

视频审查界面

+ +![](docs/check.png) + +
+
+## 配置要求 📦
+
+- 建议最低 CPU 4核或以上,内存 8G 或以上,显卡非必须
+- Windows 10 或 macOS 11.0 以上系统
+
+## 快速开始 🚀
+### 申请 Google AI Studio 账号
+1. 访问 https://aistudio.google.com/app/prompts/new_chat 申请账号
+2. 点击 `Get API Key` 申请 API Key
+3. 将申请到的 API Key 填入 `config.example.toml` 文件中的 `gemini_api_key` 配置
+
+### 配置 proxy VPN
+> 配置 VPN 的方法不限,只要能正常访问 Google 网络即可,本文以 `clash` 为例
+1. 记住 clash 服务的端口,一般为 `http://127.0.0.1:7890`
+2. 若端口不为 `7890`,请修改 `docker-compose.yml` 文件中的 `VPN_PROXY_URL` 为你的代理地址
+   ```yaml
+   environment:
+     - "VPN_PROXY_URL=http://host.docker.internal:7890" # 修改为你的代理端口;host.docker.internal 表示物理机的 IP
+   ```
+3. (可选)或者修改 `config.example.toml` 文件中的 `proxy` 配置
+   ```toml
+   [proxy]
+   ### Use a proxy to access the Pexels API
+   ### Format: "http://<username>:<password>@<proxy>:<port>"
+   ### Example: "http://user:pass@proxy:1234"
+   ### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
+
+   http = "http://xx.xx.xx.xx:7890"
+   https = "http://xx.xx.xx.xx:7890"
+   ```
+### Docker 部署 🐳
+#### ① 拉取项目,启动 Docker
+```shell
+git clone https://github.com/linyqh/NarratoAI.git
+cd NarratoAI
+docker-compose up
+```
+#### ② 访问 Web 界面
+
+打开浏览器,访问 http://127.0.0.1:8501
+
+#### ③ 访问 API 文档
+
+打开浏览器,访问 http://127.0.0.1:8080/docs 或者 http://127.0.0.1:8080/redoc
+
+## 使用方法
+#### 1. 基础配置:选择模型,填入 API Key
+> 目前暂时只支持 `Gemini` 模型,其他模型待后续更新,欢迎大家提交 [PR](https://github.com/linyqh/NarratoAI/pulls),参与开发 🎉🎉🎉
+ 001 +
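作为补充,下面的示例大致说明 `config.toml` 中 `[proxy]` 配置在代码里如何被使用(写法与本次提交中 `app/services/material.py` 对 `requests` 的调用一致;代理地址与目标 URL 仅为示例):

```python
# 简单示意:[proxy] 配置如何传给 requests(与 app/services/material.py 的用法一致)
# 代理地址与目标 URL 仅为示例
import toml
import requests

proxies = toml.load("config.toml").get("proxy", {})
# 例如 {"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}

resp = requests.get("https://aistudio.google.com", proxies=proxies, timeout=10)
print(resp.status_code)
```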
+ +#### 2. 选择需要解说的视频,点击生成视频脚本 +> 平台内置了一个演示视频,若要使用自己的视频,将mp4文件放在 `resource/videos` 目录下,刷新浏览器即可, +> 注意:文件名随意,但文件名不能包含中文,特殊字符,空格,反斜杠等 +
+ 002 +
+ +#### 3. 保存脚本,开始剪辑 +> 保存脚本后,刷新浏览器,在脚本文件的下拉框就会有新生成的 `.json` 脚本文件,选择json文件和视频就可以开始剪辑了。 +
+ 003 +
+ +#### 4. 检查视频,若视频存在不符合规则的片段,可以点击重新生成或者手动编辑 +
+ 003 +
+ +#### 5. 配置视频基本参数 +
+ 003 +
+ +#### 6. 开始生成 +
+ 003 +
+ +#### 7. 视频生成完成 +
+ 003 +
+
+## 开发 💻
+1. 安装依赖
+```shell
+conda create -n narratoai python=3.10
+conda activate narratoai
+cd narratoai
+pip install -r requirements.txt
+```
+
+2. 安装 ImageMagick
+###### Windows:
+
+- 下载 https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-36-Q16-x64-static.exe
+- 安装下载好的 ImageMagick,注意不要修改安装路径
+- 修改配置文件 `config.toml` 中的 `imagemagick_path` 为你的实际安装路径(一般在 `C:\Program Files\ImageMagick-7.1.1-Q16\magick.exe`)
+
+###### macOS:
+
+```shell
+brew install imagemagick
+```
+
+###### Ubuntu
+
+```shell
+sudo apt-get install imagemagick
+```
+
+###### CentOS
+
+```shell
+sudo yum install ImageMagick
+```
+3. 启动 WebUI
+```shell
+streamlit run ./webui/Main.py --browser.serverAddress=127.0.0.1 --server.enableCORS=True --browser.gatherUsageStats=False
+```
+4. 访问 http://127.0.0.1:8501
+
+
+## 反馈建议 📢
+
+### 👏👏👏 可以提交 [issue](https://github.com/linyqh/NarratoAI/issues) 或者 [pull request](https://github.com/linyqh/NarratoAI/pulls) 🎉🎉🎉
+
+## 参考项目 📚
+- https://github.com/FujiwaraChoki/MoneyPrinter
+- https://github.com/harry0703/MoneyPrinterTurbo
+
+该项目基于以上项目重构而来,增加了影视解说功能,感谢原作者的开源精神 🥳🥳🥳
+
+## 许可证 📝
+
+点击查看 [`LICENSE`](LICENSE) 文件
+
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=linyqh/NarratoAI&type=Date)](https://star-history.com/#linyqh/NarratoAI&Date)
+
diff --git a/app/__init__.py b/app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/asgi.py b/app/asgi.py
new file mode 100644
index 0000000..aec304c
--- /dev/null
+++ b/app/asgi.py
@@ -0,0 +1,82 @@
+"""Application implementation - ASGI."""
+
+import os
+
+from fastapi import FastAPI, Request
+from fastapi.exceptions import RequestValidationError
+from fastapi.responses import JSONResponse
+from loguru import logger
+from fastapi.staticfiles import StaticFiles
+from fastapi.middleware.cors import CORSMiddleware
+
+from app.config import config
+from app.models.exception import HttpException
+from app.router import root_api_router
+from app.utils import utils
+
+
+def exception_handler(request: Request, e: HttpException):
+    return JSONResponse(
+        status_code=e.status_code,
+        content=utils.get_response(e.status_code, e.data, e.message),
+    )
+
+
+def validation_exception_handler(request: Request, e: RequestValidationError):
+    return JSONResponse(
+        status_code=400,
+        content=utils.get_response(
+            status=400, data=e.errors(), message="field required"
+        ),
+    )
+
+
+def get_application() -> FastAPI:
+    """Initialize FastAPI application.
+
+    Returns:
+        FastAPI: Application object instance.
+ + """ + instance = FastAPI( + title=config.project_name, + description=config.project_description, + version=config.project_version, + debug=False, + ) + instance.include_router(root_api_router) + instance.add_exception_handler(HttpException, exception_handler) + instance.add_exception_handler(RequestValidationError, validation_exception_handler) + return instance + + +app = get_application() + +# Configures the CORS middleware for the FastAPI app +cors_allowed_origins_str = os.getenv("CORS_ALLOWED_ORIGINS", "") +origins = cors_allowed_origins_str.split(",") if cors_allowed_origins_str else ["*"] +app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +task_dir = utils.task_dir() +app.mount( + "/tasks", StaticFiles(directory=task_dir, html=True, follow_symlink=True), name="" +) + +public_dir = utils.public_dir() +app.mount("/", StaticFiles(directory=public_dir, html=True), name="") + + +@app.on_event("shutdown") +def shutdown_event(): + logger.info("shutdown event") + + +@app.on_event("startup") +def startup_event(): + logger.info("startup event") diff --git a/app/config/__init__.py b/app/config/__init__.py new file mode 100644 index 0000000..dd46812 --- /dev/null +++ b/app/config/__init__.py @@ -0,0 +1,56 @@ +import os +import sys + +from loguru import logger + +from app.config import config +from app.utils import utils + + +def __init_logger(): + # _log_file = utils.storage_dir("logs/server.log") + _lvl = config.log_level + root_dir = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + ) + + def format_record(record): + # 获取日志记录中的文件全路径 + file_path = record["file"].path + # 将绝对路径转换为相对于项目根目录的路径 + relative_path = os.path.relpath(file_path, root_dir) + # 更新记录中的文件路径 + record["file"].path = f"./{relative_path}" + # 返回修改后的格式字符串 + # 您可以根据需要调整这里的格式 + _format = ( + "{time:%Y-%m-%d %H:%M:%S} | " + + "{level} | " + + '"{file.path}:{line}": {function} ' + + "- {message}" + + "\n" + ) + return _format + + logger.remove() + + logger.add( + sys.stdout, + level=_lvl, + format=format_record, + colorize=True, + ) + + # logger.add( + # _log_file, + # level=_lvl, + # format=format_record, + # rotation="00:00", + # retention="3 days", + # backtrace=True, + # diagnose=True, + # enqueue=True, + # ) + + +__init_logger() diff --git a/app/config/config.py b/app/config/config.py new file mode 100644 index 0000000..a653ddc --- /dev/null +++ b/app/config/config.py @@ -0,0 +1,70 @@ +import os +import socket +import toml +import shutil +from loguru import logger + +root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) +config_file = f"{root_dir}/config.toml" + + +def load_config(): + # fix: IsADirectoryError: [Errno 21] Is a directory: '/NarratoAI/config.toml' + if os.path.isdir(config_file): + shutil.rmtree(config_file) + + if not os.path.isfile(config_file): + example_file = f"{root_dir}/config.example.toml" + if os.path.isfile(example_file): + shutil.copyfile(example_file, config_file) + logger.info(f"copy config.example.toml to config.toml") + + logger.info(f"load config from file: {config_file}") + + try: + _config_ = toml.load(config_file) + except Exception as e: + logger.warning(f"load config failed: {str(e)}, try to load as utf-8-sig") + with open(config_file, mode="r", encoding="utf-8-sig") as fp: + _cfg_content = fp.read() + _config_ = toml.loads(_cfg_content) + return _config_ + + +def save_config(): + with open(config_file, "w", encoding="utf-8") as f: + 
_cfg["app"] = app + _cfg["azure"] = azure + _cfg["ui"] = ui + f.write(toml.dumps(_cfg)) + + +_cfg = load_config() +app = _cfg.get("app", {}) +whisper = _cfg.get("whisper", {}) +proxy = _cfg.get("proxy", {}) +azure = _cfg.get("azure", {}) +ui = _cfg.get("ui", {}) + +hostname = socket.gethostname() + +log_level = _cfg.get("log_level", "DEBUG") +listen_host = _cfg.get("listen_host", "0.0.0.0") +listen_port = _cfg.get("listen_port", 8080) +project_name = _cfg.get("project_name", "NarratoAI") +project_description = _cfg.get( + "project_description", + "https://github.com/linyqh/NarratoAI", +) +project_version = _cfg.get("app", {}).get("project_version") +reload_debug = False + +imagemagick_path = app.get("imagemagick_path", "") +if imagemagick_path and os.path.isfile(imagemagick_path): + os.environ["IMAGEMAGICK_BINARY"] = imagemagick_path + +ffmpeg_path = app.get("ffmpeg_path", "") +if ffmpeg_path and os.path.isfile(ffmpeg_path): + os.environ["IMAGEIO_FFMPEG_EXE"] = ffmpeg_path + +logger.info(f"{project_name} v{project_version}") diff --git a/app/controllers/base.py b/app/controllers/base.py new file mode 100644 index 0000000..122e341 --- /dev/null +++ b/app/controllers/base.py @@ -0,0 +1,31 @@ +from uuid import uuid4 + +from fastapi import Request + +from app.config import config +from app.models.exception import HttpException + + +def get_task_id(request: Request): + task_id = request.headers.get("x-task-id") + if not task_id: + task_id = uuid4() + return str(task_id) + + +def get_api_key(request: Request): + api_key = request.headers.get("x-api-key") + return api_key + + +def verify_token(request: Request): + token = get_api_key(request) + if token != config.app.get("api_key", ""): + request_id = get_task_id(request) + request_url = request.url + user_agent = request.headers.get("user-agent") + raise HttpException( + task_id=request_id, + status_code=401, + message=f"invalid token: {request_url}, {user_agent}", + ) diff --git a/app/controllers/manager/base_manager.py b/app/controllers/manager/base_manager.py new file mode 100644 index 0000000..462589e --- /dev/null +++ b/app/controllers/manager/base_manager.py @@ -0,0 +1,64 @@ +import threading +from typing import Callable, Any, Dict + + +class TaskManager: + def __init__(self, max_concurrent_tasks: int): + self.max_concurrent_tasks = max_concurrent_tasks + self.current_tasks = 0 + self.lock = threading.Lock() + self.queue = self.create_queue() + + def create_queue(self): + raise NotImplementedError() + + def add_task(self, func: Callable, *args: Any, **kwargs: Any): + with self.lock: + if self.current_tasks < self.max_concurrent_tasks: + print(f"add task: {func.__name__}, current_tasks: {self.current_tasks}") + self.execute_task(func, *args, **kwargs) + else: + print( + f"enqueue task: {func.__name__}, current_tasks: {self.current_tasks}" + ) + self.enqueue({"func": func, "args": args, "kwargs": kwargs}) + + def execute_task(self, func: Callable, *args: Any, **kwargs: Any): + thread = threading.Thread( + target=self.run_task, args=(func, *args), kwargs=kwargs + ) + thread.start() + + def run_task(self, func: Callable, *args: Any, **kwargs: Any): + try: + with self.lock: + self.current_tasks += 1 + func(*args, **kwargs) # 在这里调用函数,传递*args和**kwargs + finally: + self.task_done() + + def check_queue(self): + with self.lock: + if ( + self.current_tasks < self.max_concurrent_tasks + and not self.is_queue_empty() + ): + task_info = self.dequeue() + func = task_info["func"] + args = task_info.get("args", ()) + kwargs = task_info.get("kwargs", {}) + 
self.execute_task(func, *args, **kwargs) + + def task_done(self): + with self.lock: + self.current_tasks -= 1 + self.check_queue() + + def enqueue(self, task: Dict): + raise NotImplementedError() + + def dequeue(self): + raise NotImplementedError() + + def is_queue_empty(self): + raise NotImplementedError() diff --git a/app/controllers/manager/memory_manager.py b/app/controllers/manager/memory_manager.py new file mode 100644 index 0000000..cf7321f --- /dev/null +++ b/app/controllers/manager/memory_manager.py @@ -0,0 +1,18 @@ +from queue import Queue +from typing import Dict + +from app.controllers.manager.base_manager import TaskManager + + +class InMemoryTaskManager(TaskManager): + def create_queue(self): + return Queue() + + def enqueue(self, task: Dict): + self.queue.put(task) + + def dequeue(self): + return self.queue.get() + + def is_queue_empty(self): + return self.queue.empty() diff --git a/app/controllers/manager/redis_manager.py b/app/controllers/manager/redis_manager.py new file mode 100644 index 0000000..cad1912 --- /dev/null +++ b/app/controllers/manager/redis_manager.py @@ -0,0 +1,56 @@ +import json +from typing import Dict + +import redis + +from app.controllers.manager.base_manager import TaskManager +from app.models.schema import VideoParams +from app.services import task as tm + +FUNC_MAP = { + "start": tm.start, + # 'start_test': tm.start_test +} + + +class RedisTaskManager(TaskManager): + def __init__(self, max_concurrent_tasks: int, redis_url: str): + self.redis_client = redis.Redis.from_url(redis_url) + super().__init__(max_concurrent_tasks) + + def create_queue(self): + return "task_queue" + + def enqueue(self, task: Dict): + task_with_serializable_params = task.copy() + + if "params" in task["kwargs"] and isinstance( + task["kwargs"]["params"], VideoParams + ): + task_with_serializable_params["kwargs"]["params"] = task["kwargs"][ + "params" + ].dict() + + # 将函数对象转换为其名称 + task_with_serializable_params["func"] = task["func"].__name__ + self.redis_client.rpush(self.queue, json.dumps(task_with_serializable_params)) + + def dequeue(self): + task_json = self.redis_client.lpop(self.queue) + if task_json: + task_info = json.loads(task_json) + # 将函数名称转换回函数对象 + task_info["func"] = FUNC_MAP[task_info["func"]] + + if "params" in task_info["kwargs"] and isinstance( + task_info["kwargs"]["params"], dict + ): + task_info["kwargs"]["params"] = VideoParams( + **task_info["kwargs"]["params"] + ) + + return task_info + return None + + def is_queue_empty(self): + return self.redis_client.llen(self.queue) == 0 diff --git a/app/controllers/ping.py b/app/controllers/ping.py new file mode 100644 index 0000000..a3eeff0 --- /dev/null +++ b/app/controllers/ping.py @@ -0,0 +1,14 @@ +from fastapi import APIRouter +from fastapi import Request + +router = APIRouter() + + +@router.get( + "/ping", + tags=["Health Check"], + description="检查服务可用性", + response_description="pong", +) +def ping(request: Request) -> str: + return "pong" diff --git a/app/controllers/v1/base.py b/app/controllers/v1/base.py new file mode 100644 index 0000000..51794df --- /dev/null +++ b/app/controllers/v1/base.py @@ -0,0 +1,11 @@ +from fastapi import APIRouter, Depends + + +def new_router(dependencies=None): + router = APIRouter() + router.tags = ["V1"] + router.prefix = "/api/v1" + # 将认证依赖项应用于所有路由 + if dependencies: + router.dependencies = dependencies + return router diff --git a/app/controllers/v1/llm.py b/app/controllers/v1/llm.py new file mode 100644 index 0000000..e841d68 --- /dev/null +++ b/app/controllers/v1/llm.py 
@@ -0,0 +1,44 @@ +from fastapi import Request +from app.controllers.v1.base import new_router +from app.models.schema import ( + VideoScriptResponse, + VideoScriptRequest, + VideoTermsResponse, + VideoTermsRequest, +) +from app.services import llm +from app.utils import utils + +# 认证依赖项 +# router = new_router(dependencies=[Depends(base.verify_token)]) +router = new_router() + + +@router.post( + "/scripts", + response_model=VideoScriptResponse, + summary="Create a script for the video", +) +def generate_video_script(request: Request, body: VideoScriptRequest): + video_script = llm.generate_script( + video_subject=body.video_subject, + language=body.video_language, + paragraph_number=body.paragraph_number, + ) + response = {"video_script": video_script} + return utils.get_response(200, response) + + +@router.post( + "/terms", + response_model=VideoTermsResponse, + summary="Generate video terms based on the video script", +) +def generate_video_terms(request: Request, body: VideoTermsRequest): + video_terms = llm.generate_terms( + video_subject=body.video_subject, + video_script=body.video_script, + amount=body.amount, + ) + response = {"video_terms": video_terms} + return utils.get_response(200, response) diff --git a/app/controllers/v1/video.py b/app/controllers/v1/video.py new file mode 100644 index 0000000..0430707 --- /dev/null +++ b/app/controllers/v1/video.py @@ -0,0 +1,271 @@ +import glob +import os +import pathlib +import shutil +from typing import Union + +from fastapi import BackgroundTasks, Depends, Path, Request, UploadFile +from fastapi.params import File +from fastapi.responses import FileResponse, StreamingResponse +from loguru import logger + +from app.config import config +from app.controllers import base +from app.controllers.manager.memory_manager import InMemoryTaskManager +from app.controllers.manager.redis_manager import RedisTaskManager +from app.controllers.v1.base import new_router +from app.models.exception import HttpException +from app.models.schema import ( + AudioRequest, + BgmRetrieveResponse, + BgmUploadResponse, + SubtitleRequest, + TaskDeletionResponse, + TaskQueryRequest, + TaskQueryResponse, + TaskResponse, + TaskVideoRequest, +) +from app.services import state as sm +from app.services import task as tm +from app.utils import utils + +# 认证依赖项 +# router = new_router(dependencies=[Depends(base.verify_token)]) +router = new_router() + +_enable_redis = config.app.get("enable_redis", False) +_redis_host = config.app.get("redis_host", "localhost") +_redis_port = config.app.get("redis_port", 6379) +_redis_db = config.app.get("redis_db", 0) +_redis_password = config.app.get("redis_password", None) +_max_concurrent_tasks = config.app.get("max_concurrent_tasks", 5) + +redis_url = f"redis://:{_redis_password}@{_redis_host}:{_redis_port}/{_redis_db}" +# 根据配置选择合适的任务管理器 +if _enable_redis: + task_manager = RedisTaskManager( + max_concurrent_tasks=_max_concurrent_tasks, redis_url=redis_url + ) +else: + task_manager = InMemoryTaskManager(max_concurrent_tasks=_max_concurrent_tasks) + + +@router.post("/videos", response_model=TaskResponse, summary="Generate a short video") +def create_video( + background_tasks: BackgroundTasks, request: Request, body: TaskVideoRequest +): + return create_task(request, body, stop_at="video") + + +@router.post("/subtitle", response_model=TaskResponse, summary="Generate subtitle only") +def create_subtitle( + background_tasks: BackgroundTasks, request: Request, body: SubtitleRequest +): + return create_task(request, body, stop_at="subtitle") + 
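+# A hypothetical client-side call for the endpoints above, as a usage sketch:
+# the /api/v1 prefix comes from app/controllers/v1/base.py, port 8080 is the
+# default listen_port in app/config/config.py, and the field names follow
+# SubtitleRequest in app/models/schema.py.
+#
+#   import requests
+#
+#   payload = {"video_script": "A short test script.", "voice_name": "zh-CN-XiaoxiaoNeural-Female"}
+#   resp = requests.post("http://127.0.0.1:8080/api/v1/subtitle", json=payload)
+#   print(resp.json())  # expected shape (see TaskResponse): {"status": 200, "message": "success", "data": {"task_id": "..."}}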
+
+@router.post("/audio", response_model=TaskResponse, summary="Generate audio only")
+def create_audio(
+    background_tasks: BackgroundTasks, request: Request, body: AudioRequest
+):
+    return create_task(request, body, stop_at="audio")
+
+
+def create_task(
+    request: Request,
+    body: Union[TaskVideoRequest, SubtitleRequest, AudioRequest],
+    stop_at: str,
+):
+    task_id = utils.get_uuid()
+    request_id = base.get_task_id(request)
+    try:
+        task = {
+            "task_id": task_id,
+            "request_id": request_id,
+            "params": body.model_dump(),
+        }
+        sm.state.update_task(task_id)
+        task_manager.add_task(tm.start, task_id=task_id, params=body, stop_at=stop_at)
+        logger.success(f"Task created: {utils.to_json(task)}")
+        return utils.get_response(200, task)
+    except ValueError as e:
+        raise HttpException(
+            task_id=task_id, status_code=400, message=f"{request_id}: {str(e)}"
+        )
+
+
+@router.get(
+    "/tasks/{task_id}", response_model=TaskQueryResponse, summary="Query task status"
+)
+def get_task(
+    request: Request,
+    task_id: str = Path(..., description="Task ID"),
+    query: TaskQueryRequest = Depends(),
+):
+    endpoint = config.app.get("endpoint", "")
+    if not endpoint:
+        endpoint = str(request.base_url)
+    endpoint = endpoint.rstrip("/")
+
+    request_id = base.get_task_id(request)
+    task = sm.state.get_task(task_id)
+    if task:
+        task_dir = utils.task_dir()
+
+        def file_to_uri(file):
+            # use the function argument, not the loop variable from the enclosing scope
+            if not file.startswith(endpoint):
+                _uri_path = file.replace(task_dir, "tasks").replace("\\", "/")
+                _uri_path = f"{endpoint}/{_uri_path}"
+            else:
+                _uri_path = file
+            return _uri_path
+
+        if "videos" in task:
+            videos = task["videos"]
+            urls = []
+            for v in videos:
+                urls.append(file_to_uri(v))
+            task["videos"] = urls
+        if "combined_videos" in task:
+            combined_videos = task["combined_videos"]
+            urls = []
+            for v in combined_videos:
+                urls.append(file_to_uri(v))
+            task["combined_videos"] = urls
+        return utils.get_response(200, task)
+
+    raise HttpException(
+        task_id=task_id, status_code=404, message=f"{request_id}: task not found"
+    )
+
+
+@router.delete(
+    "/tasks/{task_id}",
+    response_model=TaskDeletionResponse,
+    summary="Delete a generated short video task",
+)
+def delete_video(request: Request, task_id: str = Path(..., description="Task ID")):
+    request_id = base.get_task_id(request)
+    task = sm.state.get_task(task_id)
+    if task:
+        tasks_dir = utils.task_dir()
+        current_task_dir = os.path.join(tasks_dir, task_id)
+        if os.path.exists(current_task_dir):
+            shutil.rmtree(current_task_dir)
+
+        sm.state.delete_task(task_id)
+        logger.success(f"video deleted: {utils.to_json(task)}")
+        return utils.get_response(200)
+
+    raise HttpException(
+        task_id=task_id, status_code=404, message=f"{request_id}: task not found"
+    )
+
+
+@router.get(
+    "/musics", response_model=BgmRetrieveResponse, summary="Retrieve local BGM files"
+)
+def get_bgm_list(request: Request):
+    suffix = "*.mp3"
+    song_dir = utils.song_dir()
+    files = glob.glob(os.path.join(song_dir, suffix))
+    bgm_list = []
+    for file in files:
+        bgm_list.append(
+            {
+                "name": os.path.basename(file),
+                "size": os.path.getsize(file),
+                "file": file,
+            }
+        )
+    response = {"files": bgm_list}
+    return utils.get_response(200, response)
+
+
+@router.post(
+    "/musics",
+    response_model=BgmUploadResponse,
+    summary="Upload the BGM file to the songs directory",
+)
+def upload_bgm_file(request: Request, file: UploadFile = File(...)):
+    request_id = base.get_task_id(request)
+    # check file ext
+    if file.filename.endswith("mp3"):
+        song_dir = utils.song_dir()
+        save_path = os.path.join(song_dir,
file.filename) + # save file + with open(save_path, "wb+") as buffer: + # If the file already exists, it will be overwritten + file.file.seek(0) + buffer.write(file.file.read()) + response = {"file": save_path} + return utils.get_response(200, response) + + raise HttpException( + "", status_code=400, message=f"{request_id}: Only *.mp3 files can be uploaded" + ) + + +@router.get("/stream/{file_path:path}") +async def stream_video(request: Request, file_path: str): + tasks_dir = utils.task_dir() + video_path = os.path.join(tasks_dir, file_path) + range_header = request.headers.get("Range") + video_size = os.path.getsize(video_path) + start, end = 0, video_size - 1 + + length = video_size + if range_header: + range_ = range_header.split("bytes=")[1] + start, end = [int(part) if part else None for part in range_.split("-")] + if start is None: + start = video_size - end + end = video_size - 1 + if end is None: + end = video_size - 1 + length = end - start + 1 + + def file_iterator(file_path, offset=0, bytes_to_read=None): + with open(file_path, "rb") as f: + f.seek(offset, os.SEEK_SET) + remaining = bytes_to_read or video_size + while remaining > 0: + bytes_to_read = min(4096, remaining) + data = f.read(bytes_to_read) + if not data: + break + remaining -= len(data) + yield data + + response = StreamingResponse( + file_iterator(video_path, start, length), media_type="video/mp4" + ) + response.headers["Content-Range"] = f"bytes {start}-{end}/{video_size}" + response.headers["Accept-Ranges"] = "bytes" + response.headers["Content-Length"] = str(length) + response.status_code = 206 # Partial Content + + return response + + +@router.get("/download/{file_path:path}") +async def download_video(_: Request, file_path: str): + """ + download video + :param _: Request request + :param file_path: video file path, eg: /cd1727ed-3473-42a2-a7da-4faafafec72b/final-1.mp4 + :return: video file + """ + tasks_dir = utils.task_dir() + video_path = os.path.join(tasks_dir, file_path) + file_path = pathlib.Path(video_path) + filename = file_path.stem + extension = file_path.suffix + headers = {"Content-Disposition": f"attachment; filename={filename}{extension}"} + return FileResponse( + path=video_path, + headers=headers, + filename=f"{filename}{extension}", + media_type=f"video/{extension[1:]}", + ) diff --git a/app/models/__init__.py b/app/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/models/const.py b/app/models/const.py new file mode 100644 index 0000000..e7540ef --- /dev/null +++ b/app/models/const.py @@ -0,0 +1,25 @@ +PUNCTUATIONS = [ + "?", + ",", + ".", + "、", + ";", + ":", + "!", + "…", + "?", + ",", + "。", + "、", + ";", + ":", + "!", + "...", +] + +TASK_STATE_FAILED = -1 +TASK_STATE_COMPLETE = 1 +TASK_STATE_PROCESSING = 4 + +FILE_TYPE_VIDEOS = ["mp4", "mov", "mkv", "webm"] +FILE_TYPE_IMAGES = ["jpg", "jpeg", "png", "bmp"] diff --git a/app/models/exception.py b/app/models/exception.py new file mode 100644 index 0000000..b186cae --- /dev/null +++ b/app/models/exception.py @@ -0,0 +1,28 @@ +import traceback +from typing import Any + +from loguru import logger + + +class HttpException(Exception): + def __init__( + self, task_id: str, status_code: int, message: str = "", data: Any = None + ): + self.message = message + self.status_code = status_code + self.data = data + # 获取异常堆栈信息 + tb_str = traceback.format_exc().strip() + if not tb_str or tb_str == "NoneType: None": + msg = f"HttpException: {status_code}, {task_id}, {message}" + else: + msg = f"HttpException: {status_code}, 
{task_id}, {message}\n{tb_str}" + + if status_code == 400: + logger.warning(msg) + else: + logger.error(msg) + + +class FileNotFoundException(Exception): + pass diff --git a/app/models/schema.py b/app/models/schema.py new file mode 100644 index 0000000..25e3ce8 --- /dev/null +++ b/app/models/schema.py @@ -0,0 +1,370 @@ +import warnings +from enum import Enum +from typing import Any, List, Optional + +import pydantic +from pydantic import BaseModel + +# 忽略 Pydantic 的特定警告 +warnings.filterwarnings( + "ignore", + category=UserWarning, + message="Field name.*shadows an attribute in parent.*", +) + + +class VideoConcatMode(str, Enum): + random = "random" + sequential = "sequential" + + +class VideoAspect(str, Enum): + landscape = "16:9" + portrait = "9:16" + square = "1:1" + + def to_resolution(self): + if self == VideoAspect.landscape.value: + return 1920, 1080 + elif self == VideoAspect.portrait.value: + return 1080, 1920 + elif self == VideoAspect.square.value: + return 1080, 1080 + return 1080, 1920 + + +class _Config: + arbitrary_types_allowed = True + + +@pydantic.dataclasses.dataclass(config=_Config) +class MaterialInfo: + provider: str = "pexels" + url: str = "" + duration: int = 0 + + +# VoiceNames = [ +# # zh-CN +# "female-zh-CN-XiaoxiaoNeural", +# "female-zh-CN-XiaoyiNeural", +# "female-zh-CN-liaoning-XiaobeiNeural", +# "female-zh-CN-shaanxi-XiaoniNeural", +# +# "male-zh-CN-YunjianNeural", +# "male-zh-CN-YunxiNeural", +# "male-zh-CN-YunxiaNeural", +# "male-zh-CN-YunyangNeural", +# +# # "female-zh-HK-HiuGaaiNeural", +# # "female-zh-HK-HiuMaanNeural", +# # "male-zh-HK-WanLungNeural", +# # +# # "female-zh-TW-HsiaoChenNeural", +# # "female-zh-TW-HsiaoYuNeural", +# # "male-zh-TW-YunJheNeural", +# +# # en-US +# "female-en-US-AnaNeural", +# "female-en-US-AriaNeural", +# "female-en-US-AvaNeural", +# "female-en-US-EmmaNeural", +# "female-en-US-JennyNeural", +# "female-en-US-MichelleNeural", +# +# "male-en-US-AndrewNeural", +# "male-en-US-BrianNeural", +# "male-en-US-ChristopherNeural", +# "male-en-US-EricNeural", +# "male-en-US-GuyNeural", +# "male-en-US-RogerNeural", +# "male-en-US-SteffanNeural", +# ] + + +class VideoParams(BaseModel): + """ + { + "video_subject": "", + "video_aspect": "横屏 16:9(西瓜视频)", + "voice_name": "女生-晓晓", + "bgm_name": "random", + "font_name": "STHeitiMedium 黑体-中", + "text_color": "#FFFFFF", + "font_size": 60, + "stroke_color": "#000000", + "stroke_width": 1.5 + } + """ + + video_subject: str + video_script: str = "" # 用于生成视频的脚本 + video_terms: Optional[str | list] = None # 用于生成视频的关键词 + video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value + video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value + video_clip_duration: Optional[int] = 5 + video_count: Optional[int] = 1 + + video_source: Optional[str] = "pexels" + video_materials: Optional[List[MaterialInfo]] = None # 用于生成视频的素材 + + video_language: Optional[str] = "" # auto detect + + voice_name: Optional[str] = "" + voice_volume: Optional[float] = 1.0 + voice_rate: Optional[float] = 1.0 + bgm_type: Optional[str] = "random" + bgm_file: Optional[str] = "" + bgm_volume: Optional[float] = 0.2 + + subtitle_enabled: Optional[bool] = True + subtitle_position: Optional[str] = "bottom" # top, bottom, center + custom_position: float = 70.0 + font_name: Optional[str] = "STHeitiMedium.ttc" + text_fore_color: Optional[str] = "#FFFFFF" + text_background_color: Optional[str] = "transparent" + + font_size: int = 60 + stroke_color: Optional[str] = "#000000" + stroke_width: float = 1.5 + n_threads: 
Optional[int] = 2 + paragraph_number: Optional[int] = 1 + + +class SubtitleRequest(BaseModel): + video_script: str + video_language: Optional[str] = "" + voice_name: Optional[str] = "zh-CN-XiaoxiaoNeural-Female" + voice_volume: Optional[float] = 1.0 + voice_rate: Optional[float] = 1.2 + bgm_type: Optional[str] = "random" + bgm_file: Optional[str] = "" + bgm_volume: Optional[float] = 0.2 + subtitle_position: Optional[str] = "bottom" + font_name: Optional[str] = "STHeitiMedium.ttc" + text_fore_color: Optional[str] = "#FFFFFF" + text_background_color: Optional[str] = "transparent" + font_size: int = 60 + stroke_color: Optional[str] = "#000000" + stroke_width: float = 1.5 + video_source: Optional[str] = "local" + subtitle_enabled: Optional[str] = "true" + + +class AudioRequest(BaseModel): + video_script: str + video_language: Optional[str] = "" + voice_name: Optional[str] = "zh-CN-XiaoxiaoNeural-Female" + voice_volume: Optional[float] = 1.0 + voice_rate: Optional[float] = 1.2 + bgm_type: Optional[str] = "random" + bgm_file: Optional[str] = "" + bgm_volume: Optional[float] = 0.2 + video_source: Optional[str] = "local" + + +class VideoScriptParams: + """ + { + "video_subject": "春天的花海", + "video_language": "", + "paragraph_number": 1 + } + """ + + video_subject: Optional[str] = "春天的花海" + video_language: Optional[str] = "" + paragraph_number: Optional[int] = 1 + + +class VideoTermsParams: + """ + { + "video_subject": "", + "video_script": "", + "amount": 5 + } + """ + + video_subject: Optional[str] = "春天的花海" + video_script: Optional[str] = ( + "春天的花海,如诗如画般展现在眼前。万物复苏的季节里,大地披上了一袭绚丽多彩的盛装。金黄的迎春、粉嫩的樱花、洁白的梨花、艳丽的郁金香……" + ) + amount: Optional[int] = 5 + + +class BaseResponse(BaseModel): + status: int = 200 + message: Optional[str] = "success" + data: Any = None + + +class TaskVideoRequest(VideoParams, BaseModel): + pass + + +class TaskQueryRequest(BaseModel): + pass + + +class VideoScriptRequest(VideoScriptParams, BaseModel): + pass + + +class VideoTermsRequest(VideoTermsParams, BaseModel): + pass + + +###################################################################################################### +###################################################################################################### +###################################################################################################### +###################################################################################################### +class TaskResponse(BaseResponse): + class TaskResponseData(BaseModel): + task_id: str + + data: TaskResponseData + + class Config: + json_schema_extra = { + "example": { + "status": 200, + "message": "success", + "data": {"task_id": "6c85c8cc-a77a-42b9-bc30-947815aa0558"}, + }, + } + + +class TaskQueryResponse(BaseResponse): + class Config: + json_schema_extra = { + "example": { + "status": 200, + "message": "success", + "data": { + "state": 1, + "progress": 100, + "videos": [ + "http://127.0.0.1:8080/tasks/6c85c8cc-a77a-42b9-bc30-947815aa0558/final-1.mp4" + ], + "combined_videos": [ + "http://127.0.0.1:8080/tasks/6c85c8cc-a77a-42b9-bc30-947815aa0558/combined-1.mp4" + ], + }, + }, + } + + +class TaskDeletionResponse(BaseResponse): + class Config: + json_schema_extra = { + "example": { + "status": 200, + "message": "success", + "data": { + "state": 1, + "progress": 100, + "videos": [ + "http://127.0.0.1:8080/tasks/6c85c8cc-a77a-42b9-bc30-947815aa0558/final-1.mp4" + ], + "combined_videos": [ + "http://127.0.0.1:8080/tasks/6c85c8cc-a77a-42b9-bc30-947815aa0558/combined-1.mp4" + ], + }, + }, + } + + 
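+# Usage sketch for the example above: polling a task until it completes.
+# The response shape follows TaskQueryResponse's json_schema_extra; the host,
+# port, and task_id are the example values from this file, not live endpoints.
+#
+#   import time
+#   import requests
+#
+#   task_id = "6c85c8cc-a77a-42b9-bc30-947815aa0558"
+#   while True:
+#       data = requests.get(f"http://127.0.0.1:8080/api/v1/tasks/{task_id}").json()["data"]
+#       if data.get("progress") == 100:
+#           print(data["videos"])  # e.g. [".../final-1.mp4"]
+#           break
+#       time.sleep(2)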
+class VideoScriptResponse(BaseResponse): + class Config: + json_schema_extra = { + "example": { + "status": 200, + "message": "success", + "data": { + "video_script": "春天的花海,是大自然的一幅美丽画卷。在这个季节里,大地复苏,万物生长,花朵争相绽放,形成了一片五彩斑斓的花海..." + }, + }, + } + + +class VideoTermsResponse(BaseResponse): + class Config: + json_schema_extra = { + "example": { + "status": 200, + "message": "success", + "data": {"video_terms": ["sky", "tree"]}, + }, + } + + +class BgmRetrieveResponse(BaseResponse): + class Config: + json_schema_extra = { + "example": { + "status": 200, + "message": "success", + "data": { + "files": [ + { + "name": "output013.mp3", + "size": 1891269, + "file": "/NarratoAI/resource/songs/output013.mp3", + } + ] + }, + }, + } + + +class BgmUploadResponse(BaseResponse): + class Config: + json_schema_extra = { + "example": { + "status": 200, + "message": "success", + "data": {"file": "/NarratoAI/resource/songs/example.mp3"}, + }, + } + + +class VideoClipParams(BaseModel): + video_subject: Optional[str] = "春天的花海让人心旷神怡" + + video_clip_json: Optional[str] = "" # 视频剪辑脚本 + video_origin_path: Optional[str] = "" # 原视频路径 + video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value # 视频比例 + video_clip_duration: Optional[int] = 5 # 视频片段时长 + video_count: Optional[int] = 1 # 视频片段数量 + video_source: Optional[str] = "local" + video_language: Optional[str] = "" # 自动检测 + video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value + + # # 女性 + # "zh-CN-XiaoxiaoNeural", + # "zh-CN-XiaoyiNeural", + # # 男性 + # "zh-CN-YunjianNeural" 男声 + # "zh-CN-YunyangNeural", + # "zh-CN-YunxiNeural", + voice_name: Optional[str] = "zh-CN-YunjianNeural" # 语音名称 指定选择: + voice_volume: Optional[float] = 1.0 # 语音音量 + voice_rate: Optional[float] = 1.0 # 语速 + + bgm_name: Optional[str] = "random" # 背景音乐名称 + bgm_type: Optional[str] = "random" # 背景音乐类型 + bgm_file: Optional[str] = "" # 背景音乐文件 + bgm_volume: Optional[float] = 0.2 + + subtitle_enabled: Optional[bool] = True # 是否启用字幕 + subtitle_position: Optional[str] = "bottom" # top, bottom, center + font_name: Optional[str] = "STHeitiMedium.ttc" # 字体名称 + text_fore_color: Optional[str] = "#FFFFFF" # 文字前景色 + text_background_color: Optional[str] = "transparent" # 文字背景色 + + font_size: int = 60 # 文字大小 + stroke_color: Optional[str] = "#000000" # 文字描边颜色 + stroke_width: float = 1.5 # 文字描边宽度 + n_threads: Optional[int] = 2 # 线程数 + paragraph_number: Optional[int] = 1 # 段落数量 diff --git a/app/router.py b/app/router.py new file mode 100644 index 0000000..cf84037 --- /dev/null +++ b/app/router.py @@ -0,0 +1,17 @@ +"""Application configuration - root APIRouter. + +Defines all FastAPI application endpoints. + +Resources: + 1. 
https://fastapi.tiangolo.com/tutorial/bigger-applications + +""" + +from fastapi import APIRouter + +from app.controllers.v1 import llm, video + +root_api_router = APIRouter() +# v1 +root_api_router.include_router(video.router) +root_api_router.include_router(llm.router) diff --git a/app/services/__init__.py b/app/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/services/llm.py b/app/services/llm.py new file mode 100644 index 0000000..d260b07 --- /dev/null +++ b/app/services/llm.py @@ -0,0 +1,477 @@ +import logging +import re +import os +import json +from typing import List +from loguru import logger +from openai import OpenAI +from openai import AzureOpenAI +from openai.types.chat import ChatCompletion +import google.generativeai as gemini + +from app.config import config + +_max_retries = 5 + + +def _generate_response(prompt: str) -> str: + content = "" + llm_provider = config.app.get("llm_provider", "openai") + logger.info(f"llm provider: {llm_provider}") + if llm_provider == "g4f": + model_name = config.app.get("g4f_model_name", "") + if not model_name: + model_name = "gpt-3.5-turbo-16k-0613" + import g4f + + content = g4f.ChatCompletion.create( + model=model_name, + messages=[{"role": "user", "content": prompt}], + ) + else: + api_version = "" # for azure + if llm_provider == "moonshot": + api_key = config.app.get("moonshot_api_key") + model_name = config.app.get("moonshot_model_name") + base_url = "https://api.moonshot.cn/v1" + elif llm_provider == "ollama": + # api_key = config.app.get("openai_api_key") + api_key = "ollama" # any string works but you are required to have one + model_name = config.app.get("ollama_model_name") + base_url = config.app.get("ollama_base_url", "") + if not base_url: + base_url = "http://localhost:11434/v1" + elif llm_provider == "openai": + api_key = config.app.get("openai_api_key") + model_name = config.app.get("openai_model_name") + base_url = config.app.get("openai_base_url", "") + if not base_url: + base_url = "https://api.openai.com/v1" + elif llm_provider == "oneapi": + api_key = config.app.get("oneapi_api_key") + model_name = config.app.get("oneapi_model_name") + base_url = config.app.get("oneapi_base_url", "") + elif llm_provider == "azure": + api_key = config.app.get("azure_api_key") + model_name = config.app.get("azure_model_name") + base_url = config.app.get("azure_base_url", "") + api_version = config.app.get("azure_api_version", "2024-02-15-preview") + elif llm_provider == "gemini": + api_key = config.app.get("gemini_api_key") + model_name = config.app.get("gemini_model_name") + base_url = "***" + elif llm_provider == "qwen": + api_key = config.app.get("qwen_api_key") + model_name = config.app.get("qwen_model_name") + base_url = "***" + elif llm_provider == "cloudflare": + api_key = config.app.get("cloudflare_api_key") + model_name = config.app.get("cloudflare_model_name") + account_id = config.app.get("cloudflare_account_id") + base_url = "***" + elif llm_provider == "deepseek": + api_key = config.app.get("deepseek_api_key") + model_name = config.app.get("deepseek_model_name") + base_url = config.app.get("deepseek_base_url") + if not base_url: + base_url = "https://api.deepseek.com" + elif llm_provider == "ernie": + api_key = config.app.get("ernie_api_key") + secret_key = config.app.get("ernie_secret_key") + base_url = config.app.get("ernie_base_url") + model_name = "***" + if not secret_key: + raise ValueError( + f"{llm_provider}: secret_key is not set, please set it in the config.toml file." 
+            )
+        else:
+            raise ValueError(
+                "llm_provider is not set, please set it in the config.toml file."
+            )
+
+        if not api_key:
+            raise ValueError(
+                f"{llm_provider}: api_key is not set, please set it in the config.toml file."
+            )
+        if not model_name:
+            raise ValueError(
+                f"{llm_provider}: model_name is not set, please set it in the config.toml file."
+            )
+        if not base_url:
+            raise ValueError(
+                f"{llm_provider}: base_url is not set, please set it in the config.toml file."
+            )
+
+        if llm_provider == "qwen":
+            import dashscope
+            from dashscope.api_entities.dashscope_response import GenerationResponse
+
+            dashscope.api_key = api_key
+            response = dashscope.Generation.call(
+                model=model_name, messages=[{"role": "user", "content": prompt}]
+            )
+            if response:
+                if isinstance(response, GenerationResponse):
+                    status_code = response.status_code
+                    if status_code != 200:
+                        raise Exception(
+                            f'[{llm_provider}] returned an error response: "{response}"'
+                        )
+
+                    content = response["output"]["text"]
+                    return content.replace("\n", "")
+                else:
+                    raise Exception(
+                        f'[{llm_provider}] returned an invalid response: "{response}"'
+                    )
+            else:
+                raise Exception(f"[{llm_provider}] returned an empty response")
+
+        if llm_provider == "gemini":
+            import google.generativeai as genai
+
+            genai.configure(api_key=api_key, transport="rest")
+
+            generation_config = {
+                "temperature": 0.5,
+                "top_p": 1,
+                "top_k": 1,
+                "max_output_tokens": 2048,
+            }
+
+            safety_settings = [
+                {
+                    "category": "HARM_CATEGORY_HARASSMENT",
+                    "threshold": "BLOCK_ONLY_HIGH",
+                },
+                {
+                    "category": "HARM_CATEGORY_HATE_SPEECH",
+                    "threshold": "BLOCK_ONLY_HIGH",
+                },
+                {
+                    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
+                    "threshold": "BLOCK_ONLY_HIGH",
+                },
+                {
+                    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
+                    "threshold": "BLOCK_ONLY_HIGH",
+                },
+            ]
+
+            model = genai.GenerativeModel(
+                model_name=model_name,
+                generation_config=generation_config,
+                safety_settings=safety_settings,
+            )
+
+            try:
+                response = model.generate_content(prompt)
+                candidates = response.candidates
+                generated_text = candidates[0].content.parts[0].text
+            except (AttributeError, IndexError) as e:
+                # re-raise instead of falling through to an unbound variable
+                logger.error(f"Gemini Error: {e}")
+                raise
+
+            return generated_text
+
+        if llm_provider == "cloudflare":
+            import requests
+
+            response = requests.post(
+                f"https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/{model_name}",
+                headers={"Authorization": f"Bearer {api_key}"},
+                json={
+                    "messages": [
+                        {"role": "system", "content": "You are a friendly assistant"},
+                        {"role": "user", "content": prompt},
+                    ]
+                },
+            )
+            result = response.json()
+            logger.info(result)
+            return result["result"]["response"]
+
+        if llm_provider == "ernie":
+            import requests
+
+            params = {
+                "grant_type": "client_credentials",
+                "client_id": api_key,
+                "client_secret": secret_key,
+            }
+            access_token = (
+                requests.post("https://aip.baidubce.com/oauth/2.0/token", params=params)
+                .json()
+                .get("access_token")
+            )
+            url = f"{base_url}?access_token={access_token}"
+
+            payload = json.dumps(
+                {
+                    "messages": [{"role": "user", "content": prompt}],
+                    "temperature": 0.5,
+                    "top_p": 0.8,
+                    "penalty_score": 1,
+                    "disable_search": False,
+                    "enable_citation": False,
+                    "response_format": "text",
+                }
+            )
+            headers = {"Content-Type": "application/json"}
+
+            response = requests.request(
+                "POST", url, headers=headers, data=payload
+            ).json()
+            return response.get("result")
+
+        if llm_provider == "azure":
+            client = AzureOpenAI(
+                api_key=api_key,
+                api_version=api_version,
+                azure_endpoint=base_url,
+            )
+        else:
+            client = OpenAI(
+                api_key=api_key,
+                base_url=base_url,
+            )
+
+        response = client.chat.completions.create(
+            model=model_name, messages=[{"role": "user", "content": prompt}]
+        )
+        if response:
+            if isinstance(response, ChatCompletion):
+                content = response.choices[0].message.content
+            else:
+                raise Exception(
+                    f'[{llm_provider}] returned an invalid response: "{response}", please check your network '
+                    f"connection and try again."
+                )
+        else:
+            raise Exception(
+                f"[{llm_provider}] returned an empty response, please check your network connection and try again."
+            )
+
+    return content.replace("\n", "")
+
+
+def generate_script(
+    video_subject: str, language: str = "", paragraph_number: int = 1
+) -> str:
+    prompt = f"""
+# Role: Video Script Generator
+
+## Goals:
+Generate a script for a video, depending on the subject of the video.
+
+## Constrains:
+1. the script is to be returned as a string with the specified number of paragraphs.
+2. do not under any circumstance reference this prompt in your response.
+3. get straight to the point, don't start with unnecessary things like, "welcome to this video".
+4. you must not include any type of markdown or formatting in the script, never use a title.
+5. only return the raw content of the script.
+6. do not include "voiceover", "narrator" or similar indicators of what should be spoken at the beginning of each paragraph or line.
+7. you must not mention the prompt, or anything about the script itself. also, never talk about the amount of paragraphs or lines. just write the script.
+8. respond in the same language as the video subject.
+
+# Initialization:
+- video subject: {video_subject}
+- number of paragraphs: {paragraph_number}
+""".strip()
+    if language:
+        prompt += f"\n- language: {language}"
+
+    final_script = ""
+    logger.info(f"subject: {video_subject}")
+
+    def format_response(response):
+        # Clean the script
+        # Remove asterisks, hashes
+        response = response.replace("*", "")
+        response = response.replace("#", "")
+
+        # Remove markdown syntax
+        response = re.sub(r"\[.*\]", "", response)
+        response = re.sub(r"\(.*\)", "", response)
+
+        # Split the script into paragraphs
+        paragraphs = response.split("\n\n")
+
+        # Select the specified number of paragraphs
+        selected_paragraphs = paragraphs[:paragraph_number]
+
+        # Join the selected paragraphs into a single string
+        return "\n\n".join(selected_paragraphs)
+
+    for i in range(_max_retries):
+        try:
+            response = _generate_response(prompt=prompt)
+            if response:
+                final_script = format_response(response)
+            else:
+                logging.error("gpt returned an empty response")
+
+            # g4f may return an error message
+            if final_script and "当日额度已消耗完" in final_script:
+                raise ValueError(final_script)
+
+            if final_script:
+                break
+        except Exception as e:
+            logger.error(f"failed to generate script: {e}")
+
+        if i < _max_retries:
+            logger.warning(f"failed to generate video script, trying again... {i + 1}")
+
+    logger.success(f"completed: \n{final_script}")
+    return final_script.strip()
+
+
+def generate_terms(video_subject: str, video_script: str, amount: int = 5) -> List[str]:
+    prompt = f"""
+# Role: Video Search Terms Generator
+
+## Goals:
+Generate {amount} search terms for stock videos, depending on the subject of a video.
+
+## Constrains:
+1. the search terms are to be returned as a json-array of strings.
+2. each search term should consist of 1-3 words, always add the main subject of the video.
+3. you must only return the json-array of strings. you must not return anything else. you must not return the script.
+4.
the search terms must be related to the subject of the video. +5. reply with english search terms only. + +## Output Example: +["search term 1", "search term 2", "search term 3","search term 4","search term 5"] + +## Context: +### Video Subject +{video_subject} + +### Video Script +{video_script} + +Please note that you must use English for generating video search terms; Chinese is not accepted. +""".strip() + + logger.info(f"subject: {video_subject}") + + search_terms = [] + response = "" + for i in range(_max_retries): + try: + response = _generate_response(prompt) + search_terms = json.loads(response) + if not isinstance(search_terms, list) or not all( + isinstance(term, str) for term in search_terms + ): + logger.error("response is not a list of strings.") + continue + + except Exception as e: + logger.warning(f"failed to generate video terms: {str(e)}") + if response: + match = re.search(r"\[.*]", response) + if match: + try: + search_terms = json.loads(match.group()) + except Exception as e: + logger.warning(f"failed to generate video terms: {str(e)}") + pass + + if search_terms and len(search_terms) > 0: + break + if i < _max_retries: + logger.warning(f"failed to generate video terms, trying again... {i + 1}") + + logger.success(f"completed: \n{search_terms}") + return search_terms + + +def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot: str) -> str: + ''' + 使用 gemini-1.5-pro 进行影视解析 + Args: + video_origin_name: str - 影视作品的原始名称 + video_origin_path: str - 影视作品的原始路径 + video_plot: str - 影视作品的简介或剧情概述 + + Return: + str - 解析后的 JSON 格式字符串 + ''' + api_key = config.app.get("gemini_api_key") + model_name = config.app.get("gemini_model_name") + + gemini.configure(api_key=api_key) + model = gemini.GenerativeModel(model_name=model_name) + + prompt = """ +# Role: 影视解说专家 + +## Background: +擅长根据剧情描述视频的画面和故事,能够生成一段非常有趣的解说文案。 + +## Goals: +1. 根据剧情描述视频的画面和故事,并对重要的画面进行展开叙述 +2. 根据剧情内容,生成符合 tiktok/抖音 风格的影视解说文案 +3. 将结果直接以json格式输出给用户,需要包含字段: picture 画面描述, timestamp 时间戳, narration 解说文案 +4. 
+
+
+def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot: str) -> str:
+    """
+    Use gemini-1.5-pro to analyze a film/TV video.
+    Args:
+        video_origin_name: str - original title of the video
+        video_origin_path: str - local path of the source video
+        video_plot: str - synopsis or plot summary of the video
+
+    Return:
+        str - analysis result as a JSON-formatted string
+    """
+    api_key = config.app.get("gemini_api_key")
+    model_name = config.app.get("gemini_model_name")
+
+    gemini.configure(api_key=api_key)
+    model = gemini.GenerativeModel(model_name=model_name)
+
+    prompt = """
+# Role: 影视解说专家
+
+## Background:
+擅长根据剧情描述视频的画面和故事,能够生成一段非常有趣的解说文案。
+
+## Goals:
+1. 根据剧情描述视频的画面和故事,并对重要的画面进行展开叙述
+2. 根据剧情内容,生成符合 tiktok/抖音 风格的影视解说文案
+3. 将结果直接以json格式输出给用户,需要包含字段: picture 画面描述, timestamp 时间戳, narration 解说文案
+4. 剧情内容如下:{%s}
+
+## Skills
+- 精通 tiktok/抖音 等短视频影视解说文案撰写
+- 能够理解视频中的故事和画面表现
+- 能精准匹配视频中的画面和时间戳
+- 能精准把控旁白和时长
+- 精通中文
+- 精通JSON数据格式
+
+## Constraints
+- 解说文案的时长要和时间戳的时长尽量匹配
+- 忽略视频中关于广告的内容
+- 忽略视频中片头和片尾
+- 不得在脚本中包含任何类型的 Markdown 或格式
+
+## Format
+- 对应JSON的key为:picture, timestamp, narration
+""" % video_plot
+    logger.debug(f"视频名称: {video_origin_name}")
+    try:
+        import time
+
+        gemini_video_file = gemini.upload_file(video_origin_path)
+        logger.debug(f"上传视频至 Google cloud 成功: {gemini_video_file.name}")
+        while gemini_video_file.state.name == "PROCESSING":
+            time.sleep(1)
+            gemini_video_file = gemini.get_file(gemini_video_file.name)
+            logger.debug(f"视频当前状态(ACTIVE才可用): {gemini_video_file.state.name}")
+        if gemini_video_file.state.name == "FAILED":
+            raise ValueError(gemini_video_file.state.name)
+    except Exception as e:
+        logger.error("上传视频至 Google cloud 失败, 请检查 VPN 配置和 APIKey 是否正确")
+        raise TimeoutError("上传视频至 Google cloud 失败, 请检查 VPN 配置和 APIKey 是否正确") from e
+
+    streams = model.generate_content([prompt, gemini_video_file], stream=True)
+    response = []
+    for chunk in streams:
+        response.append(chunk.text)
+
+    response = "".join(response)
+    logger.success(f"llm response: \n{response}")
+
+    return response
+
+
+if __name__ == "__main__":
+    video_plot = ""
+    res = gemini_video2json("test", "/NarratoAI/resource/videos/test.mp4", video_plot)
+    print(res)
+
+    # video_subject = "生命的意义是什么"
+    # script = generate_script(
+    #     video_subject=video_subject, language="zh-CN", paragraph_number=1
+    # )
+    # print("######################")
+    # print(script)
+    # search_terms = generate_terms(
+    #     video_subject=video_subject, video_script=script, amount=5
+    # )
+    # print("######################")
+    # print(search_terms)
diff --git a/app/services/material.py b/app/services/material.py
new file mode 100644
index 0000000..7eca553
--- /dev/null
+++ b/app/services/material.py
@@ -0,0 +1,335 @@
+import os
+import random
+from urllib.parse import urlencode
+
+import requests
+from typing import List
+from loguru import logger
+from moviepy.video.io.VideoFileClip import VideoFileClip
+
+from app.config import config
+from app.models.schema import VideoAspect, VideoConcatMode, MaterialInfo
+from app.utils import utils
+
+requested_count = 0
+
+
+def get_api_key(cfg_key: str):
+    api_keys = config.app.get(cfg_key)
+    if not api_keys:
+        raise ValueError(
+            f"\n\n##### {cfg_key} is not set #####\n\nPlease set it in the config.toml file: {config.config_file}\n\n"
+            f"{utils.to_json(config.app)}"
+        )
+
+    # if only one key is provided, return it
+    if isinstance(api_keys, str):
+        return api_keys
+
+    global requested_count
+    requested_count += 1
+    return api_keys[requested_count % len(api_keys)]
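+
+
+# Illustrative: with pexels_api_keys = ["k1", "k2"] in config.toml, successive calls
+# return k2, k1, k2, ... so request quotas are spread across the configured keys.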
response["videos"] + # loop through each video in the result + for v in videos: + duration = v["duration"] + # check if video has desired minimum duration + if duration < minimum_duration: + continue + video_files = v["video_files"] + # loop through each url to determine the best quality + for video in video_files: + w = int(video["width"]) + h = int(video["height"]) + if w == video_width and h == video_height: + item = MaterialInfo() + item.provider = "pexels" + item.url = video["link"] + item.duration = duration + video_items.append(item) + break + return video_items + except Exception as e: + logger.error(f"search videos failed: {str(e)}") + + return [] + + +def search_videos_pixabay( + search_term: str, + minimum_duration: int, + video_aspect: VideoAspect = VideoAspect.portrait, +) -> List[MaterialInfo]: + aspect = VideoAspect(video_aspect) + + video_width, video_height = aspect.to_resolution() + + api_key = get_api_key("pixabay_api_keys") + # Build URL + params = { + "q": search_term, + "video_type": "all", # Accepted values: "all", "film", "animation" + "per_page": 50, + "key": api_key, + } + query_url = f"https://pixabay.com/api/videos/?{urlencode(params)}" + logger.info(f"searching videos: {query_url}, with proxies: {config.proxy}") + + try: + r = requests.get( + query_url, proxies=config.proxy, verify=False, timeout=(30, 60) + ) + response = r.json() + video_items = [] + if "hits" not in response: + logger.error(f"search videos failed: {response}") + return video_items + videos = response["hits"] + # loop through each video in the result + for v in videos: + duration = v["duration"] + # check if video has desired minimum duration + if duration < minimum_duration: + continue + video_files = v["videos"] + # loop through each url to determine the best quality + for video_type in video_files: + video = video_files[video_type] + w = int(video["width"]) + h = int(video["height"]) + if w >= video_width: + item = MaterialInfo() + item.provider = "pixabay" + item.url = video["url"] + item.duration = duration + video_items.append(item) + break + return video_items + except Exception as e: + logger.error(f"search videos failed: {str(e)}") + + return [] + + +def save_video(video_url: str, save_dir: str = "") -> str: + if not save_dir: + save_dir = utils.storage_dir("cache_videos") + + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + url_without_query = video_url.split("?")[0] + url_hash = utils.md5(url_without_query) + video_id = f"vid-{url_hash}" + video_path = f"{save_dir}/{video_id}.mp4" + + # if video already exists, return the path + if os.path.exists(video_path) and os.path.getsize(video_path) > 0: + logger.info(f"video already exists: {video_path}") + return video_path + + # if video does not exist, download it + with open(video_path, "wb") as f: + f.write( + requests.get( + video_url, proxies=config.proxy, verify=False, timeout=(60, 240) + ).content + ) + + if os.path.exists(video_path) and os.path.getsize(video_path) > 0: + try: + clip = VideoFileClip(video_path) + duration = clip.duration + fps = clip.fps + clip.close() + if duration > 0 and fps > 0: + return video_path + except Exception as e: + try: + os.remove(video_path) + except Exception as e: + logger.warning(f"无效的视频文件: {video_path} => {str(e)}") + return "" + + +def download_videos( + task_id: str, + search_terms: List[str], + source: str = "pexels", + video_aspect: VideoAspect = VideoAspect.portrait, + video_contact_mode: VideoConcatMode = VideoConcatMode.random, + audio_duration: float = 0.0, + 
+    max_clip_duration: int = 5,
+) -> List[str]:
+    valid_video_items = []
+    valid_video_urls = []
+    found_duration = 0.0
+    search_videos = search_videos_pexels
+    if source == "pixabay":
+        search_videos = search_videos_pixabay
+
+    for search_term in search_terms:
+        video_items = search_videos(
+            search_term=search_term,
+            minimum_duration=max_clip_duration,
+            video_aspect=video_aspect,
+        )
+        logger.info(f"found {len(video_items)} videos for '{search_term}'")
+
+        for item in video_items:
+            if item.url not in valid_video_urls:
+                valid_video_items.append(item)
+                valid_video_urls.append(item.url)
+                found_duration += item.duration
+
+    logger.info(
+        f"found total videos: {len(valid_video_items)}, required duration: {audio_duration} seconds, found duration: {found_duration} seconds"
+    )
+    video_paths = []
+
+    material_directory = config.app.get("material_directory", "").strip()
+    if material_directory == "task":
+        material_directory = utils.task_dir(task_id)
+    elif material_directory and not os.path.isdir(material_directory):
+        material_directory = ""
+
+    if video_contact_mode.value == VideoConcatMode.random.value:
+        random.shuffle(valid_video_items)
+
+    total_duration = 0.0
+    for item in valid_video_items:
+        try:
+            logger.info(f"downloading video: {item.url}")
+            saved_video_path = save_video(
+                video_url=item.url, save_dir=material_directory
+            )
+            if saved_video_path:
+                logger.info(f"video saved: {saved_video_path}")
+                video_paths.append(saved_video_path)
+                seconds = min(max_clip_duration, item.duration)
+                total_duration += seconds
+                if total_duration > audio_duration:
+                    logger.info(
+                        f"total duration of downloaded videos: {total_duration} seconds, skip downloading more"
+                    )
+                    break
+        except Exception as e:
+            logger.error(f"failed to download video: {utils.to_json(item)} => {str(e)}")
+    logger.success(f"downloaded {len(video_paths)} videos")
+    return video_paths
+
+
+def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> dict:
+    """
+    Save a clipped segment of the source video.
+    Args:
+        timestamp: single timestamp range to cut, e.g. '00:36-00:40'
+        origin_video: path of the source video
+        save_dir: output directory
+
+    Returns:
+        dict mapping the timestamp to the clipped video path
+    """
+    if not save_dir:
+        save_dir = utils.storage_dir("cache_videos")
+
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+
+    video_id = f"vid-{timestamp.replace(':', '_')}"
+    video_path = f"{save_dir}/{video_id}.mp4"
+
+    if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
+        logger.info(f"video already exists: {video_path}")
+        return {timestamp: video_path}
+
+    # cut the requested segment out of the source video
+    start, end = utils.split_timestamp(timestamp)
+    video = VideoFileClip(origin_video).subclip(start, end)
+    video.write_videofile(video_path)
+
+    # sanity-check the clipped file; drop it if moviepy cannot read it
+    if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
+        try:
+            clip = VideoFileClip(video_path)
+            duration = clip.duration
+            fps = clip.fps
+            clip.close()
+            if duration > 0 and fps > 0:
+                return {timestamp: video_path}
+        except Exception as e:
+            logger.warning(f"invalid video file: {video_path} => {str(e)}")
+            try:
+                os.remove(video_path)
+            except OSError:
+                pass
+    return {}
+
+
+def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str) -> dict:
+    """
+    Clip the source video into sub-clips.
+    Args:
+        task_id: task id
+        timestamp_terms: list of timestamp ranges to cut, e.g. ['00:00-00:20', '00:36-00:40', '07:07-07:22']
+        origin_video: path of the source video
+
+    Returns:
+        dict mapping each timestamp to its clipped video path
+    """
+    video_paths = {}
+    logger.info(f"需要裁剪 '{origin_video}' 为 {len(timestamp_terms)} 个视频")
+    for item in timestamp_terms:
+        material_directory = config.app.get("material_directory", "").strip()
+        if 
material_directory == "task": + material_directory = utils.task_dir(task_id) + elif material_directory and not os.path.isdir(material_directory): + material_directory = "" + + try: + logger.info(f"clip video: {item}") + saved_video_path = save_clip_video(timestamp=item, origin_video=origin_video, save_dir=material_directory) + if saved_video_path: + logger.info(f"video saved: {saved_video_path}") + video_paths.update(saved_video_path) + except Exception as e: + logger.error(f"视频裁剪失败: {utils.to_json(item)} => {str(e)}") + return {} + logger.success(f"裁剪 {len(video_paths)} videos") + return video_paths + + +if __name__ == "__main__": + download_videos( + "test123", ["Money Exchange Medium"], audio_duration=100, source="pixabay" + ) diff --git a/app/services/state.py b/app/services/state.py new file mode 100644 index 0000000..51904fb --- /dev/null +++ b/app/services/state.py @@ -0,0 +1,122 @@ +import ast +from abc import ABC, abstractmethod +from app.config import config +from app.models import const + + +# Base class for state management +class BaseState(ABC): + @abstractmethod + def update_task(self, task_id: str, state: int, progress: int = 0, **kwargs): + pass + + @abstractmethod + def get_task(self, task_id: str): + pass + + +# Memory state management +class MemoryState(BaseState): + def __init__(self): + self._tasks = {} + + def update_task( + self, + task_id: str, + state: int = const.TASK_STATE_PROCESSING, + progress: int = 0, + **kwargs, + ): + progress = int(progress) + if progress > 100: + progress = 100 + + self._tasks[task_id] = { + "state": state, + "progress": progress, + **kwargs, + } + + def get_task(self, task_id: str): + return self._tasks.get(task_id, None) + + def delete_task(self, task_id: str): + if task_id in self._tasks: + del self._tasks[task_id] + + +# Redis state management +class RedisState(BaseState): + def __init__(self, host="localhost", port=6379, db=0, password=None): + import redis + + self._redis = redis.StrictRedis(host=host, port=port, db=db, password=password) + + def update_task( + self, + task_id: str, + state: int = const.TASK_STATE_PROCESSING, + progress: int = 0, + **kwargs, + ): + progress = int(progress) + if progress > 100: + progress = 100 + + fields = { + "state": state, + "progress": progress, + **kwargs, + } + + for field, value in fields.items(): + self._redis.hset(task_id, field, str(value)) + + def get_task(self, task_id: str): + task_data = self._redis.hgetall(task_id) + if not task_data: + return None + + task = { + key.decode("utf-8"): self._convert_to_original_type(value) + for key, value in task_data.items() + } + return task + + def delete_task(self, task_id: str): + self._redis.delete(task_id) + + @staticmethod + def _convert_to_original_type(value): + """ + Convert the value from byte string to its original data type. + You can extend this method to handle other data types as needed. 
+ """ + value_str = value.decode("utf-8") + + try: + # try to convert byte string array to list + return ast.literal_eval(value_str) + except (ValueError, SyntaxError): + pass + + if value_str.isdigit(): + return int(value_str) + # Add more conversions here if needed + return value_str + + +# Global state +_enable_redis = config.app.get("enable_redis", False) +_redis_host = config.app.get("redis_host", "localhost") +_redis_port = config.app.get("redis_port", 6379) +_redis_db = config.app.get("redis_db", 0) +_redis_password = config.app.get("redis_password", None) + +state = ( + RedisState( + host=_redis_host, port=_redis_port, db=_redis_db, password=_redis_password + ) + if _enable_redis + else MemoryState() +) diff --git a/app/services/subtitle.py b/app/services/subtitle.py new file mode 100644 index 0000000..ba6e224 --- /dev/null +++ b/app/services/subtitle.py @@ -0,0 +1,299 @@ +import json +import os.path +import re + +from faster_whisper import WhisperModel +from timeit import default_timer as timer +from loguru import logger + +from app.config import config +from app.utils import utils + +model_size = config.whisper.get("model_size", "large-v3") +device = config.whisper.get("device", "cpu") +compute_type = config.whisper.get("compute_type", "int8") +model = None + + +def create(audio_file, subtitle_file: str = ""): + global model + if not model: + model_path = f"{utils.root_dir()}/models/whisper-{model_size}" + model_bin_file = f"{model_path}/model.bin" + if not os.path.isdir(model_path) or not os.path.isfile(model_bin_file): + model_path = model_size + + logger.info( + f"loading model: {model_path}, device: {device}, compute_type: {compute_type}" + ) + try: + model = WhisperModel( + model_size_or_path=model_path, device=device, compute_type=compute_type + ) + except Exception as e: + logger.error( + f"failed to load model: {e} \n\n" + f"********************************************\n" + f"this may be caused by network issue. \n" + f"please download the model manually and put it in the 'models' folder. 
\n" + f"see [README.md FAQ](https://github.com/harry0703/NarratoAI) for more details.\n" + f"********************************************\n\n" + ) + return None + + logger.info(f"start, output file: {subtitle_file}") + if not subtitle_file: + subtitle_file = f"{audio_file}.srt" + + segments, info = model.transcribe( + audio_file, + beam_size=5, + word_timestamps=True, + vad_filter=True, + vad_parameters=dict(min_silence_duration_ms=500), + ) + + logger.info( + f"detected language: '{info.language}', probability: {info.language_probability:.2f}" + ) + + start = timer() + subtitles = [] + + def recognized(seg_text, seg_start, seg_end): + seg_text = seg_text.strip() + if not seg_text: + return + + msg = "[%.2fs -> %.2fs] %s" % (seg_start, seg_end, seg_text) + logger.debug(msg) + + subtitles.append( + {"msg": seg_text, "start_time": seg_start, "end_time": seg_end} + ) + + for segment in segments: + words_idx = 0 + words_len = len(segment.words) + + seg_start = 0 + seg_end = 0 + seg_text = "" + + if segment.words: + is_segmented = False + for word in segment.words: + if not is_segmented: + seg_start = word.start + is_segmented = True + + seg_end = word.end + # 如果包含标点,则断句 + seg_text += word.word + + if utils.str_contains_punctuation(word.word): + # remove last char + seg_text = seg_text[:-1] + if not seg_text: + continue + + recognized(seg_text, seg_start, seg_end) + + is_segmented = False + seg_text = "" + + if words_idx == 0 and segment.start < word.start: + seg_start = word.start + if words_idx == (words_len - 1) and segment.end > word.end: + seg_end = word.end + words_idx += 1 + + if not seg_text: + continue + + recognized(seg_text, seg_start, seg_end) + + end = timer() + + diff = end - start + logger.info(f"complete, elapsed: {diff:.2f} s") + + idx = 1 + lines = [] + for subtitle in subtitles: + text = subtitle.get("msg") + if text: + lines.append( + utils.text_to_srt( + idx, text, subtitle.get("start_time"), subtitle.get("end_time") + ) + ) + idx += 1 + + sub = "\n".join(lines) + "\n" + with open(subtitle_file, "w", encoding="utf-8") as f: + f.write(sub) + logger.info(f"subtitle file created: {subtitle_file}") + + +def file_to_subtitles(filename): + if not filename or not os.path.isfile(filename): + return [] + + times_texts = [] + current_times = None + current_text = "" + index = 0 + with open(filename, "r", encoding="utf-8") as f: + for line in f: + times = re.findall("([0-9]*:[0-9]*:[0-9]*,[0-9]*)", line) + if times: + current_times = line + elif line.strip() == "" and current_times: + index += 1 + times_texts.append((index, current_times.strip(), current_text.strip())) + current_times, current_text = None, "" + elif current_times: + current_text += line + return times_texts + + +def levenshtein_distance(s1, s2): + if len(s1) < len(s2): + return levenshtein_distance(s2, s1) + + if len(s2) == 0: + return len(s1) + + previous_row = range(len(s2) + 1) + for i, c1 in enumerate(s1): + current_row = [i + 1] + for j, c2 in enumerate(s2): + insertions = previous_row[j + 1] + 1 + deletions = current_row[j] + 1 + substitutions = previous_row[j] + (c1 != c2) + current_row.append(min(insertions, deletions, substitutions)) + previous_row = current_row + + return previous_row[-1] + + +def similarity(a, b): + distance = levenshtein_distance(a.lower(), b.lower()) + max_length = max(len(a), len(b)) + return 1 - (distance / max_length) + + +def correct(subtitle_file, video_script): + subtitle_items = file_to_subtitles(subtitle_file) + script_lines = utils.split_string_by_punctuations(video_script) + + 
corrected = False + new_subtitle_items = [] + script_index = 0 + subtitle_index = 0 + + while script_index < len(script_lines) and subtitle_index < len(subtitle_items): + script_line = script_lines[script_index].strip() + subtitle_line = subtitle_items[subtitle_index][2].strip() + + if script_line == subtitle_line: + new_subtitle_items.append(subtitle_items[subtitle_index]) + script_index += 1 + subtitle_index += 1 + else: + combined_subtitle = subtitle_line + start_time = subtitle_items[subtitle_index][1].split(" --> ")[0] + end_time = subtitle_items[subtitle_index][1].split(" --> ")[1] + next_subtitle_index = subtitle_index + 1 + + while next_subtitle_index < len(subtitle_items): + next_subtitle = subtitle_items[next_subtitle_index][2].strip() + if similarity( + script_line, combined_subtitle + " " + next_subtitle + ) > similarity(script_line, combined_subtitle): + combined_subtitle += " " + next_subtitle + end_time = subtitle_items[next_subtitle_index][1].split(" --> ")[1] + next_subtitle_index += 1 + else: + break + + if similarity(script_line, combined_subtitle) > 0.8: + logger.warning( + f"Merged/Corrected - Script: {script_line}, Subtitle: {combined_subtitle}" + ) + new_subtitle_items.append( + ( + len(new_subtitle_items) + 1, + f"{start_time} --> {end_time}", + script_line, + ) + ) + corrected = True + else: + logger.warning( + f"Mismatch - Script: {script_line}, Subtitle: {combined_subtitle}" + ) + new_subtitle_items.append( + ( + len(new_subtitle_items) + 1, + f"{start_time} --> {end_time}", + script_line, + ) + ) + corrected = True + + script_index += 1 + subtitle_index = next_subtitle_index + + # 处理剩余的脚本行 + while script_index < len(script_lines): + logger.warning(f"Extra script line: {script_lines[script_index]}") + if subtitle_index < len(subtitle_items): + new_subtitle_items.append( + ( + len(new_subtitle_items) + 1, + subtitle_items[subtitle_index][1], + script_lines[script_index], + ) + ) + subtitle_index += 1 + else: + new_subtitle_items.append( + ( + len(new_subtitle_items) + 1, + "00:00:00,000 --> 00:00:00,000", + script_lines[script_index], + ) + ) + script_index += 1 + corrected = True + + if corrected: + with open(subtitle_file, "w", encoding="utf-8") as fd: + for i, item in enumerate(new_subtitle_items): + fd.write(f"{i + 1}\n{item[1]}\n{item[2]}\n\n") + logger.info("Subtitle corrected") + else: + logger.success("Subtitle is correct") + + +if __name__ == "__main__": + task_id = "c12fd1e6-4b0a-4d65-a075-c87abe35a072" + task_dir = utils.task_dir(task_id) + subtitle_file = f"{task_dir}/subtitle.srt" + audio_file = f"{task_dir}/audio.mp3" + + subtitles = file_to_subtitles(subtitle_file) + print(subtitles) + + script_file = f"{task_dir}/script.json" + with open(script_file, "r") as f: + script_content = f.read() + s = json.loads(script_content) + script = s.get("script") + + correct(subtitle_file, script) + + subtitle_file = f"{task_dir}/subtitle-test.srt" + create(audio_file, subtitle_file) diff --git a/app/services/task.py b/app/services/task.py new file mode 100644 index 0000000..2f9e365 --- /dev/null +++ b/app/services/task.py @@ -0,0 +1,473 @@ +import math +import json +import os.path +import re +from os import path + +from edge_tts import SubMaker +from loguru import logger + +from app.config import config +from app.models import const +from app.models.schema import VideoConcatMode, VideoParams, VideoClipParams +from app.services import llm, material, subtitle, video, voice +from app.services import state as sm +from app.utils import utils + + +def 
generate_script(task_id, params): + logger.info("\n\n## generating video script") + video_script = params.video_script.strip() + if not video_script: + video_script = llm.generate_script( + video_subject=params.video_subject, + language=params.video_language, + paragraph_number=params.paragraph_number, + ) + else: + logger.debug(f"video script: \n{video_script}") + + if not video_script: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) + logger.error("failed to generate video script.") + return None + + return video_script + + +def generate_terms(task_id, params, video_script): + logger.info("\n\n## generating video terms") + video_terms = params.video_terms + if not video_terms: + video_terms = llm.generate_terms( + video_subject=params.video_subject, video_script=video_script, amount=5 + ) + else: + if isinstance(video_terms, str): + video_terms = [term.strip() for term in re.split(r"[,,]", video_terms)] + elif isinstance(video_terms, list): + video_terms = [term.strip() for term in video_terms] + else: + raise ValueError("video_terms must be a string or a list of strings.") + + logger.debug(f"video terms: {utils.to_json(video_terms)}") + + if not video_terms: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) + logger.error("failed to generate video terms.") + return None + + return video_terms + + +def save_script_data(task_id, video_script, video_terms, params): + script_file = path.join(utils.task_dir(task_id), "script.json") + script_data = { + "script": video_script, + "search_terms": video_terms, + "params": params, + } + + with open(script_file, "w", encoding="utf-8") as f: + f.write(utils.to_json(script_data)) + + +def generate_audio(task_id, params, video_script): + logger.info("\n\n## generating audio") + audio_file = path.join(utils.task_dir(task_id), "audio.mp3") + sub_maker = voice.tts( + text=video_script, + voice_name=voice.parse_voice_name(params.voice_name), + voice_rate=params.voice_rate, + voice_file=audio_file, + ) + if sub_maker is None: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) + logger.error( + """failed to generate audio: +1. check if the language of the voice matches the language of the video script. +2. check if the network is available. If you are in China, it is recommended to use a VPN and enable the global traffic mode. 
+ """.strip() + ) + return None, None, None + + audio_duration = math.ceil(voice.get_audio_duration(sub_maker)) + return audio_file, audio_duration, sub_maker + + +def generate_subtitle(task_id, params, video_script, sub_maker, audio_file): + if not params.subtitle_enabled: + return "" + + subtitle_path = path.join(utils.task_dir(task_id), "subtitle.srt") + subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() + logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}") + + subtitle_fallback = False + if subtitle_provider == "edge": + voice.create_subtitle( + text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path + ) + if not os.path.exists(subtitle_path): + subtitle_fallback = True + logger.warning("subtitle file not found, fallback to whisper") + + if subtitle_provider == "whisper" or subtitle_fallback: + subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) + logger.info("\n\n## correcting subtitle") + subtitle.correct(subtitle_file=subtitle_path, video_script=video_script) + + subtitle_lines = subtitle.file_to_subtitles(subtitle_path) + if not subtitle_lines: + logger.warning(f"subtitle file is invalid: {subtitle_path}") + return "" + + return subtitle_path + + +def get_video_materials(task_id, params, video_terms, audio_duration): + if params.video_source == "local": + logger.info("\n\n## preprocess local materials") + materials = video.preprocess_video( + materials=params.video_materials, clip_duration=params.video_clip_duration + ) + if not materials: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) + logger.error( + "no valid materials found, please check the materials and try again." + ) + return None + return [material_info.url for material_info in materials] + else: + logger.info(f"\n\n## downloading videos from {params.video_source}") + downloaded_videos = material.download_videos( + task_id=task_id, + search_terms=video_terms, + source=params.video_source, + video_aspect=params.video_aspect, + video_contact_mode=params.video_concat_mode, + audio_duration=audio_duration * params.video_count, + max_clip_duration=params.video_clip_duration, + ) + if not downloaded_videos: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) + logger.error( + "failed to download videos, maybe the network is not available. if you are in China, please use a VPN." 
+ ) + return None + return downloaded_videos + + +def generate_final_videos( + task_id, params, downloaded_videos, audio_file, subtitle_path +): + final_video_paths = [] + combined_video_paths = [] + video_concat_mode = ( + params.video_concat_mode if params.video_count == 1 else VideoConcatMode.random + ) + + _progress = 50 + for i in range(params.video_count): + index = i + 1 + combined_video_path = path.join( + utils.task_dir(task_id), f"combined-{index}.mp4" + ) + logger.info(f"\n\n## combining video: {index} => {combined_video_path}") + video.combine_videos( + combined_video_path=combined_video_path, + video_paths=downloaded_videos, + audio_file=audio_file, + video_aspect=params.video_aspect, + video_concat_mode=video_concat_mode, + max_clip_duration=params.video_clip_duration, + threads=params.n_threads, + ) + + _progress += 50 / params.video_count / 2 + sm.state.update_task(task_id, progress=_progress) + + final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4") + + logger.info(f"\n\n## generating video: {index} => {final_video_path}") + video.generate_video( + video_path=combined_video_path, + audio_path=audio_file, + subtitle_path=subtitle_path, + output_file=final_video_path, + params=params, + ) + + _progress += 50 / params.video_count / 2 + sm.state.update_task(task_id, progress=_progress) + + final_video_paths.append(final_video_path) + combined_video_paths.append(combined_video_path) + + return final_video_paths, combined_video_paths + + +def start(task_id, params: VideoParams, stop_at: str = "video"): + logger.info(f"start task: {task_id}, stop_at: {stop_at}") + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5) + + if type(params.video_concat_mode) is str: + params.video_concat_mode = VideoConcatMode(params.video_concat_mode) + + # 1. Generate script + video_script = generate_script(task_id, params) + if not video_script: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) + return + + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=10) + + if stop_at == "script": + sm.state.update_task( + task_id, state=const.TASK_STATE_COMPLETE, progress=100, script=video_script + ) + return {"script": video_script} + + # 2. Generate terms + video_terms = "" + if params.video_source != "local": + video_terms = generate_terms(task_id, params, video_script) + if not video_terms: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) + return + + save_script_data(task_id, video_script, video_terms, params) + + if stop_at == "terms": + sm.state.update_task( + task_id, state=const.TASK_STATE_COMPLETE, progress=100, terms=video_terms + ) + return {"script": video_script, "terms": video_terms} + + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20) + + # 3. Generate audio + audio_file, audio_duration, sub_maker = generate_audio(task_id, params, video_script) + if not audio_file: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) + return + + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30) + + if stop_at == "audio": + sm.state.update_task( + task_id, + state=const.TASK_STATE_COMPLETE, + progress=100, + audio_file=audio_file, + ) + return {"audio_file": audio_file, "audio_duration": audio_duration} + + # 4. 
Generate subtitle + subtitle_path = generate_subtitle(task_id, params, video_script, sub_maker, audio_file) + + if stop_at == "subtitle": + sm.state.update_task( + task_id, + state=const.TASK_STATE_COMPLETE, + progress=100, + subtitle_path=subtitle_path, + ) + return {"subtitle_path": subtitle_path} + + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40) + + # 5. Get video materials + downloaded_videos = get_video_materials( + task_id, params, video_terms, audio_duration + ) + if not downloaded_videos: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) + return + + if stop_at == "materials": + sm.state.update_task( + task_id, + state=const.TASK_STATE_COMPLETE, + progress=100, + materials=downloaded_videos, + ) + return {"materials": downloaded_videos} + + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50) + + # 6. Generate final videos + final_video_paths, combined_video_paths = generate_final_videos( + task_id, params, downloaded_videos, audio_file, subtitle_path + ) + + if not final_video_paths: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) + return + + logger.success( + f"task {task_id} finished, generated {len(final_video_paths)} videos." + ) + + kwargs = { + "videos": final_video_paths, + "combined_videos": combined_video_paths, + "script": video_script, + "terms": video_terms, + "audio_file": audio_file, + "audio_duration": audio_duration, + "subtitle_path": subtitle_path, + "materials": downloaded_videos, + } + sm.state.update_task( + task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs + ) + return kwargs + + +def start_subclip(task_id, params: VideoClipParams, subclip_path_videos): + """ + 后台任务(自动剪辑视频进行剪辑) + """ + logger.info(f"\n\n## 开始任务: {task_id}") + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5) + + voice_name = voice.parse_voice_name(params.voice_name) + # voice_name = 'zh-CN-XiaoyiNeural' + paragraph_number = params.paragraph_number + n_threads = params.n_threads + max_clip_duration = params.video_clip_duration + + logger.info("\n\n## 1. 读取json") + # video_script = params.video_script.strip() + # 搜索 ../storage 目录下 名称为 video_subject 的docx文件,并读出所有字符串 + # video_script_path = path.join('E:\\Projects\\linyq\\MoneyPrinterLin\\txt.txt\\txt2.json') + video_script_path = path.join(params.video_clip_json) + # 判断json文件是否存在 + if path.exists(video_script_path): + # 读取json文件内容,并转为dict + with open(video_script_path, "r", encoding="utf-8") as f: + list_script = json.load(f) + video_list = [i['narration'] for i in list_script] + time_list = [i['timestamp'] for i in list_script] + + video_script = " ".join(video_list) + logger.debug(f"原json脚本: \n{video_script}") + logger.debug(f"原json时间戳: \n{time_list}") + + else: + print("#@#@#@", params.video_clip_json) + raise ValueError("解说文案不存在!检查文案名称是否正确。") + + # video_script = llm.text_polishing(context=video_script, language=params.video_language) + # logger.debug(f"润色后的视频脚本: \n{video_script}") + # sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=10) + + logger.info("\n\n## 2. 
生成音频") + audio_file = path.join(utils.task_dir(task_id), f"audio.mp3") + sub_maker = voice.tts(text=video_script, voice_name=voice_name, voice_file=audio_file, voice_rate=params.voice_rate) + if sub_maker is None: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) + logger.error( + "无法生成音频,可能是网络不可用。如果您在中国,请使用VPN。或者手动选择 zh-CN-Yunjian-男性 音频") + return + + audio_duration = voice.get_audio_duration(sub_maker) + audio_duration = math.ceil(audio_duration) + + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30) + + subtitle_path = "" + if params.subtitle_enabled: + subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt") + subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() + logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}") + subtitle_fallback = False + if subtitle_provider == "edge": + voice.create_subtitle(text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path) + if not os.path.exists(subtitle_path): + subtitle_fallback = True + logger.warning("找不到字幕文件,回退到whisper") + + if subtitle_provider == "whisper" or subtitle_fallback: + subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) + logger.info("\n\n## 更正字幕") + subtitle.correct(subtitle_file=subtitle_path, video_script=video_script) + + subtitle_lines = subtitle.file_to_subtitles(subtitle_path) + if not subtitle_lines: + logger.warning(f"字幕文件无效: {subtitle_path}") + subtitle_path = "" + + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40) + + logger.info("\n\n## 4. 裁剪视频") + subclip_videos = [x for x in subclip_path_videos.values()] + # subclip_videos = material.clip_videos(task_id=task_id, + # timestamp_terms=time_list, + # origin_video=params.video_origin_path + # ) + logger.debug(f"\n\n## 裁剪后的视频文件列表: \n{subclip_videos}") + + if not subclip_videos: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) + logger.error( + "裁剪视频失败,可能是 ImageMagick 不可用") + return + + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50) + + final_video_paths = [] + combined_video_paths = [] + + _progress = 50 + for i in range(params.video_count): + index = i + 1 + combined_video_path = path.join(utils.task_dir(task_id), f"combined-{index}.mp4") + logger.info(f"\n\n## 5. 合并视频: {index} => {combined_video_path}") + video.combine_clip_videos(combined_video_path=combined_video_path, + video_paths=subclip_videos, + video_script_list=video_list, + audio_file=audio_file, + video_aspect=params.video_aspect, + threads=n_threads) + + _progress += 50 / params.video_count / 2 + sm.state.update_task(task_id, progress=_progress) + + final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4") + + logger.info(f"\n\n## 6. 
生成视频: {index} => {final_video_path}") + # 把所有东西合到在一起 + video.generate_video(video_path=combined_video_path, + audio_path=audio_file, + subtitle_path=subtitle_path, + output_file=final_video_path, + params=params, + ) + + _progress += 50 / params.video_count / 2 + sm.state.update_task(task_id, progress=_progress) + + final_video_paths.append(final_video_path) + combined_video_paths.append(combined_video_path) + + logger.success(f"任务 {task_id} 已完成, 生成 {len(final_video_paths)} 个视频.") + + kwargs = { + "videos": final_video_paths, + "combined_videos": combined_video_paths + } + sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs) + return kwargs + + +if __name__ == "__main__": + task_id = "task_id" + params = VideoParams( + video_subject="金钱的作用", + voice_name="zh-CN-XiaoyiNeural-Female", + voice_rate=1.0, + + ) + start(task_id, params, stop_at="video") diff --git a/app/services/video.py b/app/services/video.py new file mode 100644 index 0000000..3daf92f --- /dev/null +++ b/app/services/video.py @@ -0,0 +1,520 @@ +import glob +import random +from typing import List +from typing import Union + +from loguru import logger +from moviepy.editor import * +from moviepy.video.tools.subtitles import SubtitlesClip +from PIL import ImageFont + +from app.models import const +from app.models.schema import MaterialInfo, VideoAspect, VideoConcatMode, VideoParams, VideoClipParams +from app.utils import utils + + +def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""): + if not bgm_type: + return "" + + if bgm_file and os.path.exists(bgm_file): + return bgm_file + + if bgm_type == "random": + suffix = "*.mp3" + song_dir = utils.song_dir() + files = glob.glob(os.path.join(song_dir, suffix)) + return random.choice(files) + + return "" + + +def combine_videos( + combined_video_path: str, + video_paths: List[str], + audio_file: str, + video_aspect: VideoAspect = VideoAspect.portrait, + video_concat_mode: VideoConcatMode = VideoConcatMode.random, + max_clip_duration: int = 5, + threads: int = 2, +) -> str: + audio_clip = AudioFileClip(audio_file) + audio_duration = audio_clip.duration + logger.info(f"max duration of audio: {audio_duration} seconds") + # Required duration of each clip + req_dur = audio_duration / len(video_paths) + req_dur = max_clip_duration + logger.info(f"each clip will be maximum {req_dur} seconds long") + output_dir = os.path.dirname(combined_video_path) + + aspect = VideoAspect(video_aspect) + video_width, video_height = aspect.to_resolution() + + clips = [] + video_duration = 0 + + raw_clips = [] + for video_path in video_paths: + clip = VideoFileClip(video_path).without_audio() + clip_duration = clip.duration + start_time = 0 + + while start_time < clip_duration: + end_time = min(start_time + max_clip_duration, clip_duration) + split_clip = clip.subclip(start_time, end_time) + raw_clips.append(split_clip) + # logger.info(f"splitting from {start_time:.2f} to {end_time:.2f}, clip duration {clip_duration:.2f}, split_clip duration {split_clip.duration:.2f}") + start_time = end_time + if video_concat_mode.value == VideoConcatMode.sequential.value: + break + + # random video_paths order + if video_concat_mode.value == VideoConcatMode.random.value: + random.shuffle(raw_clips) + + # Add downloaded clips over and over until the duration of the audio (max_duration) has been reached + while video_duration < audio_duration: + for clip in raw_clips: + # Check if clip is longer than the remaining audio + if (audio_duration - video_duration) < clip.duration: + clip = 
clip.subclip(0, (audio_duration - video_duration)) + # Only shorten clips if the calculated clip length (req_dur) is shorter than the actual clip to prevent still image + elif req_dur < clip.duration: + clip = clip.subclip(0, req_dur) + clip = clip.set_fps(30) + + # Not all videos are same size, so we need to resize them + clip_w, clip_h = clip.size + if clip_w != video_width or clip_h != video_height: + clip_ratio = clip.w / clip.h + video_ratio = video_width / video_height + + if clip_ratio == video_ratio: + # 等比例缩放 + clip = clip.resize((video_width, video_height)) + else: + # 等比缩放视频 + if clip_ratio > video_ratio: + # 按照目标宽度等比缩放 + scale_factor = video_width / clip_w + else: + # 按照目标高度等比缩放 + scale_factor = video_height / clip_h + + new_width = int(clip_w * scale_factor) + new_height = int(clip_h * scale_factor) + clip_resized = clip.resize(newsize=(new_width, new_height)) + + background = ColorClip( + size=(video_width, video_height), color=(0, 0, 0) + ) + clip = CompositeVideoClip( + [ + background.set_duration(clip.duration), + clip_resized.set_position("center"), + ] + ) + + logger.info( + f"resizing video to {video_width} x {video_height}, clip size: {clip_w} x {clip_h}" + ) + + if clip.duration > max_clip_duration: + clip = clip.subclip(0, max_clip_duration) + + clips.append(clip) + video_duration += clip.duration + + video_clip = concatenate_videoclips(clips) + video_clip = video_clip.set_fps(30) + logger.info("writing") + # https://github.com/harry0703/NarratoAI/issues/111#issuecomment-2032354030 + video_clip.write_videofile( + filename=combined_video_path, + threads=threads, + logger=None, + temp_audiofile_path=output_dir, + audio_codec="aac", + fps=30, + ) + video_clip.close() + logger.success("completed") + return combined_video_path + + +def wrap_text(text, max_width, font="Arial", fontsize=60): + # 创建字体对象 + font = ImageFont.truetype(font, fontsize) + + def get_text_size(inner_text): + inner_text = inner_text.strip() + left, top, right, bottom = font.getbbox(inner_text) + return right - left, bottom - top + + width, height = get_text_size(text) + if width <= max_width: + return text, height + + # logger.warning(f"wrapping text, max_width: {max_width}, text_width: {width}, text: {text}") + + processed = True + + _wrapped_lines_ = [] + words = text.split(" ") + _txt_ = "" + for word in words: + _before = _txt_ + _txt_ += f"{word} " + _width, _height = get_text_size(_txt_) + if _width <= max_width: + continue + else: + if _txt_.strip() == word.strip(): + processed = False + break + _wrapped_lines_.append(_before) + _txt_ = f"{word} " + _wrapped_lines_.append(_txt_) + if processed: + _wrapped_lines_ = [line.strip() for line in _wrapped_lines_] + result = "\n".join(_wrapped_lines_).strip() + height = len(_wrapped_lines_) * height + # logger.warning(f"wrapped text: {result}") + return result, height + + _wrapped_lines_ = [] + chars = list(text) + _txt_ = "" + for word in chars: + _txt_ += word + _width, _height = get_text_size(_txt_) + if _width <= max_width: + continue + else: + _wrapped_lines_.append(_txt_) + _txt_ = "" + _wrapped_lines_.append(_txt_) + result = "\n".join(_wrapped_lines_).strip() + height = len(_wrapped_lines_) * height + # logger.warning(f"wrapped text: {result}") + return result, height + + +def generate_video( + video_path: str, + audio_path: str, + subtitle_path: str, + output_file: str, + params: Union[VideoParams, VideoClipParams], +): + aspect = VideoAspect(params.video_aspect) + video_width, video_height = aspect.to_resolution() + + logger.info(f"start, 
video size: {video_width} x {video_height}") + logger.info(f" ① video: {video_path}") + logger.info(f" ② audio: {audio_path}") + logger.info(f" ③ subtitle: {subtitle_path}") + logger.info(f" ④ output: {output_file}") + + # https://github.com/harry0703/NarratoAI/issues/217 + # PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'final-1.mp4.tempTEMP_MPY_wvf_snd.mp3' + # write into the same directory as the output file + output_dir = os.path.dirname(output_file) + + font_path = "" + if params.subtitle_enabled: + if not params.font_name: + params.font_name = "STHeitiMedium.ttc" + font_path = os.path.join(utils.font_dir(), params.font_name) + if os.name == "nt": + font_path = font_path.replace("\\", "/") + + logger.info(f"using font: {font_path}") + + def create_text_clip(subtitle_item): + phrase = subtitle_item[1] + max_width = video_width * 0.9 + wrapped_txt, txt_height = wrap_text( + phrase, max_width=max_width, font=font_path, fontsize=params.font_size + ) + _clip = TextClip( + wrapped_txt, + font=font_path, + fontsize=params.font_size, + color=params.text_fore_color, + bg_color=params.text_background_color, + stroke_color=params.stroke_color, + stroke_width=params.stroke_width, + print_cmd=False, + ) + duration = subtitle_item[0][1] - subtitle_item[0][0] + _clip = _clip.set_start(subtitle_item[0][0]) + _clip = _clip.set_end(subtitle_item[0][1]) + _clip = _clip.set_duration(duration) + if params.subtitle_position == "bottom": + _clip = _clip.set_position(("center", video_height * 0.95 - _clip.h)) + elif params.subtitle_position == "top": + _clip = _clip.set_position(("center", video_height * 0.05)) + elif params.subtitle_position == "custom": + # 确保字幕完全在屏幕内 + margin = 10 # 额外的边距,单位为像素 + max_y = video_height - _clip.h - margin + min_y = margin + custom_y = (video_height - _clip.h) * (params.custom_position / 100) + custom_y = max(min_y, min(custom_y, max_y)) # 限制 y 值在有效范围内 + _clip = _clip.set_position(("center", custom_y)) + else: # center + _clip = _clip.set_position(("center", "center")) + return _clip + + video_clip = VideoFileClip(video_path) + audio_clip = AudioFileClip(audio_path).volumex(params.voice_volume) + + if subtitle_path and os.path.exists(subtitle_path): + sub = SubtitlesClip(subtitles=subtitle_path, encoding="utf-8") + text_clips = [] + for item in sub.subtitles: + clip = create_text_clip(subtitle_item=item) + text_clips.append(clip) + video_clip = CompositeVideoClip([video_clip, *text_clips]) + + bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file) + if bgm_file: + try: + bgm_clip = ( + AudioFileClip(bgm_file).volumex(params.bgm_volume).audio_fadeout(3) + ) + bgm_clip = afx.audio_loop(bgm_clip, duration=video_clip.duration) + audio_clip = CompositeAudioClip([audio_clip, bgm_clip]) + except Exception as e: + logger.error(f"failed to add bgm: {str(e)}") + + video_clip = video_clip.set_audio(audio_clip) + video_clip.write_videofile( + output_file, + audio_codec="aac", + temp_audiofile_path=output_dir, + threads=params.n_threads or 2, + logger=None, + fps=30, + ) + video_clip.close() + del video_clip + logger.success("completed") + + +def preprocess_video(materials: List[MaterialInfo], clip_duration=4): + for material in materials: + if not material.url: + continue + + ext = utils.parse_extension(material.url) + try: + clip = VideoFileClip(material.url) + except Exception: + clip = ImageClip(material.url) + + width = clip.size[0] + height = clip.size[1] + if width < 480 or height < 480: + 
logger.warning(f"video is too small, width: {width}, height: {height}") + continue + + if ext in const.FILE_TYPE_IMAGES: + logger.info(f"processing image: {material.url}") + # 创建一个图片剪辑,并设置持续时间为3秒钟 + clip = ( + ImageClip(material.url) + .set_duration(clip_duration) + .set_position("center") + ) + # 使用resize方法来添加缩放效果。这里使用了lambda函数来使得缩放效果随时间变化。 + # 假设我们想要从原始大小逐渐放大到120%的大小。 + # t代表当前时间,clip.duration为视频总时长,这里是3秒。 + # 注意:1 表示100%的大小,所以1.2表示120%的大小 + zoom_clip = clip.resize( + lambda t: 1 + (clip_duration * 0.03) * (t / clip.duration) + ) + + # 如果需要,可以创建一个包含缩放剪辑的复合视频剪辑 + # (这在您想要在视频中添加其他元素时非常有用) + final_clip = CompositeVideoClip([zoom_clip]) + + # 输出视频 + video_file = f"{material.url}.mp4" + final_clip.write_videofile(video_file, fps=30, logger=None) + final_clip.close() + del final_clip + material.url = video_file + logger.success(f"completed: {video_file}") + return materials + + +def combine_clip_videos(combined_video_path: str, + video_paths: List[str], + video_script_list: List[str], + audio_file: str, + video_aspect: VideoAspect = VideoAspect.portrait, + threads: int = 2, + ) -> str: + """ + 合并子视频 + Args: + combined_video_path: 合并后的存储路径 + video_paths: 子视频路径列表 + audio_file: mp3旁白 + video_aspect: 屏幕比例 + threads: 线程数 + + Returns: + + """ + audio_clip = AudioFileClip(audio_file) + audio_duration = audio_clip.duration + logger.info(f"音频的最大持续时间: {audio_duration} s") + # 每个剪辑所需的持续时间 + req_dur = audio_duration / len(video_paths) + # req_dur = max_clip_duration + # logger.info(f"每个剪辑的最大长度为 {req_dur} s") + output_dir = os.path.dirname(combined_video_path) + + aspect = VideoAspect(video_aspect) + video_width, video_height = aspect.to_resolution() + + clips = [] + video_duration = 0 + # 一遍又一遍地添加下载的剪辑,直到达到音频的持续时间 (max_duration) + while video_duration < audio_duration: + for video_path, video_script in zip(video_paths, video_script_list): + clip = VideoFileClip(video_path).without_audio() + # 检查剪辑是否比剩余音频长 + if (audio_duration - video_duration) < clip.duration: + clip = clip.subclip(0, (audio_duration - video_duration)) + # 仅当计算出的剪辑长度 (req_dur) 短于实际剪辑时,才缩短剪辑以防止静止图像 + elif req_dur < clip.duration: + clip = clip.subclip(0, req_dur) + clip = clip.set_fps(30) + + # 并非所有视频的大小都相同,因此我们需要调整它们的大小 + clip_w, clip_h = clip.size + if clip_w != video_width or clip_h != video_height: + clip_ratio = clip.w / clip.h + video_ratio = video_width / video_height + + if clip_ratio == video_ratio: + # 等比例缩放 + clip = clip.resize((video_width, video_height)) + else: + # 等比缩放视频 + if clip_ratio > video_ratio: + # 按照目标宽度等比缩放 + scale_factor = video_width / clip_w + else: + # 按照目标高度等比缩放 + scale_factor = video_height / clip_h + + new_width = int(clip_w * scale_factor) + new_height = int(clip_h * scale_factor) + clip_resized = clip.resize(newsize=(new_width, new_height)) + + background = ColorClip(size=(video_width, video_height), color=(0, 0, 0)) + clip = CompositeVideoClip([ + background.set_duration(clip.duration), + clip_resized.set_position("center") + ]) + + logger.info(f"将视频 {video_path} 大小调整为 {video_width} x {video_height}, 剪辑尺寸: {clip_w} x {clip_h}") + + # TODO: 片段时长过长时,需要缩短,但暂时没有好的解决方案 + # if clip.duration > 5: + # ctime = utils.reduce_video_time(txt=video_script) + # if clip.duration > (2 * ctime): + # clip = clip.subclip(ctime, 2*ctime) + # else: + # clip = clip.subclip(0, ctime) + # logger.info(f"视频 {video_path} 片段时长较长,将剪辑时长缩短至 {ctime} 秒") + + clips.append(clip) + video_duration += clip.duration + + video_clip = concatenate_videoclips(clips) + video_clip = video_clip.set_fps(30) + logger.info(f"合并中...") + 
video_clip.write_videofile(filename=combined_video_path, + threads=threads, + logger=None, + temp_audiofile_path=output_dir, + audio_codec="aac", + fps=30, + ) + video_clip.close() + logger.success(f"completed") + return combined_video_path + + +if __name__ == "__main__": + from app.utils import utils + + suffix = "*.mp4" + song_dir = utils.video_dir() + files = glob.glob(os.path.join(song_dir, suffix)) + + print(files) + + # m = MaterialInfo() + # m.url = "/Users/harry/Downloads/IMG_2915.JPG" + # m.provider = "local" + # materials = preprocess_video([m], clip_duration=4) + # print(materials) + + # txt_en = "Here's your guide to travel hacks for budget-friendly adventures" + # txt_zh = "测试长字段这是您的旅行技巧指南帮助您进行预算友好的冒险" + # font = utils.resource_dir() + "/fonts/STHeitiMedium.ttc" + # for txt in [txt_en, txt_zh]: + # t, h = wrap_text(text=txt, max_width=1000, font=font, fontsize=60) + # print(t) + # + # task_id = "aa563149-a7ea-49c2-b39f-8c32cc225baf" + # task_dir = utils.task_dir(task_id) + # video_file = f"{task_dir}/combined-1.mp4" + # audio_file = f"{task_dir}/audio.mp3" + # subtitle_file = f"{task_dir}/subtitle.srt" + # output_file = f"{task_dir}/final.mp4" + # + # # video_paths = [] + # # for file in os.listdir(utils.storage_dir("test")): + # # if file.endswith(".mp4"): + # # video_paths.append(os.path.join(utils.storage_dir("test"), file)) + # # + # # combine_videos(combined_video_path=video_file, + # # audio_file=audio_file, + # # video_paths=video_paths, + # # video_aspect=VideoAspect.portrait, + # # video_concat_mode=VideoConcatMode.random, + # # max_clip_duration=5, + # # threads=2) + # + # cfg = VideoParams() + # cfg.video_aspect = VideoAspect.portrait + # cfg.font_name = "STHeitiMedium.ttc" + # cfg.font_size = 60 + # cfg.stroke_color = "#000000" + # cfg.stroke_width = 1.5 + # cfg.text_fore_color = "#FFFFFF" + # cfg.text_background_color = "transparent" + # cfg.bgm_type = "random" + # cfg.bgm_file = "" + # cfg.bgm_volume = 1.0 + # cfg.subtitle_enabled = True + # cfg.subtitle_position = "bottom" + # cfg.n_threads = 2 + # cfg.paragraph_number = 1 + # + # cfg.voice_volume = 1.0 + # + # generate_video(video_path=video_file, + # audio_path=audio_file, + # subtitle_path=subtitle_file, + # output_file=output_file, + # params=cfg + # ) diff --git a/app/services/voice.py b/app/services/voice.py new file mode 100644 index 0000000..287e22d --- /dev/null +++ b/app/services/voice.py @@ -0,0 +1,1354 @@ +import asyncio +import os +import re +from datetime import datetime +from xml.sax.saxutils import unescape +from edge_tts.submaker import mktimestamp +from loguru import logger +from edge_tts import submaker, SubMaker +import edge_tts +from moviepy.video.tools import subtitles + +from app.config import config +from app.utils import utils + + +def get_all_azure_voices(filter_locals=None) -> list[str]: + if filter_locals is None: + filter_locals = ["zh-CN", "en-US", "zh-HK", "zh-TW", "vi-VN"] + voices_str = """ +Name: af-ZA-AdriNeural +Gender: Female + +Name: af-ZA-WillemNeural +Gender: Male + +Name: am-ET-AmehaNeural +Gender: Male + +Name: am-ET-MekdesNeural +Gender: Female + +Name: ar-AE-FatimaNeural +Gender: Female + +Name: ar-AE-HamdanNeural +Gender: Male + +Name: ar-BH-AliNeural +Gender: Male + +Name: ar-BH-LailaNeural +Gender: Female + +Name: ar-DZ-AminaNeural +Gender: Female + +Name: ar-DZ-IsmaelNeural +Gender: Male + +Name: ar-EG-SalmaNeural +Gender: Female + +Name: ar-EG-ShakirNeural +Gender: Male + +Name: ar-IQ-BasselNeural +Gender: Male + +Name: ar-IQ-RanaNeural +Gender: Female + +Name: 
ar-JO-SanaNeural +Gender: Female + +Name: ar-JO-TaimNeural +Gender: Male + +Name: ar-KW-FahedNeural +Gender: Male + +Name: ar-KW-NouraNeural +Gender: Female + +Name: ar-LB-LaylaNeural +Gender: Female + +Name: ar-LB-RamiNeural +Gender: Male + +Name: ar-LY-ImanNeural +Gender: Female + +Name: ar-LY-OmarNeural +Gender: Male + +Name: ar-MA-JamalNeural +Gender: Male + +Name: ar-MA-MounaNeural +Gender: Female + +Name: ar-OM-AbdullahNeural +Gender: Male + +Name: ar-OM-AyshaNeural +Gender: Female + +Name: ar-QA-AmalNeural +Gender: Female + +Name: ar-QA-MoazNeural +Gender: Male + +Name: ar-SA-HamedNeural +Gender: Male + +Name: ar-SA-ZariyahNeural +Gender: Female + +Name: ar-SY-AmanyNeural +Gender: Female + +Name: ar-SY-LaithNeural +Gender: Male + +Name: ar-TN-HediNeural +Gender: Male + +Name: ar-TN-ReemNeural +Gender: Female + +Name: ar-YE-MaryamNeural +Gender: Female + +Name: ar-YE-SalehNeural +Gender: Male + +Name: az-AZ-BabekNeural +Gender: Male + +Name: az-AZ-BanuNeural +Gender: Female + +Name: bg-BG-BorislavNeural +Gender: Male + +Name: bg-BG-KalinaNeural +Gender: Female + +Name: bn-BD-NabanitaNeural +Gender: Female + +Name: bn-BD-PradeepNeural +Gender: Male + +Name: bn-IN-BashkarNeural +Gender: Male + +Name: bn-IN-TanishaaNeural +Gender: Female + +Name: bs-BA-GoranNeural +Gender: Male + +Name: bs-BA-VesnaNeural +Gender: Female + +Name: ca-ES-EnricNeural +Gender: Male + +Name: ca-ES-JoanaNeural +Gender: Female + +Name: cs-CZ-AntoninNeural +Gender: Male + +Name: cs-CZ-VlastaNeural +Gender: Female + +Name: cy-GB-AledNeural +Gender: Male + +Name: cy-GB-NiaNeural +Gender: Female + +Name: da-DK-ChristelNeural +Gender: Female + +Name: da-DK-JeppeNeural +Gender: Male + +Name: de-AT-IngridNeural +Gender: Female + +Name: de-AT-JonasNeural +Gender: Male + +Name: de-CH-JanNeural +Gender: Male + +Name: de-CH-LeniNeural +Gender: Female + +Name: de-DE-AmalaNeural +Gender: Female + +Name: de-DE-ConradNeural +Gender: Male + +Name: de-DE-FlorianMultilingualNeural +Gender: Male + +Name: de-DE-KatjaNeural +Gender: Female + +Name: de-DE-KillianNeural +Gender: Male + +Name: de-DE-SeraphinaMultilingualNeural +Gender: Female + +Name: el-GR-AthinaNeural +Gender: Female + +Name: el-GR-NestorasNeural +Gender: Male + +Name: en-AU-NatashaNeural +Gender: Female + +Name: en-AU-WilliamNeural +Gender: Male + +Name: en-CA-ClaraNeural +Gender: Female + +Name: en-CA-LiamNeural +Gender: Male + +Name: en-GB-LibbyNeural +Gender: Female + +Name: en-GB-MaisieNeural +Gender: Female + +Name: en-GB-RyanNeural +Gender: Male + +Name: en-GB-SoniaNeural +Gender: Female + +Name: en-GB-ThomasNeural +Gender: Male + +Name: en-HK-SamNeural +Gender: Male + +Name: en-HK-YanNeural +Gender: Female + +Name: en-IE-ConnorNeural +Gender: Male + +Name: en-IE-EmilyNeural +Gender: Female + +Name: en-IN-NeerjaExpressiveNeural +Gender: Female + +Name: en-IN-NeerjaNeural +Gender: Female + +Name: en-IN-PrabhatNeural +Gender: Male + +Name: en-KE-AsiliaNeural +Gender: Female + +Name: en-KE-ChilembaNeural +Gender: Male + +Name: en-NG-AbeoNeural +Gender: Male + +Name: en-NG-EzinneNeural +Gender: Female + +Name: en-NZ-MitchellNeural +Gender: Male + +Name: en-NZ-MollyNeural +Gender: Female + +Name: en-PH-JamesNeural +Gender: Male + +Name: en-PH-RosaNeural +Gender: Female + +Name: en-SG-LunaNeural +Gender: Female + +Name: en-SG-WayneNeural +Gender: Male + +Name: en-TZ-ElimuNeural +Gender: Male + +Name: en-TZ-ImaniNeural +Gender: Female + +Name: en-US-AnaNeural +Gender: Female + +Name: en-US-AndrewNeural +Gender: Male + +Name: en-US-AriaNeural +Gender: Female + +Name: 
en-US-AvaNeural +Gender: Female + +Name: en-US-BrianNeural +Gender: Male + +Name: en-US-ChristopherNeural +Gender: Male + +Name: en-US-EmmaNeural +Gender: Female + +Name: en-US-EricNeural +Gender: Male + +Name: en-US-GuyNeural +Gender: Male + +Name: en-US-JennyNeural +Gender: Female + +Name: en-US-MichelleNeural +Gender: Female + +Name: en-US-RogerNeural +Gender: Male + +Name: en-US-SteffanNeural +Gender: Male + +Name: en-ZA-LeahNeural +Gender: Female + +Name: en-ZA-LukeNeural +Gender: Male + +Name: es-AR-ElenaNeural +Gender: Female + +Name: es-AR-TomasNeural +Gender: Male + +Name: es-BO-MarceloNeural +Gender: Male + +Name: es-BO-SofiaNeural +Gender: Female + +Name: es-CL-CatalinaNeural +Gender: Female + +Name: es-CL-LorenzoNeural +Gender: Male + +Name: es-CO-GonzaloNeural +Gender: Male + +Name: es-CO-SalomeNeural +Gender: Female + +Name: es-CR-JuanNeural +Gender: Male + +Name: es-CR-MariaNeural +Gender: Female + +Name: es-CU-BelkysNeural +Gender: Female + +Name: es-CU-ManuelNeural +Gender: Male + +Name: es-DO-EmilioNeural +Gender: Male + +Name: es-DO-RamonaNeural +Gender: Female + +Name: es-EC-AndreaNeural +Gender: Female + +Name: es-EC-LuisNeural +Gender: Male + +Name: es-ES-AlvaroNeural +Gender: Male + +Name: es-ES-ElviraNeural +Gender: Female + +Name: es-ES-XimenaNeural +Gender: Female + +Name: es-GQ-JavierNeural +Gender: Male + +Name: es-GQ-TeresaNeural +Gender: Female + +Name: es-GT-AndresNeural +Gender: Male + +Name: es-GT-MartaNeural +Gender: Female + +Name: es-HN-CarlosNeural +Gender: Male + +Name: es-HN-KarlaNeural +Gender: Female + +Name: es-MX-DaliaNeural +Gender: Female + +Name: es-MX-JorgeNeural +Gender: Male + +Name: es-NI-FedericoNeural +Gender: Male + +Name: es-NI-YolandaNeural +Gender: Female + +Name: es-PA-MargaritaNeural +Gender: Female + +Name: es-PA-RobertoNeural +Gender: Male + +Name: es-PE-AlexNeural +Gender: Male + +Name: es-PE-CamilaNeural +Gender: Female + +Name: es-PR-KarinaNeural +Gender: Female + +Name: es-PR-VictorNeural +Gender: Male + +Name: es-PY-MarioNeural +Gender: Male + +Name: es-PY-TaniaNeural +Gender: Female + +Name: es-SV-LorenaNeural +Gender: Female + +Name: es-SV-RodrigoNeural +Gender: Male + +Name: es-US-AlonsoNeural +Gender: Male + +Name: es-US-PalomaNeural +Gender: Female + +Name: es-UY-MateoNeural +Gender: Male + +Name: es-UY-ValentinaNeural +Gender: Female + +Name: es-VE-PaolaNeural +Gender: Female + +Name: es-VE-SebastianNeural +Gender: Male + +Name: et-EE-AnuNeural +Gender: Female + +Name: et-EE-KertNeural +Gender: Male + +Name: fa-IR-DilaraNeural +Gender: Female + +Name: fa-IR-FaridNeural +Gender: Male + +Name: fi-FI-HarriNeural +Gender: Male + +Name: fi-FI-NooraNeural +Gender: Female + +Name: fil-PH-AngeloNeural +Gender: Male + +Name: fil-PH-BlessicaNeural +Gender: Female + +Name: fr-BE-CharlineNeural +Gender: Female + +Name: fr-BE-GerardNeural +Gender: Male + +Name: fr-CA-AntoineNeural +Gender: Male + +Name: fr-CA-JeanNeural +Gender: Male + +Name: fr-CA-SylvieNeural +Gender: Female + +Name: fr-CA-ThierryNeural +Gender: Male + +Name: fr-CH-ArianeNeural +Gender: Female + +Name: fr-CH-FabriceNeural +Gender: Male + +Name: fr-FR-DeniseNeural +Gender: Female + +Name: fr-FR-EloiseNeural +Gender: Female + +Name: fr-FR-HenriNeural +Gender: Male + +Name: fr-FR-RemyMultilingualNeural +Gender: Male + +Name: fr-FR-VivienneMultilingualNeural +Gender: Female + +Name: ga-IE-ColmNeural +Gender: Male + +Name: ga-IE-OrlaNeural +Gender: Female + +Name: gl-ES-RoiNeural +Gender: Male + +Name: gl-ES-SabelaNeural +Gender: Female + +Name: gu-IN-DhwaniNeural 
+Gender: Female + +Name: gu-IN-NiranjanNeural +Gender: Male + +Name: he-IL-AvriNeural +Gender: Male + +Name: he-IL-HilaNeural +Gender: Female + +Name: hi-IN-MadhurNeural +Gender: Male + +Name: hi-IN-SwaraNeural +Gender: Female + +Name: hr-HR-GabrijelaNeural +Gender: Female + +Name: hr-HR-SreckoNeural +Gender: Male + +Name: hu-HU-NoemiNeural +Gender: Female + +Name: hu-HU-TamasNeural +Gender: Male + +Name: id-ID-ArdiNeural +Gender: Male + +Name: id-ID-GadisNeural +Gender: Female + +Name: is-IS-GudrunNeural +Gender: Female + +Name: is-IS-GunnarNeural +Gender: Male + +Name: it-IT-DiegoNeural +Gender: Male + +Name: it-IT-ElsaNeural +Gender: Female + +Name: it-IT-GiuseppeNeural +Gender: Male + +Name: it-IT-IsabellaNeural +Gender: Female + +Name: ja-JP-KeitaNeural +Gender: Male + +Name: ja-JP-NanamiNeural +Gender: Female + +Name: jv-ID-DimasNeural +Gender: Male + +Name: jv-ID-SitiNeural +Gender: Female + +Name: ka-GE-EkaNeural +Gender: Female + +Name: ka-GE-GiorgiNeural +Gender: Male + +Name: kk-KZ-AigulNeural +Gender: Female + +Name: kk-KZ-DauletNeural +Gender: Male + +Name: km-KH-PisethNeural +Gender: Male + +Name: km-KH-SreymomNeural +Gender: Female + +Name: kn-IN-GaganNeural +Gender: Male + +Name: kn-IN-SapnaNeural +Gender: Female + +Name: ko-KR-HyunsuNeural +Gender: Male + +Name: ko-KR-InJoonNeural +Gender: Male + +Name: ko-KR-SunHiNeural +Gender: Female + +Name: lo-LA-ChanthavongNeural +Gender: Male + +Name: lo-LA-KeomanyNeural +Gender: Female + +Name: lt-LT-LeonasNeural +Gender: Male + +Name: lt-LT-OnaNeural +Gender: Female + +Name: lv-LV-EveritaNeural +Gender: Female + +Name: lv-LV-NilsNeural +Gender: Male + +Name: mk-MK-AleksandarNeural +Gender: Male + +Name: mk-MK-MarijaNeural +Gender: Female + +Name: ml-IN-MidhunNeural +Gender: Male + +Name: ml-IN-SobhanaNeural +Gender: Female + +Name: mn-MN-BataaNeural +Gender: Male + +Name: mn-MN-YesuiNeural +Gender: Female + +Name: mr-IN-AarohiNeural +Gender: Female + +Name: mr-IN-ManoharNeural +Gender: Male + +Name: ms-MY-OsmanNeural +Gender: Male + +Name: ms-MY-YasminNeural +Gender: Female + +Name: mt-MT-GraceNeural +Gender: Female + +Name: mt-MT-JosephNeural +Gender: Male + +Name: my-MM-NilarNeural +Gender: Female + +Name: my-MM-ThihaNeural +Gender: Male + +Name: nb-NO-FinnNeural +Gender: Male + +Name: nb-NO-PernilleNeural +Gender: Female + +Name: ne-NP-HemkalaNeural +Gender: Female + +Name: ne-NP-SagarNeural +Gender: Male + +Name: nl-BE-ArnaudNeural +Gender: Male + +Name: nl-BE-DenaNeural +Gender: Female + +Name: nl-NL-ColetteNeural +Gender: Female + +Name: nl-NL-FennaNeural +Gender: Female + +Name: nl-NL-MaartenNeural +Gender: Male + +Name: pl-PL-MarekNeural +Gender: Male + +Name: pl-PL-ZofiaNeural +Gender: Female + +Name: ps-AF-GulNawazNeural +Gender: Male + +Name: ps-AF-LatifaNeural +Gender: Female + +Name: pt-BR-AntonioNeural +Gender: Male + +Name: pt-BR-FranciscaNeural +Gender: Female + +Name: pt-BR-ThalitaNeural +Gender: Female + +Name: pt-PT-DuarteNeural +Gender: Male + +Name: pt-PT-RaquelNeural +Gender: Female + +Name: ro-RO-AlinaNeural +Gender: Female + +Name: ro-RO-EmilNeural +Gender: Male + +Name: ru-RU-DmitryNeural +Gender: Male + +Name: ru-RU-SvetlanaNeural +Gender: Female + +Name: si-LK-SameeraNeural +Gender: Male + +Name: si-LK-ThiliniNeural +Gender: Female + +Name: sk-SK-LukasNeural +Gender: Male + +Name: sk-SK-ViktoriaNeural +Gender: Female + +Name: sl-SI-PetraNeural +Gender: Female + +Name: sl-SI-RokNeural +Gender: Male + +Name: so-SO-MuuseNeural +Gender: Male + +Name: so-SO-UbaxNeural +Gender: Female + +Name: 
sq-AL-AnilaNeural +Gender: Female + +Name: sq-AL-IlirNeural +Gender: Male + +Name: sr-RS-NicholasNeural +Gender: Male + +Name: sr-RS-SophieNeural +Gender: Female + +Name: su-ID-JajangNeural +Gender: Male + +Name: su-ID-TutiNeural +Gender: Female + +Name: sv-SE-MattiasNeural +Gender: Male + +Name: sv-SE-SofieNeural +Gender: Female + +Name: sw-KE-RafikiNeural +Gender: Male + +Name: sw-KE-ZuriNeural +Gender: Female + +Name: sw-TZ-DaudiNeural +Gender: Male + +Name: sw-TZ-RehemaNeural +Gender: Female + +Name: ta-IN-PallaviNeural +Gender: Female + +Name: ta-IN-ValluvarNeural +Gender: Male + +Name: ta-LK-KumarNeural +Gender: Male + +Name: ta-LK-SaranyaNeural +Gender: Female + +Name: ta-MY-KaniNeural +Gender: Female + +Name: ta-MY-SuryaNeural +Gender: Male + +Name: ta-SG-AnbuNeural +Gender: Male + +Name: ta-SG-VenbaNeural +Gender: Female + +Name: te-IN-MohanNeural +Gender: Male + +Name: te-IN-ShrutiNeural +Gender: Female + +Name: th-TH-NiwatNeural +Gender: Male + +Name: th-TH-PremwadeeNeural +Gender: Female + +Name: tr-TR-AhmetNeural +Gender: Male + +Name: tr-TR-EmelNeural +Gender: Female + +Name: uk-UA-OstapNeural +Gender: Male + +Name: uk-UA-PolinaNeural +Gender: Female + +Name: ur-IN-GulNeural +Gender: Female + +Name: ur-IN-SalmanNeural +Gender: Male + +Name: ur-PK-AsadNeural +Gender: Male + +Name: ur-PK-UzmaNeural +Gender: Female + +Name: uz-UZ-MadinaNeural +Gender: Female + +Name: uz-UZ-SardorNeural +Gender: Male + +Name: vi-VN-HoaiMyNeural +Gender: Female + +Name: vi-VN-NamMinhNeural +Gender: Male + +Name: zh-CN-XiaoxiaoNeural +Gender: Female + +Name: zh-CN-XiaoyiNeural +Gender: Female + +Name: zh-CN-YunjianNeural +Gender: Male + +Name: zh-CN-YunxiNeural +Gender: Male + +Name: zh-CN-YunxiaNeural +Gender: Male + +Name: zh-CN-YunyangNeural +Gender: Male + +Name: zh-CN-liaoning-XiaobeiNeural +Gender: Female + +Name: zh-CN-shaanxi-XiaoniNeural +Gender: Female + +Name: zh-HK-HiuGaaiNeural +Gender: Female + +Name: zh-HK-HiuMaanNeural +Gender: Female + +Name: zh-HK-WanLungNeural +Gender: Male + +Name: zh-TW-HsiaoChenNeural +Gender: Female + +Name: zh-TW-HsiaoYuNeural +Gender: Female + +Name: zh-TW-YunJheNeural +Gender: Male + +Name: zu-ZA-ThandoNeural +Gender: Female + +Name: zu-ZA-ThembaNeural +Gender: Male + + +Name: en-US-AvaMultilingualNeural-V2 +Gender: Female + +Name: en-US-AndrewMultilingualNeural-V2 +Gender: Male + +Name: en-US-EmmaMultilingualNeural-V2 +Gender: Female + +Name: en-US-BrianMultilingualNeural-V2 +Gender: Male + +Name: de-DE-FlorianMultilingualNeural-V2 +Gender: Male + +Name: de-DE-SeraphinaMultilingualNeural-V2 +Gender: Female + +Name: fr-FR-RemyMultilingualNeural-V2 +Gender: Male + +Name: fr-FR-VivienneMultilingualNeural-V2 +Gender: Female + +Name: zh-CN-XiaoxiaoMultilingualNeural-V2 +Gender: Female + """.strip() + voices = [] + name = "" + for line in voices_str.split("\n"): + line = line.strip() + if not line: + continue + if line.startswith("Name: "): + name = line[6:].strip() + if line.startswith("Gender: "): + gender = line[8:].strip() + if name and gender: + # voices.append({ + # "name": name, + # "gender": gender, + # }) + if filter_locals: + for filter_local in filter_locals: + if name.lower().startswith(filter_local.lower()): + voices.append(f"{name}-{gender}") + else: + voices.append(f"{name}-{gender}") + name = "" + voices.sort() + return voices + + +def parse_voice_name(name: str): + # zh-CN-XiaoyiNeural-Female + # zh-CN-YunxiNeural-Male + # zh-CN-XiaoxiaoMultilingualNeural-V2-Female + name = name.replace("-Female", "").replace("-Male", "").strip() + return name 
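# (Editor's note, not part of the original commit.) A minimal sketch of the
# naming convention the helpers above and below rely on, i.e.
# "locale-VoiceName[-V2]-Gender":
#
#     voices = get_all_azure_voices(filter_locals=["zh-CN"])
#     # returns entries such as "zh-CN-XiaoxiaoNeural-Female"
#     engine_name = parse_voice_name("zh-CN-XiaoxiaoNeural-Female")
#     # -> "zh-CN-XiaoxiaoNeural" (gender suffix stripped before the TTS call)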
+
+
+def is_azure_v2_voice(voice_name: str):
+    voice_name = parse_voice_name(voice_name)
+    if voice_name.endswith("-V2"):
+        return voice_name.replace("-V2", "").strip()
+    return ""
+
+
+def tts(
+    text: str, voice_name: str, voice_rate: float, voice_file: str
+) -> SubMaker | None:
+    if is_azure_v2_voice(voice_name):
+        return azure_tts_v2(text, voice_name, voice_file)
+    return azure_tts_v1(text, voice_name, voice_rate, voice_file)
+
+
+def convert_rate_to_percent(rate: float) -> str:
+    if rate == 1.0:
+        return "+0%"
+    percent = round((rate - 1.0) * 100)
+    if percent > 0:
+        return f"+{percent}%"
+    else:
+        return f"{percent}%"
+
+
+def azure_tts_v1(
+    text: str, voice_name: str, voice_rate: float, voice_file: str
+) -> SubMaker | None:
+    voice_name = parse_voice_name(voice_name)
+    text = text.strip()
+    rate_str = convert_rate_to_percent(voice_rate)
+    for i in range(3):
+        try:
+            logger.info(f"start, voice name: {voice_name}, try: {i + 1}")
+
+            async def _do() -> SubMaker:
+                communicate = edge_tts.Communicate(text, voice_name, rate=rate_str)
+                sub_maker = edge_tts.SubMaker()
+                with open(voice_file, "wb") as file:
+                    async for chunk in communicate.stream():
+                        if chunk["type"] == "audio":
+                            file.write(chunk["data"])
+                        elif chunk["type"] == "WordBoundary":
+                            sub_maker.create_sub(
+                                (chunk["offset"], chunk["duration"]), chunk["text"]
+                            )
+                return sub_maker
+
+            sub_maker = asyncio.run(_do())
+            if not sub_maker or not sub_maker.subs:
+                logger.warning("failed, sub_maker is None or sub_maker.subs is None")
+                continue
+
+            logger.info(f"completed, output file: {voice_file}")
+            return sub_maker
+        except Exception as e:
+            logger.error(f"failed, error: {str(e)}")
+    return None
+
+
+def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> SubMaker | None:
+    v2_voice_name = is_azure_v2_voice(voice_name)
+    if not v2_voice_name:
+        logger.error(f"invalid V2 voice name: {voice_name}")
+        raise ValueError(f"invalid V2 voice name: {voice_name}")
+    voice_name = v2_voice_name
+    text = text.strip()
+
+    def _format_duration_to_offset(duration) -> int:
+        if isinstance(duration, str):
+            time_obj = datetime.strptime(duration, "%H:%M:%S.%f")
+            milliseconds = (
+                (time_obj.hour * 3600000)
+                + (time_obj.minute * 60000)
+                + (time_obj.second * 1000)
+                + (time_obj.microsecond // 1000)
+            )
+            return milliseconds * 10000
+
+        if isinstance(duration, int):
+            return duration
+
+        return 0
+
+    for i in range(3):
+        try:
+            logger.info(f"start, voice name: {voice_name}, try: {i + 1}")
+
+            import azure.cognitiveservices.speech as speechsdk
+
+            sub_maker = SubMaker()
+
+            def speech_synthesizer_word_boundary_cb(evt: speechsdk.SessionEventArgs):
+                # print('WordBoundary event:')
+                # print('\tBoundaryType: {}'.format(evt.boundary_type))
+                # print('\tAudioOffset: {}ms'.format((evt.audio_offset + 5000)))
+                # print('\tDuration: {}'.format(evt.duration))
+                # print('\tText: {}'.format(evt.text))
+                # print('\tTextOffset: {}'.format(evt.text_offset))
+                # print('\tWordLength: {}'.format(evt.word_length))
+
+                duration = _format_duration_to_offset(str(evt.duration))
+                offset = _format_duration_to_offset(evt.audio_offset)
+                sub_maker.subs.append(evt.text)
+                sub_maker.offset.append((offset, offset + duration))
+
+            # Creates an instance of a speech config with specified subscription key and service region.
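# (Editor's note, not part of the original commit.) Both TTS paths share one
# timing unit: 100-nanosecond ticks. edge-tts already reports WordBoundary
# offsets and durations in ticks, and _format_duration_to_offset above
# multiplies milliseconds by 10_000 to match, e.g. "00:00:01.500000"
# -> 1500 ms -> 15_000_000 ticks; get_audio_duration further below divides
# by 10_000_000 to convert the final offset back to seconds.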
+ speech_key = config.azure.get("speech_key", "") + service_region = config.azure.get("speech_region", "") + audio_config = speechsdk.audio.AudioOutputConfig( + filename=voice_file, use_default_speaker=True + ) + speech_config = speechsdk.SpeechConfig( + subscription=speech_key, region=service_region + ) + speech_config.speech_synthesis_voice_name = voice_name + # speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestSentenceBoundary, + # value='true') + speech_config.set_property( + property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestWordBoundary, + value="true", + ) + + speech_config.set_speech_synthesis_output_format( + speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3 + ) + speech_synthesizer = speechsdk.SpeechSynthesizer( + audio_config=audio_config, speech_config=speech_config + ) + speech_synthesizer.synthesis_word_boundary.connect( + speech_synthesizer_word_boundary_cb + ) + + result = speech_synthesizer.speak_text_async(text).get() + if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted: + logger.success(f"azure v2 speech synthesis succeeded: {voice_file}") + return sub_maker + elif result.reason == speechsdk.ResultReason.Canceled: + cancellation_details = result.cancellation_details + logger.error( + f"azure v2 speech synthesis canceled: {cancellation_details.reason}" + ) + if cancellation_details.reason == speechsdk.CancellationReason.Error: + logger.error( + f"azure v2 speech synthesis error: {cancellation_details.error_details}" + ) + logger.info(f"completed, output file: {voice_file}") + except Exception as e: + logger.error(f"failed, error: {str(e)}") + return None + + +def _format_text(text: str) -> str: + # text = text.replace("\n", " ") + text = text.replace("[", " ") + text = text.replace("]", " ") + text = text.replace("(", " ") + text = text.replace(")", " ") + text = text.replace("{", " ") + text = text.replace("}", " ") + text = text.strip() + return text + + +def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str): + """ + 优化字幕文件 + 1. 将字幕文件按照标点符号分割成多行 + 2. 逐行匹配字幕文件中的文本 + 3. 
生成新的字幕文件 + """ + + text = _format_text(text) + + def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str: + """ + 1 + 00:00:00,000 --> 00:00:02,360 + 跑步是一项简单易行的运动 + """ + start_t = mktimestamp(start_time).replace(".", ",") + end_t = mktimestamp(end_time).replace(".", ",") + return f"{idx}\n" f"{start_t} --> {end_t}\n" f"{sub_text}\n" + + start_time = -1.0 + sub_items = [] + sub_index = 0 + + script_lines = utils.split_string_by_punctuations(text) + + def match_line(_sub_line: str, _sub_index: int): + if len(script_lines) <= _sub_index: + return "" + + _line = script_lines[_sub_index] + if _sub_line == _line: + return script_lines[_sub_index].strip() + + _sub_line_ = re.sub(r"[^\w\s]", "", _sub_line) + _line_ = re.sub(r"[^\w\s]", "", _line) + if _sub_line_ == _line_: + return _line_.strip() + + _sub_line_ = re.sub(r"\W+", "", _sub_line) + _line_ = re.sub(r"\W+", "", _line) + if _sub_line_ == _line_: + return _line.strip() + + return "" + + sub_line = "" + + try: + for _, (offset, sub) in enumerate(zip(sub_maker.offset, sub_maker.subs)): + _start_time, end_time = offset + if start_time < 0: + start_time = _start_time + + sub = unescape(sub) + sub_line += sub + sub_text = match_line(sub_line, sub_index) + if sub_text: + sub_index += 1 + line = formatter( + idx=sub_index, + start_time=start_time, + end_time=end_time, + sub_text=sub_text, + ) + sub_items.append(line) + start_time = -1.0 + sub_line = "" + + if len(sub_items) == len(script_lines): + with open(subtitle_file, "w", encoding="utf-8") as file: + file.write("\n".join(sub_items) + "\n") + try: + sbs = subtitles.file_to_subtitles(subtitle_file, encoding="utf-8") + duration = max([tb for ((ta, tb), txt) in sbs]) + logger.info( + f"completed, subtitle file created: {subtitle_file}, duration: {duration}" + ) + except Exception as e: + logger.error(f"failed, error: {str(e)}") + os.remove(subtitle_file) + else: + logger.warning( + f"failed, sub_items len: {len(sub_items)}, script_lines len: {len(script_lines)}" + ) + + except Exception as e: + logger.error(f"failed, error: {str(e)}") + + +def get_audio_duration(sub_maker: submaker.SubMaker): + """ + 获取音频时长 + """ + if not sub_maker.offset: + return 0.0 + return sub_maker.offset[-1][1] / 10000000 + + +if __name__ == "__main__": + voice_name = "zh-CN-XiaoxiaoMultilingualNeural-V2-Female" + voice_name = parse_voice_name(voice_name) + voice_name = is_azure_v2_voice(voice_name) + print(voice_name) + + voices = get_all_azure_voices() + print(len(voices)) + + async def _do(): + temp_dir = utils.storage_dir("temp") + + voice_names = [ + "zh-CN-XiaoxiaoMultilingualNeural", + # 女性 + "zh-CN-XiaoxiaoNeural", + "zh-CN-XiaoyiNeural", + # 男性 + "zh-CN-YunyangNeural", + "zh-CN-YunxiNeural", + ] + text = """ + 静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人,表达了他对家乡和亲人的深深思念之情。全诗内容是:“床前明月光,疑是地上霜。举头望明月,低头思故乡。”在这短短的四句诗中,诗人通过“明月”和“思故乡”的意象,巧妙地表达了离乡背井人的孤独与哀愁。首句“床前明月光”设景立意,通过明亮的月光引出诗人的遐想;“疑是地上霜”增添了夜晚的寒冷感,加深了诗人的孤寂之情;“举头望明月”和“低头思故乡”则是情感的升华,展现了诗人内心深处的乡愁和对家的渴望。这首诗简洁明快,情感真挚,是中国古典诗歌中非常著名的一首,也深受后人喜爱和推崇。 + """ + + text = """ + What is the meaning of life? This question has puzzled philosophers, scientists, and thinkers of all kinds for centuries. Throughout history, various cultures and individuals have come up with their interpretations and beliefs around the purpose of life. Some say it's to seek happiness and self-fulfillment, while others believe it's about contributing to the welfare of others and making a positive impact in the world. 
Despite the myriad of perspectives, one thing remains clear: the meaning of life is a deeply personal concept that varies from one person to another. It's an existential inquiry that encourages us to reflect on our values, desires, and the essence of our existence. + """ + + text = """ + 预计未来3天深圳冷空气活动频繁,未来两天持续阴天有小雨,出门带好雨具; + 10-11日持续阴天有小雨,日温差小,气温在13-17℃之间,体感阴凉; + 12日天气短暂好转,早晚清凉; + """ + + text = "[Opening scene: A sunny day in a suburban neighborhood. A young boy named Alex, around 8 years old, is playing in his front yard with his loyal dog, Buddy.]\n\n[Camera zooms in on Alex as he throws a ball for Buddy to fetch. Buddy excitedly runs after it and brings it back to Alex.]\n\nAlex: Good boy, Buddy! You're the best dog ever!\n\n[Buddy barks happily and wags his tail.]\n\n[As Alex and Buddy continue playing, a series of potential dangers loom nearby, such as a stray dog approaching, a ball rolling towards the street, and a suspicious-looking stranger walking by.]\n\nAlex: Uh oh, Buddy, look out!\n\n[Buddy senses the danger and immediately springs into action. He barks loudly at the stray dog, scaring it away. Then, he rushes to retrieve the ball before it reaches the street and gently nudges it back towards Alex. Finally, he stands protectively between Alex and the stranger, growling softly to warn them away.]\n\nAlex: Wow, Buddy, you're like my superhero!\n\n[Just as Alex and Buddy are about to head inside, they hear a loud crash from a nearby construction site. They rush over to investigate and find a pile of rubble blocking the path of a kitten trapped underneath.]\n\nAlex: Oh no, Buddy, we have to help!\n\n[Buddy barks in agreement and together they work to carefully move the rubble aside, allowing the kitten to escape unharmed. The kitten gratefully nuzzles against Buddy, who responds with a friendly lick.]\n\nAlex: We did it, Buddy! We saved the day again!\n\n[As Alex and Buddy walk home together, the sun begins to set, casting a warm glow over the neighborhood.]\n\nAlex: Thanks for always being there to watch over me, Buddy. You're not just my dog, you're my best friend.\n\n[Buddy barks happily and nuzzles against Alex as they disappear into the sunset, ready to face whatever adventures tomorrow may bring.]\n\n[End scene.]" + + text = "大家好,我是乔哥,一个想帮你把信用卡全部还清的家伙!\n今天我们要聊的是信用卡的取现功能。\n你是不是也曾经因为一时的资金紧张,而拿着信用卡到ATM机取现?如果是,那你得好好看看这个视频了。\n现在都2024年了,我以为现在不会再有人用信用卡取现功能了。前几天一个粉丝发来一张图片,取现1万。\n信用卡取现有三个弊端。\n一,信用卡取现功能代价可不小。会先收取一个取现手续费,比如这个粉丝,取现1万,按2.5%收取手续费,收取了250元。\n二,信用卡正常消费有最长56天的免息期,但取现不享受免息期。从取现那一天开始,每天按照万5收取利息,这个粉丝用了11天,收取了55元利息。\n三,频繁的取现行为,银行会认为你资金紧张,会被标记为高风险用户,影响你的综合评分和额度。\n那么,如果你资金紧张了,该怎么办呢?\n乔哥给你支一招,用破思机摩擦信用卡,只需要少量的手续费,而且还可以享受最长56天的免息期。\n最后,如果你对玩卡感兴趣,可以找乔哥领取一本《卡神秘籍》,用卡过程中遇到任何疑惑,也欢迎找乔哥交流。\n别忘了,关注乔哥,回复用卡技巧,免费领取《2024用卡技巧》,让我们一起成为用卡高手!" 
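# (Editor's note, not part of the original commit.) Each successive
# `text = ...` assignment in this demo simply overwrites the previous
# sample; only the last assignment below is actually formatted, split
# into lines, and synthesized by the voice loop at the end.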
+ + text = """ + 2023全年业绩速览 +公司全年累计实现营业收入1476.94亿元,同比增长19.01%,归母净利润747.34亿元,同比增长19.16%。EPS达到59.49元。第四季度单季,营业收入444.25亿元,同比增长20.26%,环比增长31.86%;归母净利润218.58亿元,同比增长19.33%,环比增长29.37%。这一阶段 +的业绩表现不仅突显了公司的增长动力和盈利能力,也反映出公司在竞争激烈的市场环境中保持了良好的发展势头。 +2023年Q4业绩速览 +第四季度,营业收入贡献主要增长点;销售费用高增致盈利能力承压;税金同比上升27%,扰动净利率表现。 +业绩解读 +利润方面,2023全年贵州茅台,>归母净利润增速为19%,其中营业收入正贡献18%,营业成本正贡献百分之一,管理费用正贡献百分之一点四。(注:归母净利润增速值=营业收入增速+各科目贡献,展示贡献/拖累的前四名科目,且要求贡献值/净利润增速>15%) +""" + text = "静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人" + + text = _format_text(text) + lines = utils.split_string_by_punctuations(text) + print(lines) + + for voice_name in voice_names: + voice_file = f"{temp_dir}/tts-{voice_name}.mp3" + subtitle_file = f"{temp_dir}/tts.mp3.srt" + sub_maker = azure_tts_v2( + text=text, voice_name=voice_name, voice_file=voice_file + ) + create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file) + audio_duration = get_audio_duration(sub_maker) + print(f"voice: {voice_name}, audio duration: {audio_duration}s") + + loop = asyncio.get_event_loop_policy().get_event_loop() + try: + loop.run_until_complete(_do()) + finally: + loop.close() diff --git a/app/utils/utils.py b/app/utils/utils.py new file mode 100644 index 0000000..229d667 --- /dev/null +++ b/app/utils/utils.py @@ -0,0 +1,271 @@ +import locale +import os +import platform +import threading +from typing import Any +from loguru import logger +import json +from uuid import uuid4 +import urllib3 + +from app.models import const + +urllib3.disable_warnings() + + +def get_response(status: int, data: Any = None, message: str = ""): + obj = { + "status": status, + } + if data: + obj["data"] = data + if message: + obj["message"] = message + return obj + + +def to_json(obj): + try: + # 定义一个辅助函数来处理不同类型的对象 + def serialize(o): + # 如果对象是可序列化类型,直接返回 + if isinstance(o, (int, float, bool, str)) or o is None: + return o + # 如果对象是二进制数据,转换为base64编码的字符串 + elif isinstance(o, bytes): + return "*** binary data ***" + # 如果对象是字典,递归处理每个键值对 + elif isinstance(o, dict): + return {k: serialize(v) for k, v in o.items()} + # 如果对象是列表或元组,递归处理每个元素 + elif isinstance(o, (list, tuple)): + return [serialize(item) for item in o] + # 如果对象是自定义类型,尝试返回其__dict__属性 + elif hasattr(o, "__dict__"): + return serialize(o.__dict__) + # 其他情况返回None(或者可以选择抛出异常) + else: + return None + + # 使用serialize函数处理输入对象 + serialized_obj = serialize(obj) + + # 序列化处理后的对象为JSON字符串 + return json.dumps(serialized_obj, ensure_ascii=False, indent=4) + except Exception as e: + return None + + +def get_uuid(remove_hyphen: bool = False): + u = str(uuid4()) + if remove_hyphen: + u = u.replace("-", "") + return u + + +def root_dir(): + return os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) + + +def storage_dir(sub_dir: str = "", create: bool = False): + d = os.path.join(root_dir(), "storage") + if sub_dir: + d = os.path.join(d, sub_dir) + if create and not os.path.exists(d): + os.makedirs(d) + + return d + + +def resource_dir(sub_dir: str = ""): + d = os.path.join(root_dir(), "resource") + if sub_dir: + d = os.path.join(d, sub_dir) + return d + + +def task_dir(sub_dir: str = ""): + d = os.path.join(storage_dir(), "tasks") + if sub_dir: + d = os.path.join(d, sub_dir) + if not os.path.exists(d): + os.makedirs(d) + return d + + +def font_dir(sub_dir: str = ""): + d = resource_dir(f"fonts") + if sub_dir: + d = os.path.join(d, sub_dir) + if not os.path.exists(d): + os.makedirs(d) + return d + + +def song_dir(sub_dir: str = ""): + d = resource_dir(f"songs") + if sub_dir: + d = os.path.join(d, sub_dir) + 
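# (Editor's note, not part of the original commit.) Like task_dir and
# font_dir above, the remaining *_dir helpers resolve a path under the
# project tree and create the directory on first use, so callers never
# have to mkdir themselves.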
if not os.path.exists(d): + os.makedirs(d) + return d + + +def public_dir(sub_dir: str = ""): + d = resource_dir(f"public") + if sub_dir: + d = os.path.join(d, sub_dir) + if not os.path.exists(d): + os.makedirs(d) + return d + + +def run_in_background(func, *args, **kwargs): + def run(): + try: + func(*args, **kwargs) + except Exception as e: + logger.error(f"run_in_background error: {e}") + + thread = threading.Thread(target=run) + thread.start() + return thread + + +def time_convert_seconds_to_hmsm(seconds) -> str: + hours = int(seconds // 3600) + seconds = seconds % 3600 + minutes = int(seconds // 60) + milliseconds = int(seconds * 1000) % 1000 + seconds = int(seconds % 60) + return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, seconds, milliseconds) + + +def text_to_srt(idx: int, msg: str, start_time: float, end_time: float) -> str: + start_time = time_convert_seconds_to_hmsm(start_time) + end_time = time_convert_seconds_to_hmsm(end_time) + srt = """%d +%s --> %s +%s + """ % ( + idx, + start_time, + end_time, + msg, + ) + return srt + + +def str_contains_punctuation(word): + for p in const.PUNCTUATIONS: + if p in word: + return True + return False + + +def split_string_by_punctuations(s): + result = [] + txt = "" + + previous_char = "" + next_char = "" + for i in range(len(s)): + char = s[i] + if char == "\n": + result.append(txt.strip()) + txt = "" + continue + + if i > 0: + previous_char = s[i - 1] + if i < len(s) - 1: + next_char = s[i + 1] + + if char == "." and previous_char.isdigit() and next_char.isdigit(): + # 取现1万,按2.5%收取手续费, 2.5 中的 . 不能作为换行标记 + txt += char + continue + + if char not in const.PUNCTUATIONS: + txt += char + else: + result.append(txt.strip()) + txt = "" + result.append(txt.strip()) + # filter empty string + result = list(filter(None, result)) + return result + + +def md5(text): + import hashlib + + return hashlib.md5(text.encode("utf-8")).hexdigest() + + +def get_system_locale(): + try: + loc = locale.getdefaultlocale() + # zh_CN, zh_TW return zh + # en_US, en_GB return en + language_code = loc[0].split("_")[0] + return language_code + except Exception as e: + return "en" + + +def load_locales(i18n_dir): + _locales = {} + for root, dirs, files in os.walk(i18n_dir): + for file in files: + if file.endswith(".json"): + lang = file.split(".")[0] + with open(os.path.join(root, file), "r", encoding="utf-8") as f: + _locales[lang] = json.loads(f.read()) + return _locales + + +def parse_extension(filename): + return os.path.splitext(filename)[1].strip().lower().replace(".", "") + + +def script_dir(sub_dir: str = ""): + d = resource_dir(f"scripts") + if sub_dir: + d = os.path.join(d, sub_dir) + if not os.path.exists(d): + os.makedirs(d) + return d + + +def video_dir(sub_dir: str = ""): + d = resource_dir(f"videos") + if sub_dir: + d = os.path.join(d, sub_dir) + if not os.path.exists(d): + os.makedirs(d) + return d + + +def split_timestamp(timestamp): + """ + 拆分时间戳 + """ + start, end = timestamp.split('-') + start_hour, start_minute = map(int, start.split(':')) + end_hour, end_minute = map(int, end.split(':')) + + start_time = '00:{:02d}:{:02d}'.format(start_hour, start_minute) + end_time = '00:{:02d}:{:02d}'.format(end_hour, end_minute) + + return start_time, end_time + + +def reduce_video_time(txt: str, duration: float = 0.21531): + """ + 按照字数缩减视频时长,一个字耗时约 0.21531 s, + Returns: + """ + # 返回结果四舍五入为整数 + duration = len(txt) * duration + return int(duration) diff --git a/changelog.py b/changelog.py new file mode 100644 index 0000000..31a1337 --- /dev/null +++ 
b/changelog.py
@@ -0,0 +1,17 @@
+from git_changelog.cli import build_and_render
+
+# 运行这段脚本自动生成CHANGELOG.md文件
+
+build_and_render(
+    repository=".",
+    output="CHANGELOG.md",
+    convention="angular",
+    provider="github",
+    template="keepachangelog",
+    parse_trailers=True,
+    parse_refs=False,
+    sections=["build", "deps", "feat", "fix", "refactor"],
+    versioning="pep440",
+    bump="1.1.2",  # 指定bump版本
+    in_place=True,
+)
diff --git a/config.example.toml b/config.example.toml
new file mode 100644
index 0000000..77a5cb0
--- /dev/null
+++ b/config.example.toml
@@ -0,0 +1,194 @@
+[app]
+    project_version="0.1.2"
+    video_source = "pexels" # "pexels" or "pixabay"
+    # Pexels API Key
+    # Register at https://www.pexels.com/api/ to get your API key.
+    # You can use multiple keys to avoid rate limits.
+    # For example: pexels_api_keys = ["123adsf4567adf89","abd1321cd13efgfdfhi"]
+    # 特别注意格式,Key 用英文双引号括起来,多个Key用逗号隔开
+    pexels_api_keys = []
+
+    # Pixabay API Key
+    # Register at https://pixabay.com/api/docs/ to get your API key.
+    # You can use multiple keys to avoid rate limits.
+    # For example: pixabay_api_keys = ["123adsf4567adf89","abd1321cd13efgfdfhi"]
+    # 特别注意格式,Key 用英文双引号括起来,多个Key用逗号隔开
+    pixabay_api_keys = []
+
+    # 如果你没有 OPENAI API Key,可以使用 g4f 代替,或者使用国内的 Moonshot API
+    # If you don't have an OPENAI API Key, you can use g4f or the Moonshot API instead
+
+    # 支持的提供商 (Supported providers):
+    # openai
+    # moonshot (月之暗面)
+    # oneapi
+    # g4f
+    # azure
+    # qwen (通义千问)
+    # gemini
+    # ollama
+    # deepseek
+    llm_provider="openai"
+
+    ########## Ollama Settings
+    # No need to set it unless you want to use your own proxy
+    ollama_base_url = ""
+    # Check your available models at https://ollama.com/library
+    ollama_model_name = ""
+
+    ########## OpenAI API Key
+    # Get your API key at https://platform.openai.com/api-keys
+    openai_api_key = ""
+    # No need to set it unless you want to use your own proxy
+    openai_base_url = ""
+    # Check your available models at https://platform.openai.com/account/limits
+    openai_model_name = "gpt-4-turbo"
+
+    ########## Moonshot API Key
+    # Visit https://platform.moonshot.cn/console/api-keys to get your API key.
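    # (Editorial example, not part of the original commit; the key value is
    # hypothetical.) Switching providers means changing llm_provider above
    # and filling in the matching block, e.g.:
    #     llm_provider = "moonshot"
    #     moonshot_api_key = "sk-xxxxxxxxxxxxxxxx"
    #     moonshot_model_name = "moonshot-v1-8k"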
+ moonshot_api_key="" + moonshot_base_url = "https://api.moonshot.cn/v1" + moonshot_model_name = "moonshot-v1-8k" + + ########## OneAPI API Key + # Visit https://github.com/songquanpeng/one-api to get your API key + oneapi_api_key="" + oneapi_base_url="" + oneapi_model_name="" + + ########## G4F + # Visit https://github.com/xtekky/gpt4free to get more details + # Supported model list: https://github.com/xtekky/gpt4free/blob/main/g4f/models.py + g4f_model_name = "gpt-3.5-turbo" + + ########## Azure API Key + # Visit https://learn.microsoft.com/zh-cn/azure/ai-services/openai/ to get more details + # API documentation: https://learn.microsoft.com/zh-cn/azure/ai-services/openai/reference + azure_api_key = "" + azure_base_url="" + azure_model_name="gpt-35-turbo" # replace with your model deployment name + azure_api_version = "2024-02-15-preview" + + ########## Gemini API Key + gemini_api_key="" + gemini_model_name = "gemini-1.5-flash" + + ########## Qwen API Key + # Visit https://dashscope.console.aliyun.com/apiKey to get your API key + # Visit below links to get more details + # https://tongyi.aliyun.com/qianwen/ + # https://help.aliyun.com/zh/dashscope/developer-reference/model-introduction + qwen_api_key = "" + qwen_model_name = "qwen-max" + + + ########## DeepSeek API Key + # Visit https://platform.deepseek.com/api_keys to get your API key + deepseek_api_key = "" + deepseek_base_url = "https://api.deepseek.com" + deepseek_model_name = "deepseek-chat" + + # Subtitle Provider, "edge" or "whisper" + # If empty, the subtitle will not be generated + subtitle_provider = "edge" + + # + # ImageMagick + # + # Once you have installed it, ImageMagick will be automatically detected, except on Windows! + # On Windows, for example "C:\Program Files (x86)\ImageMagick-7.1.1-Q16-HDRI\magick.exe" + # Download from https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe + + # imagemagick_path = "C:\\Program Files (x86)\\ImageMagick-7.1.1-Q16\\magick.exe" + + + # + # FFMPEG + # + # 通常情况下,ffmpeg 会被自动下载,并且会被自动检测到。 + # 但是如果你的环境有问题,无法自动下载,可能会遇到如下错误: + # RuntimeError: No ffmpeg exe could be found. + # Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable. + # 此时你可以手动下载 ffmpeg 并设置 ffmpeg_path,下载地址:https://www.gyan.dev/ffmpeg/builds/ + + # Under normal circumstances, ffmpeg is downloaded automatically and detected automatically. + # However, if there is an issue with your environment that prevents automatic downloading, you might encounter the following error: + # RuntimeError: No ffmpeg exe could be found. + # Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable. + # In such cases, you can manually download ffmpeg and set the ffmpeg_path, download link: https://www.gyan.dev/ffmpeg/builds/ + + # ffmpeg_path = "C:\\Users\\harry\\Downloads\\ffmpeg.exe" + ######################################################################################### + + # 当视频生成成功后,API服务提供的视频下载接入点,默认为当前服务的地址和监听端口 + # 比如 http://127.0.0.1:8080/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4 + # 如果你需要使用域名对外提供服务(一般会用nginx做代理),则可以设置为你的域名 + # 比如 https://xxxx.com/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4 + # endpoint="https://xxxx.com" + + # When the video is successfully generated, the API service provides a download endpoint for the video, defaulting to the service's current address and listening port. 
+    # For example, http://127.0.0.1:8080/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
+    # If you need to provide the service externally using a domain name (usually done with nginx as a proxy), you can set it to your domain name.
+    # For example, https://xxxx.com/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
+    # endpoint="https://xxxx.com"
+    endpoint=""
+
+
+    # Video material storage location
+    # material_directory = "" # Indicates that video materials will be downloaded to the default folder, the default folder is ./storage/cache_videos under the current project
+    # material_directory = "/user/harry/videos" # Indicates that video materials will be downloaded to a specified folder
+    # material_directory = "task" # Indicates that video materials will be downloaded to the current task's folder, this method does not allow sharing of already downloaded video materials
+
+    # 视频素材存放位置
+    # material_directory = "" #表示将视频素材下载到默认的文件夹,默认文件夹为当前项目下的 ./storage/cache_videos
+    # material_directory = "/user/harry/videos" #表示将视频素材下载到指定的文件夹中
+    # material_directory = "task" #表示将视频素材下载到当前任务的文件夹中,这种方式无法共享已经下载的视频素材
+
+    material_directory = ""
+
+    # Used for state management of the task
+    enable_redis = false
+    redis_host = "localhost"
+    redis_port = 6379
+    redis_db = 0
+    redis_password = ""
+
+    # 文生视频时的最大并发任务数
+    max_concurrent_tasks = 5
+
+    # webui界面是否显示配置项
+    # webui hide basic config panel
+    hide_config = false
+
+
+[whisper]
+    # Only effective when subtitle_provider is "whisper"
+
+    # Run on GPU with FP16
+    # model = WhisperModel(model_size, device="cuda", compute_type="float16")
+
+    # Run on GPU with INT8
+    # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
+
+    # Run on CPU with INT8
+    # model = WhisperModel(model_size, device="cpu", compute_type="int8")
+
+    # recommended model_size: "large-v3"
+    model_size="large-v3"
+    # if you want to use GPU, set device="cuda"
+    device="cpu"
+    compute_type="int8"
+
+
+[proxy]
+    ### Use a proxy to access the Pexels API
+    ### Format: "http://<username>:<password>@<proxy>:<port>"
+    ### Example: "http://user:pass@proxy:1234"
+    ### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
+
+    # http = "http://10.10.1.10:3128"
+    # https = "http://10.10.1.10:1080"
+
+[azure]
+    # Azure Speech API Key
+    # Get your API key at https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices
+    speech_key=""
+    speech_region=""
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..2983a4e
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,26 @@
+x-common-volumes: &common-volumes
+  - ./:/NarratoAI
+
+services:
+  webui:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: "webui"
+    ports:
+      - "8501:8501"
+    command: [ "bash", "webui.sh" ]
+    volumes: *common-volumes
+    environment:
+      - "VPN_PROXY_URL=http://host.docker.internal:7890"
+    restart: always
+  api:
+    build:
+      context: .
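      # (Editor's note, not part of the original commit.) Both services reuse
      # the top-level `x-common-volumes: &common-volumes` list via the YAML
      # alias `volumes: *common-volumes`, which bind-mounts the project root
      # into each container at /NarratoAI.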
+ dockerfile: Dockerfile + container_name: "api" + ports: + - "8080:8080" + command: [ "python3", "main.py" ] + volumes: *common-volumes + restart: always diff --git a/docs/check.png b/docs/check.png new file mode 100644 index 0000000..f460f7f Binary files /dev/null and b/docs/check.png differ diff --git a/docs/img001.png b/docs/img001.png new file mode 100644 index 0000000..e645e7b Binary files /dev/null and b/docs/img001.png differ diff --git a/docs/img002.png b/docs/img002.png new file mode 100644 index 0000000..ac31b47 Binary files /dev/null and b/docs/img002.png differ diff --git a/docs/img003.png b/docs/img003.png new file mode 100644 index 0000000..dd6ee07 Binary files /dev/null and b/docs/img003.png differ diff --git a/docs/img004.png b/docs/img004.png new file mode 100644 index 0000000..4507169 Binary files /dev/null and b/docs/img004.png differ diff --git a/docs/img005.png b/docs/img005.png new file mode 100644 index 0000000..da4c2d9 Binary files /dev/null and b/docs/img005.png differ diff --git a/docs/img006.png b/docs/img006.png new file mode 100644 index 0000000..d6065c8 Binary files /dev/null and b/docs/img006.png differ diff --git a/docs/img007.png b/docs/img007.png new file mode 100644 index 0000000..0753978 Binary files /dev/null and b/docs/img007.png differ diff --git a/docs/index.png b/docs/index.png new file mode 100644 index 0000000..e13a59e Binary files /dev/null and b/docs/index.png differ diff --git a/docs/voice-list.txt b/docs/voice-list.txt new file mode 100644 index 0000000..4672117 --- /dev/null +++ b/docs/voice-list.txt @@ -0,0 +1,941 @@ +Name: af-ZA-AdriNeural +Gender: Female + +Name: af-ZA-WillemNeural +Gender: Male + +Name: am-ET-AmehaNeural +Gender: Male + +Name: am-ET-MekdesNeural +Gender: Female + +Name: ar-AE-FatimaNeural +Gender: Female + +Name: ar-AE-HamdanNeural +Gender: Male + +Name: ar-BH-AliNeural +Gender: Male + +Name: ar-BH-LailaNeural +Gender: Female + +Name: ar-DZ-AminaNeural +Gender: Female + +Name: ar-DZ-IsmaelNeural +Gender: Male + +Name: ar-EG-SalmaNeural +Gender: Female + +Name: ar-EG-ShakirNeural +Gender: Male + +Name: ar-IQ-BasselNeural +Gender: Male + +Name: ar-IQ-RanaNeural +Gender: Female + +Name: ar-JO-SanaNeural +Gender: Female + +Name: ar-JO-TaimNeural +Gender: Male + +Name: ar-KW-FahedNeural +Gender: Male + +Name: ar-KW-NouraNeural +Gender: Female + +Name: ar-LB-LaylaNeural +Gender: Female + +Name: ar-LB-RamiNeural +Gender: Male + +Name: ar-LY-ImanNeural +Gender: Female + +Name: ar-LY-OmarNeural +Gender: Male + +Name: ar-MA-JamalNeural +Gender: Male + +Name: ar-MA-MounaNeural +Gender: Female + +Name: ar-OM-AbdullahNeural +Gender: Male + +Name: ar-OM-AyshaNeural +Gender: Female + +Name: ar-QA-AmalNeural +Gender: Female + +Name: ar-QA-MoazNeural +Gender: Male + +Name: ar-SA-HamedNeural +Gender: Male + +Name: ar-SA-ZariyahNeural +Gender: Female + +Name: ar-SY-AmanyNeural +Gender: Female + +Name: ar-SY-LaithNeural +Gender: Male + +Name: ar-TN-HediNeural +Gender: Male + +Name: ar-TN-ReemNeural +Gender: Female + +Name: ar-YE-MaryamNeural +Gender: Female + +Name: ar-YE-SalehNeural +Gender: Male + +Name: az-AZ-BabekNeural +Gender: Male + +Name: az-AZ-BanuNeural +Gender: Female + +Name: bg-BG-BorislavNeural +Gender: Male + +Name: bg-BG-KalinaNeural +Gender: Female + +Name: bn-BD-NabanitaNeural +Gender: Female + +Name: bn-BD-PradeepNeural +Gender: Male + +Name: bn-IN-BashkarNeural +Gender: Male + +Name: bn-IN-TanishaaNeural +Gender: Female + +Name: bs-BA-GoranNeural +Gender: Male + +Name: bs-BA-VesnaNeural +Gender: Female + +Name: 
ca-ES-EnricNeural +Gender: Male + +Name: ca-ES-JoanaNeural +Gender: Female + +Name: cs-CZ-AntoninNeural +Gender: Male + +Name: cs-CZ-VlastaNeural +Gender: Female + +Name: cy-GB-AledNeural +Gender: Male + +Name: cy-GB-NiaNeural +Gender: Female + +Name: da-DK-ChristelNeural +Gender: Female + +Name: da-DK-JeppeNeural +Gender: Male + +Name: de-AT-IngridNeural +Gender: Female + +Name: de-AT-JonasNeural +Gender: Male + +Name: de-CH-JanNeural +Gender: Male + +Name: de-CH-LeniNeural +Gender: Female + +Name: de-DE-AmalaNeural +Gender: Female + +Name: de-DE-ConradNeural +Gender: Male + +Name: de-DE-FlorianMultilingualNeural +Gender: Male + +Name: de-DE-KatjaNeural +Gender: Female + +Name: de-DE-KillianNeural +Gender: Male + +Name: de-DE-SeraphinaMultilingualNeural +Gender: Female + +Name: el-GR-AthinaNeural +Gender: Female + +Name: el-GR-NestorasNeural +Gender: Male + +Name: en-AU-NatashaNeural +Gender: Female + +Name: en-AU-WilliamNeural +Gender: Male + +Name: en-CA-ClaraNeural +Gender: Female + +Name: en-CA-LiamNeural +Gender: Male + +Name: en-GB-LibbyNeural +Gender: Female + +Name: en-GB-MaisieNeural +Gender: Female + +Name: en-GB-RyanNeural +Gender: Male + +Name: en-GB-SoniaNeural +Gender: Female + +Name: en-GB-ThomasNeural +Gender: Male + +Name: en-HK-SamNeural +Gender: Male + +Name: en-HK-YanNeural +Gender: Female + +Name: en-IE-ConnorNeural +Gender: Male + +Name: en-IE-EmilyNeural +Gender: Female + +Name: en-IN-NeerjaExpressiveNeural +Gender: Female + +Name: en-IN-NeerjaNeural +Gender: Female + +Name: en-IN-PrabhatNeural +Gender: Male + +Name: en-KE-AsiliaNeural +Gender: Female + +Name: en-KE-ChilembaNeural +Gender: Male + +Name: en-NG-AbeoNeural +Gender: Male + +Name: en-NG-EzinneNeural +Gender: Female + +Name: en-NZ-MitchellNeural +Gender: Male + +Name: en-NZ-MollyNeural +Gender: Female + +Name: en-PH-JamesNeural +Gender: Male + +Name: en-PH-RosaNeural +Gender: Female + +Name: en-SG-LunaNeural +Gender: Female + +Name: en-SG-WayneNeural +Gender: Male + +Name: en-TZ-ElimuNeural +Gender: Male + +Name: en-TZ-ImaniNeural +Gender: Female + +Name: en-US-AnaNeural +Gender: Female + +Name: en-US-AndrewNeural +Gender: Male + +Name: en-US-AriaNeural +Gender: Female + +Name: en-US-AvaNeural +Gender: Female + +Name: en-US-BrianNeural +Gender: Male + +Name: en-US-ChristopherNeural +Gender: Male + +Name: en-US-EmmaNeural +Gender: Female + +Name: en-US-EricNeural +Gender: Male + +Name: en-US-GuyNeural +Gender: Male + +Name: en-US-JennyNeural +Gender: Female + +Name: en-US-MichelleNeural +Gender: Female + +Name: en-US-RogerNeural +Gender: Male + +Name: en-US-SteffanNeural +Gender: Male + +Name: en-ZA-LeahNeural +Gender: Female + +Name: en-ZA-LukeNeural +Gender: Male + +Name: es-AR-ElenaNeural +Gender: Female + +Name: es-AR-TomasNeural +Gender: Male + +Name: es-BO-MarceloNeural +Gender: Male + +Name: es-BO-SofiaNeural +Gender: Female + +Name: es-CL-CatalinaNeural +Gender: Female + +Name: es-CL-LorenzoNeural +Gender: Male + +Name: es-CO-GonzaloNeural +Gender: Male + +Name: es-CO-SalomeNeural +Gender: Female + +Name: es-CR-JuanNeural +Gender: Male + +Name: es-CR-MariaNeural +Gender: Female + +Name: es-CU-BelkysNeural +Gender: Female + +Name: es-CU-ManuelNeural +Gender: Male + +Name: es-DO-EmilioNeural +Gender: Male + +Name: es-DO-RamonaNeural +Gender: Female + +Name: es-EC-AndreaNeural +Gender: Female + +Name: es-EC-LuisNeural +Gender: Male + +Name: es-ES-AlvaroNeural +Gender: Male + +Name: es-ES-ElviraNeural +Gender: Female + +Name: es-ES-XimenaNeural +Gender: Female + +Name: es-GQ-JavierNeural +Gender: Male 
+ +Name: es-GQ-TeresaNeural +Gender: Female + +Name: es-GT-AndresNeural +Gender: Male + +Name: es-GT-MartaNeural +Gender: Female + +Name: es-HN-CarlosNeural +Gender: Male + +Name: es-HN-KarlaNeural +Gender: Female + +Name: es-MX-DaliaNeural +Gender: Female + +Name: es-MX-JorgeNeural +Gender: Male + +Name: es-NI-FedericoNeural +Gender: Male + +Name: es-NI-YolandaNeural +Gender: Female + +Name: es-PA-MargaritaNeural +Gender: Female + +Name: es-PA-RobertoNeural +Gender: Male + +Name: es-PE-AlexNeural +Gender: Male + +Name: es-PE-CamilaNeural +Gender: Female + +Name: es-PR-KarinaNeural +Gender: Female + +Name: es-PR-VictorNeural +Gender: Male + +Name: es-PY-MarioNeural +Gender: Male + +Name: es-PY-TaniaNeural +Gender: Female + +Name: es-SV-LorenaNeural +Gender: Female + +Name: es-SV-RodrigoNeural +Gender: Male + +Name: es-US-AlonsoNeural +Gender: Male + +Name: es-US-PalomaNeural +Gender: Female + +Name: es-UY-MateoNeural +Gender: Male + +Name: es-UY-ValentinaNeural +Gender: Female + +Name: es-VE-PaolaNeural +Gender: Female + +Name: es-VE-SebastianNeural +Gender: Male + +Name: et-EE-AnuNeural +Gender: Female + +Name: et-EE-KertNeural +Gender: Male + +Name: fa-IR-DilaraNeural +Gender: Female + +Name: fa-IR-FaridNeural +Gender: Male + +Name: fi-FI-HarriNeural +Gender: Male + +Name: fi-FI-NooraNeural +Gender: Female + +Name: fil-PH-AngeloNeural +Gender: Male + +Name: fil-PH-BlessicaNeural +Gender: Female + +Name: fr-BE-CharlineNeural +Gender: Female + +Name: fr-BE-GerardNeural +Gender: Male + +Name: fr-CA-AntoineNeural +Gender: Male + +Name: fr-CA-JeanNeural +Gender: Male + +Name: fr-CA-SylvieNeural +Gender: Female + +Name: fr-CA-ThierryNeural +Gender: Male + +Name: fr-CH-ArianeNeural +Gender: Female + +Name: fr-CH-FabriceNeural +Gender: Male + +Name: fr-FR-DeniseNeural +Gender: Female + +Name: fr-FR-EloiseNeural +Gender: Female + +Name: fr-FR-HenriNeural +Gender: Male + +Name: fr-FR-RemyMultilingualNeural +Gender: Male + +Name: fr-FR-VivienneMultilingualNeural +Gender: Female + +Name: ga-IE-ColmNeural +Gender: Male + +Name: ga-IE-OrlaNeural +Gender: Female + +Name: gl-ES-RoiNeural +Gender: Male + +Name: gl-ES-SabelaNeural +Gender: Female + +Name: gu-IN-DhwaniNeural +Gender: Female + +Name: gu-IN-NiranjanNeural +Gender: Male + +Name: he-IL-AvriNeural +Gender: Male + +Name: he-IL-HilaNeural +Gender: Female + +Name: hi-IN-MadhurNeural +Gender: Male + +Name: hi-IN-SwaraNeural +Gender: Female + +Name: hr-HR-GabrijelaNeural +Gender: Female + +Name: hr-HR-SreckoNeural +Gender: Male + +Name: hu-HU-NoemiNeural +Gender: Female + +Name: hu-HU-TamasNeural +Gender: Male + +Name: id-ID-ArdiNeural +Gender: Male + +Name: id-ID-GadisNeural +Gender: Female + +Name: is-IS-GudrunNeural +Gender: Female + +Name: is-IS-GunnarNeural +Gender: Male + +Name: it-IT-DiegoNeural +Gender: Male + +Name: it-IT-ElsaNeural +Gender: Female + +Name: it-IT-GiuseppeNeural +Gender: Male + +Name: it-IT-IsabellaNeural +Gender: Female + +Name: ja-JP-KeitaNeural +Gender: Male + +Name: ja-JP-NanamiNeural +Gender: Female + +Name: jv-ID-DimasNeural +Gender: Male + +Name: jv-ID-SitiNeural +Gender: Female + +Name: ka-GE-EkaNeural +Gender: Female + +Name: ka-GE-GiorgiNeural +Gender: Male + +Name: kk-KZ-AigulNeural +Gender: Female + +Name: kk-KZ-DauletNeural +Gender: Male + +Name: km-KH-PisethNeural +Gender: Male + +Name: km-KH-SreymomNeural +Gender: Female + +Name: kn-IN-GaganNeural +Gender: Male + +Name: kn-IN-SapnaNeural +Gender: Female + +Name: ko-KR-HyunsuNeural +Gender: Male + +Name: ko-KR-InJoonNeural +Gender: Male + +Name: 
ko-KR-SunHiNeural +Gender: Female + +Name: lo-LA-ChanthavongNeural +Gender: Male + +Name: lo-LA-KeomanyNeural +Gender: Female + +Name: lt-LT-LeonasNeural +Gender: Male + +Name: lt-LT-OnaNeural +Gender: Female + +Name: lv-LV-EveritaNeural +Gender: Female + +Name: lv-LV-NilsNeural +Gender: Male + +Name: mk-MK-AleksandarNeural +Gender: Male + +Name: mk-MK-MarijaNeural +Gender: Female + +Name: ml-IN-MidhunNeural +Gender: Male + +Name: ml-IN-SobhanaNeural +Gender: Female + +Name: mn-MN-BataaNeural +Gender: Male + +Name: mn-MN-YesuiNeural +Gender: Female + +Name: mr-IN-AarohiNeural +Gender: Female + +Name: mr-IN-ManoharNeural +Gender: Male + +Name: ms-MY-OsmanNeural +Gender: Male + +Name: ms-MY-YasminNeural +Gender: Female + +Name: mt-MT-GraceNeural +Gender: Female + +Name: mt-MT-JosephNeural +Gender: Male + +Name: my-MM-NilarNeural +Gender: Female + +Name: my-MM-ThihaNeural +Gender: Male + +Name: nb-NO-FinnNeural +Gender: Male + +Name: nb-NO-PernilleNeural +Gender: Female + +Name: ne-NP-HemkalaNeural +Gender: Female + +Name: ne-NP-SagarNeural +Gender: Male + +Name: nl-BE-ArnaudNeural +Gender: Male + +Name: nl-BE-DenaNeural +Gender: Female + +Name: nl-NL-ColetteNeural +Gender: Female + +Name: nl-NL-FennaNeural +Gender: Female + +Name: nl-NL-MaartenNeural +Gender: Male + +Name: pl-PL-MarekNeural +Gender: Male + +Name: pl-PL-ZofiaNeural +Gender: Female + +Name: ps-AF-GulNawazNeural +Gender: Male + +Name: ps-AF-LatifaNeural +Gender: Female + +Name: pt-BR-AntonioNeural +Gender: Male + +Name: pt-BR-FranciscaNeural +Gender: Female + +Name: pt-BR-ThalitaNeural +Gender: Female + +Name: pt-PT-DuarteNeural +Gender: Male + +Name: pt-PT-RaquelNeural +Gender: Female + +Name: ro-RO-AlinaNeural +Gender: Female + +Name: ro-RO-EmilNeural +Gender: Male + +Name: ru-RU-DmitryNeural +Gender: Male + +Name: ru-RU-SvetlanaNeural +Gender: Female + +Name: si-LK-SameeraNeural +Gender: Male + +Name: si-LK-ThiliniNeural +Gender: Female + +Name: sk-SK-LukasNeural +Gender: Male + +Name: sk-SK-ViktoriaNeural +Gender: Female + +Name: sl-SI-PetraNeural +Gender: Female + +Name: sl-SI-RokNeural +Gender: Male + +Name: so-SO-MuuseNeural +Gender: Male + +Name: so-SO-UbaxNeural +Gender: Female + +Name: sq-AL-AnilaNeural +Gender: Female + +Name: sq-AL-IlirNeural +Gender: Male + +Name: sr-RS-NicholasNeural +Gender: Male + +Name: sr-RS-SophieNeural +Gender: Female + +Name: su-ID-JajangNeural +Gender: Male + +Name: su-ID-TutiNeural +Gender: Female + +Name: sv-SE-MattiasNeural +Gender: Male + +Name: sv-SE-SofieNeural +Gender: Female + +Name: sw-KE-RafikiNeural +Gender: Male + +Name: sw-KE-ZuriNeural +Gender: Female + +Name: sw-TZ-DaudiNeural +Gender: Male + +Name: sw-TZ-RehemaNeural +Gender: Female + +Name: ta-IN-PallaviNeural +Gender: Female + +Name: ta-IN-ValluvarNeural +Gender: Male + +Name: ta-LK-KumarNeural +Gender: Male + +Name: ta-LK-SaranyaNeural +Gender: Female + +Name: ta-MY-KaniNeural +Gender: Female + +Name: ta-MY-SuryaNeural +Gender: Male + +Name: ta-SG-AnbuNeural +Gender: Male + +Name: ta-SG-VenbaNeural +Gender: Female + +Name: te-IN-MohanNeural +Gender: Male + +Name: te-IN-ShrutiNeural +Gender: Female + +Name: th-TH-NiwatNeural +Gender: Male + +Name: th-TH-PremwadeeNeural +Gender: Female + +Name: tr-TR-AhmetNeural +Gender: Male + +Name: tr-TR-EmelNeural +Gender: Female + +Name: uk-UA-OstapNeural +Gender: Male + +Name: uk-UA-PolinaNeural +Gender: Female + +Name: ur-IN-GulNeural +Gender: Female + +Name: ur-IN-SalmanNeural +Gender: Male + +Name: ur-PK-AsadNeural +Gender: Male + +Name: ur-PK-UzmaNeural +Gender: Female + +Name: 
uz-UZ-MadinaNeural +Gender: Female + +Name: uz-UZ-SardorNeural +Gender: Male + +Name: vi-VN-HoaiMyNeural +Gender: Female + +Name: vi-VN-NamMinhNeural +Gender: Male + +Name: zh-CN-XiaoxiaoNeural +Gender: Female + +Name: zh-CN-XiaoyiNeural +Gender: Female + +Name: zh-CN-YunjianNeural +Gender: Male + +Name: zh-CN-YunxiNeural +Gender: Male + +Name: zh-CN-YunxiaNeural +Gender: Male + +Name: zh-CN-YunyangNeural +Gender: Male + +Name: zh-CN-liaoning-XiaobeiNeural +Gender: Female + +Name: zh-CN-shaanxi-XiaoniNeural +Gender: Female + +Name: zh-HK-HiuGaaiNeural +Gender: Female + +Name: zh-HK-HiuMaanNeural +Gender: Female + +Name: zh-HK-WanLungNeural +Gender: Male + +Name: zh-TW-HsiaoChenNeural +Gender: Female + +Name: zh-TW-HsiaoYuNeural +Gender: Female + +Name: zh-TW-YunJheNeural +Gender: Male + +Name: zu-ZA-ThandoNeural +Gender: Female + +Name: zu-ZA-ThembaNeural +Gender: Male diff --git a/main.py b/main.py new file mode 100644 index 0000000..e84f32b --- /dev/null +++ b/main.py @@ -0,0 +1,16 @@ +import uvicorn +from loguru import logger + +from app.config import config + +if __name__ == "__main__": + logger.info( + "start server, docs: http://127.0.0.1:" + str(config.listen_port) + "/docs" + ) + uvicorn.run( + app="app.asgi:app", + host=config.listen_host, + port=config.listen_port, + reload=config.reload_debug, + log_level="warning", + ) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..65fca39 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,26 @@ +requests~=2.31.0 +moviepy~=2.0.0.dev2 +openai~=1.13.3 +faster-whisper~=1.0.1 +edge_tts~=6.1.10 +uvicorn~=0.27.1 +fastapi~=0.110.0 +tomli~=2.0.1 +streamlit~=1.33.0 +loguru~=0.7.2 +aiohttp~=3.9.3 +urllib3~=2.2.1 +pillow~=10.3.0 +pydantic~=2.6.3 +g4f~=0.3.0.4 +dashscope~=1.15.0 +google.generativeai>=0.7.2 +python-multipart~=0.0.9 +redis==5.0.3 +# if you use pillow~=10.3.0, you will get "PIL.Image' has no attribute 'ANTIALIAS'" error when resize video +# please install opencv-python to fix "PIL.Image' has no attribute 'ANTIALIAS'" error +opencv-python~=4.9.0.80 +# for azure speech +# https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/9-more-realistic-ai-voices-for-conversations-now-generally/ba-p/4099471 +azure-cognitiveservices-speech~=1.37.0 +git-changelog~=2.5.2 diff --git a/resource/public/index.html b/resource/public/index.html new file mode 100644 index 0000000..3115df0 --- /dev/null +++ b/resource/public/index.html @@ -0,0 +1,19 @@ + + + + + NarratoAI + + +

NarratoAI

+https://github.com/linyqh/NarratoAI

+ 只需提供一个视频 主题 或 关键词 ,就可以全自动生成视频文案、视频素材、视频字幕、视频背景音乐,然后合成一个高清的短视频。 +

+ +

+ Simply provide a topic or keyword for a video, and it will automatically generate the video copy, video materials, + video subtitles, and video background music before synthesizing a high-definition short video. +

+
+
\ No newline at end of file
diff --git a/webui.bat b/webui.bat
new file mode 100644
index 0000000..a8a1c00
--- /dev/null
+++ b/webui.bat
@@ -0,0 +1,43 @@
+@echo off
+set CURRENT_DIR=%CD%
+echo ***** Current directory: %CURRENT_DIR% *****
+set PYTHONPATH=%CURRENT_DIR%
+
+@echo off
+setlocal enabledelayedexpansion
+
+rem 创建链接和路径的数组
+set "urls_paths[0]=https://zenodo.org/records/13293144/files/MicrosoftYaHeiBold.ttc|.\resource\fonts"
+set "urls_paths[1]=https://zenodo.org/records/13293144/files/MicrosoftYaHeiNormal.ttc|.\resource\fonts"
+set "urls_paths[2]=https://zenodo.org/records/13293144/files/STHeitiLight.ttc|.\resource\fonts"
+set "urls_paths[3]=https://zenodo.org/records/13293144/files/STHeitiMedium.ttc|.\resource\fonts"
+set "urls_paths[4]=https://zenodo.org/records/13293144/files/UTM%%20Kabel%%20KT.ttf|.\resource\fonts"
+set "urls_paths[5]=https://zenodo.org/records/13293129/files/demo.mp4|.\resource\videos"
+set "urls_paths[6]=https://zenodo.org/records/13293150/files/output000.mp3|.\resource\songs"
+set "urls_paths[7]=https://zenodo.org/records/13293150/files/output001.mp3|.\resource\songs"
+set "urls_paths[8]=https://zenodo.org/records/13293150/files/output002.mp3|.\resource\songs"
+set "urls_paths[9]=https://zenodo.org/records/13293150/files/output003.mp3|.\resource\songs"
+set "urls_paths[10]=https://zenodo.org/records/13293150/files/output004.mp3|.\resource\songs"
+set "urls_paths[11]=https://zenodo.org/records/13293150/files/output005.mp3|.\resource\songs"
+set "urls_paths[12]=https://zenodo.org/records/13293150/files/output006.mp3|.\resource\songs"
+set "urls_paths[13]=https://zenodo.org/records/13293150/files/output007.mp3|.\resource\songs"
+set "urls_paths[14]=https://zenodo.org/records/13293150/files/output008.mp3|.\resource\songs"
+set "urls_paths[15]=https://zenodo.org/records/13293150/files/output009.mp3|.\resource\songs"
+set "urls_paths[16]=https://zenodo.org/records/13293150/files/output010.mp3|.\resource\songs"
+
+rem 循环下载所有文件并保存到指定路径
+for /L %%i in (0,1,16) do (
+    for /f "tokens=1,2 delims=|" %%a in ("!urls_paths[%%i]!") do (
+        if not exist "%%b" mkdir "%%b"
+        echo 正在下载 %%a 到 %%b
+        curl -o "%%b\%%~nxa" %%a
+    )
+)
+
+echo 所有文件已成功下载到指定目录
+endlocal
+pause
+
+
+rem set HF_ENDPOINT=https://hf-mirror.com
+streamlit run .\webui\Main.py --browser.gatherUsageStats=False --server.enableCORS=True
diff --git a/webui.sh b/webui.sh
new file mode 100644
index 0000000..5a723ae
--- /dev/null
+++ b/webui.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+# 从环境变量中加载VPN代理的配置URL
+vpn_proxy_url="$VPN_PROXY_URL"
+# 检查是否成功加载
+if [ -z "$vpn_proxy_url" ]; then
+    echo "VPN代理配置URL未设置,请检查环境变量VPN_PROXY_URL"
+    exit 1
+fi
+# 使用VPN代理进行一些操作,比如通过代理下载文件
+export http_proxy="$vpn_proxy_url"
+export https_proxy="$vpn_proxy_url"
+
+# 创建链接和路径的数组
+declare -A urls_paths=(
+    ["https://zenodo.org/records/13293144/files/MicrosoftYaHeiBold.ttc"]="./resource/fonts"
+    ["https://zenodo.org/records/13293144/files/MicrosoftYaHeiNormal.ttc"]="./resource/fonts"
+    ["https://zenodo.org/records/13293144/files/STHeitiLight.ttc"]="./resource/fonts"
+    ["https://zenodo.org/records/13293144/files/STHeitiMedium.ttc"]="./resource/fonts"
+    ["https://zenodo.org/records/13293144/files/UTM%20Kabel%20KT.ttf"]="./resource/fonts"
+    ["https://zenodo.org/records/13293129/files/demo.mp4"]="./resource/videos"
+    ["https://zenodo.org/records/13293150/files/output000.mp3"]="./resource/songs"
+    ["https://zenodo.org/records/13293150/files/output001.mp3"]="./resource/songs"
+    ["https://zenodo.org/records/13293150/files/output002.mp3"]="./resource/songs"
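    # (Editor's note, not part of the original commit.) The URL is used
    # verbatim below: basename yields the percent-encoded name, so the font
    # above is saved and existence-checked as "UTM%20Kabel%20KT.ttf". Unlike
    # the .bat script, bash does not expand "%" sequences, so no extra
    # escaping is needed here.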
["https://zenodo.org/records/13293150/files/output003.mp3"]="./resource/songs" + ["https://zenodo.org/records/13293150/files/output004.mp3"]="./resource/songs" + ["https://zenodo.org/records/13293150/files/output005.mp3"]="./resource/songs" + ["https://zenodo.org/records/13293150/files/output006.mp3"]="./resource/songs" + ["https://zenodo.org/records/13293150/files/output007.mp3"]="./resource/songs" + ["https://zenodo.org/records/13293150/files/output008.mp3"]="./resource/songs" + ["https://zenodo.org/records/13293150/files/output009.mp3"]="./resource/songs" + ["https://zenodo.org/records/13293150/files/output010.mp3"]="./resource/songs" + # 添加更多链接及其对应的路径 +) + +# 循环下载所有文件并保存到指定路径 +for url in "${!urls_paths[@]}"; do + output_dir="${urls_paths[$url]}" + mkdir -p "$output_dir" # 创建目录(如果不存在) + + # 提取文件名 + filename=$(basename "$url") + + # 检查文件是否已经存在 + if [ -f "$output_dir/$filename" ]; then + echo "文件 $filename 已经存在,跳过下载" + else + wget -P "$output_dir" "$url" & + fi +done + +# 等待所有下载完成 +wait + +echo "所有文件已成功下载到指定目录" + + +streamlit run ./webui/Main.py --browser.serverAddress="0.0.0.0" --server.enableCORS=True --browser.gatherUsageStats=False diff --git a/webui/Main.py b/webui/Main.py new file mode 100644 index 0000000..bc9feeb --- /dev/null +++ b/webui/Main.py @@ -0,0 +1,746 @@ +import sys +import os +import glob +import json +import datetime + +# 将项目的根目录添加到系统路径中,以允许从项目导入模块 +root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +if root_dir not in sys.path: + sys.path.append(root_dir) + print("******** sys.path ********") + print(sys.path) + print("") + +import streamlit as st + +import os +from uuid import uuid4 +import platform +import streamlit.components.v1 as components +from loguru import logger +from app.config import config + +st.set_page_config( + page_title="NarratoAI", + page_icon="📽️", + layout="wide", + initial_sidebar_state="auto", + menu_items={ + "Report a bug": "https://github.com/linyqh/NarratoAI/issues", + 'About': f"# NarratoAI:sunglasses: 📽️ \n #### Version: v{config.project_version} \n " + f"自动化影视解说视频详情请移步:https://github.com/linyqh/NarratoAI" + }, +) + +from app.models.const import FILE_TYPE_IMAGES, FILE_TYPE_VIDEOS +from app.models.schema import VideoClipParams, VideoAspect, VideoConcatMode +from app.services import task as tm, llm, voice, material +from app.utils import utils + +os.environ["HTTP_PROXY"] = config.proxy.get("http", "") or os.getenv("VPN_PROXY_URL", "") +os.environ["HTTPS_PROXY"] = config.proxy.get("https", "") or os.getenv("VPN_PROXY_URL", "") + +hide_streamlit_style = """ + +""" +st.markdown(hide_streamlit_style, unsafe_allow_html=True) +st.title(f"NarratoAI :sunglasses:📽️") +support_locales = [ + "zh-CN", + "zh-HK", + "zh-TW", + "de-DE", + "en-US", + "fr-FR", + "vi-VN", + "th-TH", +] +font_dir = os.path.join(root_dir, "resource", "fonts") +song_dir = os.path.join(root_dir, "resource", "songs") +i18n_dir = os.path.join(root_dir, "webui", "i18n") +config_file = os.path.join(root_dir, "webui", ".streamlit", "webui.toml") +system_locale = utils.get_system_locale() + +if 'video_subject' not in st.session_state: + st.session_state['video_subject'] = '' +if 'video_clip_json' not in st.session_state: + st.session_state['video_clip_json'] = '' +if 'video_plot' not in st.session_state: + st.session_state['video_plot'] = '' +if 'ui_language' not in st.session_state: + st.session_state['ui_language'] = config.ui.get("language", system_locale) + + +def get_all_fonts(): + fonts = [] + for root, dirs, files in os.walk(font_dir): + for file in files: + 
if file.endswith(".ttf") or file.endswith(".ttc"): + fonts.append(file) + fonts.sort() + return fonts + + +def get_all_songs(): + songs = [] + for root, dirs, files in os.walk(song_dir): + for file in files: + if file.endswith(".mp3"): + songs.append(file) + return songs + + +def open_task_folder(task_id): + try: + sys = platform.system() + path = os.path.join(root_dir, "storage", "tasks", task_id) + if os.path.exists(path): + if sys == 'Windows': + os.system(f"start {path}") + if sys == 'Darwin': + os.system(f"open {path}") + except Exception as e: + logger.error(e) + + +def scroll_to_bottom(): + js = f""" + + """ + st.components.v1.html(js, height=0, width=0) + + +def init_log(): + logger.remove() + _lvl = "DEBUG" + + def format_record(record): + # 获取日志记录中的文件全路径 + file_path = record["file"].path + # 将绝对路径转换为相对于项目根目录的路径 + relative_path = os.path.relpath(file_path, root_dir) + # 更新记录中的文件路径 + record["file"].path = f"./{relative_path}" + # 返回修改后的格式字符串 + # 您可以根据需要调整这里的格式 + record['message'] = record['message'].replace(root_dir, ".") + + _format = '{time:%Y-%m-%d %H:%M:%S} | ' + \ + '{level} | ' + \ + '"{file.path}:{line}": {function} ' + \ + '- {message}' + "\n" + return _format + + logger.add( + sys.stdout, + level=_lvl, + format=format_record, + colorize=True, + ) + + +init_log() + +locales = utils.load_locales(i18n_dir) + + +def tr(key): + loc = locales.get(st.session_state['ui_language'], {}) + return loc.get("Translation", {}).get(key, key) + + +st.write(tr("Get Help")) + +# 基础设置 +with st.expander(tr("Basic Settings"), expanded=False): + config_panels = st.columns(3) + left_config_panel = config_panels[0] + middle_config_panel = config_panels[1] + right_config_panel = config_panels[2] + with left_config_panel: + display_languages = [] + selected_index = 0 + for i, code in enumerate(locales.keys()): + display_languages.append(f"{code} - {locales[code].get('Language')}") + if code == st.session_state['ui_language']: + selected_index = i + + selected_language = st.selectbox(tr("Language"), options=display_languages, + index=selected_index) + if selected_language: + code = selected_language.split(" - ")[0].strip() + st.session_state['ui_language'] = code + config.ui['language'] = code + + with middle_config_panel: + # openai + # moonshot (月之暗面) + # oneapi + # g4f + # azure + # qwen (通义千问) + # gemini + # ollama + llm_providers = ['OpenAI', 'Moonshot', 'Azure', 'Qwen', 'Gemini', 'Ollama', 'G4f', 'OneAPI', "Cloudflare"] + saved_llm_provider = config.app.get("llm_provider", "OpenAI").lower() + saved_llm_provider_index = 0 + for i, provider in enumerate(llm_providers): + if provider.lower() == saved_llm_provider: + saved_llm_provider_index = i + break + + llm_provider = st.selectbox(tr("LLM Provider"), options=llm_providers, index=saved_llm_provider_index) + llm_provider = llm_provider.lower() + config.app["llm_provider"] = llm_provider + + llm_api_key = config.app.get(f"{llm_provider}_api_key", "") + llm_base_url = config.app.get(f"{llm_provider}_base_url", "") + llm_model_name = config.app.get(f"{llm_provider}_model_name", "") + llm_account_id = config.app.get(f"{llm_provider}_account_id", "") + st_llm_api_key = st.text_input(tr("API Key"), value=llm_api_key, type="password") + st_llm_base_url = st.text_input(tr("Base Url"), value=llm_base_url) + st_llm_model_name = st.text_input(tr("Model Name"), value=llm_model_name) + if st_llm_api_key: + config.app[f"{llm_provider}_api_key"] = st_llm_api_key + if st_llm_base_url: + config.app[f"{llm_provider}_base_url"] = st_llm_base_url + if 
+            config.app[f"{llm_provider}_model_name"] = st_llm_model_name
+
+        if llm_provider == 'cloudflare':
+            st_llm_account_id = st.text_input(tr("Account ID"), value=llm_account_id)
+            if st_llm_account_id:
+                config.app[f"{llm_provider}_account_id"] = st_llm_account_id
+
+    with right_config_panel:
+        pexels_api_keys = config.app.get("pexels_api_keys", [])
+        if isinstance(pexels_api_keys, str):
+            pexels_api_keys = [pexels_api_keys]
+        pexels_api_key = ", ".join(pexels_api_keys)
+
+        pexels_api_key = st.text_input(tr("Pexels API Key"), value=pexels_api_key, type="password")
+        pexels_api_key = pexels_api_key.replace(" ", "")
+        if pexels_api_key:
+            config.app["pexels_api_keys"] = pexels_api_key.split(",")
+
+panel = st.columns(3)
+left_panel = panel[0]
+middle_panel = panel[1]
+right_panel = panel[2]
+
+params = VideoClipParams()
+
+# Left panel
+with left_panel:
+    with st.container(border=True):
+        st.write(tr("Video Script Configuration"))
+        # Script language
+        video_languages = [
+            (tr("Auto Detect"), ""),
+        ]
+        for code in ["zh-CN", "zh-TW", "de-DE", "en-US", "vi-VN"]:
+            video_languages.append((code, code))
+
+        selected_index = st.selectbox(tr("Script Language"),
+                                      index=0,
+                                      options=range(len(video_languages)),  # indices are the internal option values
+                                      format_func=lambda x: video_languages[x][0]  # labels are shown to the user
+                                      )
+        params.video_language = video_languages[selected_index][1]
+
+        # Script files
+        suffix = "*.json"
+        scripts_dir = utils.script_dir()
+        files = glob.glob(os.path.join(scripts_dir, suffix))
+        script_list = []
+        for file in files:
+            script_list.append({
+                "name": os.path.basename(file),
+                "size": os.path.getsize(file),
+                "file": file,
+            })
+
+        script_path = [(tr("Auto Generate"), ""), ]
+        for code in [file['file'] for file in script_list]:
+            script_path.append((code, code))
+
+        selected_json2 = st.selectbox(tr("Script Files"),
+                                      index=0,
+                                      options=range(len(script_path)),  # indices are the internal option values
+                                      format_func=lambda x: script_path[x][0]  # labels are shown to the user
+                                      )
+        params.video_clip_json = script_path[selected_json2][1]
+        video_json_file = params.video_clip_json
+
+        # Video files
+        suffix = "*.mp4"
+        videos_dir = utils.video_dir()
+        files = glob.glob(os.path.join(videos_dir, suffix))
+        video_list = []
+        for file in files:
+            video_list.append({
+                "name": os.path.basename(file),
+                "size": os.path.getsize(file),
+                "file": file,
+            })
+
+        video_path = [(tr("None"), ""), ]
+        for code in [file['file'] for file in video_list]:
+            video_path.append((code, code))
+
+        selected_index2 = st.selectbox(tr("Video File"),
+                                       index=0,
+                                       options=range(len(video_path)),  # indices are the internal option values
+                                       format_func=lambda x: video_path[x][0]  # labels are shown to the user
+                                       )
+        params.video_origin_path = video_path[selected_index2][1]
+
+        # Plot description
+        video_plot = st.text_area(
+            tr("Plot Description"),
+            value=st.session_state['video_plot'],
+            height=180
+        )
+
+        if st.button(tr("Video Script Generate"), key="auto_generate_script"):
+            with st.spinner(tr("Video Script Generate")):
+                if video_json_file == "" and params.video_origin_path != "":
+                    script = llm.gemini_video2json(
+                        video_origin_name=os.path.basename(params.video_origin_path),
+                        video_origin_path=params.video_origin_path,
+                        video_plot=video_plot
+                    )
+                    st.session_state['video_clip_json'] = script
+                    # Strip an optional Markdown code fence around the JSON
+                    cleaned_string = script.strip().removeprefix("```json").removesuffix("```").strip()
+                    st.session_state['video_script_list'] = json.loads(cleaned_string)
+                else:
+                    with open(video_json_file, 'r', encoding='utf-8') as f:
+                        script = f.read()
+                    st.session_state['video_clip_json'] = script
+                    cleaned_string = script.strip().removeprefix("```json").removesuffix("```").strip()
+                    st.session_state['video_script_list'] = json.loads(cleaned_string)
+
+        video_clip_json_details = st.text_area(
+            tr("Video Script"),
+            value=st.session_state['video_clip_json'],
+            height=180
+        )
+
+        button_columns = st.columns(2)
+        with button_columns[0]:
+            if st.button(tr("Save Script"), key="auto_generate_terms", use_container_width=True):
+                if not video_clip_json_details:
+                    st.error(tr("请输入视频脚本"))
+                    st.stop()
+
+                with st.spinner(tr("保存脚本")):
+                    script_dir = utils.script_dir()
+                    # Current timestamp, e.g. 2024-0618-171820
+                    timestamp = datetime.datetime.now().strftime("%Y-%m%d-%H%M%S")
+                    save_path = os.path.join(script_dir, f"{timestamp}.json")
+
+                    # Try to parse the JSON input; single quotes are naively
+                    # normalized to double quotes and code fences are stripped
+                    input_json = str(video_clip_json_details).replace("'", '"')
+                    input_json = input_json.strip().removeprefix('```json').removesuffix('```').strip()
+                    try:
+                        data = json.loads(input_json)
+                    except json.JSONDecodeError:
+                        raise ValueError("视频脚本格式错误,请检查脚本是否符合 JSON 格式")
+
+                    # The script must be a list
+                    if not isinstance(data, list):
+                        raise ValueError("JSON is not a list")
+
+                    # Every list element must contain the required keys
+                    required_keys = {"picture", "timestamp", "narration"}
+                    for item in data:
+                        if not isinstance(item, dict):
+                            raise ValueError("List 元素不是字典")
+                        if not required_keys.issubset(item.keys()):
+                            raise ValueError("Dict 元素不包含必需的键")
+
+                    # Persist the validated script as a new JSON file
+                    with open(save_path, 'w', encoding='utf-8') as file:
+                        json.dump(data, file, ensure_ascii=False, indent=4)
+                    # Cache the parsed script in session_state
+                    st.session_state['video_script_list'] = data
+
+                    logger.debug(f"脚本内容已成功保存到 {save_path}")
+
+        with button_columns[1]:
+            if st.button(tr("Crop Video"), key="auto_crop_video", use_container_width=True):
+                with st.spinner(tr("裁剪视频中...")):
+                    st.session_state['task_id'] = str(uuid4())
+
+                    if st.session_state.get('video_script_list', None) is not None:
+                        video_script_list = st.session_state.video_script_list
+                        time_list = [i['timestamp'] for i in video_script_list]
+                        subclip_videos = material.clip_videos(
+                            task_id=st.session_state['task_id'],
+                            timestamp_terms=time_list,
+                            origin_video=params.video_origin_path
+                        )
+                        if subclip_videos is None:
+                            st.error(tr("裁剪视频失败"))
+                            st.stop()
+                        st.session_state['subclip_videos'] = subclip_videos
+                        for video_script in video_script_list:
+                            try:
+                                video_script['path'] = subclip_videos[video_script['timestamp']]
+                            except KeyError as e:
+                                st.error(f"裁剪视频失败: {e}")
+                        # logger.debug(f"当前的脚本为:{st.session_state.video_script_list}")
+                    else:
+                        st.error(tr("请先生成视频脚本"))
+
+# Middle panel
+with middle_panel:
+    with st.container(border=True):
+        st.write(tr("Video Settings"))
+        video_concat_modes = [
+            (tr("Sequential"), "sequential"),
+            (tr("Random"), "random"),
+        ]
+        # video_sources = [
+        #     (tr("Pexels"), "pexels"),
+        #     (tr("Pixabay"), "pixabay"),
+        #     (tr("Local file"), "local"),
+        #     (tr("TikTok"), "douyin"),
+        #     (tr("Bilibili"), "bilibili"),
+        #     (tr("Xiaohongshu"), "xiaohongshu"),
+        # ]
+        #
+        # saved_video_source_name = config.app.get("video_source", "pexels")
+        # saved_video_source_index = [v[1] for v in video_sources].index(
+        #     saved_video_source_name
+        # )
+        #
+        # selected_index = st.selectbox(
+        #     tr("Video Source"),
+        #     options=range(len(video_sources)),
+        #     format_func=lambda x: video_sources[x][0],
+        #     index=saved_video_source_index,
+        # )
+        # params.video_source = video_sources[selected_index][1]
+        # config.app["video_source"] = params.video_source
+        #
+        # if params.video_source == "local":
+        #     _supported_types = FILE_TYPE_VIDEOS + FILE_TYPE_IMAGES
+        #     uploaded_files = st.file_uploader(
+        #         "Upload Local Files",
+        #         type=["mp4", "mov", "avi", "flv", "mkv", "jpg", "jpeg", "png"],
+        #         accept_multiple_files=True,
+        #     )
+
tr("Video Concat Mode"), + index=1, + options=range(len(video_concat_modes)), # 使用索引作为内部选项值 + format_func=lambda x: video_concat_modes[x][0], # 显示给用户的是标签 + ) + params.video_concat_mode = VideoConcatMode( + video_concat_modes[selected_index][1] + ) + + video_aspect_ratios = [ + (tr("Portrait"), VideoAspect.portrait.value), + (tr("Landscape"), VideoAspect.landscape.value), + ] + selected_index = st.selectbox( + tr("Video Ratio"), + options=range(len(video_aspect_ratios)), # 使用索引作为内部选项值 + format_func=lambda x: video_aspect_ratios[x][0], # 显示给用户的是标签 + ) + params.video_aspect = VideoAspect(video_aspect_ratios[selected_index][1]) + + params.video_clip_duration = st.selectbox( + tr("Clip Duration"), options=[2, 3, 4, 5, 6, 7, 8, 9, 10], index=1 + ) + params.video_count = st.selectbox( + tr("Number of Videos Generated Simultaneously"), + options=[1, 2, 3, 4, 5], + index=0, + ) + with st.container(border=True): + st.write(tr("Audio Settings")) + + # tts_providers = ['edge', 'azure'] + # tts_provider = st.selectbox(tr("TTS Provider"), tts_providers) + + voices = voice.get_all_azure_voices(filter_locals=support_locales) + friendly_names = { + v: v.replace("Female", tr("Female")) + .replace("Male", tr("Male")) + .replace("Neural", "") + for v in voices + } + saved_voice_name = config.ui.get("voice_name", "") + saved_voice_name_index = 0 + if saved_voice_name in friendly_names: + saved_voice_name_index = list(friendly_names.keys()).index(saved_voice_name) + else: + for i, v in enumerate(voices): + if ( + v.lower().startswith(st.session_state["ui_language"].lower()) + and "V2" not in v + ): + saved_voice_name_index = i + break + + selected_friendly_name = st.selectbox( + tr("Speech Synthesis"), + options=list(friendly_names.values()), + index=saved_voice_name_index, + ) + + voice_name = list(friendly_names.keys())[ + list(friendly_names.values()).index(selected_friendly_name) + ] + params.voice_name = voice_name + config.ui["voice_name"] = voice_name + + if st.button(tr("Play Voice")): + play_content = params.video_subject + if not play_content: + play_content = params.video_script + if not play_content: + play_content = tr("Voice Example") + with st.spinner(tr("Synthesizing Voice")): + temp_dir = utils.storage_dir("temp", create=True) + audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3") + sub_maker = voice.tts( + text=play_content, + voice_name=voice_name, + voice_rate=params.voice_rate, + voice_file=audio_file, + ) + # if the voice file generation failed, try again with a default content. + if not sub_maker: + play_content = "This is a example voice. if you hear this, the voice synthesis failed with the original content." 
+                    sub_maker = voice.tts(
+                        text=play_content,
+                        voice_name=voice_name,
+                        voice_rate=params.voice_rate,
+                        voice_file=audio_file,
+                    )
+
+                if sub_maker and os.path.exists(audio_file):
+                    st.audio(audio_file, format="audio/mp3")
+                    if os.path.exists(audio_file):
+                        os.remove(audio_file)
+
+        if voice.is_azure_v2_voice(voice_name):
+            saved_azure_speech_region = config.azure.get("speech_region", "")
+            saved_azure_speech_key = config.azure.get("speech_key", "")
+            azure_speech_region = st.text_input(
+                tr("Speech Region"), value=saved_azure_speech_region
+            )
+            azure_speech_key = st.text_input(
+                tr("Speech Key"), value=saved_azure_speech_key, type="password"
+            )
+            config.azure["speech_region"] = azure_speech_region
+            config.azure["speech_key"] = azure_speech_key
+
+        params.voice_volume = st.selectbox(
+            tr("Speech Volume"),
+            options=[0.6, 0.8, 1.0, 1.2, 1.5, 2.0, 3.0, 4.0, 5.0],
+            index=2,
+        )
+
+        params.voice_rate = st.selectbox(
+            tr("Speech Rate"),
+            options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
+            index=2,
+        )
+
+        bgm_options = [
+            (tr("No Background Music"), ""),
+            (tr("Random Background Music"), "random"),
+            (tr("Custom Background Music"), "custom"),
+        ]
+        selected_index = st.selectbox(
+            tr("Background Music"),
+            index=1,
+            options=range(len(bgm_options)),  # indices are the internal option values
+            format_func=lambda x: bgm_options[x][0],  # labels are shown to the user
+        )
+        # Selected background music type
+        params.bgm_type = bgm_options[selected_index][1]
+
+        # Show the extra input only for custom background music
+        if params.bgm_type == "custom":
+            custom_bgm_file = st.text_input(tr("Custom Background Music File"))
+            if custom_bgm_file and os.path.exists(custom_bgm_file):
+                params.bgm_file = custom_bgm_file
+                # st.write(f":red[已选择自定义背景音乐]:**{custom_bgm_file}**")
+        params.bgm_volume = st.selectbox(
+            tr("Background Music Volume"),
+            options=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
+            index=2,
+        )
+
+# Right panel
+with right_panel:
+    with st.container(border=True):
+        st.write(tr("Subtitle Settings"))
+        params.subtitle_enabled = st.checkbox(tr("Enable Subtitles"), value=True)
+        font_names = get_all_fonts()
+        saved_font_name = config.ui.get("font_name", "")
+        saved_font_name_index = 0
+        if saved_font_name in font_names:
+            saved_font_name_index = font_names.index(saved_font_name)
+        params.font_name = st.selectbox(
+            tr("Font"), font_names, index=saved_font_name_index
+        )
+        config.ui["font_name"] = params.font_name
+
+        subtitle_positions = [
+            (tr("Top"), "top"),
+            (tr("Center"), "center"),
+            (tr("Bottom"), "bottom"),
+            (tr("Custom"), "custom"),
+        ]
+        selected_index = st.selectbox(
+            tr("Position"),
+            index=2,
+            options=range(len(subtitle_positions)),
+            format_func=lambda x: subtitle_positions[x][0],
+        )
+        params.subtitle_position = subtitle_positions[selected_index][1]
+
+        if params.subtitle_position == "custom":
+            custom_position = st.text_input(
+                tr("Custom Position (% from top)"), value="70.0"
+            )
+            try:
+                params.custom_position = float(custom_position)
+                if params.custom_position < 0 or params.custom_position > 100:
+                    st.error(tr("Please enter a value between 0 and 100"))
+            except ValueError:
+                st.error(tr("Please enter a valid number"))
+
+        font_cols = st.columns([0.3, 0.7])
+        with font_cols[0]:
+            saved_text_fore_color = config.ui.get("text_fore_color", "#FFFFFF")
+            params.text_fore_color = st.color_picker(
+                tr("Font Color"), saved_text_fore_color
+            )
+            config.ui["text_fore_color"] = params.text_fore_color
+
+        with font_cols[1]:
+            saved_font_size = config.ui.get("font_size", 60)
+            params.font_size = st.slider(tr("Font Size"), 30, 100, saved_font_size)
+            config.ui["font_size"] = params.font_size
+
+        stroke_cols = st.columns([0.3, 0.7])
+        with stroke_cols[0]:
+            params.stroke_color = st.color_picker(tr("Stroke Color"), "#000000")
+        with stroke_cols[1]:
+            params.stroke_width = st.slider(tr("Stroke Width"), 0.0, 10.0, 1.5)
+
+# Video review panel
+with st.expander(tr("视频审查"), expanded=False):
+    try:
+        video_list = st.session_state['video_script_list']
+    except KeyError:
+        video_list = []
+
+    # Work out the grid dimensions
+    num_videos = len(video_list)
+    cols_per_row = 3
+    rows = (num_videos + cols_per_row - 1) // cols_per_row  # row count, rounded up
+
+    # Render the clips in a grid of containers
+    for row in range(rows):
+        cols = st.columns(cols_per_row)
+        for col in range(cols_per_row):
+            index = row * cols_per_row + col
+            if index < num_videos:
+                with cols[col]:
+                    video_info = video_list[index]
+                    video_path = video_info.get('path')
+                    if video_path is not None:
+                        initial_narration = video_info['narration']
+                        initial_picture = video_info['picture']
+                        initial_timestamp = video_info['timestamp']
+
+                        with open(video_path, 'rb') as video_file:
+                            video_bytes = video_file.read()
+                        st.video(video_bytes)
+
+                        # Editable fields for this clip
+                        text_panels = st.columns(2)
+                        with text_panels[0]:
+                            text1 = st.text_area("时间戳", value=initial_timestamp, height=20)
+                        with text_panels[1]:
+                            text2 = st.text_area("画面描述", value=initial_picture, height=20)
+                        text3 = st.text_area("解说旁白", value=initial_narration, height=100)
+
+                        # Regeneration via the LLM is not implemented yet
+                        if st.button("重新生成", key=f"button_{index}"):
+                            pass
+                            # with st.spinner(tr("大模型生成中...")):
+
+start_button = st.button(tr("Generate Video"), use_container_width=True, type="primary")
+if start_button:
+    config.save_config()
+    task_id = st.session_state['task_id']
+    if not params.video_clip_json:
+        st.error(tr("脚本文件不能为空"))
+        scroll_to_bottom()
+        st.stop()
+    if not params.video_origin_path:
+        st.error(tr("视频文件不能为空"))
+        scroll_to_bottom()
+        st.stop()
+    if llm_provider != 'g4f' and not config.app.get(f"{llm_provider}_api_key", ""):
+        st.error(tr("请输入 LLM API 密钥"))
+        scroll_to_bottom()
+        st.stop()
+
+    log_container = st.empty()
+    log_records = []
+
+
+    def log_received(msg):
+        with log_container:
+            log_records.append(msg)
+            st.code("\n".join(log_records))
+
+
+    logger.add(log_received)
+
+    st.toast(tr("生成视频"))
+    logger.info(tr("开始生成视频"))
+    logger.info(utils.to_json(params))
+    scroll_to_bottom()
+
+    result = tm.start_subclip(task_id=task_id, params=params, subclip_path_videos=st.session_state.subclip_videos)
+
+    video_files = result.get("videos", [])
+    st.success(tr("视频生成完成"))
+    try:
+        if video_files:
+            # Center the video players
+            player_cols = st.columns(len(video_files) * 2 + 1)
+            for i, url in enumerate(video_files):
+                player_cols[i * 2 + 1].video(url)
+    except Exception:
+        pass
+
+    open_task_folder(task_id)
+    logger.info(tr("视频生成完成"))
+    scroll_to_bottom()
+
+config.save_config()
diff --git a/webui/i18n/de.json b/webui/i18n/de.json
new file mode 100644
index 0000000..7749e6f
--- /dev/null
+++ b/webui/i18n/de.json
@@ -0,0 +1,79 @@
+{
+  "Language": "German",
+  "Translation": {
+    "Video Script Settings": "**Drehbuch / Topic des Videos**",
+    "Video Subject": "Worum soll es in dem Video gehen? (Geben Sie ein Keyword an, :red[Dank KI wird automatisch ein Drehbuch generieren])",
+    "Script Language": "Welche Sprache soll zum Generieren von Drehbüchern verwendet werden? 
:red[KI generiert anhand dieses Begriffs das Drehbuch]", + "Generate Video Script and Keywords": "Klicken Sie hier, um mithilfe von KI ein [Video Drehbuch] und [Video Keywords] basierend auf dem **Keyword** zu generieren.", + "Auto Detect": "Automatisch erkennen", + "Video Script": "Drehbuch (Storybook) (:blue[① Optional, KI generiert ② Die richtige Zeichensetzung hilft bei der Erstellung von Untertiteln])", + "Generate Video Keywords": "Klicken Sie, um KI zum Generieren zu verwenden [Video Keywords] basierend auf dem **Drehbuch**", + "Please Enter the Video Subject": "Bitte geben Sie zuerst das Drehbuch an", + "Generating Video Script and Keywords": "KI generiert ein Drehbuch und Schlüsselwörter...", + "Generating Video Keywords": "AI is generating video keywords...", + "Video Keywords": "Video Schlüsselwörter (:blue[① Optional, KI generiert ② Verwende **, (Kommas)** zur Trennung der Wörter, in englischer Sprache])", + "Video Settings": "**Video Einstellungen**", + "Video Concat Mode": "Videoverkettungsmodus", + "Random": "Zufällige Verkettung (empfohlen)", + "Sequential": "Sequentielle Verkettung", + "Video Ratio": "Video-Seitenverhältnis", + "Portrait": "Portrait 9:16", + "Landscape": "Landschaft 16:9", + "Clip Duration": "Maximale Dauer einzelner Videoclips in sekunden", + "Number of Videos Generated Simultaneously": "Anzahl der parallel generierten Videos", + "Audio Settings": "**Audio Einstellungen**", + "Speech Synthesis": "Sprachausgabe", + "Speech Region": "Region(:red[Required,[Get Region](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", + "Speech Key": "API Key(:red[Required,[Get API Key](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", + "Speech Volume": "Lautstärke der Sprachausgabe", + "Speech Rate": "Lesegeschwindigkeit (1,0 bedeutet 1x)", + "Male": "Männlich", + "Female": "Weiblich", + "Background Music": "Hintergrundmusik", + "No Background Music": "Ohne Hintergrundmusik", + "Random Background Music": "Zufällig erzeugte Hintergrundmusik", + "Custom Background Music": "Benutzerdefinierte Hintergrundmusik", + "Custom Background Music File": "Bitte gib den Pfad zur Musikdatei an:", + "Background Music Volume": "Lautstärke: (0.2 entspricht 20%, sollte nicht zu laut sein)", + "Subtitle Settings": "**Untertitel-Einstellungen**", + "Enable Subtitles": "Untertitel aktivieren (Wenn diese Option deaktiviert ist, werden die Einstellungen nicht genutzt)", + "Font": "Schriftart des Untertitels", + "Position": "Ausrichtung des Untertitels", + "Top": "Oben", + "Center": "Mittig", + "Bottom": "Unten (empfohlen)", + "Custom": "Benutzerdefinierte Position (70, was 70% von oben bedeutet)", + "Font Size": "Schriftgröße für Untertitel", + "Font Color": "Schriftfarbe", + "Stroke Color": "Kontur", + "Stroke Width": "Breite der Untertitelkontur", + "Generate Video": "Generiere Videos durch KI", + "Video Script and Subject Cannot Both Be Empty": "Das Video-Thema und Drehbuch dürfen nicht beide leer sein", + "Generating Video": "Video wird erstellt, bitte warten...", + "Start Generating Video": "Beginne mit der Generierung", + "Video Generation Completed": "Video erfolgreich generiert", + "Video Generation Failed": "Video Generierung fehlgeschlagen", + "You can download the generated video from the following links": "Sie können das generierte Video über die folgenden Links herunterladen", + "Basic Settings": "**Grunde Instellungen**", + "Pexels API Key": "Pexels API Key ([Get API 
Key](https://www.pexels.com/api/))", + "Pixabay API Key": "Pixabay API Key ([Get API Key](https://pixabay.com/api/docs/#api_search_videos))", + "Language": "Language", + "LLM Provider": "LLM Provider", + "API Key": "API Key (:red[Required])", + "Base Url": "Base Url", + "Model Name": "Model Name", + "Please Enter the LLM API Key": "Please Enter the **LLM API Key**", + "Please Enter the Pexels API Key": "Please Enter the **Pexels API Key**", + "Please Enter the Pixabay API Key": "Please Enter the **Pixabay API Key**", + "Get Help": "If you need help, or have any questions, you can join discord for help ", + "Video Source": "Video Source", + "TikTok": "TikTok (TikTok support is coming soon)", + "Bilibili": "Bilibili (Bilibili support is coming soon)", + "Xiaohongshu": "Xiaohongshu (Xiaohongshu support is coming soon)", + "Local file": "Local file", + "Play Voice": "Play Voice", + "Voice Example": "This is an example text for testing speech synthesis", + "Synthesizing Voice": "Synthesizing voice, please wait...", + "TTS Provider": "Select the voice synthesis provider" + } +} \ No newline at end of file diff --git a/webui/i18n/en.json b/webui/i18n/en.json new file mode 100644 index 0000000..9e97c11 --- /dev/null +++ b/webui/i18n/en.json @@ -0,0 +1,81 @@ +{ + "Language": "English", + "Translation": { + "Video Script Settings": "**Video Script Settings**", + "Video Subject": "Video Subject (Provide a keyword, :red[AI will automatically generate] video script)", + "Script Language": "Language for Generating Video Script (AI will automatically output based on the language of your subject)", + "Generate Video Script and Keywords": "Click to use AI to generate [Video Script] and [Video Keywords] based on **subject**", + "Auto Detect": "Auto Detect", + "Video Script": "Video Script (:blue[① Optional, AI generated ② Proper punctuation helps with subtitle generation])", + "Generate Video Keywords": "Click to use AI to generate [Video Keywords] based on **script**", + "Please Enter the Video Subject": "Please Enter the Video Script First", + "Generating Video Script and Keywords": "AI is generating video script and keywords...", + "Generating Video Keywords": "AI is generating video keywords...", + "Video Keywords": "Video Keywords (:blue[① Optional, AI generated ② Use **English commas** for separation, English only])", + "Video Settings": "**Video Settings**", + "Video Concat Mode": "Video Concatenation Mode", + "Random": "Random Concatenation (Recommended)", + "Sequential": "Sequential Concatenation", + "Video Ratio": "Video Aspect Ratio", + "Portrait": "Portrait 9:16", + "Landscape": "Landscape 16:9", + "Clip Duration": "Maximum Duration of Video Clips (seconds)", + "Number of Videos Generated Simultaneously": "Number of Videos Generated Simultaneously", + "Audio Settings": "**Audio Settings**", + "Speech Synthesis": "Speech Synthesis Voice", + "Speech Region": "Region(:red[Required,[Get Region](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", + "Speech Key": "API Key(:red[Required,[Get API Key](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", + "Speech Volume": "Speech Volume (1.0 represents 100%)", + "Speech Rate": "Speech Rate (1.0 means 1x speed)", + "Male": "Male", + "Female": "Female", + "Background Music": "Background Music", + "No Background Music": "No Background Music", + "Random Background Music": "Random Background Music", + "Custom Background Music": "Custom Background Music", + 
"Custom Background Music File": "Please enter the file path for custom background music:", + "Background Music Volume": "Background Music Volume (0.2 represents 20%, background music should not be too loud)", + "Subtitle Settings": "**Subtitle Settings**", + "Enable Subtitles": "Enable Subtitles (If unchecked, the settings below will not take effect)", + "Font": "Subtitle Font", + "Position": "Subtitle Position", + "Top": "Top", + "Center": "Center", + "Bottom": "Bottom (Recommended)", + "Custom": "Custom position (70, indicating 70% down from the top)", + "Font Size": "Subtitle Font Size", + "Font Color": "Subtitle Font Color", + "Stroke Color": "Subtitle Outline Color", + "Stroke Width": "Subtitle Outline Width", + "Generate Video": "Generate Video", + "Video Script and Subject Cannot Both Be Empty": "Video Subject and Video Script cannot both be empty", + "Generating Video": "Generating video, please wait...", + "Start Generating Video": "Start Generating Video", + "Video Generation Completed": "Video Generation Completed", + "Video Generation Failed": "Video Generation Failed", + "You can download the generated video from the following links": "You can download the generated video from the following links", + "Pexels API Key": "Pexels API Key ([Get API Key](https://www.pexels.com/api/))", + "Pixabay API Key": "Pixabay API Key ([Get API Key](https://pixabay.com/api/docs/#api_search_videos))", + "Basic Settings": "**Basic Settings** (:blue[Click to expand])", + "Language": "Language", + "LLM Provider": "LLM Provider", + "API Key": "API Key (:red[Required])", + "Base Url": "Base Url", + "Account ID": "Account ID (Get from Cloudflare dashboard)", + "Model Name": "Model Name", + "Please Enter the LLM API Key": "Please Enter the **LLM API Key**", + "Please Enter the Pexels API Key": "Please Enter the **Pexels API Key**", + "Please Enter the Pixabay API Key": "Please Enter the **Pixabay API Key**", + "Get Help": "If you need help, or have any questions, you can join discord for help ", + "Video Source": "Video Source", + "TikTok": "TikTok (TikTok support is coming soon)", + "Bilibili": "Bilibili (Bilibili support is coming soon)", + "Xiaohongshu": "Xiaohongshu (Xiaohongshu support is coming soon)", + "Local file": "Local file", + "Play Voice": "Play Voice", + "Voice Example": "This is an example text for testing speech synthesis", + "Synthesizing Voice": "Synthesizing voice, please wait...", + "TTS Provider": "Select the voice synthesis provider", + "Hide Log": "Hide Log" + } +} \ No newline at end of file diff --git a/webui/i18n/vi.json b/webui/i18n/vi.json new file mode 100644 index 0000000..339d30e --- /dev/null +++ b/webui/i18n/vi.json @@ -0,0 +1,80 @@ +{ + "Language": "Tiếng Việt", + "Translation": { + "Video Script Settings": "**Cài Đặt Kịch Bản Video**", + "Video Subject": "Chủ Đề Video (Cung cấp một từ khóa, :red[AI sẽ tự động tạo ra] kịch bản video)", + "Script Language": "Ngôn Ngữ cho Việc Tạo Kịch Bản Video (AI sẽ tự động xuất ra dựa trên ngôn ngữ của chủ đề của bạn)", + "Generate Video Script and Keywords": "Nhấn để sử dụng AI để tạo [Kịch Bản Video] và [Từ Khóa Video] dựa trên **chủ đề**", + "Auto Detect": "Tự Động Phát Hiện", + "Video Script": "Kịch Bản Video (:blue[① Tùy chọn, AI tạo ra ② Dấu câu chính xác giúp việc tạo phụ đề)", + "Generate Video Keywords": "Nhấn để sử dụng AI để tạo [Từ Khóa Video] dựa trên **kịch bản**", + "Please Enter the Video Subject": "Vui lòng Nhập Kịch Bản Video Trước", + "Generating Video Script and Keywords": "AI đang tạo kịch bản video và từ 
khóa...", + "Generating Video Keywords": "AI đang tạo từ khóa video...", + "Video Keywords": "Từ Khóa Video (:blue[① Tùy chọn, AI tạo ra ② Sử dụng dấu phẩy **Tiếng Anh** để phân tách, chỉ sử dụng Tiếng Anh])", + "Video Settings": "**Cài Đặt Video**", + "Video Concat Mode": "Chế Độ Nối Video", + "Random": "Nối Ngẫu Nhiên (Được Khuyến Nghị)", + "Sequential": "Nối Theo Thứ Tự", + "Video Ratio": "Tỷ Lệ Khung Hình Video", + "Portrait": "Dọc 9:16", + "Landscape": "Ngang 16:9", + "Clip Duration": "Thời Lượng Tối Đa Của Đoạn Video (giây)", + "Number of Videos Generated Simultaneously": "Số Video Được Tạo Ra Đồng Thời", + "Audio Settings": "**Cài Đặt Âm Thanh**", + "Speech Synthesis": "Giọng Đọc Văn Bản", + "Speech Region": "Vùng(:red[Bắt Buộc,[Lấy Vùng](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", + "Speech Key": "Khóa API(:red[Bắt Buộc,[Lấy Khóa API](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", + "Speech Volume": "Âm Lượng Giọng Đọc (1.0 đại diện cho 100%)", + "Speech Rate": "Tốc độ đọc (1.0 biểu thị tốc độ gốc)", + "Male": "Nam", + "Female": "Nữ", + "Background Music": "Âm Nhạc Nền", + "No Background Music": "Không Có Âm Nhạc Nền", + "Random Background Music": "Âm Nhạc Nền Ngẫu Nhiên", + "Custom Background Music": "Âm Nhạc Nền Tùy Chỉnh", + "Custom Background Music File": "Vui lòng nhập đường dẫn tệp cho âm nhạc nền tùy chỉnh:", + "Background Music Volume": "Âm Lượng Âm Nhạc Nền (0.2 đại diện cho 20%, âm nhạc nền không nên quá to)", + "Subtitle Settings": "**Cài Đặt Phụ Đề**", + "Enable Subtitles": "Bật Phụ Đề (Nếu không chọn, các cài đặt dưới đây sẽ không có hiệu lực)", + "Font": "Phông Chữ Phụ Đề", + "Position": "Vị Trí Phụ Đề", + "Top": "Trên", + "Center": "Giữa", + "Bottom": "Dưới (Được Khuyến Nghị)", + "Custom": "Vị trí tùy chỉnh (70, chỉ ra là cách đầu trang 70%)", + "Font Size": "Cỡ Chữ Phụ Đề", + "Font Color": "Màu Chữ Phụ Đề", + "Stroke Color": "Màu Viền Phụ Đề", + "Stroke Width": "Độ Rộng Viền Phụ Đề", + "Generate Video": "Tạo Video", + "Video Script and Subject Cannot Both Be Empty": "Chủ Đề Video và Kịch Bản Video không thể cùng trống", + "Generating Video": "Đang tạo video, vui lòng đợi...", + "Start Generating Video": "Bắt Đầu Tạo Video", + "Video Generation Completed": "Hoàn Tất Tạo Video", + "Video Generation Failed": "Tạo Video Thất Bại", + "You can download the generated video from the following links": "Bạn có thể tải video được tạo ra từ các liên kết sau", + "Pexels API Key": "Khóa API Pexels ([Lấy Khóa API](https://www.pexels.com/api/))", + "Pixabay API Key": "Pixabay API Key ([Get API Key](https://pixabay.com/api/docs/#api_search_videos))", + "Basic Settings": "**Cài Đặt Cơ Bản** (:blue[Nhấp để mở rộng])", + "Language": "Ngôn Ngữ", + "LLM Provider": "Nhà Cung Cấp LLM", + "API Key": "Khóa API (:red[Bắt Buộc])", + "Base Url": "Url Cơ Bản", + "Account ID": "ID Tài Khoản (Lấy từ bảng điều khiển Cloudflare)", + "Model Name": "Tên Mô Hình", + "Please Enter the LLM API Key": "Vui lòng Nhập **Khóa API LLM**", + "Please Enter the Pexels API Key": "Vui lòng Nhập **Khóa API Pexels**", + "Please Enter the Pixabay API Key": "Vui lòng Nhập **Pixabay API Key**", + "Get Help": "Nếu bạn cần giúp đỡ hoặc có bất kỳ câu hỏi nào, bạn có thể tham gia discord để được giúp đỡ ", + "Video Source": "Video Source", + "TikTok": "TikTok (TikTok support is coming soon)", + "Bilibili": "Bilibili (Bilibili support is coming soon)", + "Xiaohongshu": "Xiaohongshu (Xiaohongshu support is 
coming soon)", + "Local file": "Local file", + "Play Voice": "Play Voice", + "Voice Example": "This is an example text for testing speech synthesis", + "Synthesizing Voice": "Synthesizing voice, please wait...", + "TTS Provider": "Select the voice synthesis provider" + } +} diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json new file mode 100644 index 0000000..e344987 --- /dev/null +++ b/webui/i18n/zh.json @@ -0,0 +1,88 @@ +{ + "Language": "简体中文", + "Translation": { + "Video Script Configuration": "**视频脚本配置**", + "Video Script Generate": "生成视频脚本", + "Video Subject": "视频主题(给定一个关键词,:red[AI自动生成]视频文案)", + "Script Language": "生成视频脚本的语言(一般情况AI会自动根据你输入的主题语言输出)", + "Script Files": "脚本文件", + "Generate Video Script and Keywords": "点击使用AI根据**主题**生成 【视频文案】 和 【视频关键词】", + "Auto Detect": "自动检测", + "Auto Generate": "自动生成", + "Video Script": "视频脚本(:blue[①可不填,使用AI生成 ②合理使用标点断句,有助于生成字幕])", + "Save Script": "保存脚本", + "Crop Video": "裁剪视频", + "Video File": "视频文件(:blue)", + "Plot Description": "剧情描述 (:blue[可从 https://www.tvmao.com/ 获取])", + "Generate Video Keywords": "点击使用AI根据**文案**生成【视频关键词】", + "Please Enter the Video Subject": "请先填写视频文案", + "Generating Video Script and Keywords": "AI正在生成视频文案和关键词...", + "Generating Video Keywords": "AI正在生成视频关键词...", + "Video Keywords": "视频关键词(:blue[①可不填,使用AI生成 ②用**英文逗号**分隔,只支持英文])", + "Video Settings": "**视频设置**", + "Video Concat Mode": "视频拼接模式", + "Random": "随机拼接(推荐)", + "Sequential": "顺序拼接", + "Video Ratio": "视频比例", + "Portrait": "竖屏 9:16(抖音视频)", + "Landscape": "横屏 16:9(西瓜视频)", + "Clip Duration": "视频片段最大时长(秒)(**不是视频总长度**,是指每个**合成片段**的长度)", + "Number of Videos Generated Simultaneously": "同时生成视频数量", + "Audio Settings": "**音频设置**", + "Speech Synthesis": "朗读声音(:red[**与文案语言保持一致**。注意:V2版效果更好,但是需要API KEY])", + "Speech Region": "服务区域 (:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", + "Speech Key": "API Key (:red[必填,密钥1 或 密钥2 均可 [点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", + "Speech Volume": "朗读音量(1.0表示100%)", + "Speech Rate": "朗读速度(1.0表示1倍速)", + "Male": "男性", + "Female": "女性", + "Background Music": "背景音乐", + "No Background Music": "无背景音乐", + "Random Background Music": "随机背景音乐", + "Custom Background Music": "自定义背景音乐", + "Custom Background Music File": "请输入自定义背景音乐的文件路径", + "Background Music Volume": "背景音乐音量(0.2表示20%,背景声音不宜过高)", + "Subtitle Settings": "**字幕设置**", + "Enable Subtitles": "启用字幕(若取消勾选,下面的设置都将不生效)", + "Font": "字幕字体", + "Position": "字幕位置", + "Top": "顶部", + "Center": "中间", + "Bottom": "底部(推荐)", + "Custom": "自定义位置(70,表示离顶部70%的位置)", + "Font Size": "字幕大小", + "Font Color": "字幕颜色", + "Stroke Color": "描边颜色", + "Stroke Width": "描边粗细", + "Generate Video": "生成视频", + "Video Script and Subject Cannot Both Be Empty": "视频主题 和 视频文案,不能同时为空", + "Generating Video": "正在生成视频,请稍候...", + "Start Generating Video": "开始生成视频", + "Video Generation Completed": "视频生成完成", + "Video Generation Failed": "视频生成失败", + "You can download the generated video from the following links": "你可以从以下链接下载生成的视频", + "Basic Settings": "**基础设置** (:blue[点击展开])", + "Language": "界面语言", + "Pexels API Key": "Pexels API Key ([点击获取](https://www.pexels.com/api/)) :red[推荐使用]", + "Pixabay API Key": "Pixabay API Key ([点击获取](https://pixabay.com/api/docs/#api_search_videos)) :red[可以不用配置,如果 Pexels 无法使用,再选择Pixabay]", + "LLM Provider": "大模型提供商", + "API Key": "API Key (:red[必填,需要到大模型提供商的后台申请])", + "Base Url": "Base Url (可选)", + "Account ID": "账户ID (Cloudflare的dash面板url中获取)", + "Model Name": "模型名称 
(:blue[需要到大模型提供商的后台确认被授权的模型名称])", + "Please Enter the LLM API Key": "请先填写大模型 **API Key**", + "Please Enter the Pexels API Key": "请先填写 **Pexels API Key**", + "Please Enter the Pixabay API Key": "请先填写 **Pixabay API Key**", + "Get Help": "一站式 AI 影视解说+自动化剪辑工具\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\n有任何问题或建议,可以加入 **社区频道** 求助或讨论:https://discord.gg/WBKChhmZ", + "Video Source": "视频来源", + "TikTok": "抖音 (TikTok 支持中,敬请期待)", + "Bilibili": "哔哩哔哩 (Bilibili 支持中,敬请期待)", + "Xiaohongshu": "小红书 (Xiaohongshu 支持中,敬请期待)", + "Local file": "本地文件", + "Play Voice": "试听语音合成", + "Voice Example": "这是一段测试语音合成的示例文本", + "Synthesizing Voice": "语音合成中,请稍候...", + "TTS Provider": "语音合成提供商", + "Hide Log": "隐藏日志" + } +} \ No newline at end of file
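For reference, the clip script that webui/Main.py generates, saves, and crops against is a JSON list whose entries each carry the keys picture, timestamp, and narration (see the required_keys check in the Save Script handler). The sketch below is a minimal, self-contained restatement of that validation with a hypothetical sample entry; the field values are invented for illustration, and the real script comes from llm.gemini_video2json or a saved file under resource/scripts.

import json

# Hypothetical sample entry; real scripts are produced by the LLM step.
SAMPLE_SCRIPT = '''
[
    {
        "picture": "A man walks into an empty train station",
        "timestamp": "00:00:05-00:00:12",
        "narration": "Our story begins on a quiet morning..."
    }
]
'''

REQUIRED_KEYS = {"picture", "timestamp", "narration"}


def validate_script(raw: str) -> list:
    """Parse and validate a narration script, mirroring the checks in webui/Main.py."""
    # Strip an optional Markdown code fence before parsing.
    cleaned = raw.strip().removeprefix("```json").removesuffix("```").strip()
    data = json.loads(cleaned)
    if not isinstance(data, list):
        raise ValueError("JSON is not a list")
    for item in data:
        if not isinstance(item, dict) or not REQUIRED_KEYS.issubset(item):
            raise ValueError("every entry must be a dict with picture/timestamp/narration")
    return data


if __name__ == "__main__":
    print(validate_script(SAMPLE_SCRIPT))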