From 11bd56bd02727374e5346a3995141cd73f367ca2 Mon Sep 17 00:00:00 2001 From: linyq Date: Sat, 14 Sep 2024 18:39:01 +0800 Subject: [PATCH 01/21] =?UTF-8?q?##=20=E6=9C=AC=E6=AC=A1=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=EF=BC=9A=20-=20=E6=96=B0=E5=A2=9E=E6=92=AD=E6=94=BE=E5=8E=9F?= =?UTF-8?q?=E5=A3=B0ost=E9=85=8D=E7=BD=AE=EF=BC=9B=20-=20=E4=BC=98?= =?UTF-8?q?=E5=8C=96=E8=A7=A3=E8=AF=B4=E6=8F=90=E7=A4=BA=E8=AF=8D=20-=20?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=A7=86=E9=A2=91=E5=90=88=E6=88=90=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=EF=BC=8C=E6=94=AF=E6=8C=81=E6=92=AD=E6=94=BE=E5=8E=9F?= =?UTF-8?q?=E5=A3=B0=E8=A7=86=E9=A2=91=20-=20=E6=96=B0=E5=A2=9E=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E5=BD=93=E5=89=8D=E7=BD=91=E7=BB=9C=E5=8C=BA=E5=9F=9F?= =?UTF-8?q?=20-=20=E4=BC=98=E5=8C=96readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README-zh.md | 5 +- README.md | 2 +- app/services/llm.py | 162 ++++++++++++++++++++---------------------- app/services/task.py | 15 ++-- app/services/video.py | 10 ++- app/services/voice.py | 127 +++++++++++++++++---------------- app/utils/utils.py | 26 ++++++- webui/Main.py | 2 + 8 files changed, 188 insertions(+), 161 deletions(-) diff --git a/README-zh.md b/README-zh.md index 88d3843..26dfd10 100644 --- a/README-zh.md +++ b/README-zh.md @@ -56,8 +56,9 @@ NarratoAI 是一个自动化影视解说工具,基于LLM实现文案撰写、 2. 点击 `Get API Key` 申请 API Key 3. 申请的 API Key 填入 `config.example.toml` 文件中的 `gemini_api_key` 配置 -### 配置 proxy VPN -> 配置vpn的方法不限,只要能正常访问 Google 网络即可,本文采用的是 chash +### 配置 proxy VPN 😑 +> 配置vpn的方法不限,只要能正常访问 Google 网络即可,本文采用的是 [clash](https://github.com/Z-Siqi/Clash-for-Windows_Chinese) \ +> 最近发现 clash 非常不稳定,在对比后发现 [v2rayN](https://github.com/2dust/v2rayN) 要好用一些 👍 1. 记住 clash 服务的端口,一般为 `http://127.0.0.1:7890` 2. 若端口不为 `7890`,请修改 `docker-compose.yml` 文件中的 `VPN_PROXY_URL` 为你的代理地址 ```yaml diff --git a/README.md b/README.md index 195ece5..e874f1c 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Note: 3. 
Enter the obtained API Key into the `gemini_api_key` setting in the `config.example.toml` file. ### Configure Proxy VPN -> The method to configure VPN is not restricted, as long as you can access Google's network. Here, `clash` is used as an example. +> The method to configure VPN is not restricted, as long as you can access Google's network. Here, [clash](https://github.com/Z-Siqi/Clash-for-Windows_Chinese) is used as an example. 1. Note the port of the clash service, usually `http://127.0.0.1:7890`. 2. If the port is not `7890`, modify the `VPN_PROXY_URL` in the `docker-compose.yml` file to your proxy address. ```yaml diff --git a/app/services/llm.py b/app/services/llm.py index c9e4ac4..c033ab5 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -405,44 +405,49 @@ def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot model = gemini.GenerativeModel(model_name=model_name) prompt = """ -# 角色设定: -你是一位影视解说专家,擅长根据剧情描述视频的画面和故事生成一段有趣且吸引人的解说文案。你特别熟悉 tiktok/抖音 风格的影视解说文案创作。 +**角色设定:** +你是一位影视解说专家,擅长根据剧情生成引人入胜的短视频解说文案,特别熟悉适用于TikTok/抖音风格的快速、抓人视频解说。 -# 任务目标: -1. 根据给定的剧情描述,详细描述视频画面并展开叙述,尤其是对重要画面进行细致刻画。 -2. 生成风格符合 tiktok/抖音 的影视解说文案,使其节奏快、内容抓人。 -3. 最终结果以 JSON 格式输出,字段包含: - • "picture":画面描述 - • "timestamp":时间戳(表示画面出现的时间-画面结束的时间) - • "narration":对应的解说文案 +**任务目标:** +1. 根据给定剧情,详细描述画面,重点突出重要场景和情节。 +2. 生成符合TikTok/抖音风格的解说,节奏紧凑,语言简洁,吸引观众。 +3. 解说的时候需要解说一段播放一段原视频,原视频一般为有台词的片段,原视频的控制有 OST 字段控制。 +4. 
结果输出为JSON格式,包含字段: + - "picture":画面描述 + - "timestamp":画面出现的时间范围 + - "narration":解说内容 + - "OST": 是否开启原声(true / false) -# 输入示例: -```text -在一个黑暗的小巷中,主角缓慢走进,四周静谧无声,只有远处隐隐传来猫的叫声。突然,背后出现一个神秘的身影。 -``` +**输入示例:** +```text +在一个黑暗的小巷中,主角缓慢走进,四周静谧无声,只有远处隐隐传来猫的叫声。突然,背后出现一个神秘的身影。 +``` + +**输出格式:** +```json +[ + { + "picture": "黑暗的小巷,主角缓慢走入,四周安静,远处传来猫叫声。", + "timestamp": "00:00-00:17", + "narration": "静谧的小巷里,主角步步前行,气氛渐渐变得压抑。" + "OST": False + }, + { + "picture": "神秘身影突然出现,紧张气氛加剧。", + "timestamp": "00:17-00:39", + "narration": "原声播放" + "OST": True + } +] +``` + +**提示:** +- 文案要简短有力,契合短视频平台用户的观赏习惯。 +- 保持强烈的悬念和情感代入,吸引观众继续观看。 +- 解说一段后播放一段原声,原声内容尽量和解说匹配。 +- 文案语言为:%s +- 剧情内容:%s (为空则忽略) -# 输出格式: -```json -[ - { - "picture": "黑暗的小巷中,主角缓慢走进,四周静谧无声,远处有模糊的猫叫声。", - "timestamp": "00:00-00:17", - "narration": "昏暗的小巷里,他独自前行,空气中透着一丝不安,隐约中能听到远处的猫叫声。 " - }, - { - "picture": "主角背后突然出现一个神秘的身影,气氛骤然紧张。", - "timestamp": "00:17-00:39", - "narration": "就在他以为安全时,一个身影悄无声息地出现在他身后,危险一步步逼近! " - } - ... -] -``` -# 提示: - - 生成的解说文案应简洁有力,符合短视频平台用户的偏好。 - - 叙述中应有强烈的代入感和悬念,以吸引观众持续观看。 - - 文案语言为:%s - - 剧情内容如下:%s (若为空则忽略) - """ % (language, video_plot) logger.debug(f"视频名称: {video_origin_name}") @@ -472,59 +477,46 @@ def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot if __name__ == "__main__": - video_subject = "摔跤吧!爸爸 Dangal" - video_path = "/NarratoAI/resource/videos/test.mp4" - video_plot = ''' -马哈维亚(阿米尔·汗 Aamir Khan 饰)曾经是一名前途无量的摔跤运动员,在放弃了职业生涯后,他最大的遗憾就是没有能够替国家赢得金牌。马哈维亚将这份希望寄托在了尚未出生的儿子身上,哪知道妻子接连给他生了两个女儿,取名吉塔(法缇玛·萨那·纱卡 Fatima Sana Shaikh 饰)和巴比塔(桑亚·玛荷塔 Sanya Malhotra 饰)。让马哈维亚没有想到的是,两个姑娘展现出了杰出的摔跤天赋,让他幡然醒悟,就算是女孩,也能够昂首挺胸的站在比赛场上,为了国家和她们自己赢得荣誉。 -就这样,在马哈维亚的指导下,吉塔和巴比塔开始了艰苦的训练,两人进步神速,很快就因为在比赛中连连获胜而成为了当地的名人。为了获得更多的机会,吉塔进入了国家体育学院学习,在那里,她将面对更大的诱惑和更多的选择。 -''' + """ + File API 可让您为每个项目存储最多 20 GB 的文件,每个项目使用 每个文件的大小上限为 2 GB。文件会存储 48 小时。 + 它们可以是 在此期间使用您的 API 密钥访问,但无法下载 使用任何 API。它已在使用 Gemini 的所有地区免费提供 API 可用。 + """ + import os + import sys + import requests + from app.utils.utils import get_current_country + 
+ # # 添加当前目录到系统路径 + # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + # + video_subject = "卖菜大妈竟是皇嫂" + video_path = "/NarratoAI/resource/videos/demoyasuo.mp4" + + video_plot = ''' ''' language = "zh-CN" res = gemini_video2json(video_subject, video_path, video_plot, language) print(res) - # video_subject = "生命的意义是什么" - # script = generate_script( - # video_subject=video_subject, language="zh-CN", paragraph_number=1 - # ) - # print("######################") - # print(script) - # search_terms = generate_terms( - # video_subject=video_subject, video_script=script, amount=5 - # ) - # print("######################") - # print(search_terms) - # prompt = """ - # # Role: 影视解说专家 + # get_current_country() + # api_key = config.app.get("gemini_api_key") + # model_name = config.app.get("gemini_model_name") + # gemini.configure(api_key=api_key) + # model = gemini.GenerativeModel(model_name=model_name) + # # 卖菜大妈竟是皇嫂 测试视频 + # video_name = "files/y3npkshvldsd" + # video_file = gemini.get_file(video_name) + # logger.debug(f"视频当前状态(ACTIVE才可用): {video_file.state.name}") # - # ## Background: - # 擅长根据剧情描述视频的画面和故事,能够生成一段非常有趣的解说文案。 + # # 转录视频并提供视觉说明 + # prompt = "Transcribe the audio, giving timestamps. Also provide visual descriptions. use ZH-CN ONLY" + # # Make the LLM request. + # print("发出 LLM 推理请求...") + # streams = model.generate_content([prompt, video_file], + # request_options={"timeout": 600}, + # stream=True) + # response = [] + # for chunk in streams: + # response.append(chunk.text) # - # ## Goals: - # 1. 根据剧情描述视频的画面和故事,并对重要的画面进行展开叙述 - # 2. 根据剧情内容,生成符合 tiktok/抖音 风格的影视解说文案 - # 3. 将结果直接以json格式输出给用户,需要包含字段: picture 画面描述, timestamp 时间戳, narration 解说文案 - # 4. 
剧情内容如下:{%s} - # - # ## Skills - # - 精通 tiktok/抖音 等短视频影视解说文案撰写 - # - 能够理解视频中的故事和画面表现 - # - 能精准匹配视频中的画面和时间戳 - # - 能精准把控旁白和时长 - # - 精通中文 - # - 精通JSON数据格式 - # - # ## Constrains - # - 解说文案的时长要和时间戳的时长尽量匹配 - # - 忽略视频中关于广告的内容 - # - 忽略视频中片头和片尾 - # - 不得在脚本中包含任何类型的 Markdown 或格式 - # - # ## Format - # - 对应JSON的key为:picture, timestamp, narration - # - # # Initialization: - # - video subject: {video_subject} - # - number of paragraphs: {paragraph_number} - # """.strip() - # if language: - # prompt += f"\n- language: {language}" + # response = "".join(response) + # logger.success(f"llm response: \n{response}") diff --git a/app/services/task.py b/app/services/task.py index 5735731..41070e7 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -346,6 +346,7 @@ def start_subclip(task_id, params: VideoClipParams, subclip_path_videos): with open(video_script_path, "r", encoding="utf-8") as f: list_script = json.load(f) video_list = [i['narration'] for i in list_script] + video_ost = [i['OST'] for i in list_script] time_list = [i['timestamp'] for i in list_script] video_script = " ".join(video_list) @@ -421,12 +422,14 @@ def start_subclip(task_id, params: VideoClipParams, subclip_path_videos): index = i + 1 combined_video_path = path.join(utils.task_dir(task_id), f"combined-{index}.mp4") logger.info(f"\n\n## 5. 
合并视频: {index} => {combined_video_path}") - video.combine_clip_videos(combined_video_path=combined_video_path, - video_paths=subclip_videos, - video_script_list=video_list, - audio_file=audio_file, - video_aspect=params.video_aspect, - threads=n_threads) + video.combine_clip_videos( + combined_video_path=combined_video_path, + video_paths=subclip_videos, + video_ost_list=video_ost, + audio_file=audio_file, + video_aspect=params.video_aspect, + threads=n_threads + ) _progress += 50 / params.video_count / 2 sm.state.update_task(task_id, progress=_progress) diff --git a/app/services/video.py b/app/services/video.py index 3daf92f..d5cc18f 100644 --- a/app/services/video.py +++ b/app/services/video.py @@ -352,7 +352,7 @@ def preprocess_video(materials: List[MaterialInfo], clip_duration=4): def combine_clip_videos(combined_video_path: str, video_paths: List[str], - video_script_list: List[str], + video_ost_list: List[str], audio_file: str, video_aspect: VideoAspect = VideoAspect.portrait, threads: int = 2, @@ -385,8 +385,12 @@ def combine_clip_videos(combined_video_path: str, video_duration = 0 # 一遍又一遍地添加下载的剪辑,直到达到音频的持续时间 (max_duration) while video_duration < audio_duration: - for video_path, video_script in zip(video_paths, video_script_list): - clip = VideoFileClip(video_path).without_audio() + for video_path, video_ost in zip(video_paths, video_ost_list): + clip = VideoFileClip(video_path) + if video_ost: + clip = clip.set_audio(audio_clip) + else: + clip = clip.set_audio(audio_clip).without_audio() # 检查剪辑是否比剩余音频长 if (audio_duration - video_duration) < clip.duration: clip = clip.subclip(0, (audio_duration - video_duration)) diff --git a/app/services/voice.py b/app/services/voice.py index 287e22d..ec07bad 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1288,67 +1288,68 @@ if __name__ == "__main__": voice_name = parse_voice_name(voice_name) voice_name = is_azure_v2_voice(voice_name) print(voice_name) + a = tts("预计未来3天深圳冷空气活动频繁, 等待5个字,,,,,,5个字结束", 
"zh-CN-YunyangNeural", 1.2, "/NarratoAI/test123.mp3") + print(a) + # voices = get_all_azure_voices() + # print(len(voices)) - voices = get_all_azure_voices() - print(len(voices)) - - async def _do(): - temp_dir = utils.storage_dir("temp") - - voice_names = [ - "zh-CN-XiaoxiaoMultilingualNeural", - # 女性 - "zh-CN-XiaoxiaoNeural", - "zh-CN-XiaoyiNeural", - # 男性 - "zh-CN-YunyangNeural", - "zh-CN-YunxiNeural", - ] - text = """ - 静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人,表达了他对家乡和亲人的深深思念之情。全诗内容是:“床前明月光,疑是地上霜。举头望明月,低头思故乡。”在这短短的四句诗中,诗人通过“明月”和“思故乡”的意象,巧妙地表达了离乡背井人的孤独与哀愁。首句“床前明月光”设景立意,通过明亮的月光引出诗人的遐想;“疑是地上霜”增添了夜晚的寒冷感,加深了诗人的孤寂之情;“举头望明月”和“低头思故乡”则是情感的升华,展现了诗人内心深处的乡愁和对家的渴望。这首诗简洁明快,情感真挚,是中国古典诗歌中非常著名的一首,也深受后人喜爱和推崇。 - """ - - text = """ - What is the meaning of life? This question has puzzled philosophers, scientists, and thinkers of all kinds for centuries. Throughout history, various cultures and individuals have come up with their interpretations and beliefs around the purpose of life. Some say it's to seek happiness and self-fulfillment, while others believe it's about contributing to the welfare of others and making a positive impact in the world. Despite the myriad of perspectives, one thing remains clear: the meaning of life is a deeply personal concept that varies from one person to another. It's an existential inquiry that encourages us to reflect on our values, desires, and the essence of our existence. - """ - - text = """ - 预计未来3天深圳冷空气活动频繁,未来两天持续阴天有小雨,出门带好雨具; - 10-11日持续阴天有小雨,日温差小,气温在13-17℃之间,体感阴凉; - 12日天气短暂好转,早晚清凉; - """ - - text = "[Opening scene: A sunny day in a suburban neighborhood. A young boy named Alex, around 8 years old, is playing in his front yard with his loyal dog, Buddy.]\n\n[Camera zooms in on Alex as he throws a ball for Buddy to fetch. Buddy excitedly runs after it and brings it back to Alex.]\n\nAlex: Good boy, Buddy! 
You're the best dog ever!\n\n[Buddy barks happily and wags his tail.]\n\n[As Alex and Buddy continue playing, a series of potential dangers loom nearby, such as a stray dog approaching, a ball rolling towards the street, and a suspicious-looking stranger walking by.]\n\nAlex: Uh oh, Buddy, look out!\n\n[Buddy senses the danger and immediately springs into action. He barks loudly at the stray dog, scaring it away. Then, he rushes to retrieve the ball before it reaches the street and gently nudges it back towards Alex. Finally, he stands protectively between Alex and the stranger, growling softly to warn them away.]\n\nAlex: Wow, Buddy, you're like my superhero!\n\n[Just as Alex and Buddy are about to head inside, they hear a loud crash from a nearby construction site. They rush over to investigate and find a pile of rubble blocking the path of a kitten trapped underneath.]\n\nAlex: Oh no, Buddy, we have to help!\n\n[Buddy barks in agreement and together they work to carefully move the rubble aside, allowing the kitten to escape unharmed. The kitten gratefully nuzzles against Buddy, who responds with a friendly lick.]\n\nAlex: We did it, Buddy! We saved the day again!\n\n[As Alex and Buddy walk home together, the sun begins to set, casting a warm glow over the neighborhood.]\n\nAlex: Thanks for always being there to watch over me, Buddy. 
You're not just my dog, you're my best friend.\n\n[Buddy barks happily and nuzzles against Alex as they disappear into the sunset, ready to face whatever adventures tomorrow may bring.]\n\n[End scene.]" - - text = "大家好,我是乔哥,一个想帮你把信用卡全部还清的家伙!\n今天我们要聊的是信用卡的取现功能。\n你是不是也曾经因为一时的资金紧张,而拿着信用卡到ATM机取现?如果是,那你得好好看看这个视频了。\n现在都2024年了,我以为现在不会再有人用信用卡取现功能了。前几天一个粉丝发来一张图片,取现1万。\n信用卡取现有三个弊端。\n一,信用卡取现功能代价可不小。会先收取一个取现手续费,比如这个粉丝,取现1万,按2.5%收取手续费,收取了250元。\n二,信用卡正常消费有最长56天的免息期,但取现不享受免息期。从取现那一天开始,每天按照万5收取利息,这个粉丝用了11天,收取了55元利息。\n三,频繁的取现行为,银行会认为你资金紧张,会被标记为高风险用户,影响你的综合评分和额度。\n那么,如果你资金紧张了,该怎么办呢?\n乔哥给你支一招,用破思机摩擦信用卡,只需要少量的手续费,而且还可以享受最长56天的免息期。\n最后,如果你对玩卡感兴趣,可以找乔哥领取一本《卡神秘籍》,用卡过程中遇到任何疑惑,也欢迎找乔哥交流。\n别忘了,关注乔哥,回复用卡技巧,免费领取《2024用卡技巧》,让我们一起成为用卡高手!" - - text = """ - 2023全年业绩速览 -公司全年累计实现营业收入1476.94亿元,同比增长19.01%,归母净利润747.34亿元,同比增长19.16%。EPS达到59.49元。第四季度单季,营业收入444.25亿元,同比增长20.26%,环比增长31.86%;归母净利润218.58亿元,同比增长19.33%,环比增长29.37%。这一阶段 -的业绩表现不仅突显了公司的增长动力和盈利能力,也反映出公司在竞争激烈的市场环境中保持了良好的发展势头。 -2023年Q4业绩速览 -第四季度,营业收入贡献主要增长点;销售费用高增致盈利能力承压;税金同比上升27%,扰动净利率表现。 -业绩解读 -利润方面,2023全年贵州茅台,>归母净利润增速为19%,其中营业收入正贡献18%,营业成本正贡献百分之一,管理费用正贡献百分之一点四。(注:归母净利润增速值=营业收入增速+各科目贡献,展示贡献/拖累的前四名科目,且要求贡献值/净利润增速>15%) -""" - text = "静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人" - - text = _format_text(text) - lines = utils.split_string_by_punctuations(text) - print(lines) - - for voice_name in voice_names: - voice_file = f"{temp_dir}/tts-{voice_name}.mp3" - subtitle_file = f"{temp_dir}/tts.mp3.srt" - sub_maker = azure_tts_v2( - text=text, voice_name=voice_name, voice_file=voice_file - ) - create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file) - audio_duration = get_audio_duration(sub_maker) - print(f"voice: {voice_name}, audio duration: {audio_duration}s") - - loop = asyncio.get_event_loop_policy().get_event_loop() - try: - loop.run_until_complete(_do()) - finally: - loop.close() +# async def _do(): +# temp_dir = utils.storage_dir("temp") +# +# voice_names = [ +# "zh-CN-XiaoxiaoMultilingualNeural", +# # 女性 +# 
"zh-CN-XiaoxiaoNeural", +# "zh-CN-XiaoyiNeural", +# # 男性 +# "zh-CN-YunyangNeural", +# "zh-CN-YunxiNeural", +# ] +# text = """ +# 静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人,表达了他对家乡和亲人的深深思念之情。全诗内容是:“床前明月光,疑是地上霜。举头望明月,低头思故乡。”在这短短的四句诗中,诗人通过“明月”和“思故乡”的意象,巧妙地表达了离乡背井人的孤独与哀愁。首句“床前明月光”设景立意,通过明亮的月光引出诗人的遐想;“疑是地上霜”增添了夜晚的寒冷感,加深了诗人的孤寂之情;“举头望明月”和“低头思故乡”则是情感的升华,展现了诗人内心深处的乡愁和对家的渴望。这首诗简洁明快,情感真挚,是中国古典诗歌中非常著名的一首,也深受后人喜爱和推崇。 +# """ +# +# text = """ +# What is the meaning of life? This question has puzzled philosophers, scientists, and thinkers of all kinds for centuries. Throughout history, various cultures and individuals have come up with their interpretations and beliefs around the purpose of life. Some say it's to seek happiness and self-fulfillment, while others believe it's about contributing to the welfare of others and making a positive impact in the world. Despite the myriad of perspectives, one thing remains clear: the meaning of life is a deeply personal concept that varies from one person to another. It's an existential inquiry that encourages us to reflect on our values, desires, and the essence of our existence. +# """ +# +# text = """ +# 预计未来3天深圳冷空气活动频繁,未来两天持续阴天有小雨,出门带好雨具; +# 10-11日持续阴天有小雨,日温差小,气温在13-17℃之间,体感阴凉; +# 12日天气短暂好转,早晚清凉; +# """ +# +# text = "[Opening scene: A sunny day in a suburban neighborhood. A young boy named Alex, around 8 years old, is playing in his front yard with his loyal dog, Buddy.]\n\n[Camera zooms in on Alex as he throws a ball for Buddy to fetch. Buddy excitedly runs after it and brings it back to Alex.]\n\nAlex: Good boy, Buddy! You're the best dog ever!\n\n[Buddy barks happily and wags his tail.]\n\n[As Alex and Buddy continue playing, a series of potential dangers loom nearby, such as a stray dog approaching, a ball rolling towards the street, and a suspicious-looking stranger walking by.]\n\nAlex: Uh oh, Buddy, look out!\n\n[Buddy senses the danger and immediately springs into action. 
He barks loudly at the stray dog, scaring it away. Then, he rushes to retrieve the ball before it reaches the street and gently nudges it back towards Alex. Finally, he stands protectively between Alex and the stranger, growling softly to warn them away.]\n\nAlex: Wow, Buddy, you're like my superhero!\n\n[Just as Alex and Buddy are about to head inside, they hear a loud crash from a nearby construction site. They rush over to investigate and find a pile of rubble blocking the path of a kitten trapped underneath.]\n\nAlex: Oh no, Buddy, we have to help!\n\n[Buddy barks in agreement and together they work to carefully move the rubble aside, allowing the kitten to escape unharmed. The kitten gratefully nuzzles against Buddy, who responds with a friendly lick.]\n\nAlex: We did it, Buddy! We saved the day again!\n\n[As Alex and Buddy walk home together, the sun begins to set, casting a warm glow over the neighborhood.]\n\nAlex: Thanks for always being there to watch over me, Buddy. You're not just my dog, you're my best friend.\n\n[Buddy barks happily and nuzzles against Alex as they disappear into the sunset, ready to face whatever adventures tomorrow may bring.]\n\n[End scene.]" +# +# text = "大家好,我是乔哥,一个想帮你把信用卡全部还清的家伙!\n今天我们要聊的是信用卡的取现功能。\n你是不是也曾经因为一时的资金紧张,而拿着信用卡到ATM机取现?如果是,那你得好好看看这个视频了。\n现在都2024年了,我以为现在不会再有人用信用卡取现功能了。前几天一个粉丝发来一张图片,取现1万。\n信用卡取现有三个弊端。\n一,信用卡取现功能代价可不小。会先收取一个取现手续费,比如这个粉丝,取现1万,按2.5%收取手续费,收取了250元。\n二,信用卡正常消费有最长56天的免息期,但取现不享受免息期。从取现那一天开始,每天按照万5收取利息,这个粉丝用了11天,收取了55元利息。\n三,频繁的取现行为,银行会认为你资金紧张,会被标记为高风险用户,影响你的综合评分和额度。\n那么,如果你资金紧张了,该怎么办呢?\n乔哥给你支一招,用破思机摩擦信用卡,只需要少量的手续费,而且还可以享受最长56天的免息期。\n最后,如果你对玩卡感兴趣,可以找乔哥领取一本《卡神秘籍》,用卡过程中遇到任何疑惑,也欢迎找乔哥交流。\n别忘了,关注乔哥,回复用卡技巧,免费领取《2024用卡技巧》,让我们一起成为用卡高手!" 
+# +# text = """ +# 2023全年业绩速览 +# 公司全年累计实现营业收入1476.94亿元,同比增长19.01%,归母净利润747.34亿元,同比增长19.16%。EPS达到59.49元。第四季度单季,营业收入444.25亿元,同比增长20.26%,环比增长31.86%;归母净利润218.58亿元,同比增长19.33%,环比增长29.37%。这一阶段 +# 的业绩表现不仅突显了公司的增长动力和盈利能力,也反映出公司在竞争激烈的市场环境中保持了良好的发展势头。 +# 2023年Q4业绩速览 +# 第四季度,营业收入贡献主要增长点;销售费用高增致盈利能力承压;税金同比上升27%,扰动净利率表现。 +# 业绩解读 +# 利润方面,2023全年贵州茅台,>归母净利润增速为19%,其中营业收入正贡献18%,营业成本正贡献百分之一,管理费用正贡献百分之一点四。(注:归母净利润增速值=营业收入增速+各科目贡献,展示贡献/拖累的前四名科目,且要求贡献值/净利润增速>15%) +# """ +# text = "静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人" +# +# text = _format_text(text) +# lines = utils.split_string_by_punctuations(text) +# print(lines) +# +# for voice_name in voice_names: +# voice_file = f"{temp_dir}/tts-{voice_name}.mp3" +# subtitle_file = f"{temp_dir}/tts.mp3.srt" +# sub_maker = azure_tts_v2( +# text=text, voice_name=voice_name, voice_file=voice_file +# ) +# create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file) +# audio_duration = get_audio_duration(sub_maker) +# print(f"voice: {voice_name}, audio duration: {audio_duration}s") +# +# loop = asyncio.get_event_loop_policy().get_event_loop() +# try: +# loop.run_until_complete(_do()) +# finally: +# loop.close() diff --git a/app/utils/utils.py b/app/utils/utils.py index 229d667..4ad0cd0 100644 --- a/app/utils/utils.py +++ b/app/utils/utils.py @@ -1,6 +1,6 @@ import locale import os -import platform +import requests import threading from typing import Any from loguru import logger @@ -269,3 +269,27 @@ def reduce_video_time(txt: str, duration: float = 0.21531): # 返回结果四舍五入为整数 duration = len(txt) * duration return int(duration) + + +def get_current_country(): + """ + 判断当前网络IP地址所在的国家 + """ + try: + # 使用ipapi.co的免费API获取IP地址信息 + response = requests.get('https://ipapi.co/json/') + data = response.json() + + # 获取国家名称 + country = data.get('country_name') + + if country: + logger.debug(f"当前网络IP地址位于:{country}") + return country + else: + logger.debug("无法确定当前网络IP地址所在的国家") + return None + + except requests.RequestException: + 
logger.error("获取IP地址信息时发生错误,请检查网络连接") + return None diff --git a/webui/Main.py b/webui/Main.py index bf85903..10efc55 100644 --- a/webui/Main.py +++ b/webui/Main.py @@ -354,6 +354,7 @@ with left_panel: if st.button(tr("Video Script Generate"), key="auto_generate_script"): with st.spinner(tr("Video Script Generate")): if video_json_file == "" and params.video_origin_path != "": + # 使用大模型生成视频脚本 script = llm.gemini_video2json( video_origin_name=params.video_origin_path.split("\\")[-1], video_origin_path=params.video_origin_path, @@ -732,6 +733,7 @@ with st.expander(tr("Video Check"), expanded=False): text1 = st.text_area(tr("timestamp"), value=initial_timestamp, height=20) with text_panels[1]: text2 = st.text_area(tr("Picture description"), value=initial_picture, height=20) + logger.debug(initial_narration) text3 = st.text_area(tr("Narration"), value=initial_narration, height=100) # 重新生成按钮 From d1da23e37f678607888ff4992aaf7a0362efba0f Mon Sep 17 00:00:00 2001 From: linyq Date: Wed, 18 Sep 2024 18:29:01 +0800 Subject: [PATCH 02/21] =?UTF-8?q?=E4=BC=98=E5=8C=96=E5=88=86=E6=AE=B5?= =?UTF-8?q?=E5=AD=97=E5=B9=95=E7=94=9F=E6=88=90=E9=80=BB=E8=BE=91=EF=BC=9B?= =?UTF-8?q?=20=E4=B8=8B=E4=B8=80=E6=AD=A5=E4=BC=98=E5=8C=96=E5=88=86?= =?UTF-8?q?=E6=AE=B5=E8=A7=86=E9=A2=91=E5=90=88=E5=B9=B6=EF=BC=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/services/task.py | 78 +++++------ app/services/voice.py | 297 ++++++++++++++++++++++-------------------- app/utils/utils.py | 99 ++++++++++++++ 3 files changed, 295 insertions(+), 179 deletions(-) diff --git a/app/services/task.py b/app/services/task.py index 41070e7..b1895d3 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -3,8 +3,6 @@ import json import os.path import re from os import path - -from edge_tts import SubMaker from loguru import logger from app.config import config @@ -333,45 +331,44 @@ def start_subclip(task_id, params: VideoClipParams, subclip_path_videos): 
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5) voice_name = voice.parse_voice_name(params.voice_name) - # voice_name = 'zh-CN-XiaoyiNeural' paragraph_number = params.paragraph_number n_threads = params.n_threads max_clip_duration = params.video_clip_duration - logger.info("\n\n## 1. 读取json") + logger.info("\n\n## 1. 读取视频json脚本") video_script_path = path.join(params.video_clip_json) # 判断json文件是否存在 if path.exists(video_script_path): - # 读取json文件内容,并转为dict - with open(video_script_path, "r", encoding="utf-8") as f: - list_script = json.load(f) - video_list = [i['narration'] for i in list_script] - video_ost = [i['OST'] for i in list_script] - time_list = [i['timestamp'] for i in list_script] - - video_script = " ".join(video_list) - logger.debug(f"原json脚本: \n{video_script}") - logger.debug(f"原json时间戳: \n{time_list}") + try: + with open(video_script_path, "r", encoding="utf-8") as f: + list_script = json.load(f) + video_list = [i['narration'] for i in list_script] + video_ost = [i['OST'] for i in list_script] + time_list = [i['timestamp'] for i in list_script] + video_script = " ".join(video_list) + logger.debug(f"解说完整脚本: \n{video_script}") + logger.debug(f"解说 OST 列表: \n{video_ost}") + logger.debug(f"解说时间戳列表: \n{time_list}") + except Exception as e: + logger.error(f"无法读取视频json脚本,请检查配置是否正确。{e}") + raise ValueError("无法读取视频json脚本,请检查配置是否正确") else: - raise ValueError("解说文案不存在!检查文案名称是否正确。") + raise ValueError("解说脚本不存在!请检查配置是否正确。") - # video_script = llm.text_polishing(context=video_script, language=params.video_language) - # logger.debug(f"润色后的视频脚本: \n{video_script}") - # sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=10) - - logger.info("\n\n## 2. 生成音频") - audio_file = path.join(utils.task_dir(task_id), f"audio.mp3") - sub_maker = voice.tts(text=video_script, voice_name=voice_name, voice_file=audio_file, voice_rate=params.voice_rate) - if sub_maker is None: + logger.info("\n\n## 2. 
生成音频列表") + audio_files, sub_maker_list = voice.tts_multiple( + task_id=task_id, + list_script=list_script, + voice_name=voice_name, + voice_rate=params.voice_rate, + force_regenerate=True + ) + if audio_files is None: sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) logger.error( - "无法生成音频,可能是网络不可用。如果您在中国,请使用VPN。或者手动选择 zh-CN-Yunjian-男性 音频") + "音频文件为空,可能是网络不可用。如果您在中国,请使用VPN。或者手动选择 zh-CN-Yunjian-男性 音频") return - - audio_duration = voice.get_audio_duration(sub_maker) - audio_duration = math.ceil(audio_duration) - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30) subtitle_path = "" @@ -379,17 +376,22 @@ def start_subclip(task_id, params: VideoClipParams, subclip_path_videos): subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt") subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}") - subtitle_fallback = False + # subtitle_fallback = False if subtitle_provider == "edge": - voice.create_subtitle(text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path) - if not os.path.exists(subtitle_path): - subtitle_fallback = True - logger.warning("找不到字幕文件,回退到whisper") - - if subtitle_provider == "whisper" or subtitle_fallback: - subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) - logger.info("\n\n## 更正字幕") - subtitle.correct(subtitle_file=subtitle_path, video_script=video_script) + voice.create_subtitle_from_multiple( + text=video_script, + sub_maker_list=sub_maker_list, + list_script=list_script, + subtitle_file=subtitle_path + ) + # if not os.path.exists(subtitle_path): + # subtitle_fallback = True + # logger.warning("找不到字幕文件,回退到whisper") + # + # if subtitle_provider == "whisper" or subtitle_fallback: + # subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) + # logger.info("\n\n## 更正字幕") + # subtitle.correct(subtitle_file=subtitle_path, video_script=video_script) subtitle_lines = 
subtitle.file_to_subtitles(subtitle_path) if not subtitle_lines: diff --git a/app/services/voice.py b/app/services/voice.py index ec07bad..e54eda9 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1,12 +1,15 @@ -import asyncio import os import re +import json +import traceback + +import edge_tts +import asyncio +from loguru import logger +from typing import List from datetime import datetime from xml.sax.saxutils import unescape -from edge_tts.submaker import mktimestamp -from loguru import logger from edge_tts import submaker, SubMaker -import edge_tts from moviepy.video.tools import subtitles from app.config import config @@ -1184,94 +1187,107 @@ def _format_text(text: str) -> str: return text -def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str): +def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], list_script: List[dict], + subtitle_file: str): """ - 优化字幕文件 - 1. 将字幕文件按照标点符号分割成多行 - 2. 逐行匹配字幕文件中的文本 - 3. 生成新的字幕文件 + 根据多个 SubMaker 对象、完整文本和原始脚本创建优化的字幕文件 + 1. 使用原始脚本中的时间戳 + 2. 跳过 OST 为 true 的部分 + 3. 将字幕文件按照标点符号分割成多行 + 4. 根据完整文本分段,保持原文的语句结构 + 5. 
生成新的字幕文件,时间戳包含小时单位 """ - text = _format_text(text) + sentences = utils.split_string_by_punctuations(text) - def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str: - """ - 1 - 00:00:00,000 --> 00:00:02,360 - 跑步是一项简单易行的运动 - """ - start_t = mktimestamp(start_time).replace(".", ",") - end_t = mktimestamp(end_time).replace(".", ",") - return f"{idx}\n" f"{start_t} --> {end_t}\n" f"{sub_text}\n" + def formatter(idx: int, start_time: str, end_time: str, sub_text: str) -> str: + return f"{idx}\n{start_time.replace('.', ',')} --> {end_time.replace('.', ',')}\n{sub_text}\n" - start_time = -1.0 sub_items = [] sub_index = 0 - - script_lines = utils.split_string_by_punctuations(text) - - def match_line(_sub_line: str, _sub_index: int): - if len(script_lines) <= _sub_index: - return "" - - _line = script_lines[_sub_index] - if _sub_line == _line: - return script_lines[_sub_index].strip() - - _sub_line_ = re.sub(r"[^\w\s]", "", _sub_line) - _line_ = re.sub(r"[^\w\s]", "", _line) - if _sub_line_ == _line_: - return _line_.strip() - - _sub_line_ = re.sub(r"\W+", "", _sub_line) - _line_ = re.sub(r"\W+", "", _line) - if _sub_line_ == _line_: - return _line.strip() - - return "" - - sub_line = "" + sentence_index = 0 try: - for _, (offset, sub) in enumerate(zip(sub_maker.offset, sub_maker.subs)): - _start_time, end_time = offset - if start_time < 0: - start_time = _start_time + sub_maker_index = 0 + for script_item in list_script: + if script_item['OST']: + continue - sub = unescape(sub) - sub_line += sub - sub_text = match_line(sub_line, sub_index) - if sub_text: + start_time, end_time = script_item['timestamp'].split('-') + if sub_maker_index >= len(sub_maker_list): + logger.error(f"Sub maker list index out of range: {sub_maker_index}") + break + sub_maker = sub_maker_list[sub_maker_index] + sub_maker_index += 1 + + script_duration = utils.time_to_seconds(end_time) - utils.time_to_seconds(start_time) + audio_duration = get_audio_duration(sub_maker) + 
time_ratio = script_duration / audio_duration if audio_duration > 0 else 1 + + current_sub = "" + current_start = None + current_end = None + + for offset, sub in zip(sub_maker.offset, sub_maker.subs): + sub = unescape(sub).strip() + sub_start = utils.seconds_to_time(utils.time_to_seconds(start_time) + offset[0] / 10000000 * time_ratio) + sub_end = utils.seconds_to_time(utils.time_to_seconds(start_time) + offset[1] / 10000000 * time_ratio) + + if current_start is None: + current_start = sub_start + current_end = sub_end + + current_sub += sub + + # 检查当前累积的字幕是否匹配下一个句子 + while sentence_index < len(sentences) and sentences[sentence_index] in current_sub: + sub_index += 1 + line = formatter( + idx=sub_index, + start_time=current_start, + end_time=current_end, + sub_text=sentences[sentence_index].strip(), + ) + sub_items.append(line) + current_sub = current_sub.replace(sentences[sentence_index], "", 1).strip() + current_start = current_end + sentence_index += 1 + + # 如果当前字幕长度超过15个字符,也生成一个新的字幕项 + if len(current_sub) > 15: + sub_index += 1 + line = formatter( + idx=sub_index, + start_time=current_start, + end_time=current_end, + sub_text=current_sub.strip(), + ) + sub_items.append(line) + current_sub = "" + current_start = current_end + + # 处理剩余的文本 + if current_sub.strip(): sub_index += 1 line = formatter( idx=sub_index, - start_time=start_time, - end_time=end_time, - sub_text=sub_text, + start_time=current_start, + end_time=current_end, + sub_text=current_sub.strip(), ) sub_items.append(line) - start_time = -1.0 - sub_line = "" - if len(sub_items) == len(script_lines): - with open(subtitle_file, "w", encoding="utf-8") as file: - file.write("\n".join(sub_items) + "\n") - try: - sbs = subtitles.file_to_subtitles(subtitle_file, encoding="utf-8") - duration = max([tb for ((ta, tb), txt) in sbs]) - logger.info( - f"completed, subtitle file created: {subtitle_file}, duration: {duration}" - ) - except Exception as e: - logger.error(f"failed, error: {str(e)}") - 
os.remove(subtitle_file) - else: - logger.warning( - f"failed, sub_items len: {len(sub_items)}, script_lines len: {len(script_lines)}" - ) + if len(sub_items) == 0: + logger.error("No subtitle items generated") + return + with open(subtitle_file, "w", encoding="utf-8") as file: + file.write("\n".join(sub_items)) + + logger.info(f"completed, subtitle file created: {subtitle_file}") except Exception as e: logger.error(f"failed, error: {str(e)}") + traceback.print_exc() def get_audio_duration(sub_maker: submaker.SubMaker): @@ -1283,73 +1299,72 @@ def get_audio_duration(sub_maker: submaker.SubMaker): return sub_maker.offset[-1][1] / 10000000 -if __name__ == "__main__": - voice_name = "zh-CN-XiaoxiaoMultilingualNeural-V2-Female" +def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, force_regenerate: bool = True): + """ + 根据JSON文件中的多段文本进行TTS转换 + + :param task_id: 任务ID + :param list_script: 脚本列表 + :param voice_name: 语音名称 + :param voice_rate: 语音速率 + :param force_regenerate: 是否强制重新生成已存在的音频文件 + :return: 生成的音频文件列表 + """ voice_name = parse_voice_name(voice_name) - voice_name = is_azure_v2_voice(voice_name) - print(voice_name) - a = tts("预计未来3天深圳冷空气活动频繁, 等待5个字,,,,,,5个字结束", "zh-CN-YunyangNeural", 1.2, "/NarratoAI/test123.mp3") - print(a) - # voices = get_all_azure_voices() - # print(len(voices)) + output_dir = utils.task_dir(task_id) + audio_files = [] + sub_maker_list = [] -# async def _do(): -# temp_dir = utils.storage_dir("temp") -# -# voice_names = [ -# "zh-CN-XiaoxiaoMultilingualNeural", -# # 女性 -# "zh-CN-XiaoxiaoNeural", -# "zh-CN-XiaoyiNeural", -# # 男性 -# "zh-CN-YunyangNeural", -# "zh-CN-YunxiNeural", -# ] -# text = """ -# 静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人,表达了他对家乡和亲人的深深思念之情。全诗内容是:“床前明月光,疑是地上霜。举头望明月,低头思故乡。”在这短短的四句诗中,诗人通过“明月”和“思故乡”的意象,巧妙地表达了离乡背井人的孤独与哀愁。首句“床前明月光”设景立意,通过明亮的月光引出诗人的遐想;“疑是地上霜”增添了夜晚的寒冷感,加深了诗人的孤寂之情;“举头望明月”和“低头思故乡”则是情感的升华,展现了诗人内心深处的乡愁和对家的渴望。这首诗简洁明快,情感真挚,是中国古典诗歌中非常著名的一首,也深受后人喜爱和推崇。 -# """ -# -# text = """ -# 
What is the meaning of life? This question has puzzled philosophers, scientists, and thinkers of all kinds for centuries. Throughout history, various cultures and individuals have come up with their interpretations and beliefs around the purpose of life. Some say it's to seek happiness and self-fulfillment, while others believe it's about contributing to the welfare of others and making a positive impact in the world. Despite the myriad of perspectives, one thing remains clear: the meaning of life is a deeply personal concept that varies from one person to another. It's an existential inquiry that encourages us to reflect on our values, desires, and the essence of our existence. -# """ -# -# text = """ -# 预计未来3天深圳冷空气活动频繁,未来两天持续阴天有小雨,出门带好雨具; -# 10-11日持续阴天有小雨,日温差小,气温在13-17℃之间,体感阴凉; -# 12日天气短暂好转,早晚清凉; -# """ -# -# text = "[Opening scene: A sunny day in a suburban neighborhood. A young boy named Alex, around 8 years old, is playing in his front yard with his loyal dog, Buddy.]\n\n[Camera zooms in on Alex as he throws a ball for Buddy to fetch. Buddy excitedly runs after it and brings it back to Alex.]\n\nAlex: Good boy, Buddy! You're the best dog ever!\n\n[Buddy barks happily and wags his tail.]\n\n[As Alex and Buddy continue playing, a series of potential dangers loom nearby, such as a stray dog approaching, a ball rolling towards the street, and a suspicious-looking stranger walking by.]\n\nAlex: Uh oh, Buddy, look out!\n\n[Buddy senses the danger and immediately springs into action. He barks loudly at the stray dog, scaring it away. Then, he rushes to retrieve the ball before it reaches the street and gently nudges it back towards Alex. Finally, he stands protectively between Alex and the stranger, growling softly to warn them away.]\n\nAlex: Wow, Buddy, you're like my superhero!\n\n[Just as Alex and Buddy are about to head inside, they hear a loud crash from a nearby construction site. 
They rush over to investigate and find a pile of rubble blocking the path of a kitten trapped underneath.]\n\nAlex: Oh no, Buddy, we have to help!\n\n[Buddy barks in agreement and together they work to carefully move the rubble aside, allowing the kitten to escape unharmed. The kitten gratefully nuzzles against Buddy, who responds with a friendly lick.]\n\nAlex: We did it, Buddy! We saved the day again!\n\n[As Alex and Buddy walk home together, the sun begins to set, casting a warm glow over the neighborhood.]\n\nAlex: Thanks for always being there to watch over me, Buddy. You're not just my dog, you're my best friend.\n\n[Buddy barks happily and nuzzles against Alex as they disappear into the sunset, ready to face whatever adventures tomorrow may bring.]\n\n[End scene.]" -# -# text = "大家好,我是乔哥,一个想帮你把信用卡全部还清的家伙!\n今天我们要聊的是信用卡的取现功能。\n你是不是也曾经因为一时的资金紧张,而拿着信用卡到ATM机取现?如果是,那你得好好看看这个视频了。\n现在都2024年了,我以为现在不会再有人用信用卡取现功能了。前几天一个粉丝发来一张图片,取现1万。\n信用卡取现有三个弊端。\n一,信用卡取现功能代价可不小。会先收取一个取现手续费,比如这个粉丝,取现1万,按2.5%收取手续费,收取了250元。\n二,信用卡正常消费有最长56天的免息期,但取现不享受免息期。从取现那一天开始,每天按照万5收取利息,这个粉丝用了11天,收取了55元利息。\n三,频繁的取现行为,银行会认为你资金紧张,会被标记为高风险用户,影响你的综合评分和额度。\n那么,如果你资金紧张了,该怎么办呢?\n乔哥给你支一招,用破思机摩擦信用卡,只需要少量的手续费,而且还可以享受最长56天的免息期。\n最后,如果你对玩卡感兴趣,可以找乔哥领取一本《卡神秘籍》,用卡过程中遇到任何疑惑,也欢迎找乔哥交流。\n别忘了,关注乔哥,回复用卡技巧,免费领取《2024用卡技巧》,让我们一起成为用卡高手!" 
-# -# text = """ -# 2023全年业绩速览 -# 公司全年累计实现营业收入1476.94亿元,同比增长19.01%,归母净利润747.34亿元,同比增长19.16%。EPS达到59.49元。第四季度单季,营业收入444.25亿元,同比增长20.26%,环比增长31.86%;归母净利润218.58亿元,同比增长19.33%,环比增长29.37%。这一阶段 -# 的业绩表现不仅突显了公司的增长动力和盈利能力,也反映出公司在竞争激烈的市场环境中保持了良好的发展势头。 -# 2023年Q4业绩速览 -# 第四季度,营业收入贡献主要增长点;销售费用高增致盈利能力承压;税金同比上升27%,扰动净利率表现。 -# 业绩解读 -# 利润方面,2023全年贵州茅台,>归母净利润增速为19%,其中营业收入正贡献18%,营业成本正贡献百分之一,管理费用正贡献百分之一点四。(注:归母净利润增速值=营业收入增速+各科目贡献,展示贡献/拖累的前四名科目,且要求贡献值/净利润增速>15%) -# """ -# text = "静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人" -# -# text = _format_text(text) -# lines = utils.split_string_by_punctuations(text) -# print(lines) -# -# for voice_name in voice_names: -# voice_file = f"{temp_dir}/tts-{voice_name}.mp3" -# subtitle_file = f"{temp_dir}/tts.mp3.srt" -# sub_maker = azure_tts_v2( -# text=text, voice_name=voice_name, voice_file=voice_file -# ) -# create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file) -# audio_duration = get_audio_duration(sub_maker) -# print(f"voice: {voice_name}, audio duration: {audio_duration}s") -# -# loop = asyncio.get_event_loop_policy().get_event_loop() -# try: -# loop.run_until_complete(_do()) -# finally: -# loop.close() + for item in list_script: + if not item['OST']: + timestamp = item['timestamp'].replace(':', '-') + audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3") + + # 检查文件是否已存在,如存在且不强制重新生成,则跳过 + if os.path.exists(audio_file) and not force_regenerate: + logger.info(f"音频文件已存在,跳过生成: {audio_file}") + audio_files.append(audio_file) + continue + + text = item['narration'] + + sub_maker = tts( + text=text, + voice_name=voice_name, + voice_rate=voice_rate, + voice_file=audio_file + ) + + if sub_maker is None: + logger.error(f"无法为时间戳 {timestamp} 生成音频; " + f"如果您在中国,请使用VPN。或者手动选择 zh-CN-YunyangNeural 等角色;" + f"或者使用其他 tts 引擎") + continue + + audio_files.append(audio_file) + sub_maker_list.append(sub_maker) + logger.info(f"已生成音频文件: {audio_file}") + + return audio_files, sub_maker_list + + +if __name__ == "__main__": + 
voice_name = "zh-CN-YunyangNeural" + # voice_name = "af-ZA-AdriNeural" + voice_name = parse_voice_name(voice_name) + print(voice_name) + + with open("../../resource/scripts/2024-0913-040147.json", 'r', encoding='utf-8') as f: + data = json.load(f) + + audio_files, sub_maker_list = tts_multiple(task_id="12312312", list_script=data, voice_name=voice_name, voice_rate=1) + + full_text = " ".join([item['narration'] for item in data if not item['OST']]) + subtitle_file = os.path.join(utils.task_dir("12312312"), "subtitle_multiple.srt") + create_subtitle_from_multiple(full_text, sub_maker_list, data, subtitle_file) + print(f"生成的音频文件列表: {audio_files}") + print(f"生成的字幕文件: {subtitle_file}") + + # text = " ".join([item['narration'] for item in data]) + # sub_marks = tts(text=text, voice_name=voice_name, voice_rate=1, voice_file="../../storage/tasks/12312312/aaa.mp3") + # create_subtitle(text=text, sub_maker=sub_marks, subtitle_file="../../storage/tasks/12312312/subtitle_123.srt") diff --git a/app/utils/utils.py b/app/utils/utils.py index 4ad0cd0..728aed2 100644 --- a/app/utils/utils.py +++ b/app/utils/utils.py @@ -293,3 +293,102 @@ def get_current_country(): except requests.RequestException: logger.error("获取IP地址信息时发生错误,请检查网络连接") return None + + +def time_to_seconds(time_str: str) -> float: + parts = time_str.split(':') + if len(parts) == 2: + m, s = map(float, parts) + return m * 60 + s + elif len(parts) == 3: + h, m, s = map(float, parts) + return h * 3600 + m * 60 + s + else: + raise ValueError(f"Invalid time format: {time_str}") + + +def seconds_to_time(seconds: float) -> str: + h, remainder = divmod(seconds, 3600) + m, s = divmod(remainder, 60) + return f"{int(h):02d}:{int(m):02d}:{s:06.3f}" + + +def load_locales(i18n_dir): + _locales = {} + for root, dirs, files in os.walk(i18n_dir): + for file in files: + if file.endswith(".json"): + lang = file.split(".")[0] + with open(os.path.join(root, file), "r", encoding="utf-8") as f: + _locales[lang] = json.loads(f.read()) + 
return _locales + + +def parse_extension(filename): + return os.path.splitext(filename)[1].strip().lower().replace(".", "") + + +def script_dir(sub_dir: str = ""): + d = resource_dir(f"scripts") + if sub_dir: + d = os.path.join(d, sub_dir) + if not os.path.exists(d): + os.makedirs(d) + return d + + +def video_dir(sub_dir: str = ""): + d = resource_dir(f"videos") + if sub_dir: + d = os.path.join(d, sub_dir) + if not os.path.exists(d): + os.makedirs(d) + return d + + +def split_timestamp(timestamp): + """ + 拆分时间戳 + """ + start, end = timestamp.split('-') + start_hour, start_minute = map(int, start.split(':')) + end_hour, end_minute = map(int, end.split(':')) + + start_time = '00:{:02d}:{:02d}'.format(start_hour, start_minute) + end_time = '00:{:02d}:{:02d}'.format(end_hour, end_minute) + + return start_time, end_time + + +def reduce_video_time(txt: str, duration: float = 0.21531): + """ + 按照字数缩减视频时长,一个字耗时约 0.21531 s, + Returns: + """ + # 返回结果四舍五入为整数 + duration = len(txt) * duration + return int(duration) + + +def get_current_country(): + """ + 判断当前网络IP地址所在的国家 + """ + try: + # 使用ipapi.co的免费API获取IP地址信息 + response = requests.get('https://ipapi.co/json/') + data = response.json() + + # 获取国家名称 + country = data.get('country_name') + + if country: + logger.debug(f"当前网络IP地址位于:{country}") + return country + else: + logger.debug("无法确定当前网络IP地址所在的国家") + return None + + except requests.RequestException: + logger.error("获取IP地址信息时发生错误,请检查网络连接") + return None From 2bc94651a26094eb624c8f272cc126d86a0350d5 Mon Sep 17 00:00:00 2001 From: linyq Date: Thu, 19 Sep 2024 18:23:54 +0800 Subject: [PATCH 03/21] =?UTF-8?q?=E6=9C=AA=E5=AE=8C=E6=88=90=20generate=5F?= =?UTF-8?q?video=5Fv2=20=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/services/task.py | 2 +- app/services/video.py | 350 ++++++++++++++++++++++++++++-------------- app/services/voice.py | 2 +- app/utils/utils.py | 92 ++--------- 4 files changed, 253 
insertions(+), 193 deletions(-) diff --git a/app/services/task.py b/app/services/task.py index b1895d3..e58f4b4 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -428,7 +428,7 @@ def start_subclip(task_id, params: VideoClipParams, subclip_path_videos): combined_video_path=combined_video_path, video_paths=subclip_videos, video_ost_list=video_ost, - audio_file=audio_file, + list_script=list_script, video_aspect=params.video_aspect, threads=n_threads ) diff --git a/app/services/video.py b/app/services/video.py index d5cc18f..9924923 100644 --- a/app/services/video.py +++ b/app/services/video.py @@ -1,3 +1,4 @@ +import re import glob import random from typing import List @@ -216,9 +217,7 @@ def generate_video( logger.info(f" ③ subtitle: {subtitle_path}") logger.info(f" ④ output: {output_file}") - # https://github.com/harry0703/NarratoAI/issues/217 - # PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'final-1.mp4.tempTEMP_MPY_wvf_snd.mp3' - # write into the same directory as the output file + # 写入与输出文件相同的目录 output_dir = os.path.dirname(output_file) font_path = "" @@ -303,6 +302,133 @@ def generate_video( logger.success("completed") +def generate_video_v2( + video_path: str, + audio_paths: List[str], + subtitle_path: str, + output_file: str, + params: Union[VideoParams, VideoClipParams], +): + aspect = VideoAspect(params.video_aspect) + video_width, video_height = aspect.to_resolution() + + logger.info(f"开始,视频尺寸: {video_width} x {video_height}") + logger.info(f" ① 视频: {video_path}") + logger.info(f" ② 音频文件数量: {len(audio_paths)}") + logger.info(f" ③ 字幕: {subtitle_path}") + logger.info(f" ④ 输出: {output_file}") + + # 写入与输出文件相同的目录 + output_dir = os.path.dirname(output_file) + + # 字体设置部分保持不变 + font_path = "" + if params.subtitle_enabled: + if not params.font_name: + params.font_name = "STHeitiMedium.ttc" + font_path = os.path.join(utils.font_dir(), params.font_name) + if os.name == "nt": + font_path = 
font_path.replace("\\", "/") + logger.info(f"使用字体: {font_path}") + + # create_text_clip 函数保持不变 + def create_text_clip(subtitle_item): + phrase = subtitle_item[1] + max_width = video_width * 0.9 + wrapped_txt, txt_height = wrap_text( + phrase, max_width=max_width, font=font_path, fontsize=params.font_size + ) + _clip = TextClip( + wrapped_txt, + font=font_path, + fontsize=params.font_size, + color=params.text_fore_color, + bg_color=params.text_background_color, + stroke_color=params.stroke_color, + stroke_width=params.stroke_width, + print_cmd=False, + ) + duration = subtitle_item[0][1] - subtitle_item[0][0] + _clip = _clip.set_start(subtitle_item[0][0]) + _clip = _clip.set_end(subtitle_item[0][1]) + _clip = _clip.set_duration(duration) + if params.subtitle_position == "bottom": + _clip = _clip.set_position(("center", video_height * 0.95 - _clip.h)) + elif params.subtitle_position == "top": + _clip = _clip.set_position(("center", video_height * 0.05)) + elif params.subtitle_position == "custom": + # 确保字幕完全在屏幕内 + margin = 10 # 额外的边距,单位为像素 + max_y = video_height - _clip.h - margin + min_y = margin + custom_y = (video_height - _clip.h) * (params.custom_position / 100) + custom_y = max(min_y, min(custom_y, max_y)) # 限制 y 值在有效范围内 + _clip = _clip.set_position(("center", custom_y)) + else: # center + _clip = _clip.set_position(("center", "center")) + return _clip + + video_clip = VideoFileClip(video_path) + + # 处理多个音频文件 + audio_clips = [] + for audio_path in audio_paths: + # 从文件名中提取时间信息 + match = re.search(r'audio_(\d{2}-\d{2}-\d{2}-\d{2})\.mp3', os.path.basename(audio_path)) + if match: + time_str = match.group(1) + start, end = time_str.split('-')[:2], time_str.split('-')[2:] + start_time = sum(int(x) * 60 ** i for i, x in enumerate(reversed(start))) + end_time = sum(int(x) * 60 ** i for i, x in enumerate(reversed(end))) + + audio_clip = AudioFileClip(audio_path).volumex(params.voice_volume) + audio_clip = audio_clip.set_start(start_time).set_end(end_time) + 
audio_clips.append(audio_clip) + else: + logger.warning(f"无法从文件名解析时间信息: {audio_path}") + + # 合并所有音频剪辑 + if audio_clips: + audio_clip = CompositeAudioClip(audio_clips) + else: + logger.warning("没有有效的音频文件") + audio_clip = AudioClip(lambda t: 0, duration=video_clip.duration) + + # 字幕处理部分保持不变 + if subtitle_path and os.path.exists(subtitle_path): + sub = SubtitlesClip(subtitles=subtitle_path, encoding="utf-8") + text_clips = [] + for item in sub.subtitles: + clip = create_text_clip(subtitle_item=item) + text_clips.append(clip) + video_clip = CompositeVideoClip([video_clip, *text_clips]) + + # 背景音乐处理部分保持不变 + bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file) + if bgm_file: + try: + bgm_clip = ( + AudioFileClip(bgm_file).volumex(params.bgm_volume).audio_fadeout(3) + ) + bgm_clip = afx.audio_loop(bgm_clip, duration=video_clip.duration) + audio_clip = CompositeAudioClip([audio_clip, bgm_clip]) + except Exception as e: + logger.error(f"添加背景音乐失败: {str(e)}") + + video_clip = video_clip.set_audio(audio_clip) + video_clip.write_videofile( + output_file, + audio_codec="aac", + temp_audiofile_path=output_dir, + threads=params.n_threads or 2, + logger=None, + fps=30, + ) + video_clip.close() + del video_clip + logger.success("完成") + + def preprocess_video(materials: List[MaterialInfo], clip_duration=4): for material in materials: if not material.url: @@ -352,8 +478,8 @@ def preprocess_video(materials: List[MaterialInfo], clip_duration=4): def combine_clip_videos(combined_video_path: str, video_paths: List[str], - video_ost_list: List[str], - audio_file: str, + video_ost_list: List[bool], + list_script: list, video_aspect: VideoAspect = VideoAspect.portrait, threads: int = 2, ) -> str: @@ -369,8 +495,8 @@ def combine_clip_videos(combined_video_path: str, Returns: """ - audio_clip = AudioFileClip(audio_file) - audio_duration = audio_clip.duration + from app.utils.utils import calculate_total_duration + audio_duration = calculate_total_duration(list_script) 
logger.info(f"音频的最大持续时间: {audio_duration} s") # 每个剪辑所需的持续时间 req_dur = audio_duration / len(video_paths) @@ -384,62 +510,52 @@ def combine_clip_videos(combined_video_path: str, clips = [] video_duration = 0 # 一遍又一遍地添加下载的剪辑,直到达到音频的持续时间 (max_duration) - while video_duration < audio_duration: - for video_path, video_ost in zip(video_paths, video_ost_list): - clip = VideoFileClip(video_path) - if video_ost: - clip = clip.set_audio(audio_clip) + # while video_duration < audio_duration: + for video_path, video_ost in zip(video_paths, video_ost_list): + clip = VideoFileClip(video_path) + # 通过 ost 字段判断是否播放原声 + if not video_ost: + clip = clip.without_audio() + # # 检查剪辑是否比剩余音频长 + # if (audio_duration - video_duration) < clip.duration: + # clip = clip.subclip(0, (audio_duration - video_duration)) + # # 仅当计算出的剪辑长度 (req_dur) 短于实际剪辑时,才缩短剪辑以防止静止图像 + # elif req_dur < clip.duration: + # clip = clip.subclip(0, req_dur) + clip = clip.set_fps(30) + + # 并非所有视频的大小都相同,因此我们需要调整它们的大小 + clip_w, clip_h = clip.size + if clip_w != video_width or clip_h != video_height: + clip_ratio = clip.w / clip.h + video_ratio = video_width / video_height + + if clip_ratio == video_ratio: + # 等比例缩放 + clip = clip.resize((video_width, video_height)) else: - clip = clip.set_audio(audio_clip).without_audio() - # 检查剪辑是否比剩余音频长 - if (audio_duration - video_duration) < clip.duration: - clip = clip.subclip(0, (audio_duration - video_duration)) - # 仅当计算出的剪辑长度 (req_dur) 短于实际剪辑时,才缩短剪辑以防止静止图像 - elif req_dur < clip.duration: - clip = clip.subclip(0, req_dur) - clip = clip.set_fps(30) - - # 并非所有视频的大小都相同,因此我们需要调整它们的大小 - clip_w, clip_h = clip.size - if clip_w != video_width or clip_h != video_height: - clip_ratio = clip.w / clip.h - video_ratio = video_width / video_height - - if clip_ratio == video_ratio: - # 等比例缩放 - clip = clip.resize((video_width, video_height)) + # 等比缩放视频 + if clip_ratio > video_ratio: + # 按照目标宽度等比缩放 + scale_factor = video_width / clip_w else: - # 等比缩放视频 - if clip_ratio > video_ratio: - # 按照目标宽度等比缩放 - 
scale_factor = video_width / clip_w - else: - # 按照目标高度等比缩放 - scale_factor = video_height / clip_h + # 按照目标高度等比缩放 + scale_factor = video_height / clip_h - new_width = int(clip_w * scale_factor) - new_height = int(clip_h * scale_factor) - clip_resized = clip.resize(newsize=(new_width, new_height)) + new_width = int(clip_w * scale_factor) + new_height = int(clip_h * scale_factor) + clip_resized = clip.resize(newsize=(new_width, new_height)) - background = ColorClip(size=(video_width, video_height), color=(0, 0, 0)) - clip = CompositeVideoClip([ - background.set_duration(clip.duration), - clip_resized.set_position("center") - ]) + background = ColorClip(size=(video_width, video_height), color=(0, 0, 0)) + clip = CompositeVideoClip([ + background.set_duration(clip.duration), + clip_resized.set_position("center") + ]) - logger.info(f"将视频 {video_path} 大小调整为 {video_width} x {video_height}, 剪辑尺寸: {clip_w} x {clip_h}") + logger.info(f"将视频 {video_path} 大小调整为 {video_width} x {video_height}, 剪辑尺寸: {clip_w} x {clip_h}") - # TODO: 片段时长过长时,需要缩短,但暂时没有好的解决方案 - # if clip.duration > 5: - # ctime = utils.reduce_video_time(txt=video_script) - # if clip.duration > (2 * ctime): - # clip = clip.subclip(ctime, 2*ctime) - # else: - # clip = clip.subclip(0, ctime) - # logger.info(f"视频 {video_path} 片段时长较长,将剪辑时长缩短至 {ctime} 秒") - - clips.append(clip) - video_duration += clip.duration + clips.append(clip) + video_duration += clip.duration video_clip = concatenate_videoclips(clips) video_clip = video_clip.set_fps(30) @@ -457,68 +573,78 @@ def combine_clip_videos(combined_video_path: str, if __name__ == "__main__": - from app.utils import utils + combined_video_path = "../../storage/tasks/12312312/com123.mp4" - suffix = "*.mp4" - song_dir = utils.video_dir() - files = glob.glob(os.path.join(song_dir, suffix)) + video_paths = ['../../storage/cache_videos/vid-00_00-00_03.mp4', + '../../storage/cache_videos/vid-00_03-00_07.mp4', + '../../storage/cache_videos/vid-00_12-00_17.mp4', + 
'../../storage/cache_videos/vid-00_26-00_31.mp4'] + video_ost_list = [False, True, False, True] + list_script = [ + { + "picture": "夜晚,一个小孩在树林里奔跑,后面有人拿着火把在追赶", + "timestamp": "00:00-00:03", + "narration": "夜黑风高的树林,一个小孩在拼命奔跑,后面的人穷追不舍!", + "OST": False + }, + { + "picture": "追赶的人命令抓住小孩", + "timestamp": "00:03-00:07", + "narration": "原声播放1", + "OST": True + }, + { + "picture": "小孩躲在草丛里,黑衣人用脚踢了踢他", + "timestamp": "00:12-00:17", + "narration": "小孩脱下外套,跑进树林, 一路奔跑,直到第二天清晨", + "OST": False + }, + { + "picture": "小孩跑到车前,慌慌张张地对女人说有人要杀他", + "timestamp": "00:26-00:31", + "narration": "原声播放2", + "OST": True + } + ] + # combine_clip_videos(combined_video_path=combined_video_path, video_paths=video_paths, video_ost_list=video_ost_list, list_script=list_script) - print(files) + cfg = VideoClipParams() + cfg.video_aspect = VideoAspect.portrait + cfg.font_name = "STHeitiMedium.ttc" + cfg.font_size = 60 + cfg.stroke_color = "#000000" + cfg.stroke_width = 1.5 + cfg.text_fore_color = "#FFFFFF" + cfg.text_background_color = "transparent" + cfg.bgm_type = "random" + cfg.bgm_file = "" + cfg.bgm_volume = 1.0 + cfg.subtitle_enabled = True + cfg.subtitle_position = "bottom" + cfg.n_threads = 2 + cfg.paragraph_number = 1 - # m = MaterialInfo() - # m.url = "/Users/harry/Downloads/IMG_2915.JPG" - # m.provider = "local" - # materials = preprocess_video([m], clip_duration=4) - # print(materials) + cfg.voice_volume = 1.0 - # txt_en = "Here's your guide to travel hacks for budget-friendly adventures" - # txt_zh = "测试长字段这是您的旅行技巧指南帮助您进行预算友好的冒险" - # font = utils.resource_dir() + "/fonts/STHeitiMedium.ttc" - # for txt in [txt_en, txt_zh]: - # t, h = wrap_text(text=txt, max_width=1000, font=font, fontsize=60) - # print(t) - # - # task_id = "aa563149-a7ea-49c2-b39f-8c32cc225baf" - # task_dir = utils.task_dir(task_id) - # video_file = f"{task_dir}/combined-1.mp4" - # audio_file = f"{task_dir}/audio.mp3" - # subtitle_file = f"{task_dir}/subtitle.srt" - # output_file = f"{task_dir}/final.mp4" - # - # # 
video_paths = [] - # # for file in os.listdir(utils.storage_dir("test")): - # # if file.endswith(".mp4"): - # # video_paths.append(os.path.join(utils.storage_dir("test"), file)) - # # - # # combine_videos(combined_video_path=video_file, - # # audio_file=audio_file, - # # video_paths=video_paths, - # # video_aspect=VideoAspect.portrait, - # # video_concat_mode=VideoConcatMode.random, - # # max_clip_duration=5, - # # threads=2) - # - # cfg = VideoParams() - # cfg.video_aspect = VideoAspect.portrait - # cfg.font_name = "STHeitiMedium.ttc" - # cfg.font_size = 60 - # cfg.stroke_color = "#000000" - # cfg.stroke_width = 1.5 - # cfg.text_fore_color = "#FFFFFF" - # cfg.text_background_color = "transparent" - # cfg.bgm_type = "random" - # cfg.bgm_file = "" - # cfg.bgm_volume = 1.0 - # cfg.subtitle_enabled = True - # cfg.subtitle_position = "bottom" - # cfg.n_threads = 2 - # cfg.paragraph_number = 1 - # - # cfg.voice_volume = 1.0 - # # generate_video(video_path=video_file, # audio_path=audio_file, # subtitle_path=subtitle_file, # output_file=output_file, # params=cfg # ) + + video_path = "../../storage/tasks/12312312/com123.mp4" + + audio_paths = ['../../storage/tasks/12312312/audio_00-00-00-03.mp3', + '../../storage/tasks/12312312/audio_00-12-00-17.mp3'] + + subtitle_path = "../../storage/tasks/12312312/subtitle_multiple.srt" + + output_file = "../../storage/tasks/12312312/out123.mp4" + + generate_video_v2(video_path=video_path, + audio_paths=audio_paths, + subtitle_path=subtitle_path, + output_file=output_file, + params=cfg + ) diff --git a/app/services/voice.py b/app/services/voice.py index e54eda9..20180ba 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1354,7 +1354,7 @@ if __name__ == "__main__": voice_name = parse_voice_name(voice_name) print(voice_name) - with open("../../resource/scripts/2024-0913-040147.json", 'r', encoding='utf-8') as f: + with open("../../resource/scripts/test.json", 'r', encoding='utf-8') as f: data = json.load(f) audio_files, 
sub_maker_list = tts_multiple(task_id="12312312", list_script=data, voice_name=voice_name, voice_rate=1) diff --git a/app/utils/utils.py b/app/utils/utils.py index 728aed2..95d796b 100644 --- a/app/utils/utils.py +++ b/app/utils/utils.py @@ -7,6 +7,7 @@ from loguru import logger import json from uuid import uuid4 import urllib3 +from datetime import datetime from app.models import const @@ -313,82 +314,15 @@ def seconds_to_time(seconds: float) -> str: return f"{int(h):02d}:{int(m):02d}:{s:06.3f}" -def load_locales(i18n_dir): - _locales = {} - for root, dirs, files in os.walk(i18n_dir): - for file in files: - if file.endswith(".json"): - lang = file.split(".")[0] - with open(os.path.join(root, file), "r", encoding="utf-8") as f: - _locales[lang] = json.loads(f.read()) - return _locales - - -def parse_extension(filename): - return os.path.splitext(filename)[1].strip().lower().replace(".", "") - - -def script_dir(sub_dir: str = ""): - d = resource_dir(f"scripts") - if sub_dir: - d = os.path.join(d, sub_dir) - if not os.path.exists(d): - os.makedirs(d) - return d - - -def video_dir(sub_dir: str = ""): - d = resource_dir(f"videos") - if sub_dir: - d = os.path.join(d, sub_dir) - if not os.path.exists(d): - os.makedirs(d) - return d - - -def split_timestamp(timestamp): - """ - 拆分时间戳 - """ - start, end = timestamp.split('-') - start_hour, start_minute = map(int, start.split(':')) - end_hour, end_minute = map(int, end.split(':')) - - start_time = '00:{:02d}:{:02d}'.format(start_hour, start_minute) - end_time = '00:{:02d}:{:02d}'.format(end_hour, end_minute) - - return start_time, end_time - - -def reduce_video_time(txt: str, duration: float = 0.21531): - """ - 按照字数缩减视频时长,一个字耗时约 0.21531 s, - Returns: - """ - # 返回结果四舍五入为整数 - duration = len(txt) * duration - return int(duration) - - -def get_current_country(): - """ - 判断当前网络IP地址所在的国家 - """ - try: - # 使用ipapi.co的免费API获取IP地址信息 - response = requests.get('https://ipapi.co/json/') - data = response.json() - - # 获取国家名称 - country = 
data.get('country_name') - - if country: - logger.debug(f"当前网络IP地址位于:{country}") - return country - else: - logger.debug("无法确定当前网络IP地址所在的国家") - return None - - except requests.RequestException: - logger.error("获取IP地址信息时发生错误,请检查网络连接") - return None +def calculate_total_duration(scenes): + total_seconds = 0 + + for scene in scenes: + start, end = scene['timestamp'].split('-') + start_time = datetime.strptime(start, '%M:%S') + end_time = datetime.strptime(end, '%M:%S') + + duration = end_time - start_time + total_seconds += duration.total_seconds() + + return total_seconds From a675e35f1da555b60456602cb9bb742828a38eb3 Mon Sep 17 00:00:00 2001 From: linyqh Date: Fri, 20 Sep 2024 00:42:33 +0800 Subject: [PATCH 04/21] =?UTF-8?q?=E8=BF=90=E8=A1=8C=E6=88=90=E5=8A=9F?= =?UTF-8?q?=EF=BC=8C=E4=BD=86=E8=84=9A=E6=9C=AC=E9=97=AE=E9=A2=98=E8=BF=98?= =?UTF-8?q?=E5=BE=88=E5=A4=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/services/llm.py | 28 ++++----- app/services/task.py | 13 +++-- app/services/video.py | 129 ++++++++++++++++++++++++++++-------------- app/services/voice.py | 4 +- app/utils/utils.py | 41 +++++++++++++- webui/Main.py | 2 +- 6 files changed, 149 insertions(+), 68 deletions(-) diff --git a/app/services/llm.py b/app/services/llm.py index c033ab5..25c6557 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -451,19 +451,19 @@ def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot """ % (language, video_plot) logger.debug(f"视频名称: {video_origin_name}") - try: - gemini_video_file = gemini.upload_file(video_origin_path) - logger.debug(f"上传视频至 Google cloud 成功: {gemini_video_file.name}") - while gemini_video_file.state.name == "PROCESSING": - import time - time.sleep(1) - gemini_video_file = gemini.get_file(gemini_video_file.name) - logger.debug(f"视频当前状态(ACTIVE才可用): {gemini_video_file.state.name}") - if gemini_video_file.state.name == "FAILED": - raise 
ValueError(gemini_video_file.state.name) - except Exception as err: - logger.error(f"上传视频至 Google cloud 失败, 请检查 VPN 配置和 APIKey 是否正确 \n{traceback.format_exc()}") - raise TimeoutError(f"上传视频至 Google cloud 失败, 请检查 VPN 配置和 APIKey 是否正确; {err}") + # try: + gemini_video_file = gemini.upload_file(video_origin_path) + logger.debug(f"上传视频至 Google cloud 成功: {gemini_video_file.name}") + while gemini_video_file.state.name == "PROCESSING": + import time + time.sleep(1) + gemini_video_file = gemini.get_file(gemini_video_file.name) + logger.debug(f"视频当前状态(ACTIVE才可用): {gemini_video_file.state.name}") + if gemini_video_file.state.name == "FAILED": + raise ValueError(gemini_video_file.state.name) + # except Exception as err: + # logger.error(f"上传视频至 Google cloud 失败, 请检查 VPN 配置和 APIKey 是否正确 \n{traceback.format_exc()}") + # raise TimeoutError(f"上传视频至 Google cloud 失败, 请检查 VPN 配置和 APIKey 是否正确; {err}") streams = model.generate_content([prompt, gemini_video_file], stream=True) response = [] @@ -490,7 +490,7 @@ if __name__ == "__main__": # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # video_subject = "卖菜大妈竟是皇嫂" - video_path = "/NarratoAI/resource/videos/demoyasuo.mp4" + video_path = "../../resource/videos/demoyasuo.mp4" video_plot = ''' ''' language = "zh-CN" diff --git a/app/services/task.py b/app/services/task.py index e58f4b4..c768e6a 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -440,12 +440,13 @@ def start_subclip(task_id, params: VideoClipParams, subclip_path_videos): logger.info(f"\n\n## 6. 
最后一步: {index} => {final_video_path}") # 把所有东西合到在一起 - video.generate_video(video_path=combined_video_path, - audio_path=audio_file, - subtitle_path=subtitle_path, - output_file=final_video_path, - params=params, - ) + video.generate_video_v2( + video_path=combined_video_path, + audio_paths=audio_files, + subtitle_path=subtitle_path, + output_file=final_video_path, + params=params, + ) _progress += 50 / params.video_count / 2 sm.state.update_task(task_id, progress=_progress) diff --git a/app/services/video.py b/app/services/video.py index 9924923..dd9907f 100644 --- a/app/services/video.py +++ b/app/services/video.py @@ -1,4 +1,5 @@ import re +import os import glob import random from typing import List @@ -369,10 +370,17 @@ def generate_video_v2( return _clip video_clip = VideoFileClip(video_path) + original_audio = video_clip.audio # 保存原始视频的音轨 + video_duration = video_clip.duration # 处理多个音频文件 audio_clips = [] for audio_path in audio_paths: + # 确保每个音频文件路径是正确的 + if not os.path.exists(audio_path): + logger.warning(f"音频文件不存在: {audio_path}") + continue + # 从文件名中提取时间信息 match = re.search(r'audio_(\d{2}-\d{2}-\d{2}-\d{2})\.mp3', os.path.basename(audio_path)) if match: @@ -382,28 +390,53 @@ def generate_video_v2( end_time = sum(int(x) * 60 ** i for i, x in enumerate(reversed(end))) audio_clip = AudioFileClip(audio_path).volumex(params.voice_volume) - audio_clip = audio_clip.set_start(start_time).set_end(end_time) + + # 确保结束时间不超过音频实际长度 + actual_end_time = min(end_time - start_time, audio_clip.duration) + + audio_clip = audio_clip.subclip(0, actual_end_time) + audio_clip = audio_clip.set_start(start_time).set_end(start_time + actual_end_time) audio_clips.append(audio_clip) else: logger.warning(f"无法从文件名解析时间信息: {audio_path}") - # 合并所有音频剪辑 + # 合并所有音频剪辑,包括原始音轨 if audio_clips: + audio_clips.insert(0, original_audio) # 将原始音轨添加到音频剪辑列表的开头 audio_clip = CompositeAudioClip(audio_clips) else: - logger.warning("没有有效的音频文件") - audio_clip = AudioClip(lambda t: 0, duration=video_clip.duration) 
+ logger.warning("没有有效的音频文件,使用原始音轨") + audio_clip = original_audio - # 字幕处理部分保持不变 + # 字幕处理部分 if subtitle_path and os.path.exists(subtitle_path): sub = SubtitlesClip(subtitles=subtitle_path, encoding="utf-8") text_clips = [] + for item in sub.subtitles: clip = create_text_clip(subtitle_item=item) + + # 确保字幕的开始时间不早于视频开始 + start_time = max(clip.start, 0) + + # 如果字幕的开始时间晚于视频结束时间,则跳过此字幕 + if start_time >= video_duration: + continue + + # 调整字幕的结束时间,但不要超过视频长度 + end_time = min(clip.end, video_duration) + + # 调整字幕的时间范围 + clip = clip.set_start(start_time).set_end(end_time) + text_clips.append(clip) + + logger.info(f"处理了 {len(text_clips)} 段字幕") + + # 创建一个新的视频剪辑,包含所有字幕 video_clip = CompositeVideoClip([video_clip, *text_clips]) - # 背景音乐处理部分保持不变 + # 背景音乐处理部分 bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file) if bgm_file: try: @@ -573,39 +606,43 @@ def combine_clip_videos(combined_video_path: str, if __name__ == "__main__": - combined_video_path = "../../storage/tasks/12312312/com123.mp4" - - video_paths = ['../../storage/cache_videos/vid-00_00-00_03.mp4', - '../../storage/cache_videos/vid-00_03-00_07.mp4', - '../../storage/cache_videos/vid-00_12-00_17.mp4', - '../../storage/cache_videos/vid-00_26-00_31.mp4'] - video_ost_list = [False, True, False, True] - list_script = [ - { - "picture": "夜晚,一个小孩在树林里奔跑,后面有人拿着火把在追赶", - "timestamp": "00:00-00:03", - "narration": "夜黑风高的树林,一个小孩在拼命奔跑,后面的人穷追不舍!", - "OST": False - }, - { - "picture": "追赶的人命令抓住小孩", - "timestamp": "00:03-00:07", - "narration": "原声播放1", - "OST": True - }, - { - "picture": "小孩躲在草丛里,黑衣人用脚踢了踢他", - "timestamp": "00:12-00:17", - "narration": "小孩脱下外套,跑进树林, 一路奔跑,直到第二天清晨", - "OST": False - }, - { - "picture": "小孩跑到车前,慌慌张张地对女人说有人要杀他", - "timestamp": "00:26-00:31", - "narration": "原声播放2", - "OST": True - } - ] + # combined_video_path = "../../storage/tasks/12312312/com123.mp4" + # + # video_paths = ['../../storage/cache_videos/vid-00_00-00_03.mp4', + # '../../storage/cache_videos/vid-00_03-00_07.mp4', + # 
'../../storage/cache_videos/vid-00_12-00_17.mp4', + # '../../storage/cache_videos/vid-00_26-00_31.mp4'] + # video_ost_list = [False, True, False, True] + # list_script = [ + # { + # "picture": "夜晚,一个小孩在树林里奔跑,后面有人拿着火把在追赶", + # "timestamp": "00:00-00:03", + # "narration": "夜黑风高的树林,一个小孩在拼命奔跑,后面的人穷追不舍!", + # "OST": False, + # "new_timestamp": "00:00-00:03" + # }, + # { + # "picture": "追赶的人命令抓住小孩", + # "timestamp": "00:03-00:07", + # "narration": "原声播放1", + # "OST": True, + # "new_timestamp": "00:03-00:07" + # }, + # { + # "picture": "小孩躲在草丛里,黑衣人用脚踢了踢他", + # "timestamp": "00:12-00:17", + # "narration": "小孩脱下外套,跑进树林, 一路奔跑,直到第二天清晨", + # "OST": False, + # "new_timestamp": "00:07-00:12" + # }, + # { + # "picture": "小孩跑到车前,慌慌张张地对女人说有人要杀他", + # "timestamp": "00:26-00:31", + # "narration": "原声播放2", + # "OST": True, + # "new_timestamp": "00:12-00:17" + # } + # ] # combine_clip_videos(combined_video_path=combined_video_path, video_paths=video_paths, video_ost_list=video_ost_list, list_script=list_script) cfg = VideoClipParams() @@ -633,14 +670,18 @@ if __name__ == "__main__": # params=cfg # ) - video_path = "../../storage/tasks/12312312/com123.mp4" + video_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/combined-1.mp4" - audio_paths = ['../../storage/tasks/12312312/audio_00-00-00-03.mp3', - '../../storage/tasks/12312312/audio_00-12-00-17.mp3'] + audio_paths = ['../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-00-00-07.mp3', + '../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-14-00-17.mp3', + '../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-17-00-22.mp3', + '../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-34-00-45.mp3', + '../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-59-01-09.mp3', + ] - subtitle_path = "../../storage/tasks/12312312/subtitle_multiple.srt" + subtitle_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa\subtitle.srt" - output_file = 
"../../storage/tasks/12312312/out123.mp4" + output_file = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/final-123.mp4" generate_video_v2(video_path=video_path, audio_paths=audio_paths, diff --git a/app/services/voice.py b/app/services/voice.py index 20180ba..785f3f1 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1213,7 +1213,7 @@ def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], lis if script_item['OST']: continue - start_time, end_time = script_item['timestamp'].split('-') + start_time, end_time = script_item['new_timestamp'].split('-') if sub_maker_index >= len(sub_maker_list): logger.error(f"Sub maker list index out of range: {sub_maker_index}") break @@ -1317,7 +1317,7 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f for item in list_script: if not item['OST']: - timestamp = item['timestamp'].replace(':', '-') + timestamp = item['new_timestamp'].replace(':', '-') audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3") # 检查文件是否已存在,如存在且不强制重新生成,则跳过 diff --git a/app/utils/utils.py b/app/utils/utils.py index 95d796b..b5a91cb 100644 --- a/app/utils/utils.py +++ b/app/utils/utils.py @@ -7,7 +7,7 @@ from loguru import logger import json from uuid import uuid4 import urllib3 -from datetime import datetime +from datetime import datetime, timedelta from app.models import const @@ -326,3 +326,42 @@ def calculate_total_duration(scenes): total_seconds += duration.total_seconds() return total_seconds + + +def add_new_timestamps(scenes): + """ + 新增新视频的时间戳,并为"原生播放"的narration添加唯一标识符 + Args: + scenes: 场景列表 + + Returns: + 更新后的场景列表 + """ + current_time = timedelta() + updated_scenes = [] + + for scene in scenes: + new_scene = scene.copy() # 创建场景的副本,以保留原始数据 + start, end = new_scene['timestamp'].split('-') + start_time = datetime.strptime(start, '%M:%S') + end_time = datetime.strptime(end, '%M:%S') + duration = end_time - start_time + + new_start = current_time + current_time += 
duration + new_end = current_time + + # 将 timedelta 转换为分钟和秒 + new_start_str = f"{int(new_start.total_seconds() // 60):02d}:{int(new_start.total_seconds() % 60):02d}" + new_end_str = f"{int(new_end.total_seconds() // 60):02d}:{int(new_end.total_seconds() % 60):02d}" + + new_scene['new_timestamp'] = f"{new_start_str}-{new_end_str}" + + # 为"原生播放"的narration添加唯一标识符 + if new_scene.get('narration') == "原声播放": + unique_id = str(uuid4())[:8] # 使用UUID的前8个字符作为唯一标识符 + new_scene['narration'] = f"原声播放_{unique_id}" + + updated_scenes.append(new_scene) + + return updated_scenes diff --git a/webui/Main.py b/webui/Main.py index 10efc55..2db4569 100644 --- a/webui/Main.py +++ b/webui/Main.py @@ -395,7 +395,7 @@ with left_panel: # 去掉json的头尾标识 input_json = input_json.strip('```json').strip('```') try: - data = json.loads(input_json) + data = utils.add_new_timestamps(json.loads(input_json)) except Exception as err: raise ValueError( f"视频脚本格式错误,请检查脚本是否符合 JSON 格式;{err} \n\n{traceback.format_exc()}") From fd9c8d0d6cf19e7a16c3522de27322331c7dfd89 Mon Sep 17 00:00:00 2001 From: linyq Date: Fri, 20 Sep 2024 17:49:08 +0800 Subject: [PATCH 05/21] =?UTF-8?q?=E5=AE=8C=E5=96=84=E8=A7=A3=E8=AF=B4=20pr?= =?UTF-8?q?ompt=EF=BC=9B=E4=B8=8B=E4=B8=80=E6=AD=A5=E6=96=87=E6=A1=88?= =?UTF-8?q?=E5=8C=B9=E9=85=8D=E5=9C=BA=E6=99=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/services/llm.py | 376 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 363 insertions(+), 13 deletions(-) diff --git a/app/services/llm.py b/app/services/llm.py index 25c6557..8ded6da 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -8,11 +8,101 @@ from openai import OpenAI from openai import AzureOpenAI from openai.types.chat import ChatCompletion import google.generativeai as gemini +from googleapiclient.errors import ResumableUploadError +from google.api_core.exceptions import FailedPrecondition +from google.generativeai.types import HarmCategory, 
HarmBlockThreshold from app.config import config _max_retries = 5 +Method = """ +重要提示:每一部剧的文案,前几句必须吸引人 +首先我们在看完看懂电影后,大脑里面要先有一个大概的轮廓,也就是一个类似于作文的大纲,电影主题线在哪里,首先要找到。 +一般将文案分为开头、内容、结尾 +## 开头部分 +文案开头三句话,是留住用户的关键! + +### 方式一:开头概括总结 +文案的前三句,是整部电影的概括总结,2-3句介绍后,开始叙述故事剧情! +推荐新手(新号)做:(盘点型) +盘点全球最恐怖的10部电影 +盘点全球最科幻的10部电影 +盘点全球最悲惨的10部电影 +盘点全球最值得看的10部灾难电影 +盘点全球最值得看的10部励志电影 + +下面的示例就是最简单的解说文案开头: +1.这是XXX国20年来最大尺度的一部剧,极度烧脑,却让99%的人看得心潮澎湃、无法自拔,故事开始…… +2.这是有史以来电影院唯一一部全程开灯放完的电影,期间无数人尖叫昏厥,他被成为勇敢者的专属,因为99%的人都不敢看到结局,许多人看完它从此不愿再碰手机,他就是大名鼎鼎的暗黑神作《XXX》…… +3.这到底是一部什么样的电影,能被55个国家公开抵制,它甚至为了上映,不惜删减掉整整47分钟的剧情…… +4.是什么样的一个人,被豆瓣网友称之为史上最牛P的老太太,都70岁了还要去贩毒…… +5.他是M国历史上最NB/惨/猖狂/冤枉……的囚犯/抢劫犯/…… +6.这到底是一部什么样的影片,他一个人就拿了4个顶级奖项,第一季8.7分,第二季直接干到9.5分,11万人给出5星好评,一共也就6集,却斩获26项国际大奖,看过的人都说,他是近年来最好的xxx剧,几乎成为了近年来xxx剧的标杆。故事发生在…… +7.他是国产电影的巅峰佳作,更是许多80-90后的青春启蒙,曾入选《时代》周刊,获得年度佳片第一,可在国内却被尘封多年,至今为止都无法在各大视频网站看到完整资源,他就是《xxxxxx》 +8.这是一部让所有人看得荷尔蒙飙升的爽片…… +9.他被成为世界上最虐心绝望的电影,至今无人敢看第二遍,很难想象,他是根据真实事件改编而来…… +10.这大概是有史以来最令人不寒而栗的电影,当年一经放映,就点燃了无数人的怒火,不少观众不等影片放完,就愤然离场,它比《xxx》更让人绝望,比比《xxx》更让人xxx,能坚持看完全片的人,更是万中无一,包括我。甚至观影结束后,有无数人抵制投诉这部电影,认为影片的导演玩弄了他们的情感!他就是顶级神作《xxxx》…… +11.这是X国有史以来最高赞的一部悬疑电影,然而却因为某些原因,国内90%的人,没能看过这部片子,他就是《xxx》…… +12.有这样一部电影,这辈子,你绝对不想再看第二遍,并不是它剧情烂俗,而是它的结局你根本承受不起/想象不到……甚至有80%的观众在观影途中情绪崩溃中途离场,更让许多同行都不想解说这部电影,他就是大名鼎鼎的暗黑神作《xxx》… +13.它被誉为史上最牛悬疑片,无数人在看完它时候,一个月不敢照镜子,这样一部仅适合部分年龄段观看的影片,究竟有什么样的魅力,竟然获得某瓣8.2的高分,很多人说这部电影到处都是看点,他就是《xxx》…. +14.这是一部在某瓣上被70万人打出9.3分的高分的电影……到底是一部什么样的电影,能够在某瓣上被70万人打出9.3分的高分…… +15.这是一部细思极恐的科幻大片,整部电影颠覆你的三观,它的名字叫…… +16.史上最震撼的灾难片,每一点都不舍得快进的电影,他叫…… +17.今天给大家带来一部基于真实事件改编的(主题介绍一句……)的故事片,这是一部连环悬疑剧,如果不看到最后绝对想不到结局竟然是这样的反转…… + +### 方式二:情景式、假设性开头 +1.他叫……你以为他是……的吗?不。他是来……然后开始叙述 +2.你知道……吗?原来……然后开始叙述 +3.如果给你….,你会怎么样? +4.如果你是….,你会怎么样? + +### 方式三:以国家为开头!简单明了。话语不需要多,但是需要讲解透彻! +1.这是一部韩国最新灾难片,你一定没有看过…… +2.这是一部印度高分悬疑片, +3.这部电影原在日本因为……而被下架, +4.这是韩国最恐怖的犯罪片, +5.这是最近国产片评分最高的悬疑片 +以上均按照影片国家来区分,然后简单介绍下主题。就可以开始直接叙述作品。也是一个很不错的方法! 
+ +### 方式四:如何自由发挥 +正常情况下,每一部电影都有非常关键的一个大纲,这部电影的主题其实是可以用一句话、两句话概括的。只要看懂电影,就能找到这个主题大纲。 +我们提前把这个主题大纲给放到影视最前面,作为我们的前三句的文案,将会非常吸引人! + +例如: +1.这不是电影,这是真实故事。两个女人和一个男人被关在可桑拿室。喊破喉咙也没有一丝回音。窒息感和热度让人抓狂,故事就是从这里开始! +2.如果你男朋友出轨了,他不爱你了,还对你家暴,怎么办?接下来这部电影就会教你如何让老公服服帖帖的呆在你身边!女主是一个……开始叙述了。 +3.他力大无穷,双眼放光,这不是拯救地球的超人吗?然而不是。今天给大家推荐的这部电影叫…… + +以上是需要看完影片,看懂影片,然后从里面提炼出精彩的几句话,当然是比较难的,当你不会自己去总结前三句的经典的话。可以用前面方式一二三! +实在想不出来如何去提炼,可以去搜索这部剧,对这部电影的影评,也会给你带过来很多灵感的! + + +## 内容部分 +开头有了,剩下的就是开始叙述正文了。主题介绍是根据影片内容来介绍,如果实在自己想不出来。可以参考其他平台中对这部电影的精彩介绍,提取2-3句也可以! +正常情况下,我们叙述的时候其实是非常简单的,把整部电影主题线,叙述下来,其实文案就是加些修饰词把电影重点内容叙述下来。加上一些修饰词。 + +以悬疑剧为例: +竟然,突然,原来,但是,但,可是,结果,直到,如果,而,果然,发现,只是,出奇,之后,没错,不止,更是,当然,因为,所以……等! +以上是比较常用的,当然还有很多,需要靠平时思考和阅读的积累!因悬疑剧会有多处反转剧情。所以需要用到反转的修饰词比较多,只有用到这些词。才能体现出各种反转剧情! +建议大家在刚开始做的时候,做8分钟内的,不要太长,分成三段。每段也是不超过三分钟,这样时间刚好。可以比较好的完成完播率! + + +## 结尾部分 +最后故事的结局,除了反转,可以来点人生的道理!如果刚开始不会,可以不写。 +后面水平越来越高的时候,可以进行人生道理的讲评。 + +比如:这部电影告诉我们…… +类似于哲理性质的,作为一个总结! +也可以把最后的影视反转,原生放出来,留下悬念。 + +比如:也可以总结下这部短片如何的好,推荐/值得大家去观看之类的话语。 +其实就是给我们的作品来一个总结,总结我们所做的三个视频,有开始就要有结束。这个结束不一定是固定的模版。但是视频一定要有结尾。让人感觉有头有尾才最舒服! +做解说是一个比较浪费脑细胞的活,虽然刚开始比较难一点,但是当你正常做三部剧之后。所有自己的思路都会被打开!以后的基本就可以独立完成来操作来。 +做解说第一次,可能会做两天。第二次可能就需要一天了。慢慢的。时间缩短到8个小时之内是我们平常的制作全部时间! + +""" + def _generate_response(prompt: str) -> str: content = "" @@ -476,26 +566,200 @@ def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot return response +def gemini_video_transcription(video_origin_name: str, video_origin_path: str, language: str): + ''' + 使用 gemini-1.5-xxx 进行视频画面转录 + ''' + api_key = config.app.get("gemini_api_key") + model_name = config.app.get("gemini_model_name") + + gemini.configure(api_key=api_key) + model = gemini.GenerativeModel(model_name=model_name) + + prompt = """ + Please transcribe the audio, include timestamps, and provide visual descriptions, then output in JSON format,use %s ONLY. 
+ + Use this JSON schema: + + Graphics = {"timestamp": "MM:SS-MM:SS", "picture": "str", "quotes": "str"(If no one says anything, use an empty string instead.)} + Return: list[Graphics] + """ % language + + logger.debug(f"视频名称: {video_origin_name}") + try: + gemini_video_file = gemini.upload_file(video_origin_path) + # gemini_video_file = gemini.get_file("files/uxo6r9n80s84") + logger.debug(f"上传视频至 Google cloud 成功: {gemini_video_file.name}") + while gemini_video_file.state.name == "PROCESSING": + import time + time.sleep(1) + gemini_video_file = gemini.get_file(gemini_video_file.name) + logger.debug(f"视频当前状态(ACTIVE才可用): {gemini_video_file.state.name}") + if gemini_video_file.state.name == "FAILED": + raise ValueError(gemini_video_file.state.name) + except ResumableUploadError as err: + logger.error(f"上传视频至 Google cloud 失败, 用户的位置信息不支持用于该API; \n{traceback.format_exc()}") + return "" + except FailedPrecondition as err: + logger.error(f"400 用户位置不支持 Google API 使用。\n{traceback.format_exc()}") + return "" + + response = model.generate_content( + [prompt, gemini_video_file], + safety_settings={ + HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, + } + ) + logger.success(f"llm 视频转录: \n{response.text}") + return response.text + + +def video_copy_writing(video_plot, video_name): + """ + 影视解说(电影解说) + """ + api_key = config.app.get("gemini_api_key") + model_name = config.app.get("gemini_model_name") + + gemini.configure(api_key=api_key) + model = gemini.GenerativeModel(model_name) + + prompt = f""" + **角色设定:** + 你是一名有10年经验的影视解说文案的创作者, + 下面是关于如何写解说文案的方法 {Method},请认真阅读它,之后我会给你一部影视作品的名称,然后让你写一篇文案 + 请根据方法撰写 《{video_name}》的影视解说文案,文案要符合以下要求: + + **任务目标:** + 1. 文案字数在 1500字左右,严格要求字数,最低不得少于 1000字。 + 2. 避免使用 markdown 格式输出文案。 + 3. 
仅输出解说文案,不输出任何其他内容。 + 4. 不要包含小标题,每个段落以 \n 进行分隔。 + """ + response = model.generate_content( + prompt, + generation_config=gemini.types.GenerationConfig( + candidate_count=1, + temperature=1.3, + ), + safety_settings={ + HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, + } + ) + print(response.text) + print("字数:", len(response.text)) + + +def short_play_commentary(video_plot: str, video_name: str): + """ + 影视解说(短剧解说) + """ + api_key = config.app.get("gemini_api_key") + model_name = config.app.get("gemini_model_name") + + gemini.configure(api_key=api_key) + model = gemini.GenerativeModel(model_name) + + if not video_plot: + raise ValueError("短剧的简介不能为空") + if not video_name: + raise ValueError("短剧名称不能为空") + + prompt = f""" + **角色设定:** + 你是一名有10年经验的短剧解说文案的创作者, + 下面是关于如何写解说文案的方法 {Method},请认真阅读它,之后我会给你一部短剧作品的简介,然后让你写一篇解说文案 + 请根据方法撰写 《{video_name}》的解说文案,《{video_name}》的大致剧情如下: {video_plot} + 文案要符合以下要求: + + **任务目标:** + 1. 文案字数在 800字左右,严格要求字数,最低不得少于 500字。 + 2. 避免使用 markdown 格式输出文案。 + 3. 仅输出解说文案,不输出任何其他内容。 + 4. 
不要包含小标题,每个段落以 \n 进行分隔。 + """ + response = model.generate_content( + prompt, + generation_config=gemini.types.GenerationConfig( + candidate_count=1, + temperature=1.0, + ), + safety_settings={ + HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, + } + ) + print(response.text) + print("字数:", len(response.text)) + + if __name__ == "__main__": """ File API 可让您为每个项目存储最多 20 GB 的文件,每个项目使用 每个文件的大小上限为 2 GB。文件会存储 48 小时。 它们可以是 在此期间使用您的 API 密钥访问,但无法下载 使用任何 API。它已在使用 Gemini 的所有地区免费提供 API 可用。 """ - import os - import sys - import requests - from app.utils.utils import get_current_country + # video_copy_writing("", "阿甘正传") - # # 添加当前目录到系统路径 - # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + video_plot = """ + ## 短剧《卖菜大妈竟是皇嫂》分析 + +**主要剧情:** + +短剧《卖菜大妈竟是皇嫂》讲述了农妇刘桂花在逃荒途中意外救助了一名孩童,这个孩童正是当时的五皇子。然而,在救五皇子的过程中,刘桂花失去了自己的儿子志洲。二十年后,五皇子长大成人,并与刘桂花重逢。刘桂花在得知真相后,面对着皇室的权势和自己的过往,最终选择勇敢地面对命运,并最终收获了幸福。 + +**内容:** + +短剧以古装仙侠为题材,融合了穿越、宫廷、爱情等元素,展现了主角刘桂花从一个平凡的卖菜大妈成长为皇室成员的传奇故事。剧中展现了刘桂花善良、勇敢、坚韧的性格,以及她与五皇子之间错综复杂的情感纠葛。 + +**核心信息:** + +这部短剧的核心信息是“命运的安排,无法改变,但我们可以选择如何面对”。刘桂花在经历了失去儿子的痛苦和与五皇子重逢的惊喜后,最终选择了勇敢地面对命运,并最终获得了幸福。这体现了人性的善良、勇敢和坚韧,也展现了对美好生活的追求和对命运的掌控。 + +**人物:** + +* **刘桂花:** 短剧的主角,一位善良、勇敢、坚韧的农妇。她经历了失去儿子的痛苦,却依然保持着善良的本性,最终获得了幸福。 +* **五皇子:** 皇室成员,与刘桂花有着特殊的缘分。他善良、正直、勇敢,最终与刘桂花相爱。 + +**思考:** + +这部短剧带给我们的思考是,面对命运的安排,我们应该保持勇敢和坚韧,积极地面对生活,追求美好的生活,而不是一味地沉溺于痛苦之中。同时,短剧也提醒我们,人性的善良和勇敢,是战胜困难、获得幸福的关键。 + +**总结:** + +《卖菜大妈竟是皇嫂》是一部以女性视角展开的古装仙侠题材作品,讲述了主角刘桂花从一个平凡的卖菜大妈成长为皇室成员的传奇故事。剧中展现了刘桂花善良、勇敢、坚韧的性格,以及她与五皇子之间错综复杂的情感纠葛。这部短剧的核心信息是“命运的安排,无法改变,但我们可以选择如何面对”,它鼓励人们在面对困难时,保持勇敢和坚韧,积极地面对生活,最终获得幸福。 + """ + short_play_commentary(video_plot, "卖菜大妈竟是皇嫂") + + # import os + # import sys + # import requests + # from app.utils.utils import get_current_country # - 
video_subject = "卖菜大妈竟是皇嫂" - video_path = "../../resource/videos/demoyasuo.mp4" - - video_plot = ''' ''' - language = "zh-CN" - res = gemini_video2json(video_subject, video_path, video_plot, language) - print(res) + # # # 添加当前目录到系统路径 + # # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + # # proxy_url_http = "http://127.0.0.1:7890" + # # os.environ["HTTP_PROXY"] = proxy_url_http + # # os.environ["HTTPS_PROXY"] = proxy_url_http + # + # video_subject = "卖菜大妈竟是皇嫂" + # video_path = "../../resource/videos/demoyasuo.mp4" + # # video_path = "../../resource/videos/庆余年2-1-1.mp4" + # + # video_plot = ''' ''' + # language = "zh-CN" + # # res = gemini_video2json(video_subject, video_path, video_plot, language) + # script = gemini_video_transcription(video_subject, video_path, language) + # cleaned_string = script.strip("```json").strip("```") + # res = json.loads(cleaned_string) + # print(res) # get_current_country() # api_key = config.app.get("gemini_api_key") @@ -520,3 +784,89 @@ if __name__ == "__main__": # # response = "".join(response) # logger.success(f"llm response: \n{response}") + wenan = """ +重要提示:每一部剧的文案,前几句必须吸引人 +首先我们在看完看懂电影后,大脑里面要先有一个大概的轮廓,也就是一个类似于作文的大纲,电影主题线在哪里,首先要找到。 +一般将文案分为开头、内容、结尾 +## 开头部分 +文案开头三句话,是留住用户的关键! + +### 方式一:开头概括总结 +文案的前三句,是整部电影的概括总结,2-3句介绍后,开始叙述故事剧情! 
+推荐新手(新号)做:(盘点型) +盘点全球最恐怖的10部电影 +盘点全球最科幻的10部电影 +盘点全球最悲惨的10部电影 +盘点全球最值得看的10部灾难电影 +盘点全球最值得看的10部励志电影 + +下面的示例就是最简单的解说文案开头: +1.这是XXX国20年来最大尺度的一部剧,极度烧脑,却让99%的人看得心潮澎湃、无法自拔,故事开始…… +2.这是有史以来电影院唯一一部全程开灯放完的电影,期间无数人尖叫昏厥,他被成为勇敢者的专属,因为99%的人都不敢看到结局,许多人看完它从此不愿再碰手机,他就是大名鼎鼎的暗黑神作《XXX》…… +3.这到底是一部什么样的电影,能被55个国家公开抵制,它甚至为了上映,不惜删减掉整整47分钟的剧情…… +4.是什么样的一个人,被豆瓣网友称之为史上最牛P的老太太,都70岁了还要去贩毒…… +5.他是M国历史上最NB/惨/猖狂/冤枉……的囚犯/抢劫犯/…… +6.这到底是一部什么样的影片,他一个人就拿了4个顶级奖项,第一季8.7分,第二季直接干到9.5分,11万人给出5星好评,一共也就6集,却斩获26项国际大奖,看过的人都说,他是近年来最好的xxx剧,几乎成为了近年来xxx剧的标杆。故事发生在…… +7.他是国产电影的巅峰佳作,更是许多80-90后的青春启蒙,曾入选《时代》周刊,获得年度佳片第一,可在国内却被尘封多年,至今为止都无法在各大视频网站看到完整资源,他就是《xxxxxx》 +8.这是一部让所有人看得荷尔蒙飙升的爽片…… +9.他被成为世界上最虐心绝望的电影,至今无人敢看第二遍,很难想象,他是根据真实事件改编而来…… +10.这大概是有史以来最令人不寒而栗的电影,当年一经放映,就点燃了无数人的怒火,不少观众不等影片放完,就愤然离场,它比《xxx》更让人绝望,比比《xxx》更让人xxx,能坚持看完全片的人,更是万中无一,包括我。甚至观影结束后,有无数人抵制投诉这部电影,认为影片的导演玩弄了他们的情感!他就是顶级神作《xxxx》…… +11.这是X国有史以来最高赞的一部悬疑电影,然而却因为某些原因,国内90%的人,没能看过这部片子,他就是《xxx》…… +12.有这样一部电影,这辈子,你绝对不想再看第二遍,并不是它剧情烂俗,而是它的结局你根本承受不起/想象不到……甚至有80%的观众在观影途中情绪崩溃中途离场,更让许多同行都不想解说这部电影,他就是大名鼎鼎的暗黑神作《xxx》… +13.它被誉为史上最牛悬疑片,无数人在看完它时候,一个月不敢照镜子,这样一部仅适合部分年龄段观看的影片,究竟有什么样的魅力,竟然获得某瓣8.2的高分,很多人说这部电影到处都是看点,他就是《xxx》…. +14.这是一部在某瓣上被70万人打出9.3分的高分的电影……到底是一部什么样的电影,能够在某瓣上被70万人打出9.3分的高分…… +15.这是一部细思极恐的科幻大片,整部电影颠覆你的三观,它的名字叫…… +16.史上最震撼的灾难片,每一点都不舍得快进的电影,他叫…… +17.今天给大家带来一部基于真实事件改编的(主题介绍一句……)的故事片,这是一部连环悬疑剧,如果不看到最后绝对想不到结局竟然是这样的反转…… + +### 方式二:情景式、假设性开头 +1.他叫……你以为他是……的吗?不。他是来……然后开始叙述 +2.你知道……吗?原来……然后开始叙述 +3.如果给你….,你会怎么样? +4.如果你是….,你会怎么样? + +### 方式三:以国家为开头!简单明了。话语不需要多,但是需要讲解透彻! +1.这是一部韩国最新灾难片,你一定没有看过…… +2.这是一部印度高分悬疑片, +3.这部电影原在日本因为……而被下架, +4.这是韩国最恐怖的犯罪片, +5.这是最近国产片评分最高的悬疑片 +以上均按照影片国家来区分,然后简单介绍下主题。就可以开始直接叙述作品。也是一个很不错的方法! + +### 方式四:如何自由发挥 +正常情况下,每一部电影都有非常关键的一个大纲,这部电影的主题其实是可以用一句话、两句话概括的。只要看懂电影,就能找到这个主题大纲。 +我们提前把这个主题大纲给放到影视最前面,作为我们的前三句的文案,将会非常吸引人! + +例如: +1.这不是电影,这是真实故事。两个女人和一个男人被关在可桑拿室。喊破喉咙也没有一丝回音。窒息感和热度让人抓狂,故事就是从这里开始! 
+2.如果你男朋友出轨了,他不爱你了,还对你家暴,怎么办?接下来这部电影就会教你如何让老公服服帖帖的呆在你身边!女主是一个……开始叙述了。 +3.他力大无穷,双眼放光,这不是拯救地球的超人吗?然而不是。今天给大家推荐的这部电影叫…… + +以上是需要看完影片,看懂影片,然后从里面提炼出精彩的几句话,当然是比较难的,当你不会自己去总结前三句的经典的话。可以用前面方式一二三! +实在想不出来如何去提炼,可以去搜索这部剧,对这部电影的影评,也会给你带过来很多灵感的! + + +## 内容部分 +开头有了,剩下的就是开始叙述正文了。主题介绍是根据影片内容来介绍,如果实在自己想不出来。可以参考其他平台中对这部电影的精彩介绍,提取2-3句也可以! +正常情况下,我们叙述的时候其实是非常简单的,把整部电影主题线,叙述下来,其实文案就是加些修饰词把电影重点内容叙述下来。加上一些修饰词。 + +以悬疑剧为例: +竟然,突然,原来,但是,但,可是,结果,直到,如果,而,果然,发现,只是,出奇,之后,没错,不止,更是,当然,因为,所以……等! +以上是比较常用的,当然还有很多,需要靠平时思考和阅读的积累!因悬疑剧会有多处反转剧情。所以需要用到反转的修饰词比较多,只有用到这些词。才能体现出各种反转剧情! +建议大家在刚开始做的时候,做8分钟内的,不要太长,分成三段。每段也是不超过三分钟,这样时间刚好。可以比较好的完成完播率! + + +## 结尾部分 +最后故事的结局,除了反转,可以来点人生的道理!如果刚开始不会,可以不写。 +后面水平越来越高的时候,可以进行人生道理的讲评。 + +比如:这部电影告诉我们…… +类似于哲理性质的,作为一个总结! +也可以把最后的影视反转,原生放出来,留下悬念。 + +比如:也可以总结下这部短片如何的好,推荐/值得大家去观看之类的话语。 +其实就是给我们的作品来一个总结,总结我们所做的三个视频,有开始就要有结束。这个结束不一定是固定的模版。但是视频一定要有结尾。让人感觉有头有尾才最舒服! +做解说是一个比较浪费脑细胞的活,虽然刚开始比较难一点,但是当你正常做三部剧之后。所有自己的思路都会被打开!以后的基本就可以独立完成来操作来。 +做解说第一次,可能会做两天。第二次可能就需要一天了。慢慢的。时间缩短到8个小时之内是我们平常的制作全部时间! + + """ From 93188e13282879aba0b774cea8a513df551e2a89 Mon Sep 17 00:00:00 2001 From: linyq Date: Mon, 23 Sep 2024 17:46:46 +0800 Subject: [PATCH 06/21] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E9=9F=B3=E7=94=BB?= =?UTF-8?q?=E5=90=8C=E6=AD=A5=E6=8F=90=E7=A4=BA=E8=AF=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/services/llm.py | 118 +++++++++++++++++++++++++++++++------------- app/utils/utils.py | 2 +- 2 files changed, 84 insertions(+), 36 deletions(-) diff --git a/app/services/llm.py b/app/services/llm.py index 8ded6da..f998ebb 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -617,7 +617,7 @@ def gemini_video_transcription(video_origin_name: str, video_origin_path: str, l return response.text -def video_copy_writing(video_plot, video_name): +def writing_movie(video_plot, video_name): """ 影视解说(电影解说) """ @@ -656,7 +656,7 @@ def video_copy_writing(video_plot, video_name): print("字数:", len(response.text)) -def 
short_play_commentary(video_plot: str, video_name: str): +def writing_short_play(video_plot: str, video_name: str): """ 影视解说(短剧解说) """ @@ -679,10 +679,10 @@ def short_play_commentary(video_plot: str, video_name: str): 文案要符合以下要求: **任务目标:** - 1. 文案字数在 800字左右,严格要求字数,最低不得少于 500字。 + 1. 文案字数在 800字左右,严格要求字数,最低不得少于 600字。 2. 避免使用 markdown 格式输出文案。 3. 仅输出解说文案,不输出任何其他内容。 - 4. 不要包含小标题,每个段落以 \n 进行分隔。 + 4. 不要包含小标题,每个段落以 \\n 进行分隔。 """ response = model.generate_content( prompt, @@ -701,42 +701,90 @@ def short_play_commentary(video_plot: str, video_name: str): print("字数:", len(response.text)) +def screen_matching(huamian: str, wenan: str): + """ + 画面匹配 + """ + api_key = config.app.get("gemini_api_key") + model_name = config.app.get("gemini_model_name") + + gemini.configure(api_key=api_key) + model = gemini.GenerativeModel(model_name) + + if not huamian: + raise ValueError("画面不能为空") + if not wenan: + raise ValueError("文案不能为空") + + prompt = """ + 你是一名有10年经验的影视解说创作者, + 你的任务是根据画面描述文本和解说文案,匹配出每段解说文案对应的画面时间戳, 结果以 json 格式输出。 + + 画面描述文本和文案(由 XML 标记分隔)如下所示: + + %s + + + + %s + + + Use this JSON schema: + script = {'picture': str, 'timestamp': str, "narration": str, "OST": bool} + Return: list[script] + """ % (huamian, wenan) + + logger.info(prompt) + + response = model.generate_content( + prompt, + generation_config=gemini.types.GenerationConfig( + candidate_count=1, + temperature=1.0, + ), + safety_settings={ + HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, + } + ) + print(response.text) + print("字数:", len(response.text)) + + + if __name__ == "__main__": + # 1. 
视频转录 + # video_subject = "第二十条之无罪释放" + # video_path = "../../resource/videos/test01.mp4" + # language = "zh-CN" + # gemini_video_transcription(video_subject, video_path, language) + + # 2. 解说文案 + # video_plot = """ + # 李自忠拿着儿子李牧名下的存折,去银行取钱给儿子救命,却被要求证明“你儿子是你儿子”。 + # 走投无路时碰到银行被抢劫,劫匪给了他两沓钱救命,李自忠却因此被银行以抢劫罪起诉,并顶格判处20年有期徒刑。 + # 苏醒后的李牧坚决为父亲做无罪辩护,面对银行的顶级律师团队,他一个法学院大一学生,能否力挽狂澜,创作奇迹?挥法律之利剑 ,持正义之天平! + # """ + # print(video_plot) + # res = writing_short_play(video_plot, "第二十条之无罪释放") + + wenan = """ + 这到底是一部什么样的电影,能让银行经理在法庭上公然下跪,能让无数网友为之愤怒,更能让无数人为之动容?\n +他叫李自忠,为了给儿子筹集医药费,他来到了银行,想取出儿子名下的存款,却被银行告知,要证明“你儿子是你儿子”,走投无路之下,他却被卷入了一场银行抢劫案,阴差阳错之下,劫匪给了他两沓钱,让他救儿子,本以为是希望,没想到却是绝望的开始,他因此被认定为抢劫犯,被判处20年有期徒刑。\n +然而,天无绝人之路,昏迷的儿子醒了,苏醒后的儿子,怎么也不敢相信,自己的父亲竟然被判为抢劫犯,为了给父亲讨回公道,他做出了一个决定,他要为父亲做无罪辩护,要知道,他只是一个法学院的大一学生,面对银行的顶级律师团队,他能成功吗?\n +面对种种不利证据,他一次次败诉,又一次次上诉,就像一只打不死的小强,为了找到有利的证据,他四处奔波,走访调查,甚至不惜以身犯险,只为还原事实真相,然而,真相真的会到来吗?\n +正义或许会迟到,但永远不会缺席,随着案件的审理,越来越多的疑点浮出水面,案情也发生了惊天大逆转,他究竟发现了什么?最后的真相又是什么?本案改编自真实事件,究竟是人性的扭曲,还是道德的沦丧?\n +想知道案件的最终结果吗?让我们一起走进这部电影,寻找最终的真相吧! 
""" - File API 可让您为每个项目存储最多 20 GB 的文件,每个项目使用 每个文件的大小上限为 2 GB。文件会存储 48 小时。 - 它们可以是 在此期间使用您的 API 密钥访问,但无法下载 使用任何 API。它已在使用 Gemini 的所有地区免费提供 API 可用。 - """ - # video_copy_writing("", "阿甘正传") + # 读取指定目录下的 json 文件 + with open("../../resource/scripts/zhuanlu.json", "r", encoding="utf-8") as f: + huamian = json.load(f) - video_plot = """ - ## 短剧《卖菜大妈竟是皇嫂》分析 + screen_matching(huamian, wenan) -**主要剧情:** -短剧《卖菜大妈竟是皇嫂》讲述了农妇刘桂花在逃荒途中意外救助了一名孩童,这个孩童正是当时的五皇子。然而,在救五皇子的过程中,刘桂花失去了自己的儿子志洲。二十年后,五皇子长大成人,并与刘桂花重逢。刘桂花在得知真相后,面对着皇室的权势和自己的过往,最终选择勇敢地面对命运,并最终收获了幸福。 - -**内容:** - -短剧以古装仙侠为题材,融合了穿越、宫廷、爱情等元素,展现了主角刘桂花从一个平凡的卖菜大妈成长为皇室成员的传奇故事。剧中展现了刘桂花善良、勇敢、坚韧的性格,以及她与五皇子之间错综复杂的情感纠葛。 - -**核心信息:** - -这部短剧的核心信息是“命运的安排,无法改变,但我们可以选择如何面对”。刘桂花在经历了失去儿子的痛苦和与五皇子重逢的惊喜后,最终选择了勇敢地面对命运,并最终获得了幸福。这体现了人性的善良、勇敢和坚韧,也展现了对美好生活的追求和对命运的掌控。 - -**人物:** - -* **刘桂花:** 短剧的主角,一位善良、勇敢、坚韧的农妇。她经历了失去儿子的痛苦,却依然保持着善良的本性,最终获得了幸福。 -* **五皇子:** 皇室成员,与刘桂花有着特殊的缘分。他善良、正直、勇敢,最终与刘桂花相爱。 - -**思考:** - -这部短剧带给我们的思考是,面对命运的安排,我们应该保持勇敢和坚韧,积极地面对生活,追求美好的生活,而不是一味地沉溺于痛苦之中。同时,短剧也提醒我们,人性的善良和勇敢,是战胜困难、获得幸福的关键。 - -**总结:** - -《卖菜大妈竟是皇嫂》是一部以女性视角展开的古装仙侠题材作品,讲述了主角刘桂花从一个平凡的卖菜大妈成长为皇室成员的传奇故事。剧中展现了刘桂花善良、勇敢、坚韧的性格,以及她与五皇子之间错综复杂的情感纠葛。这部短剧的核心信息是“命运的安排,无法改变,但我们可以选择如何面对”,它鼓励人们在面对困难时,保持勇敢和坚韧,积极地面对生活,最终获得幸福。 - """ - short_play_commentary(video_plot, "卖菜大妈竟是皇嫂") # import os # import sys diff --git a/app/utils/utils.py b/app/utils/utils.py index b5a91cb..dc38b90 100644 --- a/app/utils/utils.py +++ b/app/utils/utils.py @@ -358,7 +358,7 @@ def add_new_timestamps(scenes): new_scene['new_timestamp'] = f"{new_start_str}-{new_end_str}" # 为"原生播放"的narration添加唯一标识符 - if new_scene.get('narration') == "原声播放": + if new_scene.get('narration') == "原声播放" or new_scene.get('narration') == None: unique_id = str(uuid4())[:8] # 使用UUID的前8个字符作为唯一标识符 new_scene['narration'] = f"原声播放_{unique_id}" From 6669b2836151c780127cd07861adeb0fc9ab32df Mon Sep 17 00:00:00 2001 From: linyq Date: Tue, 24 Sep 2024 18:25:02 +0800 Subject: [PATCH 07/21] =?UTF-8?q?=E4=BC=98=E5=8C=96=20webui=20=E4=BB=A3?= 
=?UTF-8?q?=E7=A0=81=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README-zh.md | 2 +- README.md | 2 +- app/models/schema.py | 3 +- docker-compose.yml | 6 +- webui.bat | 2 +- webui/Main.py => webui.py | 136 ++++++++++++++++++++------------------ webui.sh | 2 +- webui/i18n/en.json | 2 +- webui/i18n/zh.json | 2 +- 9 files changed, 84 insertions(+), 73 deletions(-) rename webui/Main.py => webui.py (90%) diff --git a/README-zh.md b/README-zh.md index 26dfd10..a7de5ca 100644 --- a/README-zh.md +++ b/README-zh.md @@ -166,7 +166,7 @@ sudo yum install ImageMagick ``` 3. 启动 webui ```shell -streamlit run ./webui/Main.py --browser.serverAddress=127.0.0.1 --server.enableCORS=True --browser.gatherUsageStats=False +streamlit run ./webui/webui.py --browser.serverAddress=127.0.0.1 --server.enableCORS=True --browser.gatherUsageStats=False ``` 4. 访问 http://127.0.0.1:8501 diff --git a/README.md b/README.md index e874f1c..43c07a2 100644 --- a/README.md +++ b/README.md @@ -167,7 +167,7 @@ sudo yum install ImageMagick 3. initiate webui ```shell -streamlit run ./webui/Main.py --browser.serverAddress=127.0.0.1 --server.enableCORS=True --browser.gatherUsageStats=False +streamlit run ./webui/webui.py --browser.serverAddress=127.0.0.1 --server.enableCORS=True --browser.gatherUsageStats=False ``` 4. 
Access http://127.0.0.1:8501 diff --git a/app/models/schema.py b/app/models/schema.py index 25e3ce8..b90a4c1 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -339,7 +339,7 @@ class VideoClipParams(BaseModel): video_count: Optional[int] = 1 # 视频片段数量 video_source: Optional[str] = "local" video_language: Optional[str] = "" # 自动检测 - video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value + # video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value # # 女性 # "zh-CN-XiaoxiaoNeural", @@ -366,5 +366,6 @@ class VideoClipParams(BaseModel): font_size: int = 60 # 文字大小 stroke_color: Optional[str] = "#000000" # 文字描边颜色 stroke_width: float = 1.5 # 文字描边宽度 + custom_position: float = 70.0 # 自定义位置 n_threads: Optional[int] = 2 # 线程数 paragraph_number: Optional[int] = 1 # 段落数量 diff --git a/docker-compose.yml b/docker-compose.yml index 6c7d6ae..cc94678 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,7 +20,9 @@ services: dockerfile: Dockerfile container_name: "api" ports: - - "8502:8080" - command: [ "python3", "main.py" ] + - "8502:22" + command: [ "sleep", "48h" ] volumes: *common-volumes + environment: + - "VPN_PROXY_URL=http://host.docker.internal:7890" restart: always diff --git a/webui.bat b/webui.bat index a8a1c00..111e1d3 100644 --- a/webui.bat +++ b/webui.bat @@ -40,4 +40,4 @@ pause rem set HF_ENDPOINT=https://hf-mirror.com -streamlit run .\webui\Main.py --browser.gatherUsageStats=False --server.enableCORS=True +streamlit run webui.py --browser.gatherUsageStats=False --server.enableCORS=True diff --git a/webui/Main.py b/webui.py similarity index 90% rename from webui/Main.py rename to webui.py index 2db4569..27e4b1c 100644 --- a/webui/Main.py +++ b/webui.py @@ -5,24 +5,26 @@ import json import time import datetime import traceback +import streamlit as st +from uuid import uuid4 +import platform +import streamlit.components.v1 as components +from loguru import logger -# 将项目的根目录添加到系统路径中,以允许从项目导入模块 -root_dir = 
os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +from app.config import config +from app.models.const import FILE_TYPE_VIDEOS +from app.models.schema import VideoClipParams, VideoAspect, VideoConcatMode +from app.services import task as tm, llm, voice, material +from app.utils import utils + +# # 将项目的根目录添加到系统路径中,以允许从项目导入模块 +root_dir = os.path.dirname(os.path.realpath(__file__)) if root_dir not in sys.path: sys.path.append(root_dir) print("******** sys.path ********") print(sys.path) print("") -import streamlit as st - -import os -from uuid import uuid4 -import platform -import streamlit.components.v1 as components -from loguru import logger -from app.config import config - st.set_page_config( page_title="NarratoAI", page_icon="📽️", @@ -35,11 +37,6 @@ st.set_page_config( }, ) -from app.models.const import FILE_TYPE_IMAGES, FILE_TYPE_VIDEOS -from app.models.schema import VideoClipParams, VideoAspect, VideoConcatMode -from app.services import task as tm, llm, voice, material -from app.utils import utils - proxy_url_http = config.proxy.get("http", "") or os.getenv("VPN_PROXY_URL", "") proxy_url_https = config.proxy.get("https", "") or os.getenv("VPN_PROXY_URL", "") os.environ["HTTP_PROXY"] = proxy_url_http @@ -278,18 +275,23 @@ with left_panel: "name": os.path.basename(file), "size": os.path.getsize(file), "file": file, + "ctime": os.path.getctime(file) # 获取文件创建时间 }) - script_path = [(tr("Auto Generate"), ""), ] - for code in [file['file'] for file in script_list]: - script_path.append((code, code)) + # 按创建时间降序排序 + script_list.sort(key=lambda x: x["ctime"], reverse=True) - selected_json2 = st.selectbox(tr("Script Files"), - index=0, - options=range(len(script_path)), # 使用索引作为内部选项值 - format_func=lambda x: script_path[x][0] # 显示给用户的是标签 - ) - params.video_clip_json = script_path[selected_json2][1] + # 脚本文件 下拉框 + script_path = [(tr("Auto Generate"), ""), ] + for file in script_list: + display_name = file['file'].replace(root_dir, "") + 
script_path.append((display_name, file['file'])) + selected_script_index = st.selectbox(tr("Script Files"), + index=0, + options=range(len(script_path)), # 使用索引作为内部选项值 + format_func=lambda x: script_path[x][0] # 显示给用户的是标签 + ) + params.video_clip_json = script_path[selected_script_index][1] video_json_file = params.video_clip_json # 视频文件处理 @@ -310,12 +312,12 @@ with left_panel: for code in [file['file'] for file in video_list]: video_path.append((code, code)) - selected_index2 = st.selectbox(tr("Video File"), - index=0, - options=range(len(video_path)), # 使用索引作为内部选项值 - format_func=lambda x: video_path[x][0] # 显示给用户的是标签 - ) - params.video_origin_path = video_path[selected_index2][1] + selected_video_index = st.selectbox(tr("Video File"), + index=0, + options=range(len(video_path)), # 使用索引作为内部选项值 + format_func=lambda x: video_path[x][0] # 显示给用户的是标签 + ) + params.video_origin_path = video_path[selected_video_index][1] config.app["video_origin_path"] = params.video_origin_path # 从本地上传 mp4 文件 @@ -341,8 +343,6 @@ with left_panel: st.success(tr("File Uploaded Successfully")) time.sleep(1) st.rerun() - # params.video_origin_path = video_path[selected_index2][1] - # config.app["video_origin_path"] = params.video_origin_path # 剧情内容 video_plot = st.text_area( @@ -351,12 +351,13 @@ with left_panel: height=180 ) + # 生成视频脚本 if st.button(tr("Video Script Generate"), key="auto_generate_script"): with st.spinner(tr("Video Script Generate")): if video_json_file == "" and params.video_origin_path != "": # 使用大模型生成视频脚本 script = llm.gemini_video2json( - video_origin_name=params.video_origin_path.split("\\")[-1], + video_origin_name=os.path.basename(params.video_origin_path), video_origin_path=params.video_origin_path, video_plot=video_plot, language=params.video_language, @@ -371,12 +372,14 @@ with left_panel: cleaned_string = script.strip("```json").strip("```") st.session_state['video_script_list'] = json.loads(cleaned_string) + # 视频脚本 video_clip_json_details = st.text_area( tr("Video 
Script"), value=st.session_state['video_clip_json'], height=180 ) + # 保存脚本 button_columns = st.columns(2) with button_columns[0]: if st.button(tr("Save Script"), key="auto_generate_terms", use_container_width=True): @@ -397,20 +400,23 @@ with left_panel: try: data = utils.add_new_timestamps(json.loads(input_json)) except Exception as err: - raise ValueError( - f"视频脚本格式错误,请检查脚本是否符合 JSON 格式;{err} \n\n{traceback.format_exc()}") + st.error(f"视频脚本格式错误,请检查脚本是否符合 JSON 格式;{err} \n\n{traceback.format_exc()}") + st.stop() # 检查是否是一个列表 if not isinstance(data, list): - raise ValueError("JSON is not a list") + st.error("JSON is not a list") + st.stop() # 检查列表中的每个元素是否包含所需的键 required_keys = {"picture", "timestamp", "narration"} for item in data: if not isinstance(item, dict): - raise ValueError("List 元素不是字典") + st.error("List 元素不是字典") + st.stop() if not required_keys.issubset(item.keys()): - raise ValueError("Dict 元素不包含必需的键") + st.error("Dict 元素不包含必需的键") + st.stop() # 存储为新的 JSON 文件 with open(save_path, 'w', encoding='utf-8') as file: @@ -441,13 +447,13 @@ with left_panel: for video_script in video_script_list: try: video_script['path'] = subclip_videos[video_script['timestamp']] - except KeyError as e: - st.error(f"裁剪视频失败") + except KeyError as err: + st.error(f"裁剪视频失败 {err}") # logger.debug(f"当前的脚本为:{st.session_state.video_script_list}") else: st.error(tr("请先生成视频脚本")) - + # 裁剪视频 with button_columns[1]: if st.button(tr("Crop Video"), key="auto_crop_video", use_container_width=True): caijian() @@ -456,10 +462,10 @@ with left_panel: with middle_panel: with st.container(border=True): st.write(tr("Video Settings")) - video_concat_modes = [ - (tr("Sequential"), "sequential"), - (tr("Random"), "random"), - ] + # video_concat_modes = [ + # (tr("Sequential"), "sequential"), + # (tr("Random"), "random"), + # ] # video_sources = [ # (tr("Pexels"), "pexels"), # (tr("Pixabay"), "pixabay"), @@ -491,16 +497,17 @@ with middle_panel: # accept_multiple_files=True, # ) - selected_index = 
st.selectbox( - tr("Video Concat Mode"), - index=1, - options=range(len(video_concat_modes)), # 使用索引作为内部选项值 - format_func=lambda x: video_concat_modes[x][0], # 显示给用户的是标签 - ) - params.video_concat_mode = VideoConcatMode( - video_concat_modes[selected_index][1] - ) + # selected_index = st.selectbox( + # tr("Video Concat Mode"), + # index=1, + # options=range(len(video_concat_modes)), # 使用索引作为内部选项值 + # format_func=lambda x: video_concat_modes[x][0], # 显示给用户的是标签 + # ) + # params.video_concat_mode = VideoConcatMode( + # video_concat_modes[selected_index][1] + # ) + # 视频比例 video_aspect_ratios = [ (tr("Portrait"), VideoAspect.portrait.value), (tr("Landscape"), VideoAspect.landscape.value), @@ -512,14 +519,14 @@ with middle_panel: ) params.video_aspect = VideoAspect(video_aspect_ratios[selected_index][1]) - params.video_clip_duration = st.selectbox( - tr("Clip Duration"), options=[2, 3, 4, 5, 6, 7, 8, 9, 10], index=1 - ) - params.video_count = st.selectbox( - tr("Number of Videos Generated Simultaneously"), - options=[1, 2, 3, 4, 5], - index=0, - ) + # params.video_clip_duration = st.selectbox( + # tr("Clip Duration"), options=[2, 3, 4, 5, 6, 7, 8, 9, 10], index=1 + # ) + # params.video_count = st.selectbox( + # tr("Number of Videos Generated Simultaneously"), + # options=[1, 2, 3, 4, 5], + # index=0, + # ) with st.container(border=True): st.write(tr("Audio Settings")) @@ -638,7 +645,7 @@ with middle_panel: index=2, ) -# 新右侧面板 +# 新侧面板 with right_panel: with st.container(border=True): st.write(tr("Subtitle Settings")) @@ -676,6 +683,7 @@ with right_panel: if params.custom_position < 0 or params.custom_position > 100: st.error(tr("Please enter a value between 0 and 100")) except ValueError: + logger.error(f"输入的值无效: {traceback.format_exc()}") st.error(tr("Please enter a valid number")) font_cols = st.columns([0.3, 0.7]) diff --git a/webui.sh b/webui.sh index 4b7b7a4..001eaae 100644 --- a/webui.sh +++ b/webui.sh @@ -47,4 +47,4 @@ done # 等待所有后台任务完成 wait echo "所有文件已成功下载到指定目录" 
-streamlit run ./webui/Main.py --browser.serverAddress="0.0.0.0" --server.enableCORS=True --server.maxUploadSize=2048 --browser.gatherUsageStats=False +streamlit run webui.py --browser.serverAddress="0.0.0.0" --server.enableCORS=True --server.maxUploadSize=2048 --browser.gatherUsageStats=False diff --git a/webui/i18n/en.json b/webui/i18n/en.json index 6ec7f08..e0f2900 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -73,7 +73,7 @@ "Please Enter the LLM API Key": "Please enter the **LLM API Key**", "Please Enter the Pexels API Key": "Please enter the **Pexels API Key**", "Please Enter the Pixabay API Key": "Please enter the **Pixabay API Key**", - "Get Help": "One-stop AI video commentary + automated editing tool\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\nFor any questions or suggestions, you can join the **community channel** for help or discussion: https://discord.gg/WBKChhmZ", + "Get Help": "One-stop AI video commentary + automated editing tool\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\nFor any questions or suggestions, you can join the **community channel** for help or discussion: https://github.com/linyqh/NarratoAI/wiki", "Video Source": "Video Source", "TikTok": "TikTok (Support is coming soon)", "Bilibili": "Bilibili (Support is coming soon)", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index fd4500e..8a77698 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -73,7 +73,7 @@ "Please Enter the LLM API Key": "请先填写大模型 **API Key**", "Please Enter the Pexels API Key": "请先填写 **Pexels API Key**", "Please Enter the Pixabay API Key": "请先填写 **Pixabay API Key**", - "Get Help": "一站式 AI 影视解说+自动化剪辑工具\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\n有任何问题或建议,可以加入 **社区频道** 求助或讨论:https://discord.gg/WBKChhmZ", + "Get Help": "一站式 AI 影视解说+自动化剪辑工具\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\n有任何问题或建议,可以加入 **社区频道** 求助或讨论:https://github.com/linyqh/NarratoAI/wiki", "Video Source": "视频来源", "TikTok": "抖音 (TikTok 支持中,敬请期待)", "Bilibili": "哔哩哔哩 (Bilibili 支持中,敬请期待)", From 
d6663fde2128efb5f4f25eab0638503a68f7ba7c Mon Sep 17 00:00:00 2001 From: linyqh Date: Wed, 25 Sep 2024 01:43:55 +0800 Subject: [PATCH 08/21] =?UTF-8?q?=E4=BC=98=E5=8C=96=20webui.py;=20?= =?UTF-8?q?=E8=84=9A=E6=9C=AC=E5=89=AA=E8=BE=91=E4=B8=8D=E7=A8=B3=E5=AE=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/services/llm.py | 340 +++++++++++++------------------------------- webui.py | 49 ++++--- webui/i18n/zh.json | 1 + 3 files changed, 129 insertions(+), 261 deletions(-) diff --git a/app/services/llm.py b/app/services/llm.py index f998ebb..bafb925 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -1,7 +1,8 @@ -import logging +import os import re import json import traceback +import streamlit as st from typing import List from loguru import logger from openai import OpenAI @@ -11,6 +12,7 @@ import google.generativeai as gemini from googleapiclient.errors import ResumableUploadError from google.api_core.exceptions import FailedPrecondition from google.generativeai.types import HarmCategory, HarmBlockThreshold +import subprocess from app.config import config @@ -29,29 +31,29 @@ Method = """ 盘点全球最恐怖的10部电影 盘点全球最科幻的10部电影 盘点全球最悲惨的10部电影 -盘点全球最值得看的10部灾难电影 +盘全球最值得看的10部灾难电影 盘点全球最值得看的10部励志电影 下面的示例就是最简单的解说文案开头: 1.这是XXX国20年来最大尺度的一部剧,极度烧脑,却让99%的人看得心潮澎湃、无法自拔,故事开始…… 2.这是有史以来电影院唯一一部全程开灯放完的电影,期间无数人尖叫昏厥,他被成为勇敢者的专属,因为99%的人都不敢看到结局,许多人看完它从此不愿再碰手机,他就是大名鼎鼎的暗黑神作《XXX》…… 3.这到底是一部什么样的电影,能被55个国家公开抵制,它甚至为了上映,不惜删减掉整整47分钟的剧情…… -4.是什么样的一个人,被豆瓣网友称之为史上最牛P的老太太,都70岁了还要去贩毒…… +4.是什么样的一个人被豆瓣网友称之为史上最牛P的老太太,都70岁了还要去贩毒…… 5.他是M国历史上最NB/惨/猖狂/冤枉……的囚犯/抢劫犯/…… 6.这到底是一部什么样的影片,他一个人就拿了4个顶级奖项,第一季8.7分,第二季直接干到9.5分,11万人给出5星好评,一共也就6集,却斩获26项国际大奖,看过的人都说,他是近年来最好的xxx剧,几乎成为了近年来xxx剧的标杆。故事发生在…… 7.他是国产电影的巅峰佳作,更是许多80-90后的青春启蒙,曾入选《时代》周刊,获得年度佳片第一,可在国内却被尘封多年,至今为止都无法在各大视频网站看到完整资源,他就是《xxxxxx》 8.这是一部让所有人看得荷尔蒙飙升的爽片…… 9.他被成为世界上最虐心绝望的电影,至今无人敢看第二遍,很难想象,他是根据真实事件改编而来…… 
-10.这大概是有史以来最令人不寒而栗的电影,当年一经放映,就点燃了无数人的怒火,不少观众不等影片放完,就愤然离场,它比《xxx》更让人绝望,比比《xxx》更让人xxx,能坚持看完全片的人,更是万中无一,包括我。甚至观影结束后,有无数人抵制投诉这部电影,认为影片的导演玩弄了他们的情感!他就是顶级神作《xxxx》…… +10.这大概是有史以来最令人不寒而栗的电影,当年一经放映,就点燃了无数人的怒火,不少观众不等影片放完,就愤然离场,它比《xxx》更让人绝望,比比《xxx》更让人xxx,能坚持看完全片的人,更是万中无一,包括我。甚至观影结束后,有无数人抵制投诉这部电影,认为影片的导演玩弄了他们的情感!他是顶级神作《xxxx》…… 11.这是X国有史以来最高赞的一部悬疑电影,然而却因为某些原因,国内90%的人,没能看过这部片子,他就是《xxx》…… 12.有这样一部电影,这辈子,你绝对不想再看第二遍,并不是它剧情烂俗,而是它的结局你根本承受不起/想象不到……甚至有80%的观众在观影途中情绪崩溃中途离场,更让许多同行都不想解说这部电影,他就是大名鼎鼎的暗黑神作《xxx》… -13.它被誉为史上最牛悬疑片,无数人在看完它时候,一个月不敢照镜子,这样一部仅适合部分年龄段观看的影片,究竟有什么样的魅力,竟然获得某瓣8.2的高分,很多人说这部电影到处都是看点,他就是《xxx》…. +13.它被誉为史上最牛悬疑片无数人在看完它时候,一个月不敢照镜子,这样一部仅适合部分年龄段观看的影片,究竟有什么样的魅力,竟然获得某瓣8.2的高分,很多人说这部电影到处都是看点,他就是《xxx》…. 14.这是一部在某瓣上被70万人打出9.3分的高分的电影……到底是一部什么样的电影,能够在某瓣上被70万人打出9.3分的高分…… 15.这是一部细思极恐的科幻大片,整部电影颠覆你的三观,它的名字叫…… 16.史上最震撼的灾难片,每一点都不舍得快进的电影,他叫…… 17.今天给大家带来一部基于真实事件改编的(主题介绍一句……)的故事片,这是一部连环悬疑剧,如果不看到最后绝对想不到结局竟然是这样的反转…… -### 方式二:情景式、假设性开头 +### 方式:情景式、假设性开头 1.他叫……你以为他是……的吗?不。他是来……然后开始叙述 2.你知道……吗?原来……然后开始叙述 3.如果给你….,你会怎么样? @@ -71,7 +73,7 @@ Method = """ 例如: 1.这不是电影,这是真实故事。两个女人和一个男人被关在可桑拿室。喊破喉咙也没有一丝回音。窒息感和热度让人抓狂,故事就是从这里开始! -2.如果你男朋友出轨了,他不爱你了,还对你家暴,怎么办?接下来这部电影就会教你如何让老公服服帖帖的呆在你身边!女主是一个……开始叙述了。 +2.如果你男朋友出轨了,他不爱你了,还你家暴,怎么办?接下来这部电影就会教你如何让老公服服帖帖的呆在你身边!女主是一个……开始叙述了。 3.他力大无穷,双眼放光,这不是拯救地球的超人吗?然而不是。今天给大家推荐的这部电影叫…… 以上是需要看完影片,看懂影片,然后从里面提炼出精彩的几句话,当然是比较难的,当你不会自己去总结前三句的经典的话。可以用前面方式一二三! @@ -98,8 +100,7 @@ Method = """ 比如:也可以总结下这部短片如何的好,推荐/值得大家去观看之类的话语。 其实就是给我们的作品来一个总结,总结我们所做的三个视频,有开始就要有结束。这个结束不一定是固定的模版。但是视频一定要有结尾。让人感觉有头有尾才最舒服! -做解说是一个比较浪费脑细胞的活,虽然刚开始比较难一点,但是当你正常做三部剧之后。所有自己的思路都会被打开!以后的基本就可以独立完成来操作来。 -做解说第一次,可能会做两天。第二次可能就需要一天了。慢慢的。时间缩短到8个小时之内是我们平常的制作全部时间! +做解说第一次,可能会做两天。第二次可能就需要一天了。慢慢的。时间缩短到8个小时之内是我们平的制作全部时间! 
""" @@ -344,76 +345,73 @@ def _generate_response(prompt: str) -> str: return content.replace("\n", "") +def compress_video(input_path: str, output_path: str): + """ + 压缩视频文件 + Args: + input_path: 输入视频文件路径 + output_path: 输出压缩后的视频文件路径 + """ + ffmpeg_path = "E:\\projects\\NarratoAI_v0.1.2\\lib\\ffmpeg\\ffmpeg-7.0-essentials_build\\ffmpeg.exe" # 指定 ffmpeg 的完整路径 + + # 如果压缩后的视频文件已经存在,则直接使用 + if os.path.exists(output_path): + logger.info(f"压缩视频文件已存在: {output_path}") + return + + try: + command = [ + ffmpeg_path, + "-i", input_path, + "-c:v", "h264", + "-b:v", "500k", + "-c:a", "aac", + "-b:a", "128k", + output_path + ] + subprocess.run(command, check=True) + except subprocess.CalledProcessError as e: + logger.error(f"视频压缩失败: {e}") + raise + + def generate_script( - video_subject: str, language: str = "", paragraph_number: int = 1 + video_path: str, video_plot: str, video_name: str, language: str = "zh-CN", progress_text: st.empty = st.empty() ) -> str: - prompt = f""" -# Role: Video Script Generator + """ + 生成视频剪辑脚本 + Args: + video_path: 视频文件路径 + video_plot: 视频剧情内容 + video_name: 视频名称 + language: 语言 -## Goals: -Generate a script for a video, depending on the subject of the video. + Returns: + str: 生成的脚本 + """ + # 1. 压缩视频 + progress_text.text("压缩视频中...") + compressed_video_path = f"{os.path.splitext(video_path)[0]}_compressed.mp4" + compress_video(video_path, compressed_video_path) -## Constrains: -1. the script is to be returned as a string with the specified number of paragraphs. -2. do not under any circumstance reference this prompt in your response. -3. get straight to the point, don't start with unnecessary things like, "welcome to this video". -4. you must not include any type of markdown or formatting in the script, never use a title. -5. only return the raw content of the script. -6. do not include "voiceover", "narrator" or similar indicators of what should be spoken at the beginning of each paragraph or line. -7. 
you must not mention the prompt, or anything about the script itself. also, never talk about the amount of paragraphs or lines. just write the script. -8. respond in the same language as the video subject. + # 2. 转录视频 + transcription = gemini_video_transcription(video_name=video_name, video_path=compressed_video_path, language=language, progress_text=progress_text) -# Initialization: -- video subject: {video_subject} -- number of paragraphs: {paragraph_number} -""".strip() - if language: - prompt += f"\n- language: {language}" + # # 清理压缩后的视频文件 + # try: + # os.remove(compressed_video_path) + # except OSError as e: + # logger.warning(f"删除压缩视频文件失败: {e}") - final_script = "" - logger.info(f"subject: {video_subject}") + # 3. 编写解说文案 + progress_text.text("解说文案中...") + script = writing_short_play(video_plot, video_name) - def format_response(response): - # Clean the script - # Remove asterisks, hashes - response = response.replace("*", "") - response = response.replace("#", "") + # 4. 文案匹配画面 + progress_text.text("画面匹配中...") + matched_script = screen_matching(huamian=transcription, wenan=script) - # Remove markdown syntax - response = re.sub(r"\[.*\]", "", response) - response = re.sub(r"\(.*\)", "", response) - - # Split the script into paragraphs - paragraphs = response.split("\n\n") - - # Select the specified number of paragraphs - selected_paragraphs = paragraphs[:paragraph_number] - - # Join the selected paragraphs into a single string - return "\n\n".join(paragraphs) - - for i in range(_max_retries): - try: - response = _generate_response(prompt=prompt) - if response: - final_script = format_response(response) - else: - logging.error("gpt returned an empty response") - - # g4f may return an error message - if final_script and "当日额度已消耗完" in final_script: - raise ValueError(final_script) - - if final_script: - break - except Exception as e: - logger.error(f"failed to generate script: {e}") - - if i < _max_retries: - logger.warning(f"failed to generate video script, 
trying again... {i + 1}") - - logger.success(f"completed: \n{final_script}") - return final_script.strip() + return matched_script def generate_terms(video_subject: str, video_script: str, amount: int = 5) -> List[str]: @@ -510,7 +508,7 @@ def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot **输入示例:** ```text -在一个黑暗的小巷中,主角缓慢走进,四周静谧无声,只有远处隐隐传来猫的叫声。突然,背后出现一个神秘的身影。 +在一个���暗的小巷中,主角缓慢走进,四周静谧无声,只有远处隐隐传来猫的叫声。突然,背后出现一个神秘的身影。 ``` **输出格式:** @@ -566,7 +564,7 @@ def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot return response -def gemini_video_transcription(video_origin_name: str, video_origin_path: str, language: str): +def gemini_video_transcription(video_name: str, video_path: str, language: str, progress_text: st.empty = ""): ''' 使用 gemini-1.5-xxx 进行视频画面转录 ''' @@ -577,24 +575,25 @@ def gemini_video_transcription(video_origin_name: str, video_origin_path: str, l model = gemini.GenerativeModel(model_name=model_name) prompt = """ - Please transcribe the audio, include timestamps, and provide visual descriptions, then output in JSON format,use %s ONLY. - + Please transcribe the audio, include timestamps, and provide visual descriptions, then output in JSON format. 
+ Please use %s output Use this JSON schema: Graphics = {"timestamp": "MM:SS-MM:SS", "picture": "str", "quotes": "str"(If no one says anything, use an empty string instead.)} Return: list[Graphics] """ % language - logger.debug(f"视频名称: {video_origin_name}") + logger.debug(f"视频名称: {video_name}") try: - gemini_video_file = gemini.upload_file(video_origin_path) - # gemini_video_file = gemini.get_file("files/uxo6r9n80s84") + progress_text.text("上传视频中...") + gemini_video_file = gemini.upload_file(video_path) logger.debug(f"上传视频至 Google cloud 成功: {gemini_video_file.name}") while gemini_video_file.state.name == "PROCESSING": import time time.sleep(1) gemini_video_file = gemini.get_file(gemini_video_file.name) - logger.debug(f"视频当前状态(ACTIVE才可用): {gemini_video_file.state.name}") + progress_text.text(f"解析视频中, 当前状态: {gemini_video_file.state.name}") + # logger.debug(f"视频当前状态(ACTIVE才可用): {gemini_video_file.state.name}") if gemini_video_file.state.name == "FAILED": raise ValueError(gemini_video_file.state.name) except ResumableUploadError as err: @@ -604,6 +603,7 @@ def gemini_video_transcription(video_origin_name: str, video_origin_path: str, l logger.error(f"400 用户位置不支持 Google API 使用。\n{traceback.format_exc()}") return "" + progress_text.text("视频转录中...") response = model.generate_content( [prompt, gemini_video_file], safety_settings={ @@ -613,7 +613,7 @@ def gemini_video_transcription(video_origin_name: str, video_origin_path: str, l HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, } ) - logger.success(f"llm 视频转录: \n{response.text}") + logger.success("视频转录成功") return response.text @@ -652,8 +652,9 @@ def writing_movie(video_plot, video_name): HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, } ) - print(response.text) - print("字数:", len(response.text)) + logger.debug(response.text) + logger.debug("字数:", len(response.text)) + return response.text def writing_short_play(video_plot: str, video_name: str): @@ -697,8 +698,8 @@ 
def writing_short_play(video_plot: str, video_name: str): HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, } ) - print(response.text) - print("字数:", len(response.text)) + logger.success("解说文案生成成功") + return response.text def screen_matching(huamian: str, wenan: str): @@ -733,9 +734,6 @@ def screen_matching(huamian: str, wenan: str): script = {'picture': str, 'timestamp': str, "narration": str, "OST": bool} Return: list[script] """ % (huamian, wenan) - - logger.info(prompt) - response = model.generate_content( prompt, generation_config=gemini.types.GenerationConfig( @@ -749,9 +747,8 @@ def screen_matching(huamian: str, wenan: str): HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, } ) - print(response.text) - print("字数:", len(response.text)) - + logger.success("匹配成功") + return response.text if __name__ == "__main__": @@ -762,159 +759,12 @@ if __name__ == "__main__": # gemini_video_transcription(video_subject, video_path, language) # 2. 解说文案 - # video_plot = """ - # 李自忠拿着儿子李牧名下的存折,去银行取钱给儿子救命,却被要求证明“你儿子是你儿子”。 - # 走投无路时碰到银行被抢劫,劫匪给了他两沓钱救命,李自忠却因此被银行以抢劫罪起诉,并顶格判处20年有期徒刑。 - # 苏醒后的李牧坚决为父亲做无罪辩护,面对银行的顶级律师团队,他一个法学院大一学生,能否力挽狂澜,创作奇迹?挥法律之利剑 ,持正义之天平! - # """ - # print(video_plot) - # res = writing_short_play(video_plot, "第二十条之无罪释放") - - wenan = """ - 这到底是一部什么样的电影,能让银行经理在法庭上公然下跪,能让无数网友为之愤怒,更能让无数人为之动容?\n -他叫李自忠,为了给儿子筹集医药费,他来到了银行,想取出儿子名下的存款,却被银行告知,要证明“你儿子是你儿子”,走投无路之下,他却被卷入了一场银行抢劫案,阴差阳错之下,劫匪给了他两沓钱,让他救儿子,本以为是希望,没想到却是绝望的开始,他因此被认定为抢劫犯,被判处20年有期徒刑。\n -然而,天无绝人之路,昏迷的儿子醒了,苏醒后的儿子,怎么也不敢相信,自己的父亲竟然被判为抢劫犯,为了给父亲讨回公道,他做出了一个决定,他要为父亲做无罪辩护,要知道,他只是一个法学院的大一学生,面对银行的顶级律师团队,他能成功吗?\n -面对种种不利证据,他一次次败诉,又一次次上诉,就像一只打不死的小强,为了找到有利的证据,他四处奔波,走访调查,甚至不惜以身犯险,只为还原事实真相,然而,真相真的会到来吗?\n -正义或许会迟到,但永远不会缺席,随着案件的审理,越来越多的疑点浮出水面,案情也发生了惊天大逆转,他究竟发现了什么?最后的真相又是什么?本案改编自真实事件,究竟是人性的扭曲,还是道德的沦丧?\n -想知道案件的最终结果吗?让我们一起走进这部电影,寻找最终的真相吧! 
- """ - # 读取指定目录下的 json 文件 - with open("../../resource/scripts/zhuanlu.json", "r", encoding="utf-8") as f: - huamian = json.load(f) - - screen_matching(huamian, wenan) - - - - # import os - # import sys - # import requests - # from app.utils.utils import get_current_country - # - # # # 添加当前目录到系统路径 - # # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - # # proxy_url_http = "http://127.0.0.1:7890" - # # os.environ["HTTP_PROXY"] = proxy_url_http - # # os.environ["HTTPS_PROXY"] = proxy_url_http - # - # video_subject = "卖菜大妈竟是皇嫂" - # video_path = "../../resource/videos/demoyasuo.mp4" - # # video_path = "../../resource/videos/庆余年2-1-1.mp4" - # - # video_plot = ''' ''' - # language = "zh-CN" - # # res = gemini_video2json(video_subject, video_path, video_plot, language) - # script = gemini_video_transcription(video_subject, video_path, language) - # cleaned_string = script.strip("```json").strip("```") - # res = json.loads(cleaned_string) - # print(res) - - # get_current_country() - # api_key = config.app.get("gemini_api_key") - # model_name = config.app.get("gemini_model_name") - # gemini.configure(api_key=api_key) - # model = gemini.GenerativeModel(model_name=model_name) - # # 卖菜大妈竟是皇嫂 测试视频 - # video_name = "files/y3npkshvldsd" - # video_file = gemini.get_file(video_name) - # logger.debug(f"视频当前状态(ACTIVE才可用): {video_file.state.name}") - # - # # 转录视频并提供视觉说明 - # prompt = "Transcribe the audio, giving timestamps. Also provide visual descriptions. use ZH-CN ONLY" - # # Make the LLM request. - # print("发出 LLM 推理请求...") - # streams = model.generate_content([prompt, video_file], - # request_options={"timeout": 600}, - # stream=True) - # response = [] - # for chunk in streams: - # response.append(chunk.text) - # - # response = "".join(response) - # logger.success(f"llm response: \n{response}") - wenan = """ -重要提示:每一部剧的文案,前几句必须吸引人 -首先我们在看完看懂电影后,大脑里面要先有一个大概的轮廓,也就是一个类似于作文的大纲,电影主题线在哪里,首先要找到。 -一般将文案分为开头、内容、结尾 -## 开头部分 -文案开头三句话,是留住用户的关键! 
- -### 方式一:开头概括总结 -文案的前三句,是整部电影的概括总结,2-3句介绍后,开始叙述故事剧情! -推荐新手(新号)做:(盘点型) -盘点全球最恐怖的10部电影 -盘点全球最科幻的10部电影 -盘点全球最悲惨的10部电影 -盘点全球最值得看的10部灾难电影 -盘点全球最值得看的10部励志电影 - -下面的示例就是最简单的解说文案开头: -1.这是XXX国20年来最大尺度的一部剧,极度烧脑,却让99%的人看得心潮澎湃、无法自拔,故事开始…… -2.这是有史以来电影院唯一一部全程开灯放完的电影,期间无数人尖叫昏厥,他被成为勇敢者的专属,因为99%的人都不敢看到结局,许多人看完它从此不愿再碰手机,他就是大名鼎鼎的暗黑神作《XXX》…… -3.这到底是一部什么样的电影,能被55个国家公开抵制,它甚至为了上映,不惜删减掉整整47分钟的剧情…… -4.是什么样的一个人,被豆瓣网友称之为史上最牛P的老太太,都70岁了还要去贩毒…… -5.他是M国历史上最NB/惨/猖狂/冤枉……的囚犯/抢劫犯/…… -6.这到底是一部什么样的影片,他一个人就拿了4个顶级奖项,第一季8.7分,第二季直接干到9.5分,11万人给出5星好评,一共也就6集,却斩获26项国际大奖,看过的人都说,他是近年来最好的xxx剧,几乎成为了近年来xxx剧的标杆。故事发生在…… -7.他是国产电影的巅峰佳作,更是许多80-90后的青春启蒙,曾入选《时代》周刊,获得年度佳片第一,可在国内却被尘封多年,至今为止都无法在各大视频网站看到完整资源,他就是《xxxxxx》 -8.这是一部让所有人看得荷尔蒙飙升的爽片…… -9.他被成为世界上最虐心绝望的电影,至今无人敢看第二遍,很难想象,他是根据真实事件改编而来…… -10.这大概是有史以来最令人不寒而栗的电影,当年一经放映,就点燃了无数人的怒火,不少观众不等影片放完,就愤然离场,它比《xxx》更让人绝望,比比《xxx》更让人xxx,能坚持看完全片的人,更是万中无一,包括我。甚至观影结束后,有无数人抵制投诉这部电影,认为影片的导演玩弄了他们的情感!他就是顶级神作《xxxx》…… -11.这是X国有史以来最高赞的一部悬疑电影,然而却因为某些原因,国内90%的人,没能看过这部片子,他就是《xxx》…… -12.有这样一部电影,这辈子,你绝对不想再看第二遍,并不是它剧情烂俗,而是它的结局你根本承受不起/想象不到……甚至有80%的观众在观影途中情绪崩溃中途离场,更让许多同行都不想解说这部电影,他就是大名鼎鼎的暗黑神作《xxx》… -13.它被誉为史上最牛悬疑片,无数人在看完它时候,一个月不敢照镜子,这样一部仅适合部分年龄段观看的影片,究竟有什么样的魅力,竟然获得某瓣8.2的高分,很多人说这部电影到处都是看点,他就是《xxx》…. -14.这是一部在某瓣上被70万人打出9.3分的高分的电影……到底是一部什么样的电影,能够在某瓣上被70万人打出9.3分的高分…… -15.这是一部细思极恐的科幻大片,整部电影颠覆你的三观,它的名字叫…… -16.史上最震撼的灾难片,每一点都不舍得快进的电影,他叫…… -17.今天给大家带来一部基于真实事件改编的(主题介绍一句……)的故事片,这是一部连环悬疑剧,如果不看到最后绝对想不到结局竟然是这样的反转…… - -### 方式二:情景式、假设性开头 -1.他叫……你以为他是……的吗?不。他是来……然后开始叙述 -2.你知道……吗?原来……然后开始叙述 -3.如果给你….,你会怎么样? -4.如果你是….,你会怎么样? - -### 方式三:以国家为开头!简单明了。话语不需要多,但是需要讲解透彻! -1.这是一部韩国最新灾难片,你一定没有看过…… -2.这是一部印度高分悬疑片, -3.这部电影原在日本因为……而被下架, -4.这是韩国最恐怖的犯罪片, -5.这是最近国产片评分最高的悬疑片 -以上均按照影片国家来区分,然后简单介绍下主题。就可以开始直接叙述作品。也是一个很不错的方法! - -### 方式四:如何自由发挥 -正常情况下,每一部电影都有非常关键的一个大纲,这部电影的主题其实是可以用一句话、两句话概括的。只要看懂电影,就能找到这个主题大纲。 -我们提前把这个主题大纲给放到影视最前面,作为我们的前三句的文案,将会非常吸引人! - -例如: -1.这不是电影,这是真实故事。两个女人和一个男人被关在可桑拿室。喊破喉咙也没有一丝回音。窒息感和热度让人抓狂,故事就是从这里开始! 
-2.如果你男朋友出轨了,他不爱你了,还对你家暴,怎么办?接下来这部电影就会教你如何让老公服服帖帖的呆在你身边!女主是一个……开始叙述了。 -3.他力大无穷,双眼放光,这不是拯救地球的超人吗?然而不是。今天给大家推荐的这部电影叫…… - -以上是需要看完影片,看懂影片,然后从里面提炼出精彩的几句话,当然是比较难的,当你不会自己去总结前三句的经典的话。可以用前面方式一二三! -实在想不出来如何去提炼,可以去搜索这部剧,对这部电影的影评,也会给你带过来很多灵感的! - - -## 内容部分 -开头有了,剩下的就是开始叙述正文了。主题介绍是根据影片内容来介绍,如果实在自己想不出来。可以参考其他平台中对这部电影的精彩介绍,提取2-3句也可以! -正常情况下,我们叙述的时候其实是非常简单的,把整部电影主题线,叙述下来,其实文案就是加些修饰词把电影重点内容叙述下来。加上一些修饰词。 - -以悬疑剧为例: -竟然,突然,原来,但是,但,可是,结果,直到,如果,而,果然,发现,只是,出奇,之后,没错,不止,更是,当然,因为,所以……等! -以上是比较常用的,当然还有很多,需要靠平时思考和阅读的积累!因悬疑剧会有多处反转剧情。所以需要用到反转的修饰词比较多,只有用到这些词。才能体现出各种反转剧情! -建议大家在刚开始做的时候,做8分钟内的,不要太长,分成三段。每段也是不超过三分钟,这样时间刚好。可以比较好的完成完播率! - - -## 结尾部分 -最后故事的结局,除了反转,可以来点人生的道理!如果刚开始不会,可以不写。 -后面水平越来越高的时候,可以进行人生道理的讲评。 - -比如:这部电影告诉我们…… -类似于哲理性质的,作为一个总结! -也可以把最后的影视反转,原生放出来,留下悬念。 - -比如:也可以总结下这部短片如何的好,推荐/值得大家去观看之类的话语。 -其实就是给我们的作品来一个总结,总结我们所做的三个视频,有开始就要有结束。这个结束不一定是固定的模版。但是视频一定要有结尾。让人感觉有头有尾才最舒服! -做解说是一个比较浪费脑细胞的活,虽然刚开始比较难一点,但是当你正常做三部剧之后。所有自己的思路都会被打开!以后的基本就可以独立完成来操作来。 -做解说第一次,可能会做两天。第二次可能就需要一天了。慢慢的。时间缩短到8个小时之内是我们平常的制作全部时间! - + video_path = "E:\\projects\\NarratoAI\\resource\\videos\\2.mp4" + video_plot = """ + 李自忠拿着儿子李牧名下的存折,去银行取钱给儿子救命,却被要求证明"你儿子是你儿子"。 + 走投无路时碰到银行被抢劫,劫匪给了他两沓钱救命,李自忠却因此被银行以抢劫罪起诉,并顶格判处20年有期徒刑。 + 苏醒后的李牧坚决为父亲做无罪辩护,面对银行的顶级律师团队,他一个法学院大一学生,能否力挽狂澜,创作奇迹?挥法律之利剑 ,持正义之天平! 
""" + res = generate_script(video_path, video_plot, video_name="第二十条之无罪释放") + # res = generate_script(video_path, video_plot, video_name="海岸") + print("res \n", res) diff --git a/webui.py b/webui.py index 27e4b1c..02fceb1 100644 --- a/webui.py +++ b/webui.py @@ -23,7 +23,7 @@ if root_dir not in sys.path: sys.path.append(root_dir) print("******** sys.path ********") print(sys.path) - print("") + print("*" * 20) st.set_page_config( page_title="NarratoAI", @@ -67,6 +67,8 @@ if 'video_plot' not in st.session_state: st.session_state['video_plot'] = '' if 'ui_language' not in st.session_state: st.session_state['ui_language'] = config.ui.get("language", system_locale) +if 'script_generation_status' not in st.session_state: + st.session_state['script_generation_status'] = "" def get_all_fonts(): @@ -197,7 +199,6 @@ with st.expander(tr("Basic Settings"), expanded=False): # qwen (通义千问) # gemini # ollama - # llm_providers = ['Gemini', 'OpenAI', 'Moonshot', 'Azure', 'Qwen', 'Ollama', 'G4f', 'OneAPI', "Cloudflare"] llm_providers = ['Gemini'] saved_llm_provider = config.app.get("llm_provider", "OpenAI").lower() saved_llm_provider_index = 0 @@ -295,27 +296,30 @@ with left_panel: video_json_file = params.video_clip_json # 视频文件处理 - files = [] + video_files = [] for suffix in ["*.mp4", "*.mov", "*.avi", "*.mkv"]: - files.extend(glob.glob(os.path.join(utils.video_dir(), suffix))) - files = files[::-1] + video_files.extend(glob.glob(os.path.join(utils.video_dir(), suffix))) + video_files = video_files[::-1] video_list = [] - for file in files: + for video_file in video_files: video_list.append({ - "name": os.path.basename(file), - "size": os.path.getsize(file), - "file": file, + "name": os.path.basename(video_file), + "size": os.path.getsize(video_file), + "file": video_file, + "ctime": os.path.getctime(video_file) # 获取文件创建时间 }) - + # 按创建时间降序排序 + video_list.sort(key=lambda x: x["ctime"], reverse=True) video_path = [("None", ""), (tr("Upload Local Files"), "local")] for code in 
[file['file'] for file in video_list]: video_path.append((code, code)) + # 视频文件 selected_video_index = st.selectbox(tr("Video File"), index=0, options=range(len(video_path)), # 使用索引作为内部选项值 - format_func=lambda x: video_path[x][0] # 显示给用户的是标签 + format_func=lambda x: video_path[x][0] # 显示给用户的是标 ) params.video_origin_path = video_path[selected_video_index][1] config.app["video_origin_path"] = params.video_origin_path @@ -343,7 +347,8 @@ with left_panel: st.success(tr("File Uploaded Successfully")) time.sleep(1) st.rerun() - + # 视频名称 + video_name = st.text_input(tr("Video Name")) # 剧情内容 video_plot = st.text_area( tr("Plot Description"), @@ -352,16 +357,26 @@ with left_panel: ) # 生成视频脚本 + st.session_state['script_generation_status'] = "开始生成视频脚本" if st.button(tr("Video Script Generate"), key="auto_generate_script"): - with st.spinner(tr("Video Script Generate")): + with st.spinner("正在生成脚本..."): + # 这里可以用 st.empty() 来动态更新文本 + progress_text = st.empty() + progress_text.text("正在处理...") + if video_json_file == "" and params.video_origin_path != "": + progress_text.text("开始压缩...") # 使用大模型生成视频脚本 - script = llm.gemini_video2json( - video_origin_name=os.path.basename(params.video_origin_path), - video_origin_path=params.video_origin_path, + script = llm.generate_script( + video_path=params.video_origin_path, video_plot=video_plot, + video_name=video_name, language=params.video_language, + progress_text=progress_text ) + if script is None: + st.error("生成脚本失败,请检查日志") + st.stop() st.session_state['video_clip_json'] = script cleaned_string = script.strip("```json").strip("```") st.session_state['video_script_list'] = json.loads(cleaned_string) @@ -434,6 +449,8 @@ with left_panel: if st.session_state.get('video_script_list', None) is not None: video_script_list = st.session_state.video_script_list + print(video_script_list) + print(type(video_script_list)) time_list = [i['timestamp'] for i in video_script_list] subclip_videos = material.clip_videos( 
task_id=st.session_state['task_id'], diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 8a77698..dc1da54 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -9,6 +9,7 @@ "Generate Video Script and Keywords": "点击使用AI根据**主题**生成 【视频文案】 和 【视频关键词】", "Auto Detect": "自动检测", "Auto Generate": "自动生成", + "Video Name": "视频名称", "Video Script": "视频脚本(:blue[①可不填,使用AI生成 ②合理使用标点断句,有助于生成字幕])", "Save Script": "保存脚本", "Crop Video": "裁剪视频", From 990994e9cd6fd334a48f0b3223256dd24d0d5687 Mon Sep 17 00:00:00 2001 From: linyq Date: Wed, 25 Sep 2024 18:32:38 +0800 Subject: [PATCH 09/21] =?UTF-8?q?=E4=BC=98=E5=8C=96=20webui=20task=20?= =?UTF-8?q?=E9=80=BB=E8=BE=9130%=EF=BC=9B=E6=96=B0=E5=A2=9E=E6=A3=80?= =?UTF-8?q?=E6=9F=A5/=E4=BF=AE=E5=A4=8D=E8=84=9A=E6=9C=AC=E6=96=B9?= =?UTF-8?q?=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/models/schema.py | 65 ++++++------- app/services/llm.py | 3 +- app/services/task.py | 13 ++- app/utils/check_script.py | 198 ++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 + webui.py | 113 ++++++---------------- 6 files changed, 269 insertions(+), 124 deletions(-) create mode 100644 app/utils/check_script.py diff --git a/app/models/schema.py b/app/models/schema.py index b90a4c1..f20657a 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -3,7 +3,7 @@ from enum import Enum from typing import Any, List, Optional import pydantic -from pydantic import BaseModel +from pydantic import BaseModel, Field # 忽略 Pydantic 的特定警告 warnings.filterwarnings( @@ -330,42 +330,39 @@ class BgmUploadResponse(BaseResponse): class VideoClipParams(BaseModel): - video_subject: Optional[str] = "春天的花海让人心旷神怡" + """ + NarratoAI 数据模型 + """ + video_clip_json: Optional[list] = Field(default=[], description="LLM 生成的视频剪辑脚本内容") + video_clip_json_path: Optional[str] = Field(default="", description="LLM 生成的视频剪辑脚本路径") + video_origin_path: Optional[str] = Field(default="", description="原视频路径") + 
video_aspect: Optional[VideoAspect] = Field(default=VideoAspect.portrait.value, description="视频比例") + video_language: Optional[str] = Field(default="zh-CN", description="视频语言") - video_clip_json: Optional[str] = "" # 视频剪辑脚本 - video_origin_path: Optional[str] = "" # 原视频路径 - video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value # 视频比例 - video_clip_duration: Optional[int] = 5 # 视频片段时长 - video_count: Optional[int] = 1 # 视频片段数量 - video_source: Optional[str] = "local" - video_language: Optional[str] = "" # 自动检测 + # video_clip_duration: Optional[int] = 5 # 视频片段时长 + # video_count: Optional[int] = 1 # 视频片段数量 + # video_source: Optional[str] = "local" # video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value - # # 女性 - # "zh-CN-XiaoxiaoNeural", - # "zh-CN-XiaoyiNeural", - # # 男性 - # "zh-CN-YunjianNeural" 男声 - # "zh-CN-YunyangNeural", - # "zh-CN-YunxiNeural", - voice_name: Optional[str] = "zh-CN-YunjianNeural" # 语音名称 指定选择: - voice_volume: Optional[float] = 1.0 # 语音音量 - voice_rate: Optional[float] = 1.0 # 语速 + voice_name: Optional[str] = Field(default="zh-CN-YunjianNeural", description="语音名称") + voice_volume: Optional[float] = Field(default=1.0, description="语音音量") + voice_rate: Optional[float] = Field(default=1.0, description="语速") - bgm_name: Optional[str] = "random" # 背景音乐名称 - bgm_type: Optional[str] = "random" # 背景音乐类型 - bgm_file: Optional[str] = "" # 背景音乐文件 - bgm_volume: Optional[float] = 0.2 + bgm_name: Optional[str] = Field(default="random", description="背景音乐名称") + bgm_type: Optional[str] = Field(default="random", description="背景音乐类型") + bgm_file: Optional[str] = Field(default="", description="背景音乐文件") + bgm_volume: Optional[float] = Field(default=0.2, description="背景音乐音量") - subtitle_enabled: Optional[bool] = True # 是否启用字幕 - subtitle_position: Optional[str] = "bottom" # top, bottom, center - font_name: Optional[str] = "STHeitiMedium.ttc" # 字体名称 - text_fore_color: Optional[str] = "#FFFFFF" # 文字前景色 - text_background_color: Optional[str] = 
"transparent" # 文字背景色 + subtitle_enabled: Optional[bool] = Field(default=True, description="是否启用字幕") + subtitle_position: Optional[str] = Field(default="bottom", description="字幕位置") # top, bottom, center + font_name: Optional[str] = Field(default="STHeitiMedium.ttc", description="字体名称") + text_fore_color: Optional[str] = Field(default="#FFFFFF", description="文字前景色") + text_background_color: Optional[str] = Field(default="transparent", description="文字背景色") - font_size: int = 60 # 文字大小 - stroke_color: Optional[str] = "#000000" # 文字描边颜色 - stroke_width: float = 1.5 # 文字描边宽度 - custom_position: float = 70.0 # 自定义位置 - n_threads: Optional[int] = 2 # 线程数 - paragraph_number: Optional[int] = 1 # 段落数量 + font_size: int = Field(default=60, description="文字大小") + stroke_color: Optional[str] = Field(default="#000000", description="文字描边颜色") + stroke_width: float = Field(default=1.5, description="文字描边宽度") + custom_position: float = Field(default=70.0, description="自定义位置") + + # n_threads: Optional[int] = 2 # 线程数 + # paragraph_number: Optional[int] = 1 # 段落数量 diff --git a/app/services/llm.py b/app/services/llm.py index bafb925..6aa818e 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -352,7 +352,8 @@ def compress_video(input_path: str, output_path: str): input_path: 输入视频文件路径 output_path: 输出压缩后的视频文件路径 """ - ffmpeg_path = "E:\\projects\\NarratoAI_v0.1.2\\lib\\ffmpeg\\ffmpeg-7.0-essentials_build\\ffmpeg.exe" # 指定 ffmpeg 的完整路径 + # 指定 ffmpeg 的完整路径 + ffmpeg_path = os.getenv("FFMPEG_PATH") or config.app.get("ffmpeg_path") # 如果压缩后的视频文件已经存在,则直接使用 if os.path.exists(output_path): diff --git a/app/services/task.py b/app/services/task.py index c768e6a..0c544c2 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -326,17 +326,20 @@ def start(task_id, params: VideoParams, stop_at: str = "video"): def start_subclip(task_id, params: VideoClipParams, subclip_path_videos): """ 后台任务(自动剪辑视频进行剪辑) + + task_id: 任务ID + params: 剪辑参数 + subclip_path_videos: 视频文件路径 + """ 
logger.info(f"\n\n## 开始任务: {task_id}") sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5) + # tts 角色名称 voice_name = voice.parse_voice_name(params.voice_name) - paragraph_number = params.paragraph_number - n_threads = params.n_threads - max_clip_duration = params.video_clip_duration logger.info("\n\n## 1. 读取视频json脚本") - video_script_path = path.join(params.video_clip_json) + video_script_path = path.join(params.video_clip_json_path) # 判断json文件是否存在 if path.exists(video_script_path): try: @@ -430,7 +433,7 @@ def start_subclip(task_id, params: VideoClipParams, subclip_path_videos): video_ost_list=video_ost, list_script=list_script, video_aspect=params.video_aspect, - threads=n_threads + threads=1 # 暂时只支持单线程 ) _progress += 50 / params.video_count / 2 diff --git a/app/utils/check_script.py b/app/utils/check_script.py new file mode 100644 index 0000000..e10bd3f --- /dev/null +++ b/app/utils/check_script.py @@ -0,0 +1,198 @@ +import json +from loguru import logger +import os +from datetime import datetime, timedelta +import re + + +def time_to_seconds(time_str): + time_obj = datetime.strptime(time_str, "%M:%S") + return timedelta(minutes=time_obj.minute, seconds=time_obj.second).total_seconds() + + +def seconds_to_time_str(seconds): + minutes, seconds = divmod(int(seconds), 60) + return f"{minutes:02d}:{seconds:02d}" + + +def check_script(file_path, total_duration): + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + + errors = [] + ost_narrations = set() + last_end_time = 0 + + logger.info(f"开始检查文件: {file_path}") + logger.info(f"视频总时长: {total_duration:.2f} 秒") + logger.info("=" * 50) + + for i, item in enumerate(data, 1): + logger.info(f"\n检查第 {i} 项:") + + # 检查所有必需字段是否存在 + required_fields = ['picture', 'timestamp', 'narration', 'OST', 'new_timestamp'] + for field in required_fields: + if field not in item: + errors.append(f"第 {i} 项缺少 {field} 字段") + logger.info(f" - 错误: 缺少 {field} 字段") + else: + logger.info(f" - {field}: 
{item[field]}") + + # 检查 OST 为 false 的情况 + if item.get('OST') == False: + if not item.get('narration'): + errors.append(f"第 {i} 项 OST 为 false,但 narration 为空") + logger.info(" - 错误: OST 为 false,但 narration 为空") + elif len(item['narration']) > 30: + errors.append(f"第 {i} 项 OST 为 false,但 narration 超过 30 字") + logger.info(f" - 错误: OST 为 false,但 narration 超过 30 字 (当前: {len(item['narration'])} 字)") + else: + logger.info(" - OST 为 false,narration 检查通过") + + # 检查 OST 为 true 的情况 + if item.get('OST') == True: + if not item.get('narration').startswith('原声播放_'): + errors.append(f"第 {i} 项 OST 为 true,但 narration 不是 '原声播放_xxx' 格式") + logger.info(" - 错误: OST 为 true,但 narration 不是 '原声播放_xxx' 格式") + elif item['narration'] in ost_narrations: + errors.append(f"第 {i} 项 OST 为 true,但 narration '{item['narration']}' 不是唯一值") + logger.info(f" - 错误: OST 为 true,但 narration '{item['narration']}' 不是唯一值") + else: + logger.info(" - OST 为 true,narration 检查通过") + ost_narrations.add(item['narration']) + + # 检查 timestamp 是否重叠 + if 'timestamp' in item: + start, end = map(time_to_seconds, item['timestamp'].split('-')) + if start < last_end_time: + errors.append(f"第 {i} 项 timestamp '{item['timestamp']}' 与前一项重叠") + logger.info(f" - 错误: timestamp '{item['timestamp']}' 与前一项重叠") + else: + logger.info(f" - timestamp '{item['timestamp']}' 检查通过") + last_end_time = end + + # 检查 timestamp 是否超过总时长 + if end > total_duration: + errors.append(f"第 {i} 项 timestamp '{item['timestamp']}' 超过总时长 {total_duration:.2f} 秒") + logger.info(f" - 错误: timestamp '{item['timestamp']}' 超过总时长 {total_duration:.2f} 秒") + else: + logger.info(f" - timestamp 在总时长范围内") + + # 检查 new_timestamp 是否连续 + logger.info("\n检查 new_timestamp 连续性:") + last_end_time = 0 + for i, item in enumerate(data, 1): + if 'new_timestamp' in item: + start, end = map(time_to_seconds, item['new_timestamp'].split('-')) + if start != last_end_time: + errors.append(f"第 {i} 项 new_timestamp '{item['new_timestamp']}' 与前一项不连续") + logger.info(f" - 错误: 第 {i} 项 new_timestamp 
'{item['new_timestamp']}' 与前一项不连续") + else: + logger.info(f" - 第 {i} 项 new_timestamp '{item['new_timestamp']}' 连续性检查通过") + last_end_time = end + + if errors: + logger.info("检查结果:不通过") + logger.info("发现以下错误:") + for error in errors: + logger.info(f"- {error}") + fix_script(file_path, data, errors) + else: + logger.info("检查结果:通过") + logger.info("所有项目均符合规则要求。") + + +def fix_script(file_path, data, errors): + logger.info("\n开始修复脚本...") + fixed_data = [] + for i, item in enumerate(data, 1): + if item['OST'] == False and (not item['narration'] or len(item['narration']) > 30): + if not item['narration']: + logger.info(f"第 {i} 项 narration 为空,需要人工参与修复。") + fixed_data.append(item) + else: + logger.info(f"修复第 {i} 项 narration 超过 30 字的问题...") + fixed_items = split_narration(item) + fixed_data.extend(fixed_items) + else: + fixed_data.append(item) + + for error in errors: + if not error.startswith("第") or "OST 为 false" not in error: + logger.info(f"需要人工参与修复: {error}") + + # 生成新的文件名 + file_name, file_ext = os.path.splitext(file_path) + new_file_path = f"{file_name}_revise{file_ext}" + + # 保存修复后的数据到新文件 + with open(new_file_path, 'w', encoding='utf-8') as f: + json.dump(fixed_data, f, ensure_ascii=False, indent=4) + + logger.info(f"\n脚本修复完成,已保存到新文件: {new_file_path}") + + +def split_narration(item): + narration = item['narration'] + chunks = smart_split(narration) + + start_time, end_time = map(time_to_seconds, item['timestamp'].split('-')) + new_start_time, new_end_time = map(time_to_seconds, item['new_timestamp'].split('-')) + + total_duration = end_time - start_time + new_total_duration = new_end_time - new_start_time + chunk_duration = total_duration / len(chunks) + new_chunk_duration = new_total_duration / len(chunks) + + fixed_items = [] + for i, chunk in enumerate(chunks): + new_item = item.copy() + new_item['narration'] = chunk + + chunk_start = start_time + i * chunk_duration + chunk_end = chunk_start + chunk_duration + new_item['timestamp'] = 
f"{seconds_to_time_str(chunk_start)}-{seconds_to_time_str(chunk_end)}" + + new_chunk_start = new_start_time + i * new_chunk_duration + new_chunk_end = new_chunk_start + new_chunk_duration + new_item['new_timestamp'] = f"{seconds_to_time_str(new_chunk_start)}-{seconds_to_time_str(new_chunk_end)}" + + fixed_items.append(new_item) + + return fixed_items + + +def smart_split(text, target_length=30): + # 使用正则表达式分割文本,保留标点符号 + segments = re.findall(r'[^,。!?,!?]+[,。!?,!?]?', text) + result = [] + current_chunk = "" + + for segment in segments: + if len(current_chunk) + len(segment) <= target_length: + current_chunk += segment + else: + if current_chunk: + result.append(current_chunk.strip()) + current_chunk = segment + + if current_chunk: + result.append(current_chunk.strip()) + + # 如果有任何chunk超过了目标长度,进行进一步的分割 + final_result = [] + for chunk in result: + if len(chunk) > target_length: + sub_chunks = [chunk[i:i + target_length] for i in range(0, len(chunk), target_length)] + final_result.extend(sub_chunks) + else: + final_result.append(chunk) + + return final_result + + +if __name__ == "__main__": + file_path = "/Users/apple/Desktop/home/NarratoAI/resource/scripts/2024-0923-085036.json" + total_duration = 280 + check_script(file_path, total_duration) diff --git a/requirements.txt b/requirements.txt index cfe7295..a562dcb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,3 +24,4 @@ opencv-python~=4.9.0.80 # https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/9-more-realistic-ai-voices-for-conversations-now-generally/ba-p/4099471 azure-cognitiveservices-speech~=1.37.0 git-changelog~=2.5.2 +watchdog==5.0.2 diff --git a/webui.py b/webui.py index 02fceb1..49d52f6 100644 --- a/webui.py +++ b/webui.py @@ -1,29 +1,5 @@ -import sys -import os -import glob -import json -import time -import datetime -import traceback import streamlit as st -from uuid import uuid4 -import platform -import streamlit.components.v1 as components -from loguru import logger - from 
app.config import config -from app.models.const import FILE_TYPE_VIDEOS -from app.models.schema import VideoClipParams, VideoAspect, VideoConcatMode -from app.services import task as tm, llm, voice, material -from app.utils import utils - -# # 将项目的根目录添加到系统路径中,以允许从项目导入模块 -root_dir = os.path.dirname(os.path.realpath(__file__)) -if root_dir not in sys.path: - sys.path.append(root_dir) - print("******** sys.path ********") - print(sys.path) - print("*" * 20) st.set_page_config( page_title="NarratoAI", @@ -37,6 +13,31 @@ st.set_page_config( }, ) +import sys +import os +import glob +import json +import time +import datetime +import traceback +from uuid import uuid4 +import platform +import streamlit.components.v1 as components +from loguru import logger + +from app.models.const import FILE_TYPE_VIDEOS +from app.models.schema import VideoClipParams, VideoAspect, VideoConcatMode +from app.services import task as tm, llm, voice, material +from app.utils import utils + +# # 将项目的根目录添加到系统路径中,以允许从项目导入模块 +root_dir = os.path.dirname(os.path.realpath(__file__)) +if root_dir not in sys.path: + sys.path.append(root_dir) + print("******** sys.path ********") + print(sys.path) + print("*" * 20) + proxy_url_http = config.proxy.get("http", "") or os.getenv("VPN_PROXY_URL", "") proxy_url_https = config.proxy.get("https", "") or os.getenv("VPN_PROXY_URL", "") os.environ["HTTP_PROXY"] = proxy_url_http @@ -59,8 +60,6 @@ i18n_dir = os.path.join(root_dir, "webui", "i18n") config_file = os.path.join(root_dir, "webui", ".streamlit", "webui.toml") system_locale = utils.get_system_locale() -if 'video_subject' not in st.session_state: - st.session_state['video_subject'] = '' if 'video_clip_json' not in st.session_state: st.session_state['video_clip_json'] = '' if 'video_plot' not in st.session_state: @@ -189,16 +188,7 @@ with st.expander(tr("Basic Settings"), expanded=False): if HTTPS_PROXY: config.proxy["https"] = HTTPS_PROXY - with middle_config_panel: - # openai - # moonshot (月之暗面) - # oneapi - 
# g4f - # azure - # qwen (通义千问) - # gemini - # ollama llm_providers = ['Gemini'] saved_llm_provider = config.app.get("llm_provider", "OpenAI").lower() saved_llm_provider_index = 0 @@ -470,6 +460,7 @@ with left_panel: else: st.error(tr("请先生成视频脚本")) + # 裁剪视频 with button_columns[1]: if st.button(tr("Crop Video"), key="auto_crop_video", use_container_width=True): @@ -479,50 +470,6 @@ with left_panel: with middle_panel: with st.container(border=True): st.write(tr("Video Settings")) - # video_concat_modes = [ - # (tr("Sequential"), "sequential"), - # (tr("Random"), "random"), - # ] - # video_sources = [ - # (tr("Pexels"), "pexels"), - # (tr("Pixabay"), "pixabay"), - # (tr("Local file"), "local"), - # (tr("TikTok"), "douyin"), - # (tr("Bilibili"), "bilibili"), - # (tr("Xiaohongshu"), "xiaohongshu"), - # ] - # - # saved_video_source_name = config.app.get("video_source", "pexels") - # saved_video_source_index = [v[1] for v in video_sources].index( - # saved_video_source_name - # ) - # - # selected_index = st.selectbox( - # tr("Video Source"), - # options=range(len(video_sources)), - # format_func=lambda x: video_sources[x][0], - # index=saved_video_source_index, - # ) - # params.video_source = video_sources[selected_index][1] - # config.app["video_source"] = params.video_source - # - # if params.video_source == "local": - # _supported_types = FILE_TYPE_VIDEOS + FILE_TYPE_IMAGES - # uploaded_files = st.file_uploader( - # "Upload Local Files", - # type=["mp4", "mov", "avi", "flv", "mkv", "jpg", "jpeg", "png"], - # accept_multiple_files=True, - # ) - - # selected_index = st.selectbox( - # tr("Video Concat Mode"), - # index=1, - # options=range(len(video_concat_modes)), # 使用索引作为内部选项值 - # format_func=lambda x: video_concat_modes[x][0], # 显示给用户的是标签 - # ) - # params.video_concat_mode = VideoConcatMode( - # video_concat_modes[selected_index][1] - # ) # 视频比例 video_aspect_ratios = [ @@ -582,8 +529,9 @@ with middle_panel: params.voice_name = voice_name config.ui["voice_name"] = 
voice_name + # 试听语言合成 if st.button(tr("Play Voice")): - play_content = params.video_subject + play_content = "这是一段试听语言" if not play_content: play_content = params.video_script if not play_content: @@ -779,6 +727,7 @@ with st.expander(tr("Video Check"), expanded=False): caijian() st.rerun() +# 开始按钮 start_button = st.button(tr("Generate Video"), use_container_width=True, type="primary") if start_button: config.save_config() @@ -800,10 +749,6 @@ if start_button: st.error(tr("视频文件不能为空")) scroll_to_bottom() st.stop() - if llm_provider != 'g4f' and not config.app.get(f"{llm_provider}_api_key", ""): - st.error(tr("请输入 LLM API 密钥")) - scroll_to_bottom() - st.stop() log_container = st.empty() log_records = [] From 18d4fff0280f51a6402061ff68638356df1a2dd8 Mon Sep 17 00:00:00 2001 From: linyq Date: Thu, 26 Sep 2024 15:56:50 +0800 Subject: [PATCH 10/21] =?UTF-8?q?=E4=BC=98=E5=8C=96=E5=A4=A7=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=E7=94=9F=E6=88=90=E8=84=9A=E6=9C=AC=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/services/llm.py | 318 ++++++++++++++++++++++++++------------------ app/utils/utils.py | 10 ++ config.example.toml | 23 +--- 3 files changed, 200 insertions(+), 151 deletions(-) diff --git a/app/services/llm.py b/app/services/llm.py index 6aa818e..e9b6048 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -10,11 +10,12 @@ from openai import AzureOpenAI from openai.types.chat import ChatCompletion import google.generativeai as gemini from googleapiclient.errors import ResumableUploadError -from google.api_core.exceptions import FailedPrecondition -from google.generativeai.types import HarmCategory, HarmBlockThreshold +from google.api_core.exceptions import * +from google.generativeai.types import * import subprocess from app.config import config +from app.utils.utils import clean_model_output _max_retries = 5 @@ -105,9 +106,39 @@ Method = """ """ -def _generate_response(prompt: str) -> str: +def 
handle_exception(err): + if isinstance(err, PermissionDenied): + logger.error("403 用户没有权限访问该资源") + elif isinstance(err, ResourceExhausted): + logger.error("429 您的配额已用尽。请稍后重试。请考虑设置自动重试来处理这些错误") + elif isinstance(err, InvalidArgument): + logger.error("400 参数无效。例如,文件过大,超出了载荷大小限制。另一个事件提供了无效的 API 密钥。") + elif isinstance(err, AlreadyExists): + logger.error("409 已存在具有相同 ID 的已调参模型。对新模型进行调参时,请指定唯一的模型 ID。") + elif isinstance(err, RetryError): + logger.error("使用不支持 gRPC 的代理时可能会引起此错误。请尝试将 REST 传输与 genai.configure(..., transport=rest) 搭配使用。") + elif isinstance(err, BlockedPromptException): + logger.error("400 出于安全原因,该提示已被屏蔽。") + elif isinstance(err, BrokenResponseError): + logger.error("500 流式传输响应已损坏。在访问需要完整响应的内容(例如聊天记录)时引发。查看堆栈轨迹中提供的错误详情。") + elif isinstance(err, IncompleteIterationError): + logger.error("500 访问需要完整 API 响应但流式响应尚未完全迭代的内容时引发。对响应对象调用 resolve() 以使用迭代器。") + elif isinstance(err, ConnectionError): + logger.error("网络连接错误,请检查您的网络连接。") + else: + logger.error(f"视频转录失败, 下面是具体报错信息: \n{traceback.format_exc()} \n问题排查指南: https://ai.google.dev/gemini-api/docs/troubleshooting?hl=zh-cn") + return "" + + +def _generate_response(prompt: str, llm_provider: str = None) -> str: + """ + 调用大模型通用方法 + prompt: + llm_provider: + """ content = "" - llm_provider = config.app.get("llm_provider", "openai") + if not llm_provider: + llm_provider = config.app.get("llm_provider", "openai") logger.info(f"llm provider: {llm_provider}") if llm_provider == "g4f": model_name = config.app.get("g4f_model_name", "") @@ -223,46 +254,23 @@ def _generate_response(prompt: str) -> str: genai.configure(api_key=api_key, transport="rest") - generation_config = { - "temperature": 0.5, - "top_p": 1, - "top_k": 1, - "max_output_tokens": 2048, + safety_settings = { + HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, + 
HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, } - safety_settings = [ - { - "category": "HARM_CATEGORY_HARASSMENT", - "threshold": "BLOCK_ONLY_HIGH", - }, - { - "category": "HARM_CATEGORY_HATE_SPEECH", - "threshold": "BLOCK_ONLY_HIGH", - }, - { - "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", - "threshold": "BLOCK_ONLY_HIGH", - }, - { - "category": "HARM_CATEGORY_DANGEROUS_CONTENT", - "threshold": "BLOCK_ONLY_HIGH", - }, - ] - model = genai.GenerativeModel( model_name=model_name, - generation_config=generation_config, safety_settings=safety_settings, ) try: response = model.generate_content(prompt) - candidates = response.candidates - generated_text = candidates[0].content.parts[0].text - except (AttributeError, IndexError) as e: - print("Gemini Error:", e) - - return generated_text + return response.text + except Exception as err: + return handle_exception(err) if llm_provider == "cloudflare": import requests @@ -345,6 +353,43 @@ def _generate_response(prompt: str) -> str: return content.replace("\n", "") +def _generate_response_video(prompt: str, llm_provider: str, video_file: str | File) -> str: + """ + 多模态能力大模型 + """ + if llm_provider == "gemini": + api_key = config.app.get("gemini_api_key") + model_name = config.app.get("gemini_model_name") + base_url = "***" + else: + raise ValueError( + "llm_provider 未设置,请在 config.toml 文件中进行设置。" + ) + + if llm_provider == "gemini": + import google.generativeai as genai + + genai.configure(api_key=api_key, transport="rest") + + safety_settings = { + HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, + } + + model = genai.GenerativeModel( + model_name=model_name, + safety_settings=safety_settings, + ) + + try: + response = 
model.generate_content([prompt, video_file]) + return response.text + except Exception as err: + return handle_exception(err) + + def compress_video(input_path: str, output_path: str): """ 压缩视频文件 @@ -353,7 +398,7 @@ def compress_video(input_path: str, output_path: str): output_path: 输出压缩后的视频文件路径 """ # 指定 ffmpeg 的完整路径 - ffmpeg_path = os.getenv("FFMPEG_PATH") or config.app.get("ffmpeg_path") + ffmpeg_path = os.getenv("FFMPEG_PATH") or config.app.get("ffmpeg_path") or "ffmpeg" # 如果压缩后的视频文件已经存在,则直接使用 if os.path.exists(output_path): @@ -370,6 +415,7 @@ def compress_video(input_path: str, output_path: str): "-b:a", "128k", output_path ] + logger.info(f"执行命令: {' '.join(command)}") subprocess.run(command, check=True) except subprocess.CalledProcessError as e: logger.error(f"视频压缩失败: {e}") @@ -396,7 +442,13 @@ def generate_script( compress_video(video_path, compressed_video_path) # 2. 转录视频 - transcription = gemini_video_transcription(video_name=video_name, video_path=compressed_video_path, language=language, progress_text=progress_text) + transcription = gemini_video_transcription( + video_name=video_name, + video_path=compressed_video_path, + language=language, + progress_text=progress_text, + llm_provider="gemini" + ) # # 清理压缩后的视频文件 # try: @@ -406,13 +458,16 @@ def generate_script( # 3. 编写解说文案 progress_text.text("解说文案中...") - script = writing_short_play(video_plot, video_name) + script = writing_short_play(video_plot, video_name, "openai") # 4. 
文案匹配画面 - progress_text.text("画面匹配中...") - matched_script = screen_matching(huamian=transcription, wenan=script) + if transcription != "": + progress_text.text("画面匹配中...") + matched_script = screen_matching(huamian=transcription, wenan=script, llm_provider="openai") - return matched_script + return matched_script + else: + return "" def generate_terms(video_subject: str, video_script: str, amount: int = 5) -> List[str]: @@ -565,57 +620,52 @@ def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot return response -def gemini_video_transcription(video_name: str, video_path: str, language: str, progress_text: st.empty = ""): +def gemini_video_transcription(video_name: str, video_path: str, language: str, llm_provider: str, progress_text: st.empty = ""): ''' 使用 gemini-1.5-xxx 进行视频画面转录 ''' api_key = config.app.get("gemini_api_key") - model_name = config.app.get("gemini_model_name") - gemini.configure(api_key=api_key) - model = gemini.GenerativeModel(model_name=model_name) prompt = """ - Please transcribe the audio, include timestamps, and provide visual descriptions, then output in JSON format. - Please use %s output - Use this JSON schema: - - Graphics = {"timestamp": "MM:SS-MM:SS", "picture": "str", "quotes": "str"(If no one says anything, use an empty string instead.)} - Return: list[Graphics] - """ % language + 请转录音频,包括时间戳,并提供视觉描述,然后以 JSON 格式输出,当前视频中使用的语言为 %s。 + + 在转录视频时,请通过确保以下条件来完成转录: + 1. 画面描述使用语言: %s 进行输出。 + 2. 同一个画面合并为一个转录记录。 + 3. 使用以下 JSON schema: + Graphics = {"timestamp": "MM:SS-MM:SS"(时间戳格式), "picture": "str"(画面描述), "speech": "str"(台词,如果没有人说话,则使用空字符串。)} + Return: list[Graphics] + 4. 
请以严格的 JSON 格式返回数据,不要包含任何注释、标记或其他字符。数据应符合 JSON 语法,可以被 json.loads() 函数直接解析, 不要添加 ```json 或其他标记。 + """ % (language, language) logger.debug(f"视频名称: {video_name}") try: progress_text.text("上传视频中...") gemini_video_file = gemini.upload_file(video_path) - logger.debug(f"上传视频至 Google cloud 成功: {gemini_video_file.name}") + logger.debug(f"视频 {gemini_video_file.name} 上传至 Google cloud 成功, 开始解析...") while gemini_video_file.state.name == "PROCESSING": - import time - time.sleep(1) gemini_video_file = gemini.get_file(gemini_video_file.name) progress_text.text(f"解析视频中, 当前状态: {gemini_video_file.state.name}") - # logger.debug(f"视频当前状态(ACTIVE才可用): {gemini_video_file.state.name}") if gemini_video_file.state.name == "FAILED": raise ValueError(gemini_video_file.state.name) + elif gemini_video_file.state.name == "ACTIVE": + progress_text.text("解析完成") + logger.debug("解析完成, 开始转录...") except ResumableUploadError as err: logger.error(f"上传视频至 Google cloud 失败, 用户的位置信息不支持用于该API; \n{traceback.format_exc()}") - return "" + return False except FailedPrecondition as err: logger.error(f"400 用户位置不支持 Google API 使用。\n{traceback.format_exc()}") - return "" + return False progress_text.text("视频转录中...") - response = model.generate_content( - [prompt, gemini_video_file], - safety_settings={ - HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, - } - ) - logger.success("视频转录成功") - return response.text + try: + response = _generate_response_video(prompt=prompt, llm_provider=llm_provider, video_file=gemini_video_file) + logger.success("视频转录成功") + return response + except Exception as err: + return handle_exception(err) def writing_movie(video_plot, video_name): @@ -640,33 +690,34 @@ def writing_movie(video_plot, video_name): 3. 仅输出解说文案,不输出任何其他内容。 4. 
不要包含小标题,每个段落以 \n 进行分隔。 """ - response = model.generate_content( - prompt, - generation_config=gemini.types.GenerationConfig( - candidate_count=1, - temperature=1.3, - ), - safety_settings={ - HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, - } - ) - logger.debug(response.text) - logger.debug("字数:", len(response.text)) - return response.text + try: + response = model.generate_content( + prompt, + generation_config=gemini.types.GenerationConfig( + candidate_count=1, + temperature=1.3, + ), + safety_settings={ + HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, + } + ) + return response.text + except Exception as err: + return handle_exception(err) -def writing_short_play(video_plot: str, video_name: str): +def writing_short_play(video_plot: str, video_name: str, llm_provider: str): """ 影视解说(短剧解说) """ - api_key = config.app.get("gemini_api_key") - model_name = config.app.get("gemini_model_name") - - gemini.configure(api_key=api_key) - model = gemini.GenerativeModel(model_name) + # api_key = config.app.get("gemini_api_key") + # # model_name = config.app.get("gemini_model_name") + # + # gemini.configure(api_key=api_key) + # model = gemini.GenerativeModel(model_name) if not video_plot: raise ValueError("短剧的简介不能为空") @@ -686,33 +737,34 @@ def writing_short_play(video_plot: str, video_name: str): 3. 仅输出解说文案,不输出任何其他内容。 4. 
不要包含小标题,每个段落以 \\n 进行分隔。 """ - response = model.generate_content( - prompt, - generation_config=gemini.types.GenerationConfig( - candidate_count=1, - temperature=1.0, - ), - safety_settings={ - HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, - } - ) - logger.success("解说文案生成成功") - return response.text + try: + # if "gemini" in model_name: + # response = model.generate_content( + # prompt, + # generation_config=gemini.types.GenerationConfig( + # candidate_count=1, + # temperature=1.0, + # ), + # safety_settings={ + # HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, + # HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, + # HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, + # HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, + # } + # ) + # else: + response = _generate_response(prompt, llm_provider) + logger.success("解说文案生成成功") + logger.debug(response) + return response + except Exception as err: + return handle_exception(err) -def screen_matching(huamian: str, wenan: str): +def screen_matching(huamian: str, wenan: str, llm_provider: str): """ 画面匹配 """ - api_key = config.app.get("gemini_api_key") - model_name = config.app.get("gemini_model_name") - - gemini.configure(api_key=api_key) - model = gemini.GenerativeModel(model_name) - if not huamian: raise ValueError("画面不能为空") if not wenan: @@ -731,25 +783,20 @@ def screen_matching(huamian: str, wenan: str): %s - Use this JSON schema: - script = {'picture': str, 'timestamp': str, "narration": str, "OST": bool} - Return: list[script] + 在匹配的过程中,请通过确保以下条件来完成匹配: + - 使用以下 JSON schema: + script = {'picture': str, 'timestamp': str(时间戳), "narration": str, "OST": bool(是否开启原声)} + Return: 
list[script] + - 请以严格的 JSON 格式返回数据,不要包含任何注释、标记或其他字符。数据应符合 JSON 语法,可以被 json.loads() 函数直接解析, 不要添加 ```json 或其他标记。 + - """ % (huamian, wenan) - response = model.generate_content( - prompt, - generation_config=gemini.types.GenerationConfig( - candidate_count=1, - temperature=1.0, - ), - safety_settings={ - HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, - } - ) - logger.success("匹配成功") - return response.text + try: + response = _generate_response(prompt, llm_provider) + logger.success("匹配成功") + logger.debug(response) + return response + except Exception as err: + return handle_exception(err) if __name__ == "__main__": @@ -760,7 +807,7 @@ if __name__ == "__main__": # gemini_video_transcription(video_subject, video_path, language) # 2. 解说文案 - video_path = "E:\\projects\\NarratoAI\\resource\\videos\\2.mp4" + video_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4" video_plot = """ 李自忠拿着儿子李牧名下的存折,去银行取钱给儿子救命,却被要求证明"你儿子是你儿子"。 走投无路时碰到银行被抢劫,劫匪给了他两沓钱救命,李自忠却因此被银行以抢劫罪起诉,并顶格判处20年有期徒刑。 @@ -768,4 +815,9 @@ if __name__ == "__main__": """ res = generate_script(video_path, video_plot, video_name="第二十条之无罪释放") # res = generate_script(video_path, video_plot, video_name="海岸") - print("res \n", res) + print("脚本生成成功:\n", res) + res = clean_model_output(res) + aaa = json.loads(res) + print(json.dumps(aaa, indent=2, ensure_ascii=False)) + # response = _generate_response("你好,介绍一下你自己") + # print(response) diff --git a/app/utils/utils.py b/app/utils/utils.py index dc38b90..d897442 100644 --- a/app/utils/utils.py +++ b/app/utils/utils.py @@ -365,3 +365,13 @@ def add_new_timestamps(scenes): updated_scenes.append(new_scene) return updated_scenes + + +def clean_model_output(output): + """ + 模型输出包含 ```json 标记时的处理 + """ + if "```json" in 
output: + print("##########") + output = output.replace("```json", "").replace("```", "") + return output.strip() diff --git a/config.example.toml b/config.example.toml index 77a5cb0..50b2531 100644 --- a/config.example.toml +++ b/config.example.toml @@ -1,20 +1,5 @@ [app] - project_version="0.1.2" - video_source = "pexels" # "pexels" or "pixabay" - # Pexels API Key - # Register at https://www.pexels.com/api/ to get your API key. - # You can use multiple keys to avoid rate limits. - # For example: pexels_api_keys = ["123adsf4567adf89","abd1321cd13efgfdfhi"] - # 特别注意格式,Key 用英文双引号括起来,多个Key用逗号隔开 - pexels_api_keys = [] - - # Pixabay API Key - # Register at https://pixabay.com/api/docs/ to get your API key. - # You can use multiple keys to avoid rate limits. - # For example: pixabay_api_keys = ["123adsf4567adf89","abd1321cd13efgfdfhi"] - # 特别注意格式,Key 用英文双引号括起来,多个Key用逗号隔开 - pixabay_api_keys = [] - + project_version="0.2.0" # 如果你没有 OPENAI API Key,可以使用 g4f 代替,或者使用国内的 Moonshot API # If you don't have an OPENAI API Key, you can use g4f instead @@ -27,6 +12,8 @@ # qwen (通义千问) # gemini llm_provider="openai" + # 支持多模态视频理解能力的大模型 + llm_provider_video="gemini" ########## Ollama Settings # No need to set it unless you want to use your own proxy @@ -184,8 +171,8 @@ ### Example: "http://user:pass@proxy:1234" ### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies - # http = "http://10.10.1.10:3128" - # https = "http://10.10.1.10:1080" + http = "http://127.0.0.1:7890" + https = "http://127.0.0.1:7890" [azure] # Azure Speech API Key From e75157a7ac41d6603448bf7f8750f84a7603083a Mon Sep 17 00:00:00 2001 From: linyq Date: Thu, 26 Sep 2024 18:32:26 +0800 Subject: [PATCH 11/21] =?UTF-8?q?=E4=BC=98=E5=8C=96=E5=89=AA=E8=BE=91?= =?UTF-8?q?=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/services/llm.py | 141 ++++++++++++++++++++------------------- app/services/material.py | 80 ++++++++++++++++++++++ 
app/services/task.py | 95 ++++++++++++++++---------- app/services/video.py | 14 ++-- app/services/voice.py | 5 +- app/utils/utils.py | 2 +- webui.py | 2 +- 7 files changed, 227 insertions(+), 112 deletions(-) diff --git a/app/services/llm.py b/app/services/llm.py index e9b6048..c5b70dd 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -126,7 +126,7 @@ def handle_exception(err): elif isinstance(err, ConnectionError): logger.error("网络连接错误,请检查您的网络连接。") else: - logger.error(f"视频转录失败, 下面是具体报错信息: \n{traceback.format_exc()} \n问题排查指南: https://ai.google.dev/gemini-api/docs/troubleshooting?hl=zh-cn") + logger.error(f"大模型请求失败, 下面是具体报错信息: \n{traceback.format_exc()}") return "" @@ -353,11 +353,11 @@ def _generate_response(prompt: str, llm_provider: str = None) -> str: return content.replace("\n", "") -def _generate_response_video(prompt: str, llm_provider: str, video_file: str | File) -> str: +def _generate_response_video(prompt: str, llm_provider_video: str, video_file: str | File) -> str: """ 多模态能力大模型 """ - if llm_provider == "gemini": + if llm_provider_video == "gemini": api_key = config.app.get("gemini_api_key") model_name = config.app.get("gemini_model_name") base_url = "***" @@ -366,7 +366,7 @@ def _generate_response_video(prompt: str, llm_provider: str, video_file: str | F "llm_provider 未设置,请在 config.toml 文件中进行设置。" ) - if llm_provider == "gemini": + if llm_provider_video == "gemini": import google.generativeai as genai genai.configure(api_key=api_key, transport="rest") @@ -441,15 +441,44 @@ def generate_script( compressed_video_path = f"{os.path.splitext(video_path)[0]}_compressed.mp4" compress_video(video_path, compressed_video_path) - # 2. 转录视频 - transcription = gemini_video_transcription( - video_name=video_name, - video_path=compressed_video_path, - language=language, - progress_text=progress_text, - llm_provider="gemini" - ) - + # # 2. 
转录视频 + # transcription = gemini_video_transcription( + # video_name=video_name, + # video_path=compressed_video_path, + # language=language, + # progress_text=progress_text, + # llm_provider_video="gemini" + # ) + transcription = """ +[{"timestamp": "00:00-00:06", "picture": "一个穿着蓝色囚服,戴着手铐的人在房间里走路。", "speech": ""}, +{"timestamp": "00:06-00:09", "picture": "一个穿着蓝色囚服,戴着手铐的人,画面上方显示“李自忠 银行抢劫犯”。", "speech": "李自忠 银行抢劫一案 现在宣判"}, +{"timestamp": "00:09-00:12", "picture": "一个穿着黑色西装,打着红色领带的女人,坐在一个牌子上,牌子上写着“书记员”,身后墙上挂着“国徽”。", "speech": "全体起立"}, +{"timestamp": "00:12-00:15", "picture": "一个穿着黑色法官服的男人坐在一个牌子后面,牌子上写着“审判长”,身后墙上挂着“国徽”。法庭上,很多人站着。", "speech": ""}, +{"timestamp": "00:15-00:19", "picture": "一个穿着黑色西装,打着红色领带的女人,坐在一个牌子上,牌子上写着“书记员”,身后墙上挂着“国徽”。法庭上,很多人站着。", "speech": "本庭二审判决如下 被告李自忠 犯抢劫银行罪"}, +{"timestamp": "00:19-00:24", "picture": "一个穿着蓝色囚服,戴着手铐的人,画面上方显示“李自忠 银行抢劫犯”。", "speech": "维持一审判决 判处有期徒刑 二十年"}, +{"timestamp": "00:24-00:27", "picture": "一个穿着黑色法官服的男人坐在一个牌子后面,牌子上写着“审判长”,他敲了一下法槌。", "speech": ""}, +{"timestamp": "00:27-00:32", "picture": "一个穿着蓝色囚服,戴着手铐的人,画面上方显示“李自忠 银行抢劫犯”。", "speech": "我们要让她们牢底坐穿 越父啊越父 你一个平头老百姓 也敢跟外资银行做对 真是不知天高地厚"}, +{"timestamp": "00:32-00:41", "picture": "一个穿着蓝色囚服,戴着手铐的人跪在地上。", "speech": "我要让她们牢底坐穿 越父啊越父 你一个平头老百姓 也敢跟外资银行做对 真是不知天高地厚"}, +{"timestamp": "00:41-00:47", "picture": "两个警察押解着一个穿着蓝色囚服,戴着手铐的人走在路上,一个女记者在路边报道新闻。", "speech": "李先生 这里是孔雀卫视 这里是黄金眼819新闻直播间 这里是浙江卫视新闻直播间 近日李自忠案引发社会热议"}, +{"timestamp": "00:47-01:03", "picture": "一个穿着灰色外套的男人坐在银行柜台前,和银行工作人员说话。画面中还穿插着女记者在路边报道新闻的画面。", "speech": "李自忠案引发社会热议 李自忠在去银行取钱的时候 由于他拿的是儿子的存折 所以银行要求李自忠证明他的儿子就是他的儿子 我说取不了就是取不了啊 这是你儿子的存折啊 你要证明你儿子是你儿子啊"}, +{"timestamp": "01:03-01:10", "picture": "一个穿着灰色外套的男人坐在银行柜台前,和银行工作人员说话。画面中还穿插着女记者在路边报道新闻的画面。", "speech": "李自忠提供了身份证账户户口本后 银行都不认可他的儿子是他的儿子 就在这个时候 银行发生一起抢劫案"}, +{"timestamp": "01:10-01:17", "picture": "三个戴着帽子和口罩的劫匪持枪闯入银行,银行里的人都很害怕,纷纷蹲下躲避。", "speech": "都给我蹲下 老实点 把钱给我交出来"}, +{"timestamp": "01:17-01:28", "picture": "女记者在路边报道新闻,画面中穿插着银行抢劫案的画面。", "speech": "劫匪看到一旁大哭的李自忠 
得知他是因为儿子需要治病才取钱的时候 给了他一打钱 怎么 你儿子在医院等着钱救命啊 银行不给取啊"}, +{"timestamp": "01:28-01:36", "picture": "一个戴着黑色帽子和口罩的劫匪,拿着枪,给一个穿着灰色外套的男人一叠钱。", "speech": "银行不给取啊 好了 给儿子看病去 李自忠在把钱给儿子交完药费后被捕"}, +{"timestamp": "01:36-01:58", "picture": "两个警察押解着一个穿着蓝色囚服,戴着手铐的男人走在路上,一个女记者在路边报道新闻。", "speech": "目前一审二审都维持原判 判处有期徒刑二十年 对此你有什么想说的吗 他怎么证明他儿子是他儿子 要是银行早点把钱给我 我也不会遇到劫匪 我儿子还得救命 不是的 儿子 儿子 儿子"}, +{"timestamp": "01:58-02:03", "picture": "两个警察押解着一个穿着蓝色囚服,戴着手铐的男人走在路上,一个女记者在路边报道新闻。男人情绪激动,大声喊叫。", "speech": "儿子 儿子 儿子"}, +{"timestamp": "02:03-02:12", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边。画面中穿插着新闻报道的画面。", "speech": "近日李自忠案引发社会热议 李自忠在去银行取钱的时候 银行要求李自忠证明他的儿子就是他的儿子"}, +{"timestamp": "02:12-02:25", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生站在门口。", "speech": "爸 这家人也真够可怜的 当爹的坐牢 这儿子 恐怕要成植物人了"}, +{"timestamp": "02:25-02:31", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生站在门口。", "speech": "医生啊 我弟弟的情况怎么样 我先看看"}, +{"timestamp": "02:31-02:40", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生正在给男人做检查。", "speech": ""}, +{"timestamp": "02:40-02:46", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生正在给男人做检查。", "speech": "不太理想啊 你弟弟想要醒过来 希望渺茫"}, +{"timestamp": "02:46-02:57", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生正在给男人做检查。", "speech": "这 麟木 麟木你别吓姐啊麟木 麟木"}, +{"timestamp": "02:57-03:02", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生正在给男人做检查。画面中穿插着新闻报道的画面。", "speech": "麟木 儿子 麟木你别吓姐啊麟木"}, +{"timestamp": "03:02-03:08", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生正在给男人做检查。画面中穿插着新闻报道的画面。女人情绪激动,大声哭泣。", "speech": "儿子 麟木你别吓姐啊麟木 儿子"}, +{"timestamp": "03:08-03:14", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生正在给男人做检查。画面中穿插着新闻报道的画面。女人情绪激动,大声哭泣。", "speech": "儿子"}, +{"timestamp": "03:14-03:18", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,画面变成紫色光效。", "speech": ""}, +{"timestamp": "03:18-03:20", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,他突然睁开了眼睛。", "speech": ""}] + """ # # 清理压缩后的视频文件 # try: # 
os.remove(compressed_video_path) @@ -458,7 +487,7 @@ def generate_script( # 3. 编写解说文案 progress_text.text("解说文案中...") - script = writing_short_play(video_plot, video_name, "openai") + script = writing_short_play(video_plot, video_name, "openai", count=300) # 4. 文案匹配画面 if transcription != "": @@ -620,7 +649,7 @@ def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot return response -def gemini_video_transcription(video_name: str, video_path: str, language: str, llm_provider: str, progress_text: st.empty = ""): +def gemini_video_transcription(video_name: str, video_path: str, language: str, llm_provider_video: str, progress_text: st.empty = ""): ''' 使用 gemini-1.5-xxx 进行视频画面转录 ''' @@ -661,28 +690,25 @@ def gemini_video_transcription(video_name: str, video_path: str, language: str, progress_text.text("视频转录中...") try: - response = _generate_response_video(prompt=prompt, llm_provider=llm_provider, video_file=gemini_video_file) + response = _generate_response_video(prompt=prompt, llm_provider_video=llm_provider_video, video_file=gemini_video_file) logger.success("视频转录成功") + logger.debug(response) + print(type(response)) return response except Exception as err: return handle_exception(err) -def writing_movie(video_plot, video_name): +def writing_movie(video_plot, video_name, llm_provider): """ 影视解说(电影解说) """ - api_key = config.app.get("gemini_api_key") - model_name = config.app.get("gemini_model_name") - - gemini.configure(api_key=api_key) - model = gemini.GenerativeModel(model_name) - prompt = f""" **角色设定:** 你是一名有10年经验的影视解说文案的创作者, 下面是关于如何写解说文案的方法 {Method},请认真阅读它,之后我会给你一部影视作品的名称,然后让你写一篇文案 - 请根据方法撰写 《{video_name}》的影视解说文案,文案要符合以下要求: + 请根据方法撰写 《{video_name}》的影视解说文案,《{video_name}》的大致剧情如下: {video_plot} + 文案要符合以下要求: **任务目标:** 1. 文案字数在 1500字左右,严格要求字数,最低不得少于 1000字。 @@ -691,34 +717,17 @@ def writing_movie(video_plot, video_name): 4. 
不要包含小标题,每个段落以 \n 进行分隔。 """ try: - response = model.generate_content( - prompt, - generation_config=gemini.types.GenerationConfig( - candidate_count=1, - temperature=1.3, - ), - safety_settings={ - HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, - } - ) - return response.text + response = _generate_response(prompt, llm_provider) + logger.success("解说文案生成成功") + return response except Exception as err: return handle_exception(err) -def writing_short_play(video_plot: str, video_name: str, llm_provider: str): +def writing_short_play(video_plot: str, video_name: str, llm_provider: str, count: int = 500): """ 影视解说(短剧解说) """ - # api_key = config.app.get("gemini_api_key") - # # model_name = config.app.get("gemini_model_name") - # - # gemini.configure(api_key=api_key) - # model = gemini.GenerativeModel(model_name) - if not video_plot: raise ValueError("短剧的简介不能为空") if not video_name: @@ -732,27 +741,12 @@ def writing_short_play(video_plot: str, video_name: str, llm_provider: str): 文案要符合以下要求: **任务目标:** - 1. 文案字数在 800字左右,严格要求字数,最低不得少于 600字。 + 1. 请严格要求文案字数, 字数控制在 {count} 字左右。 2. 避免使用 markdown 格式输出文案。 3. 仅输出解说文案,不输出任何其他内容。 4. 
不要包含小标题,每个段落以 \\n 进行分隔。 """ try: - # if "gemini" in model_name: - # response = model.generate_content( - # prompt, - # generation_config=gemini.types.GenerationConfig( - # candidate_count=1, - # temperature=1.0, - # ), - # safety_settings={ - # HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, - # HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, - # HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, - # HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, - # } - # ) - # else: response = _generate_response(prompt, llm_provider) logger.success("解说文案生成成功") logger.debug(response) @@ -763,7 +757,7 @@ def writing_short_play(video_plot: str, video_name: str, llm_provider: str): def screen_matching(huamian: str, wenan: str, llm_provider: str): """ - 画面匹配 + 画面匹配(一次性匹配) """ if not huamian: raise ValueError("画面不能为空") @@ -772,12 +766,18 @@ def screen_matching(huamian: str, wenan: str, llm_provider: str): prompt = """ 你是一名有10年经验的影视解说创作者, - 你的任务是根据画面描述文本和解说文案,匹配出每段解说文案对应的画面时间戳, 结果以 json 格式输出。 + 你的任务是根据视频转录脚本和解说文案,匹配出每段解说文案对应的画面时间戳, 结果以 json 格式输出。 - 画面描述文本和文案(由 XML 标记分隔)如下所示: - + 注意: + 转录脚本中 + - timestamp: 表示视频时间戳 + - picture: 表示当前画面描述 + - speech": 表示当前视频中人物的台词 + + 转录脚本和文案(由 XML 标记分隔)如下所示: + %s - + %s @@ -787,8 +787,15 @@ def screen_matching(huamian: str, wenan: str, llm_provider: str): - 使用以下 JSON schema: script = {'picture': str, 'timestamp': str(时间戳), "narration": str, "OST": bool(是否开启原声)} Return: list[script] + - picture: 字段表示当前画面描述,与转录脚本保持一致 + - timestamp: 字段表示某一段文案对应的画面的时间戳,不必和转录脚本的时间戳一致,应该充分考虑文案内容,匹配出与其描述最匹配的时间戳 + - narration: 字段表示需要解说文案,每段解说文案尽量不要超过30字 + - OST: 字段表示是否开启原声,即当 OST 字段为 true 时,narration 字段为空字符串,当 OST 为 false 时,narration 字段为对应的解说文案 + - 注意,在画面匹配的过程中,需要适当的加入原声播放,使得解说和画面更加匹配,请按照 1:1 的比例,生成原声和解说的脚本内容。 + - 注意,在时间戳匹配上,一定不能原样照搬“转录脚本”,应当适当的合并或者删减一些片段。 + - 注意,第一个画面一定是原声播放并且时长不少于 20 s,为了吸引观众,第一段一定是整个转录脚本中最精彩的片段。 + - 注意,匹配的画面不能重复出现,即生成的脚本中 timestamp 不能重复。 - 请以严格的 JSON 
格式返回数据,不要包含任何注释、标记或其他字符。数据应符合 JSON 语法,可以被 json.loads() 函数直接解析, 不要添加 ```json 或其他标记。 - - """ % (huamian, wenan) try: response = _generate_response(prompt, llm_provider) diff --git a/app/services/material.py b/app/services/material.py index 7eca553..d63e6fc 100644 --- a/app/services/material.py +++ b/app/services/material.py @@ -1,4 +1,5 @@ import os +import subprocess import random from urllib.parse import urlencode @@ -329,6 +330,85 @@ def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, ) - return video_paths +def merge_videos(video_paths, ost_list): + """ + 合并多个视频为一个视频,可选择是否保留每个视频的原声。 + + :param video_paths: 视频文件路径列表 + :param ost_list: 是否保留原声的布尔值列表 + :return: 合并后的视频文件路径 + """ + if len(video_paths) != len(ost_list): + raise ValueError("视频路径列表和保留原声列表长度必须相同") + + if not video_paths: + raise ValueError("视频路径列表不能为空") + + # 准备临时文件列表 + temp_file = "temp_file_list.txt" + with open(temp_file, "w") as f: + for video_path, keep_ost in zip(video_paths, ost_list): + if keep_ost: + f.write(f"file '{video_path}'\n") + else: + # 如果不保留原声,创建一个无声的临时视频 + silent_video = f"silent_{os.path.basename(video_path)}" + subprocess.run(["ffmpeg", "-i", video_path, "-c:v", "copy", "-an", silent_video], check=True) + f.write(f"file '{silent_video}'\n") + + # 合并视频 + output_file = "combined.mp4" + ffmpeg_cmd = [ + "ffmpeg", + "-f", "concat", + "-safe", "0", + "-i", temp_file, + "-c:v", "copy", + "-c:a", "aac", + "-strict", "experimental", + output_file + ] + + try: + subprocess.run(ffmpeg_cmd, check=True) + print(f"视频合并成功:{output_file}") + except subprocess.CalledProcessError as e: + print(f"视频合并失败:{e}") + return None + finally: + # 清理临时文件 + os.remove(temp_file) + for video_path, keep_ost in zip(video_paths, ost_list): + if not keep_ost: + silent_video = f"silent_{os.path.basename(video_path)}" + if os.path.exists(silent_video): + os.remove(silent_video) + + return output_file + + +# 使用示例 +# if __name__ == "__main__": +# video_paths = 
['/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_17-01_37.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_00-00_06.mp4', +# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_06-00_09.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_03-01_10.mp4', +# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_10-01_17.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_24-00_27.mp4', +# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_28-01_36.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_32-00_41.mp4', +# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_36-01_58.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_12-00_15.mp4', +# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_09-00_12.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-02_12-02_25.mp4', +# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-02_03-02_12.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_58-02_03.mp4', +# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-03_14-03_18.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-03_18-03_20.mp4'] +# +# ost_list = [True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, +# False] +# +# result = merge_videos(video_paths, ost_list) +# if result: +# print(f"合并后的视频文件:{result}") +# else: +# print("视频合并失败") +# + + if __name__ == "__main__": download_videos( "test123", ["Money Exchange Medium"], audio_duration=100, source="pixabay" diff --git a/app/services/task.py b/app/services/task.py index 0c544c2..43e9b27 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -211,7 +211,7 @@ def start(task_id, params: VideoParams, stop_at: str = "video"): if type(params.video_concat_mode) is str: params.video_concat_mode = 
VideoConcatMode(params.video_concat_mode) - + # 1. Generate script video_script = generate_script(task_id, params) if not video_script: @@ -323,7 +323,7 @@ def start(task_id, params: VideoParams, stop_at: str = "video"): return kwargs -def start_subclip(task_id, params: VideoClipParams, subclip_path_videos): +def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): """ 后台任务(自动剪辑视频进行剪辑) @@ -423,39 +423,46 @@ def start_subclip(task_id, params: VideoClipParams, subclip_path_videos): combined_video_paths = [] _progress = 50 - for i in range(params.video_count): - index = i + 1 - combined_video_path = path.join(utils.task_dir(task_id), f"combined-{index}.mp4") - logger.info(f"\n\n## 5. 合并视频: {index} => {combined_video_path}") - video.combine_clip_videos( - combined_video_path=combined_video_path, - video_paths=subclip_videos, - video_ost_list=video_ost, - list_script=list_script, - video_aspect=params.video_aspect, - threads=1 # 暂时只支持单线程 - ) + # for i in range(params.video_count): + index = 1 + combined_video_path = path.join(utils.task_dir(task_id), f"combined.mp4") + logger.info(f"\n\n## 5. 合并视频: => {combined_video_path}") + print("111", subclip_videos) + print("222", video_ost) + print("333", len(subclip_videos)) + print("444", len(video_ost)) + # for video_path, video_ost in zip(subclip_videos, video_ost): + # print(video_path) + # print(video_ost) + video.combine_clip_videos( + combined_video_path=combined_video_path, + video_paths=subclip_videos, + video_ost_list=video_ost, + list_script=list_script, + video_aspect=params.video_aspect, + threads=1 # 暂时只支持单线程 + ) - _progress += 50 / params.video_count / 2 - sm.state.update_task(task_id, progress=_progress) + _progress += 50 / 2 + sm.state.update_task(task_id, progress=_progress) - final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4") + final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4") - logger.info(f"\n\n## 6. 
最后一步: {index} => {final_video_path}") - # 把所有东西合到在一起 - video.generate_video_v2( - video_path=combined_video_path, - audio_paths=audio_files, - subtitle_path=subtitle_path, - output_file=final_video_path, - params=params, - ) + logger.info(f"\n\n## 6. 最后一步: {index} => {final_video_path}") + # 把所有东西合到在一起 + video.generate_video_v2( + video_path=combined_video_path, + audio_paths=audio_files, + subtitle_path=subtitle_path, + output_file=final_video_path, + params=params, + ) - _progress += 50 / params.video_count / 2 - sm.state.update_task(task_id, progress=_progress) + _progress += 50 / 2 + sm.state.update_task(task_id, progress=_progress) - final_video_paths.append(final_video_path) - combined_video_paths.append(combined_video_path) + final_video_paths.append(final_video_path) + combined_video_paths.append(combined_video_path) logger.success(f"任务 {task_id} 已完成, 生成 {len(final_video_paths)} 个视频.") @@ -468,11 +475,25 @@ def start_subclip(task_id, params: VideoClipParams, subclip_path_videos): if __name__ == "__main__": - task_id = "task_id" - params = VideoParams( - video_subject="金钱的作用", - voice_name="zh-CN-XiaoyiNeural-Female", - voice_rate=1.0, - + task_id = "test123456" + subclip_path_videos = {'01:17-01:37': './storage/cache_videos/vid-01_17-01_37.mp4', + '00:00-00:06': './storage/cache_videos/vid-00_00-00_06.mp4', + '00:06-00:09': './storage/cache_videos/vid-00_06-00_09.mp4', + '01:03-01:10': './storage/cache_videos/vid-01_03-01_10.mp4', + '01:10-01:17': './storage/cache_videos/vid-01_10-01_17.mp4', + '00:24-00:27': './storage/cache_videos/vid-00_24-00_27.mp4', + '01:28-01:36': './storage/cache_videos/vid-01_28-01_36.mp4', + '00:32-00:41': './storage/cache_videos/vid-00_32-00_41.mp4', + '01:36-01:58': './storage/cache_videos/vid-01_36-01_58.mp4', + '00:12-00:15': './storage/cache_videos/vid-00_12-00_15.mp4', + '00:09-00:12': './storage/cache_videos/vid-00_09-00_12.mp4', + '02:12-02:25': './storage/cache_videos/vid-02_12-02_25.mp4', + '02:03-02:12': 
'./storage/cache_videos/vid-02_03-02_12.mp4', + '01:58-02:03': './storage/cache_videos/vid-01_58-02_03.mp4', + '03:14-03:18': './storage/cache_videos/vid-03_14-03_18.mp4', + '03:18-03:20': './storage/cache_videos/vid-03_18-03_20.mp4'} + params = VideoClipParams( + video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/test003.json", + video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4", ) - start(task_id, params, stop_at="video") + start_subclip(task_id, params, subclip_path_videos=subclip_path_videos) diff --git a/app/services/video.py b/app/services/video.py index dd9907f..2adcfc5 100644 --- a/app/services/video.py +++ b/app/services/video.py @@ -521,7 +521,8 @@ def combine_clip_videos(combined_video_path: str, Args: combined_video_path: 合并后的存储路径 video_paths: 子视频路径列表 - audio_file: mp3旁白 + video_ost_list: 原声播放列表 + list_script: 剪辑脚本 video_aspect: 屏幕比例 threads: 线程数 @@ -545,10 +546,13 @@ def combine_clip_videos(combined_video_path: str, # 一遍又一遍地添加下载的剪辑,直到达到音频的持续时间 (max_duration) # while video_duration < audio_duration: for video_path, video_ost in zip(video_paths, video_ost_list): - clip = VideoFileClip(video_path) - # 通过 ost 字段判断是否播放原声 - if not video_ost: - clip = clip.without_audio() + print(video_path) + print(video_ost) + cache_video_path = utils.root_dir() + clip = VideoFileClip(os.path.join(cache_video_path, video_path)) + # # 通过 ost 字段判断是否播放原声 + # if not video_ost: + # clip = clip.without_audio() # # 检查剪辑是否比剩余音频长 # if (audio_duration - video_duration) < clip.duration: # clip = clip.subclip(0, (audio_duration - video_duration)) diff --git a/app/services/voice.py b/app/services/voice.py index 785f3f1..e40fa5d 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1071,7 +1071,10 @@ def azure_tts_v1( (chunk["offset"], chunk["duration"]), chunk["text"] ) return sub_maker - + # 判断音频文件是否一件存在 + if os.path.exists(voice_file): + logger.info(f"voice file exists, skip tts: {voice_file}") + continue sub_maker = 
asyncio.run(_do()) if not sub_maker or not sub_maker.subs: logger.warning(f"failed, sub_maker is None or sub_maker.subs is None") diff --git a/app/utils/utils.py b/app/utils/utils.py index d897442..ecf8aa8 100644 --- a/app/utils/utils.py +++ b/app/utils/utils.py @@ -358,7 +358,7 @@ def add_new_timestamps(scenes): new_scene['new_timestamp'] = f"{new_start_str}-{new_end_str}" # 为"原生播放"的narration添加唯一标识符 - if new_scene.get('narration') == "原声播放" or new_scene.get('narration') == None: + if new_scene.get('narration') == "" or new_scene.get('narration') == None: unique_id = str(uuid4())[:8] # 使用UUID的前8个字符作为唯一标识符 new_scene['narration'] = f"原声播放_{unique_id}" diff --git a/webui.py b/webui.py index 49d52f6..c4853d8 100644 --- a/webui.py +++ b/webui.py @@ -456,7 +456,7 @@ with left_panel: video_script['path'] = subclip_videos[video_script['timestamp']] except KeyError as err: st.error(f"裁剪视频失败 {err}") - # logger.debug(f"当前的脚本为:{st.session_state.video_script_list}") + logger.debug(f"当前的脚本为:{st.session_state.subclip_videos}") else: st.error(tr("请先生成视频脚本")) From 67d6f353ebbc4a7b609045bdbf5eb6d34194b76c Mon Sep 17 00:00:00 2001 From: linyqh Date: Fri, 27 Sep 2024 01:01:49 +0800 Subject: [PATCH 12/21] =?UTF-8?q?=E5=89=A9=E4=BD=99=E6=9C=80=E5=90=8E?= =?UTF-8?q?=E4=B8=80=E6=AD=A5=EF=BC=8C=E5=90=88=E6=88=90=E6=89=80=E6=9C=89?= =?UTF-8?q?=E7=B4=A0=E6=9D=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/services/llm.py | 28 ++++++++++++++++------------ app/services/task.py | 29 +++++++++++------------------ app/services/video.py | 23 +++++++++++++++++------ webui.bat | 8 +++++++- 4 files changed, 51 insertions(+), 37 deletions(-) diff --git a/app/services/llm.py b/app/services/llm.py index c5b70dd..0fe4950 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -7,6 +7,7 @@ from typing import List from loguru import logger from openai import OpenAI from openai import AzureOpenAI +from moviepy.editor import VideoFileClip from 
openai.types.chat import ChatCompletion import google.generativeai as gemini from googleapiclient.errors import ResumableUploadError @@ -406,17 +407,19 @@ def compress_video(input_path: str, output_path: str): return try: - command = [ - ffmpeg_path, - "-i", input_path, - "-c:v", "h264", - "-b:v", "500k", - "-c:a", "aac", - "-b:a", "128k", - output_path - ] - logger.info(f"执行命令: {' '.join(command)}") - subprocess.run(command, check=True) + clip = VideoFileClip(input_path) + clip.write_videofile(output_path, codec='libx264', audio_codec='aac', bitrate="500k", audio_bitrate="128k") + # command = [ + # ffmpeg_path, + # "-i", input_path, + # "-c:v", "h264", + # "-b:v", "500k", + # "-c:a", "aac", + # "-b:a", "128k", + # output_path + # ] + # logger.info(f"执行命令: {' '.join(command)}") + # subprocess.run(command, check=True) except subprocess.CalledProcessError as e: logger.error(f"视频压缩失败: {e}") raise @@ -814,7 +817,8 @@ if __name__ == "__main__": # gemini_video_transcription(video_subject, video_path, language) # 2. 
解说文案 - video_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4" + # video_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4" + video_path = "E:\\projects\\NarratoAI\\resource\\videos\\1.mp4" video_plot = """ 李自忠拿着儿子李牧名下的存折,去银行取钱给儿子救命,却被要求证明"你儿子是你儿子"。 走投无路时碰到银行被抢劫,劫匪给了他两沓钱救命,李自忠却因此被银行以抢劫罪起诉,并顶格判处20年有期徒刑。 diff --git a/app/services/task.py b/app/services/task.py index 43e9b27..2e3ff30 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -476,24 +476,17 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): if __name__ == "__main__": task_id = "test123456" - subclip_path_videos = {'01:17-01:37': './storage/cache_videos/vid-01_17-01_37.mp4', - '00:00-00:06': './storage/cache_videos/vid-00_00-00_06.mp4', - '00:06-00:09': './storage/cache_videos/vid-00_06-00_09.mp4', - '01:03-01:10': './storage/cache_videos/vid-01_03-01_10.mp4', - '01:10-01:17': './storage/cache_videos/vid-01_10-01_17.mp4', - '00:24-00:27': './storage/cache_videos/vid-00_24-00_27.mp4', - '01:28-01:36': './storage/cache_videos/vid-01_28-01_36.mp4', - '00:32-00:41': './storage/cache_videos/vid-00_32-00_41.mp4', - '01:36-01:58': './storage/cache_videos/vid-01_36-01_58.mp4', - '00:12-00:15': './storage/cache_videos/vid-00_12-00_15.mp4', - '00:09-00:12': './storage/cache_videos/vid-00_09-00_12.mp4', - '02:12-02:25': './storage/cache_videos/vid-02_12-02_25.mp4', - '02:03-02:12': './storage/cache_videos/vid-02_03-02_12.mp4', - '01:58-02:03': './storage/cache_videos/vid-01_58-02_03.mp4', - '03:14-03:18': './storage/cache_videos/vid-03_14-03_18.mp4', - '03:18-03:20': './storage/cache_videos/vid-03_18-03_20.mp4'} + subclip_path_videos = {'00:41-01:58': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_41-01_58.mp4', + '00:06-00:15': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_06-00_15.mp4', + '01:10-01:17': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_10-01_17.mp4', + '00:47-01:03': 
'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_47-01_03.mp4', + '01:03-01:10': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_03-01_10.mp4', + '02:40-03:08': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-02_40-03_08.mp4', + '03:02-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_02-03_20.mp4', + '03:18-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_18-03_20.mp4'} + params = VideoClipParams( - video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/test003.json", - video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4", + video_clip_json_path="E:\\projects\\NarratoAI\\resource/scripts/test003.json", + video_origin_path="E:\\projects\\NarratoAI\\resource/videos/1.mp4", ) start_subclip(task_id, params, subclip_path_videos=subclip_path_videos) diff --git a/app/services/video.py b/app/services/video.py index 2adcfc5..864634c 100644 --- a/app/services/video.py +++ b/app/services/video.py @@ -300,7 +300,8 @@ def generate_video( ) video_clip.close() del video_clip - logger.success("completed") + logger.success("" + "completed") def generate_video_v2( @@ -310,6 +311,18 @@ def generate_video_v2( output_file: str, params: Union[VideoParams, VideoClipParams], ): + """ + 合并所有素材 + Args: + video_path: + audio_paths: + subtitle_path: + output_file: + params: + + Returns: + + """ aspect = VideoAspect(params.video_aspect) video_width, video_height = aspect.to_resolution() @@ -546,13 +559,11 @@ def combine_clip_videos(combined_video_path: str, # 一遍又一遍地添加下载的剪辑,直到达到音频的持续时间 (max_duration) # while video_duration < audio_duration: for video_path, video_ost in zip(video_paths, video_ost_list): - print(video_path) - print(video_ost) cache_video_path = utils.root_dir() clip = VideoFileClip(os.path.join(cache_video_path, video_path)) - # # 通过 ost 字段判断是否播放原声 - # if not video_ost: - # clip = clip.without_audio() + # 通过 ost 字段判断是否播放原声 + if not video_ost: + clip = clip.without_audio() # # 检查剪辑是否比剩余音频长 # 
if (audio_duration - video_duration) < clip.duration: # clip = clip.subclip(0, (audio_duration - video_duration)) diff --git a/webui.bat b/webui.bat index 111e1d3..f56d6ef 100644 --- a/webui.bat +++ b/webui.bat @@ -3,6 +3,12 @@ set CURRENT_DIR=%CD% echo ***** Current directory: %CURRENT_DIR% ***** set PYTHONPATH=%CURRENT_DIR% +set "vpn_proxy_url=%http://127.0.0.1:7890%" + +:: 使用VPN代理进行一些操作,例如通过代理下载文件 +set "http_proxy=%vpn_proxy_url%" +set "https_proxy=%vpn_proxy_url%" + @echo off setlocal enabledelayedexpansion @@ -40,4 +46,4 @@ pause rem set HF_ENDPOINT=https://hf-mirror.com -streamlit run webui.py --browser.gatherUsageStats=False --server.enableCORS=True +streamlit run webui.py --browser.serverAddress="127.0.0.1" --server.enableCORS=True --server.maxUploadSize=2048 --browser.gatherUsageStats=False From e440dc619f2482ac905763c772185e3e1e6f5138 Mon Sep 17 00:00:00 2001 From: linyq Date: Fri, 27 Sep 2024 12:01:42 +0800 Subject: [PATCH 13/21] =?UTF-8?q?=E4=BC=98=E5=8C=96=E5=89=AA=E8=BE=91?= =?UTF-8?q?=E9=80=BB=E8=BE=91123?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/services/audio_merger.py | 108 +++++++++++++++++++++++++++++++++++ app/services/llm.py | 4 +- app/services/task.py | 66 ++++++++++++++------- app/services/voice.py | 92 ++++++++++++++++++++++++++++- 4 files changed, 247 insertions(+), 23 deletions(-) create mode 100644 app/services/audio_merger.py diff --git a/app/services/audio_merger.py b/app/services/audio_merger.py new file mode 100644 index 0000000..cf2a204 --- /dev/null +++ b/app/services/audio_merger.py @@ -0,0 +1,108 @@ +import os +import subprocess +import edge_tts +from edge_tts import submaker +from pydub import AudioSegment +from typing import List +from loguru import logger +from app.utils import utils + + +def check_ffmpeg(): + """检查FFmpeg是否已安装""" + try: + subprocess.run(['ffmpeg', '-version'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + return True + except 
FileNotFoundError: + return False + + +def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: int): + """ + 合并多个音频文件到一个指定总时长的音频文件中 + + :param audio_file_paths: 音频文件路径列表 + :param total_duration: 最终音频文件的总时长(秒) + """ + output_dir = utils.task_dir(task_id) + + if not check_ffmpeg(): + logger.error("错误:FFmpeg未安装。请安装FFmpeg后再运行此脚本。") + return None, None + + # 创建一个总时长为total_duration的空白音频 + blank_audio = AudioSegment.silent(duration=total_duration * 1000) # pydub使用毫秒 + # 创建SubMaker对象 + sub_maker = edge_tts.SubMaker() + + for audio_path in audio_file_paths: + if not os.path.exists(audio_path): + logger.info(f"警告:文件 {audio_path} 不存在,已跳过。") + continue + + # 从文件名中提取时间戳 + filename = os.path.basename(audio_path) + start_time, end_time = extract_timestamp(filename) + + # 读取音频文件 + try: + audio = AudioSegment.from_mp3(audio_path) + except Exception as e: + logger.error(f"错误:无法读取文件 {audio_path}。错误信息:{str(e)}") + continue + # 将音频插入到空白音频的指定位置 + blank_audio = blank_audio.overlay(audio, position=start_time * 1000) + + # 添加字幕信息 + duration = (end_time - start_time) * 1000 # 转换为毫秒 + # TODO 不是 filename 需要考虑怎么把字幕文本弄过来 + sub_maker.create_sub((start_time * 1000, duration), filename) + + # 尝试导出为WAV格式 + try: + output_file = os.path.join(output_dir, "audio.wav") + blank_audio.export(output_file, format="wav") + logger.info(f"音频合并完成,已保存为 {output_file}") + except Exception as e: + logger.info(f"导出为WAV格式失败,尝试使用MP3格式:{str(e)}") + try: + output_file = "merged_audio.mp3" + blank_audio.export(output_file, format="mp3", codec="libmp3lame") + logger.info(f"音频合并完成,已保存为 {output_file}") + except Exception as e: + logger.error(f"导出音频失败:{str(e)}") + return None, None + + return output_file, sub_maker + + +def extract_timestamp(filename): + """从文件名中提取开始和结束时间戳""" + time_part = filename.split('_')[1].split('.')[0] + times = time_part.split('-') + + # 将时间戳转换为秒 + start_seconds = time_to_seconds(times[0], times[1]) + end_seconds = time_to_seconds(times[2], times[3]) + + return start_seconds, 
end_seconds + + +def time_to_seconds(minutes, seconds): + """将分钟和秒转换为总秒数""" + return int(minutes) * 60 + int(seconds) + + +if __name__ == "__main__": + # 示例用法 + audio_files = [ + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-06-00-24.mp3", + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-32-00-38.mp3", + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-43-00-52.mp3", + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-52-01-09.mp3", + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_01-13-01-15.mp3" + ] + total_duration = 75 + + a, b = merge_audio_files("test456", audio_files, total_duration) + print(a, b) \ No newline at end of file diff --git a/app/services/llm.py b/app/services/llm.py index 0fe4950..66784a7 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -817,8 +817,8 @@ if __name__ == "__main__": # gemini_video_transcription(video_subject, video_path, language) # 2. 解说文案 - # video_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4" - video_path = "E:\\projects\\NarratoAI\\resource\\videos\\1.mp4" + video_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4" + # video_path = "E:\\projects\\NarratoAI\\resource\\videos\\1.mp4" video_plot = """ 李自忠拿着儿子李牧名下的存折,去银行取钱给儿子救命,却被要求证明"你儿子是你儿子"。 走投无路时碰到银行被抢劫,劫匪给了他两沓钱救命,李自忠却因此被银行以抢劫罪起诉,并顶格判处20年有期徒刑。 diff --git a/app/services/task.py b/app/services/task.py index 2e3ff30..7de5ac4 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -8,7 +8,7 @@ from loguru import logger from app.config import config from app.models import const from app.models.schema import VideoConcatMode, VideoParams, VideoClipParams -from app.services import llm, material, subtitle, video, voice +from app.services import llm, material, subtitle, video, voice, audio_merger from app.services import state as sm from app.utils import utils @@ -97,7 +97,7 @@ def generate_subtitle(task_id, params, video_script, 
sub_maker, audio_file): if not params.subtitle_enabled: return "" - subtitle_path = path.join(utils.task_dir(task_id), "subtitle.srt") + subtitle_path = path.join(utils.task_dir(task_id), "subtitle111.srt") subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}") @@ -353,6 +353,9 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): logger.debug(f"解说完整脚本: \n{video_script}") logger.debug(f"解说 OST 列表: \n{video_ost}") logger.debug(f"解说时间戳列表: \n{time_list}") + # 获取视频总时长(单位 s) + total_duration = list_script[-1]['new_timestamp'] + total_duration = int(total_duration.split("-")[1].split(":")[0]) * 60 + int(total_duration.split("-")[1].split(":")[1]) except Exception as e: logger.error(f"无法读取视频json脚本,请检查配置是否正确。{e}") raise ValueError("无法读取视频json脚本,请检查配置是否正确") @@ -372,21 +375,27 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): logger.error( "音频文件为空,可能是网络不可用。如果您在中国,请使用VPN。或者手动选择 zh-CN-Yunjian-男性 音频") return + logger.info("合并音频") + audio_file, sub_maker = audio_merger.merge_audio_files(task_id, audio_files, total_duration) + + # audio_duration = voice.get_audio_duration(sub_maker) + # audio_duration = math.ceil(audio_duration) sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30) subtitle_path = "" if params.subtitle_enabled: - subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt") + subtitle_path = path.join(utils.task_dir(task_id), f"subtitle111.srt") subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() logger.info(f"\n\n## 3. 
生成字幕、提供程序是: {subtitle_provider}") # subtitle_fallback = False if subtitle_provider == "edge": - voice.create_subtitle_from_multiple( - text=video_script, - sub_maker_list=sub_maker_list, - list_script=list_script, - subtitle_file=subtitle_path - ) + voice.create_subtitle(text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path) + # voice.create_subtitle( + # text=video_script, + # sub_maker_list=sub_maker_list, + # list_script=list_script, + # subtitle_file=subtitle_path + # ) # if not os.path.exists(subtitle_path): # subtitle_fallback = True # logger.warning("找不到字幕文件,回退到whisper") @@ -475,18 +484,35 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): if __name__ == "__main__": - task_id = "test123456" - subclip_path_videos = {'00:41-01:58': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_41-01_58.mp4', - '00:06-00:15': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_06-00_15.mp4', - '01:10-01:17': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_10-01_17.mp4', - '00:47-01:03': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_47-01_03.mp4', - '01:03-01:10': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_03-01_10.mp4', - '02:40-03:08': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-02_40-03_08.mp4', - '03:02-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_02-03_20.mp4', - '03:18-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_18-03_20.mp4'} + # task_id = "test123" + # subclip_path_videos = {'00:41-01:58': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_41-01_58.mp4', + # '00:06-00:15': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_06-00_15.mp4', + # '01:10-01:17': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_10-01_17.mp4', + # '00:47-01:03': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_47-01_03.mp4', + # '01:03-01:10': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_03-01_10.mp4', + # '02:40-03:08': 
'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-02_40-03_08.mp4', + # '03:02-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_02-03_20.mp4', + # '03:18-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_18-03_20.mp4'} + # + # params = VideoClipParams( + # video_clip_json_path="E:\\projects\\NarratoAI\\resource/scripts/test003.json", + # video_origin_path="E:\\projects\\NarratoAI\\resource/videos/1.mp4", + # ) + # start_subclip(task_id, params, subclip_path_videos=subclip_path_videos) + + task_id = "test456" + subclip_path_videos = {'00:00-00:06': './storage/cache_videos/vid-00_00-00_06.mp4', + '00:06-00:24': './storage/cache_videos/vid-00_06-00_24.mp4', + '01:28-01:36': './storage/cache_videos/vid-01_28-01_36.mp4', + '00:41-00:47': './storage/cache_videos/vid-00_41-00_47.mp4', + '01:58-02:03': './storage/cache_videos/vid-01_58-02_03.mp4', + '02:03-02:12': './storage/cache_videos/vid-02_03-02_12.mp4', + '02:40-02:57': './storage/cache_videos/vid-02_40-02_57.mp4', + '03:14-03:18': './storage/cache_videos/vid-03_14-03_18.mp4', + '03:18-03:20': './storage/cache_videos/vid-03_18-03_20.mp4'} params = VideoClipParams( - video_clip_json_path="E:\\projects\\NarratoAI\\resource/scripts/test003.json", - video_origin_path="E:\\projects\\NarratoAI\\resource/videos/1.mp4", + video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/test003.json", + video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4", ) start_subclip(task_id, params, subclip_path_videos=subclip_path_videos) diff --git a/app/services/voice.py b/app/services/voice.py index e40fa5d..4464140 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -2,12 +2,12 @@ import os import re import json import traceback - import edge_tts import asyncio from loguru import logger from typing import List from datetime import datetime +from edge_tts.submaker import mktimestamp from xml.sax.saxutils import unescape from edge_tts import submaker, 
SubMaker from moviepy.video.tools import subtitles @@ -1293,6 +1293,96 @@ def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], lis traceback.print_exc() +def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str): + """ + 优化字幕文件 + 1. 将字幕文件按照标点符号分割成多行 + 2. 逐行匹配字幕文件中的文本 + 3. 生成新的字幕文件 + """ + + text = _format_text(text) + + def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str: + """ + 1 + 00:00:00,000 --> 00:00:02,360 + 跑步是一项简单易行的运动 + """ + start_t = mktimestamp(start_time).replace(".", ",") + end_t = mktimestamp(end_time).replace(".", ",") + return f"{idx}\n" f"{start_t} --> {end_t}\n" f"{sub_text}\n" + + start_time = -1.0 + sub_items = [] + sub_index = 0 + + script_lines = utils.split_string_by_punctuations(text) + + def match_line(_sub_line: str, _sub_index: int): + if len(script_lines) <= _sub_index: + return "" + + _line = script_lines[_sub_index] + if _sub_line == _line: + return script_lines[_sub_index].strip() + + _sub_line_ = re.sub(r"[^\w\s]", "", _sub_line) + _line_ = re.sub(r"[^\w\s]", "", _line) + if _sub_line_ == _line_: + return _line_.strip() + + _sub_line_ = re.sub(r"\W+", "", _sub_line) + _line_ = re.sub(r"\W+", "", _line) + if _sub_line_ == _line_: + return _line.strip() + + return "" + + sub_line = "" + + try: + for _, (offset, sub) in enumerate(zip(sub_maker.offset, sub_maker.subs)): + _start_time, end_time = offset + if start_time < 0: + start_time = _start_time + + sub = unescape(sub) + sub_line += sub + sub_text = match_line(sub_line, sub_index) + if sub_text: + sub_index += 1 + line = formatter( + idx=sub_index, + start_time=start_time, + end_time=end_time, + sub_text=sub_text, + ) + sub_items.append(line) + start_time = -1.0 + sub_line = "" + + if len(sub_items) == len(script_lines): + with open(subtitle_file, "w", encoding="utf-8") as file: + file.write("\n".join(sub_items) + "\n") + try: + sbs = subtitles.file_to_subtitles(subtitle_file, encoding="utf-8") + duration 
= max([tb for ((ta, tb), txt) in sbs]) + logger.info( + f"completed, subtitle file created: {subtitle_file}, duration: {duration}" + ) + except Exception as e: + logger.error(f"failed, error: {str(e)}") + os.remove(subtitle_file) + else: + logger.warning( + f"failed, sub_items len: {len(sub_items)}, script_lines len: {len(script_lines)}" + ) + + except Exception as e: + logger.error(f"failed, error: {str(e)}") + + def get_audio_duration(sub_maker: submaker.SubMaker): """ 获取音频时长 From 7b3014ad42873a2f89cb73a32dc00f92ac0e0c9a Mon Sep 17 00:00:00 2001 From: linyq Date: Sat, 28 Sep 2024 17:10:43 +0800 Subject: [PATCH 14/21] =?UTF-8?q?=E5=89=AA=E8=BE=91=E9=80=BB=E8=BE=91?= =?UTF-8?q?=E8=BF=9B=E5=BA=A660%=EF=BC=9B=20=E5=BE=85=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E7=82=B9=EF=BC=9A=201.=20=E7=94=9F=E6=88=90=E5=AD=97=E5=B9=95?= =?UTF-8?q?=E9=80=BB=E8=BE=91=E4=BC=98=E5=8C=96=202.=20=E6=96=87=E6=A1=88?= =?UTF-8?q?=E8=A7=A3=E8=AF=B4=E7=9A=84=E6=97=B6=E9=97=B4=E5=92=8C=E8=84=9A?= =?UTF-8?q?=E6=9C=AC=E6=97=B6=E9=97=B4=E7=9A=84=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/models/schema.py | 4 +- app/services/audio_merger.py | 46 ++++++++++++++------- app/services/subtitle.py | 43 +++++++++++++++++++- app/services/task.py | 13 +++--- app/services/video.py | 78 +++++++++++------------------------- app/services/voice.py | 6 +-- 6 files changed, 109 insertions(+), 81 deletions(-) diff --git a/app/models/schema.py b/app/models/schema.py index f20657a..bf39e2b 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -353,7 +353,7 @@ class VideoClipParams(BaseModel): bgm_file: Optional[str] = Field(default="", description="背景音乐文件") bgm_volume: Optional[float] = Field(default=0.2, description="背景音乐音量") - subtitle_enabled: Optional[bool] = Field(default=True, description="是否启用字幕") + subtitle_enabled: Optional[bool] = Field(default=False, description="是否启用字幕") subtitle_position: Optional[str] = Field(default="bottom", 
description="字幕位置") # top, bottom, center font_name: Optional[str] = Field(default="STHeitiMedium.ttc", description="字体名称") text_fore_color: Optional[str] = Field(default="#FFFFFF", description="文字前景色") @@ -364,5 +364,5 @@ class VideoClipParams(BaseModel): stroke_width: float = Field(default=1.5, description="文字描边宽度") custom_position: float = Field(default=70.0, description="自定义位置") - # n_threads: Optional[int] = 2 # 线程数 + n_threads: Optional[int] = 8 # 线程数,有助于提升视频处理速度 # paragraph_number: Optional[int] = 1 # 段落数量 diff --git a/app/services/audio_merger.py b/app/services/audio_merger.py index cf2a204..e35a22c 100644 --- a/app/services/audio_merger.py +++ b/app/services/audio_merger.py @@ -1,9 +1,10 @@ import os +import json import subprocess import edge_tts from edge_tts import submaker from pydub import AudioSegment -from typing import List +from typing import List, Dict from loguru import logger from app.utils import utils @@ -17,12 +18,13 @@ def check_ffmpeg(): return False -def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: int): +def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: int, video_script: list): """ - 合并多个音频文件到一个指定总时长的音频文件中 - + 合并多个音频文件到一个指定总时长的音频文件中,并生成相应的字幕 + :param task_id: 任务ID :param audio_file_paths: 音频文件路径列表 :param total_duration: 最终音频文件的总时长(秒) + :param video_script: JSON格式的视频脚本 """ output_dir = utils.task_dir(task_id) @@ -35,6 +37,17 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: # 创建SubMaker对象 sub_maker = edge_tts.SubMaker() + # 解析JSON格式的video_script + script_data = video_script + + for segment in script_data: + start_time, end_time = parse_timestamp(segment['new_timestamp']) + duration = (end_time - start_time) * 1000 # 转换为毫秒 + + if not segment['OST']: + # 如果不是原声,则添加narration作为字幕 + sub_maker.create_sub((start_time * 1000, duration), segment['narration']) + for audio_path in audio_file_paths: if not os.path.exists(audio_path): logger.info(f"警告:文件 
{audio_path} 不存在,已跳过。") @@ -50,14 +63,10 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: except Exception as e: logger.error(f"错误:无法读取文件 {audio_path}。错误信息:{str(e)}") continue + # 将音频插入到空白音频的指定位置 blank_audio = blank_audio.overlay(audio, position=start_time * 1000) - # 添加字幕信息 - duration = (end_time - start_time) * 1000 # 转换为毫秒 - # TODO 不是 filename 需要考虑怎么把字幕文本弄过来 - sub_maker.create_sub((start_time * 1000, duration), filename) - # 尝试导出为WAV格式 try: output_file = os.path.join(output_dir, "audio.wav") @@ -66,7 +75,7 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: except Exception as e: logger.info(f"导出为WAV格式失败,尝试使用MP3格式:{str(e)}") try: - output_file = "merged_audio.mp3" + output_file = os.path.join(output_dir, "audio.mp3") blank_audio.export(output_file, format="mp3", codec="libmp3lame") logger.info(f"音频合并完成,已保存为 {output_file}") except Exception as e: @@ -75,6 +84,10 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: return output_file, sub_maker +def parse_timestamp(timestamp: str) -> tuple: + """解析时间戳字符串为秒数""" + start, end = timestamp.split('-') + return time_to_seconds(*start.split(':')), time_to_seconds(*end.split(':')) def extract_timestamp(filename): """从文件名中提取开始和结束时间戳""" @@ -95,14 +108,17 @@ def time_to_seconds(minutes, seconds): if __name__ == "__main__": # 示例用法 - audio_files = [ + audio_files =[ "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-06-00-24.mp3", "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-32-00-38.mp3", "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-43-00-52.mp3", "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-52-01-09.mp3", - "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_01-13-01-15.mp3" + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_01-13-01-15.mp3", ] - total_duration = 75 + total_duration = 38 + video_script_path = 
"/Users/apple/Desktop/home/NarratoAI/resource/scripts/test003.json" + with open(video_script_path, "r", encoding="utf-8") as f: + video_script = json.load(f) - a, b = merge_audio_files("test456", audio_files, total_duration) - print(a, b) \ No newline at end of file + output_file, sub_maker = merge_audio_files("test456", audio_files, total_duration, video_script) + print(output_file, sub_maker) \ No newline at end of file diff --git a/app/services/subtitle.py b/app/services/subtitle.py index ba6e224..b915c6c 100644 --- a/app/services/subtitle.py +++ b/app/services/subtitle.py @@ -1,10 +1,12 @@ import json import os.path import re +from typing import Optional from faster_whisper import WhisperModel from timeit import default_timer as timer from loguru import logger +import google.generativeai as genai from app.config import config from app.utils import utils @@ -278,8 +280,40 @@ def correct(subtitle_file, video_script): logger.success("Subtitle is correct") +def create_with_gemini(audio_file: str, subtitle_file: str = "", api_key: Optional[str] = None) -> Optional[str]: + if not api_key: + logger.error("Gemini API key is not provided") + return None + + genai.configure(api_key=api_key) + + logger.info(f"开始使用Gemini模型处理音频文件: {audio_file}") + + model = genai.GenerativeModel(model_name="gemini-1.5-flash") + prompt = "生成这段语音的转录文本。请以SRT格式输出,包含时间戳。" + + try: + with open(audio_file, "rb") as f: + audio_data = f.read() + + response = model.generate_content([prompt, audio_data]) + transcript = response.text + + if not subtitle_file: + subtitle_file = f"{audio_file}.srt" + + with open(subtitle_file, "w", encoding="utf-8") as f: + f.write(transcript) + + logger.info(f"Gemini生成的字幕文件已保存: {subtitle_file}") + return subtitle_file + except Exception as e: + logger.error(f"使用Gemini处理音频时出错: {e}") + return None + + if __name__ == "__main__": - task_id = "c12fd1e6-4b0a-4d65-a075-c87abe35a072" + task_id = "task456" task_dir = utils.task_dir(task_id) subtitle_file = 
f"{task_dir}/subtitle.srt" audio_file = f"{task_dir}/audio.mp3" @@ -297,3 +331,10 @@ if __name__ == "__main__": subtitle_file = f"{task_dir}/subtitle-test.srt" create(audio_file, subtitle_file) + + # 使用Gemini模型处理音频 + gemini_api_key = config.app.get("gemini_api_key") # 请替换为实际的API密钥 + gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key) + + if gemini_subtitle_file: + print(f"Gemini生成的字幕文件: {gemini_subtitle_file}") diff --git a/app/services/task.py b/app/services/task.py index 7de5ac4..fd53d1d 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -338,7 +338,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): # tts 角色名称 voice_name = voice.parse_voice_name(params.voice_name) - logger.info("\n\n## 1. 读取视频json脚本") + logger.info("\n\n## 1. 加载视频脚本") video_script_path = path.join(params.video_clip_json_path) # 判断json文件是否存在 if path.exists(video_script_path): @@ -376,7 +376,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): "音频文件为空,可能是网络不可用。如果您在中国,请使用VPN。或者手动选择 zh-CN-Yunjian-男性 音频") return logger.info("合并音频") - audio_file, sub_maker = audio_merger.merge_audio_files(task_id, audio_files, total_duration) + audio_file, sub_maker = audio_merger.merge_audio_files(task_id, audio_files, total_duration, list_script) # audio_duration = voice.get_audio_duration(sub_maker) # audio_duration = math.ceil(audio_duration) @@ -387,7 +387,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): subtitle_path = path.join(utils.task_dir(task_id), f"subtitle111.srt") subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() logger.info(f"\n\n## 3. 
生成字幕、提供程序是: {subtitle_provider}") - # subtitle_fallback = False + subtitle_fallback = False if subtitle_provider == "edge": voice.create_subtitle(text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path) # voice.create_subtitle( @@ -401,7 +401,8 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): # logger.warning("找不到字幕文件,回退到whisper") # # if subtitle_provider == "whisper" or subtitle_fallback: - # subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) + # # subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) + # subtitle.create_with_gemini(audio_file=audio_file, subtitle_file=subtitle_path, api_key=config.app.get("gemini_api_key", "")) # logger.info("\n\n## 更正字幕") # subtitle.correct(subtitle_file=subtitle_path, video_script=video_script) @@ -449,7 +450,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): video_ost_list=video_ost, list_script=list_script, video_aspect=params.video_aspect, - threads=1 # 暂时只支持单线程 + threads=params.n_threads # 多线程 ) _progress += 50 / 2 @@ -461,7 +462,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): # 把所有东西合到在一起 video.generate_video_v2( video_path=combined_video_path, - audio_paths=audio_files, + audio_path=audio_file, subtitle_path=subtitle_path, output_file=final_video_path, params=params, diff --git a/app/services/video.py b/app/services/video.py index 864634c..6bfb9bf 100644 --- a/app/services/video.py +++ b/app/services/video.py @@ -294,7 +294,7 @@ def generate_video( output_file, audio_codec="aac", temp_audiofile_path=output_dir, - threads=params.n_threads or 2, + threads=params.n_threads, logger=None, fps=30, ) @@ -306,7 +306,7 @@ def generate_video( def generate_video_v2( video_path: str, - audio_paths: List[str], + audio_path: str, subtitle_path: str, output_file: str, params: Union[VideoParams, VideoClipParams], @@ -314,11 +314,11 @@ def generate_video_v2( """ 合并所有素材 Args: - video_path: - 
audio_paths: - subtitle_path: - output_file: - params: + video_path: 视频路径 + audio_path: 单个音频文件路径 + subtitle_path: 字幕文件路径 + output_file: 输出文件路径 + params: 视频参数 Returns: @@ -328,7 +328,7 @@ def generate_video_v2( logger.info(f"开始,视频尺寸: {video_width} x {video_height}") logger.info(f" ① 视频: {video_path}") - logger.info(f" ② 音频文件数量: {len(audio_paths)}") + logger.info(f" ② 音频: {audio_path}") logger.info(f" ③ 字幕: {subtitle_path}") logger.info(f" ④ 输出: {output_file}") @@ -386,40 +386,8 @@ def generate_video_v2( original_audio = video_clip.audio # 保存原始视频的音轨 video_duration = video_clip.duration - # 处理多个音频文件 - audio_clips = [] - for audio_path in audio_paths: - # 确保每个音频文件路径是正确的 - if not os.path.exists(audio_path): - logger.warning(f"音频文件不存在: {audio_path}") - continue - - # 从文件名中提取时间信息 - match = re.search(r'audio_(\d{2}-\d{2}-\d{2}-\d{2})\.mp3', os.path.basename(audio_path)) - if match: - time_str = match.group(1) - start, end = time_str.split('-')[:2], time_str.split('-')[2:] - start_time = sum(int(x) * 60 ** i for i, x in enumerate(reversed(start))) - end_time = sum(int(x) * 60 ** i for i, x in enumerate(reversed(end))) - - audio_clip = AudioFileClip(audio_path).volumex(params.voice_volume) - - # 确保结束时间不超过音频实际长度 - actual_end_time = min(end_time - start_time, audio_clip.duration) - - audio_clip = audio_clip.subclip(0, actual_end_time) - audio_clip = audio_clip.set_start(start_time).set_end(start_time + actual_end_time) - audio_clips.append(audio_clip) - else: - logger.warning(f"无法从文件名解析时间信息: {audio_path}") - - # 合并所有音频剪辑,包括原始音轨 - if audio_clips: - audio_clips.insert(0, original_audio) # 将原始音轨添加到音频剪辑列表的开头 - audio_clip = CompositeAudioClip(audio_clips) - else: - logger.warning("没有有效的音频文件,使用原始音轨") - audio_clip = original_audio + # 处理新的音频文件 + new_audio = AudioFileClip(audio_path).volumex(params.voice_volume) # 字幕处理部分 if subtitle_path and os.path.exists(subtitle_path): @@ -451,22 +419,29 @@ def generate_video_v2( # 背景音乐处理部分 bgm_file = get_bgm_file(bgm_type=params.bgm_type, 
bgm_file=params.bgm_file) + + # 合并音频轨道 + audio_tracks = [original_audio, new_audio] + if bgm_file: try: bgm_clip = ( AudioFileClip(bgm_file).volumex(params.bgm_volume).audio_fadeout(3) ) - bgm_clip = afx.audio_loop(bgm_clip, duration=video_clip.duration) - audio_clip = CompositeAudioClip([audio_clip, bgm_clip]) + bgm_clip = afx.audio_loop(bgm_clip, duration=video_duration) + audio_tracks.append(bgm_clip) except Exception as e: logger.error(f"添加背景音乐失败: {str(e)}") - video_clip = video_clip.set_audio(audio_clip) + # 合并所有音频轨道 + final_audio = CompositeAudioClip(audio_tracks) + + video_clip = video_clip.set_audio(final_audio) video_clip.write_videofile( output_file, audio_codec="aac", temp_audiofile_path=output_dir, - threads=params.n_threads or 2, + threads=params.n_threads, logger=None, fps=30, ) @@ -607,7 +582,7 @@ def combine_clip_videos(combined_video_path: str, video_clip = concatenate_videoclips(clips) video_clip = video_clip.set_fps(30) - logger.info(f"合并中...") + logger.info(f"合并视频中...") video_clip.write_videofile(filename=combined_video_path, threads=threads, logger=None, @@ -687,19 +662,14 @@ if __name__ == "__main__": video_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/combined-1.mp4" - audio_paths = ['../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-00-00-07.mp3', - '../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-14-00-17.mp3', - '../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-17-00-22.mp3', - '../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-34-00-45.mp3', - '../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-59-01-09.mp3', - ] + audio_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-00-00-07.mp3" subtitle_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa\subtitle.srt" output_file = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/final-123.mp4" generate_video_v2(video_path=video_path, - 
audio_paths=audio_paths, + audio_path=audio_path, subtitle_path=subtitle_path, output_file=output_file, params=cfg diff --git a/app/services/voice.py b/app/services/voice.py index 4464140..cf5c24d 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1034,8 +1034,8 @@ def is_azure_v2_voice(voice_name: str): def tts( text: str, voice_name: str, voice_rate: float, voice_file: str ) -> [SubMaker, None]: - if is_azure_v2_voice(voice_name): - return azure_tts_v2(text, voice_name, voice_file) + # if is_azure_v2_voice(voice_name): + # return azure_tts_v2(text, voice_name, voice_file) return azure_tts_v1(text, voice_name, voice_rate, voice_file) @@ -1414,7 +1414,7 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3") # 检查文件是否已存在,如存在且不强制重新生成,则跳过 - if os.path.exists(audio_file) and not force_regenerate: + if os.path.exists(audio_file): logger.info(f"音频文件已存在,跳过生成: {audio_file}") audio_files.append(audio_file) continue From 02589c8355444445ec4b362f8109207bbfc7ff24 Mon Sep 17 00:00:00 2001 From: linyq Date: Sun, 29 Sep 2024 00:02:40 +0800 Subject: [PATCH 15/21] =?UTF-8?q?=E5=89=AA=E8=BE=91=E9=80=BB=E8=BE=91?= =?UTF-8?q?=E8=BF=9B=E5=BA=A670%=EF=BC=9B=20=E5=BE=85=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E7=82=B9=EF=BC=9A=201.=20=E7=94=9F=E6=88=90=E5=AD=97=E5=B9=95?= =?UTF-8?q?=E9=80=BB=E8=BE=91=E4=BC=98=E5=8C=96=202.=20=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E8=84=9A=E6=9C=AC-=E8=A7=A3=E8=AF=B4=E8=B4=A8=E9=87=8F=203.=20?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=AD=97=E5=B9=95bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/services/audio_merger.py | 46 +++----- app/services/llm.py | 57 +++++++++- app/services/task.py | 42 +++---- app/services/voice.py | 3 +- app/utils/check_script.py | 205 +++++++++++------------------------ app/utils/utils.py | 4 + webui.sh | 2 +- 7 files changed, 154 insertions(+), 205 deletions(-) diff --git 
a/app/services/audio_merger.py b/app/services/audio_merger.py index e35a22c..80c9aff 100644 --- a/app/services/audio_merger.py +++ b/app/services/audio_merger.py @@ -34,19 +34,6 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: # 创建一个总时长为total_duration的空白音频 blank_audio = AudioSegment.silent(duration=total_duration * 1000) # pydub使用毫秒 - # 创建SubMaker对象 - sub_maker = edge_tts.SubMaker() - - # 解析JSON格式的video_script - script_data = video_script - - for segment in script_data: - start_time, end_time = parse_timestamp(segment['new_timestamp']) - duration = (end_time - start_time) * 1000 # 转换为毫秒 - - if not segment['OST']: - # 如果不是原声,则添加narration作为字幕 - sub_maker.create_sub((start_time * 1000, duration), segment['narration']) for audio_path in audio_file_paths: if not os.path.exists(audio_path): @@ -82,12 +69,12 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: logger.error(f"导出音频失败:{str(e)}") return None, None - return output_file, sub_maker + return output_file -def parse_timestamp(timestamp: str) -> tuple: +def parse_timestamp(timestamp: str): """解析时间戳字符串为秒数""" - start, end = timestamp.split('-') - return time_to_seconds(*start.split(':')), time_to_seconds(*end.split(':')) + # start, end = timestamp.split('-') + return time_to_seconds(timestamp) def extract_timestamp(filename): """从文件名中提取开始和结束时间戳""" @@ -95,30 +82,31 @@ def extract_timestamp(filename): times = time_part.split('-') # 将时间戳转换为秒 - start_seconds = time_to_seconds(times[0], times[1]) - end_seconds = time_to_seconds(times[2], times[3]) + start_seconds = time_to_seconds(times[0]) + end_seconds = time_to_seconds(times[1]) return start_seconds, end_seconds -def time_to_seconds(minutes, seconds): - """将分钟和秒转换为总秒数""" - return int(minutes) * 60 + int(seconds) +def time_to_seconds(times): + """将 “00:06” 转换为总秒数 """ + times = times.split(':') + return int(times[0]) * 60 + int(times[1]) if __name__ == "__main__": # 示例用法 audio_files =[ - 
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-06-00-24.mp3", - "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-32-00-38.mp3", - "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-43-00-52.mp3", - "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-52-01-09.mp3", - "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_01-13-01-15.mp3", + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:06-00:24.mp3", + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:32-00:38.mp3", + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:43-00:52.mp3", + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:52-01:09.mp3", + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_01:13-01:15.mp3", ] total_duration = 38 video_script_path = "/Users/apple/Desktop/home/NarratoAI/resource/scripts/test003.json" with open(video_script_path, "r", encoding="utf-8") as f: video_script = json.load(f) - output_file, sub_maker = merge_audio_files("test456", audio_files, total_duration, video_script) - print(output_file, sub_maker) \ No newline at end of file + output_file = merge_audio_files("test456", audio_files, total_duration, video_script) + print(output_file) diff --git a/app/services/llm.py b/app/services/llm.py index 66784a7..d3742df 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -792,14 +792,67 @@ def screen_matching(huamian: str, wenan: str, llm_provider: str): Return: list[script] - picture: 字段表示当前画面描述,与转录脚本保持一致 - timestamp: 字段表示某一段文案对应的画面的时间戳,不必和转录脚本的时间戳一致,应该充分考虑文案内容,匹配出与其描述最匹配的时间戳 + - 请注意,请严格的执行已经出现的画面不能重复出现,即生成的脚本中 timestamp 不能有重叠的部分。 - narration: 字段表示需要解说文案,每段解说文案尽量不要超过30字 - OST: 字段表示是否开启原声,即当 OST 字段为 true 时,narration 字段为空字符串,当 OST 为 false 时,narration 字段为对应的解说文案 - 注意,在画面匹配的过程中,需要适当的加入原声播放,使得解说和画面更加匹配,请按照 1:1 的比例,生成原声和解说的脚本内容。 - 注意,在时间戳匹配上,一定不能原样照搬“转录脚本”,应当适当的合并或者删减一些片段。 - 注意,第一个画面一定是原声播放并且时长不少于 20 
s,为了吸引观众,第一段一定是整个转录脚本中最精彩的片段。 - - 注意,匹配的画面不能重复出现,即生成的脚本中 timestamp 不能重复。 - 请以严格的 JSON 格式返回数据,不要包含任何注释、标记或其他字符。数据应符合 JSON 语法,可以被 json.loads() 函数直接解析, 不要添加 ```json 或其他标记。 """ % (huamian, wenan) + + prompt = """ + 你是一位拥有10年丰富经验的影视解说创作专家。你的任务是根据提供的视频转录脚本和解说文案,创作一个引人入胜的解说脚本。请按照以下要求完成任务: + +1. 输入数据: + - 视频转录脚本:包含时间戳、画面描述和人物台词 + - 解说文案:需要你进行匹配和编排的内容 + - 视频转录脚本和文案(由 XML 标记分隔)如下所示: + 视频转录脚本 + + %s + + 文案: + + %s + + +2. 输出要求: + - 格式:严格的JSON格式,可直接被json.loads()解析 + - 结构:list[script],其中script为字典类型 + - script字段: + { + "picture": "画面描述", + "timestamp": "时间戳", + "narration": "解说文案", + "OST": true/false + } + +3. 匹配规则: + a) 时间戳匹配: + - 根据文案内容选择最合适的画面时间段 + - 避免时间重叠,确保画面不重复出现 + - 适当合并或删减片段,不要完全照搬转录脚本 + b) 画面描述:与转录脚本保持一致 + c) 解说文案: + - 当OST为true时,narration为空字符串 + - 当OST为false时,narration为解说文案,但是要确保文案字数不要超过 30字,若文案较长,则添加到下一个片段 + d) OST(原声): + - 按1:1比例穿插原声和解说片段 + - 第一个片段必须是原声,时长不少于20秒 + - 选择整个视频中最精彩的片段作为开场 + +4. 创作重点: + - 确保解说与画面高度匹配 + - 巧妙安排原声和解说的交替,提升观众体验 + - 创造一个引人入胜、节奏紧凑的解说脚本 + +5. 注意事项: + - 严格遵守JSON格式,不包含任何注释或额外标记 + - 充分利用你的专业经验,创作出高质量、吸引人的解说内容 + +请基于以上要求,将提供的视频转录脚本和解说文案整合成一个专业、吸引人的解说脚本。你的创作将直接影响观众的观看体验,请发挥你的专业素养,创作出最佳效果。 + """ % (huamian, wenan) try: response = _generate_response(prompt, llm_provider) logger.success("匹配成功") @@ -830,5 +883,3 @@ if __name__ == "__main__": res = clean_model_output(res) aaa = json.loads(res) print(json.dumps(aaa, indent=2, ensure_ascii=False)) - # response = _generate_response("你好,介绍一下你自己") - # print(response) diff --git a/app/services/task.py b/app/services/task.py index fd53d1d..b6bc504 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -355,7 +355,8 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): logger.debug(f"解说时间戳列表: \n{time_list}") # 获取视频总时长(单位 s) total_duration = list_script[-1]['new_timestamp'] - total_duration = int(total_duration.split("-")[1].split(":")[0]) * 60 + int(total_duration.split("-")[1].split(":")[1]) + total_duration = int(total_duration.split("-")[1].split(":")[0]) * 60 + int( + 
total_duration.split("-")[1].split(":")[1]) except Exception as e: logger.error(f"无法读取视频json脚本,请检查配置是否正确。{e}") raise ValueError("无法读取视频json脚本,请检查配置是否正确") @@ -375,11 +376,9 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): logger.error( "音频文件为空,可能是网络不可用。如果您在中国,请使用VPN。或者手动选择 zh-CN-Yunjian-男性 音频") return - logger.info("合并音频") - audio_file, sub_maker = audio_merger.merge_audio_files(task_id, audio_files, total_duration, list_script) + logger.info(f"合并音频:\n\n {audio_files}") + audio_file = audio_merger.merge_audio_files(task_id, audio_files, total_duration, list_script) - # audio_duration = voice.get_audio_duration(sub_maker) - # audio_duration = math.ceil(audio_duration) sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30) subtitle_path = "" @@ -389,7 +388,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}") subtitle_fallback = False if subtitle_provider == "edge": - voice.create_subtitle(text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path) + voice.create_subtitle(text=video_script, sub_maker="sub_maker", subtitle_file=subtitle_path) # voice.create_subtitle( # text=video_script, # sub_maker_list=sub_maker_list, @@ -415,10 +414,6 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): logger.info("\n\n## 4. 裁剪视频") subclip_videos = [x for x in subclip_path_videos.values()] - # subclip_videos = material.clip_videos(task_id=task_id, - # timestamp_terms=time_list, - # origin_video=params.video_origin_path - # ) logger.debug(f"\n\n## 裁剪后的视频文件列表: \n{subclip_videos}") if not subclip_videos: @@ -433,17 +428,10 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): combined_video_paths = [] _progress = 50 - # for i in range(params.video_count): index = 1 combined_video_path = path.join(utils.task_dir(task_id), f"combined.mp4") logger.info(f"\n\n## 5. 
合并视频: => {combined_video_path}") - print("111", subclip_videos) - print("222", video_ost) - print("333", len(subclip_videos)) - print("444", len(video_ost)) - # for video_path, video_ost in zip(subclip_videos, video_ost): - # print(video_path) - # print(video_ost) + video.combine_clip_videos( combined_video_path=combined_video_path, video_paths=subclip_videos, @@ -502,18 +490,18 @@ if __name__ == "__main__": # start_subclip(task_id, params, subclip_path_videos=subclip_path_videos) task_id = "test456" - subclip_path_videos = {'00:00-00:06': './storage/cache_videos/vid-00_00-00_06.mp4', - '00:06-00:24': './storage/cache_videos/vid-00_06-00_24.mp4', - '01:28-01:36': './storage/cache_videos/vid-01_28-01_36.mp4', - '00:41-00:47': './storage/cache_videos/vid-00_41-00_47.mp4', - '01:58-02:03': './storage/cache_videos/vid-01_58-02_03.mp4', - '02:03-02:12': './storage/cache_videos/vid-02_03-02_12.mp4', - '02:40-02:57': './storage/cache_videos/vid-02_40-02_57.mp4', + subclip_path_videos = {'01:10-01:17': './storage/cache_videos/vid-01_10-01_17.mp4', + '01:58-02:04': './storage/cache_videos/vid-01_58-02_04.mp4', + '02:25-02:31': './storage/cache_videos/vid-02_25-02_31.mp4', + '01:28-01:33': './storage/cache_videos/vid-01_28-01_33.mp4', '03:14-03:18': './storage/cache_videos/vid-03_14-03_18.mp4', - '03:18-03:20': './storage/cache_videos/vid-03_18-03_20.mp4'} + '00:24-00:28': './storage/cache_videos/vid-00_24-00_28.mp4', + '03:02-03:08': './storage/cache_videos/vid-03_02-03_08.mp4', + '00:41-00:44': './storage/cache_videos/vid-00_41-00_44.mp4', + '02:12-02:25': './storage/cache_videos/vid-02_12-02_25.mp4'} params = VideoClipParams( - video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/test003.json", + video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/test004.json", video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4", ) start_subclip(task_id, params, subclip_path_videos=subclip_path_videos) diff --git 
a/app/services/voice.py b/app/services/voice.py index cf5c24d..e4776bf 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1410,7 +1410,8 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f for item in list_script: if not item['OST']: - timestamp = item['new_timestamp'].replace(':', '-') + # timestamp = item['new_timestamp'].replace(':', '@') + timestamp = item['new_timestamp'] audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3") # 检查文件是否已存在,如存在且不强制重新生成,则跳过 diff --git a/app/utils/check_script.py b/app/utils/check_script.py index e10bd3f..623c42a 100644 --- a/app/utils/check_script.py +++ b/app/utils/check_script.py @@ -1,37 +1,46 @@ import json from loguru import logger import os -from datetime import datetime, timedelta -import re - +from datetime import timedelta def time_to_seconds(time_str): - time_obj = datetime.strptime(time_str, "%M:%S") - return timedelta(minutes=time_obj.minute, seconds=time_obj.second).total_seconds() - + parts = list(map(int, time_str.split(':'))) + if len(parts) == 2: + return timedelta(minutes=parts[0], seconds=parts[1]).total_seconds() + elif len(parts) == 3: + return timedelta(hours=parts[0], minutes=parts[1], seconds=parts[2]).total_seconds() + raise ValueError(f"无法解析时间字符串: {time_str}") def seconds_to_time_str(seconds): - minutes, seconds = divmod(int(seconds), 60) - return f"{minutes:02d}:{seconds:02d}" + hours, remainder = divmod(int(seconds), 3600) + minutes, seconds = divmod(remainder, 60) + if hours > 0: + return f"{hours:02d}:{minutes:02d}:{seconds:02d}" + else: + return f"{minutes:02d}:{seconds:02d}" +def adjust_timestamp(start_time, duration): + start_seconds = time_to_seconds(start_time) + end_seconds = start_seconds + duration + return f"{start_time}-{seconds_to_time_str(end_seconds)}" -def check_script(file_path, total_duration): - with open(file_path, 'r', encoding='utf-8') as f: - data = json.load(f) +def estimate_audio_duration(text): + # 假设平均每个字符需要 0.2 秒 + 
return len(text) * 0.2 +def check_script(data, total_duration): errors = [] - ost_narrations = set() - last_end_time = 0 + time_ranges = [] - logger.info(f"开始检查文件: {file_path}") + logger.info("开始检查脚本") logger.info(f"视频总时长: {total_duration:.2f} 秒") logger.info("=" * 50) for i, item in enumerate(data, 1): logger.info(f"\n检查第 {i} 项:") - # 检查所有必需字段是否存在 - required_fields = ['picture', 'timestamp', 'narration', 'OST', 'new_timestamp'] + # 检查所有必需字段 + required_fields = ['picture', 'timestamp', 'narration', 'OST'] for field in required_fields: if field not in item: errors.append(f"第 {i} 项缺少 {field} 字段") @@ -39,160 +48,68 @@ def check_script(file_path, total_duration): else: logger.info(f" - {field}: {item[field]}") - # 检查 OST 为 false 的情况 + # 检查 OST 相关规则 if item.get('OST') == False: if not item.get('narration'): errors.append(f"第 {i} 项 OST 为 false,但 narration 为空") logger.info(" - 错误: OST 为 false,但 narration 为空") - elif len(item['narration']) > 30: - errors.append(f"第 {i} 项 OST 为 false,但 narration 超过 30 字") - logger.info(f" - 错误: OST 为 false,但 narration 超过 30 字 (当前: {len(item['narration'])} 字)") + elif len(item['narration']) > 60: + errors.append(f"第 {i} 项 OST 为 false,但 narration 超过 60 字") + logger.info(f" - 错误: OST 为 false,但 narration 超过 60 字 (当前: {len(item['narration'])} 字)") else: logger.info(" - OST 为 false,narration 检查通过") - - # 检查 OST 为 true 的情况 - if item.get('OST') == True: - if not item.get('narration').startswith('原声播放_'): - errors.append(f"第 {i} 项 OST 为 true,但 narration 不是 '原声播放_xxx' 格式") - logger.info(" - 错误: OST 为 true,但 narration 不是 '原声播放_xxx' 格式") - elif item['narration'] in ost_narrations: - errors.append(f"第 {i} 项 OST 为 true,但 narration '{item['narration']}' 不是唯一值") - logger.info(f" - 错误: OST 为 true,但 narration '{item['narration']}' 不是唯一值") + elif item.get('OST') == True: + if "原声播放_" not in item.get('narration'): + errors.append(f"第 {i} 项 OST 为 true,但 narration 不为空") + logger.info(" - 错误: OST 为 true,但 narration 不为空") else: logger.info(" - OST 为 true,narration 
检查通过") - ost_narrations.add(item['narration']) - # 检查 timestamp 是否重叠 + # 检查 timestamp if 'timestamp' in item: start, end = map(time_to_seconds, item['timestamp'].split('-')) - if start < last_end_time: - errors.append(f"第 {i} 项 timestamp '{item['timestamp']}' 与前一项重叠") - logger.info(f" - 错误: timestamp '{item['timestamp']}' 与前一项重叠") + if any((start < existing_end and end > existing_start) for existing_start, existing_end in time_ranges): + errors.append(f"第 {i} 项 timestamp '{item['timestamp']}' 与其他时间段重叠") + logger.info(f" - 错误: timestamp '{item['timestamp']}' 与其他时间段重叠") else: logger.info(f" - timestamp '{item['timestamp']}' 检查通过") - last_end_time = end + time_ranges.append((start, end)) - # 检查 timestamp 是否超过总时长 - if end > total_duration: - errors.append(f"第 {i} 项 timestamp '{item['timestamp']}' 超过总时长 {total_duration:.2f} 秒") - logger.info(f" - 错误: timestamp '{item['timestamp']}' 超过总时长 {total_duration:.2f} 秒") - else: - logger.info(f" - timestamp 在总时长范围内") + # if end > total_duration: + # errors.append(f"第 {i} 项 timestamp '{item['timestamp']}' 超过总时长 {total_duration:.2f} 秒") + # logger.info(f" - 错误: timestamp '{item['timestamp']}' 超过总时长 {total_duration:.2f} 秒") + # else: + # logger.info(f" - timestamp 在总时长范围内") - # 检查 new_timestamp 是否连续 - logger.info("\n检查 new_timestamp 连续性:") - last_end_time = 0 - for i, item in enumerate(data, 1): - if 'new_timestamp' in item: - start, end = map(time_to_seconds, item['new_timestamp'].split('-')) - if start != last_end_time: - errors.append(f"第 {i} 项 new_timestamp '{item['new_timestamp']}' 与前一项不连续") - logger.info(f" - 错误: 第 {i} 项 new_timestamp '{item['new_timestamp']}' 与前一项不连续") - else: - logger.info(f" - 第 {i} 项 new_timestamp '{item['new_timestamp']}' 连续性检查通过") - last_end_time = end + # 处理 narration 字段 + if item.get('OST') == False and item.get('narration'): + estimated_duration = estimate_audio_duration(item['narration']) + start_time = item['timestamp'].split('-')[0] + item['timestamp'] = adjust_timestamp(start_time, 
estimated_duration) + logger.info(f" - 已调整 timestamp 为 {item['timestamp']} (估算音频时长: {estimated_duration:.2f} 秒)") if errors: logger.info("检查结果:不通过") logger.info("发现以下错误:") for error in errors: logger.info(f"- {error}") - fix_script(file_path, data, errors) else: logger.info("检查结果:通过") logger.info("所有项目均符合规则要求。") - -def fix_script(file_path, data, errors): - logger.info("\n开始修复脚本...") - fixed_data = [] - for i, item in enumerate(data, 1): - if item['OST'] == False and (not item['narration'] or len(item['narration']) > 30): - if not item['narration']: - logger.info(f"第 {i} 项 narration 为空,需要人工参与修复。") - fixed_data.append(item) - else: - logger.info(f"修复第 {i} 项 narration 超过 30 字的问题...") - fixed_items = split_narration(item) - fixed_data.extend(fixed_items) - else: - fixed_data.append(item) - - for error in errors: - if not error.startswith("第") or "OST 为 false" not in error: - logger.info(f"需要人工参与修复: {error}") - - # 生成新的文件名 - file_name, file_ext = os.path.splitext(file_path) - new_file_path = f"{file_name}_revise{file_ext}" - - # 保存修复后的数据到新文件 - with open(new_file_path, 'w', encoding='utf-8') as f: - json.dump(fixed_data, f, ensure_ascii=False, indent=4) - - logger.info(f"\n脚本修复完成,已保存到新文件: {new_file_path}") - - -def split_narration(item): - narration = item['narration'] - chunks = smart_split(narration) - - start_time, end_time = map(time_to_seconds, item['timestamp'].split('-')) - new_start_time, new_end_time = map(time_to_seconds, item['new_timestamp'].split('-')) - - total_duration = end_time - start_time - new_total_duration = new_end_time - new_start_time - chunk_duration = total_duration / len(chunks) - new_chunk_duration = new_total_duration / len(chunks) - - fixed_items = [] - for i, chunk in enumerate(chunks): - new_item = item.copy() - new_item['narration'] = chunk - - chunk_start = start_time + i * chunk_duration - chunk_end = chunk_start + chunk_duration - new_item['timestamp'] = f"{seconds_to_time_str(chunk_start)}-{seconds_to_time_str(chunk_end)}" - - 
new_chunk_start = new_start_time + i * new_chunk_duration - new_chunk_end = new_chunk_start + new_chunk_duration - new_item['new_timestamp'] = f"{seconds_to_time_str(new_chunk_start)}-{seconds_to_time_str(new_chunk_end)}" - - fixed_items.append(new_item) - - return fixed_items - - -def smart_split(text, target_length=30): - # 使用正则表达式分割文本,保留标点符号 - segments = re.findall(r'[^,。!?,!?]+[,。!?,!?]?', text) - result = [] - current_chunk = "" - - for segment in segments: - if len(current_chunk) + len(segment) <= target_length: - current_chunk += segment - else: - if current_chunk: - result.append(current_chunk.strip()) - current_chunk = segment - - if current_chunk: - result.append(current_chunk.strip()) - - # 如果有任何chunk超过了目标长度,进行进一步的分割 - final_result = [] - for chunk in result: - if len(chunk) > target_length: - sub_chunks = [chunk[i:i + target_length] for i in range(0, len(chunk), target_length)] - final_result.extend(sub_chunks) - else: - final_result.append(chunk) - - return final_result + return errors, data if __name__ == "__main__": - file_path = "/Users/apple/Desktop/home/NarratoAI/resource/scripts/2024-0923-085036.json" + file_path = "/Users/apple/Desktop/home/NarratoAI/resource/scripts/test004.json" + + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + total_duration = 280 - check_script(file_path, total_duration) + + # check_script(data, total_duration) + + from app.utils.utils import add_new_timestamps + res = add_new_timestamps(data) + print(json.dumps(res, indent=4, ensure_ascii=False)) diff --git a/app/utils/utils.py b/app/utils/utils.py index ecf8aa8..e4ba419 100644 --- a/app/utils/utils.py +++ b/app/utils/utils.py @@ -10,6 +10,7 @@ import urllib3 from datetime import datetime, timedelta from app.models import const +from app.utils import check_script urllib3.disable_warnings() @@ -340,6 +341,9 @@ def add_new_timestamps(scenes): current_time = timedelta() updated_scenes = [] + # 保存脚本前先检查脚本是否正确 + check_script.check_script(scenes, 
calculate_total_duration(scenes)) + for scene in scenes: new_scene = scene.copy() # 创建场景的副本,以保留原始数据 start, end = new_scene['timestamp'].split('-') diff --git a/webui.sh b/webui.sh index 001eaae..c188c2b 100644 --- a/webui.sh +++ b/webui.sh @@ -1,7 +1,7 @@ #!/bin/bash # 从环境变量中加载VPN代理的配置URL -vpn_proxy_url="$VPN_PROXY_URL" +vpn_proxy_url="http://127.0.0.1:7890" # 检查是否成功加载 if [ -z "$vpn_proxy_url" ]; then echo "VPN代理配置URL未设置,请检查环境变量VPN_PROXY_URL" From dc4ce80ea5d0a44346e448a69deceb678f7c1767 Mon Sep 17 00:00:00 2001 From: linyq Date: Sun, 29 Sep 2024 14:39:20 +0800 Subject: [PATCH 16/21] =?UTF-8?q?=E5=89=AA=E8=BE=91=E9=80=BB=E8=BE=91?= =?UTF-8?q?=E8=BF=9B=E5=BA=A680%=EF=BC=9B=20=E5=BE=85=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E7=82=B9=EF=BC=9A=201.=20=E4=BC=98=E5=8C=96=E8=84=9A=E6=9C=AC-?= =?UTF-8?q?=E8=A7=A3=E8=AF=B4=E8=B4=A8=E9=87=8F=202.=20=E4=BC=98=E5=8C=96w?= =?UTF-8?q?ebui=E4=BD=93=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/models/schema.py | 3 +- app/services/subtitle.py | 80 +++++++++++++++++++++++++++------------- app/services/task.py | 22 ++--------- config.example.toml | 7 ++-- 4 files changed, 62 insertions(+), 50 deletions(-) diff --git a/app/models/schema.py b/app/models/schema.py index bf39e2b..682cd94 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -353,7 +353,7 @@ class VideoClipParams(BaseModel): bgm_file: Optional[str] = Field(default="", description="背景音乐文件") bgm_volume: Optional[float] = Field(default=0.2, description="背景音乐音量") - subtitle_enabled: Optional[bool] = Field(default=False, description="是否启用字幕") + subtitle_enabled: Optional[bool] = Field(default=True, description="是否启用字幕") subtitle_position: Optional[str] = Field(default="bottom", description="字幕位置") # top, bottom, center font_name: Optional[str] = Field(default="STHeitiMedium.ttc", description="字体名称") text_fore_color: Optional[str] = Field(default="#FFFFFF", description="文字前景色") @@ -365,4 +365,3 @@ class 
VideoClipParams(BaseModel): custom_position: float = Field(default=70.0, description="自定义位置") n_threads: Optional[int] = 8 # 线程数,有助于提升视频处理速度 - # paragraph_number: Optional[int] = 1 # 段落数量 diff --git a/app/services/subtitle.py b/app/services/subtitle.py index b915c6c..b9894b0 100644 --- a/app/services/subtitle.py +++ b/app/services/subtitle.py @@ -1,6 +1,7 @@ import json import os.path import re +import traceback from typing import Optional from faster_whisper import WhisperModel @@ -11,35 +12,53 @@ import google.generativeai as genai from app.config import config from app.utils import utils -model_size = config.whisper.get("model_size", "large-v3") +model_size = config.whisper.get("model_size", "faster-whisper-large-v2") device = config.whisper.get("device", "cpu") compute_type = config.whisper.get("compute_type", "int8") model = None def create(audio_file, subtitle_file: str = ""): + """ + 为给定的音频文件创建字幕文件。 + + 参数: + - audio_file: 音频文件的路径。 + - subtitle_file: 字幕文件的输出路径(可选)。如果未提供,将根据音频文件的路径生成字幕文件。 + + 返回: + 无返回值,但会在指定路径生成字幕文件。 + """ global model if not model: - model_path = f"{utils.root_dir()}/models/whisper-{model_size}" + model_path = f"{utils.root_dir()}/app/models/faster-whisper-large-v2" model_bin_file = f"{model_path}/model.bin" if not os.path.isdir(model_path) or not os.path.isfile(model_bin_file): - model_path = model_size + logger.error( + "请先下载 whisper 模型\n\n" + "********************************************\n" + "下载地址:https://huggingface.co/guillaumekln/faster-whisper-large-v2\n" + "存放路径:app/models \n" + "********************************************\n" + ) + return None logger.info( - f"loading model: {model_path}, device: {device}, compute_type: {compute_type}" + f"加载模型: {model_path}, 设备: {device}, 计算类型: {compute_type}" ) try: model = WhisperModel( - model_size_or_path=model_path, device=device, compute_type=compute_type + model_size_or_path=model_path, device=device, compute_type=compute_type, local_files_only=True ) except Exception as e: logger.error( - 
f"failed to load model: {e} \n\n" + f"加载模型失败: {e} \n\n" f"********************************************\n" - f"this may be caused by network issue. \n" - f"please download the model manually and put it in the 'models' folder. \n" - f"see [README.md FAQ](https://github.com/harry0703/NarratoAI) for more details.\n" + f"这可能是由网络问题引起的. \n" + f"请手动下载模型并将其放入 'app/models' 文件夹中。 \n" + f"see [README.md FAQ](https://github.com/linyqh/NarratoAI) for more details.\n" f"********************************************\n\n" + f"{traceback.format_exc()}" ) return None @@ -56,7 +75,7 @@ def create(audio_file, subtitle_file: str = ""): ) logger.info( - f"detected language: '{info.language}', probability: {info.language_probability:.2f}" + f"检测到的语言: '{info.language}', probability: {info.language_probability:.2f}" ) start = timer() @@ -139,6 +158,15 @@ def create(audio_file, subtitle_file: str = ""): def file_to_subtitles(filename): + """ + 将字幕文件转换为字幕列表。 + + 参数: + filename (str): 字幕文件的路径。 + + 返回: + list: 包含字幕序号、出现时间、和字幕文本的元组列表。 + """ if not filename or not os.path.isfile(filename): return [] @@ -313,28 +341,28 @@ def create_with_gemini(audio_file: str, subtitle_file: str = "", api_key: Option if __name__ == "__main__": - task_id = "task456" + task_id = "test456" task_dir = utils.task_dir(task_id) subtitle_file = f"{task_dir}/subtitle.srt" - audio_file = f"{task_dir}/audio.mp3" + audio_file = f"{task_dir}/audio.wav" subtitles = file_to_subtitles(subtitle_file) print(subtitles) - script_file = f"{task_dir}/script.json" - with open(script_file, "r") as f: - script_content = f.read() - s = json.loads(script_content) - script = s.get("script") + # script_file = f"{task_dir}/script.json" + # with open(script_file, "r") as f: + # script_content = f.read() + # s = json.loads(script_content) + # script = s.get("script") + # + # correct(subtitle_file, script) - correct(subtitle_file, script) - - subtitle_file = f"{task_dir}/subtitle-test.srt" + subtitle_file = f"{task_dir}/subtitle111.srt" 
create(audio_file, subtitle_file) - # 使用Gemini模型处理音频 - gemini_api_key = config.app.get("gemini_api_key") # 请替换为实际的API密钥 - gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key) - - if gemini_subtitle_file: - print(f"Gemini生成的字幕文件: {gemini_subtitle_file}") + # # 使用Gemini模型处理音频 + # gemini_api_key = config.app.get("gemini_api_key") # 请替换为实际的API密钥 + # gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key) + # + # if gemini_subtitle_file: + # print(f"Gemini生成的字幕文件: {gemini_subtitle_file}") diff --git a/app/services/task.py b/app/services/task.py index b6bc504..946b4cd 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -383,27 +383,11 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): subtitle_path = "" if params.subtitle_enabled: - subtitle_path = path.join(utils.task_dir(task_id), f"subtitle111.srt") + subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt") subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() logger.info(f"\n\n## 3. 
生成字幕、提供程序是: {subtitle_provider}") - subtitle_fallback = False - if subtitle_provider == "edge": - voice.create_subtitle(text=video_script, sub_maker="sub_maker", subtitle_file=subtitle_path) - # voice.create_subtitle( - # text=video_script, - # sub_maker_list=sub_maker_list, - # list_script=list_script, - # subtitle_file=subtitle_path - # ) - # if not os.path.exists(subtitle_path): - # subtitle_fallback = True - # logger.warning("找不到字幕文件,回退到whisper") - # - # if subtitle_provider == "whisper" or subtitle_fallback: - # # subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) - # subtitle.create_with_gemini(audio_file=audio_file, subtitle_file=subtitle_path, api_key=config.app.get("gemini_api_key", "")) - # logger.info("\n\n## 更正字幕") - # subtitle.correct(subtitle_file=subtitle_path, video_script=video_script) + # 使用 faster-whisper-large-v2 模型生成字幕 + subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) subtitle_lines = subtitle.file_to_subtitles(subtitle_path) if not subtitle_lines: diff --git a/config.example.toml b/config.example.toml index 50b2531..7b4e09e 100644 --- a/config.example.toml +++ b/config.example.toml @@ -73,9 +73,10 @@ deepseek_base_url = "https://api.deepseek.com" deepseek_model_name = "deepseek-chat" - # Subtitle Provider, "edge" or "whisper" + # Subtitle Provider, "whisper" # If empty, the subtitle will not be generated - subtitle_provider = "edge" + subtitle_provider = "faster-whisper-large-v2" + subtitle_enabled = true # # ImageMagick @@ -159,7 +160,7 @@ # model = WhisperModel(model_size, device="cpu", compute_type="int8") # recommended model_size: "large-v3" - model_size="large-v3" + model_size="faster-whisper-large-v2" # if you want to use GPU, set device="cuda" device="CPU" compute_type="int8" From decac3b11d0db7553be5644dc6933868073b4da1 Mon Sep 17 00:00:00 2001 From: linyq Date: Sun, 29 Sep 2024 18:34:36 +0800 Subject: [PATCH 17/21] =?UTF-8?q?=E5=AE=8C=E6=88=90=E4=BC=98=E5=8C=96webui?= 
=?UTF-8?q?=E4=BD=93=E9=AA=8C-=E5=89=AA=E8=BE=91=E9=80=BB=E8=BE=91?= =?UTF-8?q?=E8=BF=9B=E5=BA=A690%=EF=BC=9B=20=E5=BE=85=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E7=82=B9=EF=BC=9A=201.=20=E4=BC=98=E5=8C=96=E8=84=9A=E6=9C=AC-?= =?UTF-8?q?=E8=A7=A3=E8=AF=B4=E8=B4=A8=E9=87=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/services/llm.py | 289 ++++++++++++++++++--------------------- app/services/material.py | 15 +- app/services/subtitle.py | 6 +- app/services/task.py | 5 +- app/utils/utils.py | 60 +++++++- webui.py | 210 ++++++++++++++-------------- webui/i18n/zh.json | 5 +- 7 files changed, 317 insertions(+), 273 deletions(-) diff --git a/app/services/llm.py b/app/services/llm.py index d3742df..adb3f6d 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -31,7 +31,7 @@ Method = """ 文案的前三句,是整部电影的概括总结,2-3句介绍后,开始叙述故事剧情! 推荐新手(新号)做:(盘点型) 盘点全球最恐怖的10部电影 -盘点全球最科幻的10部电影 +盘���全球最科幻的10部电影 盘点全球最悲惨的10部电影 盘全球最值得看的10部灾难电影 盘点全球最值得看的10部励志电影 @@ -43,13 +43,13 @@ Method = """ 4.是什么样的一个人被豆瓣网友称之为史上最牛P的老太太,都70岁了还要去贩毒…… 5.他是M国历史上最NB/惨/猖狂/冤枉……的囚犯/抢劫犯/…… 6.这到底是一部什么样的影片,他一个人就拿了4个顶级奖项,第一季8.7分,第二季直接干到9.5分,11万人给出5星好评,一共也就6集,却斩获26项国际大奖,看过的人都说,他是近年来最好的xxx剧,几乎成为了近年来xxx剧的标杆。故事发生在…… -7.他是国产电影的巅峰佳作,更是许多80-90后的青春启蒙,曾入选《时代》周刊,获得年度佳片第一,可在国内却被尘封多年,至今为止都无法在各大视频网站看到完整资源,他就是《xxxxxx》 +7.他是国产电影的巅峰佳作,更是许多80-90后的青春启蒙,曾入选《��代》周刊,获得年度佳片第一,可在国内却被尘封多年,至今为止都无法在各大视频网站看到完整资源,他就是《xxxxxx》 8.这是一部让所有人看得荷尔蒙飙升的爽片…… 9.他被成为世界上最虐心绝望的电影,至今无人敢看第二遍,很难想象,他是根据真实事件改编而来…… 10.这大概是有史以来最令人不寒而栗的电影,当年一经放映,就点燃了无数人的怒火,不少观众不等影片放完,就愤然离场,它比《xxx》更让人绝望,比比《xxx》更让人xxx,能坚持看完全片的人,更是万中无一,包括我。甚至观影结束后,有无数人抵制投诉这部电影,认为影片的导演玩弄了他们的情感!他是顶级神作《xxxx》…… 11.这是X国有史以来最高赞的一部悬疑电影,然而却因为某些原因,国内90%的人,没能看过这部片子,他就是《xxx》…… 12.有这样一部电影,这辈子,你绝对不想再看第二遍,并不是它剧情烂俗,而是它的结局你根本承受不起/想象不到……甚至有80%的观众在观影途中情绪崩溃中途离场,更让许多同行都不想解说这部电影,他就是大名鼎鼎的暗黑神作《xxx》… -13.它被誉为史上最牛悬疑片无数人在看完它时候,一个月不敢照镜子,这样一部仅适合部分年龄段观看的影片,究竟有什么样的魅力,竟然获得某瓣8.2的高分,很多人说这部电影到处都是看点,他就是《xxx》…. 
+13.它被誉为史上最牛悬疑片无数人在看完它时候,一个月不敢照镜��,这样一部仅适合部分年龄段观看的影片,究竟有什么样的魅力,竟然获得某瓣8.2的高分,很多人说这部电影到处都是看点,他就是《xxx》…. 14.这是一部在某瓣上被70万人打出9.3分的高分的电影……到底是一部什么样的电影,能够在某瓣上被70万人打出9.3分的高分…… 15.这是一部细思极恐的科幻大片,整部电影颠覆你的三观,它的名字叫…… 16.史上最震撼的灾难片,每一点都不舍得快进的电影,他叫…… @@ -66,7 +66,7 @@ Method = """ 2.这是一部印度高分悬疑片, 3.这部电影原在日本因为……而被下架, 4.这是韩国最恐怖的犯罪片, -5.这是最近国产片评分最高的悬疑片 +5.这是最近国产片评分最高的悬疑�� 以上均按照影片国家来区分,然后简单介绍下主题。就可以开始直接叙述作品。也是一个很不错的方法! ### 方式四:如何自由发挥 @@ -97,7 +97,7 @@ Method = """ 后面水平越来越高的时候,可以进行人生道理的讲评。 比如:这部电影告诉我们…… -类似于哲理性质的,作为一个总结! +类似于哲理性质��作为一个总结! 也可以把最后的影视反转,原生放出来,留下悬念。 比如:也可以总结下这部短片如何的好,推荐/值得大家去观看之类的话语。 @@ -426,7 +426,7 @@ def compress_video(input_path: str, output_path: str): def generate_script( - video_path: str, video_plot: str, video_name: str, language: str = "zh-CN", progress_text: st.empty = st.empty() + video_path: str, video_plot: str, video_name: str, language: str = "zh-CN", progress_callback=None ) -> str: """ 生成视频剪辑脚本 @@ -435,73 +435,102 @@ def generate_script( video_plot: 视频剧情内容 video_name: 视频名称 language: 语言 + progress_callback: 进度回调函数 Returns: str: 生成的脚本 """ # 1. 压缩视频 - progress_text.text("压缩视频中...") compressed_video_path = f"{os.path.splitext(video_path)[0]}_compressed.mp4" compress_video(video_path, compressed_video_path) - # # 2. 
转录视频 - # transcription = gemini_video_transcription( - # video_name=video_name, - # video_path=compressed_video_path, - # language=language, - # progress_text=progress_text, - # llm_provider_video="gemini" - # ) - transcription = """ -[{"timestamp": "00:00-00:06", "picture": "一个穿着蓝色囚服,戴着手铐的人在房间里走路。", "speech": ""}, -{"timestamp": "00:06-00:09", "picture": "一个穿着蓝色囚服,戴着手铐的人,画面上方显示“李自忠 银行抢劫犯”。", "speech": "李自忠 银行抢劫一案 现在宣判"}, -{"timestamp": "00:09-00:12", "picture": "一个穿着黑色西装,打着红色领带的女人,坐在一个牌子上,牌子上写着“书记员”,身后墙上挂着“国徽”。", "speech": "全体起立"}, -{"timestamp": "00:12-00:15", "picture": "一个穿着黑色法官服的男人坐在一个牌子后面,牌子上写着“审判长”,身后墙上挂着“国徽”。法庭上,很多人站着。", "speech": ""}, -{"timestamp": "00:15-00:19", "picture": "一个穿着黑色西装,打着红色领带的女人,坐在一个牌子上,牌子上写着“书记员”,身后墙上挂着“国徽”。法庭上,很多人站着。", "speech": "本庭二审判决如下 被告李自忠 犯抢劫银行罪"}, -{"timestamp": "00:19-00:24", "picture": "一个穿着蓝色囚服,戴着手铐的人,画面上方显示“李自忠 银行抢劫犯”。", "speech": "维持一审判决 判处有期徒刑 二十年"}, -{"timestamp": "00:24-00:27", "picture": "一个穿着黑色法官服的男人坐在一个牌子后面,牌子上写着“审判长”,他敲了一下法槌。", "speech": ""}, -{"timestamp": "00:27-00:32", "picture": "一个穿着蓝色囚服,戴着手铐的人,画面上方显示“李自忠 银行抢劫犯”。", "speech": "我们要让她们牢底坐穿 越父啊越父 你一个平头老百姓 也敢跟外资银行做对 真是不知天高地厚"}, -{"timestamp": "00:32-00:41", "picture": "一个穿着蓝色囚服,戴着手铐的人跪在地上。", "speech": "我要让她们牢底坐穿 越父啊越父 你一个平头老百姓 也敢跟外资银行做对 真是不知天高地厚"}, -{"timestamp": "00:41-00:47", "picture": "两个警察押解着一个穿着蓝色囚服,戴着手铐的人走在路上,一个女记者在路边报道新闻。", "speech": "李先生 这里是孔雀卫视 这里是黄金眼819新闻直播间 这里是浙江卫视新闻直播间 近日李自忠案引发社会热议"}, -{"timestamp": "00:47-01:03", "picture": "一个穿着灰色外套的男人坐在银行柜台前,和银行工作人员说话。画面中还穿插着女记者在路边报道新闻的画面。", "speech": "李自忠案引发社会热议 李自忠在去银行取钱的时候 由于他拿的是儿子的存折 所以银行要求李自忠证明他的儿子就是他的儿子 我说取不了就是取不了啊 这是你儿子的存折啊 你要证明你儿子是你儿子啊"}, -{"timestamp": "01:03-01:10", "picture": "一个穿着灰色外套的男人坐在银行柜台前,和银行工作人员说话。画面中还穿插着女记者在路边报道新闻的画面。", "speech": "李自忠提供了身份证账户户口本后 银行都不认可他的儿子是他的儿子 就在这个时候 银行发生一起抢劫案"}, -{"timestamp": "01:10-01:17", "picture": "三个戴着帽子和口罩的劫匪持枪闯入银行,银行里的人都很害怕,纷纷蹲下躲避。", "speech": "都给我蹲下 老实点 把钱给我交出来"}, -{"timestamp": "01:17-01:28", "picture": "女记者在路边报道新闻,画面中穿插着银行抢劫案的画面。", "speech": "劫匪看到一旁大哭的李自忠 
得知他是因为儿子需要治病才取钱的时候 给了他一打钱 怎么 你儿子在医院等着钱救命啊 银行不给取啊"}, -{"timestamp": "01:28-01:36", "picture": "一个戴着黑色帽子和口罩的劫匪,拿着枪,给一个穿着灰色外套的男人一叠钱。", "speech": "银行不给取啊 好了 给儿子看病去 李自忠在把钱给儿子交完药费后被捕"}, -{"timestamp": "01:36-01:58", "picture": "两个警察押解着一个穿着蓝色囚服,戴着手铐的男人走在路上,一个女记者在路边报道新闻。", "speech": "目前一审二审都维持原判 判处有期徒刑二十年 对此你有什么想说的吗 他怎么证明他儿子是他儿子 要是银行早点把钱给我 我也不会遇到劫匪 我儿子还得救命 不是的 儿子 儿子 儿子"}, -{"timestamp": "01:58-02:03", "picture": "两个警察押解着一个穿着蓝色囚服,戴着手铐的男人走在路上,一个女记者在路边报道新闻。男人情绪激动,大声喊叫。", "speech": "儿子 儿子 儿子"}, -{"timestamp": "02:03-02:12", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边。画面中穿插着新闻报道的画面。", "speech": "近日李自忠案引发社会热议 李自忠在去银行取钱的时候 银行要求李自忠证明他的儿子就是他的儿子"}, -{"timestamp": "02:12-02:25", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生站在门口。", "speech": "爸 这家人也真够可怜的 当爹的坐牢 这儿子 恐怕要成植物人了"}, -{"timestamp": "02:25-02:31", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生站在门口。", "speech": "医生啊 我弟弟的情况怎么样 我先看看"}, -{"timestamp": "02:31-02:40", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生正在给男人做检查。", "speech": ""}, -{"timestamp": "02:40-02:46", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生正在给男人做检查。", "speech": "不太理想啊 你弟弟想要醒过来 希望渺茫"}, -{"timestamp": "02:46-02:57", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生正在给男人做检查。", "speech": "这 麟木 麟木你别吓姐啊麟木 麟木"}, -{"timestamp": "02:57-03:02", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生正在给男人做检查。画面中穿插着新闻报道的画面。", "speech": "麟木 儿子 麟木你别吓姐啊麟木"}, -{"timestamp": "03:02-03:08", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生正在给男人做检查。画面中穿插着新闻报道的画面。女人情绪激动,大声哭泣。", "speech": "儿子 麟木你别吓姐啊麟木 儿子"}, -{"timestamp": "03:08-03:14", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,一个穿着粉色上衣的女人站在病床边,一个白头发的医生正在给男人做检查。画面中穿插着新闻报道的画面。女人情绪激动,大声哭泣。", "speech": "儿子"}, -{"timestamp": "03:14-03:18", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,画面变成紫色光效。", "speech": ""}, -{"timestamp": "03:18-03:20", "picture": "一个病房里,一个年轻男人躺在病床上,戴着呼吸机,他突然睁开了眼睛。", "speech": ""}] - """ - # # 清理压缩后的视频文件 - # try: - # 
os.remove(compressed_video_path) - # except OSError as e: - # logger.warning(f"删除压缩视频文件失败: {e}") + # 在关键步骤更新进度 + if progress_callback: + progress_callback(15, "压缩完成") # 例如,在压缩视频后 + + # 2. 转录视频 + transcription = gemini_video_transcription( + video_name=video_name, + video_path=compressed_video_path, + language=language, + llm_provider_video="gemini", + progress_callback=progress_callback + ) + if progress_callback: + progress_callback(60, "生成解说文案...") # 例如,在转录视频后 # 3. 编写解说文案 - progress_text.text("解说文案中...") script = writing_short_play(video_plot, video_name, "openai", count=300) + # 在关键步骤更新进度 + if progress_callback: + progress_callback(70, "匹配画面...") # 例如,在生成脚本后 + # 4. 文案匹配画面 if transcription != "": - progress_text.text("画面匹配中...") matched_script = screen_matching(huamian=transcription, wenan=script, llm_provider="openai") - + # 在关键步骤更新进度 + if progress_callback: + progress_callback(80, "匹配成功") return matched_script else: return "" +def gemini_video_transcription(video_name: str, video_path: str, language: str, llm_provider_video: str, progress_callback=None): + ''' + 使用 gemini-1.5-xxx 进行视频画面转录 + ''' + api_key = config.app.get("gemini_api_key") + gemini.configure(api_key=api_key) + + prompt = """ + 请转录音频,包括时间戳,并提供视觉描述,然后以 JSON 格式输出,当前视频中使用的语言为 %s。 + + 在转录视频时,请通过确保以下条件来完成转录: + 1. 画面描述使用语言: %s 进行输出。 + 2. 同一个画面合并为一个转录记录。 + 3. 使用以下 JSON schema: + Graphics = {"timestamp": "MM:SS-MM:SS"(时间戳格式), "picture": "str"(画面描述), "speech": "str"(台词,如果没有人说话,则使用空字符串。)} + Return: list[Graphics] + 4. 
请以严格的 JSON 格式返回数据,不要包含任何注释、标记或其他字符。数据应符合 JSON 语法,可以被 json.loads() 函数直接解析, 不要添加 ```json 或其他标记。 + """ % (language, language) + + logger.debug(f"视频名称: {video_name}") + try: + if progress_callback: + progress_callback(20, "上传视频至 Google cloud") + gemini_video_file = gemini.upload_file(video_path) + logger.debug(f"视频 {gemini_video_file.name} 上传至 Google cloud 成功, 开始解析...") + while gemini_video_file.state.name == "PROCESSING": + gemini_video_file = gemini.get_file(gemini_video_file.name) + if progress_callback: + progress_callback(30, "上传成功, 开始解析") # 更新进度为20% + if gemini_video_file.state.name == "FAILED": + raise ValueError(gemini_video_file.state.name) + elif gemini_video_file.state.name == "ACTIVE": + if progress_callback: + progress_callback(40, "解析完成, 开始转录...") # 更新进度为30% + logger.debug("解析完成, 开始转录...") + except ResumableUploadError as err: + logger.error(f"上传视频至 Google cloud 失败, 用户的位置信息不支持用于该API; \n{traceback.format_exc()}") + return False + except FailedPrecondition as err: + logger.error(f"400 用户位置不支持 Google API 使用。\n{traceback.format_exc()}") + return False + + if progress_callback: + progress_callback(50, "开始转录") + try: + response = _generate_response_video(prompt=prompt, llm_provider_video=llm_provider_video, video_file=gemini_video_file) + logger.success("视频转录成功") + logger.debug(response) + print(type(response)) + return response + except Exception as err: + return handle_exception(err) + + def generate_terms(video_subject: str, video_script: str, amount: int = 5) -> List[str]: prompt = f""" # Role: Video Search Terms Generator @@ -652,56 +681,6 @@ def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot return response -def gemini_video_transcription(video_name: str, video_path: str, language: str, llm_provider_video: str, progress_text: st.empty = ""): - ''' - 使用 gemini-1.5-xxx 进行视频画面转录 - ''' - api_key = config.app.get("gemini_api_key") - gemini.configure(api_key=api_key) - - prompt = """ - 请转录音频,包括时间戳,并提供视觉描述,然后以 JSON 格式输出,当前视频中使用的语言为 
%s。 - - 在转录视频时,请通过确保以下条件来完成转录: - 1. 画面描述使用语言: %s 进行输出。 - 2. 同一个画面合并为一个转录记录。 - 3. 使用以下 JSON schema: - Graphics = {"timestamp": "MM:SS-MM:SS"(时间戳格式), "picture": "str"(画面描述), "speech": "str"(台词,如果没有人说话,则使用空字符串。)} - Return: list[Graphics] - 4. 请以严格的 JSON 格式返回数据,不要包含任何注释、标记或其他字符。数据应符合 JSON 语法,可以被 json.loads() 函数直接解析, 不要添加 ```json 或其他标记。 - """ % (language, language) - - logger.debug(f"视频名称: {video_name}") - try: - progress_text.text("上传视频中...") - gemini_video_file = gemini.upload_file(video_path) - logger.debug(f"视频 {gemini_video_file.name} 上传至 Google cloud 成功, 开始解析...") - while gemini_video_file.state.name == "PROCESSING": - gemini_video_file = gemini.get_file(gemini_video_file.name) - progress_text.text(f"解析视频中, 当前状态: {gemini_video_file.state.name}") - if gemini_video_file.state.name == "FAILED": - raise ValueError(gemini_video_file.state.name) - elif gemini_video_file.state.name == "ACTIVE": - progress_text.text("解析完成") - logger.debug("解析完成, 开始转录...") - except ResumableUploadError as err: - logger.error(f"上传视频至 Google cloud 失败, 用户的位置信息不支持用于该API; \n{traceback.format_exc()}") - return False - except FailedPrecondition as err: - logger.error(f"400 用户位置不支持 Google API 使用。\n{traceback.format_exc()}") - return False - - progress_text.text("视频转录中...") - try: - response = _generate_response_video(prompt=prompt, llm_provider_video=llm_provider_video, video_file=gemini_video_file) - logger.success("视频转录成功") - logger.debug(response) - print(type(response)) - return response - except Exception as err: - return handle_exception(err) - - def writing_movie(video_plot, video_name, llm_provider): """ 影视解说(电影解说) @@ -801,58 +780,58 @@ def screen_matching(huamian: str, wenan: str, llm_provider: str): - 请以严格的 JSON 格式返回数据,不要包含任何注释、标记或其他字符。数据应符合 JSON 语法,可以被 json.loads() 函数直接解析, 不要添加 ```json 或其他标记。 """ % (huamian, wenan) - prompt = """ - 你是一位拥有10年丰富经验的影视解说创作专家。你的任务是根据提供的视频转录脚本和解说文案,创作一个引人入胜的解说脚本。请按照以下要求完成任务: - -1. 
输入数据: - - 视频转录脚本:包含时间戳、画面描述和人物台词 - - 解说文案:需要你进行匹配和编排的内容 - - 视频转录脚本和文案(由 XML 标记分隔)如下所示: - 视频转录脚本 - - %s - - 文案: - - %s - - -2. 输出要求: - - 格式:严格的JSON格式,可直接被json.loads()解析 - - 结构:list[script],其中script为字典类型 - - script字段: - { - "picture": "画面描述", - "timestamp": "时间戳", - "narration": "解说文案", - "OST": true/false - } - -3. 匹配规则: - a) 时间戳匹配: - - 根据文案内容选择最合适的画面时间段 - - 避免时间重叠,确保画面不重复出现 - - 适当合并或删减片段,不要完全照搬转录脚本 - b) 画面描述:与转录脚本保持一致 - c) 解说文案: - - 当OST为true时,narration为空字符串 - - 当OST为false时,narration为解说文案,但是要确保文案字数不要超过 30字,若文案较长,则添加到下一个片段 - d) OST(原声): - - 按1:1比例穿插原声和解说片段 - - 第一个片段必须是原声,时长不少于20秒 - - 选择整个视频中最精彩的片段作为开场 - -4. 创作重点: - - 确保解说与画面高度匹配 - - 巧妙安排原声和解说的交替,提升观众体验 - - 创造一个引人入胜、节奏紧凑的解说脚本 - -5. 注意事项: - - 严格遵守JSON格式,不包含任何注释或额外标记 - - 充分利用你的专业经验,创作出高质量、吸引人的解说内容 - -请基于以上要求,将提供的视频转录脚本和解说文案整合成一个专业、吸引人的解说脚本。你的创作将直接影响观众的观看体验,请发挥你的专业素养,创作出最佳效果。 - """ % (huamian, wenan) +# prompt = """ +# 你是一位拥有10年丰富经验的影视解说创作专家。你的任务是根据提供的视频转录脚本和解说文案,创作一个引人入胜的解说脚本。请按照以下要求完成任务: +# +# 1. 输入数据: +# - 视频转录脚本:包含时间戳、画面描述和人物台词 +# - 解说文案:需要你进行匹配和编排的内容 +# - 视频转录脚本和文案(由 XML 标记分隔)如下所示: +# 视频转录脚本 +# +# %s +# +# 文案: +# +# %s +# +# +# 2. 输出要求: +# - 格式:严格的JSON格式,可直接被json.loads()解析 +# - 结构:list[script],其中script为字典类型 +# - script字段: +# { +# "picture": "画面描述", +# "timestamp": "时间戳", +# "narration": "解说文案", +# "OST": true/false +# } +# +# 3. 匹配规则: +# a) 时间戳匹配: +# - 根据文案内容选择最合适的画面时间段 +# - 避免时间重叠,确保画面不重复出现 +# - 适当合并或删减片段,不要完全照搬转录脚本 +# b) 画面描述:与转录脚本保持一致 +# c) 解说文案: +# - 当OST为true时,narration为空字符串 +# - 当OST为false时,narration为解说文案,但是要确保文案字数不要超过 30字,若文案较长,则添加到下一个片段 +# d) OST(原声): +# - 按1:1比例穿插原声和解说片段 +# - 第一个片段必须是原声,时长不少于20秒 +# - 选择整个视频中最精彩的片段作为开场 +# +# 4. 创作重点: +# - 确保解说与画面高度匹配 +# - 巧妙安排原声和解说的交替,提升观众体验 +# - 创造一个引人入胜、节奏紧凑的解说脚本 +# +# 5. 
注意事项: +# - 严格遵守JSON格式,不包含任何注释或额外标记 +# - 充分利用你的专业经验,创作出高质量、吸引人的解说内容 +# +# 请基于以上要求,将提供的视频转录脚本和解说文案整合成一个专业、吸引人的解说脚本。你的创作将直接影响观众的观看体验,请发挥你的专业素养,创作出最佳效果。 +# """ % (huamian, wenan) try: response = _generate_response(prompt, llm_provider) logger.success("匹配成功") diff --git a/app/services/material.py b/app/services/material.py index d63e6fc..bc4d118 100644 --- a/app/services/material.py +++ b/app/services/material.py @@ -267,7 +267,6 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di if not os.path.exists(save_dir): os.makedirs(save_dir) - # url_hash = utils.md5(str(uuid.uuid4())) video_id = f"vid-{timestamp.replace(':', '_')}" video_path = f"{save_dir}/{video_id}.mp4" @@ -278,7 +277,7 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di # 剪辑视频 start, end = utils.split_timestamp(timestamp) video = VideoFileClip(origin_video).subclip(start, end) - video.write_videofile(video_path) + video.write_videofile(video_path, logger=None) # 禁用 MoviePy 的内置日志 if os.path.getsize(video_path) > 0 and os.path.exists(video_path): try: @@ -297,20 +296,21 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di return {} -def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, ) -> dict: +def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, progress_callback=None): """ 剪辑视频 Args: task_id: 任务id timestamp_terms: 需要剪辑的时间戳列表,如:['00:00-00:20', '00:36-00:40', '07:07-07:22'] origin_video: 原视频路径 + progress_callback: 进度回调函数 Returns: 剪辑后的视频路径 """ video_paths = {} - for item in timestamp_terms: - logger.info(f"需要裁剪 '{origin_video}' 为 {len(timestamp_terms)} 个视频") + total_items = len(timestamp_terms) + for index, item in enumerate(timestamp_terms): material_directory = config.app.get("material_directory", "").strip() if material_directory == "task": material_directory = utils.task_dir(task_id) @@ -318,11 +318,14 @@ def clip_videos(task_id: str, timestamp_terms: List[str], 
origin_video: str, ) - material_directory = "" try: - logger.info(f"clip video: {item}") saved_video_path = save_clip_video(timestamp=item, origin_video=origin_video, save_dir=material_directory) if saved_video_path: logger.info(f"video saved: {saved_video_path}") video_paths.update(saved_video_path) + + # 更新进度 + if progress_callback: + progress_callback(index + 1, total_items) except Exception as e: logger.error(f"视频裁剪失败: {utils.to_json(item)} => {str(e)}") return {} diff --git a/app/services/subtitle.py b/app/services/subtitle.py index b9894b0..c792667 100644 --- a/app/services/subtitle.py +++ b/app/services/subtitle.py @@ -48,7 +48,10 @@ def create(audio_file, subtitle_file: str = ""): ) try: model = WhisperModel( - model_size_or_path=model_path, device=device, compute_type=compute_type, local_files_only=True + model_size_or_path=model_path, + device=device, + compute_type=compute_type, + local_files_only=True ) except Exception as e: logger.error( @@ -72,6 +75,7 @@ def create(audio_file, subtitle_file: str = ""): word_timestamps=True, vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500), + initial_prompt="以下是普通话的句子" ) logger.info( diff --git a/app/services/task.py b/app/services/task.py index 946b4cd..78941f8 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -2,6 +2,7 @@ import math import json import os.path import re +import traceback from os import path from loguru import logger @@ -323,7 +324,7 @@ def start(task_id, params: VideoParams, stop_at: str = "video"): return kwargs -def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): +def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: list): """ 后台任务(自动剪辑视频进行剪辑) @@ -340,6 +341,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): logger.info("\n\n## 1. 
加载视频脚本") video_script_path = path.join(params.video_clip_json_path) + # video_script_path = video_clip_json_path # 判断json文件是否存在 if path.exists(video_script_path): try: @@ -361,6 +363,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): logger.error(f"无法读取视频json脚本,请检查配置是否正确。{e}") raise ValueError("无法读取视频json脚本,请检查配置是否正确") else: + logger.error(f"video_script_path: {video_script_path} \n\n", traceback.format_exc()) raise ValueError("解说脚本不存在!请检查配置是否正确。") logger.info("\n\n## 2. 生成音频列表") diff --git a/app/utils/utils.py b/app/utils/utils.py index e4ba419..3a0600f 100644 --- a/app/utils/utils.py +++ b/app/utils/utils.py @@ -1,9 +1,12 @@ import locale import os +import traceback + import requests import threading from typing import Any from loguru import logger +import streamlit as st import json from uuid import uuid4 import urllib3 @@ -11,6 +14,7 @@ from datetime import datetime, timedelta from app.models import const from app.utils import check_script +from app.services import material urllib3.disable_warnings() @@ -372,10 +376,52 @@ def add_new_timestamps(scenes): def clean_model_output(output): - """ - 模型输出包含 ```json 标记时的处理 - """ - if "```json" in output: - print("##########") - output = output.replace("```json", "").replace("```", "") - return output.strip() + # 移除可能的代码块标记 + output = output.strip('```json').strip('```') + # 移除开头和结尾的空白字符 + output = output.strip() + return output + + +def cut_video(params, progress_callback=None): + try: + task_id = str(uuid4()) + st.session_state['task_id'] = task_id + + if not st.session_state.get('video_clip_json'): + raise ValueError("视频脚本不能为空") + + video_script_list = st.session_state['video_clip_json'] + time_list = [i['timestamp'] for i in video_script_list] + + total_clips = len(time_list) + + def clip_progress(current, total): + progress = int((current / total) * 100) + if progress_callback: + progress_callback(progress) + + subclip_videos = material.clip_videos( + task_id=task_id, + 
timestamp_terms=time_list, + origin_video=params.video_origin_path, + progress_callback=clip_progress + ) + + if subclip_videos is None: + raise ValueError("裁剪视频失败") + + st.session_state['subclip_videos'] = subclip_videos + + for i, video_script in enumerate(video_script_list): + try: + video_script['path'] = subclip_videos[video_script['timestamp']] + except KeyError as err: + logger.error(f"裁剪视频失败: {err}") + raise ValueError(f"裁剪视频失败: {err}") + + return task_id, subclip_videos + + except Exception as e: + logger.error(f"视频裁剪过程中发生错误: {traceback.format_exc()}") + raise diff --git a/webui.py b/webui.py index c4853d8..4410c2d 100644 --- a/webui.py +++ b/webui.py @@ -61,13 +61,11 @@ config_file = os.path.join(root_dir, "webui", ".streamlit", "webui.toml") system_locale = utils.get_system_locale() if 'video_clip_json' not in st.session_state: - st.session_state['video_clip_json'] = '' + st.session_state['video_clip_json'] = [] if 'video_plot' not in st.session_state: st.session_state['video_plot'] = '' if 'ui_language' not in st.session_state: st.session_state['ui_language'] = config.ui.get("language", system_locale) -if 'script_generation_status' not in st.session_state: - st.session_state['script_generation_status'] = "" def get_all_fonts(): @@ -124,7 +122,7 @@ def init_log(): _lvl = "DEBUG" def format_record(record): - # 获取日志记录中的文件全路径 + # 获取日志记录中的文件全���径 file_path = record["file"].path # 将绝对路径转换为相对于项目根目录的路径 relative_path = os.path.relpath(file_path, root_dir) @@ -272,7 +270,7 @@ with left_panel: # 按创建时间降序排序 script_list.sort(key=lambda x: x["ctime"], reverse=True) - # 脚本文件 下拉框 + # ��本文件 下拉框 script_path = [(tr("Auto Generate"), ""), ] for file in script_list: display_name = file['file'].replace(root_dir, "") @@ -282,8 +280,9 @@ with left_panel: options=range(len(script_path)), # 使用索引作为内部选项值 format_func=lambda x: script_path[x][0] # 显示给用户的是标签 ) - params.video_clip_json = script_path[selected_script_index][1] - video_json_file = params.video_clip_json + 
params.video_clip_json_path = script_path[selected_script_index][1] + config.app["video_clip_json_path"] = params.video_clip_json_path + st.session_state['video_clip_json_path'] = params.video_clip_json_path # 视频文件处理 video_files = [] @@ -301,18 +300,20 @@ with left_panel: }) # 按创建时间降序排序 video_list.sort(key=lambda x: x["ctime"], reverse=True) - video_path = [("None", ""), (tr("Upload Local Files"), "local")] - for code in [file['file'] for file in video_list]: - video_path.append((code, code)) + video_path = [(tr("None"), ""), (tr("Upload Local Files"), "local")] + for file in video_list: + display_name = file['file'].replace(root_dir, "") + video_path.append((display_name, file['file'])) # 视频文件 selected_video_index = st.selectbox(tr("Video File"), index=0, options=range(len(video_path)), # 使用索引作为内部选项值 - format_func=lambda x: video_path[x][0] # 显示给用户的是标 + format_func=lambda x: video_path[x][0] # 显示给用户的是标签 ) params.video_origin_path = video_path[selected_video_index][1] config.app["video_origin_path"] = params.video_origin_path + st.session_state['video_origin_path'] = params.video_origin_path # 从本地上传 mp4 文件 if params.video_origin_path == "local": @@ -347,40 +348,73 @@ with left_panel: ) # 生成视频脚本 - st.session_state['script_generation_status'] = "开始生成视频脚本" - if st.button(tr("Video Script Generate"), key="auto_generate_script"): - with st.spinner("正在生成脚本..."): - # 这里可以用 st.empty() 来动态更新文本 - progress_text = st.empty() - progress_text.text("正在处理...") + if st.session_state['video_clip_json_path']: + generate_button_name = tr("Video Script Load") + else: + generate_button_name = tr("Video Script Generate") + if st.button(generate_button_name, key="auto_generate_script"): + progress_bar = st.progress(0) + status_text = st.empty() - if video_json_file == "" and params.video_origin_path != "": - progress_text.text("开始压缩...") - # 使用大模型生成视频脚本 - script = llm.generate_script( - video_path=params.video_origin_path, - video_plot=video_plot, - video_name=video_name, - 
language=params.video_language, - progress_text=progress_text - ) - if script is None: - st.error("生成脚本失败,请检查日志") - st.stop() - st.session_state['video_clip_json'] = script - cleaned_string = script.strip("```json").strip("```") - st.session_state['video_script_list'] = json.loads(cleaned_string) + def update_progress(progress: float, message: str = ""): + progress_bar.progress(progress) + if message: + status_text.text(f"{progress}% - {message}") else: - with open(video_json_file, 'r', encoding='utf-8') as f: - script = f.read() - st.session_state['video_clip_json'] = script - cleaned_string = script.strip("```json").strip("```") - st.session_state['video_script_list'] = json.loads(cleaned_string) + status_text.text(f"进度: {progress}%") + + try: + with st.spinner("正在生成脚本..."): + if not video_name: + st.warning("视频名称不能为空") + st.stop() + if not video_plot: + st.warning("视频剧情不能为空") + st.stop() + if params.video_clip_json_path == "" and params.video_origin_path != "": + update_progress(10, "压缩视频中...") + # 使用大模型生成视频脚本 + script = llm.generate_script( + video_path=params.video_origin_path, + video_plot=video_plot, + video_name=video_name, + language=params.video_language, + progress_callback=update_progress + ) + if script is None: + st.error("生成脚本失败,请检查日志") + st.stop() + else: + update_progress(90) + + script = utils.clean_model_output(script) + st.session_state['video_clip_json'] = json.loads(script) + else: + # 从本地加载 + with open(params.video_clip_json_path, 'r', encoding='utf-8') as f: + update_progress(50) + status_text.text("从本地加载中...") + script = f.read() + script = utils.clean_model_output(script) + st.session_state['video_clip_json'] = json.loads(script) + update_progress(100) + status_text.text("从本地加载成功") + + time.sleep(0.5) # 给进度条一点时间到达100% + progress_bar.progress(100) + status_text.text("脚本生成完成!") + st.success("视频脚本生成成功!") + except Exception as e: + st.error(f"生成过程中发生错误: {traceback.format_exc()}") + finally: + time.sleep(2) # 给用户一些时间查看最终状态 + 
progress_bar.empty() + status_text.empty() # 视频脚本 video_clip_json_details = st.text_area( tr("Video Script"), - value=st.session_state['video_clip_json'], + value=json.dumps(st.session_state.video_clip_json, indent=2, ensure_ascii=False), height=180 ) @@ -398,73 +432,43 @@ with left_panel: timestamp = datetime.datetime.now().strftime("%Y-%m%d-%H%M%S") save_path = os.path.join(script_dir, f"{timestamp}.json") - # 尝试解析输入的 JSON 数据 - input_json = str(video_clip_json_details) - # 去掉json的头尾标识 - input_json = input_json.strip('```json').strip('```') try: - data = utils.add_new_timestamps(json.loads(input_json)) + data = utils.add_new_timestamps(json.loads(video_clip_json_details)) except Exception as err: st.error(f"视频脚本格式错误,请检查脚本是否符合 JSON 格式;{err} \n\n{traceback.format_exc()}") st.stop() - # 检查是否是一个列表 - if not isinstance(data, list): - st.error("JSON is not a list") - st.stop() - - # 检查列表中的每个元素是否包含所需的键 - required_keys = {"picture", "timestamp", "narration"} - for item in data: - if not isinstance(item, dict): - st.error("List 元素不是字典") - st.stop() - if not required_keys.issubset(item.keys()): - st.error("Dict 元素不包含必需的键") - st.stop() - # 存储为新的 JSON 文件 with open(save_path, 'w', encoding='utf-8') as file: json.dump(data, file, ensure_ascii=False, indent=4) # 将data的值存储到 session_state 中,类似缓存 - st.session_state['video_script_list'] = data + st.session_state['video_clip_json'] = data st.session_state['video_clip_json_path'] = save_path # 刷新页面 - st.rerun() - - - def caijian(): - with st.spinner(tr("裁剪视频中...")): - st.session_state['task_id'] = str(uuid4()) - - if st.session_state.get('video_script_list', None) is not None: - video_script_list = st.session_state.video_script_list - print(video_script_list) - print(type(video_script_list)) - time_list = [i['timestamp'] for i in video_script_list] - subclip_videos = material.clip_videos( - task_id=st.session_state['task_id'], - timestamp_terms=time_list, - origin_video=params.video_origin_path - ) - if subclip_videos is None: - 
st.error(tr("裁剪视频失败")) - st.stop() - st.session_state['subclip_videos'] = subclip_videos - for video_script in video_script_list: - try: - video_script['path'] = subclip_videos[video_script['timestamp']] - except KeyError as err: - st.error(f"裁剪视频失败 {err}") - logger.debug(f"当前的脚本为:{st.session_state.subclip_videos}") - else: - st.error(tr("请先生成视频脚本")) - + # st.rerun() # 裁剪视频 with button_columns[1]: if st.button(tr("Crop Video"), key="auto_crop_video", use_container_width=True): - caijian() + progress_bar = st.progress(0) + status_text = st.empty() + + def update_progress(progress): + progress_bar.progress(progress) + status_text.text(f"剪辑进度: {progress}%") + + try: + utils.cut_video(params, update_progress) + time.sleep(0.5) # 给进度条一点时间到达100% + progress_bar.progress(100) + status_text.text("剪辑完成!") + st.success("视频剪辑成功完成!") + except Exception as e: + st.error(f"剪辑过程中发生错误: {str(e)}") + finally: + time.sleep(2) # 给用户一些时间查看最终状态 + progress_bar.empty() + status_text.empty() # 新中间面板 with middle_panel: @@ -703,14 +707,16 @@ with st.expander(tr("Video Check"), expanded=False): # 可编辑的输入框 text_panels = st.columns(2) with text_panels[0]: - text1 = st.text_area(tr("timestamp"), value=initial_timestamp, height=20) + text1 = st.text_area(tr("timestamp"), value=initial_timestamp, height=20, + key=f"timestamp_{index}") with text_panels[1]: - text2 = st.text_area(tr("Picture description"), value=initial_picture, height=20) - logger.debug(initial_narration) - text3 = st.text_area(tr("Narration"), value=initial_narration, height=100) + text2 = st.text_area(tr("Picture description"), value=initial_picture, height=20, + key=f"picture_{index}") + text3 = st.text_area(tr("Narration"), value=initial_narration, height=100, + key=f"narration_{index}") # 重新生成按钮 - if st.button(tr("Rebuild"), key=f"button_{index}"): + if st.button(tr("Rebuild"), key=f"rebuild_{index}"): # 更新video_list中的对应项 video_list[index]['timestamp'] = text1 video_list[index]['picture'] = text2 @@ -719,12 +725,12 @@ with 
st.expander(tr("Video Check"), expanded=False): for video in video_list: if 'path' in video: del video['path'] - # 更新session_state以确保更改被保存 + # 更新session_state以确保更改被保存 st.session_state['video_clip_json'] = utils.to_json(video_list) # 替换原JSON 文件 - with open(video_json_file, 'w', encoding='utf-8') as file: + with open(params.video_clip_json_path, 'w', encoding='utf-8') as file: json.dump(video_list, file, ensure_ascii=False, indent=4) - caijian() + utils.cut_video(params, progress_callback=None) st.rerun() # 开始按钮 @@ -735,13 +741,15 @@ if start_button: if st.session_state.get('video_script_json_path') is not None: params.video_clip_json = st.session_state.get('video_clip_json') - logger.debug(f"当前的脚本为:{params.video_clip_json}") + logger.debug(f"当前的脚本文件为:{st.session_state.video_clip_json_path}") + logger.debug(f"当前的视频文件为:{st.session_state.video_origin_path}") + logger.debug(f"裁剪后是视频列表:{st.session_state.subclip_videos}") if not task_id: st.error(tr("请先裁剪视频")) scroll_to_bottom() st.stop() - if not params.video_clip_json: + if not params.video_clip_json_path: st.error(tr("脚本文件不能为空")) scroll_to_bottom() st.stop() diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index dc1da54..aa588fd 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -10,7 +10,7 @@ "Auto Detect": "自动检测", "Auto Generate": "自动生成", "Video Name": "视频名称", - "Video Script": "视频脚本(:blue[①可不填,使用AI生成 ②合理使用标点断句,有助于生成字幕])", + "Video Script": "视频脚本(:blue[①使用AI生成 ②从本机加载])", "Save Script": "保存脚本", "Crop Video": "裁剪视频", "Video File": "视频文件(:blue[1️⃣支持上传视频文件(限制2G) 2️⃣大文件建议直接导入 ./resource/videos 目录])", @@ -91,6 +91,7 @@ "timestamp": "时间戳", "Picture description": "图片描述", "Narration": "视频文案", - "Rebuild": "重新生成" + "Rebuild": "重新生成", + "Video Script Load": "加载视频脚本" } } \ No newline at end of file From f04fd70ab8625e851ba9c66ea76af8b86057c871 Mon Sep 17 00:00:00 2001 From: linyqh Date: Mon, 30 Sep 2024 01:37:39 +0800 Subject: [PATCH 18/21] =?UTF-8?q?=E6=9F=A5=E6=BC=8F=E8=A1=A5=E7=BC=BA?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + Dockerfile | 16 +++---- app/services/llm.py | 103 ++++++++++++++++++++------------------------ docker-compose.yml | 6 ++- requirements.txt | 3 +- webui.py | 10 +++-- webui.sh | 19 +++++++- 7 files changed, 82 insertions(+), 76 deletions(-) diff --git a/.gitignore b/.gitignore index c51c4e8..f10a692 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ resource/scripts/* resource/videos/* resource/songs/* resource/fonts/* +app/models/faster-whisper-large-v2/* \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 8cf53f0..4beabe4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,3 @@ -# Use an official Python runtime as a parent image FROM python:3.10-slim-bullseye # Set the working directory in the container @@ -12,6 +11,7 @@ ENV PYTHONPATH="/NarratoAI" # Install system dependencies RUN apt-get update && apt-get install -y \ git \ + git-lfs \ imagemagick \ ffmpeg \ wget \ @@ -29,17 +29,11 @@ RUN pip install --no-cache-dir -r requirements.txt # Now copy the rest of the codebase into the image COPY . . +# 安装 git lfs 并下载模型到指定目录 +RUN git lfs install + # Expose the port the app runs on EXPOSE 8501 # Command to run the application -CMD ["streamlit", "run", "./webui/Main.py","--browser.serverAddress=127.0.0.1","--server.enableCORS=True","--browser.gatherUsageStats=False"] - -# 1. Build the Docker image using the following command -# docker build -t moneyprinterturbo . - -# 2. 
Run the Docker container using the following command -## For Linux or MacOS: -# docker run -v $(pwd)/config.toml:/NarratoAI/config.toml -v $(pwd)/storage:/NarratoAI/storage -p 8501:8501 moneyprinterturbo -## For Windows: -# docker run -v %cd%/config.toml:/NarratoAI/config.toml -v %cd%/storage:/NarratoAI/storage -p 8501:8501 moneyprinterturbo \ No newline at end of file +CMD ["streamlit", "run", "webui.py","--browser.serverAddress=127.0.0.1","--server.enableCORS=True","--browser.gatherUsageStats=False"] diff --git a/app/services/llm.py b/app/services/llm.py index adb3f6d..01bef0a 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -109,26 +109,25 @@ Method = """ def handle_exception(err): if isinstance(err, PermissionDenied): - logger.error("403 用户没有权限访问该资源") + raise Exception("403 用户没有权限访问该资源") elif isinstance(err, ResourceExhausted): - logger.error("429 您的配额已用尽。请稍后重试。请考虑设置自动重试来处理这些错误") + raise Exception("429 您的配额已用尽。请稍后重试。请考虑设置自动重试来处理这些错误") elif isinstance(err, InvalidArgument): - logger.error("400 参数无效。例如,文件过大,超出了载荷大小限制。另一个事件提供了无效的 API 密钥。") + raise Exception("400 参数无效。例如,文件过大,超出了载荷大小限制。另一个事件提供了无效的 API 密钥。") elif isinstance(err, AlreadyExists): - logger.error("409 已存在具有相同 ID 的已调参模型。对新模型进行调参时,请指定唯一的模型 ID。") + raise Exception("409 已存在具有相同 ID 的已调参模型。对新模型进行调参时,请指定唯一的模型 ID。") elif isinstance(err, RetryError): - logger.error("使用不支持 gRPC 的代理时可能会引起此错误。请尝试将 REST 传输与 genai.configure(..., transport=rest) 搭配使用。") + raise Exception("使用不支持 gRPC 的代理时可能会引起此错误。请尝试将 REST 传输与 genai.configure(..., transport=rest) 搭配使用。") elif isinstance(err, BlockedPromptException): - logger.error("400 出于安全原因,该提示已被屏蔽。") + raise Exception("400 出于安全原因,该提示已被屏蔽。") elif isinstance(err, BrokenResponseError): - logger.error("500 流式传输响应已损坏。在访问需要完整响应的内容(例如聊天记录)时引发。查看堆栈轨迹中提供的错误详情。") + raise Exception("500 流式传输响应已损坏。在访问需要完整响应的内容(例如聊天记录)时引发。查看堆栈轨迹中提供的错误详情。") elif isinstance(err, IncompleteIterationError): - logger.error("500 访问需要完整 API 响应但流式响应尚未完全迭代的内容时引发。对响应对象调用 resolve() 以使用迭代器。") + raise 
Exception("500 访问需要完整 API 响应但流式响应尚未完全迭代的内容时引发。对响应对象调用 resolve() 以使用迭代器。") elif isinstance(err, ConnectionError): - logger.error("网络连接错误,请检查您的网络连接。") + raise Exception("网络连接错误, 请检查您的网络连接(建议使用 NarratoAI 官方提供的 url)") else: - logger.error(f"大模型请求失败, 下面是具体报错信息: \n{traceback.format_exc()}") - return "" + raise Exception(f"大模型请求失败, 下面是具体报错信息: \n\n{traceback.format_exc()}") def _generate_response(prompt: str, llm_provider: str = None) -> str: @@ -398,9 +397,6 @@ def compress_video(input_path: str, output_path: str): input_path: 输入视频文件路径 output_path: 输出压缩后的视频文件路径 """ - # 指定 ffmpeg 的完整路径 - ffmpeg_path = os.getenv("FFMPEG_PATH") or config.app.get("ffmpeg_path") or "ffmpeg" - # 如果压缩后的视频文件已经存在,则直接使用 if os.path.exists(output_path): logger.info(f"压缩视频文件已存在: {output_path}") @@ -409,17 +405,6 @@ def compress_video(input_path: str, output_path: str): try: clip = VideoFileClip(input_path) clip.write_videofile(output_path, codec='libx264', audio_codec='aac', bitrate="500k", audio_bitrate="128k") - # command = [ - # ffmpeg_path, - # "-i", input_path, - # "-c:v", "h264", - # "-b:v", "500k", - # "-c:a", "aac", - # "-b:a", "128k", - # output_path - # ] - # logger.info(f"执行命令: {' '.join(command)}") - # subprocess.run(command, check=True) except subprocess.CalledProcessError as e: logger.error(f"视频压缩失败: {e}") raise @@ -440,41 +425,45 @@ def generate_script( Returns: str: 生成的脚本 """ - # 1. 压缩视频 - compressed_video_path = f"{os.path.splitext(video_path)[0]}_compressed.mp4" - compress_video(video_path, compressed_video_path) + try: + # 1. 压缩视频 + compressed_video_path = f"{os.path.splitext(video_path)[0]}_compressed.mp4" + compress_video(video_path, compressed_video_path) - # 在关键步骤更新进度 - if progress_callback: - progress_callback(15, "压缩完成") # 例如,在压缩视频后 - - # 2. 
转录视频 - transcription = gemini_video_transcription( - video_name=video_name, - video_path=compressed_video_path, - language=language, - llm_provider_video="gemini", - progress_callback=progress_callback - ) - if progress_callback: - progress_callback(60, "生成解说文案...") # 例如,在转录视频后 - - # 3. 编写解说文案 - script = writing_short_play(video_plot, video_name, "openai", count=300) - - # 在关键步骤更新进度 - if progress_callback: - progress_callback(70, "匹配画面...") # 例如,在生成脚本后 - - # 4. 文案匹配画面 - if transcription != "": - matched_script = screen_matching(huamian=transcription, wenan=script, llm_provider="openai") # 在关键步骤更新进度 if progress_callback: - progress_callback(80, "匹配成功") - return matched_script - else: - return "" + progress_callback(15, "压缩完成") # 例如,在压缩视频后 + + # 2. 转录视频 + transcription = gemini_video_transcription( + video_name=video_name, + video_path=compressed_video_path, + language=language, + llm_provider_video="gemini", + progress_callback=progress_callback + ) + if progress_callback: + progress_callback(60, "生成解说文案...") # 例如,在转录视频后 + + # 3. 编写解说文案 + script = writing_short_play(video_plot, video_name, "openai", count=300) + + # 在关键步骤更新进度 + if progress_callback: + progress_callback(70, "匹配画面...") # 例如,在生成脚本后 + + # 4. 文案匹配画面 + if transcription != "": + matched_script = screen_matching(huamian=transcription, wenan=script, llm_provider="openai") + # 在关键步骤更新进度 + if progress_callback: + progress_callback(80, "匹配成功") + return matched_script + else: + return "" + except Exception as e: + handle_exception(e) + raise def gemini_video_transcription(video_name: str, video_path: str, language: str, llm_provider_video: str, progress_callback=None): diff --git a/docker-compose.yml b/docker-compose.yml index cc94678..399c588 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,6 +6,7 @@ services: build: context: . dockerfile: Dockerfile + image: linyq1/narratoai:latest container_name: "webui" ports: - "8501:8501" @@ -18,10 +19,11 @@ services: build: context: . 
dockerfile: Dockerfile + image: linyq1/narratoai:latest container_name: "api" ports: - - "8502:22" - command: [ "sleep", "48h" ] + - "8502:8080" + command: [ "python3", "main.py" ] volumes: *common-volumes environment: - "VPN_PROXY_URL=http://host.docker.internal:7890" diff --git a/requirements.txt b/requirements.txt index a562dcb..af5d8b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,7 @@ pillow~=10.3.0 pydantic~=2.6.3 g4f~=0.3.0.4 dashscope~=1.15.0 -google.generativeai>=0.7.2 +google.generativeai>=0.8.2 python-multipart~=0.0.9 redis==5.0.3 # if you use pillow~=10.3.0, you will get "PIL.Image' has no attribute 'ANTIALIAS'" error when resize video @@ -25,3 +25,4 @@ opencv-python~=4.9.0.80 azure-cognitiveservices-speech~=1.37.0 git-changelog~=2.5.2 watchdog==5.0.2 +pydub==0.25.1 diff --git a/webui.py b/webui.py index 4410c2d..aa272b6 100644 --- a/webui.py +++ b/webui.py @@ -66,6 +66,8 @@ if 'video_plot' not in st.session_state: st.session_state['video_plot'] = '' if 'ui_language' not in st.session_state: st.session_state['ui_language'] = config.ui.get("language", system_locale) +if 'subclip_videos' not in st.session_state: + st.session_state['subclip_videos'] = {} def get_all_fonts(): @@ -404,8 +406,8 @@ with left_panel: progress_bar.progress(100) status_text.text("脚本生成完成!") st.success("视频脚本生成成功!") - except Exception as e: - st.error(f"生成过程中发生错误: {traceback.format_exc()}") + except Exception as err: + st.error(f"生成过程中发生错误: {str(err)}") finally: time.sleep(2) # 给用户一些时间查看最终状态 progress_bar.empty() @@ -445,7 +447,7 @@ with left_panel: st.session_state['video_clip_json'] = data st.session_state['video_clip_json_path'] = save_path # 刷新页面 - # st.rerun() + st.rerun() # 裁剪视频 with button_columns[1]: @@ -677,7 +679,7 @@ with right_panel: # 视频编辑面板 with st.expander(tr("Video Check"), expanded=False): try: - video_list = st.session_state['video_script_list'] + video_list = st.session_state.video_clip_json except KeyError as e: video_list = [] diff --git 
a/webui.sh b/webui.sh index c188c2b..dcdea0a 100644 --- a/webui.sh +++ b/webui.sh @@ -1,7 +1,7 @@ #!/bin/bash # 从环境变量中加载VPN代理的配置URL -vpn_proxy_url="http://127.0.0.1:7890" +vpn_proxy_url="$VPN_PROXY_URL" # 检查是否成功加载 if [ -z "$vpn_proxy_url" ]; then echo "VPN代理配置URL未设置,请检查环境变量VPN_PROXY_URL" @@ -44,7 +44,24 @@ for url in "${!urls_paths[@]}"; do echo "下载失败: $url" >&2 } done + +# 安装 git lfs 并下载模型到指定目录 +git lfs install +mkdir -p /NarratoAI/app/models +cd /NarratoAI/app/models +if [ ! -d "faster-whisper-large-v2" ] || [ -z "$(ls -A faster-whisper-large-v2)" ]; then + if git clone https://huggingface.co/guillaumekln/faster-whisper-large-v2; then + echo "下载faster-whisper-large-v2成功" + else + echo "下载faster-whisper-large-v2失败" >&2 + exit 1 + fi +else + echo "faster-whisper-large-v2 已存在,跳过下载" +fi + # 等待所有后台任务完成 wait echo "所有文件已成功下载到指定目录" +cd /NarratoAI/ streamlit run webui.py --browser.serverAddress="0.0.0.0" --server.enableCORS=True --server.maxUploadSize=2048 --browser.gatherUsageStats=False From 33b78a3697c052f683d037ccaa73a7db9f74eb03 Mon Sep 17 00:00:00 2001 From: linyqh Date: Mon, 30 Sep 2024 01:59:50 +0800 Subject: [PATCH 19/21] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E8=87=AA=E5=8A=A8?= =?UTF-8?q?=E6=89=93=E5=8C=85=E5=8F=91=E5=B8=83=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../{dockerImageBuild.yml.bak => dockerImageBuild.yml} | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) rename .github/workflows/{dockerImageBuild.yml.bak => dockerImageBuild.yml} (77%) diff --git a/.github/workflows/dockerImageBuild.yml.bak b/.github/workflows/dockerImageBuild.yml similarity index 77% rename from .github/workflows/dockerImageBuild.yml.bak rename to .github/workflows/dockerImageBuild.yml index 3fc14bd..eda2bb7 100644 --- a/.github/workflows/dockerImageBuild.yml.bak +++ b/.github/workflows/dockerImageBuild.yml @@ -14,14 +14,22 @@ jobs: - name: Set up QEMU uses: docker/setup-qemu-action@v3 + - name: Set up 
Docker Buildx uses: docker/setup-buildx-action@v3 + - name: Login to DockerHub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Extract project version + id: extract_version + run: | + project_version=$(grep 'project_version' config.example.toml | cut -d '"' -f 2) + echo "PROJECT_VERSION=$project_version" >> $GITHUB_ENV + - name: Build and push id: docker_build uses: docker/build-push-action@v6 @@ -31,5 +39,5 @@ jobs: push: true platforms: linux/amd64,linux/arm64 tags: | - ${{ secrets.DOCKERHUB_USERNAME }}/${{ GITHUB_REPOSITORY_NAME_PART }}:${{ github.ref_name }} + ${{ secrets.DOCKERHUB_USERNAME }}/${{ GITHUB_REPOSITORY_NAME_PART }}:${{ env.PROJECT_VERSION }} ${{ secrets.DOCKERHUB_USERNAME }}/${{ GITHUB_REPOSITORY_NAME_PART }}:latest From 56b027bd435f5c39322bed0db45fa7523812d818 Mon Sep 17 00:00:00 2001 From: linyqh Date: Mon, 30 Sep 2024 02:14:20 +0800 Subject: [PATCH 20/21] =?UTF-8?q?=E4=BC=98=E5=8C=96=E5=9F=BA=E7=A1=80?= =?UTF-8?q?=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.example.toml | 2 +- webui.py | 42 ++++++++++++++++++++++++++++++------------ webui/i18n/zh.json | 5 ++--- 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/config.example.toml b/config.example.toml index 7b4e09e..1557101 100644 --- a/config.example.toml +++ b/config.example.toml @@ -2,6 +2,7 @@ project_version="0.2.0" # 如果你没有 OPENAI API Key,可以使用 g4f 代替,或者使用国内的 Moonshot API # If you don't have an OPENAI API Key, you can use g4f instead + video_llm_provider="gemini" # 支持的提供商 (Supported providers): # openai @@ -13,7 +14,6 @@ # gemini llm_provider="openai" # 支持多模态视频理解能力的大模型 - llm_provider_video="gemini" ########## Ollama Settings # No need to set it unless you want to use your own proxy diff --git a/webui.py b/webui.py index aa272b6..79bd7a8 100644 --- a/webui.py +++ b/webui.py @@ -188,8 +188,37 @@ with st.expander(tr("Basic 
Settings"), expanded=False): if HTTPS_PROXY: config.proxy["https"] = HTTPS_PROXY + # 视频转录大模型 with middle_config_panel: - llm_providers = ['Gemini'] + video_llm_providers = ['Gemini'] + saved_llm_provider = config.app.get("llm_provider", "OpenAI").lower() + saved_llm_provider_index = 0 + for i, provider in enumerate(video_llm_providers): + if provider.lower() == saved_llm_provider: + saved_llm_provider_index = i + break + + video_llm_provider = st.selectbox(tr("Video LLM Provider"), options=video_llm_providers, index=saved_llm_provider_index) + video_llm_provider = video_llm_provider.lower() + config.app["video_llm_provider"] = video_llm_provider + + video_llm_api_key = config.app.get(f"{video_llm_provider}_api_key", "") + video_llm_base_url = config.app.get(f"{video_llm_provider}_base_url", "") + video_llm_model_name = config.app.get(f"{video_llm_provider}_model_name", "") + video_llm_account_id = config.app.get(f"{video_llm_provider}_account_id", "") + st_llm_api_key = st.text_input(tr("Video API Key"), value=video_llm_api_key, type="password") + st_llm_base_url = st.text_input(tr("Video Base Url"), value=video_llm_base_url) + st_llm_model_name = st.text_input(tr("Video Model Name"), value=video_llm_model_name) + if st_llm_api_key: + config.app[f"{video_llm_provider}_api_key"] = st_llm_api_key + if st_llm_base_url: + config.app[f"{video_llm_provider}_base_url"] = st_llm_base_url + if st_llm_model_name: + config.app[f"{video_llm_provider}_model_name"] = st_llm_model_name + + # 大语言模型 + with right_config_panel: + llm_providers = ['Gemini', 'OpenAI', 'Moonshot', 'Azure', 'Qwen', 'Ollama', 'G4f', 'OneAPI', "Cloudflare"] saved_llm_provider = config.app.get("llm_provider", "OpenAI").lower() saved_llm_provider_index = 0 for i, provider in enumerate(llm_providers): @@ -220,17 +249,6 @@ with st.expander(tr("Basic Settings"), expanded=False): if st_llm_account_id: config.app[f"{llm_provider}_account_id"] = st_llm_account_id - with right_config_panel: - pexels_api_keys = 
config.app.get("pexels_api_keys", []) - if isinstance(pexels_api_keys, str): - pexels_api_keys = [pexels_api_keys] - pexels_api_key = ", ".join(pexels_api_keys) - - pexels_api_key = st.text_input(tr("Pexels API Key"), value=pexels_api_key, type="password") - pexels_api_key = pexels_api_key.replace(" ", "") - if pexels_api_key: - config.app["pexels_api_keys"] = pexels_api_key.split(",") - panel = st.columns(3) left_panel = panel[0] middle_panel = panel[1] diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index aa588fd..cbad21b 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -64,15 +64,14 @@ "You can download the generated video from the following links": "你可以从以下链接下载生成的视频", "Basic Settings": "**基础设置** (:blue[点击展开])", "Language": "界面语言", - "Pexels API Key": "Pexels API Key ([点击获取](https://www.pexels.com/api/)) :red[推荐使用]", "Pixabay API Key": "Pixabay API Key ([点击获取](https://pixabay.com/api/docs/#api_search_videos)) :red[可以不用配置,如果 Pexels 无法使用,再选择Pixabay]", - "LLM Provider": "大模型提供商", + "Video LLM Provider": "视频转录大模型", + "LLM Provider": "大语言模型", "API Key": "API Key (:red[必填,需要到大模型提供商的后台申请])", "Base Url": "Base Url (可选)", "Account ID": "账户ID (Cloudflare的dash面板url中获取)", "Model Name": "模型名称 (:blue[需要到大模型提供商的后台确认被授权的模型名称])", "Please Enter the LLM API Key": "请先填写大模型 **API Key**", - "Please Enter the Pexels API Key": "请先填写 **Pexels API Key**", "Please Enter the Pixabay API Key": "请先填写 **Pixabay API Key**", "Get Help": "一站式 AI 影视解说+自动化剪辑工具\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\n有任何问题或建议,可以加入 **社区频道** 求助或讨论:https://github.com/linyqh/NarratoAI/wiki", "Video Source": "视频来源", From 95828144be64894db6e1457323a2739e8d668a23 Mon Sep 17 00:00:00 2001 From: linyq Date: Mon, 30 Sep 2024 17:41:11 +0800 Subject: [PATCH 21/21] 123 --- app/services/llm.py | 58 +++------------------------------------------ webui.py | 19 ++++++--------- 2 files changed, 10 insertions(+), 67 deletions(-) diff --git a/app/services/llm.py b/app/services/llm.py index 01bef0a..3e9ba16 100644 --- 
a/app/services/llm.py +++ b/app/services/llm.py @@ -439,14 +439,14 @@ def generate_script( video_name=video_name, video_path=compressed_video_path, language=language, - llm_provider_video="gemini", + llm_provider_video=config.app["video_llm_provider"], progress_callback=progress_callback ) if progress_callback: progress_callback(60, "生成解说文案...") # 例如,在转录视频后 # 3. 编写解说文案 - script = writing_short_play(video_plot, video_name, "openai", count=300) + script = writing_short_play(video_plot, video_name, config.app["llm_provider"], count=300) # 在关键步骤更新进度 if progress_callback: @@ -454,7 +454,7 @@ def generate_script( # 4. 文案匹配画面 if transcription != "": - matched_script = screen_matching(huamian=transcription, wenan=script, llm_provider="openai") + matched_script = screen_matching(huamian=transcription, wenan=script, llm_provider=config.app["video_llm_provider"]) # 在关键步骤更新进度 if progress_callback: progress_callback(80, "匹配成功") @@ -769,58 +769,6 @@ def screen_matching(huamian: str, wenan: str, llm_provider: str): - 请以严格的 JSON 格式返回数据,不要包含任何注释、标记或其他字符。数据应符合 JSON 语法,可以被 json.loads() 函数直接解析, 不要添加 ```json 或其他标记。 """ % (huamian, wenan) -# prompt = """ -# 你是一位拥有10年丰富经验的影视解说创作专家。你的任务是根据提供的视频转录脚本和解说文案,创作一个引人入胜的解说脚本。请按照以下要求完成任务: -# -# 1. 输入数据: -# - 视频转录脚本:包含时间戳、画面描述和人物台词 -# - 解说文案:需要你进行匹配和编排的内容 -# - 视频转录脚本和文案(由 XML 标记分隔)如下所示: -# 视频转录脚本 -# -# %s -# -# 文案: -# -# %s -# -# -# 2. 输出要求: -# - 格式:严格的JSON格式,可直接被json.loads()解析 -# - 结构:list[script],其中script为字典类型 -# - script字段: -# { -# "picture": "画面描述", -# "timestamp": "时间戳", -# "narration": "解说文案", -# "OST": true/false -# } -# -# 3. 匹配规则: -# a) 时间戳匹配: -# - 根据文案内容选择最合适的画面时间段 -# - 避免时间重叠,确保画面不重复出现 -# - 适当合并或删减片段,不要完全照搬转录脚本 -# b) 画面描述:与转录脚本保持一致 -# c) 解说文案: -# - 当OST为true时,narration为空字符串 -# - 当OST为false时,narration为解说文案,但是要确保文案字数不要超过 30字,若文案较长,则添加到下一个片段 -# d) OST(原声): -# - 按1:1比例穿插原声和解说片段 -# - 第一个片段必须是原声,时长不少于20秒 -# - 选择整个视频中最精彩的片段作为开场 -# -# 4. 创作重点: -# - 确保解说与画面高度匹配 -# - 巧妙安排原声和解说的交替,提升观众体验 -# - 创造一个引人入胜、节奏紧凑的解说脚本 -# -# 5. 
注意事项: -# - 严格遵守JSON格式,不包含任何注释或额外标记 -# - 充分利用你的专业经验,创作出高质量、吸引人的解说内容 -# -# 请基于以上要求,将提供的视频转录脚本和解说文案整合成一个专业、吸引人的解说脚本。你的创作将直接影响观众的观看体验,请发挥你的专业素养,创作出最佳效果。 -# """ % (huamian, wenan) try: response = _generate_response(prompt, llm_provider) logger.success("匹配成功") diff --git a/webui.py b/webui.py index 79bd7a8..5e37dd7 100644 --- a/webui.py +++ b/webui.py @@ -124,7 +124,7 @@ def init_log(): _lvl = "DEBUG" def format_record(record): - # 获取日志记录中的文件全��径 + # 获取日志记录中的文件全路径 file_path = record["file"].path # 将绝对路径转换为相对于项目根目录的路径 relative_path = os.path.relpath(file_path, root_dir) @@ -290,7 +290,7 @@ with left_panel: # 按创建时间降序排序 script_list.sort(key=lambda x: x["ctime"], reverse=True) - # ��本文件 下拉框 + # 脚本文件 下拉框 script_path = [(tr("Auto Generate"), ""), ] for file in script_list: display_name = file['file'].replace(root_dir, "") @@ -385,12 +385,8 @@ with left_panel: try: with st.spinner("正在生成脚本..."): - if not video_name: - st.warning("视频名称不能为空") - st.stop() if not video_plot: - st.warning("视频剧情不能为空") - st.stop() + st.warning("视频剧情为空; 会极大影响生成效果!") if params.video_clip_json_path == "" and params.video_origin_path != "": update_progress(10, "压缩视频中...") # 使用大模型生成视频脚本 @@ -756,6 +752,10 @@ with st.expander(tr("Video Check"), expanded=False): # 开始按钮 start_button = st.button(tr("Generate Video"), use_container_width=True, type="primary") if start_button: + # 重置日志容器和记录 + log_container = st.empty() + log_records = [] + config.save_config() task_id = st.session_state.get('task_id') if st.session_state.get('video_script_json_path') is not None: @@ -778,16 +778,11 @@ if start_button: scroll_to_bottom() st.stop() - log_container = st.empty() - log_records = [] - - def log_received(msg): with log_container: log_records.append(msg) st.code("\n".join(log_records)) - logger.add(log_received) st.toast(tr("生成视频"))