diff --git a/app/services/generate_narration_script.py b/app/services/generate_narration_script.py
index f21aa6a..f6640db 100644
--- a/app/services/generate_narration_script.py
+++ b/app/services/generate_narration_script.py
@@ -237,28 +237,28 @@ if __name__ == '__main__':
     video_frame_description_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_1139.json"

     # 测试新的JSON文件
-    test_file_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_1458.json"
+    test_file_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_2258.json"
     markdown_output = parse_frame_analysis_to_markdown(test_file_path)
     # print(markdown_output)

     # 输出到文件以便检查格式
-    output_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/narration_script.md"
+    output_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/家里家外1-5.md"
     with open(output_file, 'w', encoding='utf-8') as f:
         f.write(markdown_output)
     # print(f"\n已将Markdown输出保存到: {output_file}")

-    # 生成解说文案
-    narration = generate_narration(
-        markdown_output,
-        text_api_key,
-        base_url=text_base_url,
-        model=text_model
-    )
-
-    # 保存解说文案
-    print(narration)
-    print(type(narration))
-    narration_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/final_narration_script.json"
-    with open(narration_file, 'w', encoding='utf-8') as f:
-        f.write(narration)
-    print(f"\n已将解说文案保存到: {narration_file}")
+    # # 生成解说文案
+    # narration = generate_narration(
+    #     markdown_output,
+    #     text_api_key,
+    #     base_url=text_base_url,
+    #     model=text_model
+    # )
+    #
+    # # 保存解说文案
+    # print(narration)
+    # print(type(narration))
+    # narration_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/final_narration_script.json"
+    # with open(narration_file, 'w', encoding='utf-8') as f:
+    #     f.write(narration)
+    # print(f"\n已将解说文案保存到: {narration_file}")
diff --git a/config.example.toml b/config.example.toml
index 762651b..bfc504d 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -1,175 +1,85 @@
 [app]
     project_version="0.6.0"

     # 支持视频理解的大模型提供商
-    # gemini
-    # qwenvl
-    vision_llm_provider="qwenvl"
+    # gemini (谷歌, 需要 VPN)
+    # siliconflow (硅基流动)
+    # qwenvl (通义千问)
+    vision_llm_provider="siliconflow"

-    ########## Vision Gemini API Key
+    ########## Gemini 视觉模型
     vision_gemini_api_key = ""
-    vision_gemini_model_name = "gemini-2.0-flash"
+    vision_gemini_model_name = "gemini-2.0-flash-lite"

-    ########## Vision Qwen API Key (默认使用“硅基流动”的QwenVL模型)
+    ########## QwenVL 视觉模型
     vision_qwenvl_api_key = ""
-    vision_qwenvl_model_name = "Qwen/Qwen2.5-VL-32B-Instruct"
-    vision_qwenvl_base_url = "https://api.siliconflow.cn/v1"
+    vision_qwenvl_model_name = "qwen2.5-vl-32b-instruct"
+    vision_qwenvl_base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"

-    ########### Vision NarratoAPI Key
+    ########## siliconflow 视觉模型
+    vision_siliconflow_api_key = ""
+    vision_siliconflow_model_name = "Qwen/Qwen2.5-VL-32B-Instruct"
+    vision_siliconflow_base_url = "https://api.siliconflow.cn/v1"
+
+    ########## OpenAI 视觉模型
+    vision_openai_api_key = ""
+    vision_openai_model_name = "gpt-4.1-nano-2025-04-14"
+    vision_openai_base_url = "https://api.openai.com/v1"
+
+    ########### NarratoAPI 微调模型 (未发布)
     narrato_api_key = "ggyY91BAO-_ULvAqKum3XexcyN1G3dP86DEzvjZDcrg"
     narrato_api_url = "https://narratoinsight.scsmtech.cn/api/v1"
-    narrato_vision_model = "gemini-1.5-flash"
-    narrato_vision_key = ""
-    narrato_llm_model = "gpt-4o"
-    narrato_llm_key = ""
+    narrato_model = "narra-1.0-2025-05-09"

     # 用于生成文案的大模型支持的提供商 (Supported providers):
-    # openai (默认)
-    # deepseek (默认使用“硅基流动”的模型)
-    # moonshot (月之暗面)
+    # openai (默认, 需要 VPN)
+    # siliconflow (硅基流动)
+    # deepseek (深度求索)
+    # gemini (谷歌, 需要 VPN)
     # qwen (通义千问)
-    # gemini
-    text_llm_provider="deepseek"
+    # moonshot (月之暗面)
+    text_llm_provider="openai"
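+    # Note: provider names are case-insensitive; the WebUI lowercases this value before matching.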

     ########## OpenAI API Key
     # Get your API key at https://platform.openai.com/api-keys
     text_openai_api_key = ""
     text_openai_base_url = "https://api.openai.com/v1"
-    text_openai_model_name = "gpt-4o-mini"
+    text_openai_model_name = "gpt-4.1-mini-2025-04-14"
+
+    # 使用 硅基流动 第三方 API Key,使用手机号注册:https://cloud.siliconflow.cn/i/pyOKqFCV
+    # 访问 https://cloud.siliconflow.cn/account/ak 获取你的 API 密钥
+    text_siliconflow_api_key = ""
+    text_siliconflow_base_url = "https://api.siliconflow.cn/v1"
+    text_siliconflow_model_name = "deepseek-ai/DeepSeek-R1"

     ########## DeepSeek API Key
-    # 使用 硅基流动 第三方 API Key,使用手机号注册:https://cloud.siliconflow.cn/i/pyOKqFCV
+    # 访问 https://platform.deepseek.com/api_keys 获取你的 API 密钥
     text_deepseek_api_key = ""
-    text_deepseek_base_url = "https://api.siliconflow.cn/v1"
-    text_deepseek_model_name = "deepseek-ai/DeepSeek-V3"
-
-    ########## Moonshot API Key
-    # Visit https://platform.moonshot.cn/console/api-keys to get your API key.
-    text_moonshot_api_key=""
-    text_moonshot_base_url = "https://api.moonshot.cn/v1"
-    text_moonshot_model_name = "moonshot-v1-8k"
-
-    ########## G4F
-    # Visit https://github.com/xtekky/gpt4free to get more details
-    # Supported model list: https://github.com/xtekky/gpt4free/blob/main/g4f/models.py
-    text_g4f_model_name = "gpt-3.5-turbo"
-
-    ########## Azure API Key
-    # Visit https://learn.microsoft.com/zh-cn/azure/ai-services/openai/ to get more details
-    # API documentation: https://learn.microsoft.com/zh-cn/azure/ai-services/openai/reference
-    text_azure_api_key = ""
-    text_azure_base_url=""
-    text_azure_model_name="gpt-35-turbo" # replace with your model deployment name
-    text_azure_api_version = "2024-02-15-preview"
+    text_deepseek_base_url = "https://api.deepseek.com"
+    text_deepseek_model_name = "deepseek-chat"

     ########## Gemini API Key
     text_gemini_api_key=""
-    text_gemini_model_name = "gemini-1.5-flash"
+    text_gemini_model_name = "gemini-2.0-flash"

     ########## Qwen API Key
-    # Visit https://dashscope.console.aliyun.com/apiKey to get your API key
-    # Visit below links to get more details
-    # https://tongyi.aliyun.com/qianwen/
-    # https://help.aliyun.com/zh/dashscope/developer-reference/model-introduction
+    # 访问 https://bailian.console.aliyun.com/?tab=model#/api-key 获取你的 API 密钥
     text_qwen_api_key = ""
     text_qwen_model_name = "qwen-plus-1127"
     text_qwen_base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
-
-    # 字幕提供商、可选,支持 "whisper" 和 "faster-whisper-large-v2"
-    # 默认为 faster-whisper-large-v2 模型地址:https://huggingface.co/guillaumekln/faster-whisper-large-v2
-    subtitle_provider = "faster-whisper-large-v2"
-    subtitle_enabled = true
-
-    # ImageMagick
-    # 安装后,将自动检测到 ImageMagick,Windows 除外!
-    # 例如,在 Windows 上 "C:\Program Files (x86)\ImageMagick-7.1.1-Q16-HDRI\magick.exe"
-    # 下载位置 https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe
-    # imagemagick_path = "C:\\Program Files (x86)\\ImageMagick-7.1.1-Q16\\magick.exe"
-
-    # FFMPEG
-    #
-    # 通常情况下,ffmpeg 会被自动下载,并且会被自动检测到。
-    # 但是如果你的环境有问题,无法自动下载,可能会遇到如下错误:
-    #   RuntimeError: No ffmpeg exe could be found.
-    #   Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable.
-    # 此时你可以手动下载 ffmpeg 并设置 ffmpeg_path,下载地址:https://www.gyan.dev/ffmpeg/builds/
-
-    # ffmpeg_path = "C:\\Users\\harry\\Downloads\\ffmpeg.exe"
-    #########################################################################################
-
-    # 当视频生成成功后,API服务提供的视频下载接入点,默认为当前服务的地址和监听端口
-    # 比如 http://127.0.0.1:8080/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
-    # 如果你需要使用域名对外提供服务(一般会用nginx做代理),则可以设置为你的域名
-    # 比如 https://xxxx.com/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
-    # endpoint="https://xxxx.com"
-
-    # When the video is successfully generated, the API service provides a download endpoint for the video, defaulting to the service's current address and listening port.
-    # For example, http://127.0.0.1:8080/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
-    # If you need to provide the service externally using a domain name (usually done with nginx as a proxy), you can set it to your domain name.
-    # For example, https://xxxx.com/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
-    # endpoint="https://xxxx.com"
-    endpoint=""
-
-
-    # Video material storage location
-    # material_directory = ""  # Indicates that video materials will be downloaded to the default folder, the default folder is ./storage/cache_videos under the current project
-    # material_directory = "/user/harry/videos"  # Indicates that video materials will be downloaded to a specified folder
-    # material_directory = "task"  # Indicates that video materials will be downloaded to the current task's folder, this method does not allow sharing of already downloaded video materials
-
-    # 视频素材存放位置
-    # material_directory = ""  #表示将视频素材下载到默认的文件夹,默认文件夹为当前项目下的 ./storage/cache_videos
-    # material_directory = "/user/harry/videos"  #表示将视频素材下载到指定的文件夹中
-    # material_directory = "task"  #表示将视频素材下载到当前任务的文件夹中,这种方式无法共享已经下载的视频素材
-
-    material_directory = ""
-
-    # 用于任务的状态管理
-    enable_redis = false
-    redis_host = "localhost"
-    redis_port = 6379
-    redis_db = 0
-    redis_password = ""
-
-    # 文生视频时的最大并发任务数
-    max_concurrent_tasks = 5
+    ########## Moonshot API Key
+    # 访问 https://platform.moonshot.cn/console/api-keys 获取你的 API 密钥
+    text_moonshot_api_key=""
+    text_moonshot_base_url = "https://api.moonshot.cn/v1"
+    text_moonshot_model_name = "moonshot-v1-8k"

     # webui界面是否显示配置项
-    hide_config = false
-
-
-[whisper]
-    # Only effective when subtitle_provider is "whisper"
-
-    # Run on GPU with FP16
-    # model = WhisperModel(model_size, device="cuda", compute_type="float16")
-
-    # Run on GPU with INT8
-    # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
-
-    # Run on CPU with INT8
-    # model = WhisperModel(model_size, device="cpu", compute_type="int8")
-
-    # recommended model_size: "large-v3"
-    model_size="faster-whisper-large-v2"
-    # 如果要使用 GPU,请设置 device="cuda"
-    device="CPU"
-    compute_type="int8"
-
+    hide_config = true

 [proxy]
-    ### Use a proxy to access the Pexels API
-    ### Format: "http://<username>:<password>@<proxy>:<port>"
-    ### Example: "http://user:pass@proxy:1234"
-    ### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
-
     http = "http://127.0.0.1:7890"
     https = "http://127.0.0.1:7890"
-
-[azure]
-    # Azure Speech API Key
-    # Get your API key at https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices
-    speech_key=""
-    speech_region=""
+    enabled = false

 [frames]
     # 提取关键帧的间隔时间
diff --git a/webui.py b/webui.py
index 7c65df6..94217fc 100644
--- a/webui.py
+++ b/webui.py
@@ -5,7 +5,7 @@
 from loguru import logger
 from app.config import config
 from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, \
     review_settings, merge_settings, system_settings
-from webui.utils import cache, file_utils
+# from webui.utils import cache, file_utils
 from app.utils import utils
 from app.models.schema import VideoClipParams, VideoAspect

@@ -184,7 +184,7 @@ def render_generate_button():
                 except Exception as e:
                     logger.error(f"播放视频失败: {e}")

-                file_utils.open_task_folder(config.root_dir, task_id)
+                # file_utils.open_task_folder(config.root_dir, task_id)
                 logger.info(tr("视频生成完成"))
diff --git a/webui/components/basic_settings.py b/webui/components/basic_settings.py
index cae4c16..a5f3c62 100644
--- a/webui/components/basic_settings.py
+++ b/webui/components/basic_settings.py
@@ -64,25 +64,25 @@ def render_proxy_settings(tr):
     proxy_enabled = st.checkbox(tr("Enable Proxy"), value=proxy_enabled)

     # 保存代理开关状态
-    config.proxy["enabled"] = proxy_enabled
+    # config.proxy["enabled"] = proxy_enabled

     # 只有在代理启用时才显示代理设置输入框
     if proxy_enabled:
         HTTP_PROXY = st.text_input(tr("HTTP_PROXY"), value=proxy_url_http)
         HTTPS_PROXY = st.text_input(tr("HTTPs_PROXY"), value=proxy_url_https)

-        if HTTP_PROXY:
+        if HTTP_PROXY and HTTPS_PROXY:
             config.proxy["http"] = HTTP_PROXY
-            os.environ["HTTP_PROXY"] = HTTP_PROXY
-        if HTTPS_PROXY:
             config.proxy["https"] = HTTPS_PROXY
+            os.environ["HTTP_PROXY"] = HTTP_PROXY
             os.environ["HTTPS_PROXY"] = HTTPS_PROXY
+            # logger.debug(f"代理已启用: {HTTP_PROXY}")
     else:
         # 当代理被禁用时,清除环境变量和配置
         os.environ.pop("HTTP_PROXY", None)
         os.environ.pop("HTTPS_PROXY", None)
-        config.proxy["http"] = ""
-        config.proxy["https"] = ""
+        # config.proxy["http"] = ""
+        # config.proxy["https"] = ""


 def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
@@ -108,29 +108,6 @@ def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
             return True, tr("gemini model is available")
         except Exception as e:
             return False, f"{tr('gemini model is not available')}: {str(e)}"
-
-    elif provider.lower() == 'qwenvl':
-        from openai import OpenAI
-        try:
-            client = OpenAI(
-                api_key=api_key,
-                base_url=base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
-            )
-
-            # 发送一个简单的测试请求
-            response = client.chat.completions.create(
-                model=model_name or "qwen-vl-max-latest",
-                messages=[{"role": "user", "content": "直接回复我文本'当前网络可用'"}]
-            )
-
-            if response and response.choices:
-                return True, tr("QwenVL model is available")
-            else:
-                return False, tr("QwenVL model returned invalid response")
-
-        except Exception as e:
-            return False, f"{tr('QwenVL model is not available')}: {str(e)}"
-
     elif provider.lower() == 'narratoapi':
         import requests
         try:
@@ -148,9 +125,46 @@
                 return False, f"{tr('NarratoAPI is not available')}: HTTP {response.status_code}"
         except Exception as e:
             return False, f"{tr('NarratoAPI is not available')}: {str(e)}"
-
+
     else:
-        return False, f"{tr('Unsupported provider')}: {provider}"
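+        # Providers without a dedicated branch (siliconflow, openai, qwenvl, ...) are
+        # assumed to expose an OpenAI-compatible API; the check sends one public test
+        # image through the chat.completions endpoint to verify vision capability.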
model is available") + else: + return False, tr("QwenVL model returned invalid response") + + except Exception as e: + # logger.debug(api_key) + # logger.debug(base_url) + # logger.debug(model_name) + return False, f"{tr('QwenVL model is not available')}: {str(e)}" def render_vision_llm_settings(tr): @@ -158,7 +172,7 @@ def render_vision_llm_settings(tr): st.subheader(tr("Vision Model Settings")) # 视频分析模型提供商选择 - vision_providers = ['Gemini', 'QwenVL', 'NarratoAPI(待发布)'] + vision_providers = ['Siliconflow', 'Gemini', 'QwenVL', 'OpenAI'] saved_vision_provider = config.app.get("vision_llm_provider", "Gemini").lower() saved_provider_index = 0 @@ -194,8 +208,8 @@ def render_vision_llm_settings(tr): ) st_vision_model_name = st.text_input( tr("Vision Model Name"), - value=vision_model_name or "gemini-1.5-flash", - help=tr("Default: gemini-1.5-flash") + value=vision_model_name or "gemini-2.0-flash-lite", + help=tr("Default: gemini-2.0-flash-lite") ) elif vision_provider == 'qwenvl': st_vision_base_url = st.text_input( @@ -261,52 +275,45 @@ def test_text_model_connection(api_key, base_url, model_name, provider, tr): "Authorization": f"Bearer {api_key}", "Content-Type": "application/json" } - - # 如果没有指定base_url,使用默认值 - if not base_url: - if provider.lower() == 'openai': - base_url = "https://api.openai.com/v1" - elif provider.lower() == 'moonshot': - base_url = "https://api.moonshot.cn/v1" - elif provider.lower() == 'deepseek': - base_url = "https://api.deepseek.com" - - # 构建测试URL - test_url = f"{base_url.rstrip('/')}/chat/completions" - + # 特殊处理Gemini if provider.lower() == 'gemini': import google.generativeai as genai try: genai.configure(api_key=api_key) - model = genai.GenerativeModel(model_name or 'gemini-pro') + model = genai.GenerativeModel(model_name) model.generate_content("直接回复我文本'当前网络可用'") return True, tr("Gemini model is available") except Exception as e: return False, f"{tr('Gemini model is not available')}: {str(e)}" - - # 构建测试消息 - test_data = { - "model": model_name, - "messages": [ - {"role": "user", "content": "直接回复我文本'当前网络可用'"} - ], - "stream": False - } - - # 发送测试请求 - response = requests.post( - test_url, - headers=headers, - json=test_data, - ) - - if response.status_code == 200: - return True, tr("Text model is available") else: - return False, f"{tr('Text model is not available')}: HTTP {response.status_code}" + test_url = f"{base_url.rstrip('/')}/chat/completions" + + # 构建测试消息 + test_data = { + "model": model_name, + "messages": [ + {"role": "user", "content": "直接回复我文本'当前网络可用'"} + ], + "stream": False + } + + # 发送测试请求 + response = requests.post( + test_url, + headers=headers, + json=test_data, + ) + # logger.debug(model_name) + # logger.debug(api_key) + # logger.debug(test_url) + if response.status_code == 200: + return True, tr("Text model is available") + else: + return False, f"{tr('Text model is not available')}: HTTP {response.status_code}" except Exception as e: + logger.error(traceback.format_exc()) return False, f"{tr('Connection failed')}: {str(e)}" @@ -315,8 +322,8 @@ def render_text_llm_settings(tr): st.subheader(tr("Text Generation Model Settings")) # 文案生成模型提供商选择 - text_providers = ['DeepSeek', 'OpenAI', 'Siliconflow', 'Qwen', 'Moonshot', 'Gemini'] - saved_text_provider = config.app.get("text_llm_provider", "DeepSeek").lower() + text_providers = ['OpenAI', 'Siliconflow', 'DeepSeek', 'Gemini', 'Qwen', 'Moonshot'] + saved_text_provider = config.app.get("text_llm_provider", "OpenAI").lower() saved_provider_index = 0 for i, provider in enumerate(text_providers): @@ 
@@ -344,8 +351,6 @@ def render_text_llm_settings(tr):

     # 添加测试按钮
     if st.button(tr("Test Connection"), key="test_text_connection"):
-        logger.debug(st_text_base_url)
-        logger.debug(st_text_model_name)
         with st.spinner(tr("Testing connection...")):
             success, message = test_text_model_connection(
                 api_key=st_text_api_key,
diff --git a/webui/tools/base.py b/webui/tools/base.py
index 06b749a..439e465 100644
--- a/webui/tools/base.py
+++ b/webui/tools/base.py
@@ -24,15 +24,13 @@ def create_vision_analyzer(provider, api_key, model, base_url):
     """
     if provider == 'gemini':
         return gemini_analyzer.VisionAnalyzer(model_name=model, api_key=api_key)
-    elif provider == 'qwenvl':
+    else:
         # 只传入必要的参数
         return qwenvl_analyzer.QwenAnalyzer(
             model_name=model,
             api_key=api_key,
             base_url=base_url
         )
-    else:
-        raise ValueError(f"不支持的视觉分析提供商: {provider}")


 def get_batch_timestamps(batch_files, prev_batch_files=None):
diff --git a/webui/tools/generate_script_docu.py b/webui/tools/generate_script_docu.py
index 5f958ba..92cab5f 100644
--- a/webui/tools/generate_script_docu.py
+++ b/webui/tools/generate_script_docu.py
@@ -4,16 +4,12 @@
 import json
 import time
 import asyncio
 import traceback
-import requests
-from app.utils import video_processor
 import streamlit as st
 from loguru import logger
-from requests.adapters import HTTPAdapter
 from datetime import datetime

 from app.config import config
-from app.utils.script_generator import ScriptProcessor
-from app.utils import utils, video_processor, qwenvl_analyzer
+from app.utils import utils, video_processor
 from webui.tools.base import create_vision_analyzer, get_batch_files, get_batch_timestamps, chekc_video_config


@@ -111,12 +107,10 @@ def generate_script_docu(params):
         vision_api_key = st.session_state.get('vision_gemini_api_key')
         vision_model = st.session_state.get('vision_gemini_model_name')
         vision_base_url = st.session_state.get('vision_gemini_base_url')
-    elif vision_llm_provider == 'qwenvl':
-        vision_api_key = st.session_state.get('vision_qwenvl_api_key')
-        vision_model = st.session_state.get('vision_qwenvl_model_name', 'qwen-vl-max-latest')
-        vision_base_url = st.session_state.get('vision_qwenvl_base_url')
     else:
-        raise ValueError(f"不支持的视觉分析提供商: {vision_llm_provider}")
+        vision_api_key = st.session_state.get(f'vision_{vision_llm_provider}_api_key')
+        vision_model = st.session_state.get(f'vision_{vision_llm_provider}_model_name')
+        vision_base_url = st.session_state.get(f'vision_{vision_llm_provider}_base_url')

     # 创建视觉分析器实例
     analyzer = create_vision_analyzer(
@@ -354,7 +348,6 @@ def generate_script_docu(params):
     # 整理帧分析数据
     markdown_output = parse_frame_analysis_to_markdown(analysis_json_path)

-    # 生成文案
     # 生成解说文案
     narration = generate_narration(
         markdown_output,
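
Taken together, the webui changes replace per-provider elif branches with a naming convention: any vision provider other than gemini is treated as OpenAI-compatible, and its settings are resolved from vision_<provider>_* keys. A minimal sketch of how the refactored lookup composes with create_vision_analyzer (the session_state dict and key values below are illustrative placeholders, not real credentials):

    # Sketch: provider-generic settings lookup, mirroring generate_script_docu.py.
    # The vision_<provider>_* key convention comes from config.example.toml above;
    # the values are placeholders.
    session_state = {
        "vision_siliconflow_api_key": "sk-placeholder",
        "vision_siliconflow_model_name": "Qwen/Qwen2.5-VL-32B-Instruct",
        "vision_siliconflow_base_url": "https://api.siliconflow.cn/v1",
    }

    provider = "siliconflow"  # vision_llm_provider, lowercased by the webui

    if provider == "gemini":
        pass  # gemini keeps its dedicated gemini_analyzer.VisionAnalyzer branch
    else:
        # Every other provider resolves purely by name, so adding one needs
        # only new config entries, not a new code branch.
        vision_api_key = session_state.get(f"vision_{provider}_api_key")
        vision_model = session_state.get(f"vision_{provider}_model_name")
        vision_base_url = session_state.get(f"vision_{provider}_base_url")
        # analyzer = create_vision_analyzer(provider, vision_api_key,
        #                                   vision_model, vision_base_url)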