feat(webui): 重构视觉分析功能并添加新模型支持

- 移除了对 QwenVL 模型的特定逻辑,改为更通用的实现
- 添加了对 OpenAI 视觉模型的支持
- 更新了视觉模型设置界面,增加了新的模型选项
- 重构了测试连接和创建分析器的代码,提高了可维护性
- 调整了配置文件结构,简化了视觉模型的配置
This commit is contained in:
linyq 2025-05-09 12:03:27 +08:00
parent afeeb7c516
commit 3fe8eb50c0
6 changed files with 147 additions and 241 deletions

View File

@ -237,28 +237,28 @@ if __name__ == '__main__':
video_frame_description_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_1139.json" video_frame_description_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_1139.json"
# 测试新的JSON文件 # 测试新的JSON文件
test_file_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_1458.json" test_file_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_2258.json"
markdown_output = parse_frame_analysis_to_markdown(test_file_path) markdown_output = parse_frame_analysis_to_markdown(test_file_path)
# print(markdown_output) # print(markdown_output)
# 输出到文件以便检查格式 # 输出到文件以便检查格式
output_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/narration_script.md" output_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/家里家外1-5.md"
with open(output_file, 'w', encoding='utf-8') as f: with open(output_file, 'w', encoding='utf-8') as f:
f.write(markdown_output) f.write(markdown_output)
# print(f"\n已将Markdown输出保存到: {output_file}") # print(f"\n已将Markdown输出保存到: {output_file}")
# 生成解说文案 # # 生成解说文案
narration = generate_narration( # narration = generate_narration(
markdown_output, # markdown_output,
text_api_key, # text_api_key,
base_url=text_base_url, # base_url=text_base_url,
model=text_model # model=text_model
) # )
#
# 保存解说文案 # # 保存解说文案
print(narration) # print(narration)
print(type(narration)) # print(type(narration))
narration_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/final_narration_script.json" # narration_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/final_narration_script.json"
with open(narration_file, 'w', encoding='utf-8') as f: # with open(narration_file, 'w', encoding='utf-8') as f:
f.write(narration) # f.write(narration)
print(f"\n已将解说文案保存到: {narration_file}") # print(f"\n已将解说文案保存到: {narration_file}")

View File

@ -1,175 +1,85 @@
[app] [app]
project_version="0.6.0" project_version="0.6.0"
# 支持视频理解的大模型提供商 # 支持视频理解的大模型提供商
# gemini # gemini (谷歌, 需要 VPN)
# qwenvl # siliconflow (硅基流动)
vision_llm_provider="qwenvl" # qwenvl (通义千问)
vision_llm_provider="Siliconflow"
########## Vision Gemini API Key ########## Gemini 视觉模型
vision_gemini_api_key = "" vision_gemini_api_key = ""
vision_gemini_model_name = "gemini-2.0-flash" vision_gemini_model_name = "gemini-2.0-flash-lite"
########## Vision Qwen API Key (默认使用“硅基流动”的QwenVL模型) ########## QwenVL 视觉模型
vision_qwenvl_api_key = "" vision_qwenvl_api_key = ""
vision_qwenvl_model_name = "Qwen/Qwen2.5-VL-32B-Instruct" vision_qwenvl_model_name = "qwen2.5-vl-32b-instruct"
vision_qwenvl_base_url = "https://api.siliconflow.cn/v1" vision_qwenvl_base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
########### Vision NarratoAPI Key ########## siliconflow 视觉模型
vision_siliconflow_api_key = ""
vision_siliconflow_model_name = "Qwen/Qwen2.5-VL-32B-Instruct"
vision_siliconflow_base_url = "https://api.siliconflow.cn/v1"
########## OpenAI 视觉模型
vision_openai_api_key = ""
vision_openai_model_name = "gpt-4.1-nano-2025-04-14"
vision_openai_base_url = "https://api.openai.com/v1"
########### NarratoAPI 微调模型 (未发布)
narrato_api_key = "ggyY91BAO-_ULvAqKum3XexcyN1G3dP86DEzvjZDcrg" narrato_api_key = "ggyY91BAO-_ULvAqKum3XexcyN1G3dP86DEzvjZDcrg"
narrato_api_url = "https://narratoinsight.scsmtech.cn/api/v1" narrato_api_url = "https://narratoinsight.scsmtech.cn/api/v1"
narrato_vision_model = "gemini-1.5-flash" narrato_model = "narra-1.0-2025-05-09"
narrato_vision_key = ""
narrato_llm_model = "gpt-4o"
narrato_llm_key = ""
# 用于生成文案的大模型支持的提供商 (Supported providers): # 用于生成文案的大模型支持的提供商 (Supported providers):
# openai (默认) # openai (默认, 需要 VPN)
# deepseek (默认使用“硅基流动”的模型) # siliconflow (硅基流动)
# moonshot (月之暗面) # deepseek (深度求索)
# gemini (谷歌, 需要 VPN)
# qwen (通义千问) # qwen (通义千问)
# gemini # moonshot (月之暗面)
text_llm_provider="deepseek" text_llm_provider="openai"
########## OpenAI API Key ########## OpenAI API Key
# Get your API key at https://platform.openai.com/api-keys # Get your API key at https://platform.openai.com/api-keys
text_openai_api_key = "" text_openai_api_key = ""
text_openai_base_url = "https://api.openai.com/v1" text_openai_base_url = "https://api.openai.com/v1"
text_openai_model_name = "gpt-4o-mini" text_openai_model_name = "gpt-4.1-mini-2025-04-14"
# 使用 硅基流动 第三方 API Key,使用手机号注册:https://cloud.siliconflow.cn/i/pyOKqFCV
# 访问 https://cloud.siliconflow.cn/account/ak 获取你的 API 密钥
text_siliconflow_api_key = ""
text_siliconflow_base_url = "https://api.siliconflow.cn/v1"
text_siliconflow_model_name = "deepseek-ai/DeepSeek-R1"
########## DeepSeek API Key ########## DeepSeek API Key
# 使用 硅基流动 第三方 API Key使用手机号注册https://cloud.siliconflow.cn/i/pyOKqFCV # 访问 https://platform.deepseek.com/api_keys 获取你的 API 密钥
text_deepseek_api_key = "" text_deepseek_api_key = ""
text_deepseek_base_url = "https://api.siliconflow.cn/v1" text_deepseek_base_url = "https://api.deepseek.com"
text_deepseek_model_name = "deepseek-ai/DeepSeek-V3" text_deepseek_model_name = "deepseek-chat"
########## Moonshot API Key
# Visit https://platform.moonshot.cn/console/api-keys to get your API key.
text_moonshot_api_key=""
text_moonshot_base_url = "https://api.moonshot.cn/v1"
text_moonshot_model_name = "moonshot-v1-8k"
########## G4F
# Visit https://github.com/xtekky/gpt4free to get more details
# Supported model list: https://github.com/xtekky/gpt4free/blob/main/g4f/models.py
text_g4f_model_name = "gpt-3.5-turbo"
########## Azure API Key
# Visit https://learn.microsoft.com/zh-cn/azure/ai-services/openai/ to get more details
# API documentation: https://learn.microsoft.com/zh-cn/azure/ai-services/openai/reference
text_azure_api_key = ""
text_azure_base_url=""
text_azure_model_name="gpt-35-turbo" # replace with your model deployment name
text_azure_api_version = "2024-02-15-preview"
########## Gemini API Key ########## Gemini API Key
text_gemini_api_key="" text_gemini_api_key=""
text_gemini_model_name = "gemini-1.5-flash" text_gemini_model_name = "gemini-2.0-flash"
########## Qwen API Key ########## Qwen API Key
# Visit https://dashscope.console.aliyun.com/apiKey to get your API key # 访问 https://bailian.console.aliyun.com/?tab=model#/api-key 获取你的 API 密钥
# Visit below links to get more details
# https://tongyi.aliyun.com/qianwen/
# https://help.aliyun.com/zh/dashscope/developer-reference/model-introduction
text_qwen_api_key = "" text_qwen_api_key = ""
text_qwen_model_name = "qwen-plus-1127" text_qwen_model_name = "qwen-plus-1127"
text_qwen_base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1" text_qwen_base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
########## Moonshot API Key
# 字幕提供商、可选,支持 whisper 和 faster-whisper-large-v2"whisper" # 访问 https://platform.moonshot.cn/console/api-keys 获取你的 API 密钥
# 默认为 faster-whisper-large-v2 模型地址https://huggingface.co/guillaumekln/faster-whisper-large-v2 text_moonshot_api_key=""
subtitle_provider = "faster-whisper-large-v2" text_moonshot_base_url = "https://api.moonshot.cn/v1"
subtitle_enabled = true text_moonshot_model_name = "moonshot-v1-8k"
# ImageMagick
# 安装后,将自动检测到 ImageMagickWindows 除外!
# 例如,在 Windows 上 "C:\Program Files (x86)\ImageMagick-7.1.1-Q16-HDRI\magick.exe"
# 下载位置 https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe
# imagemagick_path = "C:\\Program Files (x86)\\ImageMagick-7.1.1-Q16\\magick.exe"
# FFMPEG
#
# 通常情况下ffmpeg 会被自动下载,并且会被自动检测到。
# 但是如果你的环境有问题,无法自动下载,可能会遇到如下错误:
# RuntimeError: No ffmpeg exe could be found.
# Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable.
# 此时你可以手动下载 ffmpeg 并设置 ffmpeg_path下载地址https://www.gyan.dev/ffmpeg/builds/
# ffmpeg_path = "C:\\Users\\harry\\Downloads\\ffmpeg.exe"
#########################################################################################
# 当视频生成成功后API服务提供的视频下载接入点默认为当前服务的地址和监听端口
# 比如 http://127.0.0.1:8080/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# 如果你需要使用域名对外提供服务一般会用nginx做代理则可以设置为你的域名
# 比如 https://xxxx.com/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# endpoint="https://xxxx.com"
# When the video is successfully generated, the API service provides a download endpoint for the video, defaulting to the service's current address and listening port.
# For example, http://127.0.0.1:8080/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# If you need to provide the service externally using a domain name (usually done with nginx as a proxy), you can set it to your domain name.
# For example, https://xxxx.com/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# endpoint="https://xxxx.com"
endpoint=""
# Video material storage location
# material_directory = "" # Indicates that video materials will be downloaded to the default folder, the default folder is ./storage/cache_videos under the current project
# material_directory = "/user/harry/videos" # Indicates that video materials will be downloaded to a specified folder
# material_directory = "task" # Indicates that video materials will be downloaded to the current task's folder, this method does not allow sharing of already downloaded video materials
# 视频素材存放位置
# material_directory = "" #表示将视频素材下载到默认的文件夹,默认文件夹为当前项目下的 ./storage/cache_videos
# material_directory = "/user/harry/videos" #表示将视频素材下载到指定的文件夹中
# material_directory = "task" #表示将视频素材下载到当前任务的文件夹中,这种方式无法共享已经下载的视频素材
material_directory = ""
# 用于任务的状态管理
enable_redis = false
redis_host = "localhost"
redis_port = 6379
redis_db = 0
redis_password = ""
# 文生视频时的最大并发任务数
max_concurrent_tasks = 5
# webui界面是否显示配置项 # webui界面是否显示配置项
hide_config = false hide_config = true
[whisper]
# Only effective when subtitle_provider is "whisper"
# Run on GPU with FP16
# model = WhisperModel(model_size, device="cuda", compute_type="float16")
# Run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# Run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")
# recommended model_size: "large-v3"
model_size="faster-whisper-large-v2"
# 如果要使用 GPU请设置 device=“cuda”
device="CPU"
compute_type="int8"
[proxy] [proxy]
### Use a proxy to access the Pexels API
### Format: "http://<username>:<password>@<proxy>:<port>"
### Example: "http://user:pass@proxy:1234"
### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
http = "http://127.0.0.1:7890" http = "http://127.0.0.1:7890"
https = "http://127.0.0.1:7890" https = "http://127.0.0.1:7890"
enabled = false
[azure]
# Azure Speech API Key
# Get your API key at https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices
speech_key=""
speech_region=""
[frames] [frames]
# 提取关键帧的间隔时间 # 提取关键帧的间隔时间

View File

@ -5,7 +5,7 @@ from loguru import logger
from app.config import config from app.config import config
from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, \ from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, \
review_settings, merge_settings, system_settings review_settings, merge_settings, system_settings
from webui.utils import cache, file_utils # from webui.utils import cache, file_utils
from app.utils import utils from app.utils import utils
from app.models.schema import VideoClipParams, VideoAspect from app.models.schema import VideoClipParams, VideoAspect
@ -184,7 +184,7 @@ def render_generate_button():
except Exception as e: except Exception as e:
logger.error(f"播放视频失败: {e}") logger.error(f"播放视频失败: {e}")
file_utils.open_task_folder(config.root_dir, task_id) # file_utils.open_task_folder(config.root_dir, task_id)
logger.info(tr("视频生成完成")) logger.info(tr("视频生成完成"))

View File

@ -64,25 +64,25 @@ def render_proxy_settings(tr):
proxy_enabled = st.checkbox(tr("Enable Proxy"), value=proxy_enabled) proxy_enabled = st.checkbox(tr("Enable Proxy"), value=proxy_enabled)
# 保存代理开关状态 # 保存代理开关状态
config.proxy["enabled"] = proxy_enabled # config.proxy["enabled"] = proxy_enabled
# 只有在代理启用时才显示代理设置输入框 # 只有在代理启用时才显示代理设置输入框
if proxy_enabled: if proxy_enabled:
HTTP_PROXY = st.text_input(tr("HTTP_PROXY"), value=proxy_url_http) HTTP_PROXY = st.text_input(tr("HTTP_PROXY"), value=proxy_url_http)
HTTPS_PROXY = st.text_input(tr("HTTPs_PROXY"), value=proxy_url_https) HTTPS_PROXY = st.text_input(tr("HTTPs_PROXY"), value=proxy_url_https)
if HTTP_PROXY: if HTTP_PROXY and HTTPS_PROXY:
config.proxy["http"] = HTTP_PROXY config.proxy["http"] = HTTP_PROXY
os.environ["HTTP_PROXY"] = HTTP_PROXY
if HTTPS_PROXY:
config.proxy["https"] = HTTPS_PROXY config.proxy["https"] = HTTPS_PROXY
os.environ["HTTP_PROXY"] = HTTP_PROXY
os.environ["HTTPS_PROXY"] = HTTPS_PROXY os.environ["HTTPS_PROXY"] = HTTPS_PROXY
# logger.debug(f"代理已启用: {HTTP_PROXY}")
else: else:
# 当代理被禁用时,清除环境变量和配置 # 当代理被禁用时,清除环境变量和配置
os.environ.pop("HTTP_PROXY", None) os.environ.pop("HTTP_PROXY", None)
os.environ.pop("HTTPS_PROXY", None) os.environ.pop("HTTPS_PROXY", None)
config.proxy["http"] = "" # config.proxy["http"] = ""
config.proxy["https"] = "" # config.proxy["https"] = ""
def test_vision_model_connection(api_key, base_url, model_name, provider, tr): def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
@ -108,29 +108,6 @@ def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
return True, tr("gemini model is available") return True, tr("gemini model is available")
except Exception as e: except Exception as e:
return False, f"{tr('gemini model is not available')}: {str(e)}" return False, f"{tr('gemini model is not available')}: {str(e)}"
elif provider.lower() == 'qwenvl':
from openai import OpenAI
try:
client = OpenAI(
api_key=api_key,
base_url=base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
)
# 发送一个简单的测试请求
response = client.chat.completions.create(
model=model_name or "qwen-vl-max-latest",
messages=[{"role": "user", "content": "直接回复我文本'当前网络可用'"}]
)
if response and response.choices:
return True, tr("QwenVL model is available")
else:
return False, tr("QwenVL model returned invalid response")
except Exception as e:
return False, f"{tr('QwenVL model is not available')}: {str(e)}"
elif provider.lower() == 'narratoapi': elif provider.lower() == 'narratoapi':
import requests import requests
try: try:
@ -148,9 +125,46 @@ def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
return False, f"{tr('NarratoAPI is not available')}: HTTP {response.status_code}" return False, f"{tr('NarratoAPI is not available')}: HTTP {response.status_code}"
except Exception as e: except Exception as e:
return False, f"{tr('NarratoAPI is not available')}: {str(e)}" return False, f"{tr('NarratoAPI is not available')}: {str(e)}"
else: else:
return False, f"{tr('Unsupported provider')}: {provider}" from openai import OpenAI
try:
client = OpenAI(
api_key=api_key,
base_url=base_url,
)
response = client.chat.completions.create(
model=model_name,
messages=[
{
"role": "system",
"content": [{"type": "text", "text": "You are a helpful assistant."}],
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"
},
},
{"type": "text", "text": "回复我网络可用即可"},
],
},
],
)
if response and response.choices:
return True, tr("QwenVL model is available")
else:
return False, tr("QwenVL model returned invalid response")
except Exception as e:
# logger.debug(api_key)
# logger.debug(base_url)
# logger.debug(model_name)
return False, f"{tr('QwenVL model is not available')}: {str(e)}"
def render_vision_llm_settings(tr): def render_vision_llm_settings(tr):
@ -158,7 +172,7 @@ def render_vision_llm_settings(tr):
st.subheader(tr("Vision Model Settings")) st.subheader(tr("Vision Model Settings"))
# 视频分析模型提供商选择 # 视频分析模型提供商选择
vision_providers = ['Gemini', 'QwenVL', 'NarratoAPI(待发布)'] vision_providers = ['Siliconflow', 'Gemini', 'QwenVL', 'OpenAI']
saved_vision_provider = config.app.get("vision_llm_provider", "Gemini").lower() saved_vision_provider = config.app.get("vision_llm_provider", "Gemini").lower()
saved_provider_index = 0 saved_provider_index = 0
@ -194,8 +208,8 @@ def render_vision_llm_settings(tr):
) )
st_vision_model_name = st.text_input( st_vision_model_name = st.text_input(
tr("Vision Model Name"), tr("Vision Model Name"),
value=vision_model_name or "gemini-1.5-flash", value=vision_model_name or "gemini-2.0-flash-lite",
help=tr("Default: gemini-1.5-flash") help=tr("Default: gemini-2.0-flash-lite")
) )
elif vision_provider == 'qwenvl': elif vision_provider == 'qwenvl':
st_vision_base_url = st.text_input( st_vision_base_url = st.text_input(
@ -261,52 +275,45 @@ def test_text_model_connection(api_key, base_url, model_name, provider, tr):
"Authorization": f"Bearer {api_key}", "Authorization": f"Bearer {api_key}",
"Content-Type": "application/json" "Content-Type": "application/json"
} }
# 如果没有指定base_url使用默认值
if not base_url:
if provider.lower() == 'openai':
base_url = "https://api.openai.com/v1"
elif provider.lower() == 'moonshot':
base_url = "https://api.moonshot.cn/v1"
elif provider.lower() == 'deepseek':
base_url = "https://api.deepseek.com"
# 构建测试URL
test_url = f"{base_url.rstrip('/')}/chat/completions"
# 特殊处理Gemini # 特殊处理Gemini
if provider.lower() == 'gemini': if provider.lower() == 'gemini':
import google.generativeai as genai import google.generativeai as genai
try: try:
genai.configure(api_key=api_key) genai.configure(api_key=api_key)
model = genai.GenerativeModel(model_name or 'gemini-pro') model = genai.GenerativeModel(model_name)
model.generate_content("直接回复我文本'当前网络可用'") model.generate_content("直接回复我文本'当前网络可用'")
return True, tr("Gemini model is available") return True, tr("Gemini model is available")
except Exception as e: except Exception as e:
return False, f"{tr('Gemini model is not available')}: {str(e)}" return False, f"{tr('Gemini model is not available')}: {str(e)}"
# 构建测试消息
test_data = {
"model": model_name,
"messages": [
{"role": "user", "content": "直接回复我文本'当前网络可用'"}
],
"stream": False
}
# 发送测试请求
response = requests.post(
test_url,
headers=headers,
json=test_data,
)
if response.status_code == 200:
return True, tr("Text model is available")
else: else:
return False, f"{tr('Text model is not available')}: HTTP {response.status_code}" test_url = f"{base_url.rstrip('/')}/chat/completions"
# 构建测试消息
test_data = {
"model": model_name,
"messages": [
{"role": "user", "content": "直接回复我文本'当前网络可用'"}
],
"stream": False
}
# 发送测试请求
response = requests.post(
test_url,
headers=headers,
json=test_data,
)
# logger.debug(model_name)
# logger.debug(api_key)
# logger.debug(test_url)
if response.status_code == 200:
return True, tr("Text model is available")
else:
return False, f"{tr('Text model is not available')}: HTTP {response.status_code}"
except Exception as e: except Exception as e:
logger.error(traceback.format_exc())
return False, f"{tr('Connection failed')}: {str(e)}" return False, f"{tr('Connection failed')}: {str(e)}"
@ -315,8 +322,8 @@ def render_text_llm_settings(tr):
st.subheader(tr("Text Generation Model Settings")) st.subheader(tr("Text Generation Model Settings"))
# 文案生成模型提供商选择 # 文案生成模型提供商选择
text_providers = ['DeepSeek', 'OpenAI', 'Siliconflow', 'Qwen', 'Moonshot', 'Gemini'] text_providers = ['OpenAI', 'Siliconflow', 'DeepSeek', 'Gemini', 'Qwen', 'Moonshot']
saved_text_provider = config.app.get("text_llm_provider", "DeepSeek").lower() saved_text_provider = config.app.get("text_llm_provider", "OpenAI").lower()
saved_provider_index = 0 saved_provider_index = 0
for i, provider in enumerate(text_providers): for i, provider in enumerate(text_providers):
@ -344,8 +351,6 @@ def render_text_llm_settings(tr):
# 添加测试按钮 # 添加测试按钮
if st.button(tr("Test Connection"), key="test_text_connection"): if st.button(tr("Test Connection"), key="test_text_connection"):
logger.debug(st_text_base_url)
logger.debug(st_text_model_name)
with st.spinner(tr("Testing connection...")): with st.spinner(tr("Testing connection...")):
success, message = test_text_model_connection( success, message = test_text_model_connection(
api_key=st_text_api_key, api_key=st_text_api_key,

View File

@ -24,15 +24,13 @@ def create_vision_analyzer(provider, api_key, model, base_url):
""" """
if provider == 'gemini': if provider == 'gemini':
return gemini_analyzer.VisionAnalyzer(model_name=model, api_key=api_key) return gemini_analyzer.VisionAnalyzer(model_name=model, api_key=api_key)
elif provider == 'qwenvl': else:
# 只传入必要的参数 # 只传入必要的参数
return qwenvl_analyzer.QwenAnalyzer( return qwenvl_analyzer.QwenAnalyzer(
model_name=model, model_name=model,
api_key=api_key, api_key=api_key,
base_url=base_url base_url=base_url
) )
else:
raise ValueError(f"不支持的视觉分析提供商: {provider}")
def get_batch_timestamps(batch_files, prev_batch_files=None): def get_batch_timestamps(batch_files, prev_batch_files=None):

View File

@ -4,16 +4,12 @@ import json
import time import time
import asyncio import asyncio
import traceback import traceback
import requests
from app.utils import video_processor
import streamlit as st import streamlit as st
from loguru import logger from loguru import logger
from requests.adapters import HTTPAdapter
from datetime import datetime from datetime import datetime
from app.config import config from app.config import config
from app.utils.script_generator import ScriptProcessor from app.utils import utils, video_processor
from app.utils import utils, video_processor, qwenvl_analyzer
from webui.tools.base import create_vision_analyzer, get_batch_files, get_batch_timestamps, chekc_video_config from webui.tools.base import create_vision_analyzer, get_batch_files, get_batch_timestamps, chekc_video_config
@ -111,12 +107,10 @@ def generate_script_docu(params):
vision_api_key = st.session_state.get('vision_gemini_api_key') vision_api_key = st.session_state.get('vision_gemini_api_key')
vision_model = st.session_state.get('vision_gemini_model_name') vision_model = st.session_state.get('vision_gemini_model_name')
vision_base_url = st.session_state.get('vision_gemini_base_url') vision_base_url = st.session_state.get('vision_gemini_base_url')
elif vision_llm_provider == 'qwenvl':
vision_api_key = st.session_state.get('vision_qwenvl_api_key')
vision_model = st.session_state.get('vision_qwenvl_model_name', 'qwen-vl-max-latest')
vision_base_url = st.session_state.get('vision_qwenvl_base_url')
else: else:
raise ValueError(f"不支持的视觉分析提供商: {vision_llm_provider}") vision_api_key = st.session_state.get(f'vision_{vision_llm_provider}_api_key')
vision_model = st.session_state.get(f'vision_{vision_llm_provider}_model_name')
vision_base_url = st.session_state.get(f'vision_{vision_llm_provider}_base_url')
# 创建视觉分析器实例 # 创建视觉分析器实例
analyzer = create_vision_analyzer( analyzer = create_vision_analyzer(
@ -354,7 +348,6 @@ def generate_script_docu(params):
# 整理帧分析数据 # 整理帧分析数据
markdown_output = parse_frame_analysis_to_markdown(analysis_json_path) markdown_output = parse_frame_analysis_to_markdown(analysis_json_path)
# 生成文案
# 生成解说文案 # 生成解说文案
narration = generate_narration( narration = generate_narration(
markdown_output, markdown_output,