From 6c6ceeebea234156c311d2729a90ad2cba264cd4 Mon Sep 17 00:00:00 2001
From: linyq
Date: Thu, 8 May 2025 11:00:47 +0800
Subject: [PATCH] Remove the default base_url in qwenvl_analyzer.py; add a
 format_time helper to utils.py that formats seconds as HH:MM:SS,mmm; rework
 the result handling in generate_script_docu.py to merge the per-batch
 analysis results and save them as JSON, while staying compatible with the
 old text-format output.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/utils/qwenvl_analyzer.py        |   2 +-
 app/utils/utils.py                  |  22 ++
 webui/tools/generate_script_docu.py | 188 ++++++++++++++++++++++------
 3 files changed, 175 insertions(+), 37 deletions(-)

diff --git a/app/utils/qwenvl_analyzer.py b/app/utils/qwenvl_analyzer.py
index ec4de39..6d1669a 100644
--- a/app/utils/qwenvl_analyzer.py
+++ b/app/utils/qwenvl_analyzer.py
@@ -30,7 +30,7 @@ class QwenAnalyzer:
         self.model_name = model_name
         self.api_key = api_key
-        self.base_url = base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
+        self.base_url = base_url
 
         # Configure the API client
         self._configure_client()
diff --git a/app/utils/utils.py b/app/utils/utils.py
index e80cd87..56eba09 100644
--- a/app/utils/utils.py
+++ b/app/utils/utils.py
@@ -197,6 +197,28 @@ def time_convert_seconds_to_hmsm(seconds) -> str:
     return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, seconds, milliseconds)
 
 
+def format_time(seconds: float) -> str:
+    """
+    Convert a number of seconds into a formatted time string (HH:MM:SS,mmm).
+
+    Args:
+        seconds: the seconds to convert; may be an int or a float
+
+    Returns:
+        A formatted time string of the form HH:MM:SS,mmm
+    """
+    # Split into hours, minutes, seconds and milliseconds
+    hours = int(seconds // 3600)
+    remaining_seconds = seconds % 3600
+    minutes = int(remaining_seconds // 60)
+    remaining_seconds = remaining_seconds % 60
+    secs = int(remaining_seconds)
+    milliseconds = int((remaining_seconds - secs) * 1000)
+
+    # Format as a time string
+    return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, secs, milliseconds)
+
+
 def text_to_srt(idx: int, msg: str, start_time: float, end_time: float) -> str:
     start_time = time_convert_seconds_to_hmsm(start_time)
     end_time = time_convert_seconds_to_hmsm(end_time)
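A quick sanity check of the new helper, written as doctest-style asserts (an illustrative sketch, not part of the patch; it assumes the module imports as app.utils.utils). Note that int() truncates the millisecond fraction, so a float landing just under a millisecond boundary can come out one millisecond low:

    from app.utils.utils import format_time

    assert format_time(0) == "00:00:00,000"
    assert format_time(27.0) == "00:00:27,000"    # keyframe_000675_000027000.jpg -> 27000 ms / 1000
    assert format_time(83.5) == "00:01:23,500"
    assert format_time(3661.0) == "01:01:01,000"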
diff --git a/webui/tools/generate_script_docu.py b/webui/tools/generate_script_docu.py
index ee388dc..21abcab 100644
--- a/webui/tools/generate_script_docu.py
+++ b/webui/tools/generate_script_docu.py
@@ -96,7 +96,7 @@ def generate_script_docu(params):
         raise Exception(f"Keyframe extraction failed: {str(e)}")
 
     """
-    2. Visual analysis
+    2. Visual analysis (batch-analyze every frame)
    """
     vision_llm_provider = st.session_state.get('vision_llm_providers').lower()
     logger.debug(f"VLM vision model provider: {vision_llm_provider}")
@@ -167,37 +167,138 @@ def generate_script_docu(params):
     # =================== Process the analysis results ===================
     update_progress(60, "Organizing analysis results...")
 
-    # Merge results of all batches
+    # Merge the analysis results of all batches
     frame_analysis = ""
+    merged_frame_observations = []  # frame observations merged from all batches
+    overall_activity_summaries = []  # overall summaries merged from all batches
     prev_batch_files = None
-
+    frame_counter = 1  # frame counter used to assign consecutive numbers to all frames
+    logger.debug(json.dumps(results, indent=4, ensure_ascii=False))
+
     for result in results:
         if 'error' in result:
             logger.warning(f"Batch {result['batch_index']} processing produced a warning: {result['error']}")
-
-        # Get the file list of the current batch; keyframe_001136_000045.jpg, raise the 000045 part to millisecond precision
+            continue
+
+        # Get the file list of the current batch
         batch_files = get_batch_files(keyframe_files, result, vision_batch_size)
         logger.debug(f"Batch {result['batch_index']} processed, {len(batch_files)} images in total")
-        # logger.debug(batch_files)
-
-        first_timestamp, last_timestamp, _ = get_batch_timestamps(batch_files, prev_batch_files)
+
+        # Get the timestamp range of the batch
+        first_timestamp, last_timestamp, timestamp_range = get_batch_timestamps(batch_files, prev_batch_files)
         logger.debug(f"Processing timestamps: {first_timestamp}-{last_timestamp}")
-
-        # Append the analysis result with its timestamps
-        frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n"
-        frame_analysis += result['response']
-        frame_analysis += "\n"
-
+
+        # Parse the JSON data in the response
+        response_text = result['response']
+        try:
+            # Handle responses that may be wrapped in ```json``` fences
+            if "```json" in response_text:
+                json_content = response_text.split("```json")[1].split("```")[0].strip()
+            elif "```" in response_text:
+                json_content = response_text.split("```")[1].split("```")[0].strip()
+            else:
+                json_content = response_text.strip()
+
+            response_data = json.loads(json_content)
+
+            # Extract frame_observations and overall_activity_summary
+            if "frame_observations" in response_data:
+                frame_obs = response_data["frame_observations"]
+                overall_summary = response_data.get("overall_activity_summary", "")
+
+                # Attach timestamp information to every frame observation
+                for i, obs in enumerate(frame_obs):
+                    if i < len(batch_files):
+                        # Extract the timestamp from the file name
+                        file_path = batch_files[i]
+                        file_name = os.path.basename(file_path)
+                        # Pull out the timestamp string (e.g. keyframe_000675_000027000.jpg)
+                        # Name layout: keyframe_<frame index>_<millisecond timestamp>.jpg
+                        timestamp_parts = file_name.split('_')
+                        if len(timestamp_parts) >= 3:
+                            timestamp_str = timestamp_parts[-1].split('.')[0]
+                            try:
+                                timestamp_seconds = int(timestamp_str) / 1000  # convert to seconds
+                                formatted_time = utils.format_time(timestamp_seconds)  # format the timestamp
+                            except ValueError:
+                                logger.warning(f"Could not parse timestamp: {timestamp_str}")
+                                timestamp_seconds = 0
+                                formatted_time = "00:00:00,000"
+                        else:
+                            logger.warning(f"Unexpected file name format: {file_name}")
+                            timestamp_seconds = 0
+                            formatted_time = "00:00:00,000"
+
+                        # Attach the extra fields to the frame observation
+                        obs["frame_path"] = file_path
+                        obs["timestamp"] = formatted_time
+                        obs["timestamp_seconds"] = timestamp_seconds
+
+                        # Replace the original frame_number with the globally increasing counter
+                        if "frame_number" in obs:
+                            obs["original_frame_number"] = obs["frame_number"]  # keep the original number for reference
+                        obs["frame_number"] = frame_counter  # assign the consecutive frame number
+                        frame_counter += 1  # advance the frame counter
+
+                        # Add it to the merged list
+                        merged_frame_observations.append(obs)
+
+                # Record the overall summary of this batch
+                if overall_summary:
+                    # Extract the numeric timestamps from the file names
+                    first_time_str = first_timestamp.split('_')[-1].split('.')[0]
+                    last_time_str = last_timestamp.split('_')[-1].split('.')[0]
+
+                    # Convert to milliseconds and compute the duration in seconds
+                    try:
+                        first_time_ms = int(first_time_str)
+                        last_time_ms = int(last_time_str)
+                        batch_duration = (last_time_ms - first_time_ms) / 1000
+                    except ValueError:
+                        # Fall back to utils.time_to_seconds for already formatted timestamps
+                        first_time_seconds = utils.time_to_seconds(first_time_str.replace('_', ':').replace('-', ','))
+                        last_time_seconds = utils.time_to_seconds(last_time_str.replace('_', ':').replace('-', ','))
+                        batch_duration = last_time_seconds - first_time_seconds
+
+                    overall_activity_summaries.append({
+                        "batch_index": result['batch_index'],
+                        "time_range": f"{first_timestamp}-{last_timestamp}",
+                        "duration_seconds": batch_duration,
+                        "summary": overall_summary
+                    })
+        except Exception as e:
+            logger.error(f"Failed to parse the response data of batch {result['batch_index']}: {str(e)}")
+            # Fall back to the raw response text
+            frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n"
+            frame_analysis += response_text
+            frame_analysis += "\n"
+
         # Update the previous batch's file list
         prev_batch_files = batch_files
-
+
+    # Collect the merged results
+    merged_results = {
+        "frame_observations": merged_frame_observations,
+        "overall_activity_summaries": overall_activity_summaries
+    }
+
+    # Save the complete analysis results as JSON
+    analysis_json_path = os.path.join(utils.task_dir(), "frame_analysis.json")
+    with open(analysis_json_path, 'w', encoding='utf-8') as f:
+        json.dump(merged_results, f, ensure_ascii=False, indent=2)
+
+    # Keep a text-format version of the analysis results as well (compatibility)
+    if not frame_analysis.strip() and merged_frame_observations:
+        # No raw text but merged results exist, so generate the text from them
+        frame_analysis = json.dumps(merged_results, ensure_ascii=False, indent=2)
+
     if not frame_analysis.strip():
         raise Exception("Failed to produce a valid frame analysis result")
-
-    # Save the analysis results
-    analysis_path = os.path.join(utils.temp_dir(), "frame_analysis.txt")
-    with open(analysis_path, 'w', encoding='utf-8') as f:
-        f.write(frame_analysis)
+
+    # # Save the text-format analysis results
+    # analysis_path = os.path.join(utils.temp_dir(), "frame_analysis.txt")
+    # with open(analysis_path, 'w', encoding='utf-8') as f:
+    #     f.write(frame_analysis)
 
     update_progress(70, "Generating the script...")
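The parsing logic above assumes that each batch response, once any ```json fences are stripped, is a JSON object carrying a frame_observations list plus an optional overall_activity_summary. A minimal sketch of that assumed shape as a Python literal (field names come from the code; the observation texts are invented for illustration):

    expected_response = {
        "frame_observations": [
            # one entry per image in the batch, in the order the images were sent
            {"frame_number": 1, "observation": "A person enters the room and sits down."},
            {"frame_number": 2, "observation": "The person opens a laptop and starts typing."},
        ],
        "overall_activity_summary": "Someone settles in at a desk and begins working.",
    }

Each observation is then enriched in place with frame_path, timestamp, and timestamp_seconds, and its frame_number is rewritten with the globally consecutive counter (the model's own numbering survives as original_frame_number).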
@@ -211,26 +312,41 @@ def generate_script_docu(params):
     frame_content_list = []
     prev_batch_files = None
 
-    for i, result in enumerate(results):
-        if 'error' in result:
-            continue
+    # Build the frame content list from the merged observations
+    if merged_frame_observations:
+        for obs in merged_frame_observations:
+            frame_content = {
+                "_id": obs.get("frame_number", 0),  # use the globally consecutive frame number as the ID
+                "timestamp": obs.get("timestamp", ""),
+                "picture": obs.get("observation", ""),
+                "narration": "",
+                "OST": 2,
+                "timestamp_seconds": obs.get("timestamp_seconds", 0)
+            }
+            frame_content_list.append(frame_content)
+            logger.debug(f"Added frame content: ID={obs.get('frame_number', 0)}, time={obs.get('timestamp', '')}, description length={len(obs.get('observation', ''))}")
+    else:
+        # Legacy fallback, used when there are no merged observations
+        for i, result in enumerate(results):
+            if 'error' in result:
+                continue
 
-        batch_files = get_batch_files(keyframe_files, result, vision_batch_size)
-        _, _, timestamp_range = get_batch_timestamps(batch_files, prev_batch_files)
+            batch_files = get_batch_files(keyframe_files, result, vision_batch_size)
+            _, _, timestamp_range = get_batch_timestamps(batch_files, prev_batch_files)
 
-        frame_content = {
-            "_id": i + 1,
-            "timestamp": timestamp_range,
-            "picture": result['response'],
-            "narration": "",
-            "OST": 2
-        }
-        frame_content_list.append(frame_content)
+            frame_content = {
+                "_id": i + 1,
+                "timestamp": timestamp_range,
+                "picture": result['response'],
+                "narration": "",
+                "OST": 2
+            }
+            frame_content_list.append(frame_content)
 
-        logger.debug(f"Added frame content: time range={timestamp_range}, analysis length={len(result['response'])}")
+            logger.debug(f"Added frame content: time range={timestamp_range}, analysis length={len(result['response'])}")
 
-        # Update the previous batch's file list
-        prev_batch_files = batch_files
+            # Update the previous batch's file list
+            prev_batch_files = batch_files
 
     if not frame_content_list:
         raise Exception("No valid frame content to process")
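With the merge in place, each element of frame_content_list ends up shaped roughly like this (a hypothetical entry; the picture text and values are invented, the keys mirror the code above):

    frame_content = {
        "_id": 1,                          # globally consecutive frame number
        "timestamp": "00:00:27,000",       # formatted via utils.format_time
        "picture": "A person enters the room and sits down.",
        "narration": "",                   # left empty here; presumably filled by the script-generation step
        "OST": 2,
        "timestamp_seconds": 27.0
    }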