From 26f0dfeab5d9d0cd2cf3ddf6a456370a1303c6e0 Mon Sep 17 00:00:00 2001 From: linyq Date: Thu, 25 Dec 2025 01:16:00 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E7=9F=AD=E5=89=A7?= =?UTF-8?q?=E6=B7=B7=E5=89=AA=E5=AD=97=E5=B9=95=E5=9C=A8=20windows=20?= =?UTF-8?q?=E7=8E=AF=E5=A2=83=E4=B8=8B=E5=8A=A0=E8=BD=BD=E5=A4=B1=E8=B4=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 7 +- .../utils/step1_subtitle_analyzer_openai.py | 14 +++ app/services/SDP/utils/utils.py | 87 +++++++++++++------ webui/components/script_settings.py | 42 +++++++-- 4 files changed, 115 insertions(+), 35 deletions(-) diff --git a/.gitignore b/.gitignore index 3e055d7..bd3c487 100644 --- a/.gitignore +++ b/.gitignore @@ -39,4 +39,9 @@ bug清单.md task.md .claude/* .serena/* -CLAUDE.md \ No newline at end of file + +# OpenSpec: 忽略活动的变更提案,但保留归档和规范 +openspec/* +AGENTS.md +CLAUDE.md +tests/* \ No newline at end of file diff --git a/app/services/SDP/utils/step1_subtitle_analyzer_openai.py b/app/services/SDP/utils/step1_subtitle_analyzer_openai.py index 8752d38..f55cb56 100644 --- a/app/services/SDP/utils/step1_subtitle_analyzer_openai.py +++ b/app/services/SDP/utils/step1_subtitle_analyzer_openai.py @@ -39,6 +39,20 @@ def analyze_subtitle( try: # 加载字幕文件 subtitles = load_srt(srt_path) + + # 检查字幕是否为空 + if not subtitles: + error_msg = ( + f"字幕文件 {srt_path} 解析后无有效内容。\n" + f"请检查:\n" + f"1. 文件格式是否为标准 SRT\n" + f"2. 文件编码是否为 UTF-8、GBK 或 GB2312\n" + f"3. 文件内容是否为空" + ) + logger.error(error_msg) + raise ValueError(error_msg) + + logger.info(f"成功加载字幕文件 {srt_path},共 {len(subtitles)} 条有效字幕") subtitle_content = "\n".join([f"{sub['timestamp']}\n{sub['text']}" for sub in subtitles]) # 初始化统一LLM服务 diff --git a/app/services/SDP/utils/utils.py b/app/services/SDP/utils/utils.py index 292d5e0..d6e5e38 100644 --- a/app/services/SDP/utils/utils.py +++ b/app/services/SDP/utils/utils.py @@ -1,45 +1,80 @@ # 公共方法 import json import requests # 新增 +import pysrt +from loguru import logger from typing import List, Dict def load_srt(file_path: str) -> List[Dict]: - """加载并解析SRT文件 + """加载并解析SRT文件(使用 pysrt 库,支持多种编码和格式) Args: file_path: SRT文件路径 Returns: - 字幕内容列表 + 字幕内容列表,格式: + [ + { + 'number': int, # 字幕序号 + 'timestamp': str, # "00:00:01,000 --> 00:00:03,000" + 'text': str, # 字幕文本 + 'start_time': str, # "00:00:01,000" + 'end_time': str # "00:00:03,000" + }, + ... + ] + + Raises: + FileNotFoundError: 文件不存在 + ValueError: 文件编码不支持或格式错误 """ - with open(file_path, 'r', encoding='utf-8-sig') as f: - content = f.read().strip() + # 编码自动检测:依次尝试常见编码 + encodings = ['utf-8', 'utf-8-sig', 'gbk', 'gb2312'] + subs = None + detected_encoding = None - # 按空行分割字幕块 - subtitle_blocks = content.split('\n\n') + for encoding in encodings: + try: + subs = pysrt.open(file_path, encoding=encoding) + detected_encoding = encoding + logger.info(f"成功加载字幕文件 {file_path},编码:{encoding},共 {len(subs)} 条") + break + except UnicodeDecodeError: + continue + except Exception as e: + logger.warning(f"使用编码 {encoding} 加载失败: {e}") + continue + + if subs is None: + # 所有编码都失败 + raise ValueError( + f"无法读取字幕文件 {file_path}," + f"请检查文件编码(支持 UTF-8、GBK、GB2312)" + ) + + # 检查是否为空 + if not subs: + logger.warning(f"字幕文件 {file_path} 解析后无有效内容") + return [] + + # 转换为原格式(向后兼容) subtitles = [] + for sub in subs: + # 合并多行文本为单行(某些 SRT 文件会有换行) + text = sub.text.replace('\n', ' ').strip() - for block in subtitle_blocks: - lines = block.split('\n') - if len(lines) >= 3: # 确保块包含足够的行 - try: - number = int(lines[0].strip()) - timestamp = lines[1] - text = ' '.join(lines[2:]) + # 跳过空字幕 + if not text: + continue - # 解析时间戳 - start_time, end_time = timestamp.split(' --> ') - - subtitles.append({ - 'number': number, - 'timestamp': timestamp, - 'text': text, - 'start_time': start_time, - 'end_time': end_time - }) - except ValueError as e: - print(f"Warning: 跳过无效的字幕块: {e}") - continue + subtitles.append({ + 'number': sub.index, + 'timestamp': f"{sub.start} --> {sub.end}", + 'text': text, + 'start_time': str(sub.start), + 'end_time': str(sub.end) + }) + logger.info(f"成功解析 {len(subtitles)} 条有效字幕") return subtitles diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py index a97e53a..2af53ea 100644 --- a/webui/components/script_settings.py +++ b/webui/components/script_settings.py @@ -343,12 +343,34 @@ def short_drama_summary(tr): # 只有当有文件上传且尚未处理时才执行处理逻辑 if subtitle_file is not None and not st.session_state['subtitle_file_processed']: try: - # 读取上传的SRT内容 - script_content = subtitle_file.read().decode('utf-8') + # 清理文件名,防止路径污染和路径遍历攻击 + safe_filename = os.path.basename(subtitle_file.name) + + # 编码自动检测:依次尝试常见编码 + encodings = ['utf-8', 'utf-8-sig', 'gbk', 'gb2312'] + script_content = None + detected_encoding = None + + for encoding in encodings: + try: + subtitle_file.seek(0) # 重置文件指针 + script_content = subtitle_file.read().decode(encoding) + detected_encoding = encoding + break + except UnicodeDecodeError: + continue + + if script_content is None: + st.error(tr("无法读取字幕文件,请检查文件编码(支持 UTF-8、GBK、GB2312)")) + st.stop() + + # 验证字幕内容(简单检查) + if len(script_content.strip()) < 10: + st.warning(tr("字幕文件内容似乎为空,请检查文件")) # 保存到字幕目录 - script_file_path = os.path.join(utils.subtitle_dir(), subtitle_file.name) - file_name, file_extension = os.path.splitext(subtitle_file.name) + script_file_path = os.path.join(utils.subtitle_dir(), safe_filename) + file_name, file_extension = os.path.splitext(safe_filename) # 如果文件已存在,添加时间戳 if os.path.exists(script_file_path): @@ -356,18 +378,22 @@ def short_drama_summary(tr): file_name_with_timestamp = f"{file_name}_{timestamp}" script_file_path = os.path.join(utils.subtitle_dir(), file_name_with_timestamp + file_extension) - # 直接写入SRT内容,不进行JSON转换 + # 直接写入SRT内容(统一使用 UTF-8) with open(script_file_path, "w", encoding='utf-8') as f: f.write(script_content) # 更新状态 - st.success(tr("字幕上传成功")) + st.success( + f"{tr('字幕上传成功')} " + f"(编码: {detected_encoding.upper()}, " + f"大小: {len(script_content)} 字符)" + ) st.session_state['subtitle_path'] = script_file_path st.session_state['subtitle_file_processed'] = True # 标记已处理 - + # 避免使用rerun,使用更新状态的方式 # st.rerun() - + except Exception as e: st.error(f"{tr('Upload failed')}: {str(e)}")