fix: 修复短剧混剪字幕在 Windows 环境下加载失败

2026-02-22 00:10:27 +00:00 · 2025-12-25 01:16:00 +08:00 · 2025-12-25 01:16:00 +08:00 · 26f0dfeab5
commit 26f0dfeab5
parent 5e46ea2746
4 changed files with 115 additions and 35 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -39,4 +39,9 @@ bug清单.md
 task.md
 .claude/*
 .serena/*
 # OpenSpec: 忽略活动的变更提案，但保留归档和规范
 openspec/*
 AGENTS.md
 CLAUDE.md
 tests/*
--- a/app/services/SDP/utils/step1_subtitle_analyzer_openai.py
+++ b/app/services/SDP/utils/step1_subtitle_analyzer_openai.py
@@ -39,6 +39,20 @@ def analyze_subtitle(
    try:
        # 加载字幕文件
        subtitles = load_srt(srt_path)
        # 检查字幕是否为空
        if not subtitles:
            error_msg = (
                f"字幕文件 {srt_path} 解析后无有效内容。\n"
                f"请检查：\n"
                f"1. 文件格式是否为标准 SRT\n"
                f"2. 文件编码是否为 UTF-8、GBK 或 GB2312\n"
                f"3. 文件内容是否为空"
            )
            logger.error(error_msg)
            raise ValueError(error_msg)
        logger.info(f"成功加载字幕文件 {srt_path}，共 {len(subtitles)} 条有效字幕")
        subtitle_content = "\n".join([f"{sub['timestamp']}\n{sub['text']}" for sub in subtitles])
        # 初始化统一LLM服务
--- a/app/services/SDP/utils/utils.py
+++ b/app/services/SDP/utils/utils.py
@@ -1,45 +1,80 @@
 # 公共方法
 import json
 import requests  # 新增
 import pysrt
 from loguru import logger
 from typing import List, Dict
 def load_srt(file_path: str) -> List[Dict]:
-    """加载并解析SRT文件
+    """加载并解析SRT文件（使用 pysrt 库，支持多种编码和格式）
    Args:
        file_path: SRT文件路径
    Returns:
-        字幕内容列表
+        字幕内容列表，格式：
        [
            {
                'number': int,           # 字幕序号
                'timestamp': str,        # "00:00:01,000 --> 00:00:03,000"
                'text': str,             # 字幕文本
                'start_time': str,       # "00:00:01,000"
                'end_time': str          # "00:00:03,000"
            },
            ...
        ]
    Raises:
        FileNotFoundError: 文件不存在
        ValueError: 文件编码不支持或格式错误
    """
-    with open(file_path, 'r', encoding='utf-8-sig') as f:
+    # 编码自动检测：依次尝试常见编码
-        content = f.read().strip()
+    encodings = ['utf-8', 'utf-8-sig', 'gbk', 'gb2312']
    subs = None
    detected_encoding = None
-    # 按空行分割字幕块
+    for encoding in encodings:
-    subtitle_blocks = content.split('\n\n')
+        try:
            subs = pysrt.open(file_path, encoding=encoding)
            detected_encoding = encoding
            logger.info(f"成功加载字幕文件 {file_path}，编码：{encoding}，共 {len(subs)} 条")
            break
        except UnicodeDecodeError:
            continue
        except Exception as e:
            logger.warning(f"使用编码 {encoding} 加载失败: {e}")
            continue
    if subs is None:
        # 所有编码都失败
        raise ValueError(
            f"无法读取字幕文件 {file_path}，"
            f"请检查文件编码（支持 UTF-8、GBK、GB2312）"
        )
    # 检查是否为空
    if not subs:
        logger.warning(f"字幕文件 {file_path} 解析后无有效内容")
        return []
    # 转换为原格式（向后兼容）
    subtitles = []
    for sub in subs:
        # 合并多行文本为单行（某些 SRT 文件会有换行）
        text = sub.text.replace('\n', ' ').strip()
-    for block in subtitle_blocks:
+        # 跳过空字幕
-        lines = block.split('\n')
+        if not text:
-        if len(lines) >= 3:  # 确保块包含足够的行
+            continue
            try:
                number = int(lines[0].strip())
                timestamp = lines[1]
                text = ' '.join(lines[2:])
-                # 解析时间戳
+        subtitles.append({
-                start_time, end_time = timestamp.split(' --> ')
+            'number': sub.index,
-
+            'timestamp': f"{sub.start} --> {sub.end}",
-                subtitles.append({
+            'text': text,
-                    'number': number,
+            'start_time': str(sub.start),
-                    'timestamp': timestamp,
+            'end_time': str(sub.end)
-                    'text': text,
+        })
                    'start_time': start_time,
                    'end_time': end_time
                })
            except ValueError as e:
                print(f"Warning: 跳过无效的字幕块: {e}")
                continue
    logger.info(f"成功解析 {len(subtitles)} 条有效字幕")
    return subtitles
--- a/webui/components/script_settings.py
+++ b/webui/components/script_settings.py
@@ -343,12 +343,34 @@ def short_drama_summary(tr):
    # 只有当有文件上传且尚未处理时才执行处理逻辑
    if subtitle_file is not None and not st.session_state['subtitle_file_processed']:
        try:
-            # 读取上传的SRT内容
+            # 清理文件名，防止路径污染和路径遍历攻击
-            script_content = subtitle_file.read().decode('utf-8')
+            safe_filename = os.path.basename(subtitle_file.name)
            # 编码自动检测：依次尝试常见编码
            encodings = ['utf-8', 'utf-8-sig', 'gbk', 'gb2312']
            script_content = None
            detected_encoding = None
            for encoding in encodings:
                try:
                    subtitle_file.seek(0)  # 重置文件指针
                    script_content = subtitle_file.read().decode(encoding)
                    detected_encoding = encoding
                    break
                except UnicodeDecodeError:
                    continue
            if script_content is None:
                st.error(tr("无法读取字幕文件，请检查文件编码（支持 UTF-8、GBK、GB2312）"))
                st.stop()
            # 验证字幕内容（简单检查）
            if len(script_content.strip()) < 10:
                st.warning(tr("字幕文件内容似乎为空，请检查文件"))
            # 保存到字幕目录
-            script_file_path = os.path.join(utils.subtitle_dir(), subtitle_file.name)
+            script_file_path = os.path.join(utils.subtitle_dir(), safe_filename)
-            file_name, file_extension = os.path.splitext(subtitle_file.name)
+            file_name, file_extension = os.path.splitext(safe_filename)
            # 如果文件已存在,添加时间戳
            if os.path.exists(script_file_path):
@@ -356,12 +378,16 @@ def short_drama_summary(tr):
                file_name_with_timestamp = f"{file_name}_{timestamp}"
                script_file_path = os.path.join(utils.subtitle_dir(), file_name_with_timestamp + file_extension)
-            # 直接写入SRT内容，不进行JSON转换
+            # 直接写入SRT内容（统一使用 UTF-8）
            with open(script_file_path, "w", encoding='utf-8') as f:
                f.write(script_content)
            # 更新状态
-            st.success(tr("字幕上传成功"))
+            st.success(
                f"{tr('字幕上传成功')} "
                f"(编码: {detected_encoding.upper()}, "
                f"大小: {len(script_content)} 字符)"
            )
            st.session_state['subtitle_path'] = script_file_path
            st.session_state['subtitle_file_processed'] = True  # 标记已处理