fix: 修复短剧混剪字幕在 Windows 环境下加载失败的问题

2026-02-18 22:43:51 +00:00 · 2025-12-25 01:16:00 +08:00 · 2025-12-25 01:16:00 +08:00 · 26f0dfeab5
commit 26f0dfeab5
parent 5e46ea2746
4 changed files with 115 additions and 35 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -39,4 +39,9 @@ bug清单.md
 task.md
 .claude/*
 .serena/*
-CLAUDE.md
+
+# OpenSpec: 忽略活动的变更提案，但保留归档和规范
+openspec/*
+AGENTS.md
+CLAUDE.md
+tests/*
--- a/app/services/SDP/utils/step1_subtitle_analyzer_openai.py
+++ b/app/services/SDP/utils/step1_subtitle_analyzer_openai.py
@@ -39,6 +39,20 @@ def analyze_subtitle(
    try:
        # 加载字幕文件
        subtitles = load_srt(srt_path)
+
+        # 检查字幕是否为空
+        if not subtitles:
+            error_msg = (
+                f"字幕文件 {srt_path} 解析后无有效内容。\n"
+                f"请检查：\n"
+                f"1. 文件格式是否为标准 SRT\n"
+                f"2. 文件编码是否为 UTF-8、GBK 或 GB2312\n"
+                f"3. 文件内容是否为空"
+            )
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        logger.info(f"成功加载字幕文件 {srt_path}，共 {len(subtitles)} 条有效字幕")
        subtitle_content = "\n".join([f"{sub['timestamp']}\n{sub['text']}" for sub in subtitles])

        # 初始化统一LLM服务
--- a/app/services/SDP/utils/utils.py
+++ b/app/services/SDP/utils/utils.py
@@ -1,45 +1,80 @@
 # 公共方法
 import json
 import requests  # 新增
+import pysrt
+from loguru import logger
 from typing import List, Dict


 def load_srt(file_path: str) -> List[Dict]:
-    """加载并解析SRT文件
+    """加载并解析SRT文件（使用 pysrt 库，支持多种编码和格式）

    Args:
        file_path: SRT文件路径

    Returns:
-        字幕内容列表
+        字幕内容列表，格式：
+        [
+            {
+                'number': int,           # 字幕序号
+                'timestamp': str,        # "00:00:01,000 --> 00:00:03,000"
+                'text': str,             # 字幕文本
+                'start_time': str,       # "00:00:01,000"
+                'end_time': str          # "00:00:03,000"
+            },
+            ...
+        ]
+
+    Raises:
+        FileNotFoundError: 文件不存在
+        ValueError: 文件编码不支持或格式错误
    """
-    with open(file_path, 'r', encoding='utf-8-sig') as f:
-        content = f.read().strip()
+    # 编码自动检测：依次尝试常见编码
+    encodings = ['utf-8', 'utf-8-sig', 'gbk', 'gb2312']
+    subs = None
+    detected_encoding = None

-    # 按空行分割字幕块
-    subtitle_blocks = content.split('\n\n')
+    for encoding in encodings:
+        try:
+            subs = pysrt.open(file_path, encoding=encoding)
+            detected_encoding = encoding
+            logger.info(f"成功加载字幕文件 {file_path}，编码：{encoding}，共 {len(subs)} 条")
+            break
+        except UnicodeDecodeError:
+            continue
+        except Exception as e:
+            logger.warning(f"使用编码 {encoding} 加载失败: {e}")
+            continue
+
+    if subs is None:
+        # 所有编码都失败
+        raise ValueError(
+            f"无法读取字幕文件 {file_path}，"
+            f"请检查文件编码（支持 UTF-8、GBK、GB2312）"
+        )
+
+    # 检查是否为空
+    if not subs:
+        logger.warning(f"字幕文件 {file_path} 解析后无有效内容")
+        return []
+
+    # 转换为原格式（向后兼容）
    subtitles = []
+    for sub in subs:
+        # 合并多行文本为单行（某些 SRT 文件会有换行）
+        text = sub.text.replace('\n', ' ').strip()

-    for block in subtitle_blocks:
-        lines = block.split('\n')
-        if len(lines) >= 3:  # 确保块包含足够的行
-            try:
-                number = int(lines[0].strip())
-                timestamp = lines[1]
-                text = ' '.join(lines[2:])
+        # 跳过空字幕
+        if not text:
+            continue

-                # 解析时间戳
-                start_time, end_time = timestamp.split(' --> ')
-
-                subtitles.append({
-                    'number': number,
-                    'timestamp': timestamp,
-                    'text': text,
-                    'start_time': start_time,
-                    'end_time': end_time
-                })
-            except ValueError as e:
-                print(f"Warning: 跳过无效的字幕块: {e}")
-                continue
+        subtitles.append({
+            'number': sub.index,
+            'timestamp': f"{sub.start} --> {sub.end}",
+            'text': text,
+            'start_time': str(sub.start),
+            'end_time': str(sub.end)
+        })

+    logger.info(f"成功解析 {len(subtitles)} 条有效字幕")
    return subtitles
--- a/webui/components/script_settings.py
+++ b/webui/components/script_settings.py
@@ -343,12 +343,34 @@ def short_drama_summary(tr):
    # 只有当有文件上传且尚未处理时才执行处理逻辑
    if subtitle_file is not None and not st.session_state['subtitle_file_processed']:
        try:
-            # 读取上传的SRT内容
-            script_content = subtitle_file.read().decode('utf-8')
+            # 清理文件名，防止路径污染和路径遍历攻击
+            safe_filename = os.path.basename(subtitle_file.name)
+
+            # 编码自动检测：依次尝试常见编码
+            encodings = ['utf-8', 'utf-8-sig', 'gbk', 'gb2312']
+            script_content = None
+            detected_encoding = None
+
+            for encoding in encodings:
+                try:
+                    subtitle_file.seek(0)  # 重置文件指针
+                    script_content = subtitle_file.read().decode(encoding)
+                    detected_encoding = encoding
+                    break
+                except UnicodeDecodeError:
+                    continue
+
+            if script_content is None:
+                st.error(tr("无法读取字幕文件，请检查文件编码（支持 UTF-8、GBK、GB2312）"))
+                st.stop()
+
+            # 验证字幕内容（简单检查）
+            if len(script_content.strip()) < 10:
+                st.warning(tr("字幕文件内容似乎为空，请检查文件"))

            # 保存到字幕目录
-            script_file_path = os.path.join(utils.subtitle_dir(), subtitle_file.name)
-            file_name, file_extension = os.path.splitext(subtitle_file.name)
+            script_file_path = os.path.join(utils.subtitle_dir(), safe_filename)
+            file_name, file_extension = os.path.splitext(safe_filename)

            # 如果文件已存在,添加时间戳
            if os.path.exists(script_file_path):
@@ -356,18 +378,22 @@ def short_drama_summary(tr):
                file_name_with_timestamp = f"{file_name}_{timestamp}"
                script_file_path = os.path.join(utils.subtitle_dir(), file_name_with_timestamp + file_extension)

-            # 直接写入SRT内容，不进行JSON转换
+            # 直接写入SRT内容（统一使用 UTF-8）
            with open(script_file_path, "w", encoding='utf-8') as f:
                f.write(script_content)

            # 更新状态
-            st.success(tr("字幕上传成功"))
+            st.success(
+                f"{tr('字幕上传成功')} "
+                f"(编码: {detected_encoding.upper()}, "
+                f"大小: {len(script_content)} 字符)"
+            )
            st.session_state['subtitle_path'] = script_file_path
            st.session_state['subtitle_file_processed'] = True  # 标记已处理
-            
+
            # 避免使用rerun，使用更新状态的方式
            # st.rerun()
-            
+
        except Exception as e:
            st.error(f"{tr('Upload failed')}: {str(e)}")