From 26f0dfeab5d9d0cd2cf3ddf6a456370a1303c6e0 Mon Sep 17 00:00:00 2001
From: linyq <linyqemail@163.com>
Date: Thu, 25 Dec 2025 01:16:00 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E7=9F=AD=E5=89=A7?=
 =?UTF-8?q?=E6=B7=B7=E5=89=AA=E5=AD=97=E5=B9=95=E5=9C=A8=20windows=20?=
 =?UTF-8?q?=E7=8E=AF=E5=A2=83=E4=B8=8B=E5=8A=A0=E8=BD=BD=E5=A4=B1=E8=B4=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                    |  7 +-
 .../utils/step1_subtitle_analyzer_openai.py   | 14 +++
 app/services/SDP/utils/utils.py               | 87 +++++++++++++------
 webui/components/script_settings.py           | 42 +++++++--
 4 files changed, 115 insertions(+), 35 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3e055d7..bd3c487 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,4 +39,9 @@ bug清单.md
 task.md
 .claude/*
 .serena/*
-CLAUDE.md
\ No newline at end of file
+
+# OpenSpec: ignore everything under openspec/ (no "!" negation rules, so archives and specs are ignored too)
+openspec/*
+AGENTS.md
+CLAUDE.md
+tests/*
\ No newline at end of file
diff --git a/app/services/SDP/utils/step1_subtitle_analyzer_openai.py b/app/services/SDP/utils/step1_subtitle_analyzer_openai.py
index 8752d38..f55cb56 100644
--- a/app/services/SDP/utils/step1_subtitle_analyzer_openai.py
+++ b/app/services/SDP/utils/step1_subtitle_analyzer_openai.py
@@ -39,6 +39,20 @@ def analyze_subtitle(
     try:
         # 加载字幕文件
         subtitles = load_srt(srt_path)
+
+        # 检查字幕是否为空
+        if not subtitles:
+            error_msg = (
+                f"字幕文件 {srt_path} 解析后无有效内容。\n"
+                f"请检查：\n"
+                f"1. 文件格式是否为标准 SRT\n"
+                f"2. 文件编码是否为 UTF-8、GBK 或 GB2312\n"
+                f"3. 文件内容是否为空"
+            )
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        logger.info(f"成功加载字幕文件 {srt_path}，共 {len(subtitles)} 条有效字幕")
         subtitle_content = "\n".join([f"{sub['timestamp']}\n{sub['text']}" for sub in subtitles])
 
         # 初始化统一LLM服务
diff --git a/app/services/SDP/utils/utils.py b/app/services/SDP/utils/utils.py
index 292d5e0..d6e5e38 100644
--- a/app/services/SDP/utils/utils.py
+++ b/app/services/SDP/utils/utils.py
@@ -1,45 +1,80 @@
 # 公共方法
 import json
 import requests  # 新增
+import pysrt
+from loguru import logger
 from typing import List, Dict
 
 
 def load_srt(file_path: str) -> List[Dict]:
-    """加载并解析SRT文件
+    """Load and parse an SRT file with pysrt, trying several encodings in turn.
 
     Args:
         file_path: SRT文件路径
 
     Returns:
-        字幕内容列表
+        A list of subtitle dicts (empty when nothing usable was parsed):
+        [
+            {
+                'number': int,           # cue index as reported by pysrt
+                'timestamp': str,        # "00:00:01,000 --> 00:00:03,000"
+                'text': str,             # cue text collapsed onto one line
+                'start_time': str,       # "00:00:01,000"
+                'end_time': str          # "00:00:03,000"
+            },
+            ...
+        ]
+
+    Raises:
+        OSError: the file is missing or unreadable (e.g. FileNotFoundError)
+        ValueError: the file cannot be decoded with any supported encoding
     """
-    with open(file_path, 'r', encoding='utf-8-sig') as f:
-        content = f.read().strip()
+    # 'utf-8-sig' goes first: it also decodes BOM-less UTF-8, whereas plain
+    # 'utf-8' would accept a BOM file and leave U+FEFF glued to the first index.
+    encodings = ['utf-8-sig', 'gbk', 'gb18030']  # gb18030 covers gbk and gb2312
+    subs = None
 
-    # 按空行分割字幕块
-    subtitle_blocks = content.split('\n\n')
+    for encoding in encodings:
+        try:
+            subs = pysrt.open(file_path, encoding=encoding)
+            logger.info(f"成功加载字幕文件 {file_path}，编码：{encoding}，共 {len(subs)} 条")
+            break
+        except UnicodeDecodeError:
+            continue
+        except OSError:
+            raise  # missing/unreadable file is not an encoding problem
+        except Exception as e:
+            logger.warning(f"使用编码 {encoding} 加载失败: {e}")
+
+    if subs is None:
+        # every candidate encoding failed
+        raise ValueError(
+            f"无法读取字幕文件 {file_path}，"
+            f"请检查文件编码（支持 UTF-8、GBK、GB2312）"
+        )
+
+    # the file decoded but contained no cues
+    if not subs:
+        logger.warning(f"字幕文件 {file_path} 解析后无有效内容")
+        return []
+
+    # convert to the legacy dict format (backward compatible)
     subtitles = []
+    for sub in subs:
+        # collapse multi-line cues into a single line
+        text = sub.text.replace('\n', ' ').strip()
 
-    for block in subtitle_blocks:
-        lines = block.split('\n')
-        if len(lines) >= 3:  # 确保块包含足够的行
-            try:
-                number = int(lines[0].strip())
-                timestamp = lines[1]
-                text = ' '.join(lines[2:])
+        # skip cues with no text
+        if not text:
+            continue
 
-                # 解析时间戳
-                start_time, end_time = timestamp.split(' --> ')
-
-                subtitles.append({
-                    'number': number,
-                    'timestamp': timestamp,
-                    'text': text,
-                    'start_time': start_time,
-                    'end_time': end_time
-                })
-            except ValueError as e:
-                print(f"Warning: 跳过无效的字幕块: {e}")
-                continue
+        subtitles.append({
+            'number': sub.index,
+            'timestamp': f"{sub.start} --> {sub.end}",
+            'text': text,
+            'start_time': str(sub.start),
+            'end_time': str(sub.end)
+        })
 
+    logger.info(f"成功解析 {len(subtitles)} 条有效字幕")
     return subtitles
diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py
index a97e53a..2af53ea 100644
--- a/webui/components/script_settings.py
+++ b/webui/components/script_settings.py
@@ -343,12 +343,34 @@ def short_drama_summary(tr):
     # 只有当有文件上传且尚未处理时才执行处理逻辑
     if subtitle_file is not None and not st.session_state['subtitle_file_processed']:
         try:
-            # 读取上传的SRT内容
-            script_content = subtitle_file.read().decode('utf-8')
+            # 清理文件名，防止路径污染和路径遍历攻击
+            safe_filename = os.path.basename(subtitle_file.name)
+
+            # 编码自动检测：依次尝试常见编码
+            encodings = ['utf-8', 'utf-8-sig', 'gbk', 'gb2312']
+            script_content = None
+            detected_encoding = None
+
+            for encoding in encodings:
+                try:
+                    subtitle_file.seek(0)  # 重置文件指针
+                    script_content = subtitle_file.read().decode(encoding)
+                    detected_encoding = encoding
+                    break
+                except UnicodeDecodeError:
+                    continue
+
+            if script_content is None:
+                st.error(tr("无法读取字幕文件，请检查文件编码（支持 UTF-8、GBK、GB2312）"))
+                st.stop()
+
+            # 验证字幕内容（简单检查）
+            if len(script_content.strip()) < 10:
+                st.warning(tr("字幕文件内容似乎为空，请检查文件"))
 
             # 保存到字幕目录
-            script_file_path = os.path.join(utils.subtitle_dir(), subtitle_file.name)
-            file_name, file_extension = os.path.splitext(subtitle_file.name)
+            script_file_path = os.path.join(utils.subtitle_dir(), safe_filename)
+            file_name, file_extension = os.path.splitext(safe_filename)
 
             # 如果文件已存在,添加时间戳
             if os.path.exists(script_file_path):
@@ -356,18 +378,22 @@ def short_drama_summary(tr):
                 file_name_with_timestamp = f"{file_name}_{timestamp}"
                 script_file_path = os.path.join(utils.subtitle_dir(), file_name_with_timestamp + file_extension)
 
-            # 直接写入SRT内容，不进行JSON转换
+            # 直接写入SRT内容（统一使用 UTF-8）
             with open(script_file_path, "w", encoding='utf-8') as f:
                 f.write(script_content)
 
             # 更新状态
-            st.success(tr("字幕上传成功"))
+            st.success(
+                f"{tr('字幕上传成功')} "
+                f"(编码: {detected_encoding.upper()}, "
+                f"大小: {len(script_content)} 字符)"
+            )
             st.session_state['subtitle_path'] = script_file_path
             st.session_state['subtitle_file_processed'] = True  # 标记已处理
-            
+
             # 避免使用rerun，使用更新状态的方式
             # st.rerun()
-            
+
         except Exception as e:
             st.error(f"{tr('Upload failed')}: {str(e)}")