fix: 修复短剧混剪字幕在 Windows 环境下加载失败

This commit is contained in:
linyq 2025-12-25 01:16:00 +08:00
parent 5e46ea2746
commit 26f0dfeab5
4 changed files with 115 additions and 35 deletions

7
.gitignore vendored
View File

@ -39,4 +39,9 @@ bug清单.md
task.md
.claude/*
.serena/*
CLAUDE.md
# OpenSpec: 忽略 openspec/ 目录下的所有内容(如需保留归档和规范,需另加 ! 例外规则)
openspec/*
AGENTS.md
CLAUDE.md
tests/*

View File

@ -39,6 +39,20 @@ def analyze_subtitle(
try:
# 加载字幕文件
subtitles = load_srt(srt_path)
# 检查字幕是否为空
if not subtitles:
error_msg = (
f"字幕文件 {srt_path} 解析后无有效内容。\n"
f"请检查:\n"
f"1. 文件格式是否为标准 SRT\n"
f"2. 文件编码是否为 UTF-8、GBK 或 GB2312\n"
f"3. 文件内容是否为空"
)
logger.error(error_msg)
raise ValueError(error_msg)
logger.info(f"成功加载字幕文件 {srt_path},共 {len(subtitles)} 条有效字幕")
subtitle_content = "\n".join([f"{sub['timestamp']}\n{sub['text']}" for sub in subtitles])
# 初始化统一LLM服务

View File

@ -1,45 +1,80 @@
# 公共方法
import json
import requests # 新增
import pysrt
from loguru import logger
from typing import List, Dict
def load_srt(file_path: str) -> List[Dict]:
    """Load and parse an SRT subtitle file with pysrt.

    The file is decoded by trying a fixed list of encodings in order and
    keeping the first one that pysrt reads without error. Cues whose text is
    empty after stripping are dropped, and multi-line cue text is joined into
    a single line.

    Args:
        file_path: Path to the SRT file.

    Returns:
        A list of subtitle dicts (empty when the file contains no cues), each
        shaped like::

            {
                'number': ...,        # cue index as reported by pysrt
                'timestamp': str,     # "00:00:01,000 --> 00:00:03,000"
                'text': str,          # cue text collapsed to one line
                'start_time': str,    # "00:00:01,000"
                'end_time': str,      # "00:00:03,000"
            }

    Raises:
        FileNotFoundError: The file does not exist.
        ValueError: The file could not be read with any supported encoding.
    """
    # 'utf-8-sig' is tried first: it decodes both BOM-prefixed and plain
    # UTF-8, whereas plain 'utf-8' would keep a leading U+FEFF in the decoded
    # text of files saved with a BOM (common for files written on Windows).
    encodings = ('utf-8-sig', 'utf-8', 'gbk', 'gb2312')
    subs = None

    for encoding in encodings:
        try:
            subs = pysrt.open(file_path, encoding=encoding)
        except UnicodeDecodeError:
            # Wrong guess; fall through to the next candidate encoding.
            continue
        except FileNotFoundError:
            # A missing file is independent of the encoding. Propagate it as
            # documented instead of masking it as an encoding problem below.
            raise
        except Exception as e:
            # Best effort: report the failure and keep trying other encodings.
            logger.warning(f"使用编码 {encoding} 加载失败: {e}")
            continue
        logger.info(f"成功加载字幕文件 {file_path},编码:{encoding},共 {len(subs)} 条")
        break

    if subs is None:
        # Every candidate encoding failed.
        raise ValueError(
            f"无法读取字幕文件 {file_path},"
            f"请检查文件编码(支持 UTF-8、GBK、GB2312)"
        )

    if not subs:
        logger.warning(f"字幕文件 {file_path} 解析后无有效内容")
        return []

    # Convert to the dict format used by existing callers.
    subtitles = []
    for sub in subs:
        # Some SRT files wrap one cue over several lines; flatten them.
        text = sub.text.replace('\n', ' ').strip()
        if not text:
            # Skip cues with no visible text.
            continue

        start_time = str(sub.start)
        end_time = str(sub.end)
        subtitles.append({
            'number': sub.index,
            'timestamp': f"{start_time} --> {end_time}",
            'text': text,
            'start_time': start_time,
            'end_time': end_time,
        })

    logger.info(f"成功解析 {len(subtitles)} 条有效字幕")
    return subtitles

View File

@ -343,12 +343,34 @@ def short_drama_summary(tr):
# 只有当有文件上传且尚未处理时才执行处理逻辑
if subtitle_file is not None and not st.session_state['subtitle_file_processed']:
try:
# 读取上传的SRT内容
script_content = subtitle_file.read().decode('utf-8')
# 清理文件名,防止路径污染和路径遍历攻击
safe_filename = os.path.basename(subtitle_file.name)
# 编码自动检测:依次尝试常见编码
encodings = ['utf-8', 'utf-8-sig', 'gbk', 'gb2312']
script_content = None
detected_encoding = None
for encoding in encodings:
try:
subtitle_file.seek(0) # 重置文件指针
script_content = subtitle_file.read().decode(encoding)
detected_encoding = encoding
break
except UnicodeDecodeError:
continue
if script_content is None:
st.error(tr("无法读取字幕文件,请检查文件编码(支持 UTF-8、GBK、GB2312"))
st.stop()
# 验证字幕内容(简单检查)
if len(script_content.strip()) < 10:
st.warning(tr("字幕文件内容似乎为空,请检查文件"))
# 保存到字幕目录
script_file_path = os.path.join(utils.subtitle_dir(), subtitle_file.name)
file_name, file_extension = os.path.splitext(subtitle_file.name)
script_file_path = os.path.join(utils.subtitle_dir(), safe_filename)
file_name, file_extension = os.path.splitext(safe_filename)
# 如果文件已存在,添加时间戳
if os.path.exists(script_file_path):
@ -356,18 +378,22 @@ def short_drama_summary(tr):
file_name_with_timestamp = f"{file_name}_{timestamp}"
script_file_path = os.path.join(utils.subtitle_dir(), file_name_with_timestamp + file_extension)
# 直接写入SRT内容不进行JSON转换
# 直接写入SRT内容(统一使用 UTF-8
with open(script_file_path, "w", encoding='utf-8') as f:
f.write(script_content)
# 更新状态
st.success(tr("字幕上传成功"))
st.success(
f"{tr('字幕上传成功')} "
f"(编码: {detected_encoding.upper()}, "
f"大小: {len(script_content)} 字符)"
)
st.session_state['subtitle_path'] = script_file_path
st.session_state['subtitle_file_processed'] = True # 标记已处理
# 避免使用rerun使用更新状态的方式
# st.rerun()
except Exception as e:
st.error(f"{tr('Upload failed')}: {str(e)}")