mirror of
https://github.com/linyqh/NarratoAI.git
synced 2025-12-31 10:18:12 +00:00
fix: 修复短剧混剪字幕在 windows 环境下加载失败
This commit is contained in:
parent
5e46ea2746
commit
26f0dfeab5
7
.gitignore
vendored
7
.gitignore
vendored
@ -39,4 +39,9 @@ bug清单.md
|
||||
task.md
|
||||
.claude/*
|
||||
.serena/*
|
||||
CLAUDE.md
|
||||
|
||||
# OpenSpec: 忽略活动的变更提案,但保留归档和规范
|
||||
openspec/*
|
||||
AGENTS.md
|
||||
CLAUDE.md
|
||||
tests/*
|
||||
@ -39,6 +39,20 @@ def analyze_subtitle(
|
||||
try:
|
||||
# 加载字幕文件
|
||||
subtitles = load_srt(srt_path)
|
||||
|
||||
# 检查字幕是否为空
|
||||
if not subtitles:
|
||||
error_msg = (
|
||||
f"字幕文件 {srt_path} 解析后无有效内容。\n"
|
||||
f"请检查:\n"
|
||||
f"1. 文件格式是否为标准 SRT\n"
|
||||
f"2. 文件编码是否为 UTF-8、GBK 或 GB2312\n"
|
||||
f"3. 文件内容是否为空"
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
logger.info(f"成功加载字幕文件 {srt_path},共 {len(subtitles)} 条有效字幕")
|
||||
subtitle_content = "\n".join([f"{sub['timestamp']}\n{sub['text']}" for sub in subtitles])
|
||||
|
||||
# 初始化统一LLM服务
|
||||
|
||||
@ -1,45 +1,80 @@
|
||||
# 公共方法
|
||||
import json
|
||||
import requests # 新增
|
||||
import pysrt
|
||||
from loguru import logger
|
||||
from typing import List, Dict
|
||||
|
||||
|
||||
def load_srt(file_path: str) -> List[Dict]:
    """Load and parse an SRT subtitle file with pysrt, auto-detecting the encoding.

    The file is opened with each candidate encoding in turn until one decodes
    cleanly. Cues whose text is empty after whitespace stripping are skipped.

    Args:
        file_path: Path to the SRT file.

    Returns:
        A list of subtitle dicts (empty if the file contains no cues), each
        shaped like:
            {
                'number': int,      # cue index
                'timestamp': str,   # "00:00:01,000 --> 00:00:03,000"
                'text': str,        # cue text, multi-line cues joined by spaces
                'start_time': str,  # "00:00:01,000"
                'end_time': str     # "00:00:03,000"
            }

    Raises:
        FileNotFoundError: The file does not exist.
        ValueError: The file could not be read with any supported encoding.
    """
    # Order matters:
    #   * 'utf-8-sig' decodes UTF-8 both with and without a BOM. Trying plain
    #     'utf-8' first would "succeed" on BOM-prefixed files (common on
    #     Windows) and leave U+FEFF glued to the first cue.
    #   * GBK is a superset of GB2312, so a separate 'gb2312' attempt could
    #     never succeed after 'gbk' failed; GB18030 (a superset of GBK) is the
    #     useful last resort.
    encodings = ('utf-8-sig', 'gbk', 'gb18030')
    subs = None
    last_error = None

    for encoding in encodings:
        try:
            subs = pysrt.open(file_path, encoding=encoding)
        except FileNotFoundError:
            # A missing file is not an encoding problem: retrying is pointless
            # and the caller should see the real cause.
            raise
        except UnicodeDecodeError as e:
            # Expected when the guess is wrong; silently try the next one.
            last_error = e
            continue
        except Exception as e:
            logger.warning(f"使用编码 {encoding} 加载失败: {e}")
            last_error = e
            continue

        logger.info(f"成功加载字幕文件 {file_path},编码:{encoding},共 {len(subs)} 条")
        break

    if subs is None:
        # Every candidate encoding failed; chain the last failure for debugging.
        raise ValueError(
            f"无法读取字幕文件 {file_path},"
            f"请检查文件编码(支持 UTF-8、GBK、GB2312)"
        ) from last_error

    if not subs:
        logger.warning(f"字幕文件 {file_path} 解析后无有效内容")
        return []

    # Convert pysrt items to the plain-dict format callers already rely on.
    subtitles = []
    for sub in subs:
        # Some SRT files wrap a single cue over several lines; flatten them.
        text = sub.text.replace('\n', ' ').strip()

        # Skip cues with no text.
        if not text:
            continue

        subtitles.append({
            'number': sub.index,
            'timestamp': f"{sub.start} --> {sub.end}",
            'text': text,
            'start_time': str(sub.start),
            'end_time': str(sub.end)
        })

    logger.info(f"成功解析 {len(subtitles)} 条有效字幕")
    return subtitles
|
||||
|
||||
@ -343,12 +343,34 @@ def short_drama_summary(tr):
|
||||
# 只有当有文件上传且尚未处理时才执行处理逻辑
|
||||
if subtitle_file is not None and not st.session_state['subtitle_file_processed']:
|
||||
try:
|
||||
# 读取上传的SRT内容
|
||||
script_content = subtitle_file.read().decode('utf-8')
|
||||
# 清理文件名,防止路径污染和路径遍历攻击
|
||||
safe_filename = os.path.basename(subtitle_file.name)
|
||||
|
||||
# 编码自动检测:依次尝试常见编码
|
||||
encodings = ['utf-8', 'utf-8-sig', 'gbk', 'gb2312']
|
||||
script_content = None
|
||||
detected_encoding = None
|
||||
|
||||
for encoding in encodings:
|
||||
try:
|
||||
subtitle_file.seek(0) # 重置文件指针
|
||||
script_content = subtitle_file.read().decode(encoding)
|
||||
detected_encoding = encoding
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
|
||||
if script_content is None:
|
||||
st.error(tr("无法读取字幕文件,请检查文件编码(支持 UTF-8、GBK、GB2312)"))
|
||||
st.stop()
|
||||
|
||||
# 验证字幕内容(简单检查)
|
||||
if len(script_content.strip()) < 10:
|
||||
st.warning(tr("字幕文件内容似乎为空,请检查文件"))
|
||||
|
||||
# 保存到字幕目录
|
||||
script_file_path = os.path.join(utils.subtitle_dir(), subtitle_file.name)
|
||||
file_name, file_extension = os.path.splitext(subtitle_file.name)
|
||||
script_file_path = os.path.join(utils.subtitle_dir(), safe_filename)
|
||||
file_name, file_extension = os.path.splitext(safe_filename)
|
||||
|
||||
# 如果文件已存在,添加时间戳
|
||||
if os.path.exists(script_file_path):
|
||||
@ -356,18 +378,22 @@ def short_drama_summary(tr):
|
||||
file_name_with_timestamp = f"{file_name}_{timestamp}"
|
||||
script_file_path = os.path.join(utils.subtitle_dir(), file_name_with_timestamp + file_extension)
|
||||
|
||||
# 直接写入SRT内容,不进行JSON转换
|
||||
# 直接写入SRT内容(统一使用 UTF-8)
|
||||
with open(script_file_path, "w", encoding='utf-8') as f:
|
||||
f.write(script_content)
|
||||
|
||||
# 更新状态
|
||||
st.success(tr("字幕上传成功"))
|
||||
st.success(
|
||||
f"{tr('字幕上传成功')} "
|
||||
f"(编码: {detected_encoding.upper()}, "
|
||||
f"大小: {len(script_content)} 字符)"
|
||||
)
|
||||
st.session_state['subtitle_path'] = script_file_path
|
||||
st.session_state['subtitle_file_processed'] = True # 标记已处理
|
||||
|
||||
|
||||
# 避免使用rerun,使用更新状态的方式
|
||||
# st.rerun()
|
||||
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"{tr('Upload failed')}: {str(e)}")
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user