NarratoAI/app/utils/check_script.py
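"""
Sanity checker for NarratoAI narration scripts.

check_script() loads a script JSON file (a list of clip entries) and verifies that
every entry has the required fields ('picture', 'timestamp', 'narration', 'OST',
'new_timestamp'), that non-OST narrations are non-empty and at most 30 characters,
that OST narrations use the '原声播放_xxx' naming and are unique, that 'timestamp'
ranges neither overlap nor exceed the total video duration, and that 'new_timestamp'
ranges are contiguous. When problems are found, fix_script() splits over-long
narrations and writes the result to a '<name>_revise.json' copy of the script.

An illustrative entry (field values here are hypothetical, not taken from a real
script) looks roughly like:

    {
        "picture": "short description of the frame",
        "timestamp": "00:00-00:05",
        "narration": "a voice-over line of at most 30 characters",
        "OST": false,
        "new_timestamp": "00:00-00:05"
    }
"""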

import json
import os
import re
from datetime import datetime, timedelta

from loguru import logger


def time_to_seconds(time_str):
    """Convert an "MM:SS" string to a number of seconds."""
    time_obj = datetime.strptime(time_str, "%M:%S")
    return timedelta(minutes=time_obj.minute, seconds=time_obj.second).total_seconds()


def seconds_to_time_str(seconds):
    """Convert a number of seconds to an "MM:SS" string."""
    minutes, seconds = divmod(int(seconds), 60)
    return f"{minutes:02d}:{seconds:02d}"
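
# For example: time_to_seconds("01:30") returns 90.0 and seconds_to_time_str(90)
# returns "01:30"; timestamps in the script are "MM:SS-MM:SS" ranges.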

def check_script(file_path, total_duration):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    errors = []
    ost_narrations = set()
    last_end_time = 0

    logger.info(f"Checking file: {file_path}")
    logger.info(f"Total video duration: {total_duration:.2f} s")
    logger.info("=" * 50)

    for i, item in enumerate(data, 1):
        logger.info(f"\nChecking item {i}:")

        # All required fields must be present
        required_fields = ['picture', 'timestamp', 'narration', 'OST', 'new_timestamp']
        for field in required_fields:
            if field not in item:
                errors.append(f"Item {i} is missing the '{field}' field")
                logger.info(f"  - Error: missing the '{field}' field")
            else:
                logger.info(f"  - {field}: {item[field]}")

        # Rules for items where OST is false
        if item.get('OST') == False:
            if not item.get('narration'):
                errors.append(f"Item {i}: OST is false but narration is empty")
                logger.info("  - Error: OST is false but narration is empty")
            elif len(item['narration']) > 30:
                errors.append(f"Item {i}: OST is false but narration exceeds 30 characters")
                logger.info(f"  - Error: OST is false but narration exceeds 30 characters (current: {len(item['narration'])})")
            else:
                logger.info("  - OST is false, narration check passed")

        # Rules for items where OST is true
        if item.get('OST') == True:
            narration = item.get('narration') or ''
            if not narration.startswith('原声播放_'):
                errors.append(f"Item {i}: OST is true but narration is not in the '原声播放_xxx' format")
                logger.info("  - Error: OST is true but narration is not in the '原声播放_xxx' format")
            elif narration in ost_narrations:
                errors.append(f"Item {i}: OST is true but narration '{narration}' is not unique")
                logger.info(f"  - Error: OST is true but narration '{narration}' is not unique")
            else:
                logger.info("  - OST is true, narration check passed")
            ost_narrations.add(narration)

        # Timestamps must not overlap and must stay within the total duration
        if 'timestamp' in item:
            start, end = map(time_to_seconds, item['timestamp'].split('-'))
            if start < last_end_time:
                errors.append(f"Item {i}: timestamp '{item['timestamp']}' overlaps the previous item")
                logger.info(f"  - Error: timestamp '{item['timestamp']}' overlaps the previous item")
            else:
                logger.info(f"  - timestamp '{item['timestamp']}' check passed")
            last_end_time = end

            if end > total_duration:
                errors.append(f"Item {i}: timestamp '{item['timestamp']}' exceeds the total duration of {total_duration:.2f}")
                logger.info(f"  - Error: timestamp '{item['timestamp']}' exceeds the total duration of {total_duration:.2f}")
            else:
                logger.info("  - timestamp is within the total duration")

    # new_timestamp ranges must be contiguous across the whole script
    logger.info("\nChecking new_timestamp continuity:")
    last_end_time = 0
    for i, item in enumerate(data, 1):
        if 'new_timestamp' in item:
            start, end = map(time_to_seconds, item['new_timestamp'].split('-'))
            if start != last_end_time:
                errors.append(f"Item {i}: new_timestamp '{item['new_timestamp']}' is not contiguous with the previous item")
                logger.info(f"  - Error: item {i} new_timestamp '{item['new_timestamp']}' is not contiguous with the previous item")
            else:
                logger.info(f"  - Item {i} new_timestamp '{item['new_timestamp']}' continuity check passed")
            last_end_time = end

    if errors:
        logger.info("Check result: FAILED")
        logger.info("The following errors were found:")
        for error in errors:
            logger.info(f"- {error}")
        fix_script(file_path, data, errors)
    else:
        logger.info("Check result: PASSED")
        logger.info("All items satisfy the rules.")

def fix_script(file_path, data, errors):
    logger.info("\nStarting script repair...")
    fixed_data = []

    # Auto-fix only the over-long non-OST narrations; everything else is kept as-is
    for i, item in enumerate(data, 1):
        if item.get('OST') == False and (not item.get('narration') or len(item['narration']) > 30):
            if not item.get('narration'):
                logger.info(f"Item {i}: narration is empty, manual repair is required.")
                fixed_data.append(item)
            else:
                logger.info(f"Fixing item {i}: narration exceeds 30 characters...")
                fixed_items = split_narration(item)
                fixed_data.extend(fixed_items)
        else:
            fixed_data.append(item)

    # Report every error that is not an "OST is false" narration issue for manual repair
    for error in errors:
        if "OST is false" not in error:
            logger.info(f"Manual repair required: {error}")

    # Build the name of the revised file
    file_name, file_ext = os.path.splitext(file_path)
    new_file_path = f"{file_name}_revise{file_ext}"

    # Save the fixed data to the new file
    with open(new_file_path, 'w', encoding='utf-8') as f:
        json.dump(fixed_data, f, ensure_ascii=False, indent=4)

    logger.info(f"\nScript repair finished; saved to new file: {new_file_path}")

def split_narration(item):
    narration = item['narration']
    chunks = smart_split(narration)

    start_time, end_time = map(time_to_seconds, item['timestamp'].split('-'))
    new_start_time, new_end_time = map(time_to_seconds, item['new_timestamp'].split('-'))

    total_duration = end_time - start_time
    new_total_duration = new_end_time - new_start_time

    chunk_duration = total_duration / len(chunks)
    new_chunk_duration = new_total_duration / len(chunks)

    fixed_items = []
    for i, chunk in enumerate(chunks):
        new_item = item.copy()
        new_item['narration'] = chunk

        chunk_start = start_time + i * chunk_duration
        chunk_end = chunk_start + chunk_duration
        new_item['timestamp'] = f"{seconds_to_time_str(chunk_start)}-{seconds_to_time_str(chunk_end)}"

        new_chunk_start = new_start_time + i * new_chunk_duration
        new_chunk_end = new_chunk_start + new_chunk_duration
        new_item['new_timestamp'] = f"{seconds_to_time_str(new_chunk_start)}-{seconds_to_time_str(new_chunk_end)}"

        fixed_items.append(new_item)

    return fixed_items

def smart_split(text, target_length=30):
    # Split the text on punctuation while keeping the punctuation marks attached
    segments = re.findall(r'[^,。!?,!?]+[,。!?,!?]?', text)

    # Greedily pack segments into chunks of at most target_length characters
    result = []
    current_chunk = ""
    for segment in segments:
        if len(current_chunk) + len(segment) <= target_length:
            current_chunk += segment
        else:
            if current_chunk:
                result.append(current_chunk.strip())
            current_chunk = segment
    if current_chunk:
        result.append(current_chunk.strip())

    # If any chunk still exceeds the target length, split it further at fixed offsets
    final_result = []
    for chunk in result:
        if len(chunk) > target_length:
            sub_chunks = [chunk[i:i + target_length] for i in range(0, len(chunk), target_length)]
            final_result.extend(sub_chunks)
        else:
            final_result.append(chunk)
    return final_result
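
# Rough illustration (hypothetical input, target_length lowered to 12 to force splits):
#   smart_split("Hello, world! This is a test.", target_length=12)
#   -> ["Hello,", "world!", "This is a te", "st."]
# Punctuation-delimited segments are packed greedily up to target_length, and any
# chunk still longer than target_length is hard-split at fixed character offsets.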

if __name__ == "__main__":
    file_path = "/Users/apple/Desktop/home/NarratoAI/resource/scripts/2024-0923-085036.json"
    total_duration = 280
    check_script(file_path, total_duration)