mirror of
https://github.com/linyqh/NarratoAI.git
synced 2025-12-11 18:42:49 +00:00
更新generate_narration_script.py、base.py和generate_short_summary.py文件,重构解说文案生成和视觉分析器的实现,优先使用新的LLM服务架构。添加回退机制以确保兼容性,增强系统的稳定性和用户体验。
201 lines
7.5 KiB
Python
201 lines
7.5 KiB
Python
"""
|
||
输出格式验证器
|
||
|
||
提供严格的输出格式验证机制,确保大模型输出符合预期格式
|
||
"""
|
||
|
||
import json
|
||
import re
|
||
from typing import Any, Dict, List, Optional, Union
|
||
from loguru import logger
|
||
|
||
from .exceptions import ValidationError
|
||
|
||
|
||
class OutputValidator:
    """Validators that check large-language-model output against expected formats.

    All methods are static; the class only serves as a namespace. Every
    failure is reported as a ``ValidationError`` carrying a human-readable
    message and a short machine-readable error type.
    """

    # Expected shape of a narration script. Only the top-level ``type`` and
    # ``required`` keys are enforced by ``_validate_json_schema``; the nested
    # description documents the item layout that ``_validate_narration_item``
    # checks by hand.
    _NARRATION_SCHEMA: Dict[str, Any] = {
        "type": "object",
        "required": ["items"],
        "properties": {
            "items": {
                "type": "array",
                "items": {
                    "type": "object",
                    "required": ["_id", "timestamp", "picture", "narration"],
                    "properties": {
                        "_id": {"type": "number"},
                        "timestamp": {"type": "string"},
                        "picture": {"type": "string"},
                        "narration": {"type": "string"},
                        "OST": {"type": "number"},
                    },
                },
            },
        },
    }

    # "HH:MM:SS,mmm-HH:MM:SS,mmm" time range; used with fullmatch so that
    # trailing garbage after the range is rejected.
    _TIMESTAMP_RE = re.compile(r'\d{2}:\d{2}:\d{2},\d{3}-\d{2}:\d{2}:\d{2},\d{3}')

    @staticmethod
    def validate_json_output(output: str, schema: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Parse ``output`` as JSON and optionally check it against ``schema``.

        Args:
            output: Raw text to validate; markdown code fences are removed
                before parsing.
            schema: Optional schema dict. Only the top-level ``type``
                (``object`` / ``array``) and ``required`` keys are enforced.

        Returns:
            The parsed JSON value.

        Raises:
            ValidationError: If the text is not valid JSON, if the schema
                check fails, or if any other error occurs while validating.
        """
        try:
            cleaned_output = OutputValidator._clean_json_output(output)
            parsed_json = json.loads(cleaned_output)

            if schema:
                OutputValidator._validate_json_schema(parsed_json, schema)

            return parsed_json

        except json.JSONDecodeError as e:
            logger.error(f"JSON解析失败: {str(e)}")
            logger.error(f"原始输出: {output}")
            raise ValidationError(f"JSON格式无效: {str(e)}", "json_parse", output) from e
        except ValidationError as e:
            # Schema failures already carry a specific error type; re-raise
            # them untouched (as the sibling validators do) instead of
            # re-wrapping them into a generic "json_validation" error.
            logger.error(f"JSON验证失败: {str(e)}")
            raise
        except Exception as e:
            logger.error(f"JSON验证失败: {str(e)}")
            raise ValidationError(f"JSON验证失败: {str(e)}", "json_validation", output) from e

    @staticmethod
    def _clean_json_output(output: str) -> str:
        """Strip markdown code-fence markers and surrounding whitespace.

        Only the fence markers (and an optional language tag) are removed;
        content that shares a line with a fence, e.g. ```` ```{"a": 1}``` ````,
        is preserved.
        """
        # Strip first so the anchored rules below see the fences even when the
        # text is surrounded by blank lines.
        text = output.strip()

        # Opening fence tagged "json" (any case). Content may follow on the
        # same line, so only the marker, the tag and the whitespace after it
        # are dropped. "\b" keeps tags such as "jsonc" / "json5" intact for
        # the next rule.
        text = re.sub(r'^```json\b\s*', '', text, flags=re.MULTILINE | re.IGNORECASE)

        # Lines that consist solely of a fence, optionally with a language tag.
        text = re.sub(r'^```[\w+-]*\s*$', '', text, flags=re.MULTILINE)

        # Bare fence markers glued to the content at the very start or end.
        text = re.sub(r'^```', '', text)
        text = re.sub(r'```$', '', text)

        return text.strip()

    @staticmethod
    def _validate_json_schema(data: Any, schema: Dict[str, Any]):
        """Run a simplified schema check on ``data``.

        Only two top-level schema keys are honoured: ``type`` (for the values
        ``object`` and ``array``) and ``required`` (checked when ``data`` is a
        dict). Nested ``properties`` / ``items`` descriptions are ignored.

        Raises:
            ValidationError: On a type mismatch or a missing required field.
        """
        expected_type = schema.get("type")
        if expected_type == "object" and not isinstance(data, dict):
            raise ValidationError(f"期望对象类型,实际为 {type(data)}", "schema_type")
        if expected_type == "array" and not isinstance(data, list):
            raise ValidationError(f"期望数组类型,实际为 {type(data)}", "schema_type")

        if "required" in schema and isinstance(data, dict):
            for required_field in schema["required"]:
                if required_field not in data:
                    raise ValidationError(f"缺少必需字段: {required_field}", "schema_required")

    @staticmethod
    def validate_narration_script(output: str) -> List[Dict[str, Any]]:
        """Validate a narration script and return its list of items.

        Args:
            output: Raw text expected to contain a JSON object with an
                ``items`` array.

        Returns:
            The ``items`` list from the parsed JSON, unchanged.

        Raises:
            ValidationError: If parsing fails, ``items`` is not a list, or any
                item fails ``_validate_narration_item``.
        """
        try:
            parsed_data = OutputValidator.validate_json_output(
                output, OutputValidator._NARRATION_SCHEMA
            )

            items = parsed_data.get("items", [])
            # The simplified schema check does not descend into "items", so
            # guard against e.g. {"items": {...}} before iterating.
            if not isinstance(items, list):
                raise ValidationError(f"期望数组类型,实际为 {type(items)}", "schema_type", output)

            for i, item in enumerate(items):
                OutputValidator._validate_narration_item(item, i)

            logger.info(f"解说文案验证成功,共 {len(items)} 个片段")
            return items

        except ValidationError:
            raise
        except Exception as e:
            logger.error(f"解说文案验证失败: {str(e)}")
            raise ValidationError(f"解说文案验证失败: {str(e)}", "narration_validation", output) from e

    @staticmethod
    def _is_positive_integer(value: Any) -> bool:
        """Return True for ints and integral floats greater than zero (never bool)."""
        # bool is a subclass of int; True must not count as ID 1.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return False
        if value <= 0:
            return False
        # Rejects 1.5 as well as NaN / infinity, whose is_integer() is False.
        return not isinstance(value, float) or value.is_integer()

    @staticmethod
    def _validate_narration_item(item: Dict[str, Any], index: int):
        """Validate one narration item.

        Checks, in order: the item is a dict, ``timestamp`` is a
        ``HH:MM:SS,mmm-HH:MM:SS,mmm`` string, ``picture`` and ``narration`` are
        non-blank strings, and ``_id`` is a positive integer.

        Args:
            item: The item to check.
            index: Zero-based position of the item, used in error messages
                (reported one-based).

        Raises:
            ValidationError: On the first failed check.
        """
        if not isinstance(item, dict):
            raise ValidationError(f"第{index+1}项必须为对象类型,实际为 {type(item)}", "invalid_item")

        timestamp = item.get("timestamp", "")
        if not isinstance(timestamp, str) or not OutputValidator._TIMESTAMP_RE.fullmatch(timestamp):
            raise ValidationError(f"第{index+1}项时间戳格式无效: {timestamp}", "timestamp_format")

        # Non-string values (null, numbers, ...) are treated the same as empty.
        picture = item.get("picture", "")
        if not isinstance(picture, str) or not picture.strip():
            raise ValidationError(f"第{index+1}项画面描述不能为空", "empty_picture")

        narration = item.get("narration", "")
        if not isinstance(narration, str) or not narration.strip():
            raise ValidationError(f"第{index+1}项解说文案不能为空", "empty_narration")

        item_id = item.get("_id")
        if not OutputValidator._is_positive_integer(item_id):
            raise ValidationError(f"第{index+1}项ID必须为正整数: {item_id}", "invalid_id")

    @staticmethod
    def validate_subtitle_analysis(output: str) -> str:
        """Validate free-text subtitle analysis output.

        The text must be non-blank and at least 50 characters long after
        stripping. A warning (not an error) is logged when none of a small set
        of analysis keywords occurs in it.

        Args:
            output: The analysis text to validate.

        Returns:
            ``output`` with surrounding whitespace removed.

        Raises:
            ValidationError: If the text is empty or too short, or if any
                other error occurs while validating.
        """
        try:
            if not output or not output.strip():
                raise ValidationError("字幕分析结果不能为空", "empty_analysis")

            stripped = output.strip()
            if len(stripped) < 50:
                raise ValidationError("字幕分析结果过短,可能不完整", "analysis_too_short")

            # Heuristic only: missing keywords are logged, not rejected.
            analysis_keywords = ["剧情", "情节", "角色", "故事", "内容"]
            if not any(keyword in output for keyword in analysis_keywords):
                logger.warning("字幕分析结果可能缺少关键分析要素")

            logger.info("字幕分析验证成功")
            return stripped

        except ValidationError:
            raise
        except Exception as e:
            logger.error(f"字幕分析验证失败: {str(e)}")
            raise ValidationError(f"字幕分析验证失败: {str(e)}", "analysis_validation", output) from e
|