ChatDev/server/services/batch_parser.py
2026-01-10 22:35:32 +08:00

193 lines
5.8 KiB
Python

"""Parse batch task files (CSV/Excel) into runnable tasks."""
import json
from dataclasses import dataclass
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import pandas as pd
from utils.exceptions import ValidationError
@dataclass(frozen=True)
class BatchTask:
row_index: int
task_id: Optional[str]
task_prompt: str
attachment_paths: List[str]
vars_override: Dict[str, Any]
def parse_batch_file(content: bytes, filename: str) -> Tuple[List[BatchTask], str]:
"""Parse a CSV/Excel batch file and return tasks plus file base name."""
suffix = Path(filename or "").suffix.lower()
if suffix not in {".csv", ".xlsx", ".xls"}:
raise ValidationError("Unsupported file type; must be .csv or .xlsx/.xls", field="file")
if suffix == ".csv":
df = _read_csv(content)
else:
df = _read_excel(content)
file_base = Path(filename).stem or "batch"
tasks = _parse_dataframe(df)
if not tasks:
raise ValidationError("Batch file contains no tasks", field="file")
return tasks, file_base
def _read_csv(content: bytes) -> pd.DataFrame:
try:
import chardet
except Exception:
chardet = None
encoding = "utf-8"
if chardet:
detected = chardet.detect(content)
encoding = detected.get("encoding") or encoding
try:
return pd.read_csv(BytesIO(content), encoding=encoding)
except Exception as exc:
raise ValidationError(f"Failed to read CSV: {exc}", field="file")
def _read_excel(content: bytes) -> pd.DataFrame:
try:
return pd.read_excel(BytesIO(content))
except Exception as exc:
raise ValidationError(f"Failed to read Excel file: {exc}", field="file")
def _parse_dataframe(df: pd.DataFrame) -> List[BatchTask]:
column_map = {str(col).strip().lower(): col for col in df.columns}
id_col = column_map.get("id")
task_col = column_map.get("task")
attachments_col = column_map.get("attachments")
vars_col = column_map.get("vars")
tasks: List[BatchTask] = []
seen_ids: set[str] = set()
for row_index, row in enumerate(df.to_dict(orient="records"), start=1):
task_prompt = _get_cell_text(row, task_col)
attachment_paths = _parse_json_list(row, attachments_col, row_index)
vars_override = _parse_json_dict(row, vars_col, row_index)
if not task_prompt and not attachment_paths:
raise ValidationError(
"Task and attachments cannot both be empty",
details={"row_index": row_index},
)
task_id = _get_cell_text(row, id_col)
if task_id:
if task_id in seen_ids:
raise ValidationError(
"Duplicate ID in batch file",
details={"row_index": row_index, "task_id": task_id},
)
seen_ids.add(task_id)
tasks.append(
BatchTask(
row_index=row_index,
task_id=task_id or None,
task_prompt=task_prompt,
attachment_paths=attachment_paths,
vars_override=vars_override,
)
)
return tasks
def _get_cell_text(row: Dict[str, Any], column: Optional[str]) -> str:
if not column:
return ""
value = row.get(column)
if value is None:
return ""
if isinstance(value, float) and pd.isna(value):
return ""
if pd.isna(value):
return ""
return str(value).strip()
def _parse_json_list(
row: Dict[str, Any],
column: Optional[str],
row_index: int,
) -> List[str]:
if not column:
return []
raw_value = row.get(column)
if raw_value is None or (isinstance(raw_value, float) and pd.isna(raw_value)):
return []
if isinstance(raw_value, list):
return _ensure_string_list(raw_value, row_index, "Attachments")
if isinstance(raw_value, str):
if not raw_value.strip():
return []
try:
parsed = json.loads(raw_value)
except json.JSONDecodeError as exc:
raise ValidationError(
f"Invalid JSON in Attachments: {exc}",
details={"row_index": row_index},
)
return _ensure_string_list(parsed, row_index, "Attachments")
raise ValidationError(
"Attachments must be a JSON list",
details={"row_index": row_index},
)
def _parse_json_dict(
row: Dict[str, Any],
column: Optional[str],
row_index: int,
) -> Dict[str, Any]:
if not column:
return {}
raw_value = row.get(column)
if raw_value is None or (isinstance(raw_value, float) and pd.isna(raw_value)):
return {}
if isinstance(raw_value, dict):
return raw_value
if isinstance(raw_value, str):
if not raw_value.strip():
return {}
try:
parsed = json.loads(raw_value)
except json.JSONDecodeError as exc:
raise ValidationError(
f"Invalid JSON in Vars: {exc}",
details={"row_index": row_index},
)
if not isinstance(parsed, dict):
raise ValidationError(
"Vars must be a JSON object",
details={"row_index": row_index},
)
return parsed
raise ValidationError(
"Vars must be a JSON object",
details={"row_index": row_index},
)
def _ensure_string_list(value: Any, row_index: int, field: str) -> List[str]:
if not isinstance(value, list):
raise ValidationError(
f"{field} must be a JSON list",
details={"row_index": row_index},
)
result: List[str] = []
for item in value:
if item is None or (isinstance(item, float) and pd.isna(item)):
continue
result.append(str(item))
return result