mirror of
https://github.com/OpenBMB/ChatDev.git
synced 2026-04-25 11:18:06 +00:00
193 lines
5.8 KiB
Python
193 lines
5.8 KiB
Python
"""Parse batch task files (CSV/Excel) into runnable tasks."""
|
|
|
|
import json
|
|
from dataclasses import dataclass
|
|
from io import BytesIO
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
import pandas as pd
|
|
|
|
from utils.exceptions import ValidationError
|
|
|
|
|
|
@dataclass(frozen=True)
class BatchTask:
    """One runnable task built from a single data row of a batch file.

    The dataclass is frozen, so fields cannot be reassigned once an
    instance has been created.
    """

    # Position of the source row among the data rows (counting starts at 1).
    row_index: int
    # Text of the "id" column, or None when the row carries no ID.
    task_id: Optional[str]
    # Stripped text of the "task" column; may be "" when only attachments exist.
    task_prompt: str
    # Entries of the "attachments" JSON list, each converted to str.
    attachment_paths: List[str]
    # JSON object taken from the "vars" column ({} when the cell is empty).
    vars_override: Dict[str, Any]
|
|
|
|
|
|
def parse_batch_file(content: bytes, filename: str) -> Tuple[List[BatchTask], str]:
    """Turn an uploaded CSV/Excel batch file into tasks.

    Args:
        content: Raw bytes of the uploaded file.
        filename: Original file name; its extension selects the reader and
            its stem becomes the returned base name.

    Returns:
        A ``(tasks, file_base)`` pair, where ``file_base`` is the file name
        without its extension (``"batch"`` if that stem is empty).

    Raises:
        ValidationError: If the extension is not .csv/.xlsx/.xls, or if the
            file yields no tasks. Errors raised by the reader and row
            parser helpers propagate unchanged.
    """
    extension = Path(filename or "").suffix.lower()
    if extension not in {".csv", ".xlsx", ".xls"}:
        raise ValidationError("Unsupported file type; must be .csv or .xlsx/.xls", field="file")

    # Everything that is not CSV at this point is one of the Excel suffixes.
    frame = _read_csv(content) if extension == ".csv" else _read_excel(content)

    file_base = Path(filename).stem or "batch"
    parsed_tasks = _parse_dataframe(frame)
    if parsed_tasks:
        return parsed_tasks, file_base
    raise ValidationError("Batch file contains no tasks", field="file")
|
|
|
|
|
|
def _read_csv(content: bytes) -> pd.DataFrame:
    """Load raw CSV bytes into a DataFrame whose cells are kept as text.

    The encoding is guessed with ``chardet`` when that optional package can
    be imported; otherwise (or when detection yields nothing) UTF-8 is used.

    Cells are read with ``dtype=str`` so pandas does not infer numeric
    types: the rest of this module only ever consumes cells as text or as
    JSON strings, and type inference would silently rewrite values (an ID
    of ``007`` would become ``7``, and an integer column containing a blank
    cell would turn into floats such as ``7.0``). Empty cells are still
    reported by pandas as NaN.

    Args:
        content: Raw bytes of the CSV file.

    Returns:
        The parsed DataFrame.

    Raises:
        ValidationError: If pandas cannot read the content.
    """
    # chardet is an optional dependency; any failure to import it simply
    # disables detection (kept deliberately broad, best-effort).
    try:
        import chardet
    except Exception:
        chardet = None

    encoding = "utf-8"
    if chardet:
        detected = chardet.detect(content)
        # detect() may report no encoding; keep the UTF-8 default then.
        encoding = detected.get("encoding") or encoding

    try:
        return pd.read_csv(BytesIO(content), encoding=encoding, dtype=str)
    except Exception as exc:
        raise ValidationError(f"Failed to read CSV: {exc}", field="file") from exc
|
|
|
|
|
|
def _read_excel(content: bytes) -> pd.DataFrame:
    """Load raw Excel workbook bytes into a DataFrame.

    Args:
        content: Raw bytes of the .xlsx/.xls file.

    Returns:
        Whatever ``pd.read_excel`` produces for the in-memory workbook.

    Raises:
        ValidationError: If wrapping or reading the content fails for any
            reason; the underlying error text is included in the message.
    """
    try:
        frame = pd.read_excel(BytesIO(content))
    except Exception as exc:
        raise ValidationError(f"Failed to read Excel file: {exc}", field="file")
    else:
        return frame
|
|
|
|
|
|
def _parse_dataframe(df: pd.DataFrame) -> List[BatchTask]:
    """Convert every row of ``df`` into a ``BatchTask``.

    Columns are located by name after stripping and lower-casing the
    headers; the recognised names are ``id``, ``task``, ``attachments`` and
    ``vars``. Rows are numbered from 1 in the order they appear.

    Args:
        df: DataFrame produced by one of the file readers.

    Returns:
        One ``BatchTask`` per row, in row order.

    Raises:
        ValidationError: If a row has neither task text nor attachments, if
            a non-empty ID repeats, or if a JSON cell is rejected by the
            cell-parsing helpers.
    """
    # Normalised header -> original column label, so lookups ignore case/padding.
    headers = {str(label).strip().lower(): label for label in df.columns}
    id_col, task_col, attachments_col, vars_col = (
        headers.get(name) for name in ("id", "task", "attachments", "vars")
    )

    tasks: List[BatchTask] = []
    seen_ids: set[str] = set()

    records = df.to_dict(orient="records")
    for row_index, record in enumerate(records, start=1):
        prompt = _get_cell_text(record, task_col)
        attachments = _parse_json_list(record, attachments_col, row_index)
        overrides = _parse_json_dict(record, vars_col, row_index)

        if not (prompt or attachments):
            raise ValidationError(
                "Task and attachments cannot both be empty",
                details={"row_index": row_index},
            )

        task_id = _get_cell_text(record, id_col)
        # Only non-empty IDs take part in duplicate detection.
        if task_id and task_id in seen_ids:
            raise ValidationError(
                "Duplicate ID in batch file",
                details={"row_index": row_index, "task_id": task_id},
            )
        if task_id:
            seen_ids.add(task_id)

        tasks.append(
            BatchTask(
                row_index=row_index,
                task_id=task_id or None,
                task_prompt=prompt,
                attachment_paths=attachments,
                vars_override=overrides,
            )
        )
    return tasks
|
|
|
|
|
|
def _get_cell_text(row: Dict[str, Any], column: Optional[str]) -> str:
    """Return the cell stored under ``column`` as stripped text.

    Args:
        row: Mapping of column label to cell value for one record.
        column: Label to read, or a falsy value when the column is absent.

    Returns:
        ``str(value).strip()`` for a present value; ``""`` when ``column``
        is falsy, the key is missing, or the cell is None/NaN-like
        according to ``pd.isna``.
    """
    if not column:
        return ""

    cell = row.get(column)
    # A missing key yields None; pandas marks empty cells with NaN/NA.
    if cell is None or pd.isna(cell):
        return ""
    return str(cell).strip()
|
|
|
|
|
|
def _parse_json_list(
    row: Dict[str, Any],
    column: Optional[str],
    row_index: int,
) -> List[str]:
    """Read the attachments cell of ``row`` as a list of strings.

    Args:
        row: Mapping of column label to cell value for one record.
        column: Label of the attachments column, or a falsy value if absent.
        row_index: Row number reported in error details.

    Returns:
        ``[]`` when the column is absent or the cell is None, float NaN or
        a blank string; otherwise the list (given directly or decoded from
        a JSON string) after conversion by ``_ensure_string_list``.

    Raises:
        ValidationError: If a string cell is not valid JSON, or if the cell
            (or its decoded JSON) is not a list.
    """
    if not column:
        return []

    cell = row.get(column)
    cell_is_nan = isinstance(cell, float) and pd.isna(cell)
    if cell is None or cell_is_nan:
        return []

    if isinstance(cell, list):
        return _ensure_string_list(cell, row_index, "Attachments")

    # Anything that is neither a list nor text cannot describe attachments.
    if not isinstance(cell, str):
        raise ValidationError(
            "Attachments must be a JSON list",
            details={"row_index": row_index},
        )

    if not cell.strip():
        return []
    try:
        decoded = json.loads(cell)
    except json.JSONDecodeError as exc:
        raise ValidationError(
            f"Invalid JSON in Attachments: {exc}",
            details={"row_index": row_index},
        )
    return _ensure_string_list(decoded, row_index, "Attachments")
|
|
|
|
|
|
def _parse_json_dict(
    row: Dict[str, Any],
    column: Optional[str],
    row_index: int,
) -> Dict[str, Any]:
    """Read the vars cell of ``row`` as a dictionary.

    Args:
        row: Mapping of column label to cell value for one record.
        column: Label of the vars column, or a falsy value if absent.
        row_index: Row number reported in error details.

    Returns:
        ``{}`` when the column is absent or the cell is None, float NaN or
        a blank string; the cell itself when it is already a dict;
        otherwise the object decoded from the cell's JSON text.

    Raises:
        ValidationError: If a string cell is not valid JSON, or if the cell
            (or its decoded JSON) is not a JSON object.
    """
    if not column:
        return {}

    cell = row.get(column)
    cell_is_nan = isinstance(cell, float) and pd.isna(cell)
    if cell is None or cell_is_nan:
        return {}

    if isinstance(cell, dict):
        return cell

    if isinstance(cell, str):
        if not cell.strip():
            return {}
        try:
            decoded = json.loads(cell)
        except json.JSONDecodeError as exc:
            raise ValidationError(
                f"Invalid JSON in Vars: {exc}",
                details={"row_index": row_index},
            )
        if isinstance(decoded, dict):
            return decoded

    # Reached for non-text cells and for JSON that decodes to a non-object.
    raise ValidationError(
        "Vars must be a JSON object",
        details={"row_index": row_index},
    )
|
|
|
|
|
|
def _ensure_string_list(value: Any, row_index: int, field: str) -> List[str]:
    """Check that ``value`` is a list and stringify its entries.

    Args:
        value: Candidate list (for example, decoded JSON).
        row_index: Row number reported in error details.
        field: Field name used at the start of the error message.

    Returns:
        ``str(item)`` for every item of ``value``, skipping items that are
        None or float NaN.

    Raises:
        ValidationError: If ``value`` is not a list.
    """
    if isinstance(value, list):
        return [
            str(entry)
            for entry in value
            if not (entry is None or (isinstance(entry, float) and pd.isna(entry)))
        ]
    raise ValidationError(
        f"{field} must be a JSON list",
        details={"row_index": row_index},
    )
|