initial commit

2026-06-09 17:02:17 +00:00 · 2025-09-23 20:31:19 +08:00 · 2025-09-23 20:31:19 +08:00 · b24e6e6bd0
commit b24e6e6bd0
85 changed files with 8843 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,206 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+
+# Visual Studio Code
+#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 
+#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#  and can be added to the global gitignore or merged into this file. However, if you prefer, 
+#  you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Cursor
+#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+#  refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
--- a/README.md
+++ b/README.md
@ -0,0 +1,191 @@
+# Multi-Agent Collaboration via Evolving Orchestration
+**Puppeteer** introduces a new way for large language models (LLMs) to collaborate efficiently on complex tasks.
+
+Instead of static structures, our framework uses a centralized orchestrator (“puppeteer”) that dynamically directs multiple agents (“puppets”) based on evolving task states. The orchestrator is trained with reinforcement learning to sequence and prioritize agents, enabling flexible and adaptive collective reasoning.
+
+![framework](assets/framework.png "Multi-Agent Collaboration via Evolving Orchestration")
+
+# Quick Start
+
+## Prerequisites
+
+- Python 3.11 or higher
+- CUDA-compatible GPU (optional, for policy training)
+- API keys for desired LLM providers
+
+## Installation
+
+1. **Clone the repository**
+   ```bash
+    git clone -b puppeteer https://github.com/OpenBMB/ChatDev
+    cd ChatDev
+    cd puppeteer
+   ```
+
+2. **Set up environment and install dependencies**
+   ```bash
+   # Create conda environment
+   conda create -n puppeteer_env python=3.11
+   conda activate puppeteer_env
+   
+   # Install dependencies
+   pip install -r requirements.txt
+   ```
+3. **Download the pre-trained puppeteer model base**
+
+    We use a 70B reward model as the untrained Puppeteer base, so we first need to download this model. The Hugging Face repository is `nvidia/Llama-3.1-Nemotron-70B-Reward`.
+
+4. **Configure the system**
+   ```bash
+   # Edit configurations with your settings
+   vim config/global.yaml  # Add your API keys
+   ```
+    - Global Configuration (`config/global.yaml`): configure API access, file paths, and system behavior:
+        ```yaml
+        # API Configuration
+        logging:
+            level: INFO               # Logging level, options: DEBUG, INFO, WARNING, ERROR
+            logpath: ./logs           # Folder path to store log files
+
+        # Path to the folder containing model weights of the Puppeteer base model
+        # (downloaded in step 3, or directly loading)
+        model_weight_path: nvidia/Llama-3.1-Nemotron-70B-Reward
+
+        api_keys:
+            openai_api_key: ""        # Your OpenAI API key
+            openai_base_url: "https://api.openai.com/v1/"  # OpenAI base URL
+            bing_api_key: ""          # Bing API key for web search (optional)
+
+        # System retry settings
+        max_retry_times: 10           # Maximum number of times to retry API calls
+        max_json_reformat_turns: 10   # Maximum retries for JSON parsing/reformatting
+
+        # Enable external tools (like web search, file read, etc.)
+        external_tools_enabled: True
+
+        # File paths that agents may need
+        file_path:
+            root_file_path: ./data    # Root folder containing all necessary files for agents
+
+        # Graph exploration parameters for multi-agent reasoning
+        graph:
+            max_parallel_paths: 4     # Maximum number of parallel paths to explore (recommended 2-6)
+            max_step_num: 5           # Maximum number of steps (nodes) in each path (recommended 4-6)
+        ```
+    ⚠️ **Note:** Replace placeholders with your actual API keys and url, all the places are needed.
+
+5. **Quick start with the predefined settings**
+
+    The agents are initialized from `puppeteer/personas/personas.jsonl`, which includes all currently supported reasoning patterns and tool modes. The default model backbone is GPT-4o.
+    ```bash
+    cd puppeteer 
+    python main.py <task> <mode> [--level LEVEL] [--index INDEX] [--data_limit LIMIT] [--personas PATH]
+    ```
+    Example:
+    ```bash
+    # Run MMLU-Pro validation set with a data limit of 10
+    python main.py MMLU-Pro validation --data_limit 10
+    ```
+If the run is successful, you will see output similar to [EXAMPLE](puppeteer/logs/example).
+# Customization 
+
+Puppeteer provides multiple ways to tailor the system to your needs
+
+## Agents
+### 🔎 Agent Categories
+
+In this framework, agents are divided into two main categories based on whether they have access to external tools:
+
+1. Agents with Tools
+    - Description: These agents can interact with external systems to gather data, execute code, or access files.
+    - Supported Actions: `TOOL_ACTION_LIST`
+        - search_arxiv – Search for academic papers on arXiv
+        - search_bing – Query the Bing search engine
+        - access_website – Access websites and extract information
+        - run_python – Execute Python code
+        - read_file – Read and extract content from files
+
+2. Agents without Tools
+    - Description: These agents focus on internal reasoning, critique, reflection, and summarization. They do not interact with external systems.
+    - Supported Actions: `REASONING_ACTION_LIST`
+        - reasoning – Logical reasoning
+        - critique – Evaluate and critique reasoning
+        - question – Generate clarifying sub-questions
+        - reflect – Provide reflective analysis
+        - conclude – Generate final conclusions
+        - summarize – Summarize information concisely
+        - planning – Create structured plans
+        - modify – Correct errors and refine results
+
+3. Termination Agent
+    - Description: A special agent responsible for determining when the reasoning process should stop.
+    - Supported Actions: `TERMINATION_ACTION_LIST`
+        - terminate – End the reasoning process and deliver the final output
+
+### ⚙️ Customize  
+
+You can extend this framework by creating new agents, adding actions, or integrating new base models.  
+
+#### 1. Multiple Actions per Agent  
+- Currently, each agent is designed to perform **a single action** (see [`reasoning_agent.py`](puppeteer/agent/reasoning_agent.py)).  
+- To create an agent that supports **multiple actions**, implement your own custom agent by inheriting from [`agent.py`](puppeteer/agent/agent.py).  
+
+#### 2. Adding New Actions  
+- To introduce a **new action**, you need to:  
+  1. Define the corresponding **prompt or tool**.  
+  2. Modify [`reasoning_agent.py`](puppeteer/agent/reasoning_agent.py) to integrate the new action into the reasoning workflow.  
+
+#### 3. Supporting New Base Models  
+- If you want to use a **new base model** for agents:  
+  - Extend the configuration in [`model_config.py`](puppeteer/model/model_config.py).  
+  - Ensure that the new model is properly registered and compatible with the agent framework.  
+
+## 🎭 Puppeteer Training  
+
+The training parameters are defined in [`policy.json`](puppeteer/config/policy.json). Key parameters include:  
+### 🔹 Optimization  
+- `learning_rate`: `0.0001`  
+  Controls the learning speed of the policy network.  
+- `sample_size`: `1`  
+  Number of samples used per training step.  
+
+### 🔹 Agent Scale Control  
+- `max_num_agents`: `3`  
+  Maximum number of agents allowed in the system.  
+- `next_num_agents`: `3`  
+  Number of agents spawned in the next step.  
+- `max_path`: `6`  
+  Maximum trajectory length for agent exploration.  
+
+### 🔹 Reward Configuration  
+- `gamma`: `0.99`  
+  Discount factor for future rewards.  
+- `reward_factors`: Shaping factors for different actions:  
+  - `default`: `-1.0` → Penalty for invalid/neutral actions.  
+  - `terminator`: `0.5` → Reward for correct termination.  
+  - `web_search`: `-1.5` → Penalty for costly web-search actions.  
+
+### 🔹 Cost Control  
+- `scale`: `0.1`  
+  Base cost scaling factor.  
+- `growth_rate`: `1.0`  
+  Linear growth rate of cost per step.  
+- `inverse`: `false`  
+  If set to `true`, applies inverse cost scaling.  
+
+### 🔹 Training Paradigm  
+The current training paradigm uses the hidden state of the last token from the Reward Model. This hidden state is passed through an MLP-based policy network to generate action probabilities.  
+You can switch the Reward Model or design a new training paradigm by modifying the policy network input/output structure.  
+
+
+# Citation
+If you use Puppeteer in your work, please cite our NeurIPS 2025 paper:
+```bibtex
+@inproceedings{dang2025multiagentcollaboration,
+  title={Multi-Agent Collaboration via Evolving Orchestration},
+  author={Yufan Dang and Chen Qian and Xueheng Luo and Jingru Fan and Zihao Xie and Ruijie Shi and Weize Chen and Cheng Yang and Xiaoyin Che and Ye Tian and Xuantang Xiong and Lei Han and Zhiyuan Liu and Maosong Sun},
+  booktitle={The Thirty-ninth Annual Conference on Neural Information Processing Systems (NeurIPS)},
+  year={2025},
+  url={https://arxiv.org/abs/2505.19591}
+}
+```
--- a/assets/framework.png
+++ b/assets/framework.png
--- a/puppeteer/agent/init.py
+++ b/puppeteer/agent/init.py
--- a/puppeteer/agent/agent.py
+++ b/puppeteer/agent/agent.py
@ -0,0 +1,132 @@
+import json
+import yaml
+import hashlib
+import re
+import time
+from utils.other_utils import JsonFormat
+from copy import deepcopy
+from abc import ABC, abstractmethod
+from model.query_manager import query_manager
+from agent.agent_info.global_info import GlobalInfo
+
+global_config = yaml.safe_load(open("./config/global.yaml", "r"))
+
+class Agent(ABC):
+    def __init__(self, role, role_prompt, index, model="gpt",  actions=[], policy=None, global_info:GlobalInfo =None, initial_dialog_history=None) -> None:
+        """
+        Initialize the Agent object.
+        :param role: The name of the agent's role
+        :param role_prompt: The role prompt information
+        :param index: The index to distinguish different agent instances
+        :param global_info: Global configuration info, default is None
+        :param model: The model to be used (either 'gpt' or 'gpt4'), default is 'gpt'
+        :param actions: List of actions available to the agent, default is empty
+        :param initial_dialog_history: Initial dialog history, default is None
+        """
+        super().__init__()
+
+        # Initialize model query function
+        self.model = model
+        self.query_func = None
+        self.query_func = self._get_query_function()
+        
+        if not self.query_func:
+            raise ValueError(f"Model '{model}' not implemented")
+        
+        # Other basic settings
+        self.json_format = JsonFormat(query_func=self.query_func)
+        self.role = role
+        self.role_prompt = role_prompt
+        self.system_prompt = self.role_prompt  # Initial system prompt
+        self.policy = policy
+        self.index = index
+        self.hash = hashlib.md5(f"{index}{role}{role_prompt}{model}{time.ctime()}".encode()).hexdigest()
+
+        # Tools and file path settings
+        self.actions = actions
+        self.root_file_path = global_config["file_path"]["root_file_path"]
+        if global_info:
+            self.workspace_path = global_info.workpath
+        
+        # Activation state and dialog history
+        self._activated = False
+        self.initial_dialog_history = initial_dialog_history or []
+        self.dialog_history = deepcopy(self.initial_dialog_history)
+    
+    @property
+    def simplified_dialog_history(self):
+        self._simplified_dialog_history = []
+        for h in self.dialog_history:
+            if h.get("role") == "user":
+                # Mask user input 
+                # "*Your previous reasoning was {}*”
+                masked_text = re.sub(r'\*.*?\*', '', h["content"])
+                self._simplified_dialog_history.append({"role": h["role"], "content": masked_text})
+            else:
+                self._simplified_dialog_history.append(h)
+        return self._simplified_dialog_history
+
+    @property
+    def unique_identifier(self):
+        """Return a unique identifier for the Agent instance."""
+        return {
+            "index": self.index,
+            "role": self.role,
+            "hash": self.hash
+        }
+    def _get_query_function(self):
+        def query_func(messages, system_prompt=None):
+            return query_manager.query(self.model, messages, system_prompt)
+        return query_func
+    
+    @abstractmethod
+    def activate(self, global_info, initial_dialog_history=None):
+        """Activate the agent, enabling it to perform actions."""
+        pass
+
+    @abstractmethod
+    def deactivate(self):
+        """Deactivate the agent."""
+        self._activated = False
+
+    def reset(self):
+        """Reset the agent's state, clearing dialog history and deactivating it."""
+        self.dialog_history = []
+        self.initial_dialog_history = []
+        self.deactivate()
+
+
+    @abstractmethod
+    def _build_current_action(self, format_action, flag, answer, step_data):
+        """Build the current workflow guiding the agent's actions."""
+        pass
+
+    @abstractmethod
+    def take_action(self, global_info, external_tools_enabled=True):
+        """Let the agent take an action based on the current state."""
+        pass
+    
+    @abstractmethod
+    def _execute_action(self, action, global_info):
+        """Execute a specific action."""
+        pass
+    
+    @abstractmethod
+    def _reasoning_operation(self, action, global_info) -> str:
+        """Perform a reasoning operation."""
+        pass
+
+    @abstractmethod
+    def _answer_operation(self, global_info) -> str:
+        """Generate an answer based on the current state."""
+        pass
+
+    @abstractmethod
+    def _tool_operation(self, action: json, global_info) -> str:
+        """Perform an operation involving external tools."""
+        pass
+
+    @abstractmethod
+    def _interaction_operation(self, code, env, global_info) -> str:
+        """Handle operations related to agent interaction."""
+        pass
--- a/puppeteer/agent/agent_info/actions.py
+++ b/puppeteer/agent/agent_info/actions.py
@ -0,0 +1,3 @@
+REASONING_ACTION_LIST = ["reasoning","critique","question","reflect","conclude","summarize","planning","modify"]
+TOOL_ACTION_LIST = ["search_arxiv", "search_bing", "access_website", "run_python", "read_file"]
+TERMINATION_ACTION_LIST = ["terminate"]
--- a/puppeteer/agent/agent_info/global_info.py
+++ b/puppeteer/agent/agent_info/global_info.py
@ -0,0 +1,91 @@
+import os
+import re
+import logging
+from agent.agent_info.workflow import Workflow
+from agent.agent_info.workflow import Action
+
+
+class GlobalInfo:
+    def __init__(self, path_id: int, workpath: str, task: dict, logger: logging.Logger=None, env=None, env_name=None):
+        self.path_id = path_id
+        self.logger = logger
+        self.workpath = workpath
+        self.task = task
+        self.workflow = Workflow(path_id=self.path_id, workpath=self.workpath)
+        self.url = self._extract_url(task.get("Question"))
+        self.file_name = task.get("file_name")
+        self.file_extension = self._extract_file_extension(self.file_name)
+        self.answers = []
+
+        self.code_path = ""
+        self.env_exists = env is not None
+        self.env_name = env_name
+        self.env = env
+        self.supervisor = self._extract_supervisor(env, env_name)
+    
+    @property
+    def state_answers(self):
+        """Returns the list of answers in the state
+        Return: list of answers
+        """
+        ret = []
+        for index, a in enumerate(self.answers):
+            ret.append("{}".format(a))
+        if len(ret) == 0:
+            return []
+        return ret  
+            
+    @property
+    def total_tokens(self):
+        return self.workflow.total_tokens
+
+    @property
+    def total_cost(self):
+        return self.workflow.total_cost
+    
+    def _extract_url(self, question):
+        """Extracts the URL from the task question, if any."""
+        url_regex = r"(https?://[^\s]+)"
+        urls = re.findall(url_regex, question or "")
+        return urls[0] if urls else None
+
+    def _extract_file_extension(self, file_name):
+        """Extracts the file extension from the file name."""
+        if file_name:
+            return os.path.splitext(file_name)[1]
+        return None
+
+    def _extract_supervisor(self, env, env_name):
+        if env_name == "appworld" and env is not None:
+            return env.task.supervisor
+        return None
+
+    def update(self, action: Action):
+        """Updates the workflow with the given action and logs the update."""
+        self.workflow.path_id = self.path_id
+        self.workflow.add_action(action)
+        action.write_code()
+        self.workflow.write_down()
+        self.logger.info(f"Updated workflow: {self.workflow}")
+
+    def add_answer(self, answer):
+        """Adds the answers to the workflow and logs the update."""
+        self.answers.append(answer)
+    
+    def agent_role_list(self):
+        return self.workflow.get_agent_role_list()
+    
+    def to_dict(self):
+        return {
+            "task": self.task,
+            "url": self.url,
+            "file_name": self.file_name,
+            "file_extension": self.file_extension,
+            "answer": self.answer,
+            "workflow": self.workflow,
+            "workspace_path": self.workpath,
+            "env_exists": self.env_exists,
+            "env_name": self.env_name,
+            "supervisor": self.supervisor
+        }
+    
--- a/puppeteer/agent/agent_info/workflow.py
+++ b/puppeteer/agent/agent_info/workflow.py
@ -0,0 +1,187 @@
+import os
+import json
+import networkx as nx
+import matplotlib.pyplot as plt
+from agent.agent_info.actions import REASONING_ACTION_LIST, TOOL_ACTION_LIST, TERMINATION_ACTION_LIST
+from model.model_config import model_registry
+
+class Action:
+    def __init__(self, action:dict, result:dict, success:str, agent_role:str, agent_model:str):
+        self.action = action  # format action, e.g., {"action": "", "parameters": ""}
+        self.result = result  # action result, e.g., {"step_data": "", "answer": ""}
+        self.success = success  # Success or Failure of the action
+        self.agent_role = agent_role  # Role of the agent
+        self.agent_model = agent_model  # Model of the agent
+        self.model_parameter = model_registry.get_model_size(agent_model) if model_registry.get_model_size(agent_model) else 0
+        self.cost = 0
+        self.tokens = 0
+
+    def to_dict(self):
+        return {
+            "agent": self.agent_role,
+            "action": self.action,
+            "cost": self.cost,
+            "tokens": self.tokens,
+            "model_size": self.model_parameter,
+            "result": self.result,
+            "success": self.success
+        }
+    
+    def to_str(self):
+        return "Agent: {}\nAction: {}\nResult: {}\nSuccess: {}".format(self.agent_role, self.action, self.result, self.success)
+    
+    def set_workpath(self, workpath:str):
+        self.workpath = workpath
+
+    def write_code(self):
+        if self.result.get("code") is None:
+            return 
+        else:
+            path = os.path.join(self.workpath, "code_{}.py".format(self.path_id))
+            with open(path, 'w', encoding='utf-8') as file:
+                json.dump(self.workflow, file, ensure_ascii=False, indent=4)
+            file.close()
+
+    def set_cost(self, tokens:int):
+        self.cost = 2 * self.model_parameter * tokens
+        self.tokens = tokens
+        print("[Action Cost]: {}".format(self.cost))
+
+class Workflow:
+    def __init__(self, path_id:int, workpath:str):
+        self.path_id: int = path_id
+        self.workpath: str = workpath
+        self.workflow: list = []
+    
+    @property
+    def total_cost(self):
+        cost = 0
+        for a in self.workflow:
+            cost += a.cost
+        return cost
+
+    @property
+    def total_tokens(self):
+        tokens = 0
+        for a in self.workflow:
+            tokens += a.tokens
+        return tokens
+    
+    def to_dict(self):
+        return [action.to_dict() for action in self.workflow]
+    
+    def write_down(self):
+        path = os.path.join(self.workpath, "path_{}.jsonl".format(self.path_id))
+        with open(path, 'w', encoding='utf-8') as file:
+            json.dump(self.to_dict(), file, ensure_ascii=False, indent=4)
+        file.close()
+    
+    def add_action(self, action:Action):
+        action.set_workpath(self.workpath)
+        self.workflow.append(action)
+    
+    def get_agent_role_list(self):
+        agent_role_list = []
+        for action in self.workflow:
+            role = action.agent_role
+            agent_role_list.append(role)
+        return agent_role_list
+
+    @property
+    def language_state(self):
+        state = []
+        for index, action in enumerate(self.workflow):
+            step_str = "{}({}) - {} - {}".format(
+                action.action.get("action"),
+                action.action.get("parameter"),
+                action.result.get("step_data"),
+                action.result.get("answer")
+            )
+            state.append(step_str)
+        if len(state) == 0:
+            return "None"
+        return "\n".join(state)
+    
+    @property
+    def state(self):
+        state = []
+        for action in self.workflow:
+            flag = 1 if action.success == "Success" else 0  
+            state.append((action.agent_role, action.action.get("action"), flag))
+        if len(state) == 0:
+            return tuple([(None, None, -1)])
+        return tuple(state)
+    
+    @property
+    def valid_code(self):
+        data = []
+        for action in self.workflow:
+            if action.success == "Success":
+                data.append(action.result.get("code"))
+        return data
+    @property
+    def all_actions(self):
+        data = []
+        for action in self.workflow:
+            data.append(action.action.get("action"))
+        return data
+    
+    @property
+    def valid_actions(self):
+        data = []
+        for action in self.workflow:
+            if action.success == "Success":
+                data.append(action.action.get("action"))
+        return data
+    
+    @property
+    def valid_results(self):
+        data = []
+        for action in self.workflow:
+            if action.success == "Success":
+                data.append("Result: {}".format(action.result.get("step_data")))
+        return data
+    
+    @property
+    def valid_reasoning_results(self):
+        data = []
+        for action in self.workflow:
+            if action.action.get("action") in REASONING_ACTION_LIST and action.success == "Success":
+                data.append("Successful Action: {}\nResult: {}".format(action.action.get("action"), action.result.get("step_data")))
+
+        return data
+
+    @property
+    def valid_tool_results(self):
+        data = []
+        for action in self.workflow:
+            if action.action.get("action") not in REASONING_ACTION_LIST and action.success == "Success":
+                data.append("Successful Action: {}\nResult: {}".format(action.action.get("action"), action.result.get("step_data")))
+
+        return data
+    
+    @property
+    def unvalid_tool_results(self):
+        data = []
+        for action in self.workflow:
+            if action.action.get("action") not in REASONING_ACTION_LIST and action.success == "Failure":
+                data.append("Successful Action: {}\nResult: {}".format(action.action.get("action"), action.result.get("step_data")))
+
+        return data
+    
+
+    def visualize(self):
+        G = nx.MultiDiGraph()
+        node_colors = []
+        for i, w in enumerate(self.workflow):
+            G.add_node(i, label=w.action.get("action"), result=w.result, status=w.success)
+            node_colors.append("green" if w.success == "Success" else "red")
+            if i > 0:
+                G.add_edge(i-1, i)
+        pos = nx.kamada_kawai_layout(G)
+        labels = nx.get_node_attributes(G, 'label')
+        nx.draw(G, pos, with_labels=True, labels=labels, node_size=3000, font_size=10, arrows=True, node_color=node_colors)
+        
+        path = os.path.join(self.workpath, "workflow_path_{}.png".format(self.path_id))
+        plt.savefig(path)
+        plt.clf()
--- a/puppeteer/agent/reasoning_agent.py
+++ b/puppeteer/agent/reasoning_agent.py
@ -0,0 +1,375 @@
+import json
+import yaml
+import os
+from tenacity import retry, stop_after_attempt, wait_exponential
+import re
+from copy import deepcopy
+
+from tools.base.register  import global_tool_registry
+from tools.web_search import Web_Search
+from tools.code_interpreter import CodeInterpreter
+from tools.file_read import FileRead
+
+from agent.agent import Agent
+from agent.agent_info.global_info import GlobalInfo
+from agent.agent_info.workflow import Action
+from agent.agent_info.actions import REASONING_ACTION_LIST, TOOL_ACTION_LIST, TERMINATION_ACTION_LIST
+
+from utils.file_utils import format_code_with_prints, extract_code_from_text, write_code, write_text, read_code
+
+global_config = yaml.safe_load(open("./config/global.yaml", "r"))
+
+class Reasoning_Agent(Agent):
+    def __init__(self, role, role_prompt, index,  model="gpt", actions=[], policy=None, global_info=None,initial_dialog_history=None) -> None:
+        super().__init__(role, role_prompt, index, model, actions, policy, global_info, initial_dialog_history)
+
+
+    def activate(self, global_info:GlobalInfo, initial_dialog_history=None):
+        if self._activated:
+            return 
+        self._activated = True
+ 
+        system_step_data = global_info.workflow.valid_tool_results
+        prompt_filepath = "prompts/general/system_prompt.json" 
+        with open(prompt_filepath, "r") as f:
+            system_prompt = json.load(f)
+        system_step_data = [self._compress_data(d) for d in system_step_data]
+        self.system_prompt  =  "\n".join(system_prompt['system_prompt']).format(self.role_prompt, 
+                                                                                str(global_info.task.get("Question")), 
+                                                                                str(system_step_data))
+        
+        self.workspace_path = global_info.workpath
+
+        if initial_dialog_history is None or initial_dialog_history == []:
+            self.dialog_history = [{"role": "system", "content": self.system_prompt}]
+        else:
+            self.dialog_history = deepcopy(initial_dialog_history)
+            self.dialog_history[0] = {"role": "system", "content": self.system_prompt}
+
+    def deactivate(self):
+        self.initial_dialog_history = deepcopy(self.dialog_history)
+        self._activated = False
+
+    def _generate_action_prompt(self, global_info, previous_results, external_tools_enabled):
+        prompt_filepath = "prompts/general/action_decide.json"
+        with open(prompt_filepath, "r") as f:
+            select_prompt = json.load(f)
+        
+        if external_tools_enabled:
+            query_prompt = "\n".join(select_prompt['action_query']).format(
+                global_info.workflow.valid_actions,
+                global_info.url, 
+                global_info.file_name,
+                previous_results
+            )
+        else:
+           query_prompt = "\n".join(select_prompt['action_query_without_tools']).format(global_info.workflow.valid_actions, previous_results)
+        return query_prompt
+
+    def query_action(self, action, external_tools_enabled):
+        if external_tools_enabled:
+            results = self.action_collection.query(
+                query_texts=action,
+                n_results=1
+            )
+        else:
+            results = self.action_collection.query(
+                query_texts=action,
+                n_results=1,
+                where={"category": "reasoning"}
+            )
+        
+        return results
+    
+    def process_tool_parameters(self, results, global_info):
+        parameter = ""
+        parameter_type = results.get("metadatas")[0][0].get("input_type")
+        
+        if "query" in parameter_type:
+            pass
+        elif "file" in parameter_type and global_info.file_name is not None:
+            parameter = global_info.file_name
+        elif "url" in parameter_type and global_info.url is not None:
+            parameter = global_info.url
+        
+        if parameter is None:
+            parameter = ""
+        
+        return  parameter
+    
+    def _compress_data(self, data):
+        if len(data) > 5000:
+            data = data[:5000]
+        return data
+    
+    def _execute_action(self, format_action, global_info):
+        answer = ""
+        total_tokens = 0
+        print("\033[1;33mAgent {} Execute Action: {}\033[0m".format(self.role, format_action.get("action")))
+        code_generated_type = True if global_info.task.get("req")=="code" else False
+        text_generated_type = True if global_info.task.get("req")=="text" else False
+        
+        if format_action.get("action") not in REASONING_ACTION_LIST and format_action.get("action") is not None:
+            flag, step_data = self._tool_operation(format_action, global_info)
+            step_data = self._compress_data(step_data)
+            print("\033[1;33m{} {}\033[0m".format(format_action.get("action"),"Success" if flag else "Failure"))
+            
+            # for code generation task, correct step data as the result
+            if flag and code_generated_type:
+                if len(step_data) > 10:
+                    code_path = write_code(self.workspace_path, step_data, global_info.code_path)
+                    global_info.add_answer({"code_path": code_path, "code": step_data})
+                    global_info.code_path = code_path
+            elif flag and text_generated_type:
+                # for text generation task, store valid step data directly as the answer
+                if len(step_data) > 10:
+                    global_info.add_answer(step_data)
+                    code_path = write_text(self.workspace_path, step_data, global_info.code_path)
+                    global_info.add_answer({"code_path": code_path, "code": step_data})
+                    global_info.code_path = code_path
+            # for code generation task, error code should get corrected
+            if flag or code_generated_type:
+                tool_result = {"role": "user", "content": "You have get results from {}: {}".format(format_action.get("action"), step_data)}
+                self.dialog_history.append(tool_result)
+                answer, total_tokens = self._answer_operation(global_info)
+                print("\033[1;33mAgent {} answered: {}\033[0m".format(self.role, answer))
+    
+        if format_action.get("action") in REASONING_ACTION_LIST:
+            step_data, total_tokens = self._reasoning_operation(format_action, global_info)
+            flag = True
+            print("\033[1;33m{} {}\033[0m".format(format_action.get("action"),"Success" if flag else "Failure"))
+
+        if len(global_info.answers) > 0:
+            answer = global_info.answers[-1]
+        return  step_data, answer, flag, total_tokens
+
+    def _build_current_action(self, format_action, flag=True, answer=None, step_data=None, tokens=0):
+        result = {
+            "step_data": step_data,
+            "answer": answer    
+        }
+        current_action = Action(action=format_action, result=result, 
+                                success="Success" if flag else "Failure", 
+                                agent_role=self.role, agent_model=self.model)
+        if answer is None and step_data is None:
+            current_action.set_cost(tokens=0)
+        else:
+            current_action.set_cost(tokens=tokens)
+        return current_action
+    
+    def take_action(self, global_info, external_tools_enabled=True, env=None, env_name=None):
+        logger = global_info.logger
+        total_tokens = 0
+        code_generated_type = True if global_info.task.get("req")=="code" else False
+        text_generated_type = True if global_info.task.get("req")=="text" else False
+
+        if self.actions[0] in TERMINATION_ACTION_LIST:
+            action_json = {"action": self.actions[0], "parameter": ""}
+            current_action = self._build_current_action(action_json, flag=True, answer=None, step_data=None)
+            terminated = True
+            return current_action, terminated
+        
+        if self.actions[0] in TOOL_ACTION_LIST:
+            # only format the action json, without executing it
+            prompt_filepath = "prompts/general/actions_external_tools.jsonl"
+            prompt = ""
+            with open(prompt_filepath, 'r', encoding='utf-8') as f:
+                for line in f:
+                    json_obj = json.loads(line)
+                    if json_obj.get("action") == self.actions[0]:
+                        prompt = json_obj.get("prompt")
+                        break
+            
+            if global_info.file_name is not None:
+                prompt = "You can access to file named {}.".format(global_info.file_name) + prompt
+            elif global_info.url is not None:
+                prompt = "You can access to the url {}.".format(global_info.url) + prompt
+            elif code_generated_type:
+                prompt = "Your previous code {}".format(read_code(global_info.code_path)) + prompt
+            elif text_generated_type:
+                prompt = "Your previous text {}".format(read_code(global_info.code_path)) + prompt
+
+            response, tokens = self._query(prompt)
+            total_tokens += tokens
+            action_json = self.json_format.json_reformat(response, global_config.get("max_json_reformat_turns"))
+
+            if not isinstance(action_json, dict):
+                action_json = {"action": self.actions[0], "parameter": ""}
+            else:
+                action_json["action"] = self.actions[0]
+                
+            message = {"role": "assistant", "content": str(action_json)}
+            self.dialog_history[-1] = message
+            logger.info("[Action] {}\n".format(action_json))
+        
+        elif self.actions[0] in REASONING_ACTION_LIST:
+            action_json = {"action": self.actions[0], "parameter": ""}
+            logger.info("[Action] {}\n".format(action_json))
+        
+        step_data, answer, flag, tokens = self._execute_action(action_json, global_info)
+        total_tokens += tokens
+        current_action = self._build_current_action(action_json, flag, answer, step_data, total_tokens)
+        logger.info("-"*40)
+        terminated = False
+        self.deactivate()
+        return current_action, terminated
+
+    def _reasoning_operation(self, action, global_info) -> str:
+        logger = global_info.logger
+        prompt_filepath = "prompts/general/actions_reasoning.jsonl" 
+        code_generated_type = True if global_info.task.get("req")=="code" else False
+        text_generated_type = True if global_info.task.get("req")=="text" else False
+        prompt = ""
+        with open(prompt_filepath, "r") as f:
+            for line in f:
+                json_obj = json.loads(line)
+                if json_obj.get("action") == action.get("action"):
+                    prompt = json_obj.get("prompt")
+                    break
+        if code_generated_type or text_generated_type:
+            query_prompt =  prompt.format(read_code(global_info.code_path))
+        else:
+            query_prompt =  prompt.format(global_info.workflow.valid_reasoning_results)
+        logger.info("[System Prompt] {}\n[Query] {}\n".format(self.system_prompt, query_prompt))
+
+        raw_response, total_tokens = self._query(query_prompt)
+        logger.info("[Reasoning]: "+ raw_response)
+
+        if code_generated_type:
+            answer = extract_code_from_text(raw_response)
+            logger.info("[Final Answer]: " + answer)
+            if len(answer) > 10:
+                code_path = write_code(self.workspace_path, answer, global_info.code_path)
+                global_info.add_answer(json.dumps({"code_path": code_path, "code": answer}, ensure_ascii=False))
+                global_info.code_path = code_path
+            reasoning_result = action.get("parameter") + raw_response
+            logger.info("[Reasoning Path]: " + reasoning_result)
+            return reasoning_result, total_tokens
+        elif text_generated_type:
+            regex_answer = r"FINAL ANSWER:([\s\S]*)"
+            matches = re.findall(regex_answer, raw_response)
+            if len(matches) > 0:
+                logger.info("[Final Answer]: "+matches[0])
+                code_path = write_text(self.workspace_path, matches[0], global_info.code_path)
+                global_info.add_answer(json.dumps({"code_path": code_path, "code": matches[0]}, ensure_ascii=False))
+                global_info.code_path = code_path
+                print("\033[1;33mAgent {} answered: {}\033[0m".format(self.role, matches[0]))
+            
+            reasoning_result = action.get("parameter") + raw_response
+            logger.info("[Reasoning Path]: " + reasoning_result)
+            return reasoning_result, total_tokens
+        else:
+            regex_answer = r"FINAL ANSWER:([\s\S]*)"
+            matches = re.findall(regex_answer, raw_response)
+            if len(matches) > 0:
+                logger.info("[Final Answer]: "+matches[0])
+                global_info.add_answer(matches[0])
+                print("\033[1;33mAgent {} answered: {}\033[0m".format(self.role, matches[0]))
+            
+            reasoning_result = action.get("parameter") + raw_response
+            logger.info("[Reasoning Path]: " + reasoning_result)
+            return reasoning_result, total_tokens
+    
+    @retry(wait=wait_exponential(min=1, max=3), stop=stop_after_attempt(3))
+    def _answer_operation(self, global_info) -> str:
+        logger = global_info.logger
+        prompt_filepath = "prompts/general/answer_prompt.json" 
+        code_generated_type = True if global_info.task.get("req")=="code" else False
+        text_generated_type = True if global_info.task.get("req")=="text" else False
+        with open(prompt_filepath, "r") as f:
+            select_prompt = json.load(f)
+        if global_info.task.get("type") == "MMLU" or global_info.task.get("type") == "MMLU-Pro":
+            query_prompt =  "\n".join(select_prompt['MMLU_answer'])
+        elif global_info.task.get("type") == "GAIA":
+            query_prompt =  "\n".join(select_prompt['GAIA_answer'])
+        elif global_info.task.get("type") == "GSM-Hard"  or global_info.task.get("type") == "gsm-hard" or global_info.task.get("type") == "GSM8K":
+            query_prompt =  "\n".join(select_prompt['gsm_answer'])
+        elif code_generated_type:
+            query_prompt =  "\n".join(select_prompt['code_answer'])
+        elif text_generated_type:
+            query_prompt =  "\n".join(select_prompt['text_answer'])
+        else: 
+            query_prompt =  "\n".join(select_prompt['answer'])
+        logger.info("[System Prompt] {}\n[Query] {}\n".format(self.system_prompt, query_prompt))
+        
+        raw_response, total_tokens = self._query(query_prompt)
+        logger.info("[Format to Final Answer]: "+ raw_response)
+        
+        if code_generated_type:
+            answer = extract_code_from_text(raw_response)
+            logger.info("[Final Answer]: " + answer)
+            if len(answer) > 10:
+                code_path = write_code(self.workspace_path, answer, global_info.code_path)
+                global_info.add_answer(json.dumps({"code_path": code_path, "code": answer}, ensure_ascii=False))
+                global_info.code_path = code_path
+            return answer, total_tokens
+        elif text_generated_type:
+            regex_answer = r"FINAL ANSWER: ([\s\S]*)"
+            matches = re.findall(regex_answer, raw_response)
+            if len(matches) > 0:
+                logger.info("[Final Answer]: "+matches[0])
+                code_path = write_text(self.workspace_path, matches[0], global_info.code_path)
+                global_info.add_answer(json.dumps({"code_path": code_path, "code": matches[0]}, ensure_ascii=False))
+                global_info.code_path = code_path
+                return matches[0], total_tokens
+            else:
+                return "", total_tokens
+        else:
+            regex_answer = r"FINAL ANSWER: ([\s\S]*)"
+            matches = re.findall(regex_answer, raw_response)
+            if len(matches) > 0:
+                logger.info("[Final Answer]: "+matches[0])
+                global_info.add_answer(matches[0])
+                return matches[0], total_tokens
+            else:
+                logger.info("[Error] No final answer found in the response: {}\n".format(raw_response))
+                return "", total_tokens
+
+    @retry(wait=wait_exponential(min=3, max=5), stop=stop_after_attempt(2))
+    def _query(self, query) -> str:
+        prompt = {"role": "user", "content": str(query)}
+        if self.dialog_history[-1] != prompt and self.dialog_history[-1]['role'] != 'user':
+            self.dialog_history.append(prompt)
+        elif self.dialog_history[-1] != prompt and self.dialog_history[-1]['role'] == 'user':
+            self.dialog_history[-1]['content'] += str(query)
+        self.last_prompt = prompt['content']
+        messages = list(self.dialog_history)
+        response = self.query_func(messages)
+        message = {"role": "assistant", "content": str(response)}
+        self.dialog_history.append(dict(message))
+        return response
+
+    def _tool_operation(self, action:json, global_info) ->str:
+        logger = global_info.logger 
+        name = action.get("action")
+        parameter = action.get("parameter")
+        logger.info("[Action Execution] {}({})\n".format(name, parameter))
+        if 1:
+            if name == "read_file":
+                file_path = os.path.join(self.root_file_path, str(parameter))
+                flag, step_data = global_tool_registry.execute_tool(name, file_path=file_path, file_extension=global_info.file_extension)
+                logger.info("[Read File] {}: {}".format(("Success"if flag else "Failure"), step_data))
+            elif name == "run_python":
+                if global_info.task.get("type") != "SRDD" or global_info.task.get("type") != "human-eval":
+                    parameter = format_code_with_prints(parameter)
+                    timeout_detected = True
+                else:
+                    timeout_detected = False
+                
+                if global_info.file_name is not None:
+                    file_path = os.path.join(self.root_file_path, global_info.file_name )
+                else: 
+                    file_path = ""
+                flag, step_data = global_tool_registry.execute_tool(name, work_path=self.workspace_path, code=parameter, file_path=file_path, timeout_detected=timeout_detected)
+                logger.info("[Run Python] {}: {}".format(("Success"if flag else "Failure"), step_data))
+            else:
+                flag, step_data = global_tool_registry.execute_tool(name, query=parameter, work_path=self.workspace_path)
+                logger.info("[Web Broswing] {}: {}".format(("Success"if flag else "Failure"), step_data))
+            return flag, step_data
+        else:
+            logger.info("Tool {} not registered for agent {}".format(name, self.role))
+            print("Tool {} not registered for agent {}".format(name, self.role))
+            return None, None   
+
+    def _interaction_operation(self, code, env, global_info) -> str:
+        pass 
--- a/puppeteer/agent/register/register.py
+++ b/puppeteer/agent/register/register.py
@ -0,0 +1,83 @@
+from typing import Any
+from agent.reasoning_agent import Reasoning_Agent
+from utils.file_utils import iter_jsonl
+
+class AgentRegister:
+    def __init__(self):
+        self.agents = {}
+        self.unique_agents = {}
+
+    def _register_agent(self, name, agent):
+        if agent.hash in self.unique_agents:
+            return 
+        self.agents[name] = agent
+        self.unique_agents[agent.hash] = agent
+    
+    def __call__(self, *args: Any, **kwds: Any):
+        def decorator(cls):
+            agent = cls(*args, **kwds)
+            self._register_agent(agent.role, agent)
+            return cls
+        return decorator
+
+    @property 
+    def agent_config(self):
+        return self._agent_personas
+    
+    @property
+    def agent_num(self):
+        return len(self.unique_agents)
+    
+    @property
+    def agent_names(self):
+        return self.agents.keys()
+    
+    @property
+    def agent_identifiers(self):
+        return self.unique_agents.keys()
+    
+    def get_agent_from_name(self, name):
+        return self.agents.get(name)
+    
+    def get_agent_from_idx(self, idx):
+        return self.unique_agents.get(idx) 
+
+    def create_agent(self, name):
+        agent = self.get_agent_from_name(name).reinitialize()
+        if agent.hash in self.unique_agents:
+            raise ValueError(f"Agent {name} with hash {agent.hash} already registered")
+        self.unique_agents[agent.hash] = agent
+        if agent is None:
+            raise ValueError(f"Agent {name} not registered")
+        return agent
+
+    def register_all_agents(self, personas_path):
+        self._agent_personas = list(iter_jsonl(personas_path))
+        self._total_agent_num = len(self._agent_personas)
+        for index in range(self._total_agent_num):
+            self._initialize_agent(index)
+    
+    def reset_all_agents(self):
+        for agent in self.unique_agents.values():
+            agent.reset()
+            
+    def _initialize_agent(self, index):
+        agent_role_name = self._agent_personas[index].get("name")
+        agent_role_prompt = self._agent_personas[index].get("role_prompt")
+        agent_model_type = self._agent_personas[index].get("model_type", None)
+        agent_actions = self._agent_personas[index].get("actions", None)
+        agent_policy = self._agent_personas[index].get("policy", None)
+        if self._agent_personas[index].get("agent_type") == "reasoning":
+            agent = Reasoning_Agent(role=agent_role_name, 
+                          role_prompt=agent_role_prompt, 
+                          index=index,
+                          model=agent_model_type,
+                          actions=agent_actions,
+                          policy=agent_policy)
+        self._register_agent(agent_role_name, agent)
+
+    def __getattribute__(self, name: str) -> Any:
+        return super().__getattribute__(name)
+
+
+agent_global_registry = AgentRegister()
--- a/puppeteer/config/global.yaml
+++ b/puppeteer/config/global.yaml
@ -0,0 +1,19 @@
+logging:
+  level:  # logging level, e.g., INFO
+  logpath:  # the path to the folder for storing log files
+model_weight_path: # the path to the folder containing model weights of the puppeteer model base
+api_keys:
+  openai_api_key:  # openai api key
+  openai_base_url:  # openai base url, e.g., https://api.openai.com/v1/
+  bing_api_key: # bing api key for web search
+
+max_retry_times:  # maximum number of times to retry API calls
+max_json_reformat_turns:  # maximum number of times to retry JSON parsing/reformatting
+external_tools_enabled: True # whether to enable external tools like web search, file search, etc.
+
+file_path:
+  root_file_path: # the path to the folder containing all the files which agents might need
+
+graph:
+  max_parallel_paths:  # maximum number of parallel paths to explore in the graph
+  max_step_num:  # maximum number of steps (nodes) in each path
--- a/puppeteer/config/policy.json
+++ b/puppeteer/config/policy.json
@ -0,0 +1,56 @@
+{
+    "device": {
+        "type": "cuda"
+    },
+    "paths": {
+        "checkpoint_path": "checkpoint/MMLU-Pro_test",
+        "model_path": "checkpoint/gsm-hard_validation/policy_net_20250414_105845.pt"
+    },
+    "training": {
+        "loading": false,
+        "training": true,
+        "learning_rate": 0.0001,
+        "sample_size": 1,
+        "gamma": 0.99,
+        "lambda_kl_loss": 0.0
+    },
+    "agent": {
+        "max_num_agents": 3,
+        "next_num_agents": 3,
+        "max_path": 6,
+        "threshold": 0.5,
+        "reward_factors": {
+            "default": -1.0,
+            "terminator": 0.5,
+            "web_search": -1.5
+        }
+    },
+    "llm": {
+        "prior": false,
+        "prior_redistribution": false,
+        "redistribution_weight": 0.1
+    },
+    "cost": {
+        "scale": 0.1,
+        "growth_rate": 1.0,
+        "inverse": false
+    },
+    "visualization": {
+        "base_path": "results",
+        "dpi": 300,
+        "figure_sizes": {
+            "policy_loss": [
+                12,
+                6
+            ],
+            "entropy": [
+                10,
+                6
+            ],
+            "action_probs": [
+                20,
+                10
+            ]
+        }
+    }
+}
--- a/puppeteer/data/CW/creative_writing.jsonl
+++ b/puppeteer/data/CW/creative_writing.jsonl
@ -0,0 +1,200 @@
+{"concepts":["vest","snow","remove","silhouette","dunk","step","item","toddler","plaster","wall","piece","tool","circle","pull","wrap","headphone","deck","suit","press","mother","pit","cupcake","priest","rink","counter","rest","piano","wine","serve","winter"]}
+{"concepts":["push","club","scramble","goat","sausage","wrap","chest","fix","stop","tattoo","plaster","tube","report","scooter","bird","pan","flip","animal","flute","tennis","concrete","boat","bed","phone","tooth","house","bartender"]}
+{"concepts":["fruit","motorcycle","perform","jacket","vehicle","place","mat","walk","world","area","kiss","mother","pass","report","club","axis","tricep","patient","listen","owner","uniform","floor","hamburger","use","wine","cross","bull","sell","lawn","friend"]}
+{"concepts":["shirt","dunk","bang","soap","court","float","swimmer","coat","sleigh","groom","station","pitcher","artist","enjoy","headphone","clear","drill","street","pig","bathroom","clap","orange","television","ride","treadmill","brick","policeman","building"]}
+{"concepts":["individual","oil","item","sweep","mow","rink","hill","gear","leash","arrow","weight","yolk","stage","motorcycle","groom","ocean","fly","doctor","smash","newspaper","page","toe","employee","goat","smoke","wrap","mower"]}
+{"concepts":["oxidation","cup","mower","space","window","arch","pineapple","spatula","smoke","tie","gun","insert","boat","use","tomato","chest","vest","bartender","move","microphone","axis"]}
+{"concepts":["male","word","gear","dish","day","stove","home","ski","eyelash","pile","lawn","counter","toe","bike","cardboard","technique","goldfish","baseball","pitch","sink"]}
+{"concepts":["drive","vest","performer","edge","wand","check","hat","clap","tooth","brush","box","chainsaw","engage","wheelbarrow","ear","sleep","sword","wetsuit","hose","trash","oil","fire","dry","card","bathroom","technique","tennis"]}
+{"concepts":["goalie","stare","stage","team","piano","bike","shake","teammate","dinner","foot","garment","jersey","liquid","beach","audience","shine","wave","gas","winter","diver","curl","dig","razor"]}
+{"concepts":["enjoy","faucet","headphone","eat","curl","demonstrate","surfboard","bat","rodeo","chainsaw","vault","technique","mortar","instruction","chisel","gong","roll","side","skater","spark"]}
+{"concepts":["scooter","driveway","line","needle","card","treat","raft","shirt","vegetable","cereal","clip","cain","outfit","park","pair","javelin","pose","stuff","distance","jacket","spark"]}
+{"concepts":["chip","lace","wrap","begin","bat","soap","crack","beverage","lie","stroll","outfit","horse","car","color","forest","suit","candle","costume","skater","chainsaw","wash","break","drill","block","hoop","court","presentation","carve"]}
+{"concepts":["bowl","bridesmaid","card","beat","lens","screen","walk","orange","buck","stone","spark","swallow","suck","check","rope","shuffle","button","park","pet","singe","pancake","horse","arrow","flight"]}
+{"concepts":["goldfish","microphone","pancake","squeegee","portrait","clipper","engage","spaghetti","faucet","glass","end","book","climb","present","sweep","laugh","point","bride","soldier","student","serve","smooth","yolk","sharpener","music","clear","bank","clothe"]}
+{"concepts":["rest","singe","bench","oil","policeman","racket","team","basketball","alley","chase","clear","shot","sail","crosswalk","lasso","cake","backseat","fish","indoor","break","mallet","mortar","door","engage","dunk","burn","talk","microphone","shake","wear"]}
+{"concepts":["drum","clip","window","cover","graffiti","wave","vomit","hurdle","dye","sidewalk","animal","slice","swing","pottery","pineapple","fix","decorate","tire","tag","bicycle","watermelon","sign","mask","plant"]}
+{"concepts":["animal","pit","tourist","arch","shoe","tube","chair","card","lay","store","knit","sheep","trim","kid","camel","cardboard","remove","part","game","cart","hockey","rink","sail","vest","beverage","examine","market","tool","branch","burn"]}
+{"concepts":["house","surgery","turn","singer","harness","hair","patient","bride","attire","bicycle","insert","crack","air","blanket","needle","grill","skater","pump","stone","candle","listen","put","truck","snow","type","toddler","solve","collar","rodeo"]}
+{"concepts":["use","goat","wine","frisbee","leap","pole","tell","pencil","spin","birdie","catcher","fence","world","step","chop","sword","march","stage","axe","bat","place","roller","tomato"]}
+{"concepts":["lift","fill","boat","bathtub","whistle","slice","bank","day","lap","oil","pole","stuff","mower","pin","flag","shirt","singe","discuss","pond","grape","oxidation","gong","spark","receive","store","lot","spray","ramp"]}
+{"concepts":["plant","word","remove","toddler","hamburger","potato","set","line","place","control","eat","banner","singe","hurdle","bed","bar","alley","stove","top","air","cowboy","sharpen","pair","beat","arena","wave","singer","jacket"]}
+{"concepts":["control","court","help","pop","wing","lean","peeler","continue","attempt","baseball","band","goal","bicycle","turn","rope","dry","bubble","cliff","cheer","part","glass","break","grape","roll","shuffle","bridge"]}
+{"concepts":["fall","run","prevent","hurdle","gym","soldier","accept","suit","end","deal","cow","jersey","counter","food","snow","fold","dunk","wetsuit","oyster","shot","balloon","cream","leg","score","trick","orange","beat","direct","carry","lift"]}
+{"concepts":["surgery","treadmill","coat","rider","swing","scissor","groom","sander","castle","keyboard","pierce","horse","sleigh","crack","hookah","dryer","meat","graffiti","goldfish","lake","spaghetti","bronco","lie"]}
+{"concepts":["word","build","celebrate","attempt","win","calf","sock","wetsuit","family","stunt","garment","kid","lay","restaurant","hang","chef","bench","silhouette","board","roast","goalie","bridesmaid","end","decorate","accept","pen","color","portrait","ocean"]}
+{"concepts":["stand","chew","ground","toe","time","vault","liquid","cream","wool","eat","button","fireplace","braid","prepare","stair","dealer","gutter","arrow","castle","batter","score","report"]}
+{"concepts":["scramble","give","eyelash","carve","serve","hill","pet","drape","unicycle","skip","winter","ceremony","groom","roof","discuss","chainsaw","dive","gutter","tattoo","lumber","fill","speech","teach","trash","raft","singer","sun"]}
+{"concepts":["priest","view","path","sand","worker","shoot","building","weight","number","shift","volleyball","horse","violin","court","puff","bathroom","bowl","sharpen","award","club","flap","jump","student","tell","drive","technique","surf","catcher"]}
+{"concepts":["piano","engage","car","cast","hold","garment","spin","batter","lasso","toe","bench","football","roll","rodeo","gun","sand","press","cheer","middle","runner","event","dough","pond"]}
+{"concepts":["pet","orange","gun","bang","crash","frisbee","laugh","suspend","stare","score","help","traffic","field","add","counter","beverage","path","examine","employee","side","plaster","beat","peeler","face","metal","foot","snake","harness","hammer"]}
+{"concepts":["weight","walk","owner","engage","mallet","bull","bath","concert","dealer","ladder","home","shovel","motorcycle","tricep","concrete","dig","toe","move","sander","range","explain","barber","float","wing","hill","lady","cheer","discuss"]}
+{"concepts":["street","comb","sofa","bikini","bat","towel","water","pierce","present","sle","lumber","court","tattoo","dive","carry","pitcher","volleyball","ceremony","ride","uniform","iron","chest","juice","mower","wine","separate","screen","sunglass","teammate"]}
+{"concepts":["brush","land","bar","skater","tourist","track","hat","examine","officer","cain","routine","seat","lane","performance","suit","snowmobile","tank","speech","paper","water","collect","hurdle","follow"]}
+{"concepts":["roll","pineapple","goldfish","pan","scissor","equipment","stroll","eat","give","course","axis","peel","collect","arch","stand","foot","tie","march","clear","climb","pottery","thread","room","knife","jacket"]}
+{"concepts":["swimsuit","lady","lay","teach","push","nozzle","drop","saw","backseat","dock","swallow","target","spoon","type","gauge","trampoline","lot","clean","football","canoe","mat","vegetable","pineapple"]}
+{"concepts":["sausage","football","joke","seat","blanket","concert","customer","talk","teacher","beer","kite","chest","bounce","shoe","jump","head","cellphone","map","drill","throw","fish","gong","leg","beam"]}
+{"concepts":["light","market","cake","band","brush","distance","come","cymbal","fork","pond","mower","fruit","punch","music","report","towel","lie","umbrella","shine","hedge"]}
+{"concepts":["lean","pump","stir","sle","clothing","blower","podium","ski","bathroom","animal","blow","student","cook","wand","background","house","purse","presentation","cowboy","towel","front","point","tell","dance"]}
+{"concepts":["draw","body","time","saw","course","axis","curl","plant","massage","animal","ski","push","walk","tractor","scissor","gun","eye","jersey","bull","market"]}
+{"concepts":["bottle","sell","stir","trash","edge","spaghetti","lean","body","instructor","bed","space","front","band","middle","pair","decorate","dish","technique","doctor","shake","portrait","ice","bride","kick"]}
+{"concepts":["flip","short","suspend","smash","enjoy","playground","score","plate","audience","deal","class","gong","cast","down","tell","marker","car","part","club","night","carriage","discuss","tricep","throw","jean"]}
+{"concepts":["sheep","flash","hamburger","tricycle","arm","golfer","collar","ground","song","body","worker","drill","stroll","dribble","bowl","ball","television","jump","sink","slice","book","demonstrate","spray","smooth","button","soap"]}
+{"concepts":["vegetable","chess","keyboard","motorcycle","sun","sip","ski","part","shoe","mask","use","hose","racket","paint","shake","wheel","hang","hookah","dive","hole","lawn","wand","base","calf","crash","game"]}
+{"concepts":["house","wool","suit","jersey","cellphone","shift","rip","bull","pressure","sip","rest","dance","fly","cart","roof","braid","way","mouth","water","dryer"]}
+{"concepts":["log","chew","waterfall","pitch","squeegee","leash","coin","place","separate","cream","chest","stove","use","presentation","saw","net","eyelash","plate","rope","sword","ladder","market","egg","hill","lady","flag","solve","house"]}
+{"concepts":["bow","balloon","cliff","wait","sausage","bench","object","chew","massage","treat","flute","work","mow","roll","cart","cup","crowd","hurdle","car","light","song","dribble","football","garage","pull","bathtub","backseat","board"]}
+{"concepts":["look","dryer","forest","male","sew","puzzle","bubble","stone","purse","teacher","oil","beach","banner","pool","yard","piano","portrait","axe","playground","clean","report"]}
+{"concepts":["chef","orange","ceremony","consist","branch","lock","engage","end","razor","face","paint","target","leash","stretch","pin","outfit","gym","country","backseat","shoulder","costume","ground","vomit","dive","fold"]}
+{"concepts":["artist","bartender","bird","peace","line","dress","dunk","goldfish","collar","snake","insert","dice","dog","down","tooth","sle","leap","soldier","cross","hat","close","yolk","individual","draw","shift"]}
+{"concepts":["receive","concrete","stool","air","sail","cheer","paint","help","unicycle","microphone","fish","part","flag","plate","vomit","pile","toothbrush","room","short","dryer","gauge","counter","direct","way","swing","candle","cup","object","bird","sit"]}
+{"concepts":["middle","arena","style","driveway","policeman","silhouette","cover","snake","beam","mow","ingredient","pond","equipment","key","goal","gas","dry","give","groom","competition","spark","pump","short","slope"]}
+{"concepts":["drink","rock","vendor","scooter","employee","cain","bride","top","kite","alligator","rope","patient","box","racket","stretch","arena","sharpen","pet","braid","down","dart","mortar","technique","couch","goldfish","rain","roof","hoop","skate","mix"]}
+{"concepts":["leash","beach","gun","space","beverage","wound","punch","wall","teach","playground","field","sleigh","couch","stretch","mitt","trampoline","animal","boat","juggle","bowl","event","deal","camel","arrow"]}
+{"concepts":["joke","feed","frisbee","glass","massage","toss","snow","soldier","rink","puff","sle","tank","bull","desert","tricep","pierce","design","gauge","lunch","cowboy","player","lace","mat","hit","band","table"]}
+{"concepts":["scooter","end","indoor","number","mascara","axis","owner","hamburger","pottery","beat","color","serve","plate","guitarist","stone","cream","bubble","blower","ladder","wing","wool"]}
+{"concepts":["duck","room","owner","suck","shoulder","fry","birthday","performer","blower","beam","cigarette","eye","basketball","bird","park","yard","separate","spark","drummer","bowl","smash","worker","ride","river"]}
+{"concepts":["drop","audience","horse","sheep","water","ear","sharpener","pick","engage","wave","podium","swim","pumpkin","mother","axe","surgery","mower","presentation","lay","log","trim","color","short","pop","page","blow","suspend","racket","forest","attire"]}
+{"concepts":["shine","sink","raise","pop","oil","surf","football","remove","flap","look","cellphone","equipment","liquid","officer","chip","motorcycle","read","gymnast","stuff","dinner","bag","tank","athlete"]}
+{"concepts":["guitarist","driveway","restaurant","peeler","crosswalk","building","hold","park","beer","puck","celebrate","beam","frisbee","seat","leave","shoulder","fish","portrait","cliff","cat"]}
+{"concepts":["pig","football","silhouette","sing","gym","carriage","attempt","leave","pitcher","cup","drop","goldfish","snowmobile","axis","pit","tag","turn","blower","wine","gun","bubble","barber"]}
+{"concepts":["tie","stop","turn","dinner","world","protect","pose","axis","stage","braid","tennis","weld","skateboard","hockey","surfer","stool","peace","toe","plant","bunch","hand","smash","brush","suit","kick","yarn","mow","swimsuit"]}
+{"concepts":["trim","screen","stir","swim","compete","golfer","hamburger","drummer","treat","needle","surfer","card","tie","shoe","paint","metal","stare","give","dye","perform","stove","ride","stone","graphic","mix"]}
+{"concepts":["frisbee","catcher","screen","cigarette","gun","bat","customer","punch","hedge","rink","lasso","gum","glass","building","sleigh","suspend","lawn","smooth","ground","use","dive","dice","roof"]}
+{"concepts":["pose","roller","scramble","night","individual","snake","smooth","drill","place","bottle","piece","gym","hold","smash","leash","goldfish","stop","washer","serve","axis","bath","microphone","burn","stare"]}
+{"concepts":["rub","mower","fall","surf","fix","microphone","lock","ingredient","playground","student","decorate","lie","walk","toothbrush","chainsaw","dry","part","cream","roof","stop","snowball","cellphone","flag","turn","goggle"]}
+{"concepts":["slope","gymnast","buck","shore","graffiti","hair","mat","winter","mitt","pitch","mirror","diver","clothe","peeler","bathroom","rock","stool","area","puff","climb","close","leash","turn","down"]}
+{"concepts":["report","wheelbarrow","country","gas","dish","lake","lumber","needle","sell","field","remove","donut","discuss","drive","concrete","roast","skateboard","score","hedge","customer","speech","base"]}
+{"concepts":["beard","area","gun","speech","audience","pitcher","friend","lock","birdie","room","motorcycle","ball","scrub","home","water","discuss","push","drill","pedestrian","tooth","policeman","cow","equipment","pose","tank","sheep","jog","broom","wax","gym"]}
+{"concepts":["goldfish","wax","dock","door","roller","guitarist","basketball","surgery","birthday","blanket","dress","runner","tractor","background","cupcake","toe","treadmill","beverage","put","vendor","doctor"]}
+{"concepts":["sand","rope","chip","drink","sharpener","building","tricep","sing","comb","employee","clear","female","board","faucet","teacher","part","razor","slide","silhouette","race","artist","give","stove","fish","cow","shift","dinner"]}
+{"concepts":["pancake","event","goal","broom","basketball","stretch","athlete","point","stove","direct","soap","lap","wave","swimsuit","harness","family","object","car","chess","cliff"]}
+{"concepts":["officer","rain","female","prepare","towel","blanket","sign","burger","fall","build","cain","fetch","bride","garment","dig","shake","instructor","pedestrian","music","sle","calf","set","say","saw","helmet","cowboy","beach","fill","ice"]}
+{"concepts":["razor","paddle","rinse","pig","golfer","baton","cymbal","slope","crowd","run","jump","teach","graphic","skip","lotion","sheep","student","attach","hurdle","wound"]}
+{"concepts":["cow","treadmill","wheel","worker","tongue","guitar","spaghetti","lot","fall","lake","washer","building","axe","singer","hamburger","course","swimsuit","station","shore","suit","cain","card","squeegee","motorcycle","grass"]}
+{"concepts":["top","dice","vault","bath","chainsaw","jacket","punch","rock","stone","roast","arena","pass","stove","technique","swimsuit","shore","hit","walk","tattoo","lock","racket","drop","washer","pressure","listen"]}
+{"concepts":["pool","sing","trash","tie","shake","flute","treat","patient","wall","word","surfboard","shoulder","oyster","talk","chip","massage","clean","bench","top","teacher","dock","cigarette"]}
+{"concepts":["chisel","kite","shift","tire","leap","garage","fire","white","cardboard","spin","tag","ocean","short","surf","tomato","wash","feed","wave","blanket","routine","exercise","bed","talk","shoot","arm","roof","family","read","food","scrub"]}
+{"concepts":["pencil","towel","door","priest","cream","makeup","policeman","garage","bike","birthday","crowd","student","peace","practice","whistle","wire","blower","burger","piano","accept","dress"]}
+{"concepts":["pole","goal","mouth","castle","item","cart","teacher","chip","wing","rider","kiss","drive","runner","stop","lotion","attempt","guitarist","lawn","fire","card","goggle","body"]}
+{"concepts":["peeler","hookah","newspaper","bench","serve","pile","sun","chase","desert","goggle","brick","drop","thread","raise","pot","prevent","sit","land","orange","sew","dance","hedge","spray"]}
+{"concepts":["dancer","hat","chisel","fork","toy","control","lumber","brush","pedestrian","spoon","faucet","look","eye","stove","wave","batter","carve","tell","flute","tricep","helmet","bike","mountain","pet","raise","watch","flag","pipe","graphic"]}
+{"concepts":["dye","follow","drape","solve","meat","store","give","unicycle","beach","building","down","station","pierce","castle","beer","razor","volleyball","number","desert","rest","chase","dribble","bartender","butter","flute","spin","harness","clear","cow","prevent"]}
+{"concepts":["wear","roll","mountain","dribble","paddle","waterfall","bridesmaid","watch","newspaper","treadmill","bathtub","roof","bed","cigarette","scrub","lift","sock","board","crowd","pose","toy","key","paper","close","marker","use"]}
+{"concepts":["guitarist","toothbrush","tooth","turn","cupcake","dice","tool","bartender","flash","blanket","kiss","puff","blow","arena","collect","win","skip","watermelon","celebrate","costume","room"]}
+{"concepts":["roller","egg","donut","guitarist","dancer","athlete","move","silhouette","pile","pancake","purse","team","kid","continue","set","read","fill","goat","hold","individual","cat","slope","shake","tree","football","hand","birthday"]}
+{"concepts":["couple","motorcycle","chew","clap","duck","tag","pressure","fill","family","owner","feed","shoulder","broom","customer","rock","gather","yard","singe","piano","bat","crack","cellphone","suspend","shuffle","clipper","brick","keyboard","helmet","compete","rip"]}
+{"concepts":["violin","country","win","vault","drum","bridesmaid","ground","seat","spin","drummer","number","fish","dinner","fetch","goalie","fry","circle","insert","pen","graffiti","hammer","base","map","stroll"]}
+{"concepts":["machine","alley","microphone","routine","base","design","joke","pop","motorcycle","squeegee","practice","hookah","dart","item","candle","doctor","suspend","spoon","wave","cube","event","raft","game","guitar","drum","bikini","pig","top"]}
+{"concepts":["watermelon","word","beer","look","treat","puzzle","sit","fish","ear","bronco","river","family","fireplace","smoke","teammate","pan","sleep","game","swimmer","plate","volleyball","guitar","shovel","garage","mirror","bed"]}
+{"concepts":["toe","band","hoop","cardboard","comb","base","jersey","contact","marker","run","tooth","short","potato","tricep","game","silhouette","ear","ski","slide","athlete","clean","suit","garage","tomato","jog","path","lap","pressure"]}
+{"concepts":["driveway","sing","bar","tag","award","sand","harness","dive","student","mat","tomato","gong","oil","barber","bicycle","object","fork","lock","pile","hold","close","employee"]}
+{"concepts":["area","river","gauge","way","dunk","station","continue","umbrella","axis","shot","front","stir","microphone","board","braid","lady","pierce","mask","chest","dry","sew","skateboard","orange","wire","range","flag","stage","trim"]}
+{"concepts":["prevent","compete","home","wire","land","fireplace","drummer","use","family","garment","wound","scooter","horse","machine","direct","wing","white","distance","goat","puff","perform","tie","skateboard","spark","sip"]}
+{"concepts":["raft","sip","hair","close","pocket","stretch","catch","pit","chase","hat","stop","yolk","dancer","apple","horse","tire","carry","drop","clip","deal","tell","bridesmaid","boat","cover","frisbee","line","sidewalk","animal"]}
+{"concepts":["pair","couch","mat","drill","shoe","present","compete","trick","performer","pineapple","body","fish","crowd","build","towel","stroll","juice","banner","smoke","oyster","come","sleep","scooter","view","bike"]}
+{"concepts":["cymbal","couch","sew","counter","cupcake","fly","puck","dish","cast","mat","pole","rock","goalie","sheep","bunch","marker","dryer","market","pond","prevent","lace","puff","toilet","wire","listen"]}
+{"concepts":["apply","clothing","vomit","gymnast","tattoo","stove","fly","wheelbarrow","ladder","lot","cake","wetsuit","performance","event","bathtub","cereal","page","fork","yolk","hair","barbell","coat","leg","priest","drop","bunch"]}
+{"concepts":["treadmill","laugh","perform","world","tie","shoe","yolk","tap","dog","pedestrian","leg","soldier","report","surgery","drink","side","headphone","cube","design","page","waterfall","drive","kite","singe"]}
+{"concepts":["light","truck","examine","mask","treat","rain","drum","toss","shuffle","guitarist","prepare","cart","ocean","marker","podium","rink","pitcher","worker","head","look","washer","singe","background","concrete","wheelbarrow","instrument"]}
+{"concepts":["needle","wall","tie","drink","building","pool","animal","sle","lady","jacket","sweep","bat","cymbal","eyelash","jersey","umbrella","body","vehicle","podium","room","tourist","bronco","tap","door","grill","canoe","paint","hookah"]}
+{"concepts":["wall","wear","court","mitt","flag","dress","machine","donut","paint","burger","tooth","contact","part","treadmill","runner","pet","beer","vault","butter","hill","peel","compete","castle","decorate","rock","stuff","wing","piece","swim"]}
+{"concepts":["side","horse","rodeo","goldfish","punch","joke","mother","surf","backseat","protect","eat","watermelon","mouth","drink","rock","rider","stroll","microphone","skater","meal","officer","flight","barber"]}
+{"concepts":["paint","flash","bridesmaid","cain","block","cut","car","customer","draw","highchair","swimsuit","instruction","hit","down","beard","sander","dribble","field","song","clipper","area","sail","eyelash"]}
+{"concepts":["skip","swing","accept","talk","bowl","tomato","wear","pedestrian","doctor","sidewalk","build","space","roof","march","glove","eat","listen","lens","crosswalk","bull","skate","orange","clipper","trampoline","toddler","award","arena","rub","playground"]}
+{"concepts":["down","oyster","pile","continue","hockey","prevent","puff","bikini","drill","rodeo","weld","knife","lace","flight","sink","distance","scramble","bank","chew","birthday","egg","use","golfer"]}
+{"concepts":["attach","instruction","cube","spatula","bank","clip","picture","pool","birthday","prepare","graffiti","come","cheer","edge","goat","continue","beam","garage","country","crash","room","rip","pile","arena"]}
+{"concepts":["athlete","tank","groom","meat","head","toe","bowl","pet","pond","bike","braid","wine","hose","toss","indoor","control","press","garment","smoke","mower","student","look","egg","fire","country"]}
+{"concepts":["roast","sand","vest","prepare","stick","pit","road","cube","boat","lotion","hoop","puff","ramp","part","player","smoke","pick","watermelon","yolk","tourist","smash","wax","win","mirror","axis"]}
+{"concepts":["read","braid","dryer","sand","check","lake","soap","deal","hair","instruction","style","policeman","receive","knit","equipment","press","laugh","shot","mat","card","pan","design"]}
+{"concepts":["consist","stir","engage","mouth","headphone","goal","cowboy","dice","number","tomato","shirt","cain","dealer","peace","leap","chip","bow","traffic","portrait","cross","walk"]}
+{"concepts":["meat","clip","jean","come","day","indoor","catch","window","singer","javelin","step","knit","pocket","eyelash","wheelie","cereal","word","eye","toothbrush","pour","court","gas","garment","birdie","contact","stone","separate","wait"]}
+{"concepts":["headphone","blow","serve","athlete","raise","hat","stair","hand","pass","point","alligator","egg","ice","deal","country","apple","dunk","piece","store","puck","instructor"]}
+{"concepts":["technique","suck","artist","lace","bag","area","massage","box","score","wire","perform","ice","deal","flash","candle","gauge","individual","blow","restaurant","tank","protect","alley","silhouette"]}
+{"concepts":["sleigh","race","drape","down","chef","fill","hockey","screen","song","newspaper","pan","building","team","cup","nozzle","wire","beat","fork","cowboy","market"]}
+{"concepts":["washer","bounce","indoor","roll","hat","jersey","alley","dig","sew","wheelie","check","compete","truck","fork","sleigh","gather","sidewalk","mascara","side","stool","mask","eat","paddle","tooth"]}
+{"concepts":["owner","tube","thread","teach","waterfall","shoe","street","bow","cereal","tag","skip","bride","shot","chess","plate","team","lake","suck","beer","bus","solve","slide","forest","bathroom","spark","remove","rip"]}
+{"concepts":["side","castle","flip","country","clothe","field","wand","house","cliff","saw","report","fireplace","clean","lunch","bird","hair","bank","picture","read","close","outfit","male","teacher","pen","mask","owner","skateboard"]}
+{"concepts":["mat","cake","tank","remove","dress","raise","bank","pineapple","arrow","rope","bar","beard","arena","toe","shave","golfer","gas","court","present","sofa","brick","stroll","bubble","station","award","skate"]}
+{"concepts":["instruction","river","clothing","seat","birdie","jog","pipe","give","bridge","lace","bar","unicycle","tourist","bounce","button","block","pedestrian","alligator","purse","lumber","apple","scramble","truck","rock","hole","stand","faucet"]}
+{"concepts":["sail","press","pit","balloon","bike","stir","lock","joke","air","apply","way","sword","ceremony","raise","scramble","pour","game","collar","concert","performance","gather","male"]}
+{"concepts":["thread","goal","performance","garage","sausage","playground","turn","type","swing","yarn","pedestrian","market","hat","jacket","soldier","shore","jump","needle","use","field","tooth","bike","continue","concrete"]}
+{"concepts":["water","beat","punch","alley","face","portrait","clip","piano","headphone","suit","continue","court","bow","house","officer","artist","kid","sail","course","friend","skateboarder","brick","pin","basketball","bathtub","snake","weld","fire","donut"]}
+{"concepts":["bathroom","paper","clap","wetsuit","peel","fold","purse","face","pepper","wear","sand","smooth","duck","type","television","raise","podium","plate","sew","leave","yard","broom","vomit"]}
+{"concepts":["sand","hat","clap","gutter","help","color","bathtub","direct","clothing","lace","front","teach","tooth","dish","equipment","contact","makeup","outfit","scrub","draw","beverage","chest"]}
+{"concepts":["gum","spaghetti","field","sharpen","deck","chair","roof","towel","seat","world","kitchen","pineapple","traffic","kick","clipper","smoke","carriage","blow","report","brush","fruit","arena","sweep","goalie","fish","skateboarder","street","pressure","routine"]}
+{"concepts":["pressure","clothing","gutter","bronco","spoon","swing","smash","rider","hair","fry","picture","float","plaster","mortar","circle","toy","line","chip","sleigh","track","snowball","wool","blanket","suspend","cut","needle","wave","suit","drop","ice"]}
+{"concepts":["oxidation","concert","dance","protect","mascara","fall","barbell","lake","side","object","restaurant","bridge","station","report","item","class","peeler","street","table","dress","desert","dough","move","separate","flap"]}
+{"concepts":["lady","broom","spoon","arch","sweep","tie","ski","song","point","apply","shave","chase","swing","number","band","scramble","policeman","solve","pump","report","golfer","board","rinse","turn"]}
+{"concepts":["point","coin","alley","flash","skip","open","dart","beer","singe","put","yard","wire","dance","drape","gym","seat","score","side","knock","washer","pepper","catch","patient","cellphone","sand","spin","path","tool","dice","insert"]}
+{"concepts":["priest","guitarist","shift","lot","floor","shake","dance","pool","engage","cowboy","cereal","brick","item","gym","slide","land","jump","driveway","pressure","add","card","bow","fish","chip","ice","beer","attach","drill"]}
+{"concepts":["mallet","paper","white","wear","dribble","picture","wine","boat","point","joke","brush","stuff","distance","smooth","control","uniform","costume","paddle","toilet","store","cake","basketball","help","lift","bathtub","turn","home","sunglass"]}
+{"concepts":["iron","pencil","wax","leash","marker","cymbal","lap","coat","score","tank","win","peace","smile","wheelie","fill","shine","examine","dish","female","flag","snow"]}
+{"concepts":["tricycle","arena","braid","brush","slice","tennis","bottle","jean","stare","sign","balloon","stair","restaurant","way","officer","scooter","shake","male","beat","take","wire","hand","alley","store","footprint","gun","pocket"]}
+{"concepts":["bull","stuff","help","hoop","piece","water","exercise","pop","house","goat","eat","grill","celebrate","soap","draw","speech","leg","bounce","clipper","swimmer","smooth","kick","policeman","dive","write"]}
+{"concepts":["tractor","family","path","hedge","gutter","train","vehicle","vault","pick","bat","country","desert","perform","plaster","boat","toy","couch","knock","potato","chase","cellphone","camel","hamburger","ladder","fire","pile","skateboard","drop","mascara"]}
+{"concepts":["tongue","plate","cellphone","jean","give","chase","lock","fetch","cereal","cart","umbrella","athlete","pull","fall","deal","sidewalk","skateboard","rip","bench","fence","player","begin","bike","ground","wound","grape","mascara","white","mower"]}
+{"concepts":["sit","chew","card","clipper","buck","tricep","indoor","ground","basketball","background","track","costume","ladder","body","lace","toy","television","towel","soldier","makeup","bird","sail"]}
+{"concepts":["coin","trick","direct","compete","pit","meal","carry","cowboy","station","throw","braid","footprint","shoulder","tourist","dough","skateboarder","sharpener","hold","winter","create","duck","silhouette","jersey","indoor","body","jog","chase"]}
+{"concepts":["pottery","song","hose","grape","glove","cube","cellphone","street","wear","couple","button","harness","word","kite","unicycle","press","scissor","plaster","shuffle","clip","garment","umbrella","duck","family","gear","jean","circle","instructor","rip"]}
+{"concepts":["floor","write","shine","front","roller","footprint","sle","pot","weight","trash","wing","item","stair","block","present","wheelbarrow","pepper","drop","hill","jog","sink","country","create"]}
+{"concepts":["cube","use","demonstrate","fill","weld","number","collar","block","lay","treadmill","football","wine","mortar","crack","dress","routine","cliff","garment","wall","catch","ladder","bottle","perform","pot"]}
+{"concepts":["employee","cream","traffic","dinner","beverage","gym","suit","pair","prevent","club","open","event","bowl","ceremony","puff","follow","presentation","pipe","lens","crosswalk","tricep","control","sausage","attire"]}
+{"concepts":["highchair","yard","press","hedge","bathroom","carry","street","comb","run","award","stick","perform","trim","snake","hang","sign","insert","apple","fold","blower","chef"]}
+{"concepts":["raise","bed","microphone","kite","dryer","baseball","weight","road","front","mortar","piano","gauge","calf","canoe","suit","bubble","spoon","short","skateboarder","feed","control","lunch","trash","surf","policeman","ski","fence"]}
+{"concepts":["hedge","treadmill","smash","country","weld","spray","break","flight","roast","mortar","gymnast","punch","marker","smoke","fry","sail","cover","castle","sign","foot","wrap","ground","head"]}
+{"concepts":["enjoy","chew","cut","lens","plaster","blanket","skateboard","cigarette","vomit","rock","leap","meat","tomato","help","individual","engage","goldfish","cake","cover","sle","piano"]}
+{"concepts":["board","roller","male","break","vendor","light","collar","wing","plant","lie","suck","bikini","backseat","lace","juggle","push","tennis","direct","hedge","television","garment","lay"]}
+{"concepts":["fall","distance","pineapple","cheerleader","metal","tie","cat","talk","examine","priest","driveway","peeler","pick","keyboard","dry","fly","wait","pet","tongue","arena","cain","vegetable","wand","carry","crowd","glove","hat","brick","joke","point"]}
+{"concepts":["fire","concert","spoon","bronco","mortar","helmet","lean","dealer","couch","unicycle","cross","fetch","toss","shot","performance","plate","faucet","trampoline","camel","stop","bride"]}
+{"concepts":["yard","mallet","goal","vendor","leave","rain","braid","card","wine","puck","skateboarder","swimmer","boat","indoor","wing","toe","club","shoot","cow","tooth"]}
+{"concepts":["mask","fix","boat","sharpen","smile","swim","chair","piano","drink","building","pig","cow","wheel","scrub","tennis","bench","check","dunk","bikini","pin","slice","axis","drum","lawn","mirror","top","air","remove"]}
+{"concepts":["piece","down","climb","concert","dig","rock","wine","spatula","bang","key","hammer","music","teammate","tie","stunt","costume","feed","football","class","pipe","rip","deck","accept","hedge","decorate","pedestrian","liquid","gear","leave","gong"]}
+{"concepts":["broom","key","balloon","grape","picture","mirror","brick","clap","costume","cigarette","microphone","stool","blanket","customer","playground","instruction","slice","world","piece","fly","presentation","store","concert","dunk","fold","mouth","set","chew","lotion","type"]}
+{"concepts":["pineapple","burger","piece","treadmill","serve","practice","cliff","distance","fork","bar","pen","peel","roast","object","say","climb","garage","cereal","surf","type"]}
+{"concepts":["exercise","fold","drill","engage","cheer","saw","crash","puff","bikini","pineapple","dryer","cover","event","dart","dance","sand","band","clothing","sun","side","pierce","race","doctor"]}
+{"concepts":["pipe","unicycle","sofa","female","scramble","puzzle","chop","mother","consist","contact","competition","ground","sail","accept","spray","smooth","washer","write","guitarist","edge","dye","pick","gather"]}
+{"concepts":["serve","throw","stir","bottle","routine","juice","suit","flute","draw","ramp","truck","design","say","crosswalk","pet","fall","lap","weld","pit","meat","kid","view","purse"]}
+{"concepts":["rub","prepare","move","worker","trim","candle","dance","bank","television","catch","design","shuffle","sharpen","work","take","decorate","massage","purse","road","coin","arrow","stuff","fix","home","hoop","family","fireplace","singe","skater"]}
+{"concepts":["thread","attire","podium","dinner","hair","wheelie","oil","climb","stretch","microphone","beam","band","vehicle","beat","skip","cast","stop","puff","surgery","smash","routine"]}
+{"concepts":["kitchen","policeman","read","ingredient","help","vomit","violin","pop","hold","speech","take","screen","stroll","crack","baton","soap","plaster","lace","hookah","concert","prepare","pencil","trim"]}
+{"concepts":["drill","say","beam","hole","basketball","fall","attach","swim","stunt","purse","style","map","tractor","cain","shoulder","sleep","tube","bathtub","drummer","donut","shift","horse","jog"]}
+{"concepts":["shovel","teach","mitt","key","deck","leap","cross","celebrate","fetch","needle","vault","toy","silhouette","juice","win","market","tree","gum","soldier","apply","sofa","paddle","arena","sip","collar","stroll","beer","park","circle"]}
+{"concepts":["chew","rock","serve","spin","building","look","crack","hole","wear","drive","leap","goalie","bird","tap","eye","hang","pan","snake","garage","cliff","mascara","float","dice","suspend"]}
+{"concepts":["drop","groom","deck","pepper","check","mouth","gather","attire","arrow","mother","goat","backseat","garment","graphic","hat","wood","clean","toddler","artist","score","mascara","uniform","background","sander","candle"]}
+{"concepts":["work","write","sausage","bird","race","whistle","country","design","give","rub","sand","bottle","cake","chess","table","button","top","down","seat","hit","stretch","artist","mother","cupcake","groom","batter","shore","head","baseball"]}
+{"concepts":["gum","butter","trim","chest","watermelon","put","sink","block","style","surgery","swim","vehicle","bottle","goggle","look","patient","shot","bike","worker","policeman","pass","tattoo","gauge","comb","coin"]}
+{"concepts":["box","scrub","pile","buck","lunch","bronco","pop","dig","compete","ground","gum","watch","step","surfer","push","down","phone","edge","range","front","snowmobile","singer"]}
+{"concepts":["shirt","spray","wine","cube","smile","rub","duck","volleyball","cupcake","work","drum","tank","pile","snake","dye","key","wound","bathtub","equipment","roast","surfboard","hammer"]}
+{"concepts":["dough","competition","ball","lot","hamburger","pile","shirt","accept","egg","spatula","oil","floor","crack","pool","faucet","catcher","runner","frisbee","bar","spray","apple","crowd","shake","pipe","serve"]}
+{"concepts":["protect","table","phone","roller","unicycle","traffic","pass","beer","perform","barber","plant","jersey","listen","dinner","fish","alligator","snowmobile","batter","wire","bank","tricycle","instrument"]}
+{"concepts":["traffic","arm","box","purse","stick","suit","beach","point","gum","way","vehicle","hold","line","clip","hoop","dry","pin","cook","makeup","pass","bathtub","pick"]}
+{"concepts":["puzzle","beat","wave","male","tennis","color","oil","blow","tire","umbrella","surfboard","yarn","ingredient","catch","object","hose","seat","pressure","vegetable","spoon","knit","hockey","circle"]}
+{"concepts":["walk","ski","metal","tap","spin","scooter","suspend","stair","dog","joke","dance","student","roll","slice","hair","yard","toss","tourist","blow","oil","smash","river","eye","helmet"]}
+{"concepts":["owner","cup","burn","toy","win","store","gear","board","cheer","punch","class","sand","prepare","female","beat","graffiti","scooter","brush","lumber","pan","consist","protect"]}
+{"concepts":["target","pineapple","park","helmet","music","white","thread","plant","wetsuit","cube","birthday","officer","vomit","tooth","mitt","burger","tree","garage","chisel","teammate","object","help","pump","food","foot","chip","background","dough","bathroom"]}
+{"concepts":["football","donut","card","tooth","cross","move","butter","employee","surgery","spaghetti","iron","pen","guitarist","hand","circle","mascara","screen","field","house","knock","separate","ramp","crosswalk"]}
+{"concepts":["design","banner","plate","music","serve","glove","cream","lotion","surgery","clap","shine","attach","backseat","home","cake","pass","bridesmaid","thread","pan","treat"]}
+{"concepts":["chew","bubble","chainsaw","object","lawn","roll","hit","fetch","separate","pair","sew","fruit","drink","ground","puck","pole","policeman","bathtub","examine","surfboard","ingredient","equipment","wetsuit","class"]}
+{"concepts":["batter","award","slide","crash","pepper","throw","castle","station","outfit","world","sidewalk","alley","sock","lie","flash","class","shirt","fork","tag","cain","cut","chess","part","bathroom","skater"]}
+{"concepts":["exercise","bike","artist","prevent","calf","pig","crash","base","mallet","book","number","pedestrian","lean","dive","background","march","dart","check","crack","duck","dryer","horse","spoon","vehicle","counter","canoe","event"]}
+{"concepts":["pit","field","tricep","juggle","drink","hold","land","say","tourist","point","win","put","bunch","swim","game","catcher","pan","bang","iron","front","stretch","talk","whistle","crowd","ingredient","listen","object"]}
+{"concepts":["tank","paddle","jersey","help","stair","class","car","puzzle","read","batter","insert","sail","bunch","mortar","background","cheerleader","cover","practice","jump","cat","fence","leave","base"]}
+{"concepts":["silhouette","card","washer","door","concrete","jump","press","fireplace","gauge","help","restaurant","banner","field","arrow","plaster","diver","grape","worker","peace","rink","paddle","point","lawn","saw","eyelash","face","frisbee","mix","liquid","tattoo"]}
+{"concepts":["swimmer","pitcher","lay","bank","treadmill","routine","car","eyelash","chess","lotion","teammate","hat","pancake","metal","book","blow","pen","drink","lace","tie","suspend","close","arena","stunt"]}
+{"concepts":["speech","mascara","toddler","branch","yolk","lot","object","spark","sign","net","goalie","pencil","beam","highchair","joke","officer","team","bird","stove","alley","dog","smooth","cymbal","night","bartender","scooter"]}
+{"concepts":["sleep","piece","iron","barbell","lake","begin","smoke","accept","bubble","cut","pencil","uniform","step","mow","edge","dealer","bathtub","feed","violin","carriage","snowmobile","wheelie","rip","tell","makeup","chair","yolk"]}
+{"concepts":["pump","curl","mouth","trick","dish","climb","drink","cup","gather","eyelash","protect","vegetable","female","rock","dart","instrument","puzzle","present","chase","individual","clear","add","dice","portrait","ceremony","solve","razor","put","skateboard"]}
+{"concepts":["check","train","dinner","canoe","floor","jog","gong","calf","ladder","ball","guitar","athlete","pin","gym","hedge","book","flight","chew","hit","clean","spin","lens","bike","trampoline","cliff","goldfish","waterfall","grass","wall","chef"]}
+{"concepts":["punch","ground","pan","stare","guitar","blanket","chew","bag","shine","collect","dough","map","mat","eyelash","machine","balloon","javelin","ramp","mower","pedestrian","award","pin","plant"]}
+{"concepts":["instrument","ladder","bird","chess","hoop","book","drive","world","practice","guitarist","racket","bathroom","range","style","push","build","bridge","juggle","couple","routine","barber","hit"]}
+{"concepts":["swimmer","stool","highchair","banner","apple","point","walk","frisbee","clip","batter","room","slice","fork","shoulder","report","juggle","stick","jump","bunch","wine","toe","pick","open","beam","hole","sail"]}
+{"concepts":["dealer","walk","pass","worker","ramp","float","sun","pond","design","rock","white","tricycle","grape","word","lock","rip","saw","cellphone","home","close","counter","lawn","spin","gear"]}
+{"concepts":["butter","attire","country","puzzle","arena","arm","bride","attempt","key","podium","coat","wash","eat","end","presentation","trail","team","cut","goggle","customer","rodeo","doctor","market","stretch","grape","purse","instruction","pass"]}
+{"concepts":["hug","stand","dryer","swallow","animal","pierce","horse","backseat","pop","bride","machine","picture","help","skateboard","celebrate","swimsuit","slice","weld","mat","sail","ocean","hurdle","paddle","dog"]}
+{"concepts":["attire","shake","lasso","street","pipe","suit","say","instrument","begin","cellphone","drill","lift","dryer","decorate","oxidation","room","head","flap","lunch","costume","glove","mortar","castle","net","block"]}
+{"concepts":["dye","fork","pull","way","lean","slope","shake","night","oyster","court","bird","drum","dive","face","guitarist","trim","ceremony","glove","collar","kite","fold","desert"]}
+{"concepts":["bubble","singe","concert","vehicle","slope","nozzle","sock","gong","teach","sharpener","practice","rodeo","beach","marker","music","coat","design","fruit","weld","shovel","walk","guitarist","distance","mix"]}
+{"concepts":["hurdle","wall","tomato","oxidation","market","cymbal","toilet","vegetable","spatula","building","cook","pit","teacher","song","trash","purse","throw","flash","hat","eat","club","horse","stove","step","stroll"]}
+{"concepts":["phone","pair","end","tooth","concrete","shoulder","cube","lane","ball","sip","egg","clear","come","close","equipment","clip","grass","blanket","book","truck","branch","sleigh","hose","baseball","smash","flip","fruit","kid","band","squeegee"]}
+{"concepts":["stair","bubble","team","dryer","bath","set","vegetable","step","bowl","bathtub","grill","soldier","metal","roll","rest","graphic","take","cellphone","scooter","driveway","pencil","presentation","uniform","gym","dance","stick","explain","bike"]}
--- a/puppeteer/data/GSM-Hard/test.parquet
+++ b/puppeteer/data/GSM-Hard/test.parquet
--- a/puppeteer/data/MMLU-Pro/test.parquet
+++ b/puppeteer/data/MMLU-Pro/test.parquet
--- a/puppeteer/data/MMLU-Pro/validation.parquet
+++ b/puppeteer/data/MMLU-Pro/validation.parquet
--- a/puppeteer/data/SRDD/SRDD.csv
+++ b/puppeteer/data/SRDD/SRDD.csv
--- a/puppeteer/inference/init.py
+++ b/puppeteer/inference/init.py
--- a/puppeteer/inference/base/init.py
+++ b/puppeteer/inference/base/init.py
--- a/puppeteer/inference/base/edge.py
+++ b/puppeteer/inference/base/edge.py
@ -0,0 +1,6 @@
+class Edge:
+    def __init__(self, u, v, index):
+        self.u = u
+        self.v = v
+        self.index = index
+        self.color = "black"
--- a/puppeteer/inference/base/graph.py
+++ b/puppeteer/inference/base/graph.py
@ -0,0 +1,46 @@
+from typing import List, Any
+from abc import ABC, abstractmethod
+from inference.base.edge import Edge
+
+
+class Graph(ABC):
+    def __init__(self) -> None:
+        self._nodes = []
+        self._edges = []
+        self._nodes_num = 0 
+        self._edges_num = 0
+    
+    def initialize_nodes(self, nodes: List[Any]):
+        self._nodes = nodes
+    
+    def _add_node(self, node):
+        self._nodes.append(node)
+
+    def _add_edge(self, node1, node2, index):
+        edge = Edge(node1, node2, index)
+        self._edges.append(edge)
+    
+    def _get_edge(self, node1, node2):
+        idx_list = []
+        for edge in self._edges:
+            if edge.u == node1 and edge.v == node2:
+                idx_list.append(edge.index)
+        
+        if len(idx_list) > 0:
+            return idx_list
+        else:        
+            return None
+    
+    def _remove_edges(self, node1, node2):
+        self._edges = [edge for edge in self._edges if edge != (node1, node2)]
+    
+    def adjacency_matrix(self):
+        matrix = [[0 for _ in range(len(self._nodes))] for _ in range(len(self._nodes))]
+        for edge in self._edges:
+            matrix[edge[0].index][edge[1].index] = 1
+        return matrix
+    
+    @abstractmethod
+    def visualize(self):
+        pass
+    
--- a/puppeteer/inference/graph/init.py
+++ b/puppeteer/inference/graph/init.py
--- a/puppeteer/inference/graph/action_graph.py
+++ b/puppeteer/inference/graph/action_graph.py
@ -0,0 +1,48 @@
+import yaml
+import networkx as nx
+from inference.base.graph import Graph
+from pyvis.network import Network
+from agent.agent_info.actions import REASONING_ACTION_LIST, TOOL_ACTION_LIST, TERMINATION_ACTION_LIST
+
+class ActionGraph(Graph):
+    def __init__(self):
+        super().__init__()
+        self.REASONING_ACTION_LIST = REASONING_ACTION_LIST
+        self.TOOL_ACTION_LIST = TOOL_ACTION_LIST
+        self.TERMINATION_ACTION_LIST = TERMINATION_ACTION_LIST
+        global_config = yaml.safe_load(open("./config/global.yaml", "r"))
+        external_tools_enabled = global_config.get("external_tools_enabled")
+        if external_tools_enabled:
+            self.actions_collection = REASONING_ACTION_LIST + TOOL_ACTION_LIST + TERMINATION_ACTION_LIST
+        else:
+            self.actions_collection = REASONING_ACTION_LIST + TERMINATION_ACTION_LIST
+
+
+    def add_action(self, action_id, action_data, agent_data):
+        self._add_node({"id": action_id, "action": action_data, "agent": agent_data})
+
+    def add_dependency(self, from_action_id, to_action_id):
+        self._add_edge(from_action_id, to_action_id, len(self._edges))
+
+    def visualize(self, path="action_graph.html"):
+        G = nx.DiGraph()
+        nodes_colors = []
+        for node in self._nodes:
+            G.add_node(node["id"], label=node["action"]["action"]["action"] + "\n" + node["agent"], 
+                       status=node["action"]["success"], 
+                       color="green" if node["action"]["success"] == "Success" else "red")
+            nodes_colors.append("green" if node["action"]["success"] == "Success" else "red")
+        for edge in self._edges:
+            G.add_edge(edge.u, edge.v)
+        net = Network(notebook=True, height="750px", width="100%", bgcolor="#FFFFFF", font_color="black", directed=True)
+        net.from_nx(G)
+        net.show(path)
+
+    def get_action_data(self, action_id):
+        for node in self._nodes:
+            if node["id"] == action_id:
+                return node
+        return None
+    
+    def get_dependencies(self, action_id):
+        return [edge.v for edge in self._edges if edge.u == action_id]
--- a/puppeteer/inference/graph/agent_graph.py
+++ b/puppeteer/inference/graph/agent_graph.py
@ -0,0 +1,119 @@
+import networkx as nx
+from pyvis.network import Network
+import seaborn as sns
+import yaml
+from inference.base.graph import Graph
+from agent.register.register import agent_global_registry
+import logging
+main_logger = logging.getLogger('global') 
+
+class AgentGraph(Graph):
+    def __init__(self):
+        super().__init__()     
+        self._nodes_num = agent_global_registry.agent_num
+        self._edges_num = 0
+        for agent in agent_global_registry.unique_agents.values():
+            self._add_node(agent)
+        print("-"*10+"\033[31mAgent Graph Initialized\033[0m"+"-"*10)
+
+    @property
+    def hash_nodes(self):
+        return [node.hash for node in self._nodes]
+    
+    @property
+    def role_nodes(self):
+        return [node.role for node in self._nodes]
+    
+    def get_agent_from_index(self, index):
+        return self._nodes[index]
+    
+    def get_agent_from_role(self, role):
+        for agent in self._nodes:
+            if agent.role == role:
+                return agent
+        return None
+    
+    def get_agent_from_hash(self, hash):
+        for agent in self._nodes:
+            if agent.hash == hash:
+                return agent
+        return None
+    
+    def get_agent_dialog_history(self, agent_role_list: list, **kwargs):
+        """get agent dialog history
+        
+        Keyword arguments:
+        idx -- agent idx
+        Return: corresponding agent dialog history. If idx is illegal, return []
+        """
+        question = kwargs.get("question", None)
+        history = []
+        for role in agent_role_list:
+            agent = self.get_agent_from_role(role)
+            for h in agent.simplified_dialog_history:
+                history.append(h)
+        if len(agent_role_list) == 0 and question is not None:
+            history = [{'role': 'system', 'content': 'You are an assistant. Your task is to {}'.format(question)}]
+        assert len(history)!=0, "Dialog history can not be empty"
+        return history    
+    
+    @property
+    def agent_prompt(self):
+        agent_prompt = []
+        for agent in self._nodes:
+            if agent.role != "TerminatorAgent":
+                agent_prompt.append(f"Agent {agent.role} using model {agent.model}' hash: {agent.hash}")
+        agent_prompt = "\n".join(agent_prompt)
+        return agent_prompt
+    
+    @property
+    def terminator_agent_index(self):
+        for agent in self._nodes:
+            if agent.role == "TerminatorAgent":
+                return agent.index
+        return None
+    
+    @property
+    def search_agent_indices(self):
+        indices = []
+        for agent in self._nodes:
+            if agent.role == "WebsiteAgent" or agent.role == "BingAgent" or agent.role == "ArxivAgent":
+                indices.append(agent.index)
+        return indices
+    
+    def agent_list(self):
+        agent_info_list = [
+            f"index:{agent.index}, role:{agent.role}, model:{agent.model}, hash:{agent.hash}, tool:{agent.tools}"
+            for agent in self._nodes
+        ]
+        return '\n'.join(agent_info_list)
+    
+    def visualize(self, path="agent_graph.html"):
+        def generate_color_map(node_ids):
+            color_palette = sns.color_palette("husl", len(node_ids)).as_hex()
+            color_map = {node_id: color_palette[i % len(color_palette)] for i, node_id in enumerate(node_ids)}
+            return color_map
+        node_color_map = generate_color_map(self.hash_nodes)
+        edge_color_map = generate_color_map([edge.index for edge in self._edges])
+        
+        G = nx.MultiDiGraph()
+        edge_labels = {}
+        for node in self._nodes:
+            G.add_node(node.index, label=f"{node.role}\nbase model: {node.model}\nindex: {node.index}",color = node_color_map[node.hash])
+        
+        for edge in self._edges:
+            G.add_edge(edge.v.index, edge.u.index, color = edge_color_map[edge.index])
+            edge_labels[(edge.v.index, edge.u.index)] = f"Reasoning..."
+        
+        net = Network(notebook=True, height="750px", width="100%", bgcolor="#FFFFFF", font_color="black", directed=True)
+        net.from_nx(G)
+        net.show(path)
+    
+    @property
+    def num(self):
+        return self._nodes_num
+    
+    def add_agent(self):
+        pass
+    def delete_agent(self):
+        pass
--- a/puppeteer/inference/policy/REINFORCE_continuous.py
+++ b/puppeteer/inference/policy/REINFORCE_continuous.py
@ -0,0 +1,593 @@
+import torch
+import atexit
+import os
+import datetime
+import json
+import numpy as np
+import torch.nn as nn
+import yaml
+import logging
+from utils.other_utils import Singleton
+from inference.policy.base_policy import LLMPolicy, LearningPolicy
+from model.embedding import RewardModelTokenRepresentation
+
+global_config = yaml.safe_load(open("./config/global.yaml", "r"))
+logger = logging.getLogger("train")
+
+@Singleton
+class MLP_PolicyNetwork(nn.Module):
+    def __init__(self, input_dim, output_dim):
+        super().__init__()
+        self.fc1 = torch.nn.Linear(input_dim, 512)
+        self.fc2 = torch.nn.Linear(512, 128)
+        self.fc3 = torch.nn.Linear(128, 32)
+        self.fc4 = torch.nn.Linear(32, output_dim)
+        self.relu = torch.nn.ReLU()
+        self.softmax = torch.nn.Softmax(dim=1)
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+    
+    def forward(self, x):
+        x = x.to(torch.float32)
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.fc2(x)
+        x = self.relu(x)
+        x = self.fc3(x)
+        x = self.relu(x)
+        x = self.fc4(x)
+        x = self.softmax(x)
+        return x
+
+
+@Singleton
+class ContinuousREINFORCE(LearningPolicy):
+    def __init__(self, agent_graph, action_graph, config_path="config/policy.json"):
+        super().__init__(agent_graph, action_graph)
+        with open(config_path, 'r') as f:
+            self.config = json.load(f)
+        
+        # Set parameters from config
+        self.device = self.config["device"]["type"]
+        
+        # Training parameters
+        self.model_path = self.config["paths"]["model_path"]
+        self.training = self.config["training"]["training"]
+        self.loading = self.config["training"]["loading"]
+        self.learning_rate = self.config["training"]["learning_rate"]
+        self.gamma = self.config["training"]["gamma"]
+        self.sample_size = self.config["training"]["sample_size"]
+        self.lambda_kl_loss = self.config["training"]["lambda_kl_loss"]
+
+        # Agent parameters
+        self.max_num_agents = self.config["agent"]["max_num_agents"] 
+        self.next_num_agents = self.config["agent"]["next_num_agents"] 
+        self.max_path = self.config["agent"]["max_path"]
+        self.threshold = self.config["agent"]["threshold"]
+        
+        # LLM parameters
+        self.llm_prior = self.config["llm"]["prior"]
+        self.llm_prior_redistribution = self.config["llm"]["prior_redistribution"]
+        self.redistribution_weight = self.config["llm"]["redistribution_weight"]
+        
+        # Initialize state representation and policy network
+        self.state_representation = RewardModelTokenRepresentation()
+        self.policy_network = MLP_PolicyNetwork(self.state_representation.dim, self.actions_dim) 
+        self.policy_network = self.policy_network.to(self.device) 
+        if not self.training:
+            self.load_model(self.get_latest_model_path())
+        if self.loading:
+            self.load_model(self.model_path)
+
+        # Agent setup
+        self.agent_hash_list = agent_graph.hash_nodes
+        self.agent_role_list = agent_graph.role_nodes
+        
+        # Initialize tracking variables
+        self.executed_trajectories = []
+        self.execution_count = 0 
+        self.current_trajectories = []
+        self.current_trajectory_idx = 0
+        
+        self.policy_losses = []
+        self.rewards_history = []
+        self.action_probs_history = []
+        self.llm_action_probs_history = []
+        self.reward_from_rm = []
+        self.accumulated_acc = []
+        self.entropy_history = []
+
+        # Setup actions and rewards
+        self.end_action = torch.tensor(self.agent_graph.terminator_agent_index, device=self.device)
+        self.web_actions = torch.tensor(self.agent_graph.search_agent_indices, device=self.device)
+        
+        # Initialize reward factors from config
+        reward_factors = self.config["agent"]["reward_factors"]
+        self.agent_reward_factor = [reward_factors["default"]] * self.actions_dim
+        self.agent_reward_factor[self.end_action.item()] = reward_factors["terminator"]
+        for web_idx in self.web_actions:
+            self.agent_reward_factor[web_idx.item()] = reward_factors["web_search"]
+
+        self.current_task = None
+        self.previous_task = None
+        self.global_step = 0    
+        self.prob_step=0
+        
+        # Initialize optimizer
+        self.optimizer = torch.optim.Adam(self.policy_network.parameters(), lr=self.learning_rate)
+        self.max_step_num = global_config.get("graph").get("max_step_num")
+        self.llm_policy = LLMPolicy(self.agent_graph, self.action_graph)
+        
+        atexit.register(self.save_model)
+
+    def logarithmic_cost(self, step):
+        """Calculate logarithmic cost using config parameters"""
+        scale = self.config["cost"]["scale"]
+        growth_rate = self.config["cost"]["growth_rate"]
+        # Normalize step to [0,1] range
+        normalized_step = (step + 1) / (self.max_step_num + 1)
+
+        if self.config["cost"]["inverse"]:
+            step_cost = scale * (1 - torch.log(torch.tensor(1 + growth_rate * normalized_step, device=self.device)) 
+                            / torch.log(torch.tensor(1 + growth_rate, device=self.device)))
+        else:
+            step_cost = scale * (torch.log(torch.tensor(1 + growth_rate * normalized_step, device=self.device)) 
+                            / torch.log(torch.tensor(1 + growth_rate, device=self.device)))
+        print("\033[1;33mstep cost: {}\033[0m".format(step_cost))
+        return step_cost
+
+    def save_model(self, path=None, tag=None):
+        """Save model with config"""
+        path = self.config["paths"]["checkpoint_path"]
+
+        os.makedirs(path, exist_ok=True)
+        
+        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+        filename = f'policy_net_{timestamp}' + (f'_{tag}' if tag else '') + '.pt'
+        save_path = os.path.join(path, filename)
+        
+        checkpoint = {
+            'model_state_dict': self.policy_network.state_dict(),
+            'optimizer_state_dict': self.optimizer.state_dict() if hasattr(self, 'optimizer') else None,
+            'input_dim': self.policy_network.input_dim,
+            'output_dim': self.policy_network.output_dim,
+            'timestamp': timestamp,
+            'config': self.config,  # Save the config with the model
+            'metadata': {
+                'tag': tag,
+                'version': '1.0',
+            }
+        }
+        
+        try:
+            torch.save(checkpoint, save_path)
+            print(f"Model saved successfully to {save_path}")
+            return save_path
+            
+        except Exception as e:
+            print(f"Error saving model: {str(e)}")
+            return None
+
+    def update_executed_trajectories(self):
+        if self.current_task != self.previous_task:
+            self.previous_task = self.current_task
+            self.execution_count += 1
+            num_to_add = self.execution_count - len(self.executed_trajectories) 
+            if num_to_add > 0:
+                self.executed_trajectories.extend([[] for _ in range(num_to_add)])
+        self.current_trajectories = self.executed_trajectories[self.execution_count-1]
+    
+    def init_forward(self, global_info):
+        print("\033[1;33mInit Policy Forward\033[0m")
+        logger.info("[Init Policy Forward]")
+        self.current_task = global_info.task    
+        self.update_executed_trajectories()
+        
+        state, rew = self.get_state_representation(global_info)  
+        action_probs = self.policy_network(state) #shape (1,action_dim)
+
+        self.action_probs_history.append(action_probs.T.squeeze(1))
+        self.reward_from_rm.append(rew)
+        logger.info("Action probs: {}".format(action_probs))
+        
+        entropy = -(action_probs * torch.log(action_probs + 1e-10)).sum()
+        self.entropy_history.append(entropy)
+        m = torch.distributions.Categorical(action_probs)
+        agent_indices = self.select_agents_by_threshold(action_probs).T.squeeze(1)
+
+        self.current_trajectory_idx = 0
+        length = len(self.current_trajectories) + agent_indices.shape[0]
+        while len(self.current_trajectories) < length:
+            self.current_trajectories.append([])
+
+        for i, agent_idx in enumerate(agent_indices):
+            prob_value = action_probs[0, agent_idx.item()]
+            if i == 0:
+                self.append_to_trajectory(self.current_trajectory_idx, agent_idx, prob_value, global_info, None, m, rew)
+            else:
+                trajectory_idx = len(self.current_trajectories) - len(agent_indices) + i
+                self.append_to_trajectory(trajectory_idx, agent_idx, prob_value, global_info, None, m, rew)
+        return agent_indices
+
+    def iter_forward(self, global_info):
+        print("\033[1;33mFollowing Policy Forward\033[0m")
+        logger.info("Following Policy Forward")
+
+        self.current_task = global_info.task    
+        if self.llm_prior:
+            prior_action_probs = self.llm_policy.forward_prior(global_info)
+        else:
+            prior_action_probs = None
+        print("Prior action probs: {}".format(prior_action_probs))
+        logger.info("Prior action probs: {}".format(prior_action_probs))
+
+        state, rew = self.get_state_representation(global_info)  
+        action_probs = self.policy_network(state) #shape (1,action_dim)
+        
+        if self.llm_prior_redistribution:
+            action_probs = (1 - self.redistribution_weight) * action_probs + \
+                            self.redistribution_weight * prior_action_probs
+
+            action_probs /= action_probs.sum()
+        self.action_probs_history.append(action_probs.T.squeeze(1))
+        self.reward_from_rm.append(rew)
+        logger.info("Action probs: {}".format(action_probs))
+        
+        entropy = -(action_probs * torch.log(action_probs + 1e-10)).sum()
+        self.entropy_history.append(entropy)
+        m = torch.distributions.Categorical(action_probs)
+        agent_indices = self.select_agents_by_threshold(action_probs).T.squeeze(1)
+    
+        self.current_trajectory_idx = global_info.path_id 
+        length = len(self.current_trajectories) + len(agent_indices) - 1
+        original_length = len(self.current_trajectories)
+        while len(self.current_trajectories) < length:
+            self.current_trajectories.append([])
+
+        for i, agent_idx in enumerate(agent_indices):
+            prob_value = action_probs[0, agent_idx.item()]
+            if i == 0:
+                self.append_to_trajectory(self.current_trajectory_idx, agent_idx, prob_value, global_info, prior_action_probs, m, rew)
+            else:
+                trajectory_idx = original_length + i - 1
+                self.current_trajectories[trajectory_idx] = self.clone_trajectory(self.current_trajectory_idx)
+                self.append_to_trajectory(trajectory_idx, agent_idx, prob_value, global_info, prior_action_probs, m, rew)
+        return agent_indices
+
+    def append_to_trajectory(self, trajectory_idx, agent_idx, prob_value, global_info, prior_action_probs, m, rew=0):
+        cost = self.logarithmic_cost(len(self.current_trajectories[trajectory_idx])) * self.agent_reward_factor[agent_idx.item()]
+        self.current_trajectories[trajectory_idx].append({
+            'prob': prob_value,
+            'log_prob': m.log_prob(agent_idx),
+            'state_identifier': global_info.workflow.state,
+            'action': self.agent_role_list[agent_idx.item()],
+            'reward': cost,
+            'reward_model': rew,
+            'prior_prob': prior_action_probs[agent_idx.item()] if prior_action_probs is not None else None
+        })
+        print(trajectory_idx, self.current_trajectories[trajectory_idx])
+
+    def clone_trajectory(self, source_idx):
+            return [{
+                'prob': t['prob'].clone(),
+                'log_prob': t['log_prob'].clone(),
+                'state_identifier': t['state_identifier'],
+                'action': t['action'],
+                'reward': t['reward'],
+                'reward_model': t['reward_model'],
+                'prior_prob': t['prior_prob'].clone() if t['prior_prob'] is not None else None
+            } for t in self.current_trajectories[source_idx][:-1]]
+    
+    def forward(self, global_info):
+        if global_info.path_id == -1:
+            agent_indices = self.init_forward(global_info)
+        else:
+            agent_indices = self.iter_forward(global_info)
+        print("Agent Indices: {}".format(agent_indices))
+        selected_agents = [self.agent_hash_list[i] for i in agent_indices]
+        return  selected_agents
+    
+    def calculate_returns(self, trajectory):
+        returns = []
+        R = 0
+        for t in reversed(trajectory):
+            R = t.get('reward', 0) + self.gamma * R 
+            returns.insert(0, R)
+        return torch.tensor(returns, device=self.device)
+    
+    def get_state_representation(self, global_info):
+        role_list = global_info.agent_role_list()
+        print(role_list)
+        state_context = self.agent_graph.get_agent_dialog_history(role_list, question=global_info.task.get("Question"))
+        print(state_context)
+        print(type(state_context))
+        state, reward = self.state_representation(state_context)
+        print(state, reward)
+        return state, reward        
+    
+    def update(self):
+        logger.info("Update")   
+        logger.info("Executed trajectories: {}".format(self.executed_trajectories))
+        if not self.training:
+            metrics = {
+            'reasoning/action_probs': torch.sum(torch.stack(self.action_probs_history), dim=0),
+            "training/entropy": np.mean([e.detach().cpu().item() for e in self.entropy_history])
+            }
+            logger.info("metrics: {}".format(metrics))  
+            self.current_trajectories = []
+            self.executed_trajectories = []
+            self.entropy_history = []
+            self.execution_count = 0
+            return {}
+        if len(self.executed_trajectories) >= self.sample_size:
+            episode_returns = []
+            episode_lengths = []
+            episode_last_rewards = []
+            episode_acc = []
+            episode_tokens = []
+            episode_cost = []
+            episode_metrics = {}
+            kl_losses = []
+            logger.info("Update with sample size {}".format(self.sample_size))
+            policy_loss = []
+            episode_loss = []
+            for trajectories in self.executed_trajectories[:self.sample_size]:
+                task_avg_length = []
+                task_avg_reward = []
+                task_last_reward = []   
+                task_acc = []
+                task_avg_tokens = []
+                task_avg_cost = []
+                task_avg_metrics = []
+                for trajectory in trajectories:
+                    if trajectory[-1].get('finalized', False):
+                        logger.info("Trajectory: {}".format(trajectory))
+                        returns = self.calculate_returns(trajectory)
+                        # episode_returns.append(sum(returns))
+                        task_avg_reward.append(sum(returns))
+                        task_avg_length.append(len(trajectory))
+                        task_last_reward.append(trajectory[-1].get('reward', 0))
+                        task_avg_tokens.append(trajectory[-1].get('total_tokens', 0))
+                        task_avg_cost.append(trajectory[-1].get('total_cost', 0))
+                        task_avg_metrics.append(trajectory[-1].get('metrics', {}))
+                        if task_last_reward[-1] > 0:
+                            task_acc.append(1)
+                        else:
+                            task_acc.append(0)
+                        # task_acc.append(task_last_reward[-1].cpu().item())
+                        # episode_lengths.append(len(trajectory))
+                        print("returns: {}".format(returns))
+                        logger.info("Trajectory returns: {}".format(returns))
+                        
+                        for t, R in zip(trajectory, returns):
+                            if t.get('prob', None) is not None and t.get('prior_prob', None) is not None:
+                                kl_loss = t.get('prior_prob', 0) * torch.log(t['prior_prob'] / (t['prob']+1e-10))
+                                logger.info("Add KL loss: {}".format(kl_loss))
+                            else: 
+                                kl_loss = 0
+                                logger.info("No KL loss: {}".format(kl_loss))
+                            kl_loss = torch.tensor(kl_loss).to(self.device)
+                            kl_losses.append(kl_loss)
+                            loss = (-t['log_prob'] * R + self.lambda_kl_loss * kl_loss).to(self.device)
+                            
+                            if loss.dim() == 0:  # scalar loss, convert to shape [1]
+                                loss = loss.view(1)
+                            elif loss.dim() == 1:  # already [1], keep it
+                                pass
+                            policy_loss.append(loss)
+                            logger.info("loss for one sample: {}".format(policy_loss))
+                if len(task_avg_length) == 0:
+                    continue
+                else:
+                    episode_lengths.append(sum(task_avg_length)/len(task_avg_length))
+                if len(task_avg_reward) == 0:
+                    continue
+                else:
+                    episode_returns.append(sum(task_avg_reward)/len(task_avg_reward))
+                if len(task_last_reward) == 0:
+                    continue
+                else:
+                    episode_last_rewards.append(sum(task_last_reward)/len(task_last_reward))
+                if len(task_avg_tokens) == 0:
+                    continue
+                else:  
+                    episode_tokens.append(sum(task_avg_tokens)/len(task_avg_tokens))
+                if len(task_avg_cost) == 0:
+                    continue
+                else:
+                    episode_cost.append(sum(task_avg_cost)/len(task_avg_cost))
+                if len(task_acc) == 0:
+                    continue
+                else:
+                    episode_acc.append(sum(task_acc)/len(task_acc))
+                if len(task_avg_metrics) == 0:
+                    continue    
+                elif task_avg_metrics[0] == {}:
+                    continue
+                else:
+                    for key in task_avg_metrics[0].keys():
+                        if key not in episode_metrics:
+                            episode_metrics[key] = []
+                        episode_metrics[key].append(sum([m[key] for m in task_avg_metrics])/len(task_avg_metrics))
+                    
+
+            if policy_loss: 
+                logger.info("Policy loss: {}".format(policy_loss))
+                policy_loss = torch.stack(policy_loss).sum()/(self.sample_size)
+                logger.info("Policy loss stack: {}".format(policy_loss))
+                policy_loss -= sum(self.entropy_history)
+                logger.info("Policy loss with entropy: {}".format(policy_loss))
+                self.optimizer.zero_grad()
+                policy_loss.backward()
+                self.optimizer.step()
+                metrics = {
+                    'reasoning/action_probs': torch.sum(torch.stack(self.action_probs_history), dim=0),
+                    'reasoning/reward_from_rm': sum(self.reward_from_rm),
+                    'reasoning/acc': np.mean([a for a in episode_acc]),
+                    'reasoning/tokens': np.mean([t for t in episode_tokens]),
+                    'reasoning/cost': np.mean([c for c in episode_cost]),
+                    'training/policy_loss': policy_loss.item(),
+                    'reasoning/mean_return': np.mean([r.detach().cpu().item() for r in episode_returns]),
+                    'reasoning/mean_episode_length': np.mean(episode_lengths),
+                    'reasoning/mean_last_reward': np.mean([r.detach().cpu().item() for r in episode_last_rewards]),
+                    'training/mean_kl_loss': np.mean([kl.detach().cpu().item() for kl in kl_losses]),
+                    "training/entropy": np.mean([e.detach().cpu().item() for e in self.entropy_history]),
+                }
+                metrics.update({f'reasoning/{key}': np.mean([r.cpu().item() for r in episode_metrics[key]]) for key in episode_metrics})
+                logger.info("metrics: {}".format(metrics))  
+                self.global_step += 1
+                self.policy_losses.append(policy_loss.item())
+                self.current_trajectories = []
+                self.executed_trajectories = []
+                self.entropy_history = []
+                self.execution_count = 0
+                self.reward_from_rm = []
+                self.action_probs_history = []
+                self.llm_action_probs_history = []
+                return {
+                    'policy_loss': policy_loss.item(),
+                    'mean_reward': torch.tensor(returns, device=self.device).mean().item()
+                }
+        return {}
+    
+    def finalize_task(self, transition, global_info):
+        print("\033[1;33mtransition reward: {}\033[0m".format(transition.get('reward', 0)))
+        self.current_trajectories = self.executed_trajectories[self.execution_count-1]
+        idx = transition.get('path_id', 0)
+        if self.current_trajectories and idx < len(self.current_trajectories):
+            state, rew = self.get_state_representation(global_info)
+            action_probs = self.policy_network(state)
+            prob_value = action_probs[0, self.end_action.item()]
+            m = torch.distributions.Categorical(action_probs)
+            current_trajectory = self.current_trajectories[idx]
+            for index, action in  enumerate(global_info.workflow.workflow):
+                cost = action.cost
+                print("\033[1;33mtoken cost: {}\033[0m".format(cost))
+                print("\033[1;33mcost factor: {}\033[0m".format(cost/100000))
+                current_trajectory[index]["reward"] *= cost/100000 
+                print("\033[1;33mReward: {}\033[0m".format(current_trajectory[index]['reward']))
+            if current_trajectory: 
+                step_reward = self.logarithmic_cost(len(current_trajectory))
+                total_tokens = global_info.total_tokens
+                total_cost = global_info.total_cost
+                if transition.get('reward', 0) > 0: 
+                    reward = transition.get('reward', 0) + self.agent_reward_factor[self.end_action.item()] * step_reward
+                else:
+                    reward = transition.get('reward', 0) - self.agent_reward_factor[self.end_action.item()] * step_reward
+                
+                if current_trajectory[-1].get("action") == self.agent_role_list[self.end_action.item()]:
+                    current_trajectory[-1]["reward"] = reward
+                    current_trajectory[-1]['total_tokens'] = total_tokens
+                    current_trajectory[-1]['total_cost'] = total_cost
+                    current_trajectory[-1]['finalized'] = True
+                    current_trajectory[-1]['reward_model'] = rew
+                    current_trajectory[-1]['metrics'] = transition.get('metrics', {})
+                    print("\033[1;33mLast Reward: {}\033[0m".format(current_trajectory[-1]['reward']))
+                else:
+                    current_trajectory.append({
+                        'prob': prob_value,
+                        'log_prob': m.log_prob(self.end_action),
+                        'state_identifier': transition.get('state', global_info.workflow.state),
+                        'action': self.agent_role_list[self.end_action.item()],
+                        'reward': reward,
+                        'reward_model': rew,
+                        'finalized': True,
+                        'total_tokens': total_tokens,
+                        'total_cost': total_cost,
+                        'metrics': transition.get('metrics', {})
+                    })
+                    print("\033[1;33mLast Reward: {}\033[0m".format(current_trajectory[-1]['reward']))
+        self.rewards_history.append(transition.get('reward', 0))
+        
+    
+    def select_agents_by_probability(self, action_probs):
+        num_agents_to_select = torch.randint(1, self.max_num_agents+1, (1,)).item()
+        selected_indices = torch.multinomial(action_probs, num_agents_to_select, replacement=False)
+        return selected_indices
+
+    def select_agents_by_threshold(self, action_probs, threshold=0.1):
+        threshold = 2/self.agent_graph.num
+        selected_indices = torch.nonzero(action_probs[0] > threshold).squeeze(1)
+        if len(selected_indices) == 0:
+            num_to_select = min(self.max_path, self.max_num_agents)
+            selected_indices = torch.multinomial(action_probs, num_to_select, replacement=False)
+            return selected_indices
+        else:
+            probs = action_probs[0][selected_indices]
+            sorted_idx = torch.argsort(probs, descending=True)
+            selected_indices = selected_indices[sorted_idx]
+            
+            num_agents_to_select = min(len(selected_indices), self.max_path, self.max_num_agents)
+            selected_indices = selected_indices[:num_agents_to_select]
+            
+        return selected_indices.unsqueeze(0)
+    
+    def load_model(self, path, strict=True):
+        try:
+            if not os.path.exists(path):
+                logger.error(f"Model file not found: {path}")
+                return False
+            
+            checkpoint = torch.load(path, map_location=self.device)
+            
+            # Validate model architecture
+            if (checkpoint['input_dim'] != self.policy_network.input_dim or 
+                checkpoint['output_dim'] != self.policy_network.output_dim):
+                if strict:
+                    raise ValueError(f"Model architecture mismatch. Expected input_dim={self.policy_network.input_dim}, "
+                                  f"output_dim={self.policy_network.output_dim} but got input_dim={checkpoint['input_dim']}, "
+                                  f"output_dim={checkpoint['output_dim']}")
+                logger.warning("Model architecture mismatch, but continuing due to non-strict mode")
+            
+            # Load model state
+            self.policy_network.load_state_dict(checkpoint['model_state_dict'], strict=strict)
+            self.policy_network = self.policy_network.to(self.device)
+            
+            # Load optimizer state if available
+            if checkpoint['optimizer_state_dict'] and hasattr(self, 'optimizer'):
+                self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+                # Move optimizer state to correct device
+                for state in self.optimizer.state.values():
+                    for k, v in state.items():
+                        if isinstance(v, torch.Tensor):
+                            state[k] = v.to(self.device)
+            
+            # Load config if available
+            if 'config' in checkpoint:
+                # Merge loaded config with current config, prioritizing current config
+                self.config.update({k: v for k, v in checkpoint['config'].items() 
+                                  if k not in self.config})
+            
+            logger.info(f"Model loaded successfully from {path}")
+            logger.info(f"Model timestamp: {checkpoint['timestamp']}")
+            if checkpoint['metadata'].get('tag'):
+                logger.info(f"Model tag: {checkpoint['metadata']['tag']}")
+                
+            return True
+            
+        except Exception as e:
+            logger.error(f"Error loading model: {str(e)}")
+            return False
+
+    def get_latest_model_path(self):
+        """Get the path of the latest model checkpoint"""
+        try:
+            path = self.model_path
+            if os.path.exists(path) and os.path.isfile(path):
+                return path
+            
+            path = self.config["paths"]["checkpoint_path"]
+            if not os.path.exists(path):
+                return None
+
+            model_files = [f for f in os.listdir(path) if f.endswith('.pt')]
+            if not model_files:
+                return None
+            
+            latest_model = max(model_files, key=lambda x: os.path.getctime(os.path.join(path, x)))
+            return os.path.join(path, latest_model)
+            
+        except Exception as e:
+            print(f"Error finding latest model: {str(e)}")
+            return None
+    
--- a/puppeteer/inference/policy/base_policy.py
+++ b/puppeteer/inference/policy/base_policy.py
@ -0,0 +1,84 @@
+import torch
+import json
+from model import query_gpt
+import re
+from abc import ABC
+from tenacity import retry, stop_after_attempt, wait_exponential
+
+class Policy(ABC):
+    def __init__(self, agent_graph, action_graph) -> None:
+        super().__init__()
+        self.agent_graph = agent_graph
+        self.action_graph = action_graph
+        self.actions_dim = agent_graph.num
+    
+class LearningPolicy(Policy):
+    def __init__(self, agent_graph, action_graph):
+        super().__init__(agent_graph, action_graph)
+        self.actions_dim = agent_graph.num
+        self.agent_hash_list = agent_graph.hash_nodes
+        self.training = True  
+        
+    def train(self):
+        self.training = True
+        
+    def eval(self):
+        self.training = False
+    
+    def update(self):
+        pass
+    
+    def save_model(self, path):
+        raise NotImplementedError
+    
+    def load_model(self, path):
+        raise NotImplementedError
+    
+    def finalize_task(self, transition, global_info):
+        raise NotImplementedError
+    
+class LLMPolicy(Policy):
+    def __init__(self, agent_graph, action_graph) -> None:
+        super().__init__(agent_graph, action_graph)
+        self.agent_hash_list = agent_graph.hash_nodes
+        self.agent_role_list = agent_graph.role_nodes
+    
+    @retry(wait=wait_exponential(min=5, max=300), stop=stop_after_attempt(10))
+    def forward(self, global_info, max_num:int =1) -> list:
+        system_prompt_filepath = "prompts/general/agent_selection.json"
+        with open(system_prompt_filepath, "r") as f:
+            select_prompt = json.load(f)
+        select_prompt = "\n".join(select_prompt['simple_select']).format(global_info.task.get("Question"), 
+                                                                         global_info.workflow.all_actions, 
+                                                                         self.agent_graph.agent_prompt, 
+                                                                         max_num, 
+                                                                         max_num,
+                                                                         self.agent_hash_list[0])           
+        response, _ = query_gpt(select_prompt)
+        regex = r"\b(\w{32})\b"
+        matches = re.findall(regex, response)
+        if len(matches) <= 0:
+            raise Exception("No agent found")
+        if len(matches) > max_num:
+            matches = matches[:max_num]
+        elif len(matches) < max_num:
+            matches += [matches[-1]]*(max_num-len(matches))
+        for index, m in enumerate(matches[1:]):
+            if  m is None:
+                matches[index]  = matches[index-1]
+        for m in matches:
+            assert m in self.agent_hash_list
+        return matches
+    
+    def forward_prior(self, global_info, max_num:int = 1) -> list:
+        matches = self.forward(global_info, max_num)
+        probs = [0.0] * self.actions_dim
+    
+        if matches:
+            prob_per_agent = 1.0 
+            for agent_hash in matches:
+                idx = self.agent_hash_list.index(agent_hash)
+                probs[idx] = prob_per_agent
+        temprature = 0.1
+        probs = torch.softmax(torch.tensor(probs)/temprature, dim=0,)
+        return probs
--- a/puppeteer/inference/reasoning/init.py
+++ b/puppeteer/inference/reasoning/init.py
--- a/puppeteer/inference/reasoning/path.py
+++ b/puppeteer/inference/reasoning/path.py
@ -0,0 +1,146 @@
+from enum import Enum
+import yaml
+import uuid
+from agent.register.register import agent_global_registry
+from inference.graph.action_graph import ActionGraph
+import os
+import copy
+from agent.agent_info.global_info import GlobalInfo
+global_config = yaml.safe_load(open("./config/global.yaml", "r"))
+class ReasoningState(Enum):
+    INITIALIZED = 1
+    SPLITING = 2
+    ANSWERING = 3
+    FINALIZING = 4
+    DISCARDING = 5
+    AGGREGATING = 6
+
+class GraphReasoningPath:
+    def __init__(self, start_agent, max_parallel_paths, global_logger, workspace_path, action_graph:ActionGraph, frontier=[], agent_sequence = [],  index = None, global_info:GlobalInfo=None, state = ReasoningState.INITIALIZED, env=None, env_name=None, policy=None):
+        
+        self.state = state
+        self.index = index
+        self.global_logger = global_logger
+        self.workspace_path = workspace_path
+        self.action_graph = action_graph
+        self.frontier = frontier
+        
+        global_logger.create_logger('path{}_logger'.format(index), os.path.join(global_logger.folder_path, "path{}.log".format(index)), "INFO")
+        self.logger = global_logger.get_logger('path{}_logger'.format(index))
+        self.workflow_path = os.path.join(workspace_path, "path_{}.jsonl".format(index))
+        self.workcode_path = os.path.join(workspace_path, "code_{}.py".format(index))
+
+        self.start_agent = start_agent
+        self.agent_sequence = agent_sequence
+        if self.agent_sequence == []:
+            self.agent_sequence.append(start_agent.unique_identifier)
+        
+        self.max_parallel_paths = max_parallel_paths
+        self.max_step_num = global_config.get("graph").get("max_step_num")
+        
+        self.current_agent = start_agent
+        self.next_agents = []
+        
+        self.env = env
+        self.env_name = env_name
+
+        self.policy = policy
+
+        self.global_info = global_info
+        self.global_info.logger = self.logger
+        self.global_info.workpath = self.workspace_path
+        self.global_info.path_id = self.index
+
+        self.logger.info("{}[Reasoning Path{} Start]{}".format("-"*30,self.index, "-"*30))
+        self.logger.info("Reasoning Path{}:{}".format(self.index, state))
+        self.logger.info("Start agent: {}".format(start_agent.role))
+        self.logger.info("Previous Agent sequence: {}".format(self.print_agent_sequence()))
+
+    def update_global_info(self, current_action):
+        self.global_info.update(current_action)
+        self.logger.info("Updated global_info: {}".format(self.global_info.__dict__))
+
+    def step(self):
+        external_tools_enabled = global_config.get("external_tools_enabled")
+        current_action, terminated = self.current_agent.take_action(self.global_info, external_tools_enabled, self.env, self.env_name)
+        self.current_agent.deactivate()
+        self.update_global_info(current_action)
+        
+        node_id = str(uuid.uuid4())
+        self.action_graph.add_action(node_id, current_action.to_dict(), self.current_agent.role) 
+        for successor in self.frontier:
+            self.action_graph.add_dependency(successor, node_id)
+        self.frontier = [node_id]
+
+        # Deal with the case meeting the termination condition:
+        # 1. The current agent is the terminator 
+        # 2. The maximum number of steps is reached
+        if terminated or len(self.agent_sequence) >= self.max_step_num:
+            self.state = ReasoningState.FINALIZING
+            self.last_agent = self.current_agent
+            self.last_query_func = self.current_agent.query_func
+            return self.state
+        
+        # Deal with the case where the current agent is the terminator
+        next_agents_idx = self.policy.forward(self.global_info)
+        self.next_agents = [agent_global_registry.get_agent_from_idx(idx) for idx in next_agents_idx]
+        
+        # Deal with the case where there is only one next agent
+        if len(self.next_agents) == 1:
+            self.current_agent = self.next_agents[0]
+            self.current_agent.activate(global_info=self.global_info, initial_dialog_history=self.current_agent.initial_dialog_history)
+            self.agent_sequence.append(self.current_agent.unique_identifier)
+            self.state = ReasoningState.ANSWERING
+            return self.state
+        
+        # Deal with the case where there are multiple next agents
+        else:
+            for agent in self.next_agents:
+                agent.activate(global_info=self.global_info, initial_dialog_history=agent.initial_dialog_history)
+            self.state = ReasoningState.SPLITING
+            return self.state
+        
+
+    def split(self, current_path_num):
+        split_reasoning_paths = []
+        if current_path_num >= self.max_parallel_paths:
+            self.current_agent = self.next_agents[0]
+            self.agent_sequence.append(self.current_agent.unique_identifier)
+            self.state = ReasoningState.ANSWERING
+            return split_reasoning_paths
+        
+        for index, agent in enumerate(self.next_agents[1:self.max_parallel_paths-current_path_num+1]):
+            agent_sequence = copy.deepcopy(self.agent_sequence)
+            if self.env is not None:
+                env = copy.deepcopy(self.env)
+            else:
+                env = None
+            path_index = current_path_num + index
+            reasoning_path = GraphReasoningPath(
+                                    start_agent=agent, 
+                                    max_parallel_paths=self.max_parallel_paths, 
+                                    action_graph=self.action_graph,
+                                    agent_sequence = agent_sequence,
+                                    index=path_index,
+                                    global_info=copy.deepcopy(self.global_info),
+                                    state=ReasoningState.ANSWERING,
+                                    global_logger=self.global_logger,
+                                    workspace_path=self.workspace_path,
+                                    env=env,
+                                    frontier=self.frontier,
+                                    policy=self.policy
+                                    )
+            reasoning_path.agent_sequence.append(agent.unique_identifier)
+            reasoning_path.current_agent = agent
+            reasoning_path.next_agents = []
+            split_reasoning_paths.append(reasoning_path)
+            print("\033[1;36mPath {} Initialized (split from path {})\033[0m".format(path_index,self.index))
+        
+        self.current_agent = self.next_agents[0]
+        self.agent_sequence.append(self.current_agent.unique_identifier)
+        self.state = ReasoningState.ANSWERING
+        return split_reasoning_paths
+    
+    def print_agent_sequence(self):
+        agent_sequence = "".join([agent.get("role") + "->" for agent in self.agent_sequence[:-1]] + [self.agent_sequence[-1].get("role")])
+        return agent_sequence
--- a/puppeteer/inference/reasoning/reasoning.py
+++ b/puppeteer/inference/reasoning/reasoning.py
@ -0,0 +1,300 @@
+from typing import List
+import json
+import yaml
+import os
+import copy
+import logging
+
+from inference.reasoning.path import ReasoningState, GraphReasoningPath
+from inference.graph.agent_graph import AgentGraph
+from inference.graph.action_graph import ActionGraph
+from inference.policy.REINFORCE_continuous import ContinuousREINFORCE
+
+from utils.logging import LogManager
+
+from agent.register.register import agent_global_registry
+from agent.agent_info.global_info import GlobalInfo
+
+from tasks.evaluator import BenchmarkEvaluator
+
+global_config = yaml.safe_load(open("./config/global.yaml", "r"))
+main_logger = logging.getLogger('global') 
+
+class GraphReasoning:
+    def __init__(self, task:json, graph: AgentGraph, env=None, env_name=None):
+        self.task = task
+        self.agent_graph = graph
+        self.action_graph = ActionGraph()
+        self.reasoning_paths: List[GraphReasoningPath] = []
+        
+        self.max_parallel_paths = global_config.get("graph").get("max_parallel_paths")
+        
+        self.final_answer = ""
+        self.answers = []
+
+        self.global_logger = LogManager("./config/global.yaml", self.task.get("type"))
+
+        self.workspace_path = self.global_logger.folder_path
+        self.policy = ContinuousREINFORCE(agent_graph=self.agent_graph, action_graph=self.action_graph)
+
+        self.env = env
+        self.env_name = env_name
+        main_logger.info("{}[Graph Reasoning Initialized]{}".format("-"*30, "-"*30))
+        main_logger.info(global_config)
+        main_logger.info(self.agent_graph.role_nodes)
+    
+    def save_checkpoint(self, save_data):
+        main_logger.info("{}[Save Checkpoint]{}".format("-"*30, "-"*30))
+        cur_acc = save_data["best_acc"]
+        cur_data_len =  save_data["best_data_len"]
+        main_logger.info("best acc: {}, data len: {}".format(cur_acc, cur_data_len))
+        tag = "acc_{}-data_{}".format(cur_acc, cur_data_len)
+        self.policy.save_model(path=None, tag=tag)
+
+    def start(self, save_data):
+        if save_data != None:
+            self.save_checkpoint(save_data)
+        print("-"*10+"\033[1;31mGraph Reasoning Start\033[0m"+"-"*10)
+        main_logger.info("{}[Graph Reasoning Start]{}".format("-"*30, "-"*30))
+        main_logger.info("Task:\n{}".format(self.task.get("Question")))
+        
+        # -1 is the default path id for intialization
+        global_info = GlobalInfo(path_id=-1, 
+                                    workpath=self.workspace_path, 
+                                    task=self.task, 
+                                    env=self.env, 
+                                    env_name=self.env_name)
+        matches = self.policy.forward(global_info)
+        
+        for index, match in enumerate(matches):
+            global_info = GlobalInfo(path_id=index, 
+                                    workpath=self.workspace_path, 
+                                    task=self.task, 
+                                    env=self.env, 
+                                    env_name=self.env_name)
+            agent = agent_global_registry.get_agent_from_idx(match)
+            agent.activate(global_info)
+            main_logger.info("[Path {} Initialized".format(index))
+            print("\033[1;36mPath {} Initialized\033[0m".format(index))
+  
+            reasoning_path = GraphReasoningPath(start_agent=agent, 
+                                                max_parallel_paths=self.max_parallel_paths, 
+                                                action_graph=self.action_graph,
+                                                agent_sequence=[],
+                                                index=index,
+                                                global_info = copy.deepcopy(global_info),
+                                                global_logger = self.global_logger,
+                                                workspace_path=self.workspace_path,
+                                                state=copy.deepcopy(ReasoningState.INITIALIZED),
+                                                env=self.env,
+                                                env_name=self.env_name,
+                                                policy=self.policy
+                                                )
+            self.reasoning_paths.append(reasoning_path)
+            main_logger.info("Reasoning Path: {}\nAgent Sequence: {}\n".format(index, reasoning_path.print_agent_sequence()))
+    
+    def n_step(self, n:int):
+        for i in range(n):
+            self.step()
+            if self.check_finalize():
+                break
+        return self.finalize()
+    
+    def step(self):
+        main_logger.info("{}[STEP]{}".format("-"*30, "-"*30))
+
+        for reasoning_path in self.reasoning_paths[:self.max_parallel_paths]:
+            # Deal with the case where the reasoning path is not finalizing and not spliting
+            if reasoning_path.state != ReasoningState.FINALIZING and reasoning_path.state != ReasoningState.SPLITING:
+                main_logger.info("{}[Reasoning Path{} STEP]{}".format("-"*30, reasoning_path.index, "-"*30))
+                print("\033[1;36mPath {} Step\033[0m".format(reasoning_path.index))
+                reasoning_path.step()
+                main_logger.info("{}[DONE]: Reasoning Path{} STEP{}".format("-"*30, reasoning_path.index, "-"*30))
+        
+        buffer_reasoning_paths = []
+        for reasoning_path in self.reasoning_paths[:self.max_parallel_paths]: 
+            # Deal with the case where the reasoning path is spliting
+            if reasoning_path.state == ReasoningState.SPLITING :
+                current_path_count = len(self.reasoning_paths) + len(buffer_reasoning_paths)
+                print("\033[1;36mPath {} Split\033[0m".format(reasoning_path.index))
+                split_reasoning_paths = reasoning_path.split(current_path_count)
+                if len(split_reasoning_paths) > 0:
+                    main_logger.info("Split Reasoning Paths: {} From Path {}".format([path.index for path in split_reasoning_paths], reasoning_path.index))
+                buffer_reasoning_paths.extend(split_reasoning_paths)
+            # Deal with the case where the reasoning path is finalizing
+            elif reasoning_path.state == ReasoningState.FINALIZING:
+                print("\033[1;36mPath {} Finalize\033[0m".format(reasoning_path.index))
+                main_logger.info("{}[Reasoning Path{} FINALIZING]{}".format("-"*30, reasoning_path.index, "-"*30))
+        print(p for p in self.reasoning_paths)
+        self.reasoning_paths.extend(buffer_reasoning_paths)
+        self.format_index()
+        self.print_paths()
+        self.update_graph()
+        
+        return self.answers
+
+    def aggregate_answers(self, global_info, answers:list, query_func=None) -> str:
+        # only choose the last result without any format or extract
+        if query_func is None:
+            if len(answers) == 0:
+                return None
+            else:
+                main_logger.info("[Aggregation] {}".format(answers[-1]))
+                return answers[-1] 
+        
+        # only choose the last result without any format or extract
+        if self.task.get("type") == "SRDD" or self.task.get("type") == "CW":
+            main_logger.info("[Aggregation] {}".format(global_info.code_path))
+            return global_info.code_path
+        
+        prompt_filepath = "prompts/general/answer_prompt.json" 
+        with open(prompt_filepath, "r") as f:
+            prompt = json.load(f)
+        
+        if self.task.get("type") == "MMLU" or self.task.get("type") == "MMLU-Pro":
+            answer_prompt =  "\n".join(prompt["MMLU_aggregation"]).format(str(["{}\n".format(answer) for answer in answers]))
+        elif self.task.get("type") == "GAIA":
+            answer_prompt =  "\n".join(prompt["GAIA_aggregation"]).format(str(["{}\n".format(answer) for answer in answers]))
+        elif self.task.get("type") == "GSM-Hard"  or self.task.get("type") == "gsm-hard" or self.task.get("type") == "GSM8K":
+            answer_prompt = "\n".join(prompt["gsm_aggregation"]).format(str(["{}\n".format(answer) for answer in answers]))
+        else: 
+            answer_prompt = "\n".join(prompt["answer_aggregation"]).format(str(["{}\n".format(answer) for answer in answers]))
+        
+        main_logger.info("[Aggregating] {}".format(answer_prompt))
+        
+        raw_response, _ = query_func(messages=answer_prompt)
+        main_logger.info("[Aggregation Answer] {}".format(raw_response))
+        
+        return raw_response if len(raw_response)!=0 else answers[-1]
+
+    def majority_vote(self, answers: List) -> str:
+        if self.task.get("type") == "MMLU" or self.task.get("type") == "MMLU-Pro":
+            answers = [BenchmarkEvaluator.extract_choice_answer(answer) for answer in answers]
+            main_logger.info("[Majority Vote] Answers: {}".format(answers))
+        elif self.task.get("type") == "gsm-hard" or self.task.get("type") == "GSM8K":
+            answers = [BenchmarkEvaluator.extract_math_answer(answer) for answer in answers]
+            main_logger.info("[Majority Vote] Answers: {}".format(answers))
+        else:
+            main_logger.info("[Majority Vote] Answers: {}".format(answers))
+
+        answer_counts = {}
+        for answer in answers:
+            answer = str(answer).strip()  # Convert to string and remove whitespace
+            answer_counts[answer] = answer_counts.get(answer, 0) + 1
+        
+        if not answer_counts:
+            return ""  # Return empty string if no answers
+        
+        max_count = max(answer_counts.values())
+        most_common = [ans for ans, count in answer_counts.items() if count == max_count]
+        main_logger.info("[Majority Vote] Most Common: {}".format(most_common))
+        return most_common[-1]
+
+    def finalize(self):
+        print("-"*10+"\033[1;31mGraph Reasoning Finalize\033[0m"+"-"*10)
+        print(p for p in self.reasoning_paths)
+        for idx, reasoning_path in enumerate(self.reasoning_paths):
+            if hasattr(reasoning_path, "last_query_func"):
+                aggregated_answer = self.aggregate_answers(reasoning_path.global_info, reasoning_path.global_info.state_answers, reasoning_path.last_query_func)
+            else:
+                aggregated_answer = self.aggregate_answers(reasoning_path.global_info, reasoning_path.global_info.state_answers)
+            if self.task.get("type") == "MMLU-Pro":
+                transition = {
+                'state': reasoning_path.global_info.workflow.state,
+                'reward': 1 if BenchmarkEvaluator.check_mmlu(aggregated_answer, self.task.get("Answer")) else -1,
+                'action': None,  
+                'next_state': None,
+                'done': True,
+                'path_id': idx 
+                }
+                print(transition)
+                self.policy.finalize_task(transition, reasoning_path.global_info)
+            elif self.task.get("type") == "GSM-Hard": 
+                transition = {
+                'state': reasoning_path.global_info.workflow.state,
+                'reward': 1 if BenchmarkEvaluator.check_gsm8k(aggregated_answer, self.task.get("Answer")) else -1,
+                'action': None,  
+                'next_state': None,
+                'done': True,
+                'path_id': idx 
+                }
+                print(transition)
+                self.policy.finalize_task(transition, reasoning_path.global_info)
+
+            elif self.task.get("type") == "SRDD":
+                reward, metrics = BenchmarkEvaluator.check_srdd(aggregated_answer, reasoning_path.global_info.task.get("Question"))
+                transition = {
+                'state': reasoning_path.global_info.workflow.state,
+                'reward':  reward,
+                'action': None,  
+                'next_state': None,
+                'done': True,
+                'path_id': idx ,
+                "metrics":metrics
+                }
+                main_logger.info(metrics)
+                self.policy.finalize_task(transition, reasoning_path.global_info)
+            elif self.task.get("type") == "CW":
+                reward, metrics = BenchmarkEvaluator.check_commongen(concepts=reasoning_path.global_info.task.get("concepts"), text_path=aggregated_answer)
+                transition = {
+                'state': reasoning_path.global_info.workflow.state,
+                'reward': reward,
+                'action': None,  
+                'next_state': None,
+                'done': True,
+                'path_id': idx ,
+                "metrics":metrics
+                }
+                main_logger.info(metrics)
+                self.policy.finalize_task(transition, reasoning_path.global_info)
+            if aggregated_answer is not None:
+                self.answers.append(aggregated_answer)
+                main_logger.info("[Aggregated Answer From Path {}]: {}".format(idx, aggregated_answer))   
+        self.policy.update()
+        
+        for agent in agent_global_registry.agents.values():
+            agent.reset()
+        
+        if len(self.answers) == 1 or self.task.get("type") == "SRDD" or self.task.get("type") == "CW":
+            if len(self.answers) == 0:
+                self.final_answer = ""
+            else:
+                self.final_answer = self.answers[-1]
+        else:
+            self.final_answer = self.majority_vote(self.answers)
+        
+        main_logger.info("[Final Answer]: {}".format(self.final_answer))   
+        print("-"*10+"\033[1;31mGraph Reasoning Finalized\033[0m"+"-"*10)
+        
+        return self.final_answer, self.task.get("Answer")
+    
+    def visualize_path(self):
+        for reasoning_path in self.reasoning_paths:
+            reasoning_path.global_info.workflow.visualize()
+    
+    def visualize_graph(self):
+        self.agent_graph.visualize(os.path.join(self.workspace_path, "agent_graph.html"))
+        self.action_graph.visualize(os.path.join(self.workspace_path, "action_graph.html"))
+
+    def print_paths(self):
+        for reasoning_path in self.reasoning_paths:
+            main_logger.info("Reasoning Path: {}\nAgent Sequence: {}\n".format(reasoning_path.index, reasoning_path.print_agent_sequence()))
+    
+    def format_index(self):
+        for index, reasoning_path in enumerate(self.reasoning_paths):
+            reasoning_path.index = index
+    
+    def update_graph(self):
+        for index, reasoning_path in enumerate(self.reasoning_paths):
+            for successor, predecessor in zip(reasoning_path.agent_sequence[:-1], reasoning_path.agent_sequence[1:]):
+                successor = agent_global_registry.get_agent_from_idx(successor.get("hash"))
+                predecessor = agent_global_registry.get_agent_from_idx(predecessor.get("hash"))
+                res = self.agent_graph._get_edge(predecessor, successor)
+                if res is None or index not in res:
+                    self.agent_graph._add_edge(predecessor, successor, index)
+    
+    def check_finalize(self):
+        for reasoning_path in self.reasoning_paths[:self.max_parallel_paths]:
+            if reasoning_path.state != ReasoningState.FINALIZING and reasoning_path.state != ReasoningState.DISCARDING:
+                return False
+        return True
--- a/puppeteer/logs/example/action_graph.html
+++ b/puppeteer/logs/example/action_graph.html
@ -0,0 +1,155 @@
+<html>
+    <head>
+        <meta charset="utf-8">
+        
+            <script src="lib/bindings/utils.js"></script>
+            <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/vis-network/9.1.2/dist/dist/vis-network.min.css" integrity="sha512-WgxfT5LWjfszlPHXRmBWHkV2eceiWTOBvrKCNbdgDYTHrT2AeLCGbF4sZlZw3UMN3WtL0tGUoIAKsu8mllg/XA==" crossorigin="anonymous" referrerpolicy="no-referrer" />
+            <script src="https://cdnjs.cloudflare.com/ajax/libs/vis-network/9.1.2/dist/vis-network.min.js" integrity="sha512-LnvoEWDFrqGHlHmDD2101OrLcbsfkrzoSpvtSQtxK3RMnRV0eOkhhBN2dXHKRrUU8p2DGRTk35n4O8nWSVe1mQ==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
+            
+        
+<center>
+<h1></h1>
+</center>
+
+<!-- <link rel="stylesheet" href="../node_modules/vis/dist/vis.min.css" type="text/css" />
+<script type="text/javascript" src="../node_modules/vis/dist/vis.js"> </script>-->
+        <link
+          href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta3/dist/css/bootstrap.min.css"
+          rel="stylesheet"
+          integrity="sha384-eOJMYsd53ii+scO/bJGFsiCZc+5NDVN2yr8+0RDqr0Ql0h+rP48ckxlpbzKgwra6"
+          crossorigin="anonymous"
+        />
+        <script
+          src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta3/dist/js/bootstrap.bundle.min.js"
+          integrity="sha384-JEW9xMcG8R+pH31jmWH6WWP0WintQrMb4s7ZOdauHnUtxwoG2vI5DkLtS3qm9Ekf"
+          crossorigin="anonymous"
+        ></script>
+
+
+        <center>
+          <h1></h1>
+        </center>
+        <style type="text/css">
+
+             #mynetwork {
+                 width: 100%;
+                 height: 750px;
+                 background-color: #FFFFFF;
+                 border: 1px solid lightgray;
+                 position: relative;
+                 float: left;
+             }
+
+             
+
+             
+
+             
+        </style>
+    </head>
+
+
+    <body>
+        <div class="card" style="width: 100%">
+            
+            
+            <div id="mynetwork" class="card-body"></div>
+        </div>
+
+        
+        
+
+        <script type="text/javascript">
+
+              // initialize global variables.
+              var edges;
+              var nodes;
+              var allNodes;
+              var allEdges;
+              var nodeColors;
+              var originalNodes;
+              var network;
+              var container;
+              var options, data;
+              var filter = {
+                  item : '',
+                  property : '',
+                  value : []
+              };
+
+              
+
+              
+
+              // This method is responsible for drawing the graph, returns the drawn network
+              function drawGraph() {
+                  var container = document.getElementById('mynetwork');
+
+                  
+
+                  // parsing and collecting nodes and edges from the python
+                  nodes = new vis.DataSet([{"color": "green", "font": {"color": "black"}, "id": "ad54f8ec-782b-457c-a8a2-6c4ef453dccc", "label": "critique\nCriticAgent_gpt4o", "shape": "dot", "size": 10, "status": "Success"}, {"color": "green", "font": {"color": "black"}, "id": "8912ab4d-3e84-4150-92a7-5a5b502e153e", "label": "planning\nPlannerAgent_gpt4o", "shape": "dot", "size": 10, "status": "Success"}, {"color": "green", "font": {"color": "black"}, "id": "8dd841e4-f7c9-4cc7-a268-d514ef0f112d", "label": "summarize\nSummarizerAgent_gpt4o", "shape": "dot", "size": 10, "status": "Success"}, {"color": "green", "font": {"color": "black"}, "id": "cbd174fb-802e-41c8-a7f0-9f01c3093cd8", "label": "modify\nModifier_gpt4o", "shape": "dot", "size": 10, "status": "Success"}, {"color": "green", "font": {"color": "black"}, "id": "7f17cdf4-5327-4d30-9812-48d3ead86d7b", "label": "conclude\nConcluderAgent_gpt4o", "shape": "dot", "size": 10, "status": "Success"}, {"color": "green", "font": {"color": "black"}, "id": "b5493311-d168-4049-a679-22c24bd22e3f", "label": "run_python\nPythonAgent_gpt4o", "shape": "dot", "size": 10, "status": "Success"}, {"color": "green", "font": {"color": "black"}, "id": "ee87f1fe-669f-4e65-9c2f-16469535aa47", "label": "critique\nCriticAgent_gpt4o", "shape": "dot", "size": 10, "status": "Success"}, {"color": "green", "font": {"color": "black"}, "id": "ea4c2232-8661-452e-aeff-334a39428c7c", "label": "terminate\nTerminatorAgent", "shape": "dot", "size": 10, "status": "Success"}]);
+                  edges = new vis.DataSet([{"arrows": "to", "from": "ad54f8ec-782b-457c-a8a2-6c4ef453dccc", "to": "8912ab4d-3e84-4150-92a7-5a5b502e153e", "width": 1}, {"arrows": "to", "from": "ad54f8ec-782b-457c-a8a2-6c4ef453dccc", "to": "8dd841e4-f7c9-4cc7-a268-d514ef0f112d", "width": 1}, {"arrows": "to", "from": "ad54f8ec-782b-457c-a8a2-6c4ef453dccc", "to": "cbd174fb-802e-41c8-a7f0-9f01c3093cd8", "width": 1}, {"arrows": "to", "from": "7f17cdf4-5327-4d30-9812-48d3ead86d7b", "to": "b5493311-d168-4049-a679-22c24bd22e3f", "width": 1}, {"arrows": "to", "from": "7f17cdf4-5327-4d30-9812-48d3ead86d7b", "to": "ee87f1fe-669f-4e65-9c2f-16469535aa47", "width": 1}]);
+
+                  nodeColors = {};
+                  allNodes = nodes.get({ returnType: "Object" });
+                  for (nodeId in allNodes) {
+                    nodeColors[nodeId] = allNodes[nodeId].color;
+                  }
+                  allEdges = edges.get({ returnType: "Object" });
+                  // adding nodes and edges to the graph
+                  data = {nodes: nodes, edges: edges};
+
+                  var options = {
+    "configure": {
+        "enabled": false
+    },
+    "edges": {
+        "color": {
+            "inherit": true
+        },
+        "smooth": {
+            "enabled": true,
+            "type": "dynamic"
+        }
+    },
+    "interaction": {
+        "dragNodes": true,
+        "hideEdgesOnDrag": false,
+        "hideNodesOnDrag": false
+    },
+    "physics": {
+        "enabled": true,
+        "stabilization": {
+            "enabled": true,
+            "fit": true,
+            "iterations": 1000,
+            "onlyDynamicEdges": false,
+            "updateInterval": 50
+        }
+    }
+};
+
+                  
+
+
+                  
+
+                  network = new vis.Network(container, data, options);
+
+                  
+
+                  
+
+                  
+
+
+                  
+
+                  return network;
+
+              }
+              drawGraph();
+        </script>
+    </body>
+</html>
--- a/puppeteer/logs/example/agent-main.py
+++ b/puppeteer/logs/example/agent-main.py
@ -0,0 +1,23 @@
+def handle_advertising_regulation():
+    # Guidelines for typical advertising regulation
+    advert_rules = {
+        'encourage': 'illegal activities',
+        'cause_unnecessary': ['fear', 'offense'],
+        'cause': 'harm' 
+    }
+
+    # Printing out the general context
+    print("This function demonstrates typical advertising regulation guidelines.\n")
+
+    # Printing out the full dictionary
+    print("Advertising regulation guidelines:")
+    print(advert_rules)
+
+    # Printing out the rules with clear explanations
+    print("\nTypical advertising regulatory bodies suggest, for example, that adverts must not:")
+    print("- Encourage:", advert_rules['encourage'])
+    print("- Cause unnecessary:", ', '.join(advert_rules['cause_unnecessary']))
+    print("- Must not cause:", advert_rules['cause'])
+
+# Run the function
+handle_advertising_regulation()
--- a/puppeteer/logs/example/agent_graph.html
+++ b/puppeteer/logs/example/agent_graph.html
@ -0,0 +1,155 @@
+<html>
+    <head>
+        <meta charset="utf-8">
+        
+            <script src="lib/bindings/utils.js"></script>
+            <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/vis-network/9.1.2/dist/dist/vis-network.min.css" integrity="sha512-WgxfT5LWjfszlPHXRmBWHkV2eceiWTOBvrKCNbdgDYTHrT2AeLCGbF4sZlZw3UMN3WtL0tGUoIAKsu8mllg/XA==" crossorigin="anonymous" referrerpolicy="no-referrer" />
+            <script src="https://cdnjs.cloudflare.com/ajax/libs/vis-network/9.1.2/dist/vis-network.min.js" integrity="sha512-LnvoEWDFrqGHlHmDD2101OrLcbsfkrzoSpvtSQtxK3RMnRV0eOkhhBN2dXHKRrUU8p2DGRTk35n4O8nWSVe1mQ==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
+            
+        
+<center>
+<h1></h1>
+</center>
+
+<!-- <link rel="stylesheet" href="../node_modules/vis/dist/vis.min.css" type="text/css" />
+<script type="text/javascript" src="../node_modules/vis/dist/vis.js"> </script>-->
+        <link
+          href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta3/dist/css/bootstrap.min.css"
+          rel="stylesheet"
+          integrity="sha384-eOJMYsd53ii+scO/bJGFsiCZc+5NDVN2yr8+0RDqr0Ql0h+rP48ckxlpbzKgwra6"
+          crossorigin="anonymous"
+        />
+        <script
+          src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta3/dist/js/bootstrap.bundle.min.js"
+          integrity="sha384-JEW9xMcG8R+pH31jmWH6WWP0WintQrMb4s7ZOdauHnUtxwoG2vI5DkLtS3qm9Ekf"
+          crossorigin="anonymous"
+        ></script>
+
+
+        <center>
+          <h1></h1>
+        </center>
+        <style type="text/css">
+
+             #mynetwork {
+                 width: 100%;
+                 height: 750px;
+                 background-color: #FFFFFF;
+                 border: 1px solid lightgray;
+                 position: relative;
+                 float: left;
+             }
+
+             
+
+             
+
+             
+        </style>
+    </head>
+
+
+    <body>
+        <div class="card" style="width: 100%">
+            
+            
+            <div id="mynetwork" class="card-body"></div>
+        </div>
+
+        
+        
+
+        <script type="text/javascript">
+
+              // initialize global variables.
+              var edges;
+              var nodes;
+              var allNodes;
+              var allEdges;
+              var nodeColors;
+              var originalNodes;
+              var network;
+              var container;
+              var options, data;
+              var filter = {
+                  item : '',
+                  property : '',
+                  value : []
+              };
+
+              
+
+              
+
+              // This method is responsible for drawing the graph, returns the drawn network
+              function drawGraph() {
+                  var container = document.getElementById('mynetwork');
+
+                  
+
+                  // parsing and collecting nodes and edges from the python
+                  nodes = new vis.DataSet([{"color": "#37aabb", "font": {"color": "black"}, "id": 8, "label": "CriticAgent_gpt4o\nbase model: gpt4o\nindex: 8", "shape": "dot", "size": 10}, {"color": "#34af8a", "font": {"color": "black"}, "id": 6, "label": "PlannerAgent_gpt4o\nbase model: gpt4o\nindex: 6", "shape": "dot", "size": 10}, {"color": "#c180f4", "font": {"color": "black"}, "id": 11, "label": "SummarizerAgent_gpt4o\nbase model: gpt4o\nindex: 11", "shape": "dot", "size": 10}, {"color": "#f669ba", "font": {"color": "black"}, "id": 13, "label": "Modifier_gpt4o\nbase model: gpt4o\nindex: 13", "shape": "dot", "size": 10}, {"color": "#f45deb", "font": {"color": "black"}, "id": 12, "label": "ConcluderAgent_gpt4o\nbase model: gpt4o\nindex: 12", "shape": "dot", "size": 10}, {"color": "#32b24e", "font": {"color": "black"}, "id": 5, "label": "PythonAgent_gpt4o\nbase model: gpt4o\nindex: 5", "shape": "dot", "size": 10}, {"color": "#f77189", "font": {"color": "black"}, "id": 0, "label": "FileAgent\nbase model: gpt-3.5\nindex: 0", "shape": "dot", "size": 10}, {"color": "#ef7d32", "font": {"color": "black"}, "id": 1, "label": "ArxivAgent\nbase model: gpt-3.5\nindex: 1", "shape": "dot", "size": 10}, {"color": "#c69432", "font": {"color": "black"}, "id": 2, "label": "BingAgent\nbase model: gpt-3.5\nindex: 2", "shape": "dot", "size": 10}, {"color": "#a79f31", "font": {"color": "black"}, "id": 3, "label": "WebsiteAgent\nbase model: gpt-3.5\nindex: 3", "shape": "dot", "size": 10}, {"color": "#82a931", "font": {"color": "black"}, "id": 4, "label": "TerminatorAgent\nbase model: gpt-3.5\nindex: 4", "shape": "dot", "size": 10}, {"color": "#36ada4", "font": {"color": "black"}, "id": 7, "label": "ReasoningAgent_gpt4o\nbase model: gpt4o\nindex: 7", "shape": "dot", "size": 10}, {"color": "#3aa6da", "font": {"color": "black"}, "id": 9, "label": "ReflectAgent_gpt4o\nbase model: gpt4o\nindex: 9", "shape": "dot", "size": 10}, {"color": "#8197f4", "font": {"color": "black"}, "id": 10, "label": "QuestionAgent_gpt4o\nbase model: gpt4o\nindex: 10", "shape": "dot", "size": 10}]);
+                  edges = new vis.DataSet([{"arrows": "to", "color": "#f77189", "from": 8, "to": 6, "width": 1}, {"arrows": "to", "color": "#33b07a", "from": 8, "to": 11, "width": 1}, {"arrows": "to", "color": "#38a9c5", "from": 8, "to": 13, "width": 1}, {"arrows": "to", "color": "#ae9d31", "from": 12, "to": 5, "width": 1}, {"arrows": "to", "color": "#cc7af4", "from": 12, "to": 8, "width": 1}]);
+
+                  nodeColors = {};
+                  allNodes = nodes.get({ returnType: "Object" });
+                  for (nodeId in allNodes) {
+                    nodeColors[nodeId] = allNodes[nodeId].color;
+                  }
+                  allEdges = edges.get({ returnType: "Object" });
+                  // adding nodes and edges to the graph
+                  data = {nodes: nodes, edges: edges};
+
+                  var options = {
+    "configure": {
+        "enabled": false
+    },
+    "edges": {
+        "color": {
+            "inherit": true
+        },
+        "smooth": {
+            "enabled": true,
+            "type": "dynamic"
+        }
+    },
+    "interaction": {
+        "dragNodes": true,
+        "hideEdgesOnDrag": false,
+        "hideNodesOnDrag": false
+    },
+    "physics": {
+        "enabled": true,
+        "stabilization": {
+            "enabled": true,
+            "fit": true,
+            "iterations": 1000,
+            "onlyDynamicEdges": false,
+            "updateInterval": 50
+        }
+    }
+};
+
+                  
+
+
+                  
+
+                  network = new vis.Network(container, data, options);
+
+                  
+
+                  
+
+                  
+
+
+                  
+
+                  return network;
+
+              }
+              drawGraph();
+        </script>
+    </body>
+</html>
--- a/puppeteer/logs/example/meta.log
+++ b/puppeteer/logs/example/meta.log
@ -0,0 +1,209 @@
+[10:09:45 INFO]
+------------------------------[Graph Reasoning Initialized]------------------------------
+[10:09:45 INFO]
+['FileAgent', 'ArxivAgent', 'BingAgent', 'WebsiteAgent', 'TerminatorAgent', 'PythonAgent_gpt4o', 'PlannerAgent_gpt4o', 'ReasoningAgent_gpt4o', 'CriticAgent_gpt4o', 'ReflectAgent_gpt4o', 'QuestionAgent_gpt4o', 'SummarizerAgent_gpt4o', 'ConcluderAgent_gpt4o', 'Modifier_gpt4o']
+[10:09:45 INFO]
+------------------------------[Graph Reasoning Start]------------------------------
+[10:09:45 INFO]
+Task:
+The following are multiple choice questions (with answers) about business.
+Typical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.
+A: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious
+[10:09:53 INFO]
+[Path 0 Initialized
+[10:09:53 INFO]
+Reasoning Path: 0
+Agent Sequence: TerminatorAgent
+
+[10:09:53 INFO]
+[Path 1 Initialized
+[10:09:53 INFO]
+Reasoning Path: 1
+Agent Sequence: CriticAgent_gpt4o
+
+[10:09:53 INFO]
+[Path 2 Initialized
+[10:09:53 INFO]
+Reasoning Path: 2
+Agent Sequence: ConcluderAgent_gpt4o
+
+[10:09:53 INFO]
+------------------------------[STEP]------------------------------
+[10:09:53 INFO]
+------------------------------[Reasoning Path0 STEP]------------------------------
+[10:09:53 INFO]
+------------------------------[DONE]: Reasoning Path0 STEP------------------------------
+[10:09:53 INFO]
+------------------------------[Reasoning Path1 STEP]------------------------------
+[10:09:56 INFO]
+------------------------------[DONE]: Reasoning Path1 STEP------------------------------
+[10:09:56 INFO]
+------------------------------[Reasoning Path2 STEP]------------------------------
+[10:09:59 INFO]
+------------------------------[DONE]: Reasoning Path2 STEP------------------------------
+[10:09:59 INFO]
+------------------------------[Reasoning Path0 FINALIZING]------------------------------
+[10:09:59 INFO]
+Split Reasoning Paths: [3, 4] From Path 1
+[10:09:59 INFO]
+Split Reasoning Paths: [5] From Path 2
+[10:09:59 INFO]
+Reasoning Path: 0
+Agent Sequence: TerminatorAgent
+
+[10:09:59 INFO]
+Reasoning Path: 1
+Agent Sequence: CriticAgent_gpt4o->PlannerAgent_gpt4o
+
+[10:09:59 INFO]
+Reasoning Path: 2
+Agent Sequence: ConcluderAgent_gpt4o->PythonAgent_gpt4o
+
+[10:09:59 INFO]
+Reasoning Path: 3
+Agent Sequence: CriticAgent_gpt4o->SummarizerAgent_gpt4o
+
+[10:09:59 INFO]
+Reasoning Path: 4
+Agent Sequence: CriticAgent_gpt4o->Modifier_gpt4o
+
+[10:09:59 INFO]
+Reasoning Path: 5
+Agent Sequence: ConcluderAgent_gpt4o->CriticAgent_gpt4o
+
+[10:09:59 INFO]
+------------------------------[STEP]------------------------------
+[10:09:59 INFO]
+------------------------------[Reasoning Path1 STEP]------------------------------
+[10:10:06 INFO]
+------------------------------[DONE]: Reasoning Path1 STEP------------------------------
+[10:10:06 INFO]
+------------------------------[Reasoning Path2 STEP]------------------------------
+[10:10:22 INFO]
+------------------------------[DONE]: Reasoning Path2 STEP------------------------------
+[10:10:22 INFO]
+------------------------------[Reasoning Path3 STEP]------------------------------
+[10:10:27 INFO]
+------------------------------[DONE]: Reasoning Path3 STEP------------------------------
+[10:10:27 INFO]
+------------------------------[Reasoning Path4 STEP]------------------------------
+[10:10:34 INFO]
+------------------------------[DONE]: Reasoning Path4 STEP------------------------------
+[10:10:34 INFO]
+------------------------------[Reasoning Path5 STEP]------------------------------
+[10:10:39 INFO]
+------------------------------[DONE]: Reasoning Path5 STEP------------------------------
+[10:10:39 INFO]
+------------------------------[Reasoning Path0 FINALIZING]------------------------------
+[10:10:39 INFO]
+------------------------------[Reasoning Path1 FINALIZING]------------------------------
+[10:10:39 INFO]
+------------------------------[Reasoning Path2 FINALIZING]------------------------------
+[10:10:39 INFO]
+------------------------------[Reasoning Path3 FINALIZING]------------------------------
+[10:10:39 INFO]
+------------------------------[Reasoning Path4 FINALIZING]------------------------------
+[10:10:39 INFO]
+------------------------------[Reasoning Path5 FINALIZING]------------------------------
+[10:10:39 INFO]
+Reasoning Path: 0
+Agent Sequence: TerminatorAgent
+
+[10:10:39 INFO]
+Reasoning Path: 1
+Agent Sequence: CriticAgent_gpt4o->PlannerAgent_gpt4o
+
+[10:10:39 INFO]
+Reasoning Path: 2
+Agent Sequence: ConcluderAgent_gpt4o->PythonAgent_gpt4o
+
+[10:10:39 INFO]
+Reasoning Path: 3
+Agent Sequence: CriticAgent_gpt4o->SummarizerAgent_gpt4o
+
+[10:10:39 INFO]
+Reasoning Path: 4
+Agent Sequence: CriticAgent_gpt4o->Modifier_gpt4o
+
+[10:10:39 INFO]
+Reasoning Path: 5
+Agent Sequence: ConcluderAgent_gpt4o->CriticAgent_gpt4o
+
+[10:10:39 INFO]
+[Aggregating] You have several answer candidates. They are:
+[]
+Choose the most common answer among the candidates.
+If responses conflict, prioritize answers that provide clear reasoning or appear later
+Format the answer as a single letter.
+Do not include any other units or punctuation.
+Return your answer with only the single letter.
+[10:10:41 INFO]
+[Aggregation Answer] Sure! Please provide the answer candidates so I can evaluate them for you.
+[10:10:41 INFO]
+[Aggregated Answer From Path 0]: Sure! Please provide the answer candidates so I can evaluate them for you.
+[10:10:41 INFO]
+[Aggregating] You have several answer candidates. They are:
+[' The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.\n']
+Choose the most common answer among the candidates.
+If responses conflict, prioritize answers that provide clear reasoning or appear later
+Format the answer as a single letter.
+Do not include any other units or punctuation.
+Return your answer with only the single letter.
+[10:10:44 INFO]
+[Aggregation Answer] A
+[10:10:44 INFO]
+[Aggregated Answer From Path 1]: A
+[10:10:44 INFO]
+[Aggregating] You have several answer candidates. They are:
+[' **I: Unsafe practices, Distress, Fear, Serious**\n', 'C\n']
+Choose the most common answer among the candidates.
+If responses conflict, prioritize answers that provide clear reasoning or appear later
+Format the answer as a single letter.
+Do not include any other units or punctuation.
+Return your answer with only the single letter.
+[10:10:48 INFO]
+[Aggregation Answer] C
+[10:10:48 INFO]
+[Aggregated Answer From Path 2]: C
+[10:10:48 INFO]
+[Aggregating] You have several answer candidates. They are:
+[' The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.\n', ' The reasoning was incomplete due to a lack of prior explanation or analysis, leaving gaps in the logical and ethical framework necessary to determine the correct multiple-choice answer.\n']
+Choose the most common answer among the candidates.
+If responses conflict, prioritize answers that provide clear reasoning or appear later
+Format the answer as a single letter.
+Do not include any other units or punctuation.
+Return your answer with only the single letter.
+[10:10:49 INFO]
+[Aggregation Answer] B
+[10:10:49 INFO]
+[Aggregated Answer From Path 3]: B
+[10:10:49 INFO]
+[Aggregating] You have several answer candidates. They are:
+[' The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.\n', '**  \nThe correct answer is:  \n**I: Unsafe practices, Distress, Fear, Serious.**  \n\nThis answer aligns with standard advertising ethical guidelines, ensuring that adverts discourage unsafe behaviors, refrain from evoking unnecessary distress or fear, and prevent causing serious or widespread offense.\n']
+Choose the most common answer among the candidates.
+If responses conflict, prioritize answers that provide clear reasoning or appear later
+Format the answer as a single letter.
+Do not include any other units or punctuation.
+Return your answer with only the single letter.
+[10:10:50 INFO]
+[Aggregation Answer] B
+[10:10:51 INFO]
+[Aggregated Answer From Path 4]: B
+[10:10:51 INFO]
+[Aggregating] You have several answer candidates. They are:
+[' **I: Unsafe practices, Distress, Fear, Serious**\n', ' The reasoning is overall plausible, logical, and correctly concludes that the answer is **Option I: Unsafe practices, Distress, Fear, Serious**.\n']
+Choose the most common answer among the candidates.
+If responses conflict, prioritize answers that provide clear reasoning or appear later
+Format the answer as a single letter.
+Do not include any other units or punctuation.
+Return your answer with only the single letter.
+[10:10:53 INFO]
+[Aggregation Answer] I
+[10:10:53 INFO]
+[Aggregated Answer From Path 5]: I
+[10:10:57 INFO]
+[Majority Vote] Answers: ['Sure! Please provide the answer candidates so I can evaluate them for you.', 'A', 'C', 'B', 'B', 'I']
+[10:10:57 INFO]
+[Majority Vote] Most Common: ['B']
+[10:10:57 INFO]
+[Final Answer]: B
--- a/puppeteer/logs/example/model_query.log
+++ b/puppeteer/logs/example/model_query.log
@ -0,0 +1,259 @@
+[10:09:53 INFO]
+[Model Query] Querying gpt4o
+[10:09:53 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an expert in critiquing. Your task is to critique the reasoning and solutions provided by others., and You work as a helpful AI assistant. \nI will ask you a question. Answer this question using your coding and language skills.\nNow your question is: The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious\nPreviously, you collected the some information about this question from some actions: []'}, {'role': 'user', 'content': 'You need to critique the previous reasoning. You should complete your reasoning using this format: REASONING RESULT: [YOUR REASONING RESULT]. Conclude your answer using this format: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: [].* Consider the following when critiquing the reasoning: 1. Plausibility:'}]
+[10:09:56 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 150 
+Prompt Tokens: 295 
+Total Tokens: 445
+[10:09:56 INFO]
+[Model Query] gpt4o response: Since no reasoning was provided in this context with "Your previous reasoning was: []", there isn’t any substantive content to critique regarding the given multiple-choice question.
+
+However, I will critique the absence or missing reasoning content: 
+
+REASONING RESULT: The absence of prior reasoning means there was no provided logic or analysis related to the question. This represents a gap in thought since the question requires understanding and applying principles related to advertising ethics and regulations. Without reasoning, it is impossible to verify the plausibility, trace the logical progression, or evaluate the correctness of the reasoning leading to an answer.
+
+FINAL ANSWER: The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.
+[10:09:56 INFO]
+[Model Query] Querying gpt4o
+[10:09:56 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an expert in concluding. Your task is to provide final conclusions based on the reasoning process., and You work as a helpful AI assistant. \nI will ask you a question. Answer this question using your coding and language skills.\nNow your question is: The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious\nPreviously, you collected the some information about this question from some actions: []'}, {'role': 'user', 'content': 'You need to conclude the task and provide a final answer. You should finish your reasoning with the following template: REASONING RESULT: [YOUR REASONING RESULT]. Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: [].* You need to conclude the task and provide a final answer: '}]
+[10:09:59 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 156 
+Prompt Tokens: 294 
+Total Tokens: 450
+[10:09:59 INFO]
+[Model Query] gpt4o response: Typical advertising regulatory bodies aim to ensure that advertisements are ethical and do not promote or encourage harmful behavior or emotions. They stress the importance of not encouraging unsafe practices, causing unnecessary distress, or promoting jealousy, and further ensuring that advertisements avoid causing serious offense. 
+
+Analyzing the provided options while aligning with these stated guidelines:
+1. The right combination should discourage unsafe practices, eliminate distress or jealousy, and avoid serious offense.  
+2. Therefore, Option **I (Unsafe practices, Distress, Fear, Serious)** fits perfectly with the context of advertising regulations.
+
+REASONING RESULT: The correct choice must ensure advertising standards do not promote unsafe practices, distress, fear, or serious offense.
+
+FINAL ANSWER: **I: Unsafe practices, Distress, Fear, Serious**
+[10:09:59 INFO]
+[Model Query] Querying gpt4o
+[10:09:59 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an expert in planning. Your task is to create detailed plans for achieving specific goals., and You work as a helpful AI assistant. \nI will ask you a question. Answer this question using your coding and language skills.\nNow your question is: The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious\nPreviously, you collected the some information about this question from some actions: []'}, {'role': 'user', 'content': 'Decompose the question and plan the next steps to address the question. You should complete your planning using the following template: REASONING RESULT: [YOUR REASONING RESULT]. *Your previous reasoning was: [\'Successful Action: critique\\nResult: Since no reasoning was provided in this context with "Your previous reasoning was: []", there isn’t any substantive content to critique regarding the given multiple-choice question.\\n\\nHowever, I will critique the absence or missing reasoning content: \\n\\nREASONING RESULT: The absence of prior reasoning means there was no provided logic or analysis related to the question. This represents a gap in thought since the question requires understanding and applying principles related to advertising ethics and regulations. Without reasoning, it is impossible to verify the plausibility, trace the logical progression, or evaluate the correctness of the reasoning leading to an answer.\\n\\nFINAL ANSWER: The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.\'].* Your planning should include: '}]
+[10:10:06 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 332 
+Prompt Tokens: 434 
+Total Tokens: 766
+[10:10:06 INFO]
+[Model Query] gpt4o response: REASONING RESULT: The question aims to test understanding of ethical advertising practices as guided by regulatory bodies. The multiple choices provide various options related to recommended behaviors and prohibited actions in advertising. The objective of this task is to identify which answer aligns most closely with widely accepted principles of advertising ethics and regulations. To achieve this, it is essential to break down and evaluate the key components of the question and choices based on the logical alignment of terms like "safe practices," "distress," "jealousy," "fear," "offense," and their corresponding severity.
+
+**PLANNING STEPS:**
+
+1. **Understand Regulatory Advertising Standards:**
+   - Analyze the general expectations from advertising regulatory bodies, such as avoiding harm, promoting safety, and refraining from causing unnecessary offense or emotional responses.
+
+2. **Break Down the Key Terms in the Question:**
+   - Identify the implications of each term in the question: "safe practices," "distress," "jealousy," "fear," and "offense."
+   - Assess how these terms contribute to ethical advertising.
+
+3. **Evaluate Each Answer Choice:**
+   - Examine each choice systematically and determine whether it aligns with the principles of advertising standards.
+   - Eliminate options that deviate from these principles.
+
+4. **Identify the Correct Answer:**
+   - Select the choice that best fits the principles after careful analysis.
+
+5. **Finalize Conclusion:**
+   - Present the reasoning for the selected answer clearly and concisely.
+
+By following these steps, it will be possible to determine the correct answer with a logical and well-supported explanation.
+[10:10:06 INFO]
+[Model Query] Querying gpt4o
+[10:10:06 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an expert in Python programming. Your task is to run Python code and provide the results., and You work as a helpful AI assistant. \nI will ask you a question. Answer this question using your coding and language skills.\nNow your question is: The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious\nPreviously, you collected the some information about this question from some actions: []'}, {'role': 'user', 'content': 'You have chosen to write and run Python code. Please write generic Python code in the parameter to solve this type of problems using only standard python libraries. Make sure you use the \'print\' function for all output when relevant. Return in json format. Example: {"action": "run_python", "parameter": "print(\'Hello, World!\')"}'}]
+[10:10:08 INFO]
+[Model Query: ChatCompletion] query failed: Error code: 429 - {'error': {'message': '当前分组上游负载已饱和，请稍后再试 (request id: 20250909101008489440162dhklTW8w)', 'type': 'openai_error', 'param': '', 'code': 'model_not_found'}}
+[10:10:13 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an expert in Python programming. Your task is to run Python code and provide the results., and You work as a helpful AI assistant. \nI will ask you a question. Answer this question using your coding and language skills.\nNow your question is: The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious\nPreviously, you collected the some information about this question from some actions: []'}, {'role': 'user', 'content': 'You have chosen to write and run Python code. Please write generic Python code in the parameter to solve this type of problems using only standard python libraries. Make sure you use the \'print\' function for all output when relevant. Return in json format. Example: {"action": "run_python", "parameter": "print(\'Hello, World!\')"}'}]
+[10:10:16 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 169 
+Prompt Tokens: 168 
+Total Tokens: 337
+[10:10:16 INFO]
+[Model Query] gpt4o response: {"action": "run_python", "parameter": "def handle_advertising_regulation():\n    # Guidelines for typical advertising regulation\n    advert_rules = {\n        'encourage': 'illegal activities',\n        'cause_unnecessary': ['fear', 'offense'],\n        'cause': 'harm' \n    }\n\n    # Printing out the rules\n    print(\"Typical advertising regulatory bodies suggest, for example, that adverts must not:\")\n    print(\"- Encourage:\", advert_rules['encourage'])\n    print(\"- Cause unnecessary:\", ', '.join(advert_rules['cause_unnecessary']))\n    print(\"- Must not cause:\", advert_rules['cause'])\n\n# Run the function\nhandle_advertising_regulation()"}
+[10:10:16 INFO]
+[Model Query] Querying gpt-3.5
+[10:10:16 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an assistant'}, {'role': 'user', 'content': 'Add \'print\' statements to print all necessary information.\nCode:\ndef handle_advertising_regulation():\n    # Guidelines for typical advertising regulation\n    advert_rules = {\n        \'encourage\': \'illegal activities\',\n        \'cause_unnecessary\': [\'fear\', \'offense\'],\n        \'cause\': \'harm\' \n    }\n\n    # Printing out the rules\n    print("Typical advertising regulatory bodies suggest, for example, that adverts must not:")\n    print("- Encourage:", advert_rules[\'encourage\'])\n    print("- Cause unnecessary:", \', \'.join(advert_rules[\'cause_unnecessary\']))\n    print("- Must not cause:", advert_rules[\'cause\'])\n\n# Run the function\nhandle_advertising_regulation()'}]
+[10:10:20 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 295 
+Prompt Tokens: 159 
+Total Tokens: 454
+[10:10:20 INFO]
+[Model Query] gpt-3.5 response: You can add print statements to display all necessary information during the function execution, such as printing the dictionary `advert_rules` and other key points. Here's an enhanced version of your code:
+
+```python
+def handle_advertising_regulation():
+    # Guidelines for typical advertising regulation
+    advert_rules = {
+        'encourage': 'illegal activities',
+        'cause_unnecessary': ['fear', 'offense'],
+        'cause': 'harm' 
+    }
+
+    # Printing out the general context
+    print("This function demonstrates typical advertising regulation guidelines.\n")
+
+    # Printing out the full dictionary
+    print("Advertising regulation guidelines:")
+    print(advert_rules)
+
+    # Printing out the rules with clear explanations
+    print("\nTypical advertising regulatory bodies suggest, for example, that adverts must not:")
+    print("- Encourage:", advert_rules['encourage'])
+    print("- Cause unnecessary:", ', '.join(advert_rules['cause_unnecessary']))
+    print("- Must not cause:", advert_rules['cause'])
+
+# Run the function
+handle_advertising_regulation()
+```
+
+### Explanation:
+1. I added a print statement at the start to clarify what the function demonstrates.
+2. I printed the full `advert_rules` dictionary to show the structure before breaking it down.
+3. The rest of the print statements remain to display each specific guideline.
+
+This will help visualize not only the specific rules but also the entire dictionary of regulations in the output.
+[10:10:21 INFO]
+[Model Query] Querying gpt4o
+[10:10:21 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an expert in Python programming. Your task is to run Python code and provide the results., and You work as a helpful AI assistant. \nI will ask you a question. Answer this question using your coding and language skills.\nNow your question is: The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ '}, {'role': 'user', 'content': 'You have chosen to write and run Python code. Please write generic Python code in the parameter to solve this type of problems using only standard python libraries. Make sure you use the \'print\' function for all output when relevant. Return in json format. Example: {"action": "run_python", "parameter": "print(\'Hello, World!\')"}'}, {'role': 'assistant', 'content': '{\'action\': \'run_python\', \'parameter\': \'def handle_advertising_regulation():\\n    # Guidelines for typical advertising regulation\\n    advert_rules = {\\n        \\\'encourage\\\': \\\'illegal activities\\\',\\n        \\\'cause_unnecessary\\\': [\\\'fear\\\', \\\'offense\\\'],\\n        \\\'cause\\\': \\\'harm\\\' \\n    }\\n\\n    # Printing out the rules\\n    print("Typical advertising regulatory bodies suggest, for example, that adverts must not:")\\n    print("- Encourage:", advert_rules[\\\'encourage\\\'])\\n    print("- Cause unnecessary:", \\\', \\\'.join(advert_rules[\\\'cause_unnecessary\\\']))\\n    print("- Must not cause:", advert_rules[\\\'cause\\\'])\\n\\n# Run the function\\nhandle_advertising_regulation()\'}'}, {'role': 'user', 'content': "You have get results from run_python: This function demonstrates typical advertising regulation guidelines.\n\nAdvertising regulation guidelines:\n{'encourage': 'illegal activities', 'cause_unnecessary': ['fear', 'offense'], 'cause': 'harm'}\n\nTypical advertising regulatory bodies suggest, for example, that adverts must not:\n- Encourage: illegal activities\n- Cause unnecessary: fear, offense\n- Must not cause: harm\nChoose the most common one and then format it.\nFormat it using the fewest words possible, it should be a single letter, but do not alter the original answer.\nDo not include any other units or punctuation. \nAnd finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]\nReturn your answer with only the single letter."}]
+[10:10:22 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 5 
+Prompt Tokens: 500 
+Total Tokens: 505
+[10:10:22 INFO]
+[Model Query] gpt4o response: FINAL ANSWER: C
+[10:10:22 INFO]
+[Model Query] Querying gpt4o
+[10:10:22 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an expert in summarizing. Your task is to summarize the information and provide concise conclusions., and You work as a helpful AI assistant. \nI will ask you a question. Answer this question using your coding and language skills.\nNow your question is: The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious\nPreviously, you collected the some information about this question from some actions: []'}, {'role': 'user', 'content': 'You need to summarize those previous results and provide some intermediate conclusions. You should finish your reasoning with the following template: REASONING RESULT: [YOUR REASONING RESULT]. Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: [\'Successful Action: critique\\nResult: Since no reasoning was provided in this context with "Your previous reasoning was: []", there isn’t any substantive content to critique regarding the given multiple-choice question.\\n\\nHowever, I will critique the absence or missing reasoning content: \\n\\nREASONING RESULT: The absence of prior reasoning means there was no provided logic or analysis related to the question. This represents a gap in thought since the question requires understanding and applying principles related to advertising ethics and regulations. Without reasoning, it is impossible to verify the plausibility, trace the logical progression, or evaluate the correctness of the reasoning leading to an answer.\\n\\nFINAL ANSWER: The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.\'].* You need to summarize the reasoning paths and provide a final conclusion:'}]
+[10:10:27 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 164 
+Prompt Tokens: 458 
+Total Tokens: 622
+[10:10:27 INFO]
+[Model Query] gpt4o response: SUMMARY OF REASONING PATH:
+
+The previous reasoning critiques the absence of prior analysis or logical conclusions related to the multiple-choice question about advertising ethics and regulations. It highlights that no reasoning was provided to understand or apply principles of advertising ethics, making it impossible to verify, trace, or evaluate the answer. This lack of explanation results in an incomplete reasoning process where plausibility and support for the chosen answer are absent.
+
+REASONING RESULT: The reasoning process lacked substantive content to connect advertising ethical guidelines with the correct multiple-choice answer. This incomplete pathway does not adequately justify the derived conclusion, leaving room for doubt about its accuracy and logical flow.
+
+FINAL ANSWER: The reasoning was incomplete due to a lack of prior explanation or analysis, leaving gaps in the logical and ethical framework necessary to determine the correct multiple-choice answer.
+[10:10:27 INFO]
+[Model Query] Querying gpt4o
+[10:10:27 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an expert in error correction and modification. Your task is to identify errors in previous reasoning, explain why they are incorrect, and provide accurate corrections., and You work as a helpful AI assistant. \nI will ask you a question. Answer this question using your coding and language skills.\nNow your question is: The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious\nPreviously, you collected the some information about this question from some actions: []'}, {'role': 'user', 'content': 'You need to identify and correct errors in the previous reasoning. You should finish your reasoning with the following template: REASONING RESULT: [Clearly state: 1. Which part of the previous reasoning was incorrect 2. Why it was incorrect 3. What is the correct understanding]. Finish your answer with the following template: FINAL ANSWER: [Provide the complete corrected answer]. *Your previous reasoning was: [\'Successful Action: critique\\nResult: Since no reasoning was provided in this context with "Your previous reasoning was: []", there isn’t any substantive content to critique regarding the given multiple-choice question.\\n\\nHowever, I will critique the absence or missing reasoning content: \\n\\nREASONING RESULT: The absence of prior reasoning means there was no provided logic or analysis related to the question. This represents a gap in thought since the question requires understanding and applying principles related to advertising ethics and regulations. Without reasoning, it is impossible to verify the plausibility, trace the logical progression, or evaluate the correctness of the reasoning leading to an answer.\\n\\nFINAL ANSWER: The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.\'].* Please explicitly point out and correct any errors, misconceptions, or inaccuracies in the previous reasoning.'}]
+[10:10:34 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 307 
+Prompt Tokens: 500 
+Total Tokens: 807
+[10:10:34 INFO]
+[Model Query] gpt4o response: **REASONING RESULT:**
+
+1. **Which part of the previous reasoning was incorrect**: The previous reasoning pointed out the absence of logical analysis and reasoning for the multiple-choice question, acknowledging that no explanation was provided. While this was accurate, the critique itself failed to evaluate the central question and hypothesize or provide an answer based on the principle of advertising ethics. The previous reasoning did not attempt to outline potential answers or explain the ethical standards relevant to the question context.
+
+2. **Why it was incorrect**: Although the absence of reasoning was correctly identified, the response missed an opportunity to analyze the question itself and assess it against foundational advertising ethical principles. This omission makes the critique incomplete because the intention of identifying gaps is not only to recognize them but also to address them with accurate substitutions.
+
+3. **What is the correct understanding**: The correct approach would involve interpreting the question based on advertising regulatory principles. Most advertising codes of conduct, such as those outlined by organizations like the Advertising Standards Authority (ASA), emphasize that adverts must not encourage unsafe practices, cause unnecessary distress, fear, or exploitation of emotions such as jealousy. They should avoid causing serious or widespread offense. The correct answer to the question reflects a synthesis of these ethical guidelines.
+
+**FINAL ANSWER:**  
+The correct answer is:  
+**I: Unsafe practices, Distress, Fear, Serious.**  
+
+This answer aligns with standard advertising ethical guidelines, ensuring that adverts discourage unsafe behaviors, refrain from evoking unnecessary distress or fear, and prevent causing serious or widespread offense.
+[10:10:34 INFO]
+[Model Query] Querying gpt4o
+[10:10:34 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an expert in critiquing. Your task is to critique the reasoning and solutions provided by others., and You work as a helpful AI assistant. \nI will ask you a question. Answer this question using your coding and language skills.\nNow your question is: The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious\nPreviously, you collected the some information about this question from some actions: []'}, {'role': 'user', 'content': 'You need to critique the previous reasoning. You should complete your reasoning using this format: REASONING RESULT: [YOUR REASONING RESULT]. Conclude your answer using this format: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: [].* Consider the following when critiquing the reasoning: 1. Plausibility:'}, {'role': 'assistant', 'content': '(\'Since no reasoning was provided in this context with "Your previous reasoning was: []", there isn’t any substantive content to critique regarding the given multiple-choice question.\\n\\nHowever, I will critique the absence or missing reasoning content: \\n\\nREASONING RESULT: The absence of prior reasoning means there was no provided logic or analysis related to the question. This represents a gap in thought since the question requires understanding and applying principles related to advertising ethics and regulations. Without reasoning, it is impossible to verify the plausibility, trace the logical progression, or evaluate the correctness of the reasoning leading to an answer.\\n\\nFINAL ANSWER: The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.\', 445)'}, {'role': 'user', 'content': "You need to critique the previous reasoning. You should complete your reasoning using this format: REASONING RESULT: [YOUR REASONING RESULT]. Conclude your answer using this format: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: ['Successful Action: conclude\\nResult: Typical advertising regulatory bodies aim to ensure that advertisements are ethical and do not promote or encourage harmful behavior or emotions. They stress the importance of not encouraging unsafe practices, causing unnecessary distress, or promoting jealousy, and further ensuring that advertisements avoid causing serious offense. \\n\\nAnalyzing the provided options while aligning with these stated guidelines:\\n1. The right combination should discourage unsafe practices, eliminate distress or jealousy, and avoid serious offense.  \\n2. Therefore, Option **I (Unsafe practices, Distress, Fear, Serious)** fits perfectly with the context of advertising regulations.\\n\\nREASONING RESULT: The correct choice must ensure advertising standards do not promote unsafe practices, distress, fear, or serious offense.\\n\\nFINAL ANSWER: **I: Unsafe practices, Distress, Fear, Serious**'].* Consider the following when critiquing the reasoning: 1. Plausibility:"}]
+[10:10:39 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 180 
+Prompt Tokens: 699 
+Total Tokens: 879
+[10:10:39 INFO]
+[Model Query] gpt4o response: REASONING RESULT: The previous reasoning effectively addresses the given problem. It aligns well with standard advertising principles, emphasizing ethical concerns such as avoiding unsafe practices, unnecessary distress, fear, jealousy, and serious offense. The reasoning evaluates each aspect of the regulatory guidelines and validates the option based on these principles. The conclusion, selecting **Option I** ("Unsafe practices, Distress, Fear, Serious"), is plausible and consistent with the ethical guidelines for advertising practices. The reasoning demonstrates a logical flow, examining the requirements and directly relating them to the selected option.
+
+However, the reasoning could be improved by briefly comparing other options and explicitly eliminating them by showing why they do not meet the outlined principles. This would make the argument more robust.
+
+FINAL ANSWER: The reasoning is overall plausible, logical, and correctly concludes that the answer is **Option I: Unsafe practices, Distress, Fear, Serious**.
+[10:10:39 INFO]
+[Model Query] Querying gpt-3.5
+[10:10:39 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an assistant'}, {'role': 'user', 'content': 'You have several answer candidates. They are:\n[]\nChoose the most common answer among the candidates.\nIf responses conflict, prioritize answers that provide clear reasoning or appear later\nFormat the answer as a single letter.\nDo not include any other units or punctuation.\nReturn your answer with only the single letter.'}]
+[10:10:41 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 15 
+Prompt Tokens: 74 
+Total Tokens: 89
+[10:10:41 INFO]
+[Model Query] gpt-3.5 response: Sure! Please provide the answer candidates so I can evaluate them for you.
+[10:10:41 INFO]
+[Model Query] Querying gpt4o
+[10:10:41 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an assistant'}, {'role': 'user', 'content': "You have several answer candidates. They are:\n[' The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.\\n']\nChoose the most common answer among the candidates.\nIf responses conflict, prioritize answers that provide clear reasoning or appear later\nFormat the answer as a single letter.\nDo not include any other units or punctuation.\nReturn your answer with only the single letter."}]
+[10:10:44 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 1 
+Prompt Tokens: 92 
+Total Tokens: 93
+[10:10:44 INFO]
+[Model Query] gpt4o response: A
+[10:10:44 INFO]
+[Model Query] Querying gpt4o
+[10:10:44 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an assistant'}, {'role': 'user', 'content': "You have several answer candidates. They are:\n[' **I: Unsafe practices, Distress, Fear, Serious**\\n', 'C\\n']\nChoose the most common answer among the candidates.\nIf responses conflict, prioritize answers that provide clear reasoning or appear later\nFormat the answer as a single letter.\nDo not include any other units or punctuation.\nReturn your answer with only the single letter."}]
+[10:10:48 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 1 
+Prompt Tokens: 83 
+Total Tokens: 84
+[10:10:48 INFO]
+[Model Query] gpt4o response: C
+[10:10:48 INFO]
+[Model Query] Querying gpt4o
+[10:10:48 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an assistant'}, {'role': 'user', 'content': "You have several answer candidates. They are:\n[' The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.\\n', ' The reasoning was incomplete due to a lack of prior explanation or analysis, leaving gaps in the logical and ethical framework necessary to determine the correct multiple-choice answer.\\n']\nChoose the most common answer among the candidates.\nIf responses conflict, prioritize answers that provide clear reasoning or appear later\nFormat the answer as a single letter.\nDo not include any other units or punctuation.\nReturn your answer with only the single letter."}]
+[10:10:49 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 1 
+Prompt Tokens: 126 
+Total Tokens: 127
+[10:10:49 INFO]
+[Model Query] gpt4o response: B
+[10:10:49 INFO]
+[Model Query] Querying gpt4o
+[10:10:49 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an assistant'}, {'role': 'user', 'content': "You have several answer candidates. They are:\n[' The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.\\n', '**  \\nThe correct answer is:  \\n**I: Unsafe practices, Distress, Fear, Serious.**  \\n\\nThis answer aligns with standard advertising ethical guidelines, ensuring that adverts discourage unsafe behaviors, refrain from evoking unnecessary distress or fear, and prevent causing serious or widespread offense.\\n']\nChoose the most common answer among the candidates.\nIf responses conflict, prioritize answers that provide clear reasoning or appear later\nFormat the answer as a single letter.\nDo not include any other units or punctuation.\nReturn your answer with only the single letter."}]
+[10:10:50 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 1 
+Prompt Tokens: 157 
+Total Tokens: 158
+[10:10:50 INFO]
+[Model Query] gpt4o response: B
+[10:10:51 INFO]
+[Model Query] Querying gpt4o
+[10:10:51 INFO]
+[Model Query] [{'role': 'system', 'content': 'You are an assistant'}, {'role': 'user', 'content': "You have several answer candidates. They are:\n[' **I: Unsafe practices, Distress, Fear, Serious**\\n', ' The reasoning is overall plausible, logical, and correctly concludes that the answer is **Option I: Unsafe practices, Distress, Fear, Serious**.\\n']\nChoose the most common answer among the candidates.\nIf responses conflict, prioritize answers that provide clear reasoning or appear later\nFormat the answer as a single letter.\nDo not include any other units or punctuation.\nReturn your answer with only the single letter."}]
+[10:10:53 INFO]
+[Model Query] Token Usage: 
+Completion Tokens: 1 
+Prompt Tokens: 112 
+Total Tokens: 113
+[10:10:53 INFO]
+[Model Query] gpt4o response: I
--- a/puppeteer/logs/example/path0.log
+++ b/puppeteer/logs/example/path0.log
@ -0,0 +1,12 @@
+[10:09:53 INFO]
+------------------------------[Reasoning Path0 Start]------------------------------
+[10:09:53 INFO]
+Reasoning Path0:ReasoningState.INITIALIZED
+[10:09:53 INFO]
+Start agent: TerminatorAgent
+[10:09:53 INFO]
+Previous Agent sequence: TerminatorAgent
+[10:09:53 INFO]
+Updated workflow: <agent.agent_info.workflow.Workflow object at 0x2aea56b4ff90>
+[10:09:53 INFO]
+Updated global_info: {'path_id': 0, 'logger': <Logger path0_logger (INFO)>, 'workpath': 'logs/MMLU-Pro/2025-09-09-10-09-43', 'task': {'type': 'MMLU-Pro', 'Question': 'The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious', 'Answer': 'I', 'id': 70}, 'workflow': <agent.agent_info.workflow.Workflow object at 0x2aea56b4ff90>, 'url': None, 'file_name': None, 'file_extension': None, 'answers': [], 'code_path': '', 'env_exists': False, 'env_name': None, 'env': None, 'supervisor': None}
--- a/puppeteer/logs/example/path1.log
+++ b/puppeteer/logs/example/path1.log
@ -0,0 +1,108 @@
+[10:09:53 INFO]
+------------------------------[Reasoning Path1 Start]------------------------------
+[10:09:53 INFO]
+Reasoning Path1:ReasoningState.INITIALIZED
+[10:09:53 INFO]
+Start agent: CriticAgent_gpt4o
+[10:09:53 INFO]
+Previous Agent sequence: CriticAgent_gpt4o
+[10:09:53 INFO]
+[Action] {'action': 'critique', 'parameter': ''}
+
+[10:09:53 INFO]
+[System Prompt] You are an expert in critiquing. Your task is to critique the reasoning and solutions provided by others., and You work as a helpful AI assistant. 
+I will ask you a question. Answer this question using your coding and language skills.
+Now your question is: The following are multiple choice questions (with answers) about business.
+Typical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.
+A: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious
+Previously, you collected the some information about this question from some actions: []
+[Query] You need to critique the previous reasoning. You should complete your reasoning using this format: REASONING RESULT: [YOUR REASONING RESULT]. Conclude your answer using this format: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: [].* Consider the following when critiquing the reasoning: 1. Plausibility:
+
+[10:09:56 INFO]
+[Reasoning]: Since no reasoning was provided in this context with "Your previous reasoning was: []", there isn’t any substantive content to critique regarding the given multiple-choice question.
+
+However, I will critique the absence or missing reasoning content: 
+
+REASONING RESULT: The absence of prior reasoning means there was no provided logic or analysis related to the question. This represents a gap in thought since the question requires understanding and applying principles related to advertising ethics and regulations. Without reasoning, it is impossible to verify the plausibility, trace the logical progression, or evaluate the correctness of the reasoning leading to an answer.
+
+FINAL ANSWER: The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.
+[10:09:56 INFO]
+[Final Answer]:  The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.
+[10:09:56 INFO]
+[Reasoning Path]: Since no reasoning was provided in this context with "Your previous reasoning was: []", there isn’t any substantive content to critique regarding the given multiple-choice question.
+
+However, I will critique the absence or missing reasoning content: 
+
+REASONING RESULT: The absence of prior reasoning means there was no provided logic or analysis related to the question. This represents a gap in thought since the question requires understanding and applying principles related to advertising ethics and regulations. Without reasoning, it is impossible to verify the plausibility, trace the logical progression, or evaluate the correctness of the reasoning leading to an answer.
+
+FINAL ANSWER: The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.
+[10:09:56 INFO]
+----------------------------------------
+[10:09:56 INFO]
+Updated workflow: <agent.agent_info.workflow.Workflow object at 0x2aea57d0d910>
+[10:09:56 INFO]
+Updated global_info: {'path_id': 1, 'logger': <Logger path1_logger (INFO)>, 'workpath': 'logs/MMLU-Pro/2025-09-09-10-09-43', 'task': {'type': 'MMLU-Pro', 'Question': 'The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious', 'Answer': 'I', 'id': 70}, 'workflow': <agent.agent_info.workflow.Workflow object at 0x2aea57d0d910>, 'url': None, 'file_name': None, 'file_extension': None, 'answers': [' The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.'], 'code_path': '', 'env_exists': False, 'env_name': None, 'env': None, 'supervisor': None}
+[10:09:59 INFO]
+[Action] {'action': 'planning', 'parameter': ''}
+
+[10:09:59 INFO]
+[System Prompt] You are an expert in planning. Your task is to create detailed plans for achieving specific goals., and You work as a helpful AI assistant. 
+I will ask you a question. Answer this question using your coding and language skills.
+Now your question is: The following are multiple choice questions (with answers) about business.
+Typical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.
+A: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious
+Previously, you collected the some information about this question from some actions: []
+[Query] Decompose the question and plan the next steps to address the question. You should complete your planning using the following template: REASONING RESULT: [YOUR REASONING RESULT]. *Your previous reasoning was: ['Successful Action: critique\nResult: Since no reasoning was provided in this context with "Your previous reasoning was: []", there isn’t any substantive content to critique regarding the given multiple-choice question.\n\nHowever, I will critique the absence or missing reasoning content: \n\nREASONING RESULT: The absence of prior reasoning means there was no provided logic or analysis related to the question. This represents a gap in thought since the question requires understanding and applying principles related to advertising ethics and regulations. Without reasoning, it is impossible to verify the plausibility, trace the logical progression, or evaluate the correctness of the reasoning leading to an answer.\n\nFINAL ANSWER: The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.'].* Your planning should include: 
+
+[10:10:06 INFO]
+[Reasoning]: REASONING RESULT: The question aims to test understanding of ethical advertising practices as guided by regulatory bodies. The multiple choices provide various options related to recommended behaviors and prohibited actions in advertising. The objective of this task is to identify which answer aligns most closely with widely accepted principles of advertising ethics and regulations. To achieve this, it is essential to break down and evaluate the key components of the question and choices based on the logical alignment of terms like "safe practices," "distress," "jealousy," "fear," "offense," and their corresponding severity.
+
+**PLANNING STEPS:**
+
+1. **Understand Regulatory Advertising Standards:**
+   - Analyze the general expectations from advertising regulatory bodies, such as avoiding harm, promoting safety, and refraining from causing unnecessary offense or emotional responses.
+
+2. **Break Down the Key Terms in the Question:**
+   - Identify the implications of each term in the question: "safe practices," "distress," "jealousy," "fear," and "offense."
+   - Assess how these terms contribute to ethical advertising.
+
+3. **Evaluate Each Answer Choice:**
+   - Examine each choice systematically and determine whether it aligns with the principles of advertising standards.
+   - Eliminate options that deviate from these principles.
+
+4. **Identify the Correct Answer:**
+   - Select the choice that best fits the principles after careful analysis.
+
+5. **Finalize Conclusion:**
+   - Present the reasoning for the selected answer clearly and concisely.
+
+By following these steps, it will be possible to determine the correct answer with a logical and well-supported explanation.
+[10:10:06 INFO]
+[Reasoning Path]: REASONING RESULT: The question aims to test understanding of ethical advertising practices as guided by regulatory bodies. The multiple choices provide various options related to recommended behaviors and prohibited actions in advertising. The objective of this task is to identify which answer aligns most closely with widely accepted principles of advertising ethics and regulations. To achieve this, it is essential to break down and evaluate the key components of the question and choices based on the logical alignment of terms like "safe practices," "distress," "jealousy," "fear," "offense," and their corresponding severity.
+
+**PLANNING STEPS:**
+
+1. **Understand Regulatory Advertising Standards:**
+   - Analyze the general expectations from advertising regulatory bodies, such as avoiding harm, promoting safety, and refraining from causing unnecessary offense or emotional responses.
+
+2. **Break Down the Key Terms in the Question:**
+   - Identify the implications of each term in the question: "safe practices," "distress," "jealousy," "fear," and "offense."
+   - Assess how these terms contribute to ethical advertising.
+
+3. **Evaluate Each Answer Choice:**
+   - Examine each choice systematically and determine whether it aligns with the principles of advertising standards.
+   - Eliminate options that deviate from these principles.
+
+4. **Identify the Correct Answer:**
+   - Select the choice that best fits the principles after careful analysis.
+
+5. **Finalize Conclusion:**
+   - Present the reasoning for the selected answer clearly and concisely.
+
+By following these steps, it will be possible to determine the correct answer with a logical and well-supported explanation.
+[10:10:06 INFO]
+----------------------------------------
+[10:10:06 INFO]
+Updated workflow: <agent.agent_info.workflow.Workflow object at 0x2aea57d0d910>
+[10:10:06 INFO]
+Updated global_info: {'path_id': 1, 'logger': <Logger path1_logger (INFO)>, 'workpath': 'logs/MMLU-Pro/2025-09-09-10-09-43', 'task': {'type': 'MMLU-Pro', 'Question': 'The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious', 'Answer': 'I', 'id': 70}, 'workflow': <agent.agent_info.workflow.Workflow object at 0x2aea57d0d910>, 'url': None, 'file_name': None, 'file_extension': None, 'answers': [' The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.'], 'code_path': '', 'env_exists': False, 'env_name': None, 'env': None, 'supervisor': None}
--- a/puppeteer/logs/example/path2.log
+++ b/puppeteer/logs/example/path2.log
@ -0,0 +1,103 @@
+[10:09:53 INFO]
+------------------------------[Reasoning Path2 Start]------------------------------
+[10:09:53 INFO]
+Reasoning Path2:ReasoningState.INITIALIZED
+[10:09:53 INFO]
+Start agent: ConcluderAgent_gpt4o
+[10:09:53 INFO]
+Previous Agent sequence: ConcluderAgent_gpt4o
+[10:09:56 INFO]
+[Action] {'action': 'conclude', 'parameter': ''}
+
+[10:09:56 INFO]
+[System Prompt] You are an expert in concluding. Your task is to provide final conclusions based on the reasoning process., and You work as a helpful AI assistant. 
+I will ask you a question. Answer this question using your coding and language skills.
+Now your question is: The following are multiple choice questions (with answers) about business.
+Typical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.
+A: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious
+Previously, you collected the some information about this question from some actions: []
+[Query] You need to conclude the task and provide a final answer. You should finish your reasoning with the following template: REASONING RESULT: [YOUR REASONING RESULT]. Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: [].* You need to conclude the task and provide a final answer: 
+
+[10:09:59 INFO]
+[Reasoning]: Typical advertising regulatory bodies aim to ensure that advertisements are ethical and do not promote or encourage harmful behavior or emotions. They stress the importance of not encouraging unsafe practices, causing unnecessary distress, or promoting jealousy, and further ensuring that advertisements avoid causing serious offense. 
+
+Analyzing the provided options while aligning with these stated guidelines:
+1. The right combination should discourage unsafe practices, eliminate distress or jealousy, and avoid serious offense.  
+2. Therefore, Option **I (Unsafe practices, Distress, Fear, Serious)** fits perfectly with the context of advertising regulations.
+
+REASONING RESULT: The correct choice must ensure advertising standards do not promote unsafe practices, distress, fear, or serious offense.
+
+FINAL ANSWER: **I: Unsafe practices, Distress, Fear, Serious**
+[10:09:59 INFO]
+[Final Answer]:  **I: Unsafe practices, Distress, Fear, Serious**
+[10:09:59 INFO]
+[Reasoning Path]: Typical advertising regulatory bodies aim to ensure that advertisements are ethical and do not promote or encourage harmful behavior or emotions. They stress the importance of not encouraging unsafe practices, causing unnecessary distress, or promoting jealousy, and further ensuring that advertisements avoid causing serious offense. 
+
+Analyzing the provided options while aligning with these stated guidelines:
+1. The right combination should discourage unsafe practices, eliminate distress or jealousy, and avoid serious offense.  
+2. Therefore, Option **I (Unsafe practices, Distress, Fear, Serious)** fits perfectly with the context of advertising regulations.
+
+REASONING RESULT: The correct choice must ensure advertising standards do not promote unsafe practices, distress, fear, or serious offense.
+
+FINAL ANSWER: **I: Unsafe practices, Distress, Fear, Serious**
+[10:09:59 INFO]
+----------------------------------------
+[10:09:59 INFO]
+Updated workflow: <agent.agent_info.workflow.Workflow object at 0x2aea58599850>
+[10:09:59 INFO]
+Updated global_info: {'path_id': 2, 'logger': <Logger path2_logger (INFO)>, 'workpath': 'logs/MMLU-Pro/2025-09-09-10-09-43', 'task': {'type': 'MMLU-Pro', 'Question': 'The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious', 'Answer': 'I', 'id': 70}, 'workflow': <agent.agent_info.workflow.Workflow object at 0x2aea58599850>, 'url': None, 'file_name': None, 'file_extension': None, 'answers': [' **I: Unsafe practices, Distress, Fear, Serious**'], 'code_path': '', 'env_exists': False, 'env_name': None, 'env': None, 'supervisor': None}
+[10:10:16 INFO]
+[Action] {'action': 'run_python', 'parameter': 'def handle_advertising_regulation():\n    # Guidelines for typical advertising regulation\n    advert_rules = {\n        \'encourage\': \'illegal activities\',\n        \'cause_unnecessary\': [\'fear\', \'offense\'],\n        \'cause\': \'harm\' \n    }\n\n    # Printing out the rules\n    print("Typical advertising regulatory bodies suggest, for example, that adverts must not:")\n    print("- Encourage:", advert_rules[\'encourage\'])\n    print("- Cause unnecessary:", \', \'.join(advert_rules[\'cause_unnecessary\']))\n    print("- Must not cause:", advert_rules[\'cause\'])\n\n# Run the function\nhandle_advertising_regulation()'}
+
+[10:10:16 INFO]
+[Action Execution] run_python(def handle_advertising_regulation():
+    # Guidelines for typical advertising regulation
+    advert_rules = {
+        'encourage': 'illegal activities',
+        'cause_unnecessary': ['fear', 'offense'],
+        'cause': 'harm' 
+    }
+
+    # Printing out the rules
+    print("Typical advertising regulatory bodies suggest, for example, that adverts must not:")
+    print("- Encourage:", advert_rules['encourage'])
+    print("- Cause unnecessary:", ', '.join(advert_rules['cause_unnecessary']))
+    print("- Must not cause:", advert_rules['cause'])
+
+# Run the function
+handle_advertising_regulation())
+
+[10:10:21 INFO]
+[Run Python] Success: This function demonstrates typical advertising regulation guidelines.
+
+Advertising regulation guidelines:
+{'encourage': 'illegal activities', 'cause_unnecessary': ['fear', 'offense'], 'cause': 'harm'}
+
+Typical advertising regulatory bodies suggest, for example, that adverts must not:
+- Encourage: illegal activities
+- Cause unnecessary: fear, offense
+- Must not cause: harm
+
+[10:10:21 INFO]
+[System Prompt] You are an expert in Python programming. Your task is to run Python code and provide the results., and You work as a helpful AI assistant. 
+I will ask you a question. Answer this question using your coding and language skills.
+Now your question is: The following are multiple choice questions (with answers) about business.
+Typical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.
+A: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious
+Previously, you collected the some information about this question from some actions: []
+[Query] Choose the most common one and then format it.
+Format it using the fewest words possible, it should be a single letter, but do not alter the original answer.
+Do not include any other units or punctuation. 
+And finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
+Return your answer with only the single letter.
+
+[10:10:22 INFO]
+[Format to Final Answer]: FINAL ANSWER: C
+[10:10:22 INFO]
+[Final Answer]: C
+[10:10:22 INFO]
+----------------------------------------
+[10:10:22 INFO]
+Updated workflow: <agent.agent_info.workflow.Workflow object at 0x2aea58599850>
+[10:10:22 INFO]
+Updated global_info: {'path_id': 2, 'logger': <Logger path2_logger (INFO)>, 'workpath': 'logs/MMLU-Pro/2025-09-09-10-09-43', 'task': {'type': 'MMLU-Pro', 'Question': 'The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious', 'Answer': 'I', 'id': 70}, 'workflow': <agent.agent_info.workflow.Workflow object at 0x2aea58599850>, 'url': None, 'file_name': None, 'file_extension': None, 'answers': [' **I: Unsafe practices, Distress, Fear, Serious**', 'C'], 'code_path': '', 'env_exists': False, 'env_name': None, 'env': None, 'supervisor': None}
--- a/puppeteer/logs/example/path3.log
+++ b/puppeteer/logs/example/path3.log
@ -0,0 +1,44 @@
+[10:09:59 INFO]
+------------------------------[Reasoning Path3 Start]------------------------------
+[10:09:59 INFO]
+Reasoning Path3:ReasoningState.ANSWERING
+[10:09:59 INFO]
+Start agent: SummarizerAgent_gpt4o
+[10:09:59 INFO]
+Previous Agent sequence: CriticAgent_gpt4o
+[10:10:22 INFO]
+[Action] {'action': 'summarize', 'parameter': ''}
+
+[10:10:22 INFO]
+[System Prompt] You are an expert in summarizing. Your task is to summarize the information and provide concise conclusions., and You work as a helpful AI assistant. 
+I will ask you a question. Answer this question using your coding and language skills.
+Now your question is: The following are multiple choice questions (with answers) about business.
+Typical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.
+A: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious
+Previously, you collected the some information about this question from some actions: []
+[Query] You need to summarize those previous results and provide some intermediate conclusions. You should finish your reasoning with the following template: REASONING RESULT: [YOUR REASONING RESULT]. Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: ['Successful Action: critique\nResult: Since no reasoning was provided in this context with "Your previous reasoning was: []", there isn’t any substantive content to critique regarding the given multiple-choice question.\n\nHowever, I will critique the absence or missing reasoning content: \n\nREASONING RESULT: The absence of prior reasoning means there was no provided logic or analysis related to the question. This represents a gap in thought since the question requires understanding and applying principles related to advertising ethics and regulations. Without reasoning, it is impossible to verify the plausibility, trace the logical progression, or evaluate the correctness of the reasoning leading to an answer.\n\nFINAL ANSWER: The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.'].* You need to summarize the reasoning paths and provide a final conclusion:
+
+[10:10:27 INFO]
+[Reasoning]: SUMMARY OF REASONING PATH:
+
+The previous reasoning critiques the absence of prior analysis or logical conclusions related to the multiple-choice question about advertising ethics and regulations. It highlights that no reasoning was provided to understand or apply principles of advertising ethics, making it impossible to verify, trace, or evaluate the answer. This lack of explanation results in an incomplete reasoning process where plausibility and support for the chosen answer are absent.
+
+REASONING RESULT: The reasoning process lacked substantive content to connect advertising ethical guidelines with the correct multiple-choice answer. This incomplete pathway does not adequately justify the derived conclusion, leaving room for doubt about its accuracy and logical flow.
+
+FINAL ANSWER: The reasoning was incomplete due to a lack of prior explanation or analysis, leaving gaps in the logical and ethical framework necessary to determine the correct multiple-choice answer.
+[10:10:27 INFO]
+[Final Answer]:  The reasoning was incomplete due to a lack of prior explanation or analysis, leaving gaps in the logical and ethical framework necessary to determine the correct multiple-choice answer.
+[10:10:27 INFO]
+[Reasoning Path]: SUMMARY OF REASONING PATH:
+
+The previous reasoning critiques the absence of prior analysis or logical conclusions related to the multiple-choice question about advertising ethics and regulations. It highlights that no reasoning was provided to understand or apply principles of advertising ethics, making it impossible to verify, trace, or evaluate the answer. This lack of explanation results in an incomplete reasoning process where plausibility and support for the chosen answer are absent.
+
+REASONING RESULT: The reasoning process lacked substantive content to connect advertising ethical guidelines with the correct multiple-choice answer. This incomplete pathway does not adequately justify the derived conclusion, leaving room for doubt about its accuracy and logical flow.
+
+FINAL ANSWER: The reasoning was incomplete due to a lack of prior explanation or analysis, leaving gaps in the logical and ethical framework necessary to determine the correct multiple-choice answer.
+[10:10:27 INFO]
+----------------------------------------
+[10:10:27 INFO]
+Updated workflow: <agent.agent_info.workflow.Workflow object at 0x2aeab0dc4dd0>
+[10:10:27 INFO]
+Updated global_info: {'path_id': 3, 'logger': <Logger path3_logger (INFO)>, 'workpath': 'logs/MMLU-Pro/2025-09-09-10-09-43', 'task': {'type': 'MMLU-Pro', 'Question': 'The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious', 'Answer': 'I', 'id': 70}, 'workflow': <agent.agent_info.workflow.Workflow object at 0x2aeab0dc4dd0>, 'url': None, 'file_name': None, 'file_extension': None, 'answers': [' The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.', ' The reasoning was incomplete due to a lack of prior explanation or analysis, leaving gaps in the logical and ethical framework necessary to determine the correct multiple-choice answer.'], 'code_path': '', 'env_exists': False, 'env_name': None, 'env': None, 'supervisor': None}
--- a/puppeteer/logs/example/path4.log
+++ b/puppeteer/logs/example/path4.log
@ -0,0 +1,60 @@
+[10:09:59 INFO]
+------------------------------[Reasoning Path4 Start]------------------------------
+[10:09:59 INFO]
+Reasoning Path4:ReasoningState.ANSWERING
+[10:09:59 INFO]
+Start agent: Modifier_gpt4o
+[10:09:59 INFO]
+Previous Agent sequence: CriticAgent_gpt4o
+[10:10:27 INFO]
+[Action] {'action': 'modify', 'parameter': ''}
+
+[10:10:27 INFO]
+[System Prompt] You are an expert in error correction and modification. Your task is to identify errors in previous reasoning, explain why they are incorrect, and provide accurate corrections., and You work as a helpful AI assistant. 
+I will ask you a question. Answer this question using your coding and language skills.
+Now your question is: The following are multiple choice questions (with answers) about business.
+Typical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.
+A: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious
+Previously, you collected the some information about this question from some actions: []
+[Query] You need to identify and correct errors in the previous reasoning. You should finish your reasoning with the following template: REASONING RESULT: [Clearly state: 1. Which part of the previous reasoning was incorrect 2. Why it was incorrect 3. What is the correct understanding]. Finish your answer with the following template: FINAL ANSWER: [Provide the complete corrected answer]. *Your previous reasoning was: ['Successful Action: critique\nResult: Since no reasoning was provided in this context with "Your previous reasoning was: []", there isn’t any substantive content to critique regarding the given multiple-choice question.\n\nHowever, I will critique the absence or missing reasoning content: \n\nREASONING RESULT: The absence of prior reasoning means there was no provided logic or analysis related to the question. This represents a gap in thought since the question requires understanding and applying principles related to advertising ethics and regulations. Without reasoning, it is impossible to verify the plausibility, trace the logical progression, or evaluate the correctness of the reasoning leading to an answer.\n\nFINAL ANSWER: The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.'].* Please explicitly point out and correct any errors, misconceptions, or inaccuracies in the previous reasoning.
+
+[10:10:34 INFO]
+[Reasoning]: **REASONING RESULT:**
+
+1. **Which part of the previous reasoning was incorrect**: The previous reasoning pointed out the absence of logical analysis and reasoning for the multiple-choice question, acknowledging that no explanation was provided. While this was accurate, the critique itself failed to evaluate the central question and hypothesize or provide an answer based on the principle of advertising ethics. The previous reasoning did not attempt to outline potential answers or explain the ethical standards relevant to the question context.
+
+2. **Why it was incorrect**: Although the absence of reasoning was correctly identified, the response missed an opportunity to analyze the question itself and assess it against foundational advertising ethical principles. This omission makes the critique incomplete because the intention of identifying gaps is not only to recognize them but also to address them with accurate substitutions.
+
+3. **What is the correct understanding**: The correct approach would involve interpreting the question based on advertising regulatory principles. Most advertising codes of conduct, such as those outlined by organizations like the Advertising Standards Authority (ASA), emphasize that adverts must not encourage unsafe practices, cause unnecessary distress, fear, or exploitation of emotions such as jealousy. They should avoid causing serious or widespread offense. The correct answer to the question reflects a synthesis of these ethical guidelines.
+
+**FINAL ANSWER:**  
+The correct answer is:  
+**I: Unsafe practices, Distress, Fear, Serious.**  
+
+This answer aligns with standard advertising ethical guidelines, ensuring that adverts discourage unsafe behaviors, refrain from evoking unnecessary distress or fear, and prevent causing serious or widespread offense.
+[10:10:34 INFO]
+[Final Answer]: **  
+The correct answer is:  
+**I: Unsafe practices, Distress, Fear, Serious.**  
+
+This answer aligns with standard advertising ethical guidelines, ensuring that adverts discourage unsafe behaviors, refrain from evoking unnecessary distress or fear, and prevent causing serious or widespread offense.
+[10:10:34 INFO]
+[Reasoning Path]: **REASONING RESULT:**
+
+1. **Which part of the previous reasoning was incorrect**: The previous reasoning pointed out the absence of logical analysis and reasoning for the multiple-choice question, acknowledging that no explanation was provided. While this was accurate, the critique itself failed to evaluate the central question and hypothesize or provide an answer based on the principle of advertising ethics. The previous reasoning did not attempt to outline potential answers or explain the ethical standards relevant to the question context.
+
+2. **Why it was incorrect**: Although the absence of reasoning was correctly identified, the response missed an opportunity to analyze the question itself and assess it against foundational advertising ethical principles. This omission makes the critique incomplete because the intention of identifying gaps is not only to recognize them but also to address them with accurate substitutions.
+
+3. **What is the correct understanding**: The correct approach would involve interpreting the question based on advertising regulatory principles. Most advertising codes of conduct, such as those outlined by organizations like the Advertising Standards Authority (ASA), emphasize that adverts must not encourage unsafe practices, cause unnecessary distress, fear, or exploitation of emotions such as jealousy. They should avoid causing serious or widespread offense. The correct answer to the question reflects a synthesis of these ethical guidelines.
+
+**FINAL ANSWER:**  
+The correct answer is:  
+**I: Unsafe practices, Distress, Fear, Serious.**  
+
+This answer aligns with standard advertising ethical guidelines, ensuring that adverts discourage unsafe behaviors, refrain from evoking unnecessary distress or fear, and prevent causing serious or widespread offense.
+[10:10:34 INFO]
+----------------------------------------
+[10:10:34 INFO]
+Updated workflow: <agent.agent_info.workflow.Workflow object at 0x2aea56b7fad0>
+[10:10:34 INFO]
+Updated global_info: {'path_id': 4, 'logger': <Logger path4_logger (INFO)>, 'workpath': 'logs/MMLU-Pro/2025-09-09-10-09-43', 'task': {'type': 'MMLU-Pro', 'Question': 'The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious', 'Answer': 'I', 'id': 70}, 'workflow': <agent.agent_info.workflow.Workflow object at 0x2aea56b7fad0>, 'url': None, 'file_name': None, 'file_extension': None, 'answers': [' The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.', '**  \nThe correct answer is:  \n**I: Unsafe practices, Distress, Fear, Serious.**  \n\nThis answer aligns with standard advertising ethical guidelines, ensuring that adverts discourage unsafe behaviors, refrain from evoking unnecessary distress or fear, and prevent causing serious or widespread offense.'], 'code_path': '', 'env_exists': False, 'env_name': None, 'env': None, 'supervisor': None}
--- a/puppeteer/logs/example/path5.log
+++ b/puppeteer/logs/example/path5.log
@ -0,0 +1,40 @@
+[10:09:59 INFO]
+------------------------------[Reasoning Path5 Start]------------------------------
+[10:09:59 INFO]
+Reasoning Path5:ReasoningState.ANSWERING
+[10:09:59 INFO]
+Start agent: CriticAgent_gpt4o
+[10:09:59 INFO]
+Previous Agent sequence: ConcluderAgent_gpt4o
+[10:10:34 INFO]
+[Action] {'action': 'critique', 'parameter': ''}
+
+[10:10:34 INFO]
+[System Prompt] You are an expert in critiquing. Your task is to critique the reasoning and solutions provided by others., and You work as a helpful AI assistant. 
+I will ask you a question. Answer this question using your coding and language skills.
+Now your question is: The following are multiple choice questions (with answers) about business.
+Typical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.
+A: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious
+Previously, you collected the some information about this question from some actions: []
+[Query] You need to critique the previous reasoning. You should complete your reasoning using this format: REASONING RESULT: [YOUR REASONING RESULT]. Conclude your answer using this format: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: ['Successful Action: conclude\nResult: Typical advertising regulatory bodies aim to ensure that advertisements are ethical and do not promote or encourage harmful behavior or emotions. They stress the importance of not encouraging unsafe practices, causing unnecessary distress, or promoting jealousy, and further ensuring that advertisements avoid causing serious offense. \n\nAnalyzing the provided options while aligning with these stated guidelines:\n1. The right combination should discourage unsafe practices, eliminate distress or jealousy, and avoid serious offense.  \n2. Therefore, Option **I (Unsafe practices, Distress, Fear, Serious)** fits perfectly with the context of advertising regulations.\n\nREASONING RESULT: The correct choice must ensure advertising standards do not promote unsafe practices, distress, fear, or serious offense.\n\nFINAL ANSWER: **I: Unsafe practices, Distress, Fear, Serious**'].* Consider the following when critiquing the reasoning: 1. Plausibility:
+
+[10:10:39 INFO]
+[Reasoning]: REASONING RESULT: The previous reasoning effectively addresses the given problem. It aligns well with standard advertising principles, emphasizing ethical concerns such as avoiding unsafe practices, unnecessary distress, fear, jealousy, and serious offense. The reasoning evaluates each aspect of the regulatory guidelines and validates the option based on these principles. The conclusion, selecting **Option I** ("Unsafe practices, Distress, Fear, Serious"), is plausible and consistent with the ethical guidelines for advertising practices. The reasoning demonstrates a logical flow, examining the requirements and directly relating them to the selected option.
+
+However, the reasoning could be improved by briefly comparing other options and explicitly eliminating them by showing why they do not meet the outlined principles. This would make the argument more robust.
+
+FINAL ANSWER: The reasoning is overall plausible, logical, and correctly concludes that the answer is **Option I: Unsafe practices, Distress, Fear, Serious**.
+[10:10:39 INFO]
+[Final Answer]:  The reasoning is overall plausible, logical, and correctly concludes that the answer is **Option I: Unsafe practices, Distress, Fear, Serious**.
+[10:10:39 INFO]
+[Reasoning Path]: REASONING RESULT: The previous reasoning effectively addresses the given problem. It aligns well with standard advertising principles, emphasizing ethical concerns such as avoiding unsafe practices, unnecessary distress, fear, jealousy, and serious offense. The reasoning evaluates each aspect of the regulatory guidelines and validates the option based on these principles. The conclusion, selecting **Option I** ("Unsafe practices, Distress, Fear, Serious"), is plausible and consistent with the ethical guidelines for advertising practices. The reasoning demonstrates a logical flow, examining the requirements and directly relating them to the selected option.
+
+However, the reasoning could be improved by briefly comparing other options and explicitly eliminating them by showing why they do not meet the outlined principles. This would make the argument more robust.
+
+FINAL ANSWER: The reasoning is overall plausible, logical, and correctly concludes that the answer is **Option I: Unsafe practices, Distress, Fear, Serious**.
+[10:10:39 INFO]
+----------------------------------------
+[10:10:39 INFO]
+Updated workflow: <agent.agent_info.workflow.Workflow object at 0x2aea5558ec50>
+[10:10:39 INFO]
+Updated global_info: {'path_id': 5, 'logger': <Logger path5_logger (INFO)>, 'workpath': 'logs/MMLU-Pro/2025-09-09-10-09-43', 'task': {'type': 'MMLU-Pro', 'Question': 'The following are multiple choice questions (with answers) about business.\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\nA: Safe practices, Fear, Jealousy, Trivial B: Unsafe practices, Distress, Joy, Trivial C: Safe practices, Wants, Jealousy, Trivial D: Safe practices, Distress, Fear, Trivial E: Unsafe practices, Wants, Jealousy, Serious F: Safe practices, Distress, Jealousy, Serious G: Safe practices, Wants, Fear, Serious H: Unsafe practices, Wants, Fear, Trivial I: Unsafe practices, Distress, Fear, Serious', 'Answer': 'I', 'id': 70}, 'workflow': <agent.agent_info.workflow.Workflow object at 0x2aea5558ec50>, 'url': None, 'file_name': None, 'file_extension': None, 'answers': [' **I: Unsafe practices, Distress, Fear, Serious**', ' The reasoning is overall plausible, logical, and correctly concludes that the answer is **Option I: Unsafe practices, Distress, Fear, Serious**.'], 'code_path': '', 'env_exists': False, 'env_name': None, 'env': None, 'supervisor': None}
--- a/puppeteer/logs/example/path_0.jsonl
+++ b/puppeteer/logs/example/path_0.jsonl
@ -0,0 +1,17 @@
+[
+    {
+        "agent": "TerminatorAgent",
+        "action": {
+            "action": "terminate",
+            "parameter": ""
+        },
+        "cost": 0,
+        "tokens": 0,
+        "model_size": 175,
+        "result": {
+            "step_data": null,
+            "answer": null
+        },
+        "success": "Success"
+    }
+]
--- a/puppeteer/logs/example/path_1.jsonl
+++ b/puppeteer/logs/example/path_1.jsonl
@ -0,0 +1,32 @@
+[
+    {
+        "agent": "CriticAgent_gpt4o",
+        "action": {
+            "action": "critique",
+            "parameter": ""
+        },
+        "cost": 178000,
+        "tokens": 445,
+        "model_size": 200,
+        "result": {
+            "step_data": "Since no reasoning was provided in this context with \"Your previous reasoning was: []\", there isn’t any substantive content to critique regarding the given multiple-choice question.\n\nHowever, I will critique the absence or missing reasoning content: \n\nREASONING RESULT: The absence of prior reasoning means there was no provided logic or analysis related to the question. This represents a gap in thought since the question requires understanding and applying principles related to advertising ethics and regulations. Without reasoning, it is impossible to verify the plausibility, trace the logical progression, or evaluate the correctness of the reasoning leading to an answer.\n\nFINAL ANSWER: The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.",
+            "answer": " The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed."
+        },
+        "success": "Success"
+    },
+    {
+        "agent": "PlannerAgent_gpt4o",
+        "action": {
+            "action": "planning",
+            "parameter": ""
+        },
+        "cost": 306400,
+        "tokens": 766,
+        "model_size": 200,
+        "result": {
+            "step_data": "REASONING RESULT: The question aims to test understanding of ethical advertising practices as guided by regulatory bodies. The multiple choices provide various options related to recommended behaviors and prohibited actions in advertising. The objective of this task is to identify which answer aligns most closely with widely accepted principles of advertising ethics and regulations. To achieve this, it is essential to break down and evaluate the key components of the question and choices based on the logical alignment of terms like \"safe practices,\" \"distress,\" \"jealousy,\" \"fear,\" \"offense,\" and their corresponding severity.\n\n**PLANNING STEPS:**\n\n1. **Understand Regulatory Advertising Standards:**\n   - Analyze the general expectations from advertising regulatory bodies, such as avoiding harm, promoting safety, and refraining from causing unnecessary offense or emotional responses.\n\n2. **Break Down the Key Terms in the Question:**\n   - Identify the implications of each term in the question: \"safe practices,\" \"distress,\" \"jealousy,\" \"fear,\" and \"offense.\"\n   - Assess how these terms contribute to ethical advertising.\n\n3. **Evaluate Each Answer Choice:**\n   - Examine each choice systematically and determine whether it aligns with the principles of advertising standards.\n   - Eliminate options that deviate from these principles.\n\n4. **Identify the Correct Answer:**\n   - Select the choice that best fits the principles after careful analysis.\n\n5. **Finalize Conclusion:**\n   - Present the reasoning for the selected answer clearly and concisely.\n\nBy following these steps, it will be possible to determine the correct answer with a logical and well-supported explanation.",
+            "answer": " The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed."
+        },
+        "success": "Success"
+    }
+]
--- a/puppeteer/logs/example/path_2.jsonl
+++ b/puppeteer/logs/example/path_2.jsonl
@ -0,0 +1,32 @@
+[
+    {
+        "agent": "ConcluderAgent_gpt4o",
+        "action": {
+            "action": "conclude",
+            "parameter": ""
+        },
+        "cost": 180000,
+        "tokens": 450,
+        "model_size": 200,
+        "result": {
+            "step_data": "Typical advertising regulatory bodies aim to ensure that advertisements are ethical and do not promote or encourage harmful behavior or emotions. They stress the importance of not encouraging unsafe practices, causing unnecessary distress, or promoting jealousy, and further ensuring that advertisements avoid causing serious offense. \n\nAnalyzing the provided options while aligning with these stated guidelines:\n1. The right combination should discourage unsafe practices, eliminate distress or jealousy, and avoid serious offense.  \n2. Therefore, Option **I (Unsafe practices, Distress, Fear, Serious)** fits perfectly with the context of advertising regulations.\n\nREASONING RESULT: The correct choice must ensure advertising standards do not promote unsafe practices, distress, fear, or serious offense.\n\nFINAL ANSWER: **I: Unsafe practices, Distress, Fear, Serious**",
+            "answer": " **I: Unsafe practices, Distress, Fear, Serious**"
+        },
+        "success": "Success"
+    },
+    {
+        "agent": "PythonAgent_gpt4o",
+        "action": {
+            "action": "run_python",
+            "parameter": "def handle_advertising_regulation():\n    # Guidelines for typical advertising regulation\n    advert_rules = {\n        'encourage': 'illegal activities',\n        'cause_unnecessary': ['fear', 'offense'],\n        'cause': 'harm' \n    }\n\n    # Printing out the rules\n    print(\"Typical advertising regulatory bodies suggest, for example, that adverts must not:\")\n    print(\"- Encourage:\", advert_rules['encourage'])\n    print(\"- Cause unnecessary:\", ', '.join(advert_rules['cause_unnecessary']))\n    print(\"- Must not cause:\", advert_rules['cause'])\n\n# Run the function\nhandle_advertising_regulation()"
+        },
+        "cost": 336800,
+        "tokens": 842,
+        "model_size": 200,
+        "result": {
+            "step_data": "This function demonstrates typical advertising regulation guidelines.\n\nAdvertising regulation guidelines:\n{'encourage': 'illegal activities', 'cause_unnecessary': ['fear', 'offense'], 'cause': 'harm'}\n\nTypical advertising regulatory bodies suggest, for example, that adverts must not:\n- Encourage: illegal activities\n- Cause unnecessary: fear, offense\n- Must not cause: harm\n",
+            "answer": "C"
+        },
+        "success": "Success"
+    }
+]
--- a/puppeteer/logs/example/path_3.jsonl
+++ b/puppeteer/logs/example/path_3.jsonl
@ -0,0 +1,32 @@
+[
+    {
+        "agent": "CriticAgent_gpt4o",
+        "action": {
+            "action": "critique",
+            "parameter": ""
+        },
+        "cost": 178000,
+        "tokens": 445,
+        "model_size": 200,
+        "result": {
+            "step_data": "Since no reasoning was provided in this context with \"Your previous reasoning was: []\", there isn’t any substantive content to critique regarding the given multiple-choice question.\n\nHowever, I will critique the absence or missing reasoning content: \n\nREASONING RESULT: The absence of prior reasoning means there was no provided logic or analysis related to the question. This represents a gap in thought since the question requires understanding and applying principles related to advertising ethics and regulations. Without reasoning, it is impossible to verify the plausibility, trace the logical progression, or evaluate the correctness of the reasoning leading to an answer.\n\nFINAL ANSWER: The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.",
+            "answer": " The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed."
+        },
+        "success": "Success"
+    },
+    {
+        "agent": "SummarizerAgent_gpt4o",
+        "action": {
+            "action": "summarize",
+            "parameter": ""
+        },
+        "cost": 248800,
+        "tokens": 622,
+        "model_size": 200,
+        "result": {
+            "step_data": "SUMMARY OF REASONING PATH:\n\nThe previous reasoning critiques the absence of prior analysis or logical conclusions related to the multiple-choice question about advertising ethics and regulations. It highlights that no reasoning was provided to understand or apply principles of advertising ethics, making it impossible to verify, trace, or evaluate the answer. This lack of explanation results in an incomplete reasoning process where plausibility and support for the chosen answer are absent.\n\nREASONING RESULT: The reasoning process lacked substantive content to connect advertising ethical guidelines with the correct multiple-choice answer. This incomplete pathway does not adequately justify the derived conclusion, leaving room for doubt about its accuracy and logical flow.\n\nFINAL ANSWER: The reasoning was incomplete due to a lack of prior explanation or analysis, leaving gaps in the logical and ethical framework necessary to determine the correct multiple-choice answer.",
+            "answer": " The reasoning was incomplete due to a lack of prior explanation or analysis, leaving gaps in the logical and ethical framework necessary to determine the correct multiple-choice answer."
+        },
+        "success": "Success"
+    }
+]
--- a/puppeteer/logs/example/path_4.jsonl
+++ b/puppeteer/logs/example/path_4.jsonl
@ -0,0 +1,32 @@
+[
+    {
+        "agent": "CriticAgent_gpt4o",
+        "action": {
+            "action": "critique",
+            "parameter": ""
+        },
+        "cost": 178000,
+        "tokens": 445,
+        "model_size": 200,
+        "result": {
+            "step_data": "Since no reasoning was provided in this context with \"Your previous reasoning was: []\", there isn’t any substantive content to critique regarding the given multiple-choice question.\n\nHowever, I will critique the absence or missing reasoning content: \n\nREASONING RESULT: The absence of prior reasoning means there was no provided logic or analysis related to the question. This represents a gap in thought since the question requires understanding and applying principles related to advertising ethics and regulations. Without reasoning, it is impossible to verify the plausibility, trace the logical progression, or evaluate the correctness of the reasoning leading to an answer.\n\nFINAL ANSWER: The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed.",
+            "answer": " The reasoning is incomplete because no explanation was given, leaving crucial aspects of plausibility, support, and logical flow unaddressed."
+        },
+        "success": "Success"
+    },
+    {
+        "agent": "Modifier_gpt4o",
+        "action": {
+            "action": "modify",
+            "parameter": ""
+        },
+        "cost": 322800,
+        "tokens": 807,
+        "model_size": 200,
+        "result": {
+            "step_data": "**REASONING RESULT:**\n\n1. **Which part of the previous reasoning was incorrect**: The previous reasoning pointed out the absence of logical analysis and reasoning for the multiple-choice question, acknowledging that no explanation was provided. While this was accurate, the critique itself failed to evaluate the central question and hypothesize or provide an answer based on the principle of advertising ethics. The previous reasoning did not attempt to outline potential answers or explain the ethical standards relevant to the question context.\n\n2. **Why it was incorrect**: Although the absence of reasoning was correctly identified, the response missed an opportunity to analyze the question itself and assess it against foundational advertising ethical principles. This omission makes the critique incomplete because the intention of identifying gaps is not only to recognize them but also to address them with accurate substitutions.\n\n3. **What is the correct understanding**: The correct approach would involve interpreting the question based on advertising regulatory principles. Most advertising codes of conduct, such as those outlined by organizations like the Advertising Standards Authority (ASA), emphasize that adverts must not encourage unsafe practices, cause unnecessary distress, fear, or exploitation of emotions such as jealousy. They should avoid causing serious or widespread offense. The correct answer to the question reflects a synthesis of these ethical guidelines.\n\n**FINAL ANSWER:**  \nThe correct answer is:  \n**I: Unsafe practices, Distress, Fear, Serious.**  \n\nThis answer aligns with standard advertising ethical guidelines, ensuring that adverts discourage unsafe behaviors, refrain from evoking unnecessary distress or fear, and prevent causing serious or widespread offense.",
+            "answer": "**  \nThe correct answer is:  \n**I: Unsafe practices, Distress, Fear, Serious.**  \n\nThis answer aligns with standard advertising ethical guidelines, ensuring that adverts discourage unsafe behaviors, refrain from evoking unnecessary distress or fear, and prevent causing serious or widespread offense."
+        },
+        "success": "Success"
+    }
+]
--- a/puppeteer/logs/example/path_5.jsonl
+++ b/puppeteer/logs/example/path_5.jsonl
@ -0,0 +1,32 @@
+[
+    {
+        "agent": "ConcluderAgent_gpt4o",
+        "action": {
+            "action": "conclude",
+            "parameter": ""
+        },
+        "cost": 180000,
+        "tokens": 450,
+        "model_size": 200,
+        "result": {
+            "step_data": "Typical advertising regulatory bodies aim to ensure that advertisements are ethical and do not promote or encourage harmful behavior or emotions. They stress the importance of not encouraging unsafe practices, causing unnecessary distress, or promoting jealousy, and further ensuring that advertisements avoid causing serious offense. \n\nAnalyzing the provided options while aligning with these stated guidelines:\n1. The right combination should discourage unsafe practices, eliminate distress or jealousy, and avoid serious offense.  \n2. Therefore, Option **I (Unsafe practices, Distress, Fear, Serious)** fits perfectly with the context of advertising regulations.\n\nREASONING RESULT: The correct choice must ensure advertising standards do not promote unsafe practices, distress, fear, or serious offense.\n\nFINAL ANSWER: **I: Unsafe practices, Distress, Fear, Serious**",
+            "answer": " **I: Unsafe practices, Distress, Fear, Serious**"
+        },
+        "success": "Success"
+    },
+    {
+        "agent": "CriticAgent_gpt4o",
+        "action": {
+            "action": "critique",
+            "parameter": ""
+        },
+        "cost": 351600,
+        "tokens": 879,
+        "model_size": 200,
+        "result": {
+            "step_data": "REASONING RESULT: The previous reasoning effectively addresses the given problem. It aligns well with standard advertising principles, emphasizing ethical concerns such as avoiding unsafe practices, unnecessary distress, fear, jealousy, and serious offense. The reasoning evaluates each aspect of the regulatory guidelines and validates the option based on these principles. The conclusion, selecting **Option I** (\"Unsafe practices, Distress, Fear, Serious\"), is plausible and consistent with the ethical guidelines for advertising practices. The reasoning demonstrates a logical flow, examining the requirements and directly relating them to the selected option.\n\nHowever, the reasoning could be improved by briefly comparing other options and explicitly eliminating them by showing why they do not meet the outlined principles. This would make the argument more robust.\n\nFINAL ANSWER: The reasoning is overall plausible, logical, and correctly concludes that the answer is **Option I: Unsafe practices, Distress, Fear, Serious**.",
+            "answer": " The reasoning is overall plausible, logical, and correctly concludes that the answer is **Option I: Unsafe practices, Distress, Fear, Serious**."
+        },
+        "success": "Success"
+    }
+]
--- a/puppeteer/logs/example/train.log
+++ b/puppeteer/logs/example/train.log
--- a/puppeteer/logs/example/workflow_path_0.png
+++ b/puppeteer/logs/example/workflow_path_0.png
--- a/puppeteer/logs/example/workflow_path_1.png
+++ b/puppeteer/logs/example/workflow_path_1.png
--- a/puppeteer/logs/example/workflow_path_2.png
+++ b/puppeteer/logs/example/workflow_path_2.png
--- a/puppeteer/logs/example/workflow_path_3.png
+++ b/puppeteer/logs/example/workflow_path_3.png
--- a/puppeteer/logs/example/workflow_path_4.png
+++ b/puppeteer/logs/example/workflow_path_4.png
--- a/puppeteer/logs/example/workflow_path_5.png
+++ b/puppeteer/logs/example/workflow_path_5.png
--- a/puppeteer/main.py
+++ b/puppeteer/main.py
@ -0,0 +1,53 @@
+import argparse
+import os
+import json
+import yaml
+from tasks.runner import BenchmarkRunner
+from tasks.evaluator import BenchmarkEvaluator
+from tasks import mmlu_pro, gsm_hard, srdd, creative_writing
+
+def main():
+    parser = argparse.ArgumentParser(description="Run benchmark tasks")
+    parser.add_argument("task", choices=["MMLU-Pro", "gsm-hard", "SRDD", "CW"])
+    parser.add_argument("mode", choices=["validation", "test"])
+    parser.add_argument("--level", type=int, default=1)
+    parser.add_argument("--index", type=int, default=-1)
+    parser.add_argument("--data_limit", type=int, default=1)
+    parser.add_argument("--personas", type=str, default="personas/personas.jsonl")
+
+    args = parser.parse_args()
+
+    # load global config
+    with open("config/global.yaml", "r") as f:
+        global_config = yaml.safe_load(f)
+
+    runner = BenchmarkRunner(args.personas, global_config)
+    evaluator = BenchmarkEvaluator()
+
+    results_dir = os.path.join(os.getcwd(), "results", f"{args.task}_{args.mode}")
+    os.makedirs(results_dir, exist_ok=True)
+
+    # change policy.json
+    config_path = "config/policy.json"
+    with open(config_path, 'r') as f:
+        config = json.load(f)
+    config["dataset_name"] = args.task
+    config["dataset_mode"] = args.mode
+    config['paths']["checkpoint_path"] = f"checkpoint/{args.task}_{args.mode}"
+    with open(config_path, 'w') as f:
+        json.dump(config, f, indent=4)
+
+    task_map = {
+        "MMLU-Pro": mmlu_pro.run,
+        "gsm-hard": gsm_hard.run,
+        "SRDD": srdd.run,
+        "CW": creative_writing.run,
+    }
+
+    if args.task in task_map:
+        task_map[args.task](runner, evaluator, results_dir, args.mode, args.data_limit)
+    else:
+        print(f"Unknown task: {args.task}")
+
+if __name__ == "__main__":
+    main()
--- a/puppeteer/model/init.py
+++ b/puppeteer/model/init.py
@ -0,0 +1,18 @@
+from model.api_config import api_config
+global_openai_client = api_config.global_openai_client()
+
+from model.model_config import model_registry
+from model.query_manager import query_manager
+def _create_query_function(model_key: str):
+    def query_func(messages, system_prompt=None):
+        return query_manager.query(model_key, messages, system_prompt)
+    return query_func
+
+_generated_functions = {}
+for model_key, config in model_registry.get_all_models().items():
+    func = _create_query_function(model_key)
+    func.__name__ = config.function_name
+    _generated_functions[config.function_name] = func
+    globals()[config.function_name] = func
+
+__all__ = ['ModelQueryManager', 'query_manager'] + list(_generated_functions.keys()) 
--- a/puppeteer/model/api_config.py
+++ b/puppeteer/model/api_config.py
@ -0,0 +1,28 @@
+import yaml
+
+class APIConfig:
+    def __init__(self):
+        self._config = self._init_config()
+    
+    def _init_config(self):
+        global_config = yaml.safe_load(open("config/global.yaml", "r"))
+        key_config = {
+            "openai":{
+            "openai_api_key": global_config.get("api_keys").get("openai_api_key"),
+            "openai_base_url": global_config.get("api_keys").get("openai_base_url", None),
+            },
+            "retry_times": global_config.get("max_retry_times", 10),
+            "weight_path": global_config.get("model_weight_path")
+        }
+        return key_config
+
+    def get(self, provider: str) -> dict:
+        return self._config.get(provider, {})
+    
+    def global_openai_client(self):
+        from openai import OpenAI
+        api_key = self._config.get("openai").get("openai_api_key", None)
+        base_url = self._config.get("openai").get("openai_base_url", None)
+        return OpenAI(api_key=api_key, base_url=base_url)
+
+api_config = APIConfig()
--- a/puppeteer/model/embedding.py
+++ b/puppeteer/model/embedding.py
@ -0,0 +1,103 @@
+import os
+import openai
+import yaml
+from chromadb import EmbeddingFunction, Embeddings
+from model.model_utils import model_log_and_print
+from tenacity import retry, stop_after_attempt, wait_exponential
+from typing import List
+import torch
+import numpy as np
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+try:
+    with open("./config/global.yaml", "r", encoding="utf-8") as f:
+        GLOBAL_CONFIG = yaml.safe_load(f)
+except FileNotFoundError:
+    raise FileNotFoundError("Global config file './config/global.yaml' not found!")
+
+OPENAI_API_KEY = GLOBAL_CONFIG.get("api_keys", {}).get("openai_api_key")
+BASE_URL = GLOBAL_CONFIG.get("api_keys", {}).get("openai_base_url", None)
+MAX_RETRY_TIMES = GLOBAL_CONFIG.get("max_retry_times", 10)
+MODEL_WEIGHT_PATH = GLOBAL_CONFIG.get("model_weight_path")
+
+if BASE_URL:
+    client = openai.OpenAI(api_key=OPENAI_API_KEY, base_url=BASE_URL)
+else:
+    client = openai.OpenAI(api_key=OPENAI_API_KEY)
+
+
+class OpenAIEmbedding(EmbeddingFunction):
+    @staticmethod
+    @retry(wait=wait_exponential(min=5, max=10), stop=stop_after_attempt(MAX_RETRY_TIMES))
+    def get_embedding(text) -> Embeddings:
+        embedding_model = "text-embedding-ada-002"
+        model_log_and_print(f"[Embedding] embedding from {embedding_model}")
+
+        if isinstance(text, str):
+            text = [text.replace("\n", " ")]
+        else:
+            text = [t.replace("\n", " ") for t in text]
+
+        text = [t[:8191] if len(t) > 8191 else t for t in text]
+        text = ["none"] if all(len(t) == 0 for t in text) else text
+
+        model_log_and_print(f"[Embedding] {text}")
+
+        response = client.embeddings.create(input=text, model=embedding_model)
+        embeddings = [data.embedding for data in response.data]
+
+        prompt_tokens = response.usage.prompt_tokens
+        total_tokens = response.usage.total_tokens
+        model_log_and_print(f"[Embedding] Token Usage\nPrompt Tokens: {prompt_tokens}\nTotal Tokens: {total_tokens}")
+
+        return embeddings
+    
+    @property
+    def dim(self):
+        return 1536
+
+class RewardModelTokenRepresentation():
+    def __init__(self):
+        self.model_name = "nvidia/Llama-3.1-Nemotron-70B-Reward-HF"
+        self.model = AutoModelForCausalLM.from_pretrained(MODEL_WEIGHT_PATH, torch_dtype=torch.bfloat16, device_map="auto")
+        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_WEIGHT_PATH)
+        print("device: {}".format(self.model.device))
+        
+    
+    def truncate(self, messages):
+        length = sum(len(message["content"]) for message in messages)
+        
+        while length > 12000:
+            for message in messages:
+                message["content"] = message["content"][-int(len(message["content"]) * 0.75):]  
+            length = sum(len(message["content"]) for message in messages)  
+        
+        return messages
+
+    def __call__(self, messages:List):
+        with torch.no_grad():
+            messages = self.truncate(messages)
+            model_log_and_print("tokenizing")
+            model_log_and_print(messages)
+            tokenized_message = self.tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=False, return_tensors="pt", return_dict=True, max_length=4096)
+            model_log_and_print("tokenized done")
+            input_ids = tokenized_message['input_ids'].to('cuda')
+            attention_mask = tokenized_message['attention_mask'].to('cuda')
+            response_token_ids = self.model.generate(input_ids=input_ids,
+                                                attention_mask=attention_mask,  
+                                                max_new_tokens=1, 
+                                                return_dict_in_generate=True, 
+                                                output_scores=True,
+                                                output_logits=True,
+                                                output_hidden_states=True)
+            reward = response_token_ids['scores'][0][0][0].item()
+            hidden_states = response_token_ids.hidden_states  
+            state = hidden_states[0][-1]
+            last_state = state[:,-1,:]
+            print(reward)
+            return last_state, reward
+
+    @property
+    def dim(self):
+        return 8192
--- a/puppeteer/model/model_config.py
+++ b/puppeteer/model/model_config.py
@ -0,0 +1,87 @@
+from typing import Dict, Any, Optional, List
+from dataclasses import dataclass
+
+@dataclass
+class ModelConfig:
+    name: str
+    function_name: str     
+    api_model_name: str    
+    provider: str          
+    max_tokens: int      
+    model_size: int   # for open-source models, this is the number of parameters in millions; but for API models, this is just an estimate
+    url: Optional[str] = None      
+    temperature: float = 0.1       
+    description: str = ""          
+
+
+MODEL_REGISTRY: Dict[str, ModelConfig] = {
+    "gpt-3.5": ModelConfig(
+        name = "gpt-3.5",
+        function_name="query_gpt",
+        api_model_name="gpt-3.5-turbo",
+        provider="openai",
+        model_size=175,# which is estimated
+        max_tokens=4096,
+        description="OpenAI GPT-3.5 Turbo model"
+    ),
+    
+    "gpt-4o": ModelConfig(
+        name = "gpt-4o",
+        function_name="query_gpt4o",
+        api_model_name="gpt-4o",
+        provider="openai", 
+        model_size=200,# which is estimated
+        max_tokens=128000,
+        description="OpenAI GPT-4o model"
+    ),
+    "qwen-2.5-14b": ModelConfig(
+        name = "qwen-2.5-14b",
+        function_name="query_qwen2_5_14b",
+        api_model_name="Qwen/Qwen2.5-14B-Instruct",
+        provider="local",
+        model_size=14,
+        max_tokens=8192,    
+        url="http://",
+        description="Qwen 2.5 14B Instruct model deployed locally"
+    ),
+}
+
+class ModelRegistry:
+    def __init__(self):
+        self.registry = MODEL_REGISTRY.copy()
+    
+    def register_model(self, key: str, config: ModelConfig) -> None:
+        self.registry[key] = config
+    
+    def get_model_config(self, key: str) -> Optional[ModelConfig]:
+        return self.registry.get(key)
+
+    def get_model_size(self, key: str) -> Optional[int]:
+        config = self.get_model_config(key)
+        return config.model_size if config else None    
+    
+    def get_all_models(self) -> Dict[str, ModelConfig]:
+        return self.registry.copy()
+    
+    def get_models_by_provider(self, provider: str) -> Dict[str, ModelConfig]:
+        return {k: v for k, v in self.registry.items() if v.provider == provider}
+    
+    def get_function_name(self, key: str) -> Optional[str]:
+        config = self.get_model_config(key)
+        return config.function_name if config else None
+    
+    def get_api_model_name(self, key: str) -> Optional[str]:
+        config = self.get_model_config(key)
+        return config.api_model_name if config else None
+    
+    def list_available_models(self) -> List[str]:
+        return list(self.registry.keys())
+    
+    def search_models(self, keyword: str) -> Dict[str, ModelConfig]:
+        keyword = keyword.lower()
+        return {
+            k: v for k, v in self.registry.items() 
+            if keyword in k.lower() or keyword in v.display_name.lower()
+        }
+
+model_registry = ModelRegistry()
--- a/puppeteer/model/model_utils.py
+++ b/puppeteer/model/model_utils.py
@ -0,0 +1,96 @@
+from typing import Dict
+import logging
+from tenacity import retry
+from tenacity.stop import stop_after_attempt
+from tenacity.wait import wait_exponential
+
+logger = logging.getLogger("model")
+
+class APIConfig:
+    SLOW_FLAG = False 
+    TRUNCATE_FACTOR = 0
+
+def model_log_and_print(content):
+    if content is not None:
+        logger.info(content)
+        print(content)
+
+def truncate_messages(messages):
+    max_length = 0
+    max_index = 0
+    for i, msg in enumerate(messages):
+        if len(msg.get('content', '')) > max_length:
+            max_length = len(msg['content'])
+            max_index = i
+
+    content = messages[max_index]['content']
+    factor = 1/(2**APIConfig.TRUNCATE_FACTOR)
+    messages[max_index]['content'] = content[:int(len(content)*factor)]  
+    return messages
+
+
+def calc_max_token(messages, max_tokens):
+    string = "\n".join([str(message["content"]) for message in messages])
+    num_prompt_tokens = int(len(string)//1.8) # approximation of tokens number 
+    gap_between_send_receive = 15 * len(messages)
+    num_prompt_tokens += gap_between_send_receive
+
+    num_max_completion_tokens = max_tokens - num_prompt_tokens
+    logger.info(f"num_prompt_tokens: {num_prompt_tokens}, num_max_completion_tokens: {num_max_completion_tokens}")
+    if num_max_completion_tokens < 0:
+        logger.warning(f"num_max_completion_tokens is negative: {num_max_completion_tokens}")
+        return 0
+    return num_max_completion_tokens
+
+
+@retry(wait=wait_exponential(min=5, max=10), stop=stop_after_attempt(10))
+def chat_completion_request(messages, model, new_client, model_config_dict: Dict = None):
+    if model_config_dict is None:
+        model_config_dict = {
+            "temperature": 0.1,
+            "top_p": 1.0,
+            "n": 1,
+            "stream": False,
+            "frequency_penalty": 0.0,
+            "presence_penalty": 0.0,
+            "logit_bias": {},
+        }
+
+    json_data = {
+        "model": model,
+        "messages": messages,
+        "max_tokens": 4096,
+        "temperature": model_config_dict["temperature"],
+        "top_p": model_config_dict["top_p"],
+        "n": model_config_dict["n"],
+        "stream": model_config_dict["stream"],
+        "frequency_penalty": model_config_dict["frequency_penalty"],
+        "presence_penalty": model_config_dict["presence_penalty"],
+        "logit_bias": model_config_dict["logit_bias"],
+    }
+
+    try:
+        model_log_and_print("[Model Query] {}".format(messages))
+        if APIConfig.SLOW_FLAG:
+            messages = truncate_messages(messages=messages)
+
+        response = new_client.chat.completions.create(**json_data)
+
+        completion_tokens = response.usage.completion_tokens
+        prompt_tokens = response.usage.prompt_tokens
+        total_tokens = response.usage.total_tokens
+        if total_tokens == 0:
+            total_tokens = prompt_tokens + completion_tokens
+        if total_tokens == 0:
+            total_tokens = len(response.choices[0].message.content)//1.8
+        model_log_and_print(f"[Model Query] Token Usage: \nCompletion Tokens: {completion_tokens} \nPrompt Tokens: {prompt_tokens} \nTotal Tokens: {total_tokens}")
+        APIConfig.SLOW_FLAG = False
+        APIConfig.TRUNCATE_FACTOR = 0
+        return response, total_tokens   
+
+    except Exception as e:
+        print("Unable to generate ChatCompletion response. " + f"OpenAI calling Exception: {e}")
+        APIConfig.SLOW_FLAG = True
+        APIConfig.TRUNCATE_FACTOR += 1
+        model_log_and_print(f"[Model Query: ChatCompletion] query failed: {str(e)}")
+        raise Exception()
--- a/puppeteer/model/query_manager.py
+++ b/puppeteer/model/query_manager.py
@ -0,0 +1,83 @@
+from typing import List, Dict, Any, Optional, Tuple
+import yaml
+from model.model_config import  model_registry, ModelConfig
+from model.api_config import api_config
+from model.model_utils import chat_completion_request, model_log_and_print
+
+
+class ModelQueryManager:
+    def __init__(self):
+        self.registry = model_registry
+        self.config_manager = api_config
+        self.clients = {}
+        self._setup_clients()
+    
+    def _setup_clients(self):
+        from openai import OpenAI
+        for key, config in self.registry.get_all_models().items():
+            # set up client for openai models
+            if config.provider == "openai" and config.url is None:
+                api_key = self.config_manager.get("openai").get("openai_api_key", None)
+                base_url = self.config_manager.get("openai").get("openai_base_url", None)
+                self.clients[key] = OpenAI(api_key=api_key, base_url=base_url)
+            # set up client for local models
+            elif config.url:
+                self.clients[key] = OpenAI(api_key="none", base_url=config.url)
+    
+    def query(self, model_key: str, messages: List[Dict[str, str]], 
+              system_prompt: Optional[str] = None) -> Tuple[str, int]:
+        config = self.registry.get_model_config(model_key)
+        if not config:
+            available_models = ", ".join(self.registry.list_available_models())
+            raise ValueError(f"Unknown model: {model_key}. Available models: {available_models}")
+        
+        return self._query_with_config(messages, config, system_prompt)
+    
+    def _query_with_config(self, messages, config: ModelConfig,  system_prompt=None):
+        model_config_dict = {
+            "temperature": config.temperature,
+            "top_p": 1.0,
+            "n": 1,
+            "stream": False,
+            "frequency_penalty": 0.0,
+            "presence_penalty": 0.0,
+            "logit_bias": {},
+            "max_tokens": config.max_tokens
+        }
+
+        if not isinstance(messages, list):
+            system_prompt = "You are an assistant" if system_prompt is None else system_prompt
+            messages = [
+                {'role': 'system', 'content': system_prompt},
+                {'role': 'user', 'content': messages}
+            ]
+        response, total_tokens = chat_completion_request(
+            messages=messages,
+            model=config.api_model_name,  
+            new_client=self.clients.get(config.name),
+            model_config_dict=model_config_dict
+        )
+        
+        if isinstance(response, str):
+            return response, 1
+        
+        response_message = response.choices[0].message.content
+        return response_message, total_tokens
+    
+    
+    def get_available_models(self) -> List[str]:
+        return self.registry.list_available_models()
+    
+    def get_model_info(self, model_key: str) -> Optional[Dict[str, Any]]:
+        config = self.registry.get_model_config(model_key)
+        if config:
+            return {
+                "function_name": config.function_name,
+                "api_model_name": config.api_model_name,
+                "provider": config.provider,
+                "max_tokens": config.max_tokens,
+                "description": config.description
+            }
+        return None
+
+query_manager = ModelQueryManager()
--- a/puppeteer/personas/personas.jsonl
+++ b/puppeteer/personas/personas.jsonl
@ -0,0 +1,14 @@
+{"name": "FileAgent", "role_prompt":"You are an expert in file handling. Your task is to read files and extract relevant information.", "model_type":"gpt-3.5","actions":["read_file"],"agent_type":"reasoning","policy":"autonomous"}
+{"name": "ArxivAgent", "role_prompt":"You are an expert in academic research. Your task is to search for relevant papers on arXiv.", "model_type":"gpt-3.5","actions":["search_arxiv"],"agent_type":"reasoning","policy":"autonomous"}
+{"name": "BingAgent", "role_prompt":"You are an expert in web search. Your task is to search for relevant information using Bing.", "model_type":"gpt-3.5","actions":["search_bing"],"agent_type":"reasoning","policy":"autonomous"}
+{"name": "WebsiteAgent", "role_prompt":"You are an expert in accessing and extracting information from websites. Your task is to access specific websites and gather relevant information.", "model_type":"gpt-3.5","actions":["access_website"],"agent_type":"reasoning","policy":"autonomous"}
+{"name": "TerminatorAgent", "role_prompt":"You are an expert in terminating processes. Your task is to determine when the reasoning process should be terminated and provide the final answer.", "model_type":"gpt-3.5","actions":["terminate"],"agent_type":"reasoning","policy":"autonomous"}
+{"name": "PythonAgent_gpt4o", "role_prompt":"You are an expert in Python programming. Your task is to run Python code and provide the results.", "model_type":"gpt-4o","actions":["run_python"],"agent_type":"reasoning","policy":"autonomous"}
+{"name": "PlannerAgent_gpt4o", "role_prompt":"You are an expert in planning. Your task is to create detailed plans for achieving specific goals.", "model_type":"gpt-4o","actions":["planning"],"agent_type":"reasoning","policy":"autonomous"}
+{"name": "ReasoningAgent_gpt4o", "role_prompt":"You are an expert in logical reasoning. Your task is to reason through complex problems and provide well-thought-out solutions.", "model_type":"gpt-4o","actions":["reasoning"],"agent_type":"reasoning","policy":"autonomous"}
+{"name": "CriticAgent_gpt4o", "role_prompt":"You are an expert in critiquing. Your task is to critique the reasoning and solutions provided by others.", "model_type":"gpt-4o","actions":["critique"],"agent_type":"reasoning","policy":"autonomous"}
+{"name": "ReflectAgent_gpt4o", "role_prompt":"You are an expert in reflection. Your task is to reflect on the reasoning process and provide insights for improvement.", "model_type":"gpt-4o","actions":["reflect"],"agent_type":"reasoning","policy":"autonomous"}
+{"name": "QuestionAgent_gpt4o", "role_prompt":"You are an expert in questioning. Your task is to propose relevant sub-questions that help in solving the main problem.", "model_type":"gpt-4o","actions":["question"],"agent_type":"reasoning","policy":"autonomous"}
+{"name": "SummarizerAgent_gpt4o", "role_prompt":"You are an expert in summarizing. Your task is to summarize the information and provide concise conclusions.", "model_type":"gpt-4o","actions":["summarize"],"agent_type":"reasoning","policy":"autonomous"}
+{"name": "ConcluderAgent_gpt4o", "role_prompt":"You are an expert in concluding. Your task is to provide final conclusions based on the reasoning process.", "model_type":"gpt-4o","actions":["conclude"],"agent_type":"reasoning","policy":"autonomous"}
+{"name": "Modifier_gpt4o", "role_prompt":"You are an expert in error correction and modification. Your task is to identify errors in previous reasoning, explain why they are incorrect, and provide accurate corrections.", "model_type":"gpt-4o","actions":["modify"],"agent_type":"reasoning","policy":"autonomous"}
--- a/puppeteer/prompts/general/actions_external_tools.jsonl
+++ b/puppeteer/prompts/general/actions_external_tools.jsonl
@ -0,0 +1,5 @@
+{"action": "search_arxiv", "prompt": "You have chosen to search for academic papers on arXiv. Please provide specific terms related to academic research, such as the title of a paper, keywords, or topics in fields like physics, mathematics, computer science, or machine learning. Return in json format. Example: {\"action\": \"search_arxiv\", \"parameter\": \"quantum computing\"}"}
+{"action": "search_bing", "prompt": "You have chosen to search for information using Bing. Please provide descriptive phrases or keywords related to your query, including concepts, names, events, or specific questions to get a broad range of results, including news, articles, and websites. Return in json format. Example: {\"action\": \"search_bing\", \"parameter\": \"latest advancements in AI\"}"}
+{"action": "access_website", "prompt": "You have chosen to access a website. Please provide the URL you want to access or the URL most relevant to the current question. Return in json format. Example: {\"action\": \"access_website\", \"parameter\": \"https://www.example.com\"}"}
+{"action": "run_python", "prompt": "You have chosen to write and run Python code. Please write generic Python code in the parameter to solve this type of problems using only standard python libraries. Make sure you use the 'print' function for all output when relevant. Return in json format. Example: {\"action\": \"run_python\", \"parameter\": \"print('Hello, World!')\"}"}
+{"action": "read_file", "prompt": "You have chosen to read a file. Please provide the filename you want to read. Return in json format. Example: {\"action\": \"read_file\", \"parameter\": \"data.txt\"}"}
--- a/puppeteer/prompts/general/actions_reasoning.jsonl
+++ b/puppeteer/prompts/general/actions_reasoning.jsonl
@ -0,0 +1,8 @@
+{"action": "planning","prompt": "Decompose the question and plan the next steps to address the question. You should complete your planning using the following template: REASONING RESULT: [YOUR REASONING RESULT]. *Your previous reasoning was: {}.* Your planning should include: "}
+{"action": "reasoning","prompt": "Now, you need to continue the reasoning to get closer to the correct answer. You should finish your reasoning with the following template: REASONING RESULT: [YOUR REASONING RESULT]. Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: {}.* You need to follow the direction of the reasoning path and go forward:"}
+{"action": "critique","prompt": "You need to critique the previous reasoning. You should complete your reasoning using this format: REASONING RESULT: [YOUR REASONING RESULT]. Conclude your answer using this format: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: {}.* Consider the following when critiquing the reasoning: 1. Plausibility:"}
+{"action": "reflect","prompt": "You will be provided with a previous reasoning attempt where you had access to relevant context and were tasked with answering a question. The attempt was unsuccessful either due to an incorrect answer or a phrasing mismatch with the answer key. In a few sentences, diagnose the potential cause of failure or phrasing discrepancy, and outline a new, concise, high-level plan to prevent the same issue from recurring. Use complete sentences. Reflect on the current state of the task and propose the next steps. You should conclude your reasoning using this template: REASONING RESULT: [YOUR REASONING RESULT]. Conclude your reflection using this template: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: {}.* Your reflection:"}
+{"action": "question","prompt": "Your task is to propose the next sub-question along with its answer. You should ensure that the sub-question logically follows from the previous reasoning and addresses any gaps. You should provide a well-reasoned answer to the sub-question, supported by evidence or logical arguments. You should conclude your reasoning using this template: REASONING RESULT: [YOUR REASONING RESULT]. Conclude your answer using this template: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: {}.* Now, ask a sub-question and try to answer it:"}
+{"action": "summarize","prompt": "You need to summarize those previous results and provide some intermediate conclusions. You should finish your reasoning with the following template: REASONING RESULT: [YOUR REASONING RESULT]. Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: {}.* You need to summarize the reasoning paths and provide a final conclusion:"}
+{"action": "conclude","prompt": "You need to conclude the task and provide a final answer. You should finish your reasoning with the following template: REASONING RESULT: [YOUR REASONING RESULT]. Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. *Your previous reasoning was: {}.* You need to conclude the task and provide a final answer: "}
+{"action": "modify", "prompt": "You need to identify and correct errors in the previous reasoning. You should finish your reasoning with the following template: REASONING RESULT: [Clearly state: 1. Which part of the previous reasoning was incorrect 2. Why it was incorrect 3. What is the correct understanding]. Finish your answer with the following template: FINAL ANSWER: [Provide the complete corrected answer]. *Your previous reasoning was: {}.* Please explicitly point out and correct any errors, misconceptions, or inaccuracies in the previous reasoning."}
--- a/puppeteer/prompts/general/answer_prompt.json
+++ b/puppeteer/prompts/general/answer_prompt.json
@ -0,0 +1,76 @@
+{
+    "answer": [
+        "When you find an answer, use as few words as possible, like a single number or a single word. And finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. ",
+        "YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.",
+        "If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. Do not include any units or punctuation.",
+        "If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.",
+        "If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.",
+        "Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]"
+    ],
+    "MMLU_answer": [
+        "Choose the most common one and then format it.",
+        "Format it using the fewest words possible, it should be a single letter, but do not alter the original answer.",
+        "Do not include any other units or punctuation. ",
+        "And finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]",
+        "Return your answer with only the single letter."
+    ],
+    "gsm_answer": [
+        "Calculate carefully and provide only the final numerical answer.",
+        "The answer must be a pure number - either an integer or decimal number.",
+        "Do not include any units, symbols, or text in your answer.",
+        "Do not use commas in numbers.",
+        "Do not use scientific notation (e.g. 1e6).",
+        "Do not round the number unless specifically requested.",
+        "Use plain decimal format with decimal point if needed.",
+        "For large numbers write all digits without abbreviation.",
+        "Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]"
+    ],
+    "code_answer": [
+        "Provide the code in its most complete and executable form.",
+        "Ensure the code includes necessary libraries, imports, and setup.",
+        "Ensure the code is modular and follows good software design practices.",
+        "Avoid hardcoding values unless explicitly required.",
+        "Include proper error handling and edge case considerations.",
+        "If applicable, make sure the code is optimized for performance.",
+        "Do not add unnecessary comments unless required for clarification.",
+        "Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]"
+    ],
+    "text_answer": [
+        "Provide a clear and concise answer.",
+        "Ensure your answer is well-structured and logically organized.",
+        "Include all relevant information and avoid unnecessary details.",
+        "Use proper grammar, spelling, and punctuation.",
+        "Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]"
+    ],
+    "answer_aggregation": [
+        "You have several answer candidates. They are:\n{}",
+        "Critically evaluate these responses, considering potential biases or inaccuracies.",
+        "Choose the most reliable answer based on logical reasoning and factual accuracy.",
+        "If responses conflict, prioritize answers that provide clear reasoning or align with verifiable facts.",
+        "Format the chosen answer using the fewest words possible, but do not alter its core meaning.",
+        "YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.",
+        "If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.",
+        "If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.",
+        "If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."
+    ],
+    "MMLU_aggregation": [
+        "You have several answer candidates. They are:\n{}",
+        "Choose the most common answer among the candidates.",
+        "If responses conflict, prioritize answers that provide clear reasoning or appear later",
+        "Format the answer as a single letter.",
+        "Do not include any other units or punctuation.",
+        "Return your answer with only the single letter."
+    ],
+    "gsm_aggregation": [
+        "You have several answer candidates. They are:\n{}",
+        "Choose the most reliable answer.",
+        "If responses conflict, prioritize answers that provide clear reasoning or appear later",
+        "The answer must be a pure number - either an integer or decimal number.",
+        "Do not include any units, symbols, or text in your answer.",
+        "Do not use commas in numbers.",
+        "Do not use scientific notation.",
+        "Do not round the number unless specifically requested.",
+        "Use plain decimal format with decimal point if needed.",
+        "For large numbers write all digits without abbreviation."
+    ]
+}
--- a/puppeteer/prompts/general/system_prompt.json
+++ b/puppeteer/prompts/general/system_prompt.json
@ -0,0 +1,8 @@
+{
+    "system_prompt": [
+        "{}, and You work as a helpful AI assistant. ",
+        "I will ask you a question. Answer this question using your coding and language skills.",
+        "Now your question is: {}",
+        "Previously, you collected the some information about this question from some actions: {}"
+    ]
+}
--- a/puppeteer/tasks/base/base_task.py
+++ b/puppeteer/tasks/base/base_task.py
@ -0,0 +1,18 @@
+import os
+from utils.file_utils import write_jsonl
+
+class BaseTask:
+    def __init__(self, runner, evaluator):
+        self.runner = runner
+        self.evaluator = evaluator
+
+    def write_result(self, fd, task_id, final_ans, true_ans=None, flag=None):
+        record = {
+            "task_id": task_id,
+            "final_ans": final_ans,
+        }
+        if true_ans is not None:
+            record["true_ans"] = true_ans
+        if flag is not None:
+            record["flag"] = flag
+        write_jsonl(fd, record)
--- a/puppeteer/tasks/creative_writing.py
+++ b/puppeteer/tasks/creative_writing.py
@ -0,0 +1,34 @@
+import os
+import json
+from tqdm import tqdm
+
+def load_dataset(data_limit=None):
+    path = "./data/CW/creative_writing.jsonl"
+    with open(path, "r", encoding="utf-8") as f:
+        data = [json.loads(line) for line in f]
+    return data[:data_limit] if data_limit else data
+
+def format_question(q, idx):
+    question = "Concepts: " + ", ".join(q["concepts"]) + \
+               "\nGenerate a sentence including all key concepts, grammatically correct and coherent."
+    return {
+        "type": "CW",
+        "Question": question,
+        "id": idx,
+        "concepts": q["concepts"]
+    }
+
+def run(runner, evaluator, results_dir, mode, data_limit=None):
+    dataset = load_dataset(data_limit)
+    result_path = os.path.join(results_dir, "cw.jsonl")
+
+    with open(result_path, "w", encoding="utf-8") as fd:
+        for idx, q in enumerate(tqdm(dataset)):
+            task = format_question(q, idx)
+            final_ans = runner.run_reasoning(task)
+
+            record = {
+            "id": task["id"],
+            "pred": final_ans
+            }
+            fd.write(json.dumps(record, ensure_ascii=False) + "\n")
--- a/puppeteer/tasks/evaluator.py
+++ b/puppeteer/tasks/evaluator.py
@ -0,0 +1,349 @@
+import subprocess
+import time
+import torch
+import numpy as np
+import re
+import os
+import signal
+import math
+
+from model import query_gpt
+from model.embedding import OpenAIEmbedding
+from utils.file_utils import read_code, read_text
+
+FLOAT_TOLERANCE = 1e-3
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        
+
+class BenchmarkEvaluator:
+    @staticmethod
+    def commongen_coverage(concepts, text_path):
+        generated_text = read_text(text_path)
+        generated_text = generated_text.lower()
+        concepts = [concept.lower() for concept in concepts]
+        missing_concepts = [concept for concept in concepts if not re.search(rf'\b{re.escape(concept)}\b', generated_text, re.IGNORECASE)]
+        if missing_concepts:
+            return 1 - len(missing_concepts)/len(concepts)
+        return 1
+
+    @staticmethod
+    def commongen_gpt_score(concepts, text_path):
+        generated_text = read_text(text_path)
+        prompt = '''
+        As a strict StoryMaster, your task is to meticulously evaluate the quality of stories across three primary dimensions: Grammar and Fluency, Context Relevance, and Logic Consistency. Each dimension will be rated on a refined scale from 1 (average) to 4 (perfect), ensuring that only stories of superior quality achieve the highest scores.
+
+        Implement Your Evaluation Mechanism with Enhanced Rigor:
+
+        Grammar and Fluency (Assess the story's linguistic precision and narrative flow):
+        Score 1 (solid): The story is free of grammatical errors, but the narrative lacks the stylistic variety and eloquence that elevate writing to a higher tier.
+        Score 2 (proficient): The narrative demonstrates a strong command of grammar and a coherent flow, yet it does not showcase the level of linguistic artistry found in superior works.
+        Score 3 (excellent): The story exhibits a refined sense of grammar and a compelling narrative flow, with sentence structures that are engaging and demonstrate a high level of craft.
+        Score 4 (masterful): The story is a testament to linguistic excellence, with sentence structures that are not only clear and elegant but also exhibit a creative and sophisticated use of language that captivates and inspires.
+
+        Context Relevance (Examine the coherence, interconnectedness, and depth of content within the story):
+        Score 1 (solid): The story establishes a basic framework of context relevance, but it does not delve into the intricacies of character and thematic development that enrich the narrative.
+        Score 2 (proficient): The narrative demonstrates a clear connection between elements, yet it lacks the depth and multi-layered content that would distinguish it as truly exceptional.
+        Score 3 (excellent): The story interweaves elements with a high degree of relevance, creating a narrative that is coherent and features content that is well-developed and insightful.
+        Score 4 (masterful): The story achieves an extraordinary level of context relevance, with every element artfully woven into a narrative that is not only coherent but also profound in its exploration of themes and characters, offering a rich and immersive experience.
+
+        Logic Consistency (Scrutinize the narrative for logical integrity and internal consistency):
+        Score 1 (solid): The story maintains a logical structure, but there may be occasional lapses in plausibility or minor inconsistencies that slightly undermine its credibility.
+        Score 2 (proficient): The narrative is generally logical, with a clear progression of events and character actions, yet it does not reach the level of seamless consistency expected of a superior story.
+        Score 3 (excellent): The story exhibits a strong logical consistency, with events and character actions that are well-aligned and plausible, contributing to a coherent and believable plot.
+        Score 4 (masterful): The story is characterized by impeccable logical consistency, with every event and character action meticulously aligned to create a plot that is not only coherent but also demonstrates a deep understanding of causality and human behavior.'''
+
+        prompt += '\nStory:\n' + generated_text
+        response_text, _ = query_gpt(prompt)
+        pattern = r'\d+'
+        remedy_prompt = 'Extract the score in each dimension in format: (Grammar and Fluency Score: X. Context Relevance Score: X. Logic Consistency Score: X. Overall Score Score: X.) of the following content.'
+        remedy_prompt += response_text
+        remedy_respond,_ = query_gpt(remedy_prompt)
+        score_list = re.findall(pattern, remedy_respond)
+        my_float_list = [float(item) for item in score_list]
+        score_list = [item/4 for item in my_float_list]
+        score_list = score_list[:3]
+        while len(score_list) != 3:
+            score_list.append(0)
+        return score_list
+
+    @staticmethod
+    def check_commongen(concepts, text_path):
+        # Metric implementation inspired by self-refine project:
+        # https://github.com/madaan/self-refine/tree/main/src/commongen
+        coverage = BenchmarkEvaluator.commongen_coverage(concepts, text_path)
+        coverage = torch.tensor(coverage, dtype=torch.float32, device=DEVICE)  
+        scores = BenchmarkEvaluator.commongen_gpt_score(concepts, text_path)
+        grammar = torch.tensor(scores[0], dtype=torch.float32, device=DEVICE)  
+        relevance = torch.tensor(scores[1], dtype=torch.float32, device=DEVICE)  
+        consistency = torch.tensor(scores[2], dtype=torch.float32, device=DEVICE)  
+        metrics = {"grammar": grammar, "relevance": relevance, "consistency": consistency, "coverage": coverage}
+        mean_score = torch.tensor(sum(scores) / 3, dtype=torch.float32, device=DEVICE)    
+        if coverage == 0:
+            return -1.0, metrics
+        else:
+            return coverage*mean_score, metrics
+        
+    
+    @staticmethod
+    def check_srdd(code_path, text):
+        # Metric implementation inspired by ChatDev project:
+        # https://github.com/OpenBMB/ChatDev
+        path = code_path
+        code = read_code(path)
+        consistency = BenchmarkEvaluator.srdd_consistency(text, code)
+        completeness = BenchmarkEvaluator.srdd_completeness(code)
+        executability, _ = BenchmarkEvaluator.srdd_executability(path)
+        executability = 1 if executability else 0
+        executability = torch.tensor(executability, dtype=torch.float32, device=DEVICE)  
+        consistency = torch.tensor(consistency, dtype=torch.float32, device=DEVICE)  
+        completeness = torch.tensor(completeness, dtype=torch.float32, device=DEVICE)  
+        metrics = {"consistency": consistency, "completeness": completeness, "executability": executability}
+        if executability:
+            alignment = consistency * completeness
+            return alignment, metrics
+        else:
+            return -1.0, metrics
+    
+    @staticmethod
+    def srdd_consistency(text, code):
+        code = BenchmarkEvaluator.remove_comments(code)
+        text = re.sub(r'^[^\n]*\n', '', text)
+        text_embedding = OpenAIEmbedding.get_embedding(text)
+        code_embedding = OpenAIEmbedding.get_embedding(code)
+        similarity = BenchmarkEvaluator.get_cosine_similarity(text_embedding, code_embedding)
+        return similarity
+
+    @staticmethod
+    def srdd_completeness(code):
+        lines = code.split("\n")
+        lines = [line for line in lines if
+                "password" not in line.lower() and "passenger" not in line.lower() and "passed" not in line.lower() and "passes" not in line.lower()]
+        lines = [line for line in lines if "pass" in line.lower() or "todo" in line.lower()]
+        if len(lines) > 0:
+            return 0.0
+        return 1.0
+
+    @staticmethod 
+    def srdd_executability(work_path):
+        def robust_kill(process):
+            """Robustly kill the process based on the OS."""
+            if process.poll() is None:  # Check if the process is still running
+                if os.name == 'nt':  # For Windows
+                    os.kill(process.pid, signal.SIGTERM)
+                    time.sleep(1)  
+                    if process.poll() is None:  
+                        os.kill(process.pid, signal.CTRL_BREAK_EVENT)
+                else:  # For Linux/macOS
+                    os.killpg(os.getpgid(process.pid), signal.SIGTERM)  
+                    time.sleep(1)  
+                    if process.poll() is None:  
+                        os.killpg(os.getpgid(process.pid), signal.SIGKILL)
+        try:
+            if not os.path.exists(work_path):
+                return False, "The file path does not exist."
+            if os.name == 'nt':  
+                command = f" python {work_path}"
+                process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                                            creationflags=subprocess.CREATE_NEW_PROCESS_GROUP)
+            else:  
+                command = f"python3 {work_path}"
+                process = subprocess.Popen(command, shell=True, preexec_fn=os.setsid, stdout=subprocess.PIPE,
+                                            stderr=subprocess.PIPE)
+
+            try:
+                out, err = process.communicate(timeout=10)
+            except subprocess.TimeoutExpired:
+                robust_kill(process)
+                return True, "The process completes without encountering any errors."
+
+            return_code = process.returncode
+            output = out.decode('utf-8', errors='ignore')
+            error_output = err.decode('utf-8', errors='ignore')
+
+            # If the process is still running after the timeout
+            if process.poll() is None:
+                robust_kill(process)  
+            return_code = process.returncode
+
+            # Handle return code and output
+            if return_code == 0:
+                # Clean up file paths in the output for readability
+                work_path = os.getcwd()
+                output = output.replace(work_path, "")
+                return True, output
+            else:
+                # Handle errors in the output
+                if error_output:
+                    work_path = os.getcwd()
+                    if "Traceback".lower() in error_output.lower():
+                        errs = error_output.replace(work_path + "/", "").replace(work_path, "")
+                        return False, errs
+                return False, error_output
+
+        except subprocess.CalledProcessError as e:
+            return False, f"CalledProcessError: {str(e)}"
+        except Exception as ex:
+            return False, f"An unexpected error occurred: {str(ex)}"
+
+
+    @staticmethod
+    def get_cosine_similarity(embeddingi, embeddingj):
+        embeddingi = np.array(embeddingi)
+        embeddingj = np.array(embeddingj).T
+        cos_sim = embeddingi.dot(embeddingj) / (np.linalg.norm(embeddingi) * np.linalg.norm(embeddingj))
+        return cos_sim
+    
+    @staticmethod
+    def remove_comments(string):
+        def remove_comments_by_regex(string, regex):
+            lines = string.split("\n")
+            lines = [line for line in lines if not line.strip().startswith("#")]
+            string = "\n".join(lines)
+            comments = []
+            matches = re.finditer(regex, string, re.DOTALL)
+            for match in matches:
+                group1 = match.group(1)
+                comments.append(group1)
+            for comment in comments + ["''''''\n"]:
+                string = string.replace(comment, "")
+            return string
+
+        string = remove_comments_by_regex(string, r"'''(.*?)'''")
+        string = remove_comments_by_regex(string, r"\"\"\"(.*?)\"\"\"")
+        return string
+
+    
+    @staticmethod
+    def check_mmlu(final_ans, true_ans):
+        if final_ans is None or true_ans is None:
+            return False
+        if len(final_ans) == 0:
+            return False
+        pattern = r'answer is\s+([A-Z])'
+        match = re.search(pattern, final_ans)
+        if match:
+            final = match.group(1).strip()
+            if final==true_ans:
+                return True
+        
+        pattern = r'is\s+([A-Z])'
+        match = re.search(pattern, final_ans)
+        if match:
+            final = match.group(1).strip()
+            if final==true_ans:
+                return True
+
+        final_ans = final_ans.strip().lower()
+        true_ans = true_ans.strip().lower()
+        if final_ans == true_ans:
+            return True
+        
+        final_ans_lines = final_ans.split('\n')
+        ans = []
+        residual = []
+        for li, al in enumerate(final_ans_lines):
+            ans.append(al)
+            if 'answer is' in al:
+                break
+        residual = list(final_ans_lines[li + 1:])
+        ans = '\n'.join(ans)
+        residual = '\n'.join(residual)
+        
+        pattern = 'the answer is ('
+        pred = ans.split(pattern)
+        
+        if len(pred) > 1:
+            pred = BenchmarkEvaluator.extract_letter(pred[1])
+            gold = BenchmarkEvaluator.extract_letter(true_ans)
+            return pred[0] == gold
+        else:
+            pred = BenchmarkEvaluator.extract_letter(ans)
+            gold = BenchmarkEvaluator.extract_letter(true_ans)
+            return pred == gold
+    
+    @staticmethod
+    def check_gsm8k(final_ans, true_ans):
+        if final_ans is None or true_ans is None:   
+            return False
+        if isinstance(final_ans, str):
+            final_num = BenchmarkEvaluator.extract_number(final_ans)
+            if final_num is None:
+                return False
+        else:
+            final_num = float(final_ans)
+        true_num = float(true_ans)
+        
+        if not (math.isfinite(final_num) and math.isfinite(true_num)):
+            return False  
+
+        # Accuracy computation adapted from: https://github.com/reasoning-machines/pal/blob/main/scripts/gsm_eval.py
+        is_correct = abs(float(final_num) - float(true_num)) < FLOAT_TOLERANCE 
+        if not is_correct:
+            is_correct = (round(float(final_num)) == round(float(true_num)))
+            if is_correct:
+                 return is_correct
+            if abs(int(float(final_num))) > 100 and abs(int(float(true_num))) > 100:
+                is_correct = (int(float(final_num)) == int(float(true_num)))
+        return is_correct
+    
+    @staticmethod
+    def extract_math_answer(text):
+        if text is None:
+            return text
+        if isinstance(text, str):
+            final_num = BenchmarkEvaluator.extract_number(text)
+        else:
+            final_num = float(text)
+        return final_num
+    
+    @staticmethod
+    def extract_choice_answer(text):
+        if text is None:
+            return text
+        # First pattern: 'answer is X' 
+        pattern = r'answer is\s+([A-Z])'
+        match = re.search(pattern, text)
+        if match:
+            return match.group(1).strip()
+        
+        # Second pattern: 'is X'
+        pattern = r'is\s+([A-Z])'
+        match = re.search(pattern, text)
+        if match:
+            return match.group(1).strip()
+        
+        # Third pattern: '(X)' or 'The answer is (X)'
+        pattern = r'\(([A-Z])\)'
+        match = re.search(pattern, text)
+        if match:
+            return match.group(1).strip()
+        
+        # Fourth pattern: 'X'
+        if len(text) == 1:
+            pattern = r'[A-Z]'
+            match = re.search(pattern, text)
+            if match:
+                return match.group(0).strip()
+
+        return text.strip()
+    
+    @staticmethod
+    def normalize_string(s):
+        return ''.join(s.split()).lower()
+
+    @staticmethod
+    def extract_number(text):
+        matches = re.findall(r'-?\d+\.\d+|-?\d+', text)
+        return float(matches[0]) if matches else None
+
+    @staticmethod
+    def extract_ground_truth(text):
+        return text.split('####')[-1].strip()
+    
+    @staticmethod
+    def extract_letter(text):
+            pattern = r'\((\w)\)'
+            match = re.search(pattern, text)
+            if match:
+                return match.group(1).strip()  
+            return text.strip()  
--- a/puppeteer/tasks/gsm_hard.py
+++ b/puppeteer/tasks/gsm_hard.py
@ -0,0 +1,37 @@
+import os
+import json
+import pandas as pd
+from tqdm import tqdm
+from tasks.base.base_task import BaseTask
+
+def load_dataset(mode, data_limit=None):
+    path = os.path.join("data", "GSM-Hard", "test.parquet")
+    data = pd.read_parquet(path)
+    data = data.sample(frac=1).reset_index(drop=True)
+    return data[:data_limit] if data_limit else data
+
+def format_question(row, idx):
+    return {
+        "type": "GSM-Hard",
+        "Question": "You need to write python program to solve math problems:\n" + row["input"],
+        "Answer": row["target"],
+        "id": idx
+    }
+
+def run(runner, evaluator, results_dir, mode, data_limit=None):
+    dataset = load_dataset(mode, data_limit)
+    result_path = os.path.join(results_dir, "gsm-hard.jsonl")
+    acc = 0
+
+    with open(result_path, "w", encoding="utf-8") as fd:
+        for idx, row in enumerate(tqdm(dataset.iterrows(), total=len(dataset))):
+            task = format_question(row[1], idx)
+            final_ans = runner.run_reasoning(task)
+            flag = evaluator.check_gsm8k(final_ans, task["Answer"])
+            if flag: acc += 1
+            record = {
+            "id": task["id"],
+            "pred": final_ans,
+            "correct": flag
+            }
+            fd.write(json.dumps(record, ensure_ascii=False) + "\n")
--- a/puppeteer/tasks/mmlu_pro.py
+++ b/puppeteer/tasks/mmlu_pro.py
@ -0,0 +1,41 @@
+import os
+import string
+import json
+import pandas as pd
+from tqdm import tqdm
+from tasks.base.base_task import BaseTask
+
+def load_dataset(mode, data_limit=None):
+    path = os.path.join("data", "MMLU-Pro", f"{mode}.parquet")
+    data = pd.read_parquet(path)
+    return data[:data_limit] if data_limit else data
+
+def format_question(task):
+    options = [f"{letter}: {op}" for letter, op in zip(string.ascii_uppercase, task["options"])]
+    prompt = f"The following are multiple choice questions (with answers) about {task['category']}."
+    question = prompt + "\n" + task["question"] + "\n" + " ".join(options)
+    return {
+        "type": "MMLU-Pro",
+        "Question": question,
+        "Answer": task["answer"],
+        "id": task["question_id"]
+    }
+
+def run(runner, evaluator, results_dir, mode, data_limit=None):
+    dataset = load_dataset(mode, data_limit)
+    result_path = os.path.join(results_dir, f"MMLU-Pro_{mode}.jsonl")
+    acc = 0
+
+    with open(result_path, "w", encoding="utf-8") as fd:
+        for _, row in tqdm(dataset.iterrows(), total=len(dataset)):
+            task = format_question(row)
+            final_ans = runner.run_reasoning(task)
+            flag = evaluator.check_mmlu(final_ans, task["Answer"])
+            if flag: 
+                acc += 1
+            record = {
+            "id": task["id"],
+            "pred": final_ans,
+            "correct": flag
+            }
+            fd.write(json.dumps(record, ensure_ascii=False) + "\n")
--- a/puppeteer/tasks/runner.py
+++ b/puppeteer/tasks/runner.py
@ -0,0 +1,28 @@
+from agent.register.register import agent_global_registry
+from inference.reasoning.reasoning import GraphReasoning
+from inference.graph.agent_graph import AgentGraph
+
+class BenchmarkRunner:
+    def __init__(self, personas_path, global_config):
+        self.personas_path = personas_path
+        self.global_config = global_config
+        self.max_step_num = self.global_config.get('graph').get('max_step_num')
+        self.save_state = False
+
+    def setup_reasoning(self, data_item):
+        agent_global_registry.register_all_agents(self.personas_path)
+        agent_global_registry.reset_all_agents()
+        graph = AgentGraph()
+        return GraphReasoning(data_item, graph), graph
+
+    def run_reasoning(self, data_item):
+        reasoning, _ = self.setup_reasoning(data_item)
+        reasoning.start(self.save_state if self.save_state else None)
+        self.save_state = False
+        
+        final_ans, _ = reasoning.n_step(self.max_step_num)
+
+        reasoning.visualize_path()
+        reasoning.visualize_graph()
+
+        return final_ans
--- a/puppeteer/tasks/srdd.py
+++ b/puppeteer/tasks/srdd.py
@ -0,0 +1,31 @@
+import os
+import pandas as pd
+from tqdm import tqdm
+import json
+
+def load_dataset(data_limit=None):
+    data = pd.read_csv("./data/SRDD/SRDD.csv")
+    data = data.sample(frac=1).reset_index(drop=True)
+    return data[:data_limit] if data_limit else data
+
+def format_question(row, idx):
+    return {
+        "type": "SRDD",
+        "Question": "Develop a pythonic software following description:\n" + row["Description"],
+        "id": idx
+    }
+
+def run(runner, evaluator, results_dir, mode, data_limit=None):
+    dataset = load_dataset(data_limit)
+    result_path = os.path.join(results_dir, "srdd.jsonl")
+
+    with open(result_path, "w", encoding="utf-8") as fd:
+        for idx, row in tqdm(dataset.iterrows(), total=len(dataset)):
+            task = format_question(row, idx)
+            final_ans = runner.run_reasoning(task)
+
+            record = {
+            "id": task["id"],
+            "pred": final_ans
+            }
+            fd.write(json.dumps(record, ensure_ascii=False) + "\n")
--- a/puppeteer/tools/init.py
+++ b/puppeteer/tools/init.py
--- a/puppeteer/tools/base/base_tool.py
+++ b/puppeteer/tools/base/base_tool.py
@ -0,0 +1,41 @@
+from abc import ABC, abstractmethod
+from functools import wraps
+import signal
+import logging
+
+class Tool(ABC):
+    def __init__(self, name, description, execute_function, timeout_duration=1, **kwargs):
+        super().__init__()
+        self.name = name
+        self.description = description
+        self.execute_function = execute_function
+        self.timeout_duration = timeout_duration
+        signal.alarm(0)
+
+    def timeout_handler(self, signum, frame):
+        raise TimeoutError(f"Tool execution timed out after {self.timeout_duration} seconds")
+
+    def with_timeout(self, func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            original_handler = signal.signal(signal.SIGALRM, self.timeout_handler)
+            signal.alarm(self.timeout_duration)
+            try:
+                result = func(*args, **kwargs)
+                return result
+            finally:
+                signal.alarm(0)
+                signal.signal(signal.SIGALRM, original_handler)
+        return wrapper
+
+    @abstractmethod
+    def execute(self, *args, **kwargs):
+        # Wrap the execute_function with timeout handling
+        safe_execute = self.with_timeout(self.execute_function)
+        try:
+            return safe_execute(*args, **kwargs)
+        except TimeoutError as e:
+            logging.error(f"Timeout in {self.name}: {str(e)}")
+            return False, str(e)
+        except Exception as e:
+            return False, f"Tool execution failed: {str(e)}"
--- a/puppeteer/tools/base/register.py
+++ b/puppeteer/tools/base/register.py
@ -0,0 +1,33 @@
+from typing import Any
+from tools.base.base_tool import Tool
+class ToolRegistry:
+    def __init__(self):
+        self.tools = {}
+
+    def register_tool(self, name, tool):
+        if name in self.tools:
+            raise ValueError(f"Tool with name {name} is already registered.")
+        self.tools[name] = tool
+
+    def __call__(self, *args: Any, **kwds: Any):
+        def decorator(cls):
+            tool_instance = cls(*args, **kwds)
+            name = tool_instance.name
+            self.register_tool(name, tool_instance)
+            return cls
+        return decorator
+    
+    def get_all_tools(self):
+        print(self.tools.keys())
+    
+    def get_tool(self, name):
+        if name not in self.tools:
+            return None
+        else:
+            return self.tools[name]
+
+    def execute_tool(self, name, *args, **kwargs):
+        tool = self.get_tool(name)
+        return tool.execute(*args, **kwargs)
+
+global_tool_registry = ToolRegistry()
--- a/puppeteer/tools/code_interpreter.py
+++ b/puppeteer/tools/code_interpreter.py
@ -0,0 +1,135 @@
+from tools.base.register import global_tool_registry
+from tools.base.base_tool import Tool
+from tenacity import retry, stop_after_attempt, wait_fixed
+import base64
+import os
+import shutil
+from abc import ABC, abstractmethod
+import subprocess
+from subprocess import check_output
+import time
+import signal
+
+FILE_REGEX = r"(^//.|^/|^ [a-zA-Z])?:?/.+ (/$)"
+class CodeInterpreter(Tool):
+    def __init__(self):
+        super().__init__("run_code", "run code", self.execute)
+
+    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
+    def execute(self, *args, **kwargs):
+        try: 
+            work_path = kwargs.get("work_path", "")
+            code = kwargs.get("code", "")
+            file_path = kwargs.get("file_path", "")
+            self.timeout_detected = kwargs.get("timeout_detected", True)   
+            code_path = self.write(work_path, code)
+            flag, ans = self.run(work_path, code_path, file_path)
+
+        except AttributeError:
+            # raise ValueError(f"Running Error")
+            return False, "Running Error"
+        
+        return flag, ans
+    @abstractmethod
+    def write(self, work_path, code):
+        pass
+    @abstractmethod
+    def run(self, work_path, code_path, file_path):
+        pass
+
+@global_tool_registry("run_python")
+class PythonInterpreter(CodeInterpreter):
+    def __init__(self, name):
+        super().__init__()
+        self.name = name
+    
+    def write(self, work_path, code):
+        code_path = os.path.join(work_path, "agent-main.py")
+
+        with open(code_path, 'w') as file:
+            file.write(code)
+        return code_path
+
+    def move_file(self, src_path, dest_path):
+        if not os.path.exists(src_path):
+            return
+        
+        if dest_path == "":
+            dest_path = os.getcwd()
+        dest_dir = os.path.dirname(dest_path)
+        if not os.path.exists(dest_dir):
+            os.makedirs(dest_dir)
+        
+        try:
+            shutil.copy2(src_path, dest_path)
+        except Exception as e:
+            return False
+
+    def robust_kill(self, process):
+        """Robustly kill the process based on the OS."""
+        if process.poll() is None:  # Check if the process is still running
+            if os.name == 'nt':  # For Windows
+                os.kill(process.pid, signal.SIGTERM)
+                time.sleep(1)  # Allow some time for graceful termination
+                if process.poll() is None:  # Force kill if still running
+                    os.kill(process.pid, signal.CTRL_BREAK_EVENT)
+            else:  # For Linux/macOS
+                os.killpg(os.getpgid(process.pid), signal.SIGTERM)  # Terminate the process group
+                time.sleep(1)  # Allow some time for graceful termination
+                if process.poll() is None:  # Force kill the group if still running
+                    os.killpg(os.getpgid(process.pid), signal.SIGKILL)
+        
+    def run(self, work_path, code_path, file_path):
+        """Executes a process and handles file movement, command execution, and timeouts."""
+        try:
+            if len(file_path) > 0:
+                self.move_file(src_path=file_path, dest_path=work_path)
+
+            # Determine the command to run based on the operating system
+            if os.name == 'nt':  # Windows
+                command = f"cd {work_path} && python agent-main.py"
+                process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                                            creationflags=subprocess.CREATE_NEW_PROCESS_GROUP)
+            else:  # Linux/macOS
+                command = f"cd {work_path} && python3 agent-main.py"
+                process = subprocess.Popen(command, shell=True, preexec_fn=os.setsid, stdout=subprocess.PIPE,
+                                            stderr=subprocess.PIPE)
+
+            try:
+                # Wait for process completion with a timeout of 10 seconds
+                out, err = process.communicate(timeout=10)
+            except subprocess.TimeoutExpired:
+                self.robust_kill(process)
+                if self.timeout_detected:
+                    return False, "The process timed out after 10 seconds."
+                else:
+                    return True, "The process completes without encountering any errors."
+
+            return_code = process.returncode
+            output = out.decode('utf-8', errors='ignore')
+            error_output = err.decode('utf-8', errors='ignore')
+
+            # If the process is still running after the timeout
+            if process.poll() is None:
+                self.robust_kill(process)  # Ensure the process is terminated
+            return_code = process.returncode
+
+            # Handle return code and output
+            if return_code == 0:
+                # Clean up file paths in the output for readability
+                work_path = os.getcwd()
+                output = output.replace(work_path, "")
+                return True, output
+            else:
+                # Handle errors in the output
+                if error_output:
+                    work_path = os.getcwd()
+                    if "Traceback".lower() in error_output.lower():
+                        errs = error_output.replace(work_path + "/", "").replace(work_path, "")
+                        return False, errs
+                return False, error_output
+
+        except subprocess.CalledProcessError as e:
+            return False, f"CalledProcessError: {str(e)}"
+        except Exception as ex:
+            return False, f"An unexpected error occurred: {str(ex)}"
--- a/puppeteer/tools/file_read.py
+++ b/puppeteer/tools/file_read.py
@ -0,0 +1,44 @@
+from tools.base.register import global_tool_registry
+from tools.base.base_tool import Tool
+from tenacity import retry, stop_after_attempt, wait_fixed
+import os
+from tools.utils.converter import MarkdownConverter
+from model import global_openai_client as client
+
+@global_tool_registry("read_file")
+class FileRead(Tool):
+    """
+    support file type:
+    .csv .xlsx
+    .docx
+    .txt
+    .pdf
+    (LLM generate description).png .jpg .jpeg
+    (LLM generate transcript).mp3
+    .json .jsonld
+    .pptx
+    .wav
+    .html .htm
+    """
+    
+    def __init__(self, name):
+        super().__init__(name=name, 
+                         description="read file from local path", 
+                         execute_function=self.execute)
+        self.converter = MarkdownConverter(mlm_client=client)
+
+    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
+    def execute(self, *args,**kwargs):
+        try:
+            file_path = kwargs.get("file_path","")
+            file_extension = kwargs.get("file_extension", "")
+            if not os.path.exists(file_path):
+                return False, "File Not Exists"
+            try:
+                ans = self.converter.convert_local(path=file_path, 
+                                                   file_extension=file_extension)
+                return True, ans.text_content
+            except Exception as e:
+                return False, f"Error processing file: {str(e)}"
+        except Exception as e:
+            return False, f"Error processing file: {str(e)}"
--- a/puppeteer/tools/utils/broswer.py
+++ b/puppeteer/tools/utils/broswer.py
@ -0,0 +1,505 @@
+# This file incorporates code from the AutoGen.
+# The original code can be found at:
+# https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
+
+import json
+import os
+import requests
+import re
+import io
+import uuid
+import mimetypes
+import time
+import pathlib
+import pathvalidate
+from urllib.parse import urljoin, urlparse, unquote, parse_qs
+from urllib.request import url2pathname
+from typing import Any, Dict, List, Optional, Union, Tuple
+from .converter import MarkdownConverter, UnsupportedFormatException, FileConversionException
+
+import diskcache as dc
+import yaml
+
+global_config = yaml.safe_load(open("./config/global.yaml", "r"))
+class SimpleTextBrowser:
+    """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""
+
+    def __init__(
+        self,
+        start_page: Optional[str] = None,
+        viewport_size: Optional[int] = 1024 * 8,
+        downloads_folder: Optional[Union[str, None]] = None,
+        bing_api_key: Optional[Union[str, None]] = None,
+        request_kwargs: Optional[Union[Dict[str, Any], None]] = None,
+    ):
+        self.start_page: str = start_page if start_page else "about:blank"
+        self.viewport_size = viewport_size  # Applies only to the standard uri types
+        # self.downloads_folder = log_path
+        self.history: List[Tuple[str, float]] = list()
+        self.page_title: Optional[str] = None
+        self.viewport_current_page = 0
+        self.viewport_pages: List[Tuple[int, int]] = list()
+        self.set_address(self.start_page)
+        self.bing_api_key = bing_api_key
+        if bing_api_key is  None:
+            self.bing_api_key = global_config.get("api_keys").get("bing_api_key")
+        
+        self.request_kwargs = request_kwargs
+        self._mdconvert = MarkdownConverter()
+        self._page_content: str = ""
+
+        self._find_on_page_query: Union[str, None] = None
+        self._find_on_page_last_result: Union[int, None] = None  # Location of the last result
+
+        self.bing_cache = None
+
+    @property
+    def address(self) -> str:
+        """Return the address of the current page."""
+        return self.history[-1][0]
+
+    def set_address(self, uri_or_path: str) -> None:
+        # TODO: Handle anchors
+        self.history.append((uri_or_path, time.time()))
+
+        # Handle special URIs
+        if uri_or_path == "about:blank":
+            self._set_page_content("")
+        elif uri_or_path.startswith("bing:"):
+            self._bing_search(uri_or_path[len("bing:") :].strip())
+        else:
+            if (
+                not uri_or_path.startswith("http:")
+                and not uri_or_path.startswith("https:")
+                and not uri_or_path.startswith("file:")
+            ):
+                if len(self.history) > 1:
+                    prior_address = self.history[-2][0]
+                    uri_or_path = urljoin(prior_address, uri_or_path)
+                    # Update the address with the fully-qualified path
+                    self.history[-1] = (uri_or_path, self.history[-1][1])
+            self._fetch_page(uri_or_path)
+
+        self.viewport_current_page = 0
+        self.find_on_page_query = None
+        self.find_on_page_viewport = None
+
+    @property
+    def viewport(self) -> str:
+        """Return the content of the current viewport."""
+        bounds = self.viewport_pages[self.viewport_current_page]
+        return self.page_content[bounds[0] : bounds[1]]
+
+    @property
+    def page_content(self) -> str:
+        """Return the full contents of the current page."""
+        return self._page_content
+
+    def _set_page_content(self, content: str) -> None:
+        """Sets the text content of the current page."""
+        self._page_content = content
+        self._split_pages()
+        if self.viewport_current_page >= len(self.viewport_pages):
+            self.viewport_current_page = len(self.viewport_pages) - 1
+
+    def page_down(self) -> None:
+        self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1)
+
+    def page_up(self) -> None:
+        self.viewport_current_page = max(self.viewport_current_page - 1, 0)
+
+    def find_on_page(self, query: str) -> Union[str, None]:
+        """Searches for the query from the current viewport forward, looping back to the start if necessary."""
+
+        # Did we get here via a previous find_on_page search with the same query?
+        # If so, map to find_next
+        if query == self._find_on_page_query and self.viewport_current_page == self._find_on_page_last_result:
+            return self.find_next()
+
+        # Ok it's a new search start from the current viewport
+        self._find_on_page_query = query
+        viewport_match = self._find_next_viewport(query, self.viewport_current_page)
+        if viewport_match is None:
+            self._find_on_page_last_result = None
+            return None
+        else:
+            self.viewport_current_page = viewport_match
+            self._find_on_page_last_result = viewport_match
+            return self.viewport
+
+    def find_next(self) -> None:
+        """Scroll to the next viewport that matches the query"""
+
+        if self._find_on_page_query is None:
+            return None
+
+        starting_viewport = self._find_on_page_last_result
+        if starting_viewport is None:
+            starting_viewport = 0
+        else:
+            starting_viewport += 1
+            if starting_viewport >= len(self.viewport_pages):
+                starting_viewport = 0
+
+        viewport_match = self._find_next_viewport(self._find_on_page_query, starting_viewport)
+        if viewport_match is None:
+            self._find_on_page_last_result = None
+            return None
+        else:
+            self.viewport_current_page = viewport_match
+            self._find_on_page_last_result = viewport_match
+            return self.viewport
+
+    def _find_next_viewport(self, query: str, starting_viewport: int) -> Union[int, None]:
+        """Search for matches between the starting viewport looping when reaching the end."""
+
+        if query is None:
+            return None
+
+        # Normalize the query, and convert to a regular expression
+        nquery = re.sub(r"\*", "__STAR__", query)
+        nquery = " " + (" ".join(re.split(r"\W+", nquery))).strip() + " "
+        nquery = nquery.replace(" __STAR__ ", "__STAR__ ")  # Merge isolated stars with prior word
+        nquery = nquery.replace("__STAR__", ".*").lower()
+
+        if nquery.strip() == "":
+            return None
+
+        idxs = list()
+        idxs.extend(range(starting_viewport, len(self.viewport_pages)))
+        idxs.extend(range(0, starting_viewport))
+
+        for i in idxs:
+            bounds = self.viewport_pages[i]
+            content = self.page_content[bounds[0] : bounds[1]]
+
+            # TODO: Remove markdown links and images
+            ncontent = " " + (" ".join(re.split(r"\W+", content))).strip().lower() + " "
+            if re.search(nquery, ncontent):
+                return i
+
+        return None
+
+    def visit_page(self, path_or_uri: str) -> str:
+        """Update the address, visit the page, and return the content of the viewport."""
+        self.set_address(path_or_uri)
+        return self.viewport
+
+    def _split_pages(self) -> None:
+        # Do not split search results
+        if self.address.startswith("bing:"):
+            self.viewport_pages = [(0, len(self._page_content))]
+            return
+
+        # Handle empty pages
+        if len(self._page_content) == 0:
+            self.viewport_pages = [(0, 0)]
+            return
+
+        # Break the viewport into pages
+        self.viewport_pages = []
+        start_idx = 0
+        while start_idx < len(self._page_content):
+            end_idx = min(start_idx + self.viewport_size, len(self._page_content))  # type: ignore[operator]
+            # Adjust to end on a space
+            while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]:
+                end_idx += 1
+            self.viewport_pages.append((start_idx, end_idx))
+            start_idx = end_idx
+
+    def _bing_api_call(self, query: str) -> Dict[str, Dict[str, List[Dict[str, Union[str, Dict[str, str]]]]]]:
+        # Check the cache
+        if self.bing_cache is not None:
+            cached = self.bing_cache.get(query)
+            if cached is not None:
+                return cached
+        # Make sure the key was set
+        if self.bing_api_key is None:
+            raise ValueError("Missing Bing API key.")
+
+        # Prepare the request parameters
+        request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
+
+        if "headers" not in request_kwargs:
+            request_kwargs["headers"] = {}
+        request_kwargs["headers"]["Ocp-Apim-Subscription-Key"] = self.bing_api_key
+
+        if "params" not in request_kwargs:
+            request_kwargs["params"] = {}
+        request_kwargs["params"]["q"] = query
+        request_kwargs["params"]["textDecorations"] = False
+        request_kwargs["params"]["textFormat"] = "raw"
+
+        request_kwargs["stream"] = False
+        request_kwargs["timeout"] = (5,10)
+
+        # Make the request
+        response = None
+        for _ in range(2):
+            try:
+                response = requests.get("https://api.bing.microsoft.com/v7.0/search", **request_kwargs)
+                response.raise_for_status()
+                break
+            except Exception:
+                pass
+            time.sleep(1)
+        if response is None:
+            raise requests.exceptions.RequestException("Failed to fetch Bing search results.")
+        results = response.json()
+
+        # Cache the results
+        if self.bing_cache is not None:
+            self.bing_cache.set(query, results)
+
+        return results  # type: ignore[no-any-return]
+
+    def _bing_search(self, query: str) -> None:
+        results = self._bing_api_call(query)
+
+        def _prev_visit(url):
+            for i in range(len(self.history) - 1, -1, -1):
+                if self.history[i][0] == url:
+                    # Todo make this more human-friendly
+                    return f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
+            return ""
+
+        web_snippets: List[str] = list()
+        idx = 0
+        if "webPages" in results:
+            for page in results["webPages"]["value"]:
+                idx += 1
+                web_snippets.append(
+                    f"{idx}. [{page['name']}]({page['url']})\n{_prev_visit(page['url'])}{page['snippet']}"
+                )
+                if "deepLinks" in page:
+                    for dl in page["deepLinks"]:
+                        idx += 1
+                        web_snippets.append(
+                            f"{idx}. [{dl['name']}]({dl['url']})\n{_prev_visit(dl['url'])}{dl['snippet'] if 'snippet' in dl else ''}"
+                        )
+
+        news_snippets = list()
+        if "news" in results:
+            for page in results["news"]["value"]:
+                idx += 1
+                datePublished = ""
+                if "datePublished" in page:
+                    datePublished = "\nDate published: " + page["datePublished"].split("T")[0]
+                news_snippets.append(
+                    f"{idx}. [{page['name']}]({page['url']})\n{_prev_visit(page['url'])}{page['description']}{datePublished}"
+                )
+
+        video_snippets = list()
+        if "videos" in results:
+            for page in results["videos"]["value"]:
+                if not page["contentUrl"].startswith("https://www.youtube.com/watch?v="):
+                    continue
+                idx += 1
+                datePublished = ""
+                if "datePublished" in page:
+                    datePublished = "\nDate published: " + page["datePublished"].split("T")[0]
+                video_snippets.append(
+                    f"{idx}. [{page['name']}]({page['contentUrl']})\n{_prev_visit(page['contentUrl'])}{page.get('description', '')}{datePublished}"
+                )
+
+        self.page_title = f"{query} - Search"
+
+        content = (
+            f"A Bing search for '{query}' found {len(web_snippets) + len(news_snippets) + len(video_snippets)} results:\n\n## Web Results\n"
+            + "\n\n".join(web_snippets)
+        )
+        if len(news_snippets) > 0:
+            content += "\n\n## News Results:\n" + "\n\n".join(news_snippets)
+        if len(video_snippets) > 0:
+            content += "\n\n## Video Results:\n" + "\n\n".join(video_snippets)
+
+        self._set_page_content(content)
+
+    def _fetch_page(self, url: str) -> None:
+        download_path = ""
+        response = None
+        print(f'Fetching page: {url}')
+        try:
+            if url.startswith("file://"):
+                download_path = os.path.normcase(os.path.normpath(unquote(url[8:])))
+                res = self._mdconvert.convert_local(download_path)
+                self.page_title = res.title
+                self._set_page_content(res.text_content)
+            else:
+                # Prepare the request parameters
+                request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
+                request_kwargs["stream"] = True
+                request_kwargs["timeout"] = (5,10)  
+
+                # Send a HTTP request to the URL
+                response = requests.get(url, **request_kwargs)
+                response.raise_for_status()
+
+                # If the HTTP request was successful
+                content_type = response.headers.get("content-type", "")
+
+                # Text or HTML
+                if "text/" in content_type.lower():
+                    res = self._mdconvert.convert_response(response)
+                    self.page_title = res.title
+                    self._set_page_content(res.text_content)
+                # A download
+                else:
+                    # Try producing a safe filename
+                    fname = None
+                    download_path = None
+                    try:
+                        fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
+                        download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
+
+                        suffix = 0
+                        while os.path.exists(download_path) and suffix < 1000:
+                            suffix += 1
+                            base, ext = os.path.splitext(fname)
+                            new_fname = f"{base}__{suffix}{ext}"
+                            download_path = os.path.abspath(os.path.join(self.downloads_folder, new_fname))
+
+                    except NameError:
+                        pass
+
+                    # No suitable name, so make one
+                    if fname is None:
+                        extension = mimetypes.guess_extension(content_type)
+                        if extension is None:
+                            extension = ".download"
+                        fname = str(uuid.uuid4()) + extension
+                        download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
+
+                    # Open a file for writing
+                    with open(download_path, "wb") as fh:
+                        for chunk in response.iter_content(chunk_size=512):
+                            fh.write(chunk)
+
+                    # Render it
+                    local_uri = pathlib.Path(download_path).as_uri()
+                    self.set_address(local_uri)
+
+        except UnsupportedFormatException as e:
+            print(f'Unsupported format: {e}')
+            self.page_title = ("Download complete.",)
+            self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
+        except FileConversionException as e:
+            print(f'File conversion error: {e}')
+            self.page_title = ("Download complete.",)
+            self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
+        except FileNotFoundError:
+            self.page_title = "Error 404"
+            self._set_page_content(f"## Error 404\n\nFile not found: {download_path}")
+        except requests.exceptions.RequestException:
+            if response is None:
+                self.page_title = "Error"
+                self._set_page_content(f"## Error\n\nFailed to fetch '{url}'")
+            else:
+                self.page_title = f"Error {response.status_code}"
+
+                # If the error was rendered in HTML we might as well render it
+                content_type = response.headers.get("content-type", "")
+                if content_type is not None and "text/html" in content_type.lower():
+                    res = self._mdconvert.convert(response)
+                    self.page_title = f"Error {response.status_code}"
+                    text_content = getattr(res, "text_content", None)
+                    self._set_page_content(f"## Error {response.status_code}\n\n{text_content}")
+                else:
+                    text = ""
+                    for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
+                        if type(chunk) == str:
+                            text += chunk
+                    self.page_title = f"Error {response.status_code}"
+                    self._set_page_content(f"## Error {response.status_code}\n\n{text}")
+
+
+# #https://stackoverflow.com/questions/10123929/fetch-a-file-from-a-local-url-with-python-requests
+# class LocalFileAdapter(requests.adapters.BaseAdapter):
+#     """Protocol Adapter to allow Requests to GET file:// URLs"""
+#
+#     @staticmethod
+#     def _chkpath(method, path):
+#         """Return an HTTP status for the given filesystem path."""
+#         if method.lower() in ("put", "delete"):
+#             return 501, "Not Implemented"
+#         elif method.lower() not in ("get", "head"):
+#             return 405, "Method Not Allowed"
+#         elif not os.path.exists(path):
+#             return 404, "File Not Found"
+#         elif not os.access(path, os.R_OK):
+#             return 403, "Access Denied"
+#         else:
+#             return 200, "OK"
+#
+#     def send(self, req, **kwargs):
+#         """Return the file specified by the given request"""
+#         path = os.path.normcase(os.path.normpath(url2pathname(req.path_url)))
+#         response = requests.Response()
+#
+#         response.status_code, response.reason = self._chkpath(req.method, path)
+#         if response.status_code == 200 and req.method.lower() != "head":
+#             try:
+#                 if os.path.isfile(path):
+#                     response.raw = open(path, "rb")
+#                 else:  # List the directory
+#                     response.headers["content-type"] = "text/html"
+#                     pardir = os.path.normpath(os.path.join(path, os.pardir))
+#                     pardir_uri = pathlib.Path(pardir).as_uri()
+#                     listing = f"""
+# <!DOCTYPE html>
+# <html>
+#   <head>
+#     <title>Index of {html.escape(path)}</title>
+#   </head>
+#   <body>
+#     <h1>Index of {html.escape(path)}</h1>
+#
+#     <a href="{html.escape(pardir_uri, quote=True)}">.. (parent directory)</a>
+#
+#     <table>
+#     <tr>
+#        <th>Name</th><th>Size</th><th>Date modified</th>
+#     </tr>
+# """
+#
+#                     for entry in os.listdir(path):
+#                         full_path = os.path.normpath(os.path.join(path, entry))
+#                         full_path_uri = pathlib.Path(full_path).as_uri()
+#                         size = ""
+#
+#                        if os.path.isdir(full_path):
+#                            entry = entry + os.path.sep
+#                        else:
+#                            size = str(os.path.getsize(full_path))
+#
+#                        listing += (
+#                            "<tr>\n"
+#                            + f'<td><a href="{html.escape(full_path_uri, quote=True)}">{html.escape(entry)}</a></td>'
+#                            + f"<td>{html.escape(size)}</td>"
+#                            + f"<td>{html.escape(entry)}</td>"
+#                            + "</tr>"
+#                        )
+#
+#                    listing += """
+#    </table>
+#  </body>
+# </html>
+# """
+#
+#                    response.raw = io.StringIO(listing)
+#            except (OSError, IOError) as err:
+#                response.status_code = 500
+#                response.reason = str(err)
+#
+#        if isinstance(req.url, bytes):
+#            response.url = req.url.decode("utf-8")
+#        else:
+#            response.url = req.url
+#
+#        response.request = req
+#        response.connection = self
+#
+#        return response
+#
+#    def close(self):
+#        pass
--- a/puppeteer/tools/utils/converter.py
+++ b/puppeteer/tools/utils/converter.py
@ -0,0 +1,927 @@
+# This file incorporates code from the AutoGen.
+# The original code can be found at:
+# https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/mdconvert.py
+
+import json
+import os
+import requests
+import re
+import markdownify
+import io
+import uuid
+import mimetypes
+import html
+import pathlib
+import puremagic
+import tempfile
+import copy
+import mammoth
+import pptx
+import pydub
+import pandas as pd
+import speech_recognition as sr
+import sys
+import traceback
+import xml.etree.ElementTree as ET
+import cv2
+import PIL
+import shutil
+import subprocess
+import easyocr
+import numpy as np
+from pytube import YouTube
+import base64
+
+from urllib.parse import urljoin, urlparse, parse_qs
+from urllib.request import url2pathname
+from bs4 import BeautifulSoup
+from typing import Any, Dict, List, Optional, Union, Tuple
+from tenacity import retry, stop_after_attempt, wait_fixed
+# Optional PDF support
+IS_PDF_CAPABLE = False
+try:
+    import pdfminer
+    import pdfminer.high_level
+
+    IS_PDF_CAPABLE = True
+except ModuleNotFoundError:
+    pass
+
+# Optional YouTube transcription support
+IS_YOUTUBE_TRANSCRIPT_CAPABLE = False
+try:
+    from youtube_transcript_api import YouTubeTranscriptApi
+
+    IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
+except ModuleNotFoundError:
+    pass
+
+
+class DocumentConverterResult:
+    """The result of converting a document to text."""
+
+    def __init__(self, title: Union[str, None] = None, text_content: str = "", base64 = None):
+        self.title = title
+        self.text_content = text_content
+        self.base64 = base64
+
+class DocumentConverter:
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        raise NotImplementedError()
+
+
+class PlainTextConverter(DocumentConverter):
+    """Anything with content type text/plain"""
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        extension = kwargs.get("file_extension", "")
+        if extension == "":
+            return None
+
+        content_type, encoding = mimetypes.guess_type("__placeholder" + extension)
+        if content_type is None:
+            return None
+
+        if "text/" not in content_type.lower():
+            return None
+
+        text_content = ""
+        with open(local_path, "rt",  encoding="utf-8") as fh:
+            text_content = fh.read()
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=text_content,
+        )
+
+
+class HtmlConverter(DocumentConverter):
+    """Anything with content type text/html"""
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not html
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".html", ".htm"]:
+            return None
+
+        result = None
+        with open(local_path, "rt", encoding='utf-8') as fh:
+            result = self._convert(fh.read())
+
+        return result
+
+    def _convert(self, html_content) -> Union[None, DocumentConverterResult]:
+        """Helper function that converts and HTML string."""
+
+        # Parse the string
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        # Remove javascript and style blocks
+        for script in soup(["script", "style"]):
+            script.extract()
+
+        # Print only the main content
+        body_elm = soup.find("body")
+        webpage_text = ""
+        if body_elm:
+            webpage_text = markdownify.MarkdownConverter(newline_style='backslash').convert_soup(body_elm)
+        else:
+            webpage_text = markdownify.MarkdownConverter().convert_soup(soup)
+
+        return DocumentConverterResult(
+            title=None if soup.title is None else soup.title.string,
+            text_content=webpage_text,
+        )
+
+
+class WikipediaConverter(DocumentConverter):
+    """Handle Wikipedia pages separately, focusing only on the main document content."""
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not Wikipedia
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".html", ".htm"]:
+            return None
+        url = kwargs.get("url", "")
+        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
+            return None
+
+        # Parse the file
+        soup = None
+        with open(local_path, "rt", encoding='utf-8') as fh:
+            soup = BeautifulSoup(fh.read(), "html.parser")
+
+        # Remove javascript and style blocks
+        for script in soup(["script", "style"]):
+            script.extract()
+
+        # Print only the main content
+        body_elm = soup.find("div", {"id": "mw-content-text"})
+        title_elm = soup.find("span", {"class": "mw-page-title-main"})
+
+        webpage_text = ""
+        if body_elm:
+            # What's the title
+            main_title = soup.title.string
+            if title_elm and len(title_elm) > 0:
+                main_title = title_elm.string
+
+            # Convert the page
+            webpage_text = "# " + main_title + "\n\n" + markdownify.MarkdownConverter().convert_soup(body_elm)
+        else:
+            webpage_text = markdownify.MarkdownConverter().convert_soup(soup)
+
+        return DocumentConverterResult(
+            title=soup.title.string,
+            text_content=webpage_text,
+        )
+
+
+class YouTubeConverter(DocumentConverter):
+    """Handle YouTube specially, focusing on the video title, description, and transcript."""
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not YouTube
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".html", ".htm"]:
+            return None
+        url = kwargs.get("url", "")
+        if not url.startswith("https://www.youtube.com/watch?"):
+            return None
+
+        # Parse the file
+        soup = None
+        with open(local_path, "rt", encoding='utf-8') as fh:
+            soup = BeautifulSoup(fh.read(), "html.parser")
+
+        # Read the meta tags
+        metadata = {"title": soup.title.string}
+        for meta in soup(["meta"]):
+            for a in meta.attrs:
+                if a in ["itemprop", "property", "name"]:
+                    metadata[meta[a]] = meta.get("content", "")
+                    break
+
+        # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
+        try:
+            for script in soup(["script"]):
+                content = script.text
+                if "ytInitialData" in content:
+                    lines = re.split(r"\r?\n", content)
+                    obj_start = lines[0].find("{")
+                    obj_end = lines[0].rfind("}")
+                    if obj_start >= 0 and obj_end >= 0:
+                        data = json.loads(lines[0][obj_start : obj_end + 1])
+                        attrdesc = self._findKey(data, "attributedDescriptionBodyText")
+                        if attrdesc:
+                            metadata["description"] = attrdesc["content"]
+                    break
+        except:
+            pass
+
+        # Start preparing the page
+        webpage_text = "# YouTube\n"
+
+        title = self._get(metadata, ["title", "og:title", "name"])
+        if title:
+            webpage_text += f"\n## {title}\n"
+
+        stats = ""
+        views = self._get(metadata, ["interactionCount"])
+        if views:
+            stats += f"- **Views:** {views}\n"
+
+        keywords = self._get(metadata, ["keywords"])
+        if keywords:
+            stats += f"- **Keywords:** {keywords}\n"
+
+        runtime = self._get(metadata, ["duration"])
+        if runtime:
+            stats += f"- **Runtime:** {runtime}\n"
+
+        if len(stats) > 0:
+            webpage_text += f"\n### Video Metadata\n{stats}\n"
+
+        description = self._get(metadata, ["description", "og:description"])
+        if description:
+            webpage_text += f"\n### Description\n{description}\n"
+
+        if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
+            transcript_text = ""
+            parsed_url = urlparse(url)
+            params = parse_qs(parsed_url.query)
+            if "v" in params:
+                video_id = params["v"][0]
+                try:
+                    # Must be a single transcript.
+                    transcript = YouTubeTranscriptApi.get_transcript(video_id)
+                    transcript_text = " ".join([part["text"] for part in transcript])
+                    # Alternative formatting:
+                    # formatter = TextFormatter()
+                    # formatter.format_transcript(transcript)
+                except:
+                    pass
+            if transcript_text:
+                webpage_text += f"\n### Transcript\n{transcript_text}\n"
+
+        output_path = self._download_video(url=url, local_path=local_path)
+        if output_path!="" and os.path.exists(output_path):
+            base64_video = self._base64_video(output_path)
+        else:
+            base64_video = ""
+        return DocumentConverterResult(
+            title=title if title else soup.title.string,
+            text_content=webpage_text,
+            base64=base64_video
+        )
+
+    def _download_video(self, url, local_path) -> str:
+        # TODO
+        return ""
+        try:
+            video_id = url.split('v=')[-1].split('&')[0]
+            video_id = video_id.strip()
+            youtube = YouTube(url)
+            video_stream = youtube.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
+            if not video_stream:
+                raise ValueError("No suitable video stream found.")
+            
+            output_path = f"{local_path}/{video_id}.mp4"
+            video_stream.download(output_path=local_path, filename=f"{video_id}.mp4")
+            return output_path
+        
+        except Exception as e:
+            return ""
+
+    def _base64_video(self, output_path, frame_interval: int = 10) -> list:
+        video = cv2.VideoCapture(str(output_path))
+        base64_frames = []
+        frame_count = 0
+        while video.isOpened():
+            success, frame = video.read()
+            if not success:
+                break
+            if frame_count % frame_interval == 0:
+                _, buffer = cv2.imencode(".jpg", frame)
+                base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
+            frame_count += 1
+        video.release()
+        return base64_frames
+    
+    def _get(self, json, keys, default=None):
+        for k in keys:
+            if k in json:
+                return json[k]
+        return default
+
+    def _findKey(self, json, key):
+        if isinstance(json, list):
+            for elm in json:
+                ret = self._findKey(elm, key)
+                if ret is not None:
+                    return ret
+        elif isinstance(json, dict):
+            for k in json:
+                if k == key:
+                    return json[k]
+                else:
+                    ret = self._findKey(json[k], key)
+                    if ret is not None:
+                        return ret
+        return None
+
+
+class PdfConverter(DocumentConverter):
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a PDF
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".pdf":
+            return None
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=pdfminer.high_level.extract_text(local_path),
+        )
+
+
+class DocxConverter(HtmlConverter):
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a DOCX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".docx":
+            return None
+
+        result = None
+        with open(local_path, "rb") as docx_file:
+            result = mammoth.convert_to_html(docx_file)
+            html_content = result.value
+            result = self._convert(html_content)
+
+        return result
+
+class JsonConverter(HtmlConverter):
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".json", ".jsonld", "jsonl"]:
+            return None
+
+        with open(local_path, "r", encoding="utf-8") as json_file:
+            data = json.load(json_file)
+        
+        
+        md_content = self.dict_to_md(data)
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+
+    def dict_to_md(self, data, level=1):
+        md_str = ""
+        if isinstance(data, dict):
+            for key, value in data.items():
+                md_str += f"{'#' * level} {key}\n"
+                md_str += self.dict_to_md(value, level + 1)
+        elif isinstance(data, list):
+            for item in data:
+                md_str += self.dict_to_md(item, level)
+        else:
+            md_str += f"{data}\n"
+        return md_str
+
+class XmlConverter(HtmlConverter):
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a XML
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".xml"]:
+            return None
+
+        with open(local_path, "rt", encoding='utf-8') as fh:
+            xml_content = fh.read()
+
+        root = ET.fromstring(xml_content)
+        md_content = self.xml_to_md(root)
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+
+    def xml_to_md(self, element, level=1):
+        md_str = ""
+        md_str += f"{'#' * level} {element.tag}\n"
+        if element.text and element.text.strip():
+            md_str += f"{element.text.strip()}\n"
+        for child in element:
+            md_str += self.xml_to_md(child, level + 1)
+        return md_str
+
+
+
+class XlsxConverter(HtmlConverter):
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a XLSX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".xlsx":
+            return None
+        sheets = pd.read_excel(local_path, sheet_name=None)
+        md_content = ""
+        for s in sheets:
+            md_content += f"## {s}\n"
+            html_content = sheets[s].to_html(index=False)
+            md_content += self._convert(html_content).text_content.strip() + "\n\n"
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+class CsvConverter(HtmlConverter):
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a XLSX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".csv":
+            return None
+        
+        md_content = ""
+        sheets = pd.read_csv(local_path)
+        html_content = sheets.to_html(index=False)
+        md_content += self._convert(html_content).text_content.strip() + "\n\n"
+        
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+
+class PptxConverter(HtmlConverter):
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a PPTX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".pptx":
+            return None
+
+        md_content = ""
+
+        presentation = pptx.Presentation(local_path)
+        slide_num = 0
+        for slide in presentation.slides:
+            slide_num += 1
+
+            md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
+
+            title = slide.shapes.title
+            for shape in slide.shapes:
+                # Pictures
+                if self._is_picture(shape):
+                    # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
+                    alt_text = ""
+                    try:
+                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
+                    except:
+                        pass
+
+                    # A placeholder name
+                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                    # try:
+                    #    filename = shape.image.filename
+                    # except:
+                    #    pass
+
+                    md_content += "\n![" + (alt_text if alt_text else shape.name) + "](" + filename + ")\n"
+
+                # Tables
+                if self._is_table(shape):
+                    html_table = "<html><body><table>"
+                    first_row = True
+                    for row in shape.table.rows:
+                        html_table += "<tr>"
+                        for cell in row.cells:
+                            if first_row:
+                                html_table += "<th>" + html.escape(cell.text) + "</th>"
+                            else:
+                                html_table += "<td>" + html.escape(cell.text) + "</td>"
+                        html_table += "</tr>"
+                        first_row = False
+                    html_table += "</table></body></html>"
+                    md_content += "\n" + self._convert(html_table).text_content.strip() + "\n"
+
+                # Text areas
+                elif shape.has_text_frame:
+                    if shape == title:
+                        md_content += "# " + shape.text.lstrip() + " "
+                    else:
+                        md_content += shape.text + " "
+
+            md_content = md_content.strip()
+
+            if slide.has_notes_slide:
+                md_content += "\n\n### Notes:\n"
+                notes_frame = slide.notes_slide.notes_text_frame
+                if notes_frame is not None:
+                    md_content += notes_frame.text
+                md_content = md_content.strip()
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+
+    def _is_picture(self, shape):
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
+            return True
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
+            if hasattr(shape, "image"):
+                return True
+        return False
+
+    def _is_table(self, shape):
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
+            return True
+        return False
+
+import whisper
+import joblib
+# cache asr function
+asr_cache = joblib.Memory(location=".cache/asr", verbose=0)
+
+@asr_cache.cache
+def asr(local_path):
+    whisper_model = whisper.load_model("large")
+    return whisper_model.transcribe(local_path)['text']
+
+class WavConverter(DocumentConverter):
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a XLSX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".wav":
+            return None
+
+        # recognizer = sr.Recognizer()
+        # with sr.AudioFile(local_path) as source:
+        #     audio = recognizer.record(source)
+        #     text_content = recognizer.recognize_google(audio).strip()
+        text_content = asr(local_path)
+
+        return DocumentConverterResult(
+            title=None,
+            text_content="### Audio Transcript:\n" + ("[No speech detected]" if text_content == "" else text_content),
+        )
+
+
+class Mp3Converter(WavConverter):
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a MP3
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".mp3":
+            return None
+
+        # handle, temp_path = tempfile.mkstemp(suffix=".wav")
+        # os.close(handle)
+        # try:
+        #     sound = pydub.AudioSegment.from_mp3(local_path)
+        #     sound.export(temp_path, format="wav")
+
+        #     _args = dict()
+        #     _args.update(kwargs)
+        #     _args["file_extension"] = ".wav"
+
+        #     result = super().convert(temp_path, **_args)
+        # finally:
+        #     os.unlink(temp_path)
+
+        # return result
+        # ASR, large time cost
+        # if "5b89b147-cdab-40e1-be5b-819bc076c270" in local_path:
+        #     text_content = ""
+        # else:
+        #     text_content = asr(local_path)
+
+        mlm_client = kwargs.get("mlm_client")
+        if mlm_client is not None:
+            text_content = self._get_audio_transcript(local_path, extension, mlm_client).strip()
+
+        return DocumentConverterResult(
+            title=None,
+            text_content="### Audio Transcript:\n" + ("[No speech detected]" if text_content == "" else text_content),
+        )
+    
+    def _get_audio_transcript(self, local_path, extension, client):
+        audio_file = open(local_path, "rb")
+        transcript = client.audio.transcriptions.create(
+                                                        model="whisper-1",
+                                                        file=audio_file
+                                                        )
+        return transcript.text
+
+
+class ImageConverter(DocumentConverter):
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a XLSX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
+            return None
+
+        ocr_min_confidence = kwargs.get("ocr_min_confidence", 0.25)
+
+        md_content = ""
+
+        # Add metadata
+        metadata = self._get_metadata(local_path)
+        if metadata:
+            for f in [
+                "Title",
+                "Caption",
+                "Description",
+                "Keywords",
+                "Artist",
+                "DateTimeOriginal",
+                "CreateDate",
+                "GPSPosition",
+            ]:
+                if f in metadata:
+                    md_content += f"{f}: {metadata[f]}\n"
+
+        # Try describing the image with GPTV
+        mlm_client = kwargs.get("mlm_client")
+        if mlm_client is not None:
+            md_content += (
+                "\n# Description:\n"
+                + self._get_mlm_description(local_path, extension, mlm_client, prompt=kwargs.get("mlm_prompt")).strip()
+                + "\n"
+            )
+        # OCR, large time cost
+        # image = PIL.Image.open(local_path)
+        # # Remove transparency
+        # if image.mode in ("RGBA", "P"):
+        #     image = image.convert("RGB")
+
+        # reader = easyocr.Reader(["en"])  # specify the language(s)
+        # output = reader.readtext(np.array(image))  # local_path)
+        # # The output is a list of tuples, each containing the coordinates of the text and the text itself.
+        # # We join all the text pieces together to get the final text.
+        # ocr_text = " "
+        # for item in output:
+        #     if item[2] >= ocr_min_confidence:
+        #         ocr_text += item[1] + " "
+        # ocr_text = ocr_text.strip()
+
+        # if len(ocr_text) > 0:
+        #     md_content += "\n# Text detected by OCR:\n" + ocr_text
+        base64_img = self._base64_img(local_path=local_path)
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content,
+            base64=base64_img
+        )
+
+    def _base64_img(self, local_path) -> str:
+        with open(local_path, "rb") as image_file:
+            encoded_image = base64.b64encode(image_file.read()).decode("utf-8")
+        return encoded_image
+    
+    def _get_metadata(self, local_path):
+        exiftool = shutil.which("exiftool")
+        if not exiftool:
+            return None
+        else:
+            try:
+                result = subprocess.run([exiftool, "-json", local_path], capture_output=True, text=True).stdout
+                return json.loads(result)[0]
+            except:
+                return None
+    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
+    def _get_mlm_description(self, local_path, extension, client, prompt=None):
+        if prompt is None or prompt.strip() == "":
+            prompt = """Write a detailed description for an image by describing the overall setting, main subjects, their actions and interactions, 
+            secondary elements in the background, colors and lighting, the emotions and atmosphere conveyed, and any implied meaning or symbolism.
+            """
+            
+
+        sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
+
+        data_uri = ""
+        with open(local_path, "rb") as image_file:
+            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
+            if content_type is None:
+                content_type = "image/jpeg"
+            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
+            data_uri = f"data:{content_type};base64,{image_base64}"
+
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": data_uri,
+                            },
+                        },
+                    ],
+                }
+            ]
+
+            response = client.chat.completions.create(model="gpt-4o",messages=messages)
+            return response.choices[0].message.content
+
+class FileConversionException(BaseException):
+    pass
+
+class UnsupportedFormatException(BaseException):
+    pass
+
+class MarkdownConverter:
+    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
+    This reader will convert common file-types or webpages to Markdown."""
+
+    def __init__(
+        self,
+        requests_session: Optional[requests.Session] = None,
+        mlm_client: Optional[Any] = None,
+    ):
+        if requests_session is None:
+            self._requests_session = requests.Session()
+        else:
+            self._requests_session = requests_session
+
+        self._mlm_client = mlm_client
+
+        self._page_converters: List[DocumentConverter] = []
+
+        # Register converters for successful browsing operations
+        # Later registrations are tried first / take higher priority than earlier registrations
+        # To this end, the most specific converters should appear below the most generic converters
+        self.register_page_converter(PlainTextConverter())
+        self.register_page_converter(HtmlConverter())
+        self.register_page_converter(WikipediaConverter())
+        self.register_page_converter(YouTubeConverter())
+        self.register_page_converter(DocxConverter())
+        self.register_page_converter(XlsxConverter())
+        self.register_page_converter(PptxConverter())
+        self.register_page_converter(WavConverter())
+        self.register_page_converter(Mp3Converter())
+        self.register_page_converter(ImageConverter())
+        self.register_page_converter(CsvConverter())
+        self.register_page_converter(JsonConverter())
+        self.register_page_converter(XmlConverter())
+
+        if IS_PDF_CAPABLE:
+            self.register_page_converter(PdfConverter())
+
+    def convert(self, source, **kwargs):
+        """
+        Args:
+            - source: can be a string representing a path or url, or a requests.response object
+            - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
+        """
+
+        try:
+            # Local path or url
+            if isinstance(source, str):
+                if source.startswith("http://") or source.startswith("https://") or source.startswith("file://"):
+                    return self.convert_url(source, **kwargs)
+                else:
+                    return self.convert_local(source, **kwargs)
+            # Request response
+            elif isinstance(source, requests.Response):
+                return self.convert_response(source, **kwargs)
+        except Exception as e:
+            return f"Error: {e}"
+
+    def convert_local(self, path, **kwargs):
+        # Prepare a list of extensions to try (in order of priority)
+        ext = kwargs.get("file_extension")
+        extensions = [ext] if ext is not None else []
+
+        # Get extension alternatives from the path and puremagic
+        base, ext = os.path.splitext(path)
+        self._append_ext(extensions, ext)
+        self._append_ext(extensions, self._guess_ext_magic(path))
+
+        # Convert
+        return self._convert(path, extensions, **kwargs)
+
+    def convert_url(self, url, **kwargs):
+        # Send a HTTP request to the URL
+        response = self._requests_session.get(url, stream=True)
+        response.raise_for_status()
+        return self.convert_response(response, **kwargs)
+
+    def convert_response(self, response, **kwargs):
+        # Prepare a list of extensions to try (in order of priority)
+        ext = kwargs.get("file_extension")
+        extensions = [ext] if ext is not None else []
+
+        # Guess from the mimetype
+        content_type = response.headers.get("content-type", "").split(";")[0]
+        self._append_ext(extensions, mimetypes.guess_extension(content_type))
+
+        # Read the content disposition if there is one
+        content_disposition = response.headers.get("content-disposition", "")
+        m = re.search(r"filename=([^;]+)", content_disposition)
+        if m:
+            base, ext = os.path.splitext(m.group(1).strip("\"'"))
+            self._append_ext(extensions, ext)
+
+        # Read from the extension from the path
+        base, ext = os.path.splitext(urlparse(response.url).path)
+        self._append_ext(extensions, ext)
+
+        # Save the file locally to a temporary file. It will be deleted before this method exits
+        handle, temp_path = tempfile.mkstemp()
+        fh = os.fdopen(handle, "wb")
+        result = None
+        try:
+            # Download the file
+            for chunk in response.iter_content(chunk_size=512):
+                fh.write(chunk)
+            fh.close()
+
+            # Use puremagic to check for more extension options
+            self._append_ext(extensions, self._guess_ext_magic(temp_path))
+
+            # Convert
+            result = self._convert(temp_path, extensions, url=response.url)
+
+        # Clean up
+        finally:
+            try:
+                fh.close()
+            except:
+                pass
+            os.unlink(temp_path)
+
+        return result
+
+    def _convert(self, local_path, extensions, **kwargs):
+        error_trace = ""
+        for ext in extensions:
+            for converter in self._page_converters:
+                _kwargs = copy.deepcopy(kwargs)
+                _kwargs.update({"file_extension": ext})
+
+                # Copy any additional global options
+                if "mlm_client" not in _kwargs and self._mlm_client is not None:
+                    _kwargs["mlm_client"] = self._mlm_client
+
+                # If we hit an error log it and keep trying
+                res = None
+                try:
+                    res = converter.convert(local_path, **_kwargs)
+                except Exception as e:
+                    error_trace = ("\n\n" + traceback.format_exc()).strip()
+
+                if res is not None:
+                    # Normalize the content
+                    res.text_content = "\n".join([line.rstrip() for line in re.split(r"\r?\n", res.text_content)])
+                    res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
+
+                    # Todo
+                    return res
+
+        # If we got this far without success, report any exceptions
+        if len(error_trace) > 0:
+            # raise FileConversionException(
+            #     f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
+            # )
+            return f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
+
+        # Nothing can handle it!
+        # raise UnsupportedFormatException(
+        #     f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
+        # )
+        return f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
+
+    def _append_ext(self, extensions, ext):
+        """Append a unique non-None, non-empty extension to a list of extensions."""
+        if ext is None:
+            return
+        ext = ext.strip()
+        if ext == "":
+            return
+        # if ext not in extensions:
+        if True:
+            extensions.append(ext)
+
+    def _guess_ext_magic(self, path):
+        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
+        # Use puremagic to guess
+        try:
+            guesses = puremagic.magic_file(path)
+            if len(guesses) > 0:
+                ext = guesses[0].extension.strip()
+                if len(ext) > 0:
+                    return ext
+        except FileNotFoundError:
+            pass
+        except IsADirectoryError:
+            pass
+        except PermissionError:
+            pass
+        except Exception:
+            pass
+        return None
+
+    def register_page_converter(self, converter: DocumentConverter) -> None:
+        """Register a page text converter."""
+        self._page_converters.insert(0, converter)
--- a/puppeteer/tools/web_search.py
+++ b/puppeteer/tools/web_search.py
@ -0,0 +1,134 @@
+from tools.base.register import global_tool_registry
+from tools.base.base_tool import Tool
+import arxiv
+from abc import abstractmethod
+from tools.utils.broswer import SimpleTextBrowser
+import signal
+from functools import wraps
+
+def timeout_handler(signum, frame):
+    raise TimeoutError("Request timed out")
+
+def timeout(seconds=1):
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            # Set the signal handler
+            signal.signal(signal.SIGALRM, timeout_handler)
+            signal.alarm(seconds)
+            try:
+                result = func(*args, **kwargs)
+            finally:
+                # Disable the alarm
+                signal.alarm(0)
+            return result
+        return wrapper
+    return decorator
+
+class Web_Search(Tool):
+    def __init__(self):
+        super().__init__("web_search", "Search the web for a given query", self.execute)
+        self.broswer = SimpleTextBrowser()
+    
+    def execute(self,*args,**kwargs):
+        try: 
+            query = kwargs.get("query", "")
+            self.broswer.downloads_folder = kwargs.get("work_path", "")
+            flag, ans = self.search(query)
+        except AttributeError:
+            return False, "No results found for query {}".format(query)
+        except TimeoutError:
+            return False, "Timeout"
+        except Exception as e:
+            return False, "No results found for query {}".format(query)
+
+        if (ans is None) or (len(ans) == 0):
+            # raise ValueError(f"No results found for query {query}.")
+            return False, "No results found for query {}".format(query)
+        
+        return flag, ans
+    
+    @abstractmethod
+    def search(self, query):
+        pass
+
+import arxiv
+import requests
+
+@global_tool_registry("search_arxiv")
+class arXiv_SearchEngine(Web_Search):
+    def __init__(self, name):
+        super().__init__()
+        self.name = name
+
+    def search(self, query):
+        # Custom timeout
+        timeout = 10  # Timeout in seconds
+
+        # Create a custom session with a timeout
+        session = requests.Session()
+        adapter = requests.adapters.HTTPAdapter(timeout=timeout)
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        try:
+            # Perform the search with custom session
+            search = arxiv.Search(
+                query=query,
+                max_results=5,
+                sort_by=arxiv.SortCriterion.Relevance,
+                session=session  # Use the session with timeout
+            )
+            
+            results = []
+            for result in search.results():
+                result_info = {
+                    "title": result.title,
+                    "authors": ", ".join(author.name for author in result.authors),
+                    "summary": result.summary,
+                    "pdf_url": result.pdf_url
+                }
+                results.append(result_info)
+            
+            if len(results) == 0:
+                results = "Page not exists in arXiv, try different search tools like Bing search."
+                return False, results
+
+            return True, str(results)
+
+        except requests.exceptions.Timeout:
+            return False, "Request timed out. Please try again later."
+
+        except Exception as e:
+            return False, f"An error occurred: {e}"
+    
+
+@global_tool_registry("search_bing")
+class Bing_SearchEngine(Web_Search):
+    def __init__(self, name):
+        super().__init__()
+        self.name = name
+    
+    def search(self, query):
+        self.broswer.set_address("bing:"+query)
+        if self.broswer.page_content != None and len(self.broswer.page_content) != 0:
+            return True, self.broswer.page_content
+        else:
+            return False, "page not exists in bing, try different search tools"
+
+
+@global_tool_registry("access_website")
+class Website_SearchEngine(Web_Search):
+    def __init__(self, name):
+        super().__init__()
+        self.name = name
+    
+    def search(self, url):
+        self.broswer.set_address(url)
+        if self.broswer.page_content != None and len(self.broswer.page_content) != 0:
+            if "Failed to fetch" in self.broswer.page_content:
+                return False, self.broswer.page_content
+            else:
+                return True, self.broswer.page_content
+        else:
+            return False, "Can not Access this website: {}".format(url)
--- a/puppeteer/utils/init.py
+++ b/puppeteer/utils/init.py
@ -0,0 +1,3 @@
+from .logging import LogManager
+
+__all__ = ['LogManager']
--- a/puppeteer/utils/file_utils.py
+++ b/puppeteer/utils/file_utils.py
@ -0,0 +1,142 @@
+import subprocess
+import json
+import yaml
+import time
+import logging
+from easydict import EasyDict
+import re
+import os
+from typing import  Optional, List
+import ast
+from model import query_gpt
+# =============================
+# File / JSON / Code Utilities
+# =============================
+
+def write_jsonl(fd, record: dict):
+    fd.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+
+def read_code(file_path: str) -> str:
+    if file_path and os.path.isfile(file_path):
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return f.read()
+    return ""
+
+
+def read_text(file_path: str) -> str:
+    return read_code(file_path)
+
+
+def write_file(work_path: str, content: str, ext: str = "py", file_path: Optional[str] = None) -> str:
+    """Write code/text to a file, auto-increment file name if needed."""
+    if file_path and os.path.isfile(file_path):
+        with open(file_path, 'w', encoding='utf-8') as f:
+            if len(content) > 0:
+                f.write(content)
+        return file_path
+
+    index = 0
+    while os.path.exists(os.path.join(work_path, f"agent-main_{index}.{ext}")):
+        index += 1
+    file_path = os.path.join(work_path, f"agent-main_{index}.{ext}")
+
+    with open(file_path, 'w', encoding='utf-8') as f:
+        f.write(content)
+    return file_path
+
+
+def write_code(work_path: str, code: str, code_path: Optional[str] = None) -> str:
+    return write_file(work_path, code, ext="py", file_path=code_path)
+
+
+def write_text(work_path: str, text: str, text_path: Optional[str] = None) -> str:
+    return write_file(work_path, text, ext="txt", file_path=text_path)
+
+
+def format_code(code: str) -> str:
+    """Remove empty lines."""
+    return "\n".join([line for line in code.splitlines() if line.strip()])
+
+
+def iter_jsonl(data_path: str) -> List[dict]:
+    with open(data_path, 'r', encoding='utf-8') as f:
+        return [json.loads(line) for line in f]
+
+
+def get_files_from_type(source_dir: str, filetype: str) -> List[str]:
+    files = []
+    for root, _, filenames in os.walk(source_dir):
+        for filename in filenames:
+            if filename.endswith(filetype):
+                files.append(os.path.join(root, filename))
+    return files
+
+
+def cmd(command: str) -> str:
+    logging.info(f">> {command}")
+    return subprocess.run(command, shell=True, text=True, stdout=subprocess.PIPE).stdout
+
+
+def get_easyDict_from_filepath(path: str) -> Optional[EasyDict]:
+    if path.endswith('.json'):
+        with open(path, 'r', encoding="utf-8") as f:
+            return EasyDict(json.load(f, strict=False))
+    elif path.endswith(('.yaml', '.yml')):
+        with open(path, 'r', encoding="utf-8") as f:
+            return EasyDict(yaml.load(f, Loader=yaml.FullLoader))
+    return None
+
+
+def now() -> str:
+    return time.strftime("%Y%m%d%H%M%S", time.localtime())
+
+
+def code_is_valid(code: str) -> bool:
+    try:
+        ast.parse(code)
+        return True
+    except Exception:
+        return False
+
+
+def extract_code_from_text(text: str) -> str:
+    """Extract valid Python code blocks from text."""
+    code_blocks = re.findall(r"```.*?```", text, re.DOTALL)
+    code_blocks = [
+        "\n".join([line for line in block.splitlines() if "```" not in line])
+        for block in code_blocks
+    ]
+    code = "\n\n".join(code_blocks) if code_blocks else text
+
+    if len(code.strip().splitlines()) == 1:
+        return ""
+
+    if code_is_valid(code):
+        return code
+
+    # Search for longest valid code segment
+    lines = text.splitlines()
+    candidates = []
+    for start in range(len(lines)):
+        for end in range(start, len(lines)):
+            segment = "\n".join(lines[start:end + 1])
+            if code_is_valid(segment):
+                candidates.append((end - start, segment))
+    if not candidates:
+        return ""
+    candidates.sort(reverse=True)
+    return candidates[0][1]
+
+
+def format_code_with_prints(code: Optional[str]) -> str:
+    """Ensure code has print statements for important info."""
+    if code is None:
+        return ""
+    if not isinstance(code, str):
+        code = str(code)
+    if re.search(r'print$begin:math:text$.+$end:math:text$', code):
+        return extract_code_from_text(code)
+    prompt = f"Add 'print' statements to print all necessary information.\nCode:\n{code}"
+    code_out, _ = query_gpt(prompt)
+    return extract_code_from_text(code_out) or code_out
--- a/puppeteer/utils/logging.py
+++ b/puppeteer/utils/logging.py
@ -0,0 +1,101 @@
+import logging
+import yaml
+import os
+import datetime
+
+class LogManager:
+    _instance = None
+
+    def __new__(cls, config_path=None, task_name=None):
+        if cls._instance is not None:
+            cls._instance._cleanup()
+        cls._instance = super(LogManager, cls).__new__(cls)
+        cls._instance._initialize(config_path, task_name)
+        return cls._instance
+
+    def _initialize(self, config_path, task_name):
+        self.loggers = {}
+        self.global_config = yaml.safe_load(open(config_path, "r"))
+        self.task_name = task_name
+        self.folder_path = self._create_log_folder()
+        self._setup_main_logger()
+        self._setup_model_logger()
+        self._setup_training_logger()
+
+    def _create_log_folder(self):
+        timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+        folder_path = os.path.join(self.global_config.get('logging').get('logpath'), self.task_name, timestamp)
+        if not os.path.exists(folder_path):
+            os.makedirs(folder_path)
+        return folder_path
+
+    def _setup_main_logger(self):
+        main_logger = logging.getLogger('global')
+        main_logger.setLevel(self.global_config.get('logging').get('level'))
+        fh = logging.FileHandler(os.path.join(self.folder_path, "meta.log"), encoding="utf-8")
+        fh.setLevel(self.global_config.get('logging').get('level'))
+
+        formatter = logging.Formatter('[%(asctime)s %(levelname)s]\n%(message)s', datefmt='%Y-%d-%m %H:%M:%S')
+        fh.setFormatter(formatter)
+
+        main_logger.addHandler(fh)
+    
+    def _setup_model_logger(self):
+        model_logger = logging.getLogger('model')
+        model_logger.setLevel(self.global_config.get('logging').get('level'))
+        fh = logging.FileHandler(os.path.join(self.folder_path, "model_query.log"), encoding="utf-8")
+        fh.setLevel(self.global_config.get('logging').get('level'))
+
+        formatter = logging.Formatter('[%(asctime)s %(levelname)s]\n%(message)s', datefmt='%Y-%d-%m %H:%M:%S')
+        fh.setFormatter(formatter)
+
+        model_logger.addHandler(fh)
+
+    def _setup_training_logger(self):
+        training_logger = logging.getLogger('train')
+        training_logger.setLevel(self.global_config.get('logging').get('level'))
+        fh = logging.FileHandler(os.path.join(self.folder_path,"train.log"), encoding="utf-8")
+        fh.setLevel(self.global_config.get('logging').get('level')) 
+
+        formatter = logging.Formatter('[%(asctime)s %(levelname)s]\n%(message)s', datefmt='%Y-%d-%m %H:%M:%S')
+        fh.setFormatter(formatter)
+
+        training_logger.addHandler(fh)
+
+    def create_logger(self, name, log_file, level=logging.INFO):
+        logger = logging.getLogger(name)
+        logger.setLevel(level)
+
+        if not logger.handlers:
+            handler = logging.FileHandler(log_file, encoding="utf-8")
+            handler.setFormatter(logging.Formatter('[%(asctime)s %(levelname)s]\n%(message)s', datefmt='%Y-%d-%m %H:%M:%S'))
+            logger.addHandler(handler)
+        logger.propagate = False
+        self.loggers[name] = logger
+
+    def get_logger(self, index):
+        return self.loggers.get(index, logging.getLogger())
+    def _cleanup(self):
+        for logger in self.loggers.values():
+            handlers = logger.handlers[:]
+            for handler in handlers:
+                handler.close()
+                logger.removeHandler(handler)
+        
+        main_logger = logging.getLogger('global')
+        handlers = main_logger.handlers[:]
+        for handler in handlers:
+            try:
+                handler.close()
+                main_logger.removeHandler(handler)
+            except Exception as e:
+                print(f"Error closing handler: {e}")
+        
+        model_logger = logging.getLogger('model')
+        handlers = model_logger.handlers[:]
+        for handler in handlers:
+            try:
+                handler.close()
+                model_logger.removeHandler(handler)
+            except Exception as e:
+                print(f"Error closing handler: {e}")
--- a/puppeteer/utils/other_utils.py
+++ b/puppeteer/utils/other_utils.py
@ -0,0 +1,69 @@
+import json
+import logging
+import re
+from time import sleep
+from typing import Union
+
+# =============================
+# Singleton Decorator
+# =============================
+def Singleton(cls):
+    instances = {}
+
+    def get_instance(*args, **kwargs):
+        if cls not in instances:
+            instances[cls] = cls(*args, **kwargs)
+        return instances[cls]
+
+    return get_instance
+
+
+# =============================
+# JSON Formatter Class
+# =============================
+class JsonFormat:
+    def __init__(self, query_func):
+        self.query_func = query_func
+
+    def load_json_with_invalid_escape(self, json_str: str) -> dict:
+        """Handle invalid JSON escape sequences."""
+        json_str = json_str.strip()
+        json_str = re.sub(r'(?<!\\)\n', ' ', json_str)
+
+        try:
+            return json.loads(json_str)
+        except json.JSONDecodeError:
+            try:
+                json_str = re.sub(r'\$begin:math:text$?!["\\\\/bfnrtu])', r'\\\\\\\\', json_str)
+                json_str = re.sub(r'(?<!\\$end:math:text$"', '\\"', json_str)
+                json_str = f'"{json_str}"'
+                return json.loads(json_str)
+            except json.JSONDecodeError:
+                return {'action': 'Error', 'parameter': 'Invalid JSON format'}
+
+    def json_check(self, text: str) -> tuple[bool, Union[dict, str]]:
+        try:
+            d = self.load_json_with_invalid_escape(text)
+            if isinstance(d, dict) and d.get("action") != "Error":
+                return True, d
+        except Exception:
+            pass
+        return False, ""
+
+    def json_reformat(self, text: str, max_try_times: int = 3) -> dict:
+        """Reformat GPT text to strict JSON object."""
+        prompt_template = """
+        Please reformat the given text strictly according to the specified JSON format.
+        The given text is: {}.
+        The specified JSON format is: {{"action": "", "parameter": ""}}, presented in plain text.
+        Only return one JSON object.
+        """
+        for _ in range(max_try_times):
+            text = text.replace("null", '"Error"').replace("None", '"Error"').replace("```json", "").replace("```", "")
+            valid, json_obj = self.json_check(text)
+            if valid:
+                return json_obj
+            logging.info(f"Error format:\n{text}")
+            text, _ = self.query_func(prompt_template.format(text))
+            sleep(1)
+        return {'action': 'Error', 'parameter': 'Error'}
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,35 @@
+arxiv==2.1.3
+beautifulsoup4==4.13.5
+chromadb==0.5.7
+diskcache==5.6.3
+easydict==1.10
+easyocr==1.7.1
+joblib
+pyarrow
+mammoth==1.8.0
+markdownify==0.13.1
+networkx==3.2.1
+numpy==2.3.2
+openai==1.107.0
+openai_whisper
+opencv_python_headless==4.10.0.84
+pandas==2.3.2
+pathvalidate==3.2.0
+pdfminer.six==20240706
+Pillow==11.3.0
+puremagic==1.27
+pydub==0.25.1
+python_pptx==1.0.2
+pytube==15.0.0
+pyvis==0.3.2
+PyYAML==6.0
+Requests==2.32.5
+seaborn==0.13.2
+SpeechRecognition==3.10.4
+sympy==1.12
+tenacity==9.0.0
+torch==2.4.0
+tqdm==4.66.5
+transformers==4.43.4
+whisper==1.1.10
+youtube_transcript_api==0.6.2