From 39c96da27e0a31636cb5285a6e7f8b34c3fdf8f0 Mon Sep 17 00:00:00 2001 From: Md Farhan Ishmam <45528856+farhanishmam@users.noreply.github.com> Date: Sun, 15 Mar 2026 23:06:47 -0600 Subject: [PATCH 1/4] Add files via upload --- .../generic_agent_with_training.py | 225 ++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 src/agentlab/agents/generic_agent/generic_agent_with_training.py diff --git a/src/agentlab/agents/generic_agent/generic_agent_with_training.py b/src/agentlab/agents/generic_agent/generic_agent_with_training.py new file mode 100644 index 000000000..b35697beb --- /dev/null +++ b/src/agentlab/agents/generic_agent/generic_agent_with_training.py @@ -0,0 +1,225 @@ +""" +GenericAgent with training data saving functionality. + +This module extends GenericAgent to save training data (system prompt, user prompt, and agent output) +for each step during benchmarking. This is useful for creating training datasets. +""" + +from copy import deepcopy +from dataclasses import asdict +from pathlib import Path +import json +import logging + +from browsergym.experiments.agent import AgentInfo + +from agentlab.agents import dynamic_prompting as dp +from agentlab.llm.llm_utils import Discussion, ParseError, SystemMessage, BaseMessage, retry +from agentlab.llm.tracking import cost_tracker_decorator + +from .generic_agent import GenericAgent, GenericAgentArgs +from .generic_agent_prompt import MainPrompt + +logger = logging.getLogger(__name__) + + +class GenericAgentWithTrainingArgs(GenericAgentArgs): + """Agent arguments for GenericAgentWithTraining.""" + + def __post_init__(self): + super().__post_init__() + try: + self.agent_name = f"GenericAgentWithTraining-{self.chat_model_args.model_name}".replace("/", "_") + except AttributeError: + pass + + def make_agent(self): + return GenericAgentWithTraining( + chat_model_args=self.chat_model_args, flags=self.flags, max_retry=self.max_retry + ) + + +class GenericAgentWithTraining(GenericAgent): + """ + GenericAgent extended with training data saving functionality. + + This agent saves: + 1. System prompt (separately) + 2. User prompt (with all context: observations including environment memory, history, etc.) + 3. Agent output (thinking/reasoning + action) + + All saved in the training_data/ directory within the experiment directory. + """ + + def __init__( + self, + chat_model_args, + flags, + max_retry: int = 4, + ): + super().__init__(chat_model_args, flags, max_retry) + + # Track current step for saving training data + self._current_step = None + self._training_data_dir = None + + @cost_tracker_decorator + def get_action(self, obs): + """Override get_action to save training data before and after LLM call.""" + + self.obs_history.append(obs) + main_prompt = MainPrompt( + action_set=self.action_set, + obs_history=self.obs_history, + actions=self.actions, + memories=self.memories, + thoughts=self.thoughts, + previous_plan=self.plan, + step=self.plan_step, + flags=self.flags, + ) + + max_prompt_tokens, max_trunc_itr = self._get_maxes() + + system_prompt = SystemMessage(dp.SystemPrompt().prompt) + + human_prompt = dp.fit_tokens( + shrinkable=main_prompt, + max_prompt_tokens=max_prompt_tokens, + model_name=self.chat_model_args.model_name, + max_iterations=max_trunc_itr, + additional_prompts=system_prompt, + ) + + # Save system prompt and user prompt right before LLM call + if self._training_data_dir is not None and self._current_step is not None: + self._save_training_prompts(system_prompt, human_prompt, self._current_step) + else: + logger.debug(f"Not saving training prompts: training_data_dir={self._training_data_dir}, current_step={self._current_step}") + + try: + # TODO, we would need to further shrink the prompt if the retry + # cause it to be too long + + chat_messages = Discussion([system_prompt, human_prompt]) + ans_dict = retry( + self.chat_llm, + chat_messages, + n_retry=self.max_retry, + parser=main_prompt._parse_answer, + ) + ans_dict["busted_retry"] = 0 + # inferring the number of retries, TODO: make this less hacky + ans_dict["n_retry"] = (len(chat_messages) - 3) / 2 + except ParseError as e: + ans_dict = dict( + action=None, + n_retry=self.max_retry + 1, + busted_retry=1, + ) + + stats = self.chat_llm.get_stats() + stats["n_retry"] = ans_dict["n_retry"] + stats["busted_retry"] = ans_dict["busted_retry"] + + self.plan = ans_dict.get("plan", self.plan) + self.plan_step = ans_dict.get("step", self.plan_step) + action = ans_dict["action"] + self.actions.append(action) + self.memories.append(ans_dict.get("memory", None)) + self.thoughts.append(ans_dict.get("think", None)) + + # Save the agent output (thinking + action) + if self._training_data_dir is not None and self._current_step is not None: + self._save_training_output(ans_dict, self._current_step) + else: + logger.debug(f"Not saving training output: training_data_dir={self._training_data_dir}, current_step={self._current_step}") + + agent_info = AgentInfo( + think=ans_dict.get("think", None), + chat_messages=chat_messages, + stats=stats, + extra_info={"chat_model_args": asdict(self.chat_model_args)}, + ) + return action, agent_info + + def _save_training_prompts(self, system_prompt: SystemMessage, human_prompt: BaseMessage, step: int): + """Save system prompt and user prompt separately for training data.""" + try: + logger.info(f"Saving training prompts for step {step} to {self._training_data_dir}") + training_dir = Path(self._training_data_dir) + training_dir.mkdir(parents=True, exist_ok=True) + + # Save system prompt + system_dict = { + "role": system_prompt.get("role", "system"), + "content": deepcopy(system_prompt.get("content", "")) + } + + system_file = training_dir / f"system_prompt_step_{step}.json" + with open(system_file, "w", encoding="utf-8") as f: + json.dump(system_dict, f, indent=2, ensure_ascii=False) + + system_text_file = training_dir / f"system_prompt_step_{step}.txt" + with open(system_text_file, "w", encoding="utf-8") as f: + f.write(str(system_prompt)) + + # Save user prompt (human prompt without system) + user_dict = { + "role": human_prompt.get("role", "user"), + "content": deepcopy(human_prompt.get("content", "")) + } + + user_file = training_dir / f"user_prompt_step_{step}.json" + with open(user_file, "w", encoding="utf-8") as f: + json.dump(user_dict, f, indent=2, ensure_ascii=False) + + user_text_file = training_dir / f"user_prompt_step_{step}.txt" + with open(user_text_file, "w", encoding="utf-8") as f: + f.write(str(human_prompt)) + + except Exception as e: + logger.warning(f"Failed to save training prompts for step {step}: {e}") + + def _save_training_output(self, ans_dict: dict, step: int): + """Save the agent output (thinking/reasoning + action) for training data.""" + try: + logger.info(f"Saving training output for step {step} to {self._training_data_dir}") + training_dir = Path(self._training_data_dir) + training_dir.mkdir(parents=True, exist_ok=True) + + # Extract only thinking and action from ans_dict + output_dict = { + "think": ans_dict.get("think", None), + "action": ans_dict.get("action", None), + } + + # Save as JSON + output_file = training_dir / f"agent_output_step_{step}.json" + with open(output_file, "w", encoding="utf-8") as f: + json.dump(output_dict, f, indent=2, ensure_ascii=False) + + # Also save a text version with thinking and action + output_text_parts = [] + if output_dict["think"]: + output_text_parts.append(f"Thinking:\n{output_dict['think']}\n") + if output_dict["action"]: + output_text_parts.append(f"Action:\n{output_dict['action']}\n") + + output_text_file = training_dir / f"agent_output_step_{step}.txt" + with open(output_text_file, "w", encoding="utf-8") as f: + f.write("\n".join(output_text_parts) if output_text_parts else "") + + except Exception as e: + logger.warning(f"Failed to save training output for step {step}: {e}") + + def set_training_data_dir(self, exp_dir: Path): + """Set the directory where training data should be saved.""" + self._training_data_dir = Path(exp_dir) / "training_data" + self._training_data_dir.mkdir(parents=True, exist_ok=True) + logger.info(f"Training data directory set to: {self._training_data_dir}") + + def set_current_step(self, step: int): + """Set the current step number for saving training data.""" + self._current_step = step + From 8aa3560fc57ec5bda48ddec3e220de41bce3d136 Mon Sep 17 00:00:00 2001 From: Md Farhan Ishmam <45528856+farhanishmam@users.noreply.github.com> Date: Sun, 15 Mar 2026 23:08:28 -0600 Subject: [PATCH 2/4] Add files via upload --- src/agentlab/experiments/loop.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py index 75dd9f407..d31e0b334 100644 --- a/src/agentlab/experiments/loop.py +++ b/src/agentlab/experiments/loop.py @@ -415,6 +415,11 @@ def run(self): try: logger.info(f"Running experiment {self.exp_name} in:\n {self.exp_dir}") agent = self.agent_args.make_agent() + + # Set training data directory for agents that support it + if hasattr(agent, 'set_training_data_dir'): + agent.set_training_data_dir(Path(self.exp_dir)) + if hasattr(agent, "set_task_name"): agent.set_task_name(self.env_args.task_name) @@ -436,6 +441,11 @@ def run(self): while not step_info.is_done: # set a limit logger.debug(f"Starting step {step_info.step}.") + + # Set current step for training data capture + if hasattr(agent, 'set_current_step'): + agent.set_current_step(step_info.step) + action = step_info.from_action(agent) logger.debug(f"Agent chose action:\n {action}") @@ -926,6 +936,8 @@ def _get_env_name(task_name: str): import browsergym.assistantbench elif task_name.startswith("weblinx"): import weblinx_browsergym + elif task_name.startswith("timewarp"): + import browsergym.timewarp return f"browsergym/{task_name}" From 6e634d9612aee971fe6b010aab7382b90b9ceaeb Mon Sep 17 00:00:00 2001 From: Md Farhan Ishmam <45528856+farhanishmam@users.noreply.github.com> Date: Tue, 17 Mar 2026 11:54:46 -0600 Subject: [PATCH 3/4] removed the data saving loop, ran black, updated readme --- README.md | 1 + .../generic_agent_with_training.py | 225 ------------------ src/agentlab/experiments/loop.py | 12 +- 3 files changed, 7 insertions(+), 231 deletions(-) delete mode 100644 src/agentlab/agents/generic_agent/generic_agent_with_training.py diff --git a/README.md b/README.md index 46c9dc4a9..40611b4dc 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,7 @@ AgentLab Features: | [Mind2Web-live](https://huggingface.co/datasets/iMeanAI/Mind2Web-Live) (soon) | - | - | None | - | - | live web | soon | | [MiniWoB](https://miniwob.farama.org/index.html) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/README.md) | 125 | Medium | 10 | no | self hosted (static files) | soon | | [OSWorld](https://os-world.github.io/) | [setup](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/benchmarks/osworld.md) | 369 | None | - | - | self hosted | soon | +| [TimeWarp](https://timewarp-web.github.io/) | [setup](https://github.com/sparklabutah/timewarp) | 1386 | None | 30 | yes | self hosted | soon | ## 🛠️ Setup AgentLab diff --git a/src/agentlab/agents/generic_agent/generic_agent_with_training.py b/src/agentlab/agents/generic_agent/generic_agent_with_training.py deleted file mode 100644 index b35697beb..000000000 --- a/src/agentlab/agents/generic_agent/generic_agent_with_training.py +++ /dev/null @@ -1,225 +0,0 @@ -""" -GenericAgent with training data saving functionality. - -This module extends GenericAgent to save training data (system prompt, user prompt, and agent output) -for each step during benchmarking. This is useful for creating training datasets. -""" - -from copy import deepcopy -from dataclasses import asdict -from pathlib import Path -import json -import logging - -from browsergym.experiments.agent import AgentInfo - -from agentlab.agents import dynamic_prompting as dp -from agentlab.llm.llm_utils import Discussion, ParseError, SystemMessage, BaseMessage, retry -from agentlab.llm.tracking import cost_tracker_decorator - -from .generic_agent import GenericAgent, GenericAgentArgs -from .generic_agent_prompt import MainPrompt - -logger = logging.getLogger(__name__) - - -class GenericAgentWithTrainingArgs(GenericAgentArgs): - """Agent arguments for GenericAgentWithTraining.""" - - def __post_init__(self): - super().__post_init__() - try: - self.agent_name = f"GenericAgentWithTraining-{self.chat_model_args.model_name}".replace("/", "_") - except AttributeError: - pass - - def make_agent(self): - return GenericAgentWithTraining( - chat_model_args=self.chat_model_args, flags=self.flags, max_retry=self.max_retry - ) - - -class GenericAgentWithTraining(GenericAgent): - """ - GenericAgent extended with training data saving functionality. - - This agent saves: - 1. System prompt (separately) - 2. User prompt (with all context: observations including environment memory, history, etc.) - 3. Agent output (thinking/reasoning + action) - - All saved in the training_data/ directory within the experiment directory. - """ - - def __init__( - self, - chat_model_args, - flags, - max_retry: int = 4, - ): - super().__init__(chat_model_args, flags, max_retry) - - # Track current step for saving training data - self._current_step = None - self._training_data_dir = None - - @cost_tracker_decorator - def get_action(self, obs): - """Override get_action to save training data before and after LLM call.""" - - self.obs_history.append(obs) - main_prompt = MainPrompt( - action_set=self.action_set, - obs_history=self.obs_history, - actions=self.actions, - memories=self.memories, - thoughts=self.thoughts, - previous_plan=self.plan, - step=self.plan_step, - flags=self.flags, - ) - - max_prompt_tokens, max_trunc_itr = self._get_maxes() - - system_prompt = SystemMessage(dp.SystemPrompt().prompt) - - human_prompt = dp.fit_tokens( - shrinkable=main_prompt, - max_prompt_tokens=max_prompt_tokens, - model_name=self.chat_model_args.model_name, - max_iterations=max_trunc_itr, - additional_prompts=system_prompt, - ) - - # Save system prompt and user prompt right before LLM call - if self._training_data_dir is not None and self._current_step is not None: - self._save_training_prompts(system_prompt, human_prompt, self._current_step) - else: - logger.debug(f"Not saving training prompts: training_data_dir={self._training_data_dir}, current_step={self._current_step}") - - try: - # TODO, we would need to further shrink the prompt if the retry - # cause it to be too long - - chat_messages = Discussion([system_prompt, human_prompt]) - ans_dict = retry( - self.chat_llm, - chat_messages, - n_retry=self.max_retry, - parser=main_prompt._parse_answer, - ) - ans_dict["busted_retry"] = 0 - # inferring the number of retries, TODO: make this less hacky - ans_dict["n_retry"] = (len(chat_messages) - 3) / 2 - except ParseError as e: - ans_dict = dict( - action=None, - n_retry=self.max_retry + 1, - busted_retry=1, - ) - - stats = self.chat_llm.get_stats() - stats["n_retry"] = ans_dict["n_retry"] - stats["busted_retry"] = ans_dict["busted_retry"] - - self.plan = ans_dict.get("plan", self.plan) - self.plan_step = ans_dict.get("step", self.plan_step) - action = ans_dict["action"] - self.actions.append(action) - self.memories.append(ans_dict.get("memory", None)) - self.thoughts.append(ans_dict.get("think", None)) - - # Save the agent output (thinking + action) - if self._training_data_dir is not None and self._current_step is not None: - self._save_training_output(ans_dict, self._current_step) - else: - logger.debug(f"Not saving training output: training_data_dir={self._training_data_dir}, current_step={self._current_step}") - - agent_info = AgentInfo( - think=ans_dict.get("think", None), - chat_messages=chat_messages, - stats=stats, - extra_info={"chat_model_args": asdict(self.chat_model_args)}, - ) - return action, agent_info - - def _save_training_prompts(self, system_prompt: SystemMessage, human_prompt: BaseMessage, step: int): - """Save system prompt and user prompt separately for training data.""" - try: - logger.info(f"Saving training prompts for step {step} to {self._training_data_dir}") - training_dir = Path(self._training_data_dir) - training_dir.mkdir(parents=True, exist_ok=True) - - # Save system prompt - system_dict = { - "role": system_prompt.get("role", "system"), - "content": deepcopy(system_prompt.get("content", "")) - } - - system_file = training_dir / f"system_prompt_step_{step}.json" - with open(system_file, "w", encoding="utf-8") as f: - json.dump(system_dict, f, indent=2, ensure_ascii=False) - - system_text_file = training_dir / f"system_prompt_step_{step}.txt" - with open(system_text_file, "w", encoding="utf-8") as f: - f.write(str(system_prompt)) - - # Save user prompt (human prompt without system) - user_dict = { - "role": human_prompt.get("role", "user"), - "content": deepcopy(human_prompt.get("content", "")) - } - - user_file = training_dir / f"user_prompt_step_{step}.json" - with open(user_file, "w", encoding="utf-8") as f: - json.dump(user_dict, f, indent=2, ensure_ascii=False) - - user_text_file = training_dir / f"user_prompt_step_{step}.txt" - with open(user_text_file, "w", encoding="utf-8") as f: - f.write(str(human_prompt)) - - except Exception as e: - logger.warning(f"Failed to save training prompts for step {step}: {e}") - - def _save_training_output(self, ans_dict: dict, step: int): - """Save the agent output (thinking/reasoning + action) for training data.""" - try: - logger.info(f"Saving training output for step {step} to {self._training_data_dir}") - training_dir = Path(self._training_data_dir) - training_dir.mkdir(parents=True, exist_ok=True) - - # Extract only thinking and action from ans_dict - output_dict = { - "think": ans_dict.get("think", None), - "action": ans_dict.get("action", None), - } - - # Save as JSON - output_file = training_dir / f"agent_output_step_{step}.json" - with open(output_file, "w", encoding="utf-8") as f: - json.dump(output_dict, f, indent=2, ensure_ascii=False) - - # Also save a text version with thinking and action - output_text_parts = [] - if output_dict["think"]: - output_text_parts.append(f"Thinking:\n{output_dict['think']}\n") - if output_dict["action"]: - output_text_parts.append(f"Action:\n{output_dict['action']}\n") - - output_text_file = training_dir / f"agent_output_step_{step}.txt" - with open(output_text_file, "w", encoding="utf-8") as f: - f.write("\n".join(output_text_parts) if output_text_parts else "") - - except Exception as e: - logger.warning(f"Failed to save training output for step {step}: {e}") - - def set_training_data_dir(self, exp_dir: Path): - """Set the directory where training data should be saved.""" - self._training_data_dir = Path(exp_dir) / "training_data" - self._training_data_dir.mkdir(parents=True, exist_ok=True) - logger.info(f"Training data directory set to: {self._training_data_dir}") - - def set_current_step(self, step: int): - """Set the current step number for saving training data.""" - self._current_step = step - diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py index d31e0b334..7c85bd924 100644 --- a/src/agentlab/experiments/loop.py +++ b/src/agentlab/experiments/loop.py @@ -415,11 +415,11 @@ def run(self): try: logger.info(f"Running experiment {self.exp_name} in:\n {self.exp_dir}") agent = self.agent_args.make_agent() - + # Set training data directory for agents that support it - if hasattr(agent, 'set_training_data_dir'): + if hasattr(agent, "set_training_data_dir"): agent.set_training_data_dir(Path(self.exp_dir)) - + if hasattr(agent, "set_task_name"): agent.set_task_name(self.env_args.task_name) @@ -441,11 +441,11 @@ def run(self): while not step_info.is_done: # set a limit logger.debug(f"Starting step {step_info.step}.") - + # Set current step for training data capture - if hasattr(agent, 'set_current_step'): + if hasattr(agent, "set_current_step"): agent.set_current_step(step_info.step) - + action = step_info.from_action(agent) logger.debug(f"Agent chose action:\n {action}") From f45a6bfe063f697c77cf677651e3e772b6c42a06 Mon Sep 17 00:00:00 2001 From: Md Farhan Ishmam <45528856+farhanishmam@users.noreply.github.com> Date: Tue, 17 Mar 2026 11:57:19 -0600 Subject: [PATCH 4/4] reverted to original loop.py --- src/agentlab/experiments/loop.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py index 7c85bd924..486e2f354 100644 --- a/src/agentlab/experiments/loop.py +++ b/src/agentlab/experiments/loop.py @@ -415,11 +415,6 @@ def run(self): try: logger.info(f"Running experiment {self.exp_name} in:\n {self.exp_dir}") agent = self.agent_args.make_agent() - - # Set training data directory for agents that support it - if hasattr(agent, "set_training_data_dir"): - agent.set_training_data_dir(Path(self.exp_dir)) - if hasattr(agent, "set_task_name"): agent.set_task_name(self.env_args.task_name) @@ -441,11 +436,6 @@ def run(self): while not step_info.is_done: # set a limit logger.debug(f"Starting step {step_info.step}.") - - # Set current step for training data capture - if hasattr(agent, "set_current_step"): - agent.set_current_step(step_info.step) - action = step_info.from_action(agent) logger.debug(f"Agent chose action:\n {action}")