From d76e5c38613af962e93a5b9377752c98849149a1 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:10:45 -0400 Subject: [PATCH 01/47] downgrading ubuntu version for github tests (#62) --- .github/workflows/unit_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 1b09ad6e5..a6b44f873 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -9,7 +9,7 @@ on: jobs: agentlab: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 defaults: run: From 3697c4d3d34a2830b144da80245e7b7be26ce1fd Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:33:18 -0400 Subject: [PATCH 02/47] Llm api update (#59) * getting rid of .invoke() * adding an AbstractChatModel * changing chat_api structure --- .../generic_agent/reproducibility_agent.py | 2 +- src/agentlab/llm/base_api.py | 33 ++++++ src/agentlab/llm/chat_api.py | 67 ++++++------ src/agentlab/llm/huggingface_utils.py | 101 +----------------- src/agentlab/llm/llm_utils.py | 2 +- tests/agents/test_agent.py | 10 +- tests/llm/test_chat_api.py | 4 +- tests/llm/test_llm_utils.py | 23 ++-- tests/llm/test_tracking.py | 6 +- 9 files changed, 88 insertions(+), 160 deletions(-) create mode 100644 src/agentlab/llm/base_api.py diff --git a/src/agentlab/agents/generic_agent/reproducibility_agent.py b/src/agentlab/agents/generic_agent/reproducibility_agent.py index b484ac7de..4cab3435d 100644 --- a/src/agentlab/agents/generic_agent/reproducibility_agent.py +++ b/src/agentlab/agents/generic_agent/reproducibility_agent.py @@ -43,7 +43,7 @@ def __init__(self, old_messages, delay=1) -> None: self.old_messages = old_messages self.delay = delay - def invoke(self, messages: list): + def __call__(self, messages: list): self.new_messages = copy(messages) if len(messages) >= len(self.old_messages): 
diff --git a/src/agentlab/llm/base_api.py b/src/agentlab/llm/base_api.py new file mode 100644 index 000000000..9c1ebf5ff --- /dev/null +++ b/src/agentlab/llm/base_api.py @@ -0,0 +1,33 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass + + +class AbstractChatModel(ABC): + @abstractmethod + def __call__(self, messages: list[dict]) -> dict: + pass + + def get_stats(self): + return {} + + +@dataclass +class BaseModelArgs(ABC): + """Base class for all model arguments.""" + + model_name: str + max_total_tokens: int = None + max_input_tokens: int = None + max_new_tokens: int = None + temperature: float = 0.1 + vision_support: bool = False + + @abstractmethod + def make_model(self) -> AbstractChatModel: + pass + + def prepare_server(self): + pass + + def close_server(self): + pass diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py index 139b2ca5e..a4df0a977 100644 --- a/src/agentlab/llm/chat_api.py +++ b/src/agentlab/llm/chat_api.py @@ -2,14 +2,17 @@ import os import re import time -from abc import ABC, abstractmethod from dataclasses import dataclass +from functools import partial +from typing import Optional import openai +from huggingface_hub import InferenceClient from openai import AzureOpenAI, OpenAI import agentlab.llm.tracking as tracking -from agentlab.llm.huggingface_utils import HuggingFaceURLChatModel +from agentlab.llm.base_api import AbstractChatModel, BaseModelArgs +from agentlab.llm.huggingface_utils import HFBaseChatModel def make_system_message(content: str) -> dict: @@ -24,10 +27,10 @@ def make_assistant_message(content: str) -> dict: return dict(role="assistant", content=content) -class CheatMiniWoBLLM: +class CheatMiniWoBLLM(AbstractChatModel): """For unit-testing purposes only. 
It only work with miniwob.click-test task.""" - def invoke(self, messages) -> str: + def __call__(self, messages) -> str: prompt = messages[-1]["content"] match = re.search(r"^\s*\[(\d+)\].*button", prompt, re.MULTILINE | re.IGNORECASE) @@ -44,12 +47,6 @@ def invoke(self, messages) -> str: """ return make_assistant_message(answer) - def __call__(self, messages) -> str: - return self.invoke(messages) - - def get_stats(self): - return {} - @dataclass class CheatMiniWoBLLMArgs: @@ -68,28 +65,6 @@ def close_server(self): pass -@dataclass -class BaseModelArgs(ABC): - """Base class for all model arguments.""" - - model_name: str - max_total_tokens: int = None - max_input_tokens: int = None - max_new_tokens: int = None - temperature: float = 0.1 - vision_support: bool = False - - @abstractmethod - def make_model(self) -> "ChatModel": - pass - - def prepare_server(self): - pass - - def close_server(self): - pass - - @dataclass class OpenRouterModelArgs(BaseModelArgs): """Serializable object for instantiating a generic chat model with an OpenAI @@ -221,7 +196,7 @@ def handle_error(error, itr, min_retry_wait_time, max_retry): return error_type -class ChatModel: +class ChatModel(AbstractChatModel): def __init__( self, model_name, @@ -310,9 +285,6 @@ def __call__(self, messages: list[dict]) -> dict: return make_assistant_message(completion.choices[0].message.content) - def invoke(self, messages: list[dict]) -> dict: - return self(messages) - def get_stats(self): return { "n_retry_llm": self.retries, @@ -401,3 +373,26 @@ def __init__( client_args=client_args, pricing_func=tracking.get_pricing_openai, ) + + +class HuggingFaceURLChatModel(HFBaseChatModel): + def __init__( + self, + model_name: str, + model_url: str, + token: Optional[str] = None, + temperature: Optional[int] = 1e-1, + max_new_tokens: Optional[int] = 512, + n_retry_server: Optional[int] = 4, + ): + super().__init__(model_name, n_retry_server) + if temperature < 1e-3: + logging.warning("Models might behave weirdly 
when temperature is too low.") + + if token is None: + token = os.environ["TGI_TOKEN"] + + client = InferenceClient(model=model_url, token=token) + self.llm = partial( + client.text_generation, temperature=temperature, max_new_tokens=max_new_tokens + ) diff --git a/src/agentlab/llm/huggingface_utils.py b/src/agentlab/llm/huggingface_utils.py index ce4dae067..470324bd6 100644 --- a/src/agentlab/llm/huggingface_utils.py +++ b/src/agentlab/llm/huggingface_utils.py @@ -1,17 +1,15 @@ import logging -import os import time -from functools import partial from typing import Any, List, Optional -from huggingface_hub import InferenceClient from pydantic import Field from transformers import AutoTokenizer, GPT2TokenizerFast +from agentlab.llm.base_api import AbstractChatModel from agentlab.llm.prompt_templates import PromptTemplate, get_prompt_template -class HFBaseChatModel: +class HFBaseChatModel(AbstractChatModel): """ Custom LLM Chatbot that can interface with HuggingFace models. @@ -94,101 +92,6 @@ def __call__( def _llm_type(self): return "huggingface" - def invoke(self, messages: list[dict]) -> dict: - return self(messages) - - def get_stats(self): - return {} - - -class HuggingFaceURLChatModel(HFBaseChatModel): - def __init__( - self, - model_name: str, - model_url: str, - token: Optional[str] = None, - temperature: Optional[int] = 1e-1, - max_new_tokens: Optional[int] = 512, - n_retry_server: Optional[int] = 4, - ): - super().__init__(model_name, n_retry_server) - if temperature < 1e-3: - logging.warning("Models might behave weirdly when temperature is too low.") - - if token is None: - token = os.environ["TGI_TOKEN"] - - client = InferenceClient(model=model_url, token=token) - self.llm = partial( - client.text_generation, temperature=temperature, max_new_tokens=max_new_tokens - ) - - -# def _convert_messages_to_dict(messages, column_remap={}): -# """ -# Converts a list of message objects into a list of dictionaries, categorizing each message by its role. 
- -# Each message is expected to be an instance of one of the following types: SystemMessage, HumanMessage, AIMessage. -# The function maps each message to its corresponding role ('system', 'user', 'assistant') and formats it into a dictionary. - -# Args: -# messages (list): A list of message objects. -# column_remap (dict): A dictionary that maps the column names to the desired output format. - -# Returns: -# list: A list of dictionaries where each dictionary represents a message and contains 'role' and 'content' keys. - -# Raises: -# ValueError: If an unsupported message type is encountered. - -# Example: -# >>> messages = [SystemMessage("System initializing..."), HumanMessage("Hello!"), AIMessage("How can I assist?")] -# >>> _convert_messages_to_dict(messages) -# [ -# {"role": "system", "content": "System initializing..."}, -# {"role": "user", "content": "Hello!"}, -# {"role": "assistant", "content": "How can I assist?"} -# ] -# """ - -# human_key = column_remap.get("HumanMessage", "user") -# ai_message_key = column_remap.get("AIMessage", "assistant") -# role_key = column_remap.get("role", "role") -# text_key = column_remap.get("text", "content") -# image_key = column_remap.get("image", "media_url") - -# # Mapping of message types to roles -# message_type_to_role = { -# SystemMessage: "system", -# HumanMessage: human_key, -# AIMessage: ai_message_key, -# } - -# def convert_format_vision(message_content, role, text_key, image_key): -# result = {} -# result["type"] = role -# for item in message_content: -# if item["type"] == "text": -# result[text_key] = item["text"] -# elif item["type"] == "image_url": -# result[image_key] = item["image_url"] -# return result - -# chat = [] -# for message in messages: -# message_role = message_type_to_role.get(type(message)) -# if message_role: -# if isinstance(message.content, str): -# chat.append({role_key: message_role, text_key: message.content}) -# else: -# chat.append( -# convert_format_vision(message.content, message_role, 
text_key, image_key) -# ) -# else: -# raise ValueError(f"Message type {type(message)} not supported") - -# return chat - def _prepend_system_to_first_user(messages, column_remap={}): # Initialize an index for the system message diff --git a/src/agentlab/llm/llm_utils.py b/src/agentlab/llm/llm_utils.py index 4b876b54f..c3d750098 100644 --- a/src/agentlab/llm/llm_utils.py +++ b/src/agentlab/llm/llm_utils.py @@ -79,7 +79,7 @@ def retry( """ tries = 0 while tries < n_retry: - answer = chat.invoke(messages) + answer = chat(messages) messages.append(answer) # TODO: could we change this to not use inplace modifications ? try: diff --git a/tests/agents/test_agent.py b/tests/agents/test_agent.py index ae1732892..0b2c31f28 100644 --- a/tests/agents/test_agent.py +++ b/tests/agents/test_agent.py @@ -50,7 +50,7 @@ class CheatMiniWoBLLM_ParseRetry: n_retry: int retry_count: int = 0 - def invoke(self, messages) -> str: + def __call__(self, messages) -> str: if self.retry_count < self.n_retry: self.retry_count += 1 return dict(role="assistant", content="I'm retrying") @@ -71,9 +71,6 @@ def invoke(self, messages) -> str: """ return dict(role="assistant", content=answer) - def __call__(self, messages) -> str: - return self.invoke(messages) - def get_stats(self): return {} @@ -94,7 +91,7 @@ class CheatLLM_LLMError: n_retry: int = 0 success: bool = False - def invoke(self, messages) -> str: + def __call__(self, messages) -> str: if self.success: prompt = messages[1].get("content", "") match = re.search(r"^\s*\[(\d+)\].*button", prompt, re.MULTILINE | re.IGNORECASE) @@ -113,9 +110,6 @@ def invoke(self, messages) -> str: return dict(role="assistant", content=answer) raise OpenAIError("LLM failed to respond") - def __call__(self, messages) -> str: - return self.invoke(messages) - def get_stats(self): return {"n_llm_retry": self.n_retry, "n_llm_busted_retry": int(not self.success)} diff --git a/tests/llm/test_chat_api.py b/tests/llm/test_chat_api.py index b49f35887..f06fa7fa4 100644 --- 
a/tests/llm/test_chat_api.py +++ b/tests/llm/test_chat_api.py @@ -35,7 +35,7 @@ def test_api_model_args_azure(): make_system_message("You are an helpful virtual assistant"), make_user_message("Give the third prime number"), ] - answer = model.invoke(messages) + answer = model(messages) assert "5" in answer.get("content") @@ -56,6 +56,6 @@ def test_api_model_args_openai(): make_system_message("You are an helpful virtual assistant"), make_user_message("Give the third prime number"), ] - answer = model.invoke(messages) + answer = model(messages) assert "5" in answer.get("content") diff --git a/tests/llm/test_llm_utils.py b/tests/llm/test_llm_utils.py index 1314bea03..7e5bb87cc 100644 --- a/tests/llm/test_llm_utils.py +++ b/tests/llm/test_llm_utils.py @@ -93,9 +93,12 @@ def test_compress_string(): # Mock ChatOpenAI class class MockChatOpenAI: - def invoke(self, messages): + def call(self, messages): return "mocked response" + def __call__(self, messages): + return self.call(messages) + def mock_parser(answer): if answer == "correct content": @@ -126,7 +129,7 @@ def mock_rate_limit_error(message: str, status_code: Literal[429] = 429) -> Rate # Test to ensure function stops retrying after reaching the max wait time # def test_rate_limit_max_wait_time(): # mock_chat = MockChatOpenAI() -# mock_chat.invoke = Mock( +# mock_chat.call = Mock( # side_effect=mock_rate_limit_error("Rate limit reached. Please try again in 2s.") # ) @@ -141,12 +144,12 @@ def mock_rate_limit_error(message: str, status_code: Literal[429] = 429) -> Rate # ) # # The function should stop retrying after 2 attempts (6s each time, 12s total which is greater than the 10s max wait time) -# assert mock_chat.invoke.call_count == 3 +# assert mock_chat.call.call_count == 3 # def test_rate_limit_success(): # mock_chat = MockChatOpenAI() -# mock_chat.invoke = Mock( +# mock_chat.call = Mock( # side_effect=[ # mock_rate_limit_error("Rate limit reached. 
Please try again in 2s."), # make_system_message("correct content"), @@ -163,7 +166,7 @@ def mock_rate_limit_error(message: str, status_code: Literal[429] = 429) -> Rate # ) # assert result == "Parsed value" -# assert mock_chat.invoke.call_count == 2 +# assert mock_chat.call.call_count == 2 # Mock a successful parser response to test function exit before max retries @@ -172,7 +175,7 @@ def test_successful_parse_before_max_retries(): # mock a chat that returns the wrong content the first 2 time, but the right # content on the 3rd time - mock_chat.invoke = Mock( + mock_chat.call = Mock( side_effect=[ make_system_message("wrong content"), make_system_message("wrong content"), @@ -183,7 +186,7 @@ def test_successful_parse_before_max_retries(): result = llm_utils.retry(mock_chat, [], 5, mock_parser) assert result == "Parsed value" - assert mock_chat.invoke.call_count == 3 + assert mock_chat.call.call_count == 3 def test_unsuccessful_parse_before_max_retries(): @@ -191,7 +194,7 @@ def test_unsuccessful_parse_before_max_retries(): # mock a chat that returns the wrong content the first 2 time, but the right # content on the 3rd time - mock_chat.invoke = Mock( + mock_chat.call = Mock( side_effect=[ make_system_message("wrong content"), make_system_message("wrong content"), @@ -201,12 +204,12 @@ def test_unsuccessful_parse_before_max_retries(): with pytest.raises(llm_utils.ParseError): result = llm_utils.retry(mock_chat, [], 2, mock_parser) - assert mock_chat.invoke.call_count == 2 + assert mock_chat.call.call_count == 2 def test_retry_parse_raises(): mock_chat = MockChatOpenAI() - mock_chat.invoke = Mock(return_value=make_system_message("mocked response")) + mock_chat.call = Mock(return_value=make_system_message("mocked response")) parser_raises = Mock(side_effect=ValueError("Parser error")) with pytest.raises(ValueError): diff --git a/tests/llm/test_tracking.py b/tests/llm/test_tracking.py index cc5abd36f..01ebcc067 100644 --- a/tests/llm/test_tracking.py +++ 
b/tests/llm/test_tracking.py @@ -136,7 +136,7 @@ def test_openai_chat_model(): make_user_message("Give the third prime number"), ] with tracking.set_tracker() as tracker: - answer = chat_model.invoke(messages) + answer = chat_model(messages) assert "5" in answer.get("content") assert tracker.stats["cost"] > 0 @@ -161,7 +161,7 @@ def test_azure_chat_model(): make_user_message("Give the third prime number"), ] with tracking.set_tracker() as tracker: - answer = chat_model.invoke(messages) + answer = chat_model(messages) assert "5" in answer.get("content") assert tracker.stats["cost"] > 0 @@ -178,6 +178,6 @@ def test_openrouter_chat_model(): make_user_message("Give the third prime number"), ] with tracking.set_tracker() as tracker: - answer = chat_model.invoke(messages) + answer = chat_model(messages) assert "5" in answer.get("content") assert tracker.stats["cost"] > 0 From 9f58f15ad3fae3e84f7d7959f1acfb8a104b63ed Mon Sep 17 00:00:00 2001 From: Alexandre Lacoste Date: Wed, 16 Oct 2024 07:39:36 -0400 Subject: [PATCH 03/47] Reproducibility again (#61) * core functions * switch to dask * removing joblib dependency and adding dask * fixing imports * handles multiple backends * ensure asyncio loop creation * more tests * setting dashboard address to None * minor * Finally found a way to make it work * initial reproducibility files * Seems to be superflus * adding a reproducibility journal * minor update * more robust * adding reproducibility tools * fix white listing * minor * minor * minor * minor * minor fix * more tests * more results yay * disabling this test * update * update * black * maybe fixing github workflow ? 
* make get_git_username great again * trigger change * new browsergym * GPT-4o result (and new comment column) * Seems like there was a change to 4o flags, trying these * minor comment * better xray * minor fix * addming a comment field * new agent * another test with GPT-4o * adding llama3 from openrouter * fix naming * unused import * new summary tools and remove "_args" from columns in results * add Llama * initial code for reproducibility agent * adjust inspect results * infer from benchmark * fix reproducibility agent * prevent the repro_dir to be an index variable * updating repro agent stats * Reproducibility agent * instructions to setup workarena * fixing tests * handles better a few edge cases * default progress function to None * minor formatting * minor * initial commit * refactoring with Study class * refactor to adapt for study class * minor * fix pricy test * fixing tests * tmp * print report * minor fix * refine little details about reproducibility * minor * no need for set_temp anymore * sanity check before running main * minor update * minor * new results with 4o on workarena.l1 * sharing is caring * add llama to main.py * new hournal entry * lamma 3 70B * minor * typo * black fix (wasn't configured) --------- Co-authored-by: Thibault Le Sellier de Chezelles --- reproducibility_journal.csv | 1 + .../generic_agent/reproducibility_agent.py | 20 +++++++++++++++---- src/agentlab/experiments/reproduce_study.py | 8 ++------ src/agentlab/experiments/study_generators.py | 7 +++++-- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index 8eeb33bad..df2ff7478 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -9,3 +9,4 @@ recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-10-01_11-45-23,0 recursix,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.3.2,2024-10-05_13-21-27,0.23,0.023,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 
2024),3.12.6,1.39.0,0.2.1,aadf86b397cd36c581e1a61e491aec649ac5a140, M: main.py,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, recursix,GenericAgent-gpt-4o-2024-05-13,workarena.l1,0.3.2,2024-10-05_15-45-42,0.382,0.027,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,ab447e997af589bbd022de7a5189a7685ddfa6ef,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, recursix,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob_tiny_test,0.7.0,2024-10-05_17-49-15,1.0,0.0,0,4/4,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,a98fa24426a6ddde8443e8be44ed94cd9522e5ca,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, +recursix,GenericAgent-meta-llama_llama-3-70b-instruct,workarena.l1,0.3.2,2024-10-09_21-16-37,0.176,0.021,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,c847dbd334184271b32b252409a1b6c1042d7442,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, diff --git a/src/agentlab/agents/generic_agent/reproducibility_agent.py b/src/agentlab/agents/generic_agent/reproducibility_agent.py index 4cab3435d..c197b76e9 100644 --- a/src/agentlab/agents/generic_agent/reproducibility_agent.py +++ b/src/agentlab/agents/generic_agent/reproducibility_agent.py @@ -56,6 +56,9 @@ def __call__(self, messages: list): # return the next message in the list return old_response + def get_stats(self): + return {} + @dataclass class ReproAgentArgs(GenericAgentArgs): @@ -102,6 +105,14 @@ def get_action(self, obs): ) return None, agent_info + # an old bug prevented the response from being saved. 
+ if len(old_chat_messages) == 2: + recorded_action = step_info.action + if recorded_action: + # Recreate the 3rd message based on the recorded action + assistant_message = make_assistant_message(f"{recorded_action}") + old_chat_messages.append(assistant_message) + self.chat_llm = ReproChatModel(old_chat_messages) action, agent_info = super().get_action(obs) @@ -128,27 +139,28 @@ def _format_messages(messages: list[dict]): return "\n".join(f"{m['role']} message:\n{m['content']}\n" for m in messages) -def reproduce_study(original_study_dir: Path | str): +def reproduce_study(original_study_dir: Path | str, log_level=logging.INFO): """Reproduce a study by running the same experiments with the same agent.""" original_study_dir = Path(original_study_dir) study_name = f"reproducibility_of_{original_study_dir.name}" - exp_args_list = [] + exp_args_list: list[ExpArgs] = [] for exp_result in yield_all_exp_results(original_study_dir, progress_fn=None): agent_args = make_repro_agent(exp_result.exp_args.agent_args, exp_dir=exp_result.exp_dir) exp_args_list.append( ExpArgs( agent_args=agent_args, env_args=exp_result.exp_args.env_args, - logging_level=logging.DEBUG, + logging_level=log_level, ) ) + benchmark_name = exp_args_list[0].env_args.task_name.split(".")[0] return Study( exp_args_list=exp_args_list, - benchmark_name="repro_study", + benchmark_name=benchmark_name, agent_names=[agent_args.agent_name], ) diff --git a/src/agentlab/experiments/reproduce_study.py b/src/agentlab/experiments/reproduce_study.py index 3c2dd0aef..93ef07fbd 100644 --- a/src/agentlab/experiments/reproduce_study.py +++ b/src/agentlab/experiments/reproduce_study.py @@ -5,18 +5,14 @@ the diff in HTML format. 
""" -import logging - from agentlab.agents.generic_agent.reproducibility_agent import reproduce_study from agentlab.experiments.exp_utils import RESULTS_DIR -logging.getLogger().setLevel(logging.INFO) - if __name__ == "__main__": - old_study = "2024-06-02_18-16-17_final_run" - # old_study = "2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on_miniwob_tiny_test" + # old_study = "2024-06-03_13-53-50_final_run_workarena_L1_llama3-70b" + old_study = "2024-06-03_12-28-51_final_run_miniwob_llama3-70b" study = reproduce_study(RESULTS_DIR / old_study) n_jobs = 1 diff --git a/src/agentlab/experiments/study_generators.py b/src/agentlab/experiments/study_generators.py index e079ba7f1..3a2567d51 100644 --- a/src/agentlab/experiments/study_generators.py +++ b/src/agentlab/experiments/study_generators.py @@ -153,7 +153,10 @@ def set_demo_mode(env_args_list: list[EnvArgs]): def run_agents_on_benchmark( - agents: list[AgentArgs] | AgentArgs = AGENT_4o_MINI, benchmark: str = "miniwob", demo_mode=False + agents: list[AgentArgs] | AgentArgs = AGENT_4o_MINI, + benchmark: str = "miniwob", + demo_mode=False, + log_level=logging.INFO, ): """Run one or multiple agents on a benchmark. 
@@ -190,7 +193,7 @@ def run_agents_on_benchmark( ExpArgs( agent_args=args.CrossProd(agents), env_args=args.CrossProd(env_args_list), - logging_level=logging.DEBUG, + logging_level=log_level, ) ) From 79b92028caddfd47c690e58ceb4a1ef3b1bd7be7 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Wed, 16 Oct 2024 09:51:50 -0400 Subject: [PATCH 04/47] version bump --- src/agentlab/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/__init__.py b/src/agentlab/__init__.py index 3ced3581b..b5fdc7530 100644 --- a/src/agentlab/__init__.py +++ b/src/agentlab/__init__.py @@ -1 +1 @@ -__version__ = "0.2.1" +__version__ = "0.2.2" From b09ea9319904f48344716de847fc8090e3f5bc43 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Wed, 16 Oct 2024 16:15:48 -0400 Subject: [PATCH 05/47] Patching minor stuff (#69) * fixing sample_std for single experience * making gradio shared server non default * missing requirement for xray --- requirements.txt | 3 ++- src/agentlab/analyze/agent_xray.py | 3 ++- src/agentlab/analyze/inspect_results.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index e96fa61ee..aa528c18a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,5 @@ pyyaml>=6 pandas gradio gitpython # for the reproducibility script -requests \ No newline at end of file +requests +matplotlib \ No newline at end of file diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 228901b39..8c43ef046 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -482,7 +482,8 @@ def run_gradio(results_dir: Path): tabs.select(tab_select) demo.queue() - demo.launch(server_port=int(os.getenv("AGENTXRAY_APP_PORT", 7899)), share=True) + do_share = os.getenv("AGENTXRAY_SHARE_SERVER", "false").lower() == "true" + demo.launch(server_port=int(os.getenv("AGENTXRAY_APP_PORT", 7899)), 
share=do_share) def tab_select(evt: gr.SelectData): diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 7d46113c9..d33cc756e 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -245,9 +245,9 @@ def get_std_err(df, metric): if np.all(np.isin(data, [0, 1])): mean = np.mean(data) std_err = np.sqrt(mean * (1 - mean) / len(data)) + return mean, std_err else: return get_sample_std_err(df, metric) - return mean, std_err def get_sample_std_err(df, metric): @@ -258,7 +258,7 @@ def get_sample_std_err(df, metric): mean = np.mean(data) std_err = np.std(data, ddof=1) / np.sqrt(len(data)) if np.isnan(std_err): - std_err = 0 + std_err = np.float64(0) return mean, std_err From 3ceaa0fd658d4eeb4910b55d29e4fdc35129400d Mon Sep 17 00:00:00 2001 From: Xing Han Lu <21180505+xhluca@users.noreply.github.com> Date: Thu, 17 Oct 2024 09:31:14 -0400 Subject: [PATCH 06/47] Improve agent xray app (#70) * 0.2.2 Release (#67) * downgrading ubuntu version for github tests (#62) * Llm api update (#59) * getting rid of .invoke() * adding an AbstractChatModel * changing chat_api structure * Reproducibility again (#61) * core functions * switch to dask * removing joblib dependency and adding dask * fixing imports * handles multiple backends * ensure asyncio loop creation * more tests * setting dashboard address to None * minor * Finally found a way to make it work * initial reproducibility files * Seems to be superflus * adding a reproducibility journal * minor update * more robust * adding reproducibility tools * fix white listing * minor * minor * minor * minor * minor fix * more tests * more results yay * disabling this test * update * update * black * maybe fixing github workflow ? 
* make get_git_username great again * trigger change * new browsergym * GPT-4o result (and new comment column) * Seems like there was a change to 4o flags, trying these * minor comment * better xray * minor fix * addming a comment field * new agent * another test with GPT-4o * adding llama3 from openrouter * fix naming * unused import * new summary tools and remove "_args" from columns in results * add Llama * initial code for reproducibility agent * adjust inspect results * infer from benchmark * fix reproducibility agent * prevent the repro_dir to be an index variable * updating repro agent stats * Reproducibility agent * instructions to setup workarena * fixing tests * handles better a few edge cases * default progress function to None * minor formatting * minor * initial commit * refactoring with Study class * refactor to adapt for study class * minor * fix pricy test * fixing tests * tmp * print report * minor fix * refine little details about reproducibility * minor * no need for set_temp anymore * sanity check before running main * minor update * minor * new results with 4o on workarena.l1 * sharing is caring * add llama to main.py * new hournal entry * lamma 3 70B * minor * typo * black fix (wasn't configured) --------- Co-authored-by: Thibault Le Sellier de Chezelles * version bump --------- Co-authored-by: Alexandre Lacoste * Make share=TRue into a environment variable, disabled by default for security * fix floating point issue with std_reward in agent xray * Update src/agentlab/analyze/inspect_results.py * Update src/agentlab/analyze/agent_xray.py --------- Co-authored-by: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Co-authored-by: Alexandre Lacoste --- src/agentlab/analyze/agent_xray.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 8c43ef046..58461a386 100644 --- a/src/agentlab/analyze/agent_xray.py +++ 
b/src/agentlab/analyze/agent_xray.py @@ -482,8 +482,9 @@ def run_gradio(results_dir: Path): tabs.select(tab_select) demo.queue() - do_share = os.getenv("AGENTXRAY_SHARE_SERVER", "false").lower() == "true" - demo.launch(server_port=int(os.getenv("AGENTXRAY_APP_PORT", 7899)), share=do_share) + + do_share = os.getenv("AGENTXRAY_SHARE_GRADIO", "false").lower() == "true" + demo.launch(server_port=int(os.getenv("AGENTXRAY_APP_PORT", "7899")), share=do_share) def tab_select(evt: gr.SelectData): From 7bba275c004f1f90dfd83eaaab963ab5066e2baf Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Thu, 17 Oct 2024 10:10:10 -0400 Subject: [PATCH 07/47] added tmlr definitive config (#71) --- .../agents/generic_agent/tmlr_config.py | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 src/agentlab/agents/generic_agent/tmlr_config.py diff --git a/src/agentlab/agents/generic_agent/tmlr_config.py b/src/agentlab/agents/generic_agent/tmlr_config.py new file mode 100644 index 000000000..11860e691 --- /dev/null +++ b/src/agentlab/agents/generic_agent/tmlr_config.py @@ -0,0 +1,72 @@ +from copy import deepcopy + +from agentlab.agents import dynamic_prompting as dp +from agentlab.experiments import args +from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT + +from .generic_agent import GenericAgentArgs +from .generic_agent_prompt import GenericPromptFlags + +BASE_FLAGS = GenericPromptFlags( + obs=dp.ObsFlags( + use_html=False, + use_ax_tree=True, + use_focused_element=True, + use_error_logs=True, + use_history=True, + use_past_error_logs=False, + use_action_history=True, + use_think_history=True, # gpt-4o config except for this line + use_diff=False, + html_type="pruned_html", + use_screenshot=False, + use_som=False, + extract_visible_tag=True, + extract_clickable_tag=True, + extract_coords="False", + filter_visible_elements_only=False, + ), + action=dp.ActionFlags( + multi_actions=False, + action_set="bid", + 
long_description=False, + individual_examples=False, + ), + use_plan=False, + use_criticise=False, + use_thinking=True, + use_memory=False, + use_concrete_example=True, + use_abstract_example=True, + use_hints=True, + enable_chat=False, + max_prompt_tokens=None, + be_cautious=True, + extra_instructions=None, +) + + +def get_base_agent(llm_config: str): + return GenericAgentArgs( + chat_model_args=CHAT_MODEL_ARGS_DICT[llm_config], + flags=BASE_FLAGS, + ) + + +def get_vision_agent(llm_config: str): + flags = deepcopy(BASE_FLAGS) + flags.obs.use_screenshot = True + return GenericAgentArgs( + chat_model_args=CHAT_MODEL_ARGS_DICT[llm_config], + flags=flags, + ) + + +def get_som_agent(llm_config: str): + flags = deepcopy(BASE_FLAGS) + flags.obs.use_screenshot = True + flags.obs.use_som = True + return GenericAgentArgs( + chat_model_args=CHAT_MODEL_ARGS_DICT[llm_config], + flags=flags, + ) From 1b6b217d5818cb5d413077f678f3fff55b081f84 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Fri, 18 Oct 2024 23:33:43 -0400 Subject: [PATCH 08/47] downgrading gradio version (#77) --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index aa528c18a..037e495f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,7 +16,7 @@ contexttimer ipython pyyaml>=6 pandas -gradio +gradio==4.* gitpython # for the reproducibility script requests -matplotlib \ No newline at end of file +matplotlib From 98acd0cbc468418f2c00a64a30acc2508f4bef75 Mon Sep 17 00:00:00 2001 From: Alexandre Lacoste Date: Sat, 19 Oct 2024 21:13:57 -0400 Subject: [PATCH 09/47] Study refactor (#73) * adapting to new Benchmark class * fixing tests * fix tests * typo * not ready for gradio 5 * study id and a few fixes * fixing pricy tests --------- Co-authored-by: ThibaultLSDC --- main.py | 22 +- reproducibility_journal.csv | 22 +- requirements.txt | 2 +- src/agentlab/agents/agent_args.py | 3 +- 
src/agentlab/agents/dynamic_prompting.py | 41 ++- .../agents/generic_agent/agent_configs.py | 41 ++- .../agents/generic_agent/generic_agent.py | 19 +- .../generic_agent/generic_agent_prompt.py | 76 +--- .../generic_agent/reproducibility_agent.py | 6 +- src/agentlab/analyze/agent_xray.py | 18 +- src/agentlab/analyze/inspect_results.ipynb | 4 +- src/agentlab/analyze/inspect_results.py | 36 +- src/agentlab/experiments/launch_exp.py | 6 +- .../experiments/reproducibility_util.py | 112 ++---- src/agentlab/experiments/study.py | 343 ++++++++++++++++++ src/agentlab/experiments/study_generators.py | 272 -------------- src/agentlab/experiments/task_collections.py | 212 ----------- tests/agents/test_generic_prompt.py | 18 +- tests/experiments/test_exp_configs.py | 21 +- tests/experiments/test_launch_exp.py | 15 +- .../experiments/test_reproducibility_util.py | 128 +++---- tests/experiments/test_task_collection.py | 27 -- 22 files changed, 567 insertions(+), 877 deletions(-) create mode 100644 src/agentlab/experiments/study.py delete mode 100644 src/agentlab/experiments/study_generators.py delete mode 100644 src/agentlab/experiments/task_collections.py delete mode 100644 tests/experiments/test_task_collection.py diff --git a/main.py b/main.py index 7a038b6a9..1d0ba4464 100644 --- a/main.py +++ b/main.py @@ -6,8 +6,8 @@ repository. 
""" +import bgym import logging - from agentlab.agents.generic_agent import ( RANDOM_SEARCH_AGENT, AGENT_4o, @@ -15,8 +15,7 @@ AGENT_LLAMA3_70B, AGENT_LLAMA31_70B, ) -from agentlab.analyze.inspect_results import get_most_recent_folder -from agentlab.experiments import study_generators +from agentlab.experiments.study import Study logging.getLogger().setLevel(logging.INFO) @@ -24,12 +23,13 @@ agent_args = [AGENT_4o_MINI] # agent_args = [AGENT_4o] -## select the benchmark to run on + +# ## select the benchmark to run on benchmark = "miniwob_tiny_test" -# benchmark = "miniwob" -# benchmark = "workarena.l1" -# benchmark = "workarena.l2" -# benchmark = "workarena.l3" +# benchmark = "miniwob_all" +# benchmark = "workarena_l1" +# benchmark = "workarena_l2" +# benchmark = "workarena_l3" # benchmark = "webarena" # Set reproducibility_mode = True for reproducibility @@ -53,11 +53,11 @@ if relaunch: # relaunch an existing study - study_dir = get_most_recent_folder() - study = study_generators.make_relaunch_study(study_dir, relaunch_mode="incomplete_or_error") + study = Study.load_most_recent() + study.find_incomplete(relaunch_mode="incomplete_or_error") else: - study = study_generators.run_agents_on_benchmark(agent_args, benchmark) + study = Study(agent_args, benchmark) study.run(n_jobs=n_jobs, parallel_backend="joblib", strict_reproducibility=reproducibility_mode) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index df2ff7478..03158422d 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -1,12 +1,12 @@ -git_user,agent_name,benchmark,benchmark_version,date,avg_reward,std_err,n_err,n_completed,comment,os,python_version,playwright_version,agentlab_version,agentlab_git_hash,agentlab__local_modifications,browsergym_version,browsergym_git_hash,browsergym__local_modifications -recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-07-34,0.75,0.217,0,4/4,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 
21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, -recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-28-58,1.0,0.0,0,4/4,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18," M: reproducibility_journal.csv +git_user,agent_name,benchmark,benchmark_version,date,study_id,avg_reward,std_err,n_err,n_completed,comment,os,python_version,playwright_version,agentlab_version,agentlab_git_hash,agentlab__local_modifications,browsergym_version,browsergym_git_hash,browsergym__local_modifications +recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-07-34,,0.75,0.217,0,4/4,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, +recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-28-58,,1.0,0.0,0,4/4,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18," M: reproducibility_journal.csv M: src/agentlab/experiments/task_collections.py",0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, -recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,0.546,0.02,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, -recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-20_22-09-43,0.656,0.019,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; 
root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,f6216486d5faac2c8b3fb0a63e114e5a4bafde47,,0.6.4,8cef8fe34940ff490d0cc06b0c8f100180d09d43, -recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-21_12-04-39,0.656,0.019,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe561b93c5f053e9f9625358862f542523b5e14a,,0.7.0,ed6d6992ef64bfb91aca7002d33cb6ed5ec031ef, -recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-10-01_11-45-23,0.539,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe27819a99b163fd9240ba3e144e010413bff24d,,0.7.1,b0ad675572e01cac0d7255100112de0828877148, -recursix,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.3.2,2024-10-05_13-21-27,0.23,0.023,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,aadf86b397cd36c581e1a61e491aec649ac5a140, M: main.py,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, -recursix,GenericAgent-gpt-4o-2024-05-13,workarena.l1,0.3.2,2024-10-05_15-45-42,0.382,0.027,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,ab447e997af589bbd022de7a5189a7685ddfa6ef,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, -recursix,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob_tiny_test,0.7.0,2024-10-05_17-49-15,1.0,0.0,0,4/4,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,a98fa24426a6ddde8443e8be44ed94cd9522e5ca,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, -recursix,GenericAgent-meta-llama_llama-3-70b-instruct,workarena.l1,0.3.2,2024-10-09_21-16-37,0.176,0.021,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,c847dbd334184271b32b252409a1b6c1042d7442,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, 
+recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,,0.546,0.02,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, +recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-20_22-09-43,,0.656,0.019,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,f6216486d5faac2c8b3fb0a63e114e5a4bafde47,,0.6.4,8cef8fe34940ff490d0cc06b0c8f100180d09d43, +recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-21_12-04-39,,0.656,0.019,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe561b93c5f053e9f9625358862f542523b5e14a,,0.7.0,ed6d6992ef64bfb91aca7002d33cb6ed5ec031ef, +recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-10-01_11-45-23,,0.539,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe27819a99b163fd9240ba3e144e010413bff24d,,0.7.1,b0ad675572e01cac0d7255100112de0828877148, +recursix,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.3.2,2024-10-05_13-21-27,,0.23,0.023,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,aadf86b397cd36c581e1a61e491aec649ac5a140," M: main.py",0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, +recursix,GenericAgent-gpt-4o-2024-05-13,workarena.l1,0.3.2,2024-10-05_15-45-42,,0.382,0.027,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,ab447e997af589bbd022de7a5189a7685ddfa6ef,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, +recursix,GenericAgent-meta-llama_llama-3-70b-instruct,workarena.l1,0.3.2,2024-10-09_21-16-37,,0.176,0.021,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 
13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,c847dbd334184271b32b252409a1b6c1042d7442,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, +recursix,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob_tiny_test,0.7.0,2024-10-05_17-49-15,,1.0,0.0,0,4/4,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,a98fa24426a6ddde8443e8be44ed94cd9522e5ca,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, diff --git a/requirements.txt b/requirements.txt index 037e495f1..5d7fcb8f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,7 +16,7 @@ contexttimer ipython pyyaml>=6 pandas -gradio==4.* +gradio<5 gitpython # for the reproducibility script requests matplotlib diff --git a/src/agentlab/agents/agent_args.py b/src/agentlab/agents/agent_args.py index 0e0d6d8b9..40810f6b8 100644 --- a/src/agentlab/agents/agent_args.py +++ b/src/agentlab/agents/agent_args.py @@ -1,9 +1,10 @@ from bgym import AbstractAgentArgs +import bgym class AgentArgs(AbstractAgentArgs): - def set_benchmark(self, benchmark: str, demo_mode: bool): + def set_benchmark(self, benchmark: bgym.Benchmark, demo_mode: bool): """Optional method to set benchmark specific flags. This allows the agent to have minor adjustments based on the benchmark. 
diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index 47aed2264..fdeecaac1 100644 --- a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -1,5 +1,4 @@ import abc -import difflib import logging import platform import time @@ -9,6 +8,7 @@ from typing import Literal from warnings import warn +import bgym from browsergym.core.action.base import AbstractActionSet from browsergym.core.action.highlevel import HighLevelActionSet from browsergym.core.action.python import PythonActionSet @@ -94,13 +94,14 @@ class ObsFlags(Flags): @dataclass class ActionFlags(Flags): - multi_actions: bool = False - action_set: str = "bid" - is_strict: bool = False - demo_mode: Literal["off", "default", "all_blue", "only_visible_elements"] = "off" + action_set: bgym.HighLevelActionSetArgs = None # should be set by the set_benchmark method long_description: bool = True individual_examples: bool = False + # for backward compatibility + multi_actions: bool = None + is_strict: bool = None + class PromptElement: """Base class for all prompt elements. Prompt elements can be hidden.""" @@ -592,24 +593,24 @@ def _parse_answer(self, text_answer): return ans_dict -def make_action_set(action_flags: ActionFlags) -> AbstractActionSet: +# def make_action_set(action_flags: ActionFlags) -> AbstractActionSet: - if action_flags.action_set == "python": - action_set = PythonActionSet(strict=action_flags.is_strict) - if action_flags.demo_mode != "off": - warn( - f'Action_set "python" is incompatible with demo_mode={repr(action_flags.demo_mode)}.' - ) - return action_set +# if action_flags.action_set == "python": +# action_set = PythonActionSet(strict=action_flags.is_strict) +# if action_flags.demo_mode != "off": +# warn( +# f'Action_set "python" is incompatible with demo_mode={repr(action_flags.demo_mode)}.' 
+# ) +# return action_set - action_set = HighLevelActionSet( - subsets=list(set(["chat"] + ["infeas"] + action_flags.action_set.split("+"))), - multiaction=action_flags.multi_actions, - strict=action_flags.is_strict, - demo_mode=action_flags.demo_mode, - ) +# action_set = HighLevelActionSet( +# subsets=list(set(["chat"] + ["infeas"] + action_flags.action_set.split("+"))), +# multiaction=action_flags.multi_actions, +# strict=action_flags.is_strict, +# demo_mode=action_flags.demo_mode, +# ) - return action_set +# return action_set class Think(PromptElement): diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py index 4c0a39a74..2d4f9bc39 100644 --- a/src/agentlab/agents/generic_agent/agent_configs.py +++ b/src/agentlab/agents/generic_agent/agent_configs.py @@ -1,3 +1,4 @@ +import bgym from agentlab.agents import dynamic_prompting as dp from agentlab.experiments import args from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT @@ -25,8 +26,10 @@ filter_visible_elements_only=False, ), action=dp.ActionFlags( - multi_actions=False, - action_set="bid", + action_set=bgym.HighLevelActionSetArgs( + subsets=["bid"], + multiaction=False, + ), long_description=False, individual_examples=True, ), @@ -71,8 +74,10 @@ filter_visible_elements_only=False, ), action=dp.ActionFlags( - multi_actions=False, # often detrimental - action_set="bid", + action_set=bgym.HighLevelActionSetArgs( + subsets=["bid"], + multiaction=False, + ), long_description=False, individual_examples=True, ), @@ -116,8 +121,10 @@ filter_visible_elements_only=False, ), action=dp.ActionFlags( - multi_actions=False, - action_set="bid", + action_set=bgym.HighLevelActionSetArgs( + subsets=["bid"], + multiaction=False, + ), long_description=False, individual_examples=True, ), @@ -164,8 +171,10 @@ filter_visible_elements_only=False, ), action=dp.ActionFlags( - multi_actions=True, - action_set="bid", + action_set=bgym.HighLevelActionSetArgs( + 
subsets=["bid"], + multiaction=True, + ), long_description=False, individual_examples=True, ), @@ -210,8 +219,10 @@ filter_visible_elements_only=False, ), action=dp.ActionFlags( - multi_actions=False, - action_set="bid", + action_set=bgym.HighLevelActionSetArgs( + subsets=["bid"], + multiaction=False, + ), long_description=False, individual_examples=False, ), @@ -270,10 +281,12 @@ filter_visible_elements_only=args.Choice([True, False], p=[0.3, 0.7]), ), action=dp.ActionFlags( - multi_actions=args.Choice([True, False], p=[0.7, 0.3]), - action_set=args.Choice(["bid", "bid+coord"]), - # action_set=args.Choice(["python", "bid", "coord", - # "bid+coord"]), + action_set=bgym.HighLevelActionSetArgs( + subsets=args.Choice([["bid"], ["bid", "coord"]]), + multiaction=args.Choice([True, False], p=[0.7, 0.3]), + ), + long_description=False, + individual_examples=False, ), # drop_ax_tree_first=True, # this flag is no longer active, according to browsergym doc use_plan=args.Choice([True, False]), diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index 7c65e3cd6..622d25414 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -1,7 +1,9 @@ +from copy import deepcopy from dataclasses import asdict, dataclass from functools import partial from warnings import warn +import bgym from browsergym.experiments.agent import Agent, AgentInfo from agentlab.agents import dynamic_prompting as dp @@ -25,13 +27,22 @@ def __post_init__(self): except AttributeError: pass - def set_benchmark(self, benchmark, demo_mode): + def set_benchmark(self, benchmark: bgym.Benchmark, demo_mode): """Override Some flags based on the benchmark.""" - if benchmark == "miniwob": + if benchmark.name.startswith("miniwob"): self.flags.obs.use_html = True + self.flags.action.action_set = deepcopy(benchmark.high_level_action_set_args) + + # for backward compatibility with old traces + 
if self.flags.action.multi_actions is not None: + self.flags.action.action_set.multiaction = self.flags.action.multi_actions + if self.flags.action.is_strict is not None: + self.flags.action.action_set.strict = self.flags.action.is_strict + + # verify if we can remove this if demo_mode: - self.flags.action.demo_mode = "all_blue" + self.action_set.demo_mode = "all_blue" def set_reproducibility_mode(self): self.chat_model_args.temperature = 0 @@ -62,7 +73,7 @@ def __init__( self.max_retry = max_retry self.flags = flags - self.action_set = dp.make_action_set(self.flags.action) + self.action_set = self.flags.action.action_set.make_action_set() self._obs_preprocessor = dp.make_obs_preprocessor(flags.obs) self._check_flag_constancy() diff --git a/src/agentlab/agents/generic_agent/generic_agent_prompt.py b/src/agentlab/agents/generic_agent/generic_agent_prompt.py index 81450847b..b75621497 100644 --- a/src/agentlab/agents/generic_agent/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent/generic_agent_prompt.py @@ -79,7 +79,7 @@ def __init__( def time_for_caution(): # no need for caution if we're in single action mode return flags.be_cautious and ( - flags.action.multi_actions or flags.action.action_set == "python" + flags.action.action_set.multiaction or flags.action.action_set == "python" ) self.be_cautious = dp.BeCautious(visible=time_for_caution) @@ -242,77 +242,3 @@ class Criticise(dp.PromptElement): def _parse_answer(self, text_answer): return parse_html_tags_raise(text_answer, optional_keys=["action_draft", "criticise"]) - - -if __name__ == "__main__": - html_template = """ - - -
- Hello World. - Step {}. -
- - - """ - - OBS_HISTORY = [ - { - "goal": "do this and that", - "pruned_html": html_template.format(1), - "axtree_txt": "[1] Click me", - "last_action_error": "", - "focused_element_bid": "32", - }, - { - "goal": "do this and that", - "pruned_html": html_template.format(2), - "axtree_txt": "[1] Click me", - "last_action_error": "", - "focused_element_bid": "32", - }, - { - "goal": "do this and that", - "pruned_html": html_template.format(3), - "axtree_txt": "[1] Click me", - "last_action_error": "Hey, there is an error now", - "focused_element_bid": "32", - }, - ] - ACTIONS = ["click('41')", "click('42')"] - MEMORIES = ["memory A", "memory B"] - THOUGHTS = ["thought A", "thought B"] - - flags = dp.ObsFlags( - use_html=True, - use_ax_tree=True, - use_plan=True, - use_criticise=True, - use_thinking=True, - use_error_logs=True, - use_past_error_logs=True, - use_history=True, - use_action_history=True, - use_memory=True, - use_diff=True, - html_type="pruned_html", - use_concrete_example=True, - use_abstract_example=True, - multi_actions=True, - use_screenshot=False, - ) - - print( - MainPrompt( - action_set=dp.make_action_set( - "bid", is_strict=False, multiaction=True, demo_mode="off" - ), - obs_history=OBS_HISTORY, - actions=ACTIONS, - memories=MEMORIES, - thoughts=THOUGHTS, - previous_plan="No plan yet", - step=0, - flags=flags, - ).prompt - ) diff --git a/src/agentlab/agents/generic_agent/reproducibility_agent.py b/src/agentlab/agents/generic_agent/reproducibility_agent.py index c197b76e9..c40aece6e 100644 --- a/src/agentlab/agents/generic_agent/reproducibility_agent.py +++ b/src/agentlab/agents/generic_agent/reproducibility_agent.py @@ -24,7 +24,7 @@ from langchain_community.adapters.openai import convert_message_to_dict from agentlab.agents.agent_args import AgentArgs -from agentlab.experiments.study_generators import Study +from agentlab.experiments.study import Study from agentlab.llm.chat_api import make_assistant_message from agentlab.llm.llm_utils import 
messages_to_dict @@ -144,7 +144,9 @@ def reproduce_study(original_study_dir: Path | str, log_level=logging.INFO): original_study_dir = Path(original_study_dir) - study_name = f"reproducibility_of_{original_study_dir.name}" + study = Study.load(original_study_dir) + study.dir = None + study.make_dir() exp_args_list: list[ExpArgs] = [] for exp_result in yield_all_exp_results(original_study_dir, progress_fn=None): diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 58461a386..59db2ab29 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -20,7 +20,7 @@ from agentlab.analyze import inspect_results from agentlab.experiments.exp_utils import RESULTS_DIR from agentlab.llm.chat_api import make_system_message, make_user_message -from agentlab.llm.llm_utils import image_to_jpg_base64_url +from agentlab.experiments.study import get_most_recent_study select_dir_instructions = "Select Experiment Directory" AGENT_NAME_KEY = "agent.agent_name" @@ -152,6 +152,12 @@ def filter_agent_id(self, agent_id: list[tuple]): white-space: normal !important; word-wrap: break-word !important; } +#task_table { + height: 500px !important; +} +#seed_table { + height: 500px !important; +} """ @@ -236,7 +242,9 @@ def run_gradio(results_dir: Path): ) refresh_results_button = gr.Button("↺", scale=0, size="sm") - task_table = gr.DataFrame(height=500, show_label=False, interactive=False) + task_table = gr.DataFrame( + height=500, show_label=False, interactive=False, elem_id="task_table" + ) with gr.Column(scale=2): with gr.Accordion("Seed Selector (click for help)", open=False): @@ -249,7 +257,9 @@ def run_gradio(results_dir: Path): the order.""" ) - seed_table = gr.DataFrame(height=500, show_label=False, interactive=False) + seed_table = gr.DataFrame( + height=500, show_label=False, interactive=False, elem_id="seed_table" + ) with gr.Tab("Constants and Variables"): with gr.Row(): @@ -958,7 +968,7 @@ def 
get_directory_contents(results_dir: Path): def most_recent_folder(results_dir: Path): - return inspect_results.get_most_recent_folder(results_dir).name + return get_most_recent_study(results_dir).name def refresh_exp_dir_choices(exp_dir_choice): diff --git a/src/agentlab/analyze/inspect_results.ipynb b/src/agentlab/analyze/inspect_results.ipynb index 6db090926..e1281f141 100644 --- a/src/agentlab/analyze/inspect_results.ipynb +++ b/src/agentlab/analyze/inspect_results.ipynb @@ -8,6 +8,8 @@ "source": [ "from agentlab.experiments.exp_utils import RESULTS_DIR\n", "from agentlab.analyze import inspect_results\n", + "from agentlab.experiments.study import get_most_recent_study\n", + "\n", "import pandas as pd\n", "pd.set_option('display.max_rows', 200)\n", "\n", @@ -52,7 +54,7 @@ "# result_dir = RESULTS_DIR / \"2024-05-28_01-13-04_generic_agent_eval_llm\" \n", "# result_dir = RESULTS_DIR / \"2024-05-28_01-44-29_generic_agent_eval_llm\"\n", "\n", - "result_dir = inspect_results.get_most_recent_folder(RESULTS_DIR, contains=None)\n", + "result_dir = get_most_recent_study(RESULTS_DIR, contains=None)\n", "\n", "print(result_dir)\n", "result_df = inspect_results.load_result_df(result_dir)" diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index d33cc756e..f304250b3 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -16,6 +16,7 @@ from IPython.display import display from tqdm import tqdm + from agentlab.analyze.error_categorization import ( ERR_CLASS_MAP, is_critical_server_error, @@ -509,41 +510,6 @@ def flag_report(report: pd.DataFrame, metric: str = "avg_reward", round_digits: return flag_report -def get_most_recent_folder( - root_dir: Path = None, date_format: str = "%Y-%m-%d_%H-%M-%S", contains=None -): - """Return the most recent directory based on the date in the folder name. 
- - Args: - root_dir: The directory to search in - date_format: The format of the date in the folder name - contains: If not None, only consider folders that contains this string - - Returns: - Path: The most recent folder satisfying the conditions - """ - - if root_dir is None: - root_dir = RESULTS_DIR - - most_recent_folder = None - most_recent_time = datetime.min - - for item in root_dir.iterdir(): - if item.is_dir() and not item.name.startswith("_"): - if contains is not None and contains not in item.name: - continue - try: - folder_date = datetime.strptime("_".join(item.name.split("_")[:2]), date_format) - if folder_date > most_recent_time: - most_recent_time = folder_date - most_recent_folder = item - except (ValueError, IndexError): - continue - - return most_recent_folder - - def display_report( report: pd.DataFrame, apply_shrink_columns: bool = True, diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index fd7d2b6b3..d62de9efa 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -79,8 +79,8 @@ def run_experiments( logging.info("Experiment finished.") -def relaunch_study(study_dir: str | Path, relaunch_mode="incomplete_only"): - """Return exp_args_list and study_dir +def find_incomplete(study_dir: str | Path, relaunch_mode="incomplete_only"): + """Return incomplete experiments Args: study_dir: Path @@ -107,7 +107,7 @@ def relaunch_study(study_dir: str | Path, relaunch_mode="incomplete_only"): logging.info(message) - return exp_args_list, study_dir + return exp_args_list def _yield_incomplete_experiments(exp_root, relaunch_mode="incomplete_only"): diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index 3ef7d8ef6..ef790d0f1 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -1,27 +1,30 @@ import csv -import json import logging import os 
import platform -from copy import deepcopy from datetime import datetime from importlib import metadata from pathlib import Path +import bgym import pandas as pd -from browsergym.experiments.loop import ExpArgs from git import InvalidGitRepositoryError, Repo from git.config import GitConfigParser import agentlab -from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs def _get_repo(module): return Repo(Path(module.__file__).resolve().parent, search_parent_directories=True) -def _get_benchmark_version(benchmark_name): +def _get_benchmark_version(benchmark: bgym.Benchmark) -> str: + benchmark_name = benchmark.name + + if hasattr(benchmark, "get_version"): + return benchmark.get_version() + + # in between 2 pull requests if benchmark_name.startswith("miniwob"): return metadata.distribution("browsergym.miniwob").version elif benchmark_name.startswith("workarena"): @@ -166,8 +169,9 @@ def _get_git_info(module, changes_white_list=()) -> tuple[str, list[tuple[str, P def get_reproducibility_info( - agent_name: str | list[str], - benchmark_name, + agent_names: str | list[str], + benchmark: bgym.Benchmark, + study_id: str = "", comment=None, changes_white_list=( # Files that are often modified during experiments but do not affect reproducibility "*/reproducibility_script.py", @@ -183,15 +187,16 @@ def get_reproducibility_info( import agentlab - if isinstance(agent_name, str): - agent_name = [agent_name] + if isinstance(agent_names, str): + agent_names = [agent_names] info = { "git_user": _get_git_username(_get_repo(agentlab)), - "agent_names": agent_name, - "benchmark": benchmark_name, + "agent_names": agent_names, + "benchmark": benchmark.name, + "study_id": study_id, "comment": comment, - "benchmark_version": _get_benchmark_version(benchmark_name), + "benchmark_version": _get_benchmark_version(benchmark), "date": datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), "os": f"{platform.system()} ({platform.version()})", "python_version": platform.python_version(), 
@@ -226,7 +231,7 @@ def add_git_info(module_name, module): return info -def _assert_compatible(info: dict, old_info: dict, raise_if_incompatible=True): +def assert_compatible(info: dict, old_info: dict, raise_if_incompatible=True): """Make sure that the two info dicts are compatible.""" # TODO may need to adapt if there are multiple agents, and the re-run on # error only has a subset of agents. Hence old_info.agent_name != info.agent_name @@ -234,81 +239,12 @@ def _assert_compatible(info: dict, old_info: dict, raise_if_incompatible=True): if key in ("date", "avg_reward", "std_err", "n_completed", "n_err"): continue if info[key] != old_info[key]: - if not raise_if_incompatible: - logging.warning( - f"Reproducibility info already exist and is not compatible." - f"Key {key} has changed from {old_info[key]} to {info[key]}." - ) - else: - raise ValueError( - f"Reproducibility info already exist and is not compatible." - f"Key {key} has changed from {old_info[key]} to {info[key]}." - f"Set strict_reproducibility=False to bypass this error." - ) - - -# def _benchmark_from_task_name(task_name: str): -# """Extract the benchmark from the task name. -# TODO should be more robost, e.g. handle workarna.L1, workarena.L2, etc. -# """ -# return task_name.split(".")[0] - - -# def infer_agent(exp_args_list: list[ExpArgs]): -# return list(set(exp_args.agent_args.agent_name for exp_args in exp_args_list)) - - -# def infer_benchmark(exp_args_list: list[ExpArgs]): -# bench_name = set( -# _benchmark_from_task_name(exp_args.env_args.task_name) for exp_args in exp_args_list -# ) -# if len(bench_name) > 1: -# raise ValueError( -# f"Multiple benchmarks in the same study are not well supported: {bench_name}." -# "Comment out the reproducibility part of the code to proceed at your own risk." 
-# ) - -# return bench_name.pop() - - -# def write_reproducibility_info( -# study_dir, agent_name, benchmark_name, comment=None, strict_reproducibility=True -# ): -# info = get_reproducibility_info( -# agent_name, benchmark_name, comment, ignore_changes=not strict_reproducibility -# ) -# return save_reproducibility_info(study_dir, info, strict_reproducibility) - - -def save_reproducibility_info(study_dir, info, strict_reproducibility=True): - """ - Save a JSON file containing reproducibility information to the specified directory. - """ - - info_path = Path(study_dir) / "reproducibility_info.json" - - if info_path.exists(): - with open(info_path, "r") as f: - existing_info = json.load(f) - _assert_compatible(info, existing_info, raise_if_incompatible=strict_reproducibility) - logging.info( - "Reproducibility info already exists and is compatible. Overwriting the old one." - ) - - with open(info_path, "w") as f: - json.dump(info, f, indent=4) - - info_str = json.dumps(info, indent=4) - logging.info(f"Reproducibility info saved to {info_path}. Info: {info_str}") - - return info - - -def load_reproducibility_info(study_dir) -> dict[str]: - """Retrieve the reproducibility info from the study directory.""" - info_path = Path(study_dir) / "reproducibility_info.json" - with open(info_path, "r") as f: - return json.load(f) + _raise_or_warn( + f"Reproducibility info already exist and is not compatible." + f"Key {key} has changed from {old_info[key]} to {info[key]}." 
+ f"Set strict_reproducibility=False to bypass this error.", + raise_error=raise_if_incompatible, + ) def _raise_or_warn(msg, raise_error=True): diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py new file mode 100644 index 000000000..60f2166bb --- /dev/null +++ b/src/agentlab/experiments/study.py @@ -0,0 +1,343 @@ +from dataclasses import dataclass +from datetime import datetime +import gzip +import logging +from pathlib import Path +import pickle + +from bgym import ExpArgs, EnvArgs, Benchmark +import bgym + +from agentlab.agents.agent_args import AgentArgs +from agentlab.analyze import inspect_results +from agentlab.experiments import args +from agentlab.experiments.launch_exp import run_experiments, find_incomplete +from agentlab.experiments.exp_utils import RESULTS_DIR +from agentlab.experiments import reproducibility_util as repro + + +@dataclass +class Study: + """A study coresponds to one or multiple agents evaluated on a benchmark. + + This is part of the high level API to help keep experiments organized and reproducible. + + Attributes: + benchmark: Benchmark + The benchmark to evaluate the agents on. + + agent_args: list[AgentArgs] + The list of agents to evaluate. + + dir: Path + The directory where the results will be saved. + + suffix: str + A suffix to add to the study name + + uuid: str + A unique identifier for the study + + reproducibility_info: dict + The reproducibility information for the study. 
+ """ + + agent_args: list[AgentArgs] = None + benchmark: Benchmark = None + dir: Path = None + suffix: str = "" # used for adding a personnal comment to the study name + uuid: str = None + reproducibility_info: dict = None + logging_level: int = logging.INFO + + def __post_init__(self): + self.uuid = str(datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) + if isinstance(self.benchmark, str): + self.benchmark = bgym.BENCHMARKS[self.benchmark]() + if isinstance(self.dir, str): + self.dir = Path(self.dir) + self.make_exp_args_list() + + def make_exp_args_list(self): + self.exp_args_list = _agents_on_benchmark( + self.agent_args, self.benchmark, logging_level=self.logging_level + ) + + def find_incomplete(self, relaunch_mode="incomplete_or_error"): + """Find incomplete or errored experiments in the study directory for relaunching.""" + self.exp_args_list = find_incomplete(self.dir, relaunch_mode=relaunch_mode) + + def set_reproducibility_info(self, strict_reproducibility=False): + """Gather relevant information that may affect the reproducibility of the experiment + + e.g.: versions of BrowserGym, benchmark, AgentLab...""" + agent_names = [a.agent_name for a in self.agent_args] + info = repro.get_reproducibility_info( + agent_names, self.benchmark, self.uuid, ignore_changes=not strict_reproducibility + ) + if self.reproducibility_info is not None: + repro.assert_compatible(self.reproducibility_info, info) + self.reproducibility_info = info + + def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False): + """Run all experiments in the study in parallel when possible. + + Args: + n_jobs: int + Number of parallel jobs. + + parallel_backend: str + Parallel backend to use. Either "joblib", "dask" or "sequential". + + strict_reproducibility: bool + If True, all modifications have to be committed before running the experiments. + Also, if relaunching a study, it will not be possible if the code has changed. 
+ """ + + if self.exp_args_list is None: + raise ValueError("exp_args_list is None. Please set exp_args_list before running.") + + self.set_reproducibility_info(strict_reproducibility=strict_reproducibility) + self.save() + + run_experiments(n_jobs, self.exp_args_list, self.dir, parallel_backend=parallel_backend) + report_df = self.get_report(ignore_cache=True) + logging.info(f"Study {self.name} finished.") + logging.info("\n" + str(report_df)) + + def append_to_journal(self, strict_reproducibility=True): + """Append the study to the journal. + + Args: + strict_reproducibility: bool + If True, incomplete experiments will raise an error. + + Raises: + ValueError: If the reproducibility information is not compatible + with the report. + """ + repro.append_to_journal( + self.reproducibility_info, + self.get_report(), + strict_reproducibility=strict_reproducibility, + ) + + @property + def name(self): + agent_names = [a.agent_name for a in self.agent_args] + if len(agent_names) == 1: + study_name = f"{agent_names[0]}_on_{self.benchmark.name}" + else: + study_name = f"{len(agent_names)}_agents_on_{self.benchmark.name}" + if self.suffix: + study_name += f"_{self.suffix}" + return study_name + + def make_dir(self, exp_root=RESULTS_DIR): + if self.dir is None: + dir_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_{self.name}" + + self.dir = Path(exp_root) / dir_name + self.dir.mkdir(parents=True, exist_ok=True) + + def save(self): + """Pickle the study to the directory""" + + # TODO perhaps remove exp_args_list before pickling and when loading bring them from the individual directories + + self.make_dir() + + with gzip.open(self.dir / "study.pkl.gz", "wb") as f: + pickle.dump(self, f) + + def get_report(self, ignore_cache=False, ignore_stale=False): + return inspect_results.get_study_summary( + self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale + ) + + @staticmethod + def load(dir: Path) -> "Study": + with gzip.open(dir / "study.pkl.gz", "rb") as f: + 
study = pickle.load(f) # type: Study + + study.dir = dir + return study + + @staticmethod + def load_most_recent(root_dir: Path = None): + return Study.load(get_most_recent_study(root_dir)) + + +def get_most_recent_study( + root_dir: Path = None, date_format: str = "%Y-%m-%d_%H-%M-%S", contains=None +): + """Return the most recent directory based on the date in the folder name. + + Args: + root_dir: The directory to search in + date_format: The format of the date in the folder name + contains: If not None, only consider folders that contains this string + + Returns: + Path: The most recent folder satisfying the conditions + """ + + if root_dir is None: + root_dir = RESULTS_DIR + + most_recent_folder = None + most_recent_time = datetime.min + + for item in root_dir.iterdir(): + if item.is_dir() and not item.name.startswith("_"): + if contains is not None and contains not in item.name: + continue + try: + folder_date = datetime.strptime("_".join(item.name.split("_")[:2]), date_format) + if folder_date > most_recent_time: + most_recent_time = folder_date + most_recent_folder = item + except (ValueError, IndexError): + continue + + return most_recent_folder + + +# def make_relaunch_study(study_dir, relaunch_mode="incomplete_or_error"): +# """Create a study from an existing study directory. + +# It will search for all experiments that needs to be relaunched depending on +# `relaunch_mode`. + +# Args: +# study_dir: Path +# The directory where the experiments are saved. +# relaunch_mode: str +# Find all incomplete experiments and relaunch them. +# - "incomplete_only": relaunch only the incomplete experiments. +# - "incomplete_or_error": relaunch incomplete or errors. 
+# """ +# study = Study(dir=study_dir) +# study.exp_args_list, _ = find_incomplete(study.dir, relaunch_mode=relaunch_mode) +# info = study.load_reproducibility_info() +# study.benchmark_name = info["benchmark"] +# study.agent_names = info["agent_names"] +# return study + + +def set_demo_mode(env_args_list: list[EnvArgs]): + + for env_args in env_args_list: + env_args.viewport = {"width": 1280, "height": 720} + env_args.record_video = True + env_args.wait_for_user_message = False + env_args.slow_mo = 1000 + + +def _agents_on_benchmark( + agents: list[AgentArgs] | AgentArgs, + benchmark: bgym.Benchmark, + demo_mode=False, + logging_level: int = logging.INFO, +): + """Run one or multiple agents on a benchmark. + + Args: + agents: list[AgentArgs] | AgentArgs + The agent configuration(s) to run. + benchmark: bgym.Benchmark + The benchmark to run the agents on. + demo_mode: bool + If True, the experiments will be run in demo mode. + logging_level: int + The logging level for individual jobs. + + Returns: + study: Study + """ + + if not isinstance(agents, (list, tuple)): + agents = [agents] + + for agent in agents: + agent.set_benchmark(benchmark, demo_mode) # the agent can adapt (lightly?) to the benchmark + + env_args_list = benchmark.env_args_list + if demo_mode: + set_demo_mode(env_args_list) + + return args.expand_cross_product( + ExpArgs( + agent_args=args.CrossProd(agents), + env_args=args.CrossProd(env_args_list), + logging_level=logging_level, + ) + ) + + +# def ablation_study(start_agent: AgentArgs, changes, benchmark: str, demo_mode=False): +# """Ablation study of an agent. + +# Changes is a list of tuples (path_to_attribute, value) to change in the agent +# configuration. + +# Args: +# start_agent: AgentArgs +# The agent configuration to start from. + +# changes: list[tuple] +# The changes to apply to the agent configuration. + +# benchmark: str +# The benchmark to use. + +# demo_mode: bool +# If True, the experiments will be run in demo mode. 
+ +# Returns: +# Study +# """ +# agents = args.make_ablation_study(start_agent, changes) +# study = run_agents_on_benchmark(agents, benchmark, demo_mode=demo_mode) +# study.suffix = "ablation_study" +# return study + + +# def random_search( +# random_agent: AgentArgs = RANDOM_SEARCH_AGENT, +# n_samples=10, +# benchmark: str = "miniwob", +# demo_mode=False, +# ): +# """ +# Random search of AgentArgs (NOTE: not fully tested since refactoring) + +# The random search mechanism will recursively search through dataclasses and +# dict to find attributes of type args.Choice. It will sample iid and replace +# with the corresponding value. + +# *WARINING* The standard errror of the experiment will usually be relatively high and +# the search space is usually big so the false discovery rate will likely be +# high. Make sure to analyze the results with caution and don't actually draw +# final conclusions from these experiments. + +# Args: +# agent: AgentArgs +# The agent configuration, with some sub-arguments defined as args.Choice. + +# n_samples: int +# The number of samples to take. + +# benchmark: str +# The benchmark to use. + +# demo_mode: bool +# If True, the experiments will be run in demo mode. 
+ +# Returns: +# Study +# """ +# agents = args.sample_and_expand_cross_product(random_agent, n_samples) +# study = run_agents_on_benchmark(agents, benchmark, demo_mode=demo_mode) +# study.suffix = "random_search" +# return study diff --git a/src/agentlab/experiments/study_generators.py b/src/agentlab/experiments/study_generators.py deleted file mode 100644 index 3a2567d51..000000000 --- a/src/agentlab/experiments/study_generators.py +++ /dev/null @@ -1,272 +0,0 @@ -from dataclasses import dataclass -from datetime import datetime -import logging -from pathlib import Path - -from bgym import ExpArgs, EnvArgs - -from agentlab.agents.agent_args import AgentArgs -from agentlab.agents.generic_agent.agent_configs import RANDOM_SEARCH_AGENT, AGENT_4o_MINI -from agentlab.analyze import inspect_results -from agentlab.experiments import args -from agentlab.experiments import task_collections as tasks -from agentlab.experiments.launch_exp import run_experiments, relaunch_study -from agentlab.experiments.exp_utils import RESULTS_DIR -from agentlab.experiments import reproducibility_util as repro - - -@dataclass -class Study: - """A study coresponds to one or multiple agents evaluated on a benchmark. - - This is part of the high level API to help keep experiments organized and reproducible. - - Attributes: - exp_args_list: list[ExpArgs] - The list of experiments to run. - - benchmark_name: str - The name of the benchmark. - - agent_names: list[str] - The names of the agents. - - dir: Path - The directory where the results will be saved. - - suffix: str - A suffix to add to the study name - """ - - exp_args_list: list[ExpArgs] = None - benchmark_name: str = None - agent_names: list[str] = None - dir: Path = None - suffix: str = "" # used for adding a personnal comment to the study name - - def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False): - """Run all experiments in the study in parallel when possible. 
- - Args: - n_jobs: int - Number of parallel jobs. - - parallel_backend: str - Parallel backend to use. Either "joblib", "dask" or "sequential". - - strict_reproducibility: bool - If True, you will have to commit all your files before running the experiments. - """ - - if self.exp_args_list is None: - raise ValueError("exp_args_list is None. Please set exp_args_list before running.") - - self.make_dir() - self.write_reproducibility_info(strict_reproducibility=strict_reproducibility) - - run_experiments(n_jobs, self.exp_args_list, self.dir, parallel_backend=parallel_backend) - report_df = self.get_report(ignore_cache=True) - logging.info(f"Study {self.name} finished.") - logging.info("\n" + str(report_df)) - - def append_to_journal(self, strict_reproducibility=True): - """Append the study to the journal. - - Args: - strict_reproducibility: bool - If True, incomplete experiments will raise an error. - - Raises: - ValueError: If the reproducibility information is not compatible - with the report. 
- """ - repro.append_to_journal( - self.load_reproducibility_info(), - self.get_report(), - strict_reproducibility=strict_reproducibility, - ) - - @property - def name(self): - if len(self.agent_names) == 1: - study_name = f"{self.agent_names[0]}_on_{self.benchmark_name}" - else: - study_name = f"{len(self.agent_names)}_agents_on_{self.benchmark_name}" - if self.suffix: - study_name += f"_{self.suffix}" - return study_name - - def make_dir(self, exp_root=RESULTS_DIR): - if self.dir is None: - dir_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_{self.name}" - - self.dir = Path(exp_root) / dir_name - self.dir.mkdir(parents=True, exist_ok=True) - - def write_reproducibility_info(self, comment=None, strict_reproducibility=False): - info = repro.get_reproducibility_info( - self.agent_names, - self.benchmark_name, - comment, - ignore_changes=not strict_reproducibility, - ) - return repro.save_reproducibility_info(self.dir, info, strict_reproducibility) - - def get_report(self, ignore_cache=False, ignore_stale=False): - return inspect_results.get_study_summary( - self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale - ) - - def load_reproducibility_info(self): - return repro.load_reproducibility_info(self.dir) - - -def make_relaunch_study(study_dir, relaunch_mode="incomplete_or_error"): - """Create a study from an existing study directory. - - It will search for all experiments that needs to be relaunched depending on - `relaunch_mode`. - - Args: - study_dir: Path - The directory where the experiments are saved. - relaunch_mode: str - Find all incomplete experiments and relaunch them. - - "incomplete_only": relaunch only the incomplete experiments. - - "incomplete_or_error": relaunch incomplete or errors. 
- """ - study = Study(dir=study_dir) - study.exp_args_list, _ = relaunch_study(study.dir, relaunch_mode=relaunch_mode) - info = study.load_reproducibility_info() - study.benchmark_name = info["benchmark"] - study.agent_names = info["agent_names"] - return study - - -def set_demo_mode(env_args_list: list[EnvArgs]): - - for env_args in env_args_list: - env_args.viewport = {"width": 1280, "height": 720} - env_args.record_video = True - env_args.wait_for_user_message = False - env_args.slow_mo = 1000 - - -def run_agents_on_benchmark( - agents: list[AgentArgs] | AgentArgs = AGENT_4o_MINI, - benchmark: str = "miniwob", - demo_mode=False, - log_level=logging.INFO, -): - """Run one or multiple agents on a benchmark. - - Args: - agents: list[AgentArgs] | AgentArgs - The agent configuration(s) to run. - - benchmark: str - The benchmark to use. One of: - * miniwob - * webarena - * workarena.l1 - * workarena.l2 - * workarena.l3 - * miniwob_tiny_test - - Returns: - study: Study - """ - - if not isinstance(agents, (list, tuple)): - agents = [agents] - - for agent in agents: - agent.set_benchmark(benchmark, demo_mode) # the agent can adapt (lightly?) to the benchmark - - env_args_list = tasks.get_benchmark_env_args( - benchmark, meta_seed=43, max_steps=None, n_repeat=None - ) - if demo_mode: - set_demo_mode(env_args_list) - - exp_args_list = args.expand_cross_product( - ExpArgs( - agent_args=args.CrossProd(agents), - env_args=args.CrossProd(env_args_list), - logging_level=log_level, - ) - ) - - return Study( - exp_args_list=exp_args_list, - benchmark_name=benchmark, - agent_names=[a.agent_name for a in agents], - ) - - -def ablation_study(start_agent: AgentArgs, changes, benchmark: str, demo_mode=False): - """Ablation study of an agent. - - Changes is a list of tuples (path_to_attribute, value) to change in the agent - configuration. - - Args: - start_agent: AgentArgs - The agent configuration to start from. 
- - changes: list[tuple] - The changes to apply to the agent configuration. - - benchmark: str - The benchmark to use. - - demo_mode: bool - If True, the experiments will be run in demo mode. - - Returns: - Study - """ - agents = args.make_ablation_study(start_agent, changes) - study = run_agents_on_benchmark(agents, benchmark, demo_mode=demo_mode) - study.suffix = "ablation_study" - return study - - -def random_search( - random_agent: AgentArgs = RANDOM_SEARCH_AGENT, - n_samples=10, - benchmark: str = "miniwob", - demo_mode=False, -): - """ - Random search of AgentArgs (NOTE: not fully tested since refactoring) - - The random search mechanism will recursively search through dataclasses and - dict to find attributes of type args.Choice. It will sample iid and replace - with the corresponding value. - - *WARINING* The standard errror of the experiment will usually be relatively high and - the search space is usually big so the false discovery rate will likely be - high. Make sure to analyze the results with caution and don't actually draw - final conclusions from these experiments. - - Args: - agent: AgentArgs - The agent configuration, with some sub-arguments defined as args.Choice. - - n_samples: int - The number of samples to take. - - benchmark: str - The benchmark to use. - - demo_mode: bool - If True, the experiments will be run in demo mode. 
- - Returns: - Study - """ - agents = args.sample_and_expand_cross_product(random_agent, n_samples) - study = run_agents_on_benchmark(agents, benchmark, demo_mode=demo_mode) - study.suffix = "random_search" - return study diff --git a/src/agentlab/experiments/task_collections.py b/src/agentlab/experiments/task_collections.py deleted file mode 100644 index 66bf00b79..000000000 --- a/src/agentlab/experiments/task_collections.py +++ /dev/null @@ -1,212 +0,0 @@ -import logging -import time as t -from pathlib import Path - -import numpy as np -import pandas as pd - -logger = logging.getLogger(__name__) - -from browsergym.experiments import EnvArgs -from browsergym.webarena import ALL_WEBARENA_TASK_IDS - -df = pd.read_csv(Path(__file__).parent / "miniwob_tasks_all.csv") -# append miniwob. to task_name column -df["task_name"] = "miniwob." + df["task_name"] -MINIWOB_ALL = df["task_name"].tolist() -tasks_eval = df[df["miniwob_category"].isin(["original", "additional", "hidden test"])][ - "task_name" -].tolist() -miniwob_debug = df[df["miniwob_category"].isin(["debug"])]["task_name"].tolist() -MINIWOB_TINY_TEST = ["miniwob.click-dialog", "miniwob.click-checkboxes"] - -assert len(MINIWOB_ALL) == 125 -assert len(tasks_eval) == 107 -assert len(miniwob_debug) == 12 -assert len(MINIWOB_TINY_TEST) == 2 - - -webgum_tasks = [ - "miniwob.book-flight", - "miniwob.choose-date", - "miniwob.choose-date-easy", - "miniwob.choose-date-medium", - "miniwob.choose-list", - "miniwob.click-button", - "miniwob.click-button-sequence", - "miniwob.click-checkboxes", - "miniwob.click-checkboxes-large", - "miniwob.click-checkboxes-soft", - "miniwob.click-checkboxes-transfer", - "miniwob.click-collapsible", - "miniwob.click-collapsible-2", - "miniwob.click-color", - "miniwob.click-dialog", - "miniwob.click-dialog-2", - "miniwob.click-link", - "miniwob.click-menu", - "miniwob.click-option", - "miniwob.click-pie", - "miniwob.click-scroll-list", - "miniwob.click-shades", - "miniwob.click-shape", - 
"miniwob.click-tab", - "miniwob.click-tab-2", - "miniwob.click-tab-2-hard", - "miniwob.click-test", - "miniwob.click-test-2", - "miniwob.click-widget", - "miniwob.count-shape", - "miniwob.email-inbox", - "miniwob.email-inbox-forward-nl", - "miniwob.email-inbox-forward-nl-turk", - "miniwob.email-inbox-nl-turk", - "miniwob.enter-date", - "miniwob.enter-password", - "miniwob.enter-text", - "miniwob.enter-text-dynamic", - "miniwob.enter-time", - "miniwob.focus-text", - "miniwob.focus-text-2", - "miniwob.grid-coordinate", - "miniwob.guess-number", - "miniwob.identify-shape", - "miniwob.login-user", - "miniwob.login-user-popup", - "miniwob.multi-layouts", - "miniwob.multi-orderings", - "miniwob.navigate-tree", - "miniwob.search-engine", - "miniwob.social-media", - "miniwob.social-media-all", - "miniwob.social-media-some", - "miniwob.tic-tac-toe", - "miniwob.use-autocomplete", - "miniwob.use-spinner", -] - - -# TODO add miniwob_tiny_test as benchmarks -def get_benchmark_env_args( - benchmark_name: str, meta_seed=42, max_steps=None, n_repeat=None -) -> list[EnvArgs]: - """ - Returns a list of EnvArgs for the given benchmark_name. - - Args: - benchmark_name: A string representing the benchmark name. - meta_seed: The seed for the random number generator. - max_steps: None or int. The maximum number of steps for each task. - if None, it will use the default value for the benchmark. - n_repeat: None or int. The number of seeds for each task. - if None, it will use the default value for the benchmark. - is_agent_curriculum: wether to use the agent curriculum or the human curriculum. - - Returns: - A list of EnvArgs. - - Raises: - ValueError: If the benchmark_name is not recognized, or if the benchmark_name is not - followed by a subcategory for workarena. - """ - env_args_list = [] - rng = np.random.RandomState(meta_seed) - - filters = benchmark_name.split(".") - benchmark_id = filters[0] - if filters[0] == "workarena": - benchmark_id = "workarena." 
+ filters[1] - - max_steps_default = { - "workarena.l1": 15, - "workarena.l2": 50, - "workarena.l3": 50, - "webarena": 15, - "miniwob": 10, - "miniwob_tiny_test": 5, - "weblinx": None, - } - - n_repeat_default = { - "workarena.l1": 10, - "workarena.l2": 1, - "workarena.l3": 1, - "webarena": 1, - "miniwob": 5, - "miniwob_tiny_test": 2, - "weblinx": 1, - } - - if max_steps is None: - max_steps = max_steps_default.get(benchmark_id, None) - if n_repeat is None: - n_repeat = n_repeat_default.get(benchmark_id, 1) - else: - if benchmark_id == "webarena" and n_repeat != 1: - logger.warning( - f"webarena is expected to have only one seed per task. Ignoring n_seeds_default = {n_repeat}" - ) - n_repeat = 1 - - if benchmark_name.startswith("workarena"): - t0 = t.time() - from browsergym.workarena import ALL_WORKARENA_TASKS, ATOMIC_TASKS, get_all_tasks_agents - - dt = t.time() - t0 - print(f"done importing workarena, took {dt:.2f} seconds") - - if len(filters) < 2: - raise ValueError(f"You must specify the sub set of workarena, e.g.: workarena.l2.") - - if benchmark_name == "workarena.l1.sort": - task_names = [task.get_task_id() for task in ATOMIC_TASKS] - task_names = [task for task in task_names if "sort" in task] - env_args_list = _make_env_args(task_names, max_steps, n_repeat, rng) - - else: - for task, seed in get_all_tasks_agents( - filter=".".join(filters[1:]), - meta_seed=meta_seed, - n_seed_l1=n_repeat, - ): - task_name = task.get_task_id() - env_args_list.append( - EnvArgs(task_name=task_name, task_seed=seed, max_steps=max_steps) - ) - - elif benchmark_name == "webarena": - from browsergym.webarena import ALL_WEBARENA_TASK_IDS - - env_args_list = _make_env_args(ALL_WEBARENA_TASK_IDS, max_steps, n_repeat, rng) - elif benchmark_name.startswith("miniwob"): - miniwob_benchmarks_map = { - "miniwob": MINIWOB_ALL, - "miniwob_tiny_test": MINIWOB_TINY_TEST, - } - env_args_list = _make_env_args( - miniwob_benchmarks_map[benchmark_name], max_steps, n_repeat, rng - ) - elif 
benchmark_name.startswith("weblinx"): - from weblinx_browsergym import ALL_WEBLINX_TASK_IDS - - env_args_list = _make_env_args(ALL_WEBLINX_TASK_IDS, max_steps, n_repeat, rng) - else: - raise ValueError(f"Unknown benchmark name: {benchmark_name}") - - return env_args_list - - -def _make_env_args(task_list, max_steps, n_seeds_default, rng): - env_args_list = [] - for task in task_list: - for seed in rng.randint(0, 100, n_seeds_default): - env_args_list.append(EnvArgs(task_name=task, task_seed=int(seed), max_steps=max_steps)) - return env_args_list - - -if __name__ == "__main__": - env_args_list = get_benchmark_env_args("workarena.l2") - print(f"Number of tasks: {len(env_args_list)}") - for env_args in env_args_list: - if "infeasible" in env_args.task_name: - print(env_args.task_seed, env_args.task_name) diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index 712bc4db2..1c8deff8a 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -1,5 +1,6 @@ from copy import deepcopy +import bgym import pytest from agentlab.agents import dynamic_prompting as dp @@ -70,7 +71,12 @@ filter_visible_elements_only=True, ), action=dp.ActionFlags( - multi_actions=True, + action_set=bgym.HighLevelActionSetArgs( + subsets=["bid"], + multiaction=True, + ), + long_description=True, + individual_examples=True, ), use_plan=True, use_criticise=True, @@ -144,10 +150,10 @@ "use_abstract_example", ("# Abstract Example",), ), - ( - "action.multi_actions", - ("One or several actions, separated by new lines",), - ), + # ( + # "action.action_set.multiaction", + # ("One or several actions, separated by new lines",), + # ), ] @@ -199,7 +205,7 @@ def test_main_prompt_elements_gone_one_at_a_time(flag_name: str, expected_prompt # Initialize MainPrompt prompt = MainPrompt( - action_set=dp.make_action_set(flags.action), + action_set=flags.action.action_set.make_action_set(), obs_history=OBS_HISTORY, actions=ACTIONS, 
memories=memories, diff --git a/tests/experiments/test_exp_configs.py b/tests/experiments/test_exp_configs.py index bf1b07f64..8a1b096bb 100644 --- a/tests/experiments/test_exp_configs.py +++ b/tests/experiments/test_exp_configs.py @@ -1,20 +1 @@ -from agentlab.experiments import study_generators - - -def test_all_configs(): - generators = [ - # study_generators.ablation_study, - study_generators.run_agents_on_benchmark, - study_generators.random_search, - ] - - for generator in generators: - study = generator() - assert isinstance(study, study_generators.Study) - assert isinstance(study.exp_args_list, list) - assert len(study.exp_args_list) > 0 - assert isinstance(study.exp_args_list[0], study_generators.ExpArgs) - - -if __name__ == "__main__": - test_all_configs() +from agentlab.experiments import study diff --git a/tests/experiments/test_launch_exp.py b/tests/experiments/test_launch_exp.py index daa6f2063..f37869fd6 100644 --- a/tests/experiments/test_launch_exp.py +++ b/tests/experiments/test_launch_exp.py @@ -7,22 +7,20 @@ from agentlab.agents.generic_agent.agent_configs import FLAGS_GPT_3_5, AGENT_4o_MINI from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs from agentlab.analyze import inspect_results -from agentlab.experiments.launch_exp import relaunch_study, run_experiments -from agentlab.experiments.study_generators import run_agents_on_benchmark +from agentlab.experiments.launch_exp import find_incomplete, run_experiments +from agentlab.experiments.study import Study from agentlab.llm.chat_api import CheatMiniWoBLLMArgs def test_relaunch_study(): study_dir = Path(__file__).parent.parent / "data" / "test_study" - exp_args_list, study_dir_ = relaunch_study(study_dir, relaunch_mode="incomplete_only") + exp_args_list = find_incomplete(study_dir, relaunch_mode="incomplete_only") - assert study_dir_ == study_dir assert len(exp_args_list) == 1 assert exp_args_list[0].env_args.task_name == "miniwob.ascending-numbers" - exp_args_list, 
study_dir_ = relaunch_study(study_dir, relaunch_mode="incomplete_or_error") + exp_args_list = find_incomplete(study_dir, relaunch_mode="incomplete_or_error") - assert study_dir_ == study_dir assert len(exp_args_list) == 2 @@ -76,7 +74,7 @@ def test_4o_mini_on_miniwob_tiny_test(): """Run with `pytest -m pricy`.""" with tempfile.TemporaryDirectory() as tmp_dir: - study = run_agents_on_benchmark(agents=AGENT_4o_MINI, benchmark="miniwob_tiny_test") + study = Study(agent_args=[AGENT_4o_MINI], benchmark="miniwob_tiny_test", dir=tmp_dir) study.run(n_jobs=4) @@ -96,6 +94,7 @@ def test_4o_mini_on_miniwob_tiny_test(): if __name__ == "__main__": - test_4o_mini_on_miniwob_tiny_test() + test_relaunch_study() + # test_4o_mini_on_miniwob_tiny_test() # test_launch_system() # test_launch_system_sequntial() diff --git a/tests/experiments/test_reproducibility_util.py b/tests/experiments/test_reproducibility_util.py index 15b056580..57299ae3e 100644 --- a/tests/experiments/test_reproducibility_util.py +++ b/tests/experiments/test_reproducibility_util.py @@ -6,15 +6,19 @@ from agentlab.agents.generic_agent import AGENT_4o_MINI import pytest import json +import bgym @pytest.mark.parametrize( "benchmark_name", - ["miniwob", "workarena.l1", "webarena", "visualwebarena"], + ["miniwob_all", "workarena_l1", "webarena", "visualwebarena"], ) def test_get_reproducibility_info(benchmark_name): + + benchmark = bgym.BENCHMARKS[benchmark_name]() + info = reproducibility_util.get_reproducibility_info( - "test_agent", benchmark_name, ignore_changes=True + "test_agent", benchmark, "test_id", ignore_changes=True ) print("reproducibility info:") @@ -32,68 +36,68 @@ def test_get_reproducibility_info(benchmark_name): assert "browsergym__local_modifications" in info -def test_save_reproducibility_info(): - with tempfile.TemporaryDirectory() as tmp_dir: - tmp_dir = Path(tmp_dir) - - info1 = reproducibility_util.save_reproducibility_info( - study_dir=tmp_dir, - 
info=reproducibility_util.get_reproducibility_info( - agent_name="GenericAgent", - benchmark_name="miniwob", - ignore_changes=True, - ), - ) - time.sleep(1) # make sure the date changes by at least 1s - - # this should overwrite the previous info since they are the same beside - # the date - info2 = reproducibility_util.save_reproducibility_info( - study_dir=tmp_dir, - info=reproducibility_util.get_reproducibility_info( - agent_name="GenericAgent", - benchmark_name="miniwob", - ignore_changes=True, - ), - ) - - reproducibility_util._assert_compatible(info1, info2) - - # this should not overwrite info2 as the agent name is different, it - # should raise an error - with pytest.raises(ValueError): - reproducibility_util.save_reproducibility_info( - study_dir=tmp_dir, - info=reproducibility_util.get_reproducibility_info( - agent_name="GenericAgent_alt", - benchmark_name="miniwob", - ignore_changes=True, - ), - ) - - # load json - info3 = reproducibility_util.load_reproducibility_info(tmp_dir) - - assert info2 == info3 - assert info1 != info3 - - test_study_dir = Path(__file__).parent.parent / "data" / "test_study" - result_df = inspect_results.load_result_df(test_study_dir, progress_fn=None) - report_df = inspect_results.summarize_study(result_df) - - with pytest.raises(ValueError): - reproducibility_util.append_to_journal( - info3, report_df, journal_path=tmp_dir / "journal.csv" - ) - - reproducibility_util.append_to_journal( - info3, report_df, journal_path=tmp_dir / "journal.csv", strict_reproducibility=False - ) - - print((tmp_dir / "journal.csv").read_text()) +# def test_save_reproducibility_info(): +# with tempfile.TemporaryDirectory() as tmp_dir: +# tmp_dir = Path(tmp_dir) + +# info1 = reproducibility_util.save_reproducibility_info( +# study_dir=tmp_dir, +# info=reproducibility_util.get_reproducibility_info( +# agents_args="GenericAgent", +# benchmark_name="miniwob", +# ignore_changes=True, +# ), +# ) +# time.sleep(1) # make sure the date changes by at least 1s 
+ +# # this should overwrite the previous info since they are the same beside +# # the date +# info2 = reproducibility_util.save_reproducibility_info( +# study_dir=tmp_dir, +# info=reproducibility_util.get_reproducibility_info( +# agents_args="GenericAgent", +# benchmark_name="miniwob", +# ignore_changes=True, +# ), +# ) + +# reproducibility_util.assert_compatible(info1, info2) + +# # this should not overwrite info2 as the agent name is different, it +# # should raise an error +# with pytest.raises(ValueError): +# reproducibility_util.save_reproducibility_info( +# study_dir=tmp_dir, +# info=reproducibility_util.get_reproducibility_info( +# agents_args="GenericAgent_alt", +# benchmark_name="miniwob", +# ignore_changes=True, +# ), +# ) + +# # load json +# info3 = reproducibility_util.load_reproducibility_info(tmp_dir) + +# assert info2 == info3 +# assert info1 != info3 + +# test_study_dir = Path(__file__).parent.parent / "data" / "test_study" +# result_df = inspect_results.load_result_df(test_study_dir, progress_fn=None) +# report_df = inspect_results.summarize_study(result_df) + +# with pytest.raises(ValueError): +# reproducibility_util.append_to_journal( +# info3, report_df, journal_path=tmp_dir / "journal.csv" +# ) + +# reproducibility_util.append_to_journal( +# info3, report_df, journal_path=tmp_dir / "journal.csv", strict_reproducibility=False +# ) + +# print((tmp_dir / "journal.csv").read_text()) if __name__ == "__main__": # test_set_temp() test_get_reproducibility_info("miniwob") - test_save_reproducibility_info() + # test_save_reproducibility_info() diff --git a/tests/experiments/test_task_collection.py b/tests/experiments/test_task_collection.py deleted file mode 100644 index ea6267dcf..000000000 --- a/tests/experiments/test_task_collection.py +++ /dev/null @@ -1,27 +0,0 @@ -import pytest - -from agentlab.experiments.task_collections import get_benchmark_env_args - - -@pytest.mark.pricy -@pytest.mark.parametrize( - "benchmark_name, expected_length", - [ - 
("workarena.l1", 330), - ("workarena.l2", 235), - ("workarena.l3", 235), - ("webarena", 812), - ("miniwob", 625), - ], -) -def test_get_benchmark_env_args(benchmark_name, expected_length): - result = get_benchmark_env_args(benchmark_name) - assert len(result) == expected_length - - -if __name__ == "__main__": - test_get_benchmark_env_args("workarena.l1", 5) - test_get_benchmark_env_args("workarena.l2", 5) - test_get_benchmark_env_args("workarena.l3", 5) - test_get_benchmark_env_args("webarena", 5) - test_get_benchmark_env_args("miniwob", 5) From 98e5a221f59872141ab530b25098e32afc7ce942 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Mon, 21 Oct 2024 15:50:16 -0400 Subject: [PATCH 10/47] adding message class and updating generic agent accordingly (#68) * adding message class and updating generic agent accordingly * updating tests * Reproducibility test before message class * Adding inspect_result.ipynb to reprod white list * Reproducibility test after message class * L1 before message class * L1 after message class * added append as method to the Discussion class, to make it totally similar to a list * changed to_markdown behavior * updated most_basic_agent * updated ReproAgent * Update src/agentlab/analyze/agent_xray.py * format * new journal entry * immutable as default kwarg * removing __add__ and __radd__ * added deprecation warning * updating tests --- reproducibility_journal.csv | 5 + src/agentlab/agents/dynamic_prompting.py | 21 +-- .../agents/generic_agent/generic_agent.py | 11 +- .../generic_agent/generic_agent_prompt.py | 20 ++- .../generic_agent/reproducibility_agent.py | 8 +- .../most_basic_agent/most_basic_agent.py | 48 +++--- src/agentlab/analyze/agent_xray.py | 7 +- .../experiments/reproducibility_util.py | 1 + src/agentlab/llm/chat_api.py | 6 +- src/agentlab/llm/llm_utils.py | 141 +++++++++++++++++- tests/agents/test_agent.py | 13 +- tests/agents/test_generic_prompt.py | 50 ++++--- 
tests/llm/test_llm_utils.py | 6 +- 13 files changed, 253 insertions(+), 84 deletions(-) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index 03158422d..f1d937bc8 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -10,3 +10,8 @@ recursix,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.3.2,2024-10-05_13-21 recursix,GenericAgent-gpt-4o-2024-05-13,workarena.l1,0.3.2,2024-10-05_15-45-42,,0.382,0.027,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,ab447e997af589bbd022de7a5189a7685ddfa6ef,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, recursix,GenericAgent-meta-llama_llama-3-70b-instruct,workarena.l1,0.3.2,2024-10-09_21-16-37,,0.176,0.021,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,c847dbd334184271b32b252409a1b6c1042d7442,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, recursix,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob_tiny_test,0.7.0,2024-10-05_17-49-15,,1.0,0.0,0,4/4,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,a98fa24426a6ddde8443e8be44ed94cd9522e5ca,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12, +ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.8.1,2024-10-17_10-13-28,,0.557,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.7,1.39.0,0.2.2,7bba275c004f1f90dfd83eaaab963ab5066e2baf,,0.8.1,None, +ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.8.1,2024-10-17_10-50-53,,0.563,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.7,1.39.0,0.2.2,057b7d4a201cc1cd1ebd7bc884f6a91e104c479d,,0.8.1,None, +ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.4.1,2024-10-17_17-30-43,,0.258,0.024,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 
2024),3.12.7,1.39.0,0.2.2,7bba275c004f1f90dfd83eaaab963ab5066e2baf,,0.8.1,None, +ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.4.1,2024-10-17_18-30-28,,0.273,0.025,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.2,8b2b3f39a2bdb9efafad97791536a0b8cff4e708,,0.8.1,None, +ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_all,0.9.0,2024-10-20_01-54-16,2024-10-20_01-54-02,0.588,0.014,0,1250/1250,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.2,1770eba87fabfe1e32cdf6078d71032fe00db736,,0.9.0,None, diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index fdeecaac1..54d52f2cd 100644 --- a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -15,6 +15,7 @@ from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, overlay_som, prune_html from agentlab.llm.llm_utils import ( + BaseMessage, ParseError, count_tokens, extract_code_blocks, @@ -122,7 +123,7 @@ def __init__(self, visible: bool = True) -> None: self._visible = visible @property - def prompt(self): + def prompt(self) -> str | BaseMessage: """Avoid overriding this method. Override _prompt instead.""" if self.is_visible: return self._prompt @@ -253,7 +254,14 @@ def fit_tokens( if isinstance(prompt, str): prompt_str = prompt elif isinstance(prompt, list): + # warn deprecated + warn( + "Using list of prompts is deprecated. 
Use a Discussion object instead.", + DeprecationWarning, + ) prompt_str = "\n".join([p["text"] for p in prompt if p["type"] == "text"]) + elif isinstance(prompt, BaseMessage): + prompt_str = str(prompt) else: raise ValueError(f"Unrecognized type for prompt: {type(prompt)}") n_token = count_tokens(prompt_str, model=model_name) @@ -405,21 +413,14 @@ def _prompt(self) -> str: """ - def add_screenshot(self, prompt): + def add_screenshot(self, prompt: BaseMessage) -> BaseMessage: if self.flags.use_screenshot: - if isinstance(prompt, str): - prompt = [{"type": "text", "text": prompt}] if self.flags.use_som: screenshot = self.obs["screenshot_som"] else: screenshot = self.obs["screenshot"] img_url = image_to_jpg_base64_url(screenshot) - prompt.append( - { - "type": "image_url", - "image_url": {"url": img_url, "detail": self.flags.openai_vision_detail}, - } - ) + prompt.add_image(img_url, detail=self.flags.openai_vision_detail) return prompt diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index 622d25414..26a4a276b 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -9,7 +9,7 @@ from agentlab.agents import dynamic_prompting as dp from agentlab.agents.agent_args import AgentArgs from agentlab.llm.chat_api import BaseModelArgs, make_system_message, make_user_message -from agentlab.llm.llm_utils import ParseError, retry +from agentlab.llm.llm_utils import Discussion, ParseError, SystemMessage, retry from agentlab.llm.tracking import cost_tracker_decorator from .generic_agent_prompt import GenericPromptFlags, MainPrompt @@ -99,9 +99,9 @@ def get_action(self, obs): max_prompt_tokens, max_trunc_itr = self._get_maxes() - system_prompt = dp.SystemPrompt().prompt + system_prompt = SystemMessage(dp.SystemPrompt().prompt) - prompt = dp.fit_tokens( + human_prompt = dp.fit_tokens( shrinkable=main_prompt, max_prompt_tokens=max_prompt_tokens, 
model_name=self.chat_model_args.model_name, @@ -112,10 +112,7 @@ def get_action(self, obs): # TODO, we would need to further shrink the prompt if the retry # cause it to be too long - chat_messages = [ - make_system_message(system_prompt), - make_user_message(prompt), - ] + chat_messages = Discussion([system_prompt, human_prompt]) ans_dict = retry( self.chat_llm, chat_messages, diff --git a/src/agentlab/agents/generic_agent/generic_agent_prompt.py b/src/agentlab/agents/generic_agent/generic_agent_prompt.py index b75621497..a655b42f3 100644 --- a/src/agentlab/agents/generic_agent/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent/generic_agent_prompt.py @@ -1,9 +1,11 @@ -from dataclasses import dataclass import logging +from dataclasses import dataclass + from browsergym.core import action from browsergym.core.action.base import AbstractActionSet + from agentlab.agents import dynamic_prompting as dp -from agentlab.llm.llm_utils import parse_html_tags_raise +from agentlab.llm.llm_utils import HumanMessage, parse_html_tags_raise @dataclass @@ -90,8 +92,9 @@ def time_for_caution(): self.memory = Memory(visible=lambda: flags.use_memory) @property - def _prompt(self) -> str: - prompt = f"""\ + def _prompt(self) -> HumanMessage: + prompt = HumanMessage( + f"""\ {self.instructions.prompt}\ {self.obs.prompt}\ {self.history.prompt}\ @@ -103,9 +106,11 @@ def _prompt(self) -> str: {self.memory.prompt}\ {self.criticise.prompt}\ """ + ) if self.flags.use_abstract_example: - prompt += f""" + prompt.add_text( + f""" # Abstract Example Here is an abstract version of the answer with description of the content of @@ -117,9 +122,11 @@ def _prompt(self) -> str: {self.criticise.abstract_ex}\ {self.action_prompt.abstract_ex}\ """ + ) if self.flags.use_concrete_example: - prompt += f""" + prompt.add_text( + f""" # Concrete Example Here is a concrete example of how to format your answer. 
@@ -130,6 +137,7 @@ def _prompt(self) -> str: {self.criticise.concrete_ex}\ {self.action_prompt.concrete_ex}\ """ + ) return self.obs.add_screenshot(prompt) def shrink(self): diff --git a/src/agentlab/agents/generic_agent/reproducibility_agent.py b/src/agentlab/agents/generic_agent/reproducibility_agent.py index c40aece6e..091cf6cf5 100644 --- a/src/agentlab/agents/generic_agent/reproducibility_agent.py +++ b/src/agentlab/agents/generic_agent/reproducibility_agent.py @@ -26,7 +26,7 @@ from agentlab.agents.agent_args import AgentArgs from agentlab.experiments.study import Study from agentlab.llm.chat_api import make_assistant_message -from agentlab.llm.llm_utils import messages_to_dict +from agentlab.llm.llm_utils import Discussion, messages_to_dict from .generic_agent import GenericAgent, GenericAgentArgs @@ -43,7 +43,7 @@ def __init__(self, old_messages, delay=1) -> None: self.old_messages = old_messages self.delay = delay - def __call__(self, messages: list): + def __call__(self, messages: list | Discussion): self.new_messages = copy(messages) if len(messages) >= len(self.old_messages): @@ -95,7 +95,7 @@ def get_action(self, obs): # same answers step = len(self.actions) step_info = self.exp_result.get_step_info(step) - old_chat_messages = step_info.agent_info.get("chat_messages", None) + old_chat_messages = step_info.agent_info.get("chat_messages", None) # type: Discussion if old_chat_messages is None: err_msg = self.exp_result.summary_info["err_msg"] @@ -135,6 +135,8 @@ def _make_agent_stats(action, agent_info, step_info, old_chat_messages, new_chat def _format_messages(messages: list[dict]): + if isinstance(messages, Discussion): + return messages.to_string() messages = messages_to_dict(messages) return "\n".join(f"{m['role']} message:\n{m['content']}\n" for m in messages) diff --git a/src/agentlab/agents/most_basic_agent/most_basic_agent.py b/src/agentlab/agents/most_basic_agent/most_basic_agent.py index 2e0cfcbe0..9da6d9368 100644 --- 
a/src/agentlab/agents/most_basic_agent/most_basic_agent.py +++ b/src/agentlab/agents/most_basic_agent/most_basic_agent.py @@ -4,11 +4,18 @@ import bgym +from agentlab.agents.agent_args import AgentArgs from agentlab.llm.chat_api import make_system_message, make_user_message from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT -from agentlab.llm.llm_utils import ParseError, extract_code_blocks, retry +from agentlab.llm.llm_utils import ( + Discussion, + HumanMessage, + ParseError, + SystemMessage, + extract_code_blocks, + retry, +) from agentlab.llm.tracking import cost_tracker_decorator -from agentlab.agents.agent_args import AgentArgs if TYPE_CHECKING: from agentlab.llm.chat_api import BaseModelArgs @@ -51,25 +58,25 @@ def __init__( @cost_tracker_decorator def get_action(self, obs: Any) -> tuple[str, dict]: - system_prompt = f""" -You are a web assistant. -""" - prompt = f""" + messages = Discussion(SystemMessage("You are a web assistant.")) + messages.append( + HumanMessage( + f""" You are helping a user to accomplish the following goal on a website: {obs["goal"]} -Here is the current state of the website, in the form of an html: - -{obs["dom_txt"]} - To do so, you can interact with the environment using the following actions: {self.action_set.describe(with_long_description=False)} The inputs to those functions are the bids given in the html. -The action you provide must be in between triple ticks. +Here is the current state of the website, in the form of an html: + +{obs["pruned_html"]} + +The action you provide must be in between triple ticks and leverage the 'bid=' information provided in the html. Here is an example of how to use the bid action: ``` @@ -79,15 +86,14 @@ def get_action(self, obs: Any) -> tuple[str, dict]: Please provide a single action at a time and wait for the next observation. Provide only a single action per step. Focus on the bid that are given in the html, and use them to perform the actions. 
""" + ) + ) if self.use_chain_of_thought: - prompt += f""" + messages.add_text( + f""" Provide a chain of thoughts reasoning to decompose the task into smaller steps. And execute only the next step. """ - - messages = [ - make_system_message(system_prompt), - make_user_message(prompt), - ] + ) def parser(response: str) -> tuple[dict, bool, str]: blocks = extract_code_blocks(response) @@ -108,7 +114,7 @@ def parser(response: str) -> tuple[dict, bool, str]: think=thought, chat_messages=messages, # put any stats that you care about as long as it is a number or a dict of numbers - stats={"prompt_length": len(prompt), "response_length": len(thought)}, + stats={"prompt_length": len(messages), "response_length": len(thought)}, markdown_page="Add any txt information here, including base 64 images, to display in xray", extra_info={"chat_model_args": asdict(self.chat_model_args)}, ), @@ -147,6 +153,12 @@ def parser(response: str) -> tuple[dict, bool, str]: ), ] +AGENT_4o_MINI = MostBasicAgentArgs( + temperature=0.3, + use_chain_of_thought=True, + chat_model_args=chat_model_args, +) + def experiment_config(): return exp_args diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 59db2ab29..8274ed262 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -19,8 +19,9 @@ from agentlab.analyze import inspect_results from agentlab.experiments.exp_utils import RESULTS_DIR -from agentlab.llm.chat_api import make_system_message, make_user_message from agentlab.experiments.study import get_most_recent_study +from agentlab.llm.chat_api import make_system_message, make_user_message +from agentlab.llm.llm_utils import Discussion select_dir_instructions = "Select Experiment Directory" AGENT_NAME_KEY = "agent.agent_name" @@ -581,7 +582,9 @@ def update_chat_messages(): global info agent_info = info.exp_result.steps_info[info.step].agent_info chat_messages = agent_info.get("chat_messages", ["No Chat Messages"]) - 
messages = [] + if isinstance(chat_messages, Discussion): + return chat_messages.to_markdown() + messages = [] # TODO(ThibaultLSDC) remove this at some point for i, m in enumerate(chat_messages): if isinstance(m, BaseMessage): # TODO remove once langchain is deprecated m = m.content diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index ef790d0f1..e9b9dd90f 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -177,6 +177,7 @@ def get_reproducibility_info( "*/reproducibility_script.py", "*reproducibility_journal.csv", "*main.py", + "*inspect_results.ipynb", ), ignore_changes=False, ): diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py index a4df0a977..fa0acf23e 100644 --- a/src/agentlab/llm/chat_api.py +++ b/src/agentlab/llm/chat_api.py @@ -13,6 +13,7 @@ import agentlab.llm.tracking as tracking from agentlab.llm.base_api import AbstractChatModel, BaseModelArgs from agentlab.llm.huggingface_utils import HFBaseChatModel +from agentlab.llm.llm_utils import Discussion def make_system_message(content: str) -> dict: @@ -31,7 +32,10 @@ class CheatMiniWoBLLM(AbstractChatModel): """For unit-testing purposes only. 
It only work with miniwob.click-test task.""" def __call__(self, messages) -> str: - prompt = messages[-1]["content"] + if isinstance(messages, Discussion): + prompt = messages.to_string() + else: + prompt = messages[1].get("content", "") match = re.search(r"^\s*\[(\d+)\].*button", prompt, re.MULTILINE | re.IGNORECASE) if match: diff --git a/src/agentlab/llm/llm_utils.py b/src/agentlab/llm/llm_utils.py index c3d750098..b0e8e8a06 100644 --- a/src/agentlab/llm/llm_utils.py +++ b/src/agentlab/llm/llm_utils.py @@ -6,8 +6,9 @@ import os import re import time +from copy import deepcopy from functools import cache -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Union from warnings import warn import numpy as np @@ -23,14 +24,14 @@ def messages_to_dict(messages: list[dict] | list[BaseMessage]) -> dict: - new_messages = [] + new_messages = Discussion() for m in messages: if isinstance(m, dict): - new_messages.append(m) + new_messages.add_message(m) elif isinstance(m, str): - new_messages.append({"role": "", "content": m}) + new_messages.add_message({"role": "", "content": m}) elif isinstance(m, BaseMessage): - new_messages.append(convert_message_to_dict(m)) + new_messages.add_message(convert_message_to_dict(m)) else: raise ValueError(f"Unknown message type: {type(m)}") return new_messages @@ -42,7 +43,7 @@ class RetryError(ValueError): def retry( chat: "ChatModel", - messages: list[dict], + messages: "Discussion", n_retry: int, parser: callable, log: bool = True, @@ -80,8 +81,8 @@ def retry( tries = 0 while tries < n_retry: answer = chat(messages) - messages.append(answer) # TODO: could we change this to not use inplace modifications ? - + # TODO: could we change this to not use inplace modifications ? 
+ messages.append(answer) try: return parser(answer["content"]) except ParseError as parsing_error: @@ -322,6 +323,130 @@ def image_to_jpg_base64_url(image: np.ndarray | Image.Image): return f"data:image/jpeg;base64,{image_base64}" +class BaseMessage(dict): + def __init__(self, role: str, content: Union[str, list[dict]]): + self["role"] = role + self["content"] = content + + def __str__(self) -> str: + if isinstance(self["content"], str): + return self["content"] + if not all(elem["type"] == "text" for elem in self["content"]): + logging.warning( + "The content of the message has images, which are not displayed in the string representation." + ) + return "\n".join([elem["text"] for elem in self["content"] if elem["type"] == "text"]) + + def add_content(self, type: str, content: Any): + if isinstance(self["content"], str): + text = self["content"] + self["content"] = [] + self["content"].append({"type": "text", "text": text}) + self["content"].append({"type": type, type: content}) + + def add_text(self, text: str): + self.add_content("text", text) + + def add_image(self, image: np.ndarray | Image.Image | str, detail: str = None): + if not isinstance(image, str): + image_url = image_to_jpg_base64_url(image) + else: + image_url = image + if detail: + self.add_content("image", {"url": image_url, "detail": detail}) + else: + self.add_content("image", image_url) + + def to_markdown(self): + if isinstance(self["content"], str): + return f"\n```\n{self['content']}\n```\n" + res = [] + for elem in self["content"]: + # add texts between ticks and images + if elem["type"] == "text": + res.append(f"\n```\n{elem['text']}\n```\n") + elif elem["type"] == "image": + res.append(f"![image]({elem['url']})") + return "\n".join(res) + + +class SystemMessage(BaseMessage): + def __init__(self, content: Union[str, list[dict]]): + super().__init__("system", content) + + +class HumanMessage(BaseMessage): + def __init__(self, content: Union[str, list[dict]]): + super().__init__("user", 
content) + + +class AIMessage(BaseMessage): + def __init__(self, content: Union[str, list[dict]]): + super().__init__("assistant", content) + + +class Discussion: + def __init__(self, messages: Union[list[BaseMessage], BaseMessage] = None): + if isinstance(messages, BaseMessage): + messages = [messages] + elif messages is None: + messages = [] + self.messages = messages + + @property + def last_message(self): + return self.messages[-1] + + def __str__(self) -> str: + return "\n".join(str(m) for m in self.messages) + + def to_string(self): + return str(self) + + def to_openai(self): + return self.messages + + def add_message( + self, + message: BaseMessage | dict = None, + role: str = None, + content: Union[str, list[dict]] = None, + ): + if message is None: + message = BaseMessage(role, content) + else: + if isinstance(message, dict): + message = BaseMessage(**message) + self.messages.append(message) + + def append(self, message: BaseMessage | dict): + self.add_message(message) + + def add_content(self, type: str, content: Any): + """Add content to the last message.""" + self.last_message.add_content(type, content) + + def add_text(self, text: str): + """Add text to the last message.""" + self.last_message.add_text(text) + + def add_image(self, image: np.ndarray | Image.Image | str, detail: str = None): + """Add an image to the last message.""" + self.last_message.add_image(image, detail) + + def __iter__(self): + return iter(self.messages) + + def __len__(self): + return len(self.messages) + + def __getitem__(self, key): + return self.messages[key] + + def to_markdown(self): + return "\n".join([f"Message {i}\n{m.to_markdown()}\n" for i, m in enumerate(self.messages)]) + + if __name__ == "__main__": # model_to_download = "THUDM/agentlm-70b" diff --git a/tests/agents/test_agent.py b/tests/agents/test_agent.py index 0b2c31f28..ec0b4d28e 100644 --- a/tests/agents/test_agent.py +++ b/tests/agents/test_agent.py @@ -11,6 +11,7 @@ from agentlab.analyze import 
inspect_results from agentlab.experiments import launch_exp from agentlab.llm.chat_api import BaseModelArgs, CheatMiniWoBLLMArgs +from agentlab.llm.llm_utils import Discussion def test_generic_agent(): @@ -55,7 +56,10 @@ def __call__(self, messages) -> str: self.retry_count += 1 return dict(role="assistant", content="I'm retrying") - prompt = messages[1].get("content", "") + if isinstance(messages, Discussion): + prompt = messages.to_string() + else: + prompt = messages[1].get("content", "") match = re.search(r"^\s*\[(\d+)\].*button", prompt, re.MULTILINE | re.IGNORECASE) if match: @@ -93,7 +97,10 @@ class CheatLLM_LLMError: def __call__(self, messages) -> str: if self.success: - prompt = messages[1].get("content", "") + if isinstance(messages, Discussion): + prompt = messages.to_string() + else: + prompt = messages[1].get("content", "") match = re.search(r"^\s*\[(\d+)\].*button", prompt, re.MULTILINE | re.IGNORECASE) if match: @@ -229,4 +236,4 @@ def test_llm_error_no_success(): if __name__ == "__main__": # test_generic_agent() - test_llm_error_no_success() + test_llm_error_success() diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index 1c8deff8a..e74e0a5c5 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -172,9 +172,9 @@ def test_shrinking_observation(): flags=flags, ) - prompt = prompt_maker.prompt - new_prompt = dp.fit_tokens( - prompt_maker, max_prompt_tokens=count_tokens(prompt) - 1, max_iterations=7 + prompt = str(prompt_maker.prompt) + new_prompt = str( + dp.fit_tokens(prompt_maker, max_prompt_tokens=count_tokens(prompt) - 1, max_iterations=7) ) assert count_tokens(new_prompt) < count_tokens(prompt) assert "[1] Click me" in prompt @@ -204,16 +204,18 @@ def test_main_prompt_elements_gone_one_at_a_time(flag_name: str, expected_prompt memories = MEMORIES # Initialize MainPrompt - prompt = MainPrompt( - action_set=flags.action.action_set.make_action_set(), - 
obs_history=OBS_HISTORY, - actions=ACTIONS, - memories=memories, - thoughts=THOUGHTS, - previous_plan="1- think\n2- do it", - step=2, - flags=flags, - ).prompt + prompt = str( + MainPrompt( + action_set=flags.action.action_set.make_action_set(), + obs_history=OBS_HISTORY, + actions=ACTIONS, + memories=memories, + thoughts=THOUGHTS, + previous_plan="1- think\n2- do it", + step=2, + flags=flags, + ).prompt + ) # Verify all elements are not present for expected in expected_prompts: @@ -224,16 +226,18 @@ def test_main_prompt_elements_present(): # Make sure the flag is enabled # Initialize MainPrompt - prompt = MainPrompt( - action_set=dp.HighLevelActionSet(), - obs_history=OBS_HISTORY, - actions=ACTIONS, - memories=MEMORIES, - thoughts=THOUGHTS, - previous_plan="1- think\n2- do it", - step=2, - flags=ALL_TRUE_FLAGS, - ).prompt + prompt = str( + MainPrompt( + action_set=dp.HighLevelActionSet(), + obs_history=OBS_HISTORY, + actions=ACTIONS, + memories=MEMORIES, + thoughts=THOUGHTS, + previous_plan="1- think\n2- do it", + step=2, + flags=ALL_TRUE_FLAGS, + ).prompt + ) # Verify all elements are not present for _, expected_prompts in FLAG_EXPECTED_PROMPT: for expected in expected_prompts: diff --git a/tests/llm/test_llm_utils.py b/tests/llm/test_llm_utils.py index 7e5bb87cc..d8c29a695 100644 --- a/tests/llm/test_llm_utils.py +++ b/tests/llm/test_llm_utils.py @@ -183,7 +183,7 @@ def test_successful_parse_before_max_retries(): ] ) - result = llm_utils.retry(mock_chat, [], 5, mock_parser) + result = llm_utils.retry(mock_chat, llm_utils.Discussion(), 5, mock_parser) assert result == "Parsed value" assert mock_chat.call.call_count == 3 @@ -202,7 +202,7 @@ def test_unsuccessful_parse_before_max_retries(): ] ) with pytest.raises(llm_utils.ParseError): - result = llm_utils.retry(mock_chat, [], 2, mock_parser) + result = llm_utils.retry(mock_chat, llm_utils.Discussion(), 2, mock_parser) assert mock_chat.call.call_count == 2 @@ -213,7 +213,7 @@ def test_retry_parse_raises(): 
parser_raises = Mock(side_effect=ValueError("Parser error")) with pytest.raises(ValueError): - llm_utils.retry(mock_chat, [], 3, parser_raises) + llm_utils.retry(mock_chat, llm_utils.Discussion(), 3, parser_raises) def test_extract_code_blocks(): From a6c1f93c59fb7a838d06ca02ef6c62abe2ce278c Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Mon, 21 Oct 2024 15:51:53 -0400 Subject: [PATCH 11/47] version bump --- src/agentlab/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/__init__.py b/src/agentlab/__init__.py index b5fdc7530..d31c31eae 100644 --- a/src/agentlab/__init__.py +++ b/src/agentlab/__init__.py @@ -1 +1 @@ -__version__ = "0.2.2" +__version__ = "0.2.3" From d085e81591c441e0417a2e26e8e42f988011308c Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Tue, 22 Oct 2024 15:54:59 -0400 Subject: [PATCH 12/47] Updating generic_agent to fit use BGym's goal_object (#83) * updating generic agent to goal_object * fixing image markdown display * updating tests * fixing intruction BaseMessage * added merge text in discussion * added merge to discussion class * added tests --- src/agentlab/agents/dynamic_prompting.py | 24 ++++++++++---- .../agents/generic_agent/generic_agent.py | 2 ++ .../generic_agent/generic_agent_prompt.py | 7 ++-- src/agentlab/llm/llm_utils.py | 33 +++++++++++++++++-- tests/agents/test_generic_prompt.py | 12 +++++++ tests/llm/test_llm_utils.py | 31 ++++++++++++++++- 6 files changed, 96 insertions(+), 13 deletions(-) diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index 54d52f2cd..e1ee4ca85 100644 --- a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -443,24 +443,36 @@ def __init__(self, visible: bool = True) -> None: class GoalInstructions(PromptElement): - def __init__(self, goal, visible: bool = True, extra_instructions=None) -> None: + def __init__(self, goal_object, 
visible: bool = True, extra_instructions=None) -> None: super().__init__(visible) - self._prompt = f"""\ + self._prompt = [ + dict( + type="text", + text=f"""\ # Instructions Review the current state of the page and all other information to find the best possible next action to accomplish your goal. Your answer will be interpreted and executed by a program, make sure to follow the formatting instructions. ## Goal: -{goal} -""" +""", + ) + ] + + self._prompt += goal_object + if extra_instructions: - self._prompt += f""" + self._prompt += [ + dict( + type="text", + text=f""" ## Extra instructions: {extra_instructions} -""" +""", + ) + ] class ChatInstructions(PromptElement): diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index 26a4a276b..c9746d8c3 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -89,6 +89,7 @@ def get_action(self, obs): main_prompt = MainPrompt( action_set=self.action_set, obs_history=self.obs_history, + goal_object=obs["goal_object"], actions=self.actions, memories=self.memories, thoughts=self.thoughts, @@ -268,3 +269,4 @@ def get_action_post_hoc(agent: GenericAgent, obs: dict, ans_dict: dict): output += f"\n\n{action}\n" return system_prompt, instruction_prompt, output + return system_prompt, instruction_prompt, output diff --git a/src/agentlab/agents/generic_agent/generic_agent_prompt.py b/src/agentlab/agents/generic_agent/generic_agent_prompt.py index a655b42f3..7f60cd04d 100644 --- a/src/agentlab/agents/generic_agent/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent/generic_agent_prompt.py @@ -51,6 +51,7 @@ def __init__( self, action_set: AbstractActionSet, obs_history: list[dict], + goal_object: list[dict], actions: list[str], memories: list[str], thoughts: list[str], @@ -71,7 +72,7 @@ def __init__( "Agent is in goal mode, but multiple user messages are present in the chat. 
Consider switching to `enable_chat=True`." ) self.instructions = dp.GoalInstructions( - obs_history[-1]["goal"], extra_instructions=flags.extra_instructions + goal_object, extra_instructions=flags.extra_instructions ) self.obs = dp.Observation(obs_history[-1], self.flags.obs) @@ -93,9 +94,9 @@ def time_for_caution(): @property def _prompt(self) -> HumanMessage: - prompt = HumanMessage( + prompt = HumanMessage(self.instructions.prompt) + prompt.add_text( f"""\ -{self.instructions.prompt}\ {self.obs.prompt}\ {self.history.prompt}\ {self.action_prompt.prompt}\ diff --git a/src/agentlab/llm/llm_utils.py b/src/agentlab/llm/llm_utils.py index b0e8e8a06..dec6b7f7a 100644 --- a/src/agentlab/llm/llm_utils.py +++ b/src/agentlab/llm/llm_utils.py @@ -326,7 +326,7 @@ def image_to_jpg_base64_url(image: np.ndarray | Image.Image): class BaseMessage(dict): def __init__(self, role: str, content: Union[str, list[dict]]): self["role"] = role - self["content"] = content + self["content"] = deepcopy(content) def __str__(self) -> str: if isinstance(self["content"], str): @@ -365,10 +365,30 @@ def to_markdown(self): # add texts between ticks and images if elem["type"] == "text": res.append(f"\n```\n{elem['text']}\n```\n") - elif elem["type"] == "image": - res.append(f"![image]({elem['url']})") + elif elem["type"] == "image_url": + img_str = ( + elem["image_url"] + if isinstance(elem["image_url"], str) + else elem["image_url"]["url"] + ) + res.append(f"![image]({img_str})") return "\n".join(res) + def merge(self): + """Merges content elements of type 'text' if they are adjacent.""" + if isinstance(self["content"], str): + return + new_content = [] + for elem in self["content"]: + if elem["type"] == "text": + if new_content and new_content[-1]["type"] == "text": + new_content[-1]["text"] += "\n" + elem["text"] + else: + new_content.append(elem) + else: + new_content.append(elem) + self["content"] = new_content + class SystemMessage(BaseMessage): def __init__(self, content: Union[str, 
list[dict]]): @@ -397,13 +417,19 @@ def __init__(self, messages: Union[list[BaseMessage], BaseMessage] = None): def last_message(self): return self.messages[-1] + def merge(self): + for m in self.messages: + m.merge() + def __str__(self) -> str: return "\n".join(str(m) for m in self.messages) def to_string(self): + self.merge() return str(self) def to_openai(self): + self.merge() return self.messages def add_message( @@ -444,6 +470,7 @@ def __getitem__(self, key): return self.messages[key] def to_markdown(self): + self.merge() return "\n".join([f"Message {i}\n{m.to_markdown()}\n" for i, m in enumerate(self.messages)]) diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index e74e0a5c5..b3aa6db0a 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -24,6 +24,7 @@ OBS_HISTORY = [ { "goal": "do this and that", + "goal_object": [{"type": "text", "text": "do this and that"}], "chat_messages": [{"role": "user", "message": "do this and that"}], "pruned_html": html_template.format(1), "axtree_txt": "[1] Click me", @@ -32,6 +33,7 @@ }, { "goal": "do this and that", + "goal_object": [{"type": "text", "text": "do this and that"}], "chat_messages": [{"role": "user", "message": "do this and that"}], "pruned_html": html_template.format(2), "axtree_txt": "[1] Click me", @@ -40,6 +42,7 @@ }, { "goal": "do this and that", + "goal_object": [{"type": "text", "text": "do this and that"}], "chat_messages": [{"role": "user", "message": "do this and that"}], "pruned_html": html_template.format(3), "axtree_txt": "[1] Click me", @@ -47,6 +50,7 @@ "last_action_error": "Hey, there is an error now", }, ] +GOAL_OBJECT = [{"type": "text", "text": "do this and that"}] ACTIONS = ["click('41')", "click('42')"] MEMORIES = ["memory A", "memory B"] THOUGHTS = ["thought A", "thought B"] @@ -164,6 +168,7 @@ def test_shrinking_observation(): prompt_maker = MainPrompt( action_set=dp.HighLevelActionSet(), obs_history=OBS_HISTORY, 
+ goal_object=GOAL_OBJECT, actions=ACTIONS, memories=MEMORIES, thoughts=THOUGHTS, @@ -208,6 +213,7 @@ def test_main_prompt_elements_gone_one_at_a_time(flag_name: str, expected_prompt MainPrompt( action_set=flags.action.action_set.make_action_set(), obs_history=OBS_HISTORY, + goal_object=GOAL_OBJECT, actions=ACTIONS, memories=memories, thoughts=THOUGHTS, @@ -230,6 +236,7 @@ def test_main_prompt_elements_present(): MainPrompt( action_set=dp.HighLevelActionSet(), obs_history=OBS_HISTORY, + goal_object=GOAL_OBJECT, actions=ACTIONS, memories=MEMORIES, thoughts=THOUGHTS, @@ -250,3 +257,8 @@ def test_main_prompt_elements_present(): test_main_prompt_elements_present() for flag, expected_prompts in FLAG_EXPECTED_PROMPT: test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) diff --git a/tests/llm/test_llm_utils.py b/tests/llm/test_llm_utils.py index d8c29a695..10febbac1 100644 --- a/tests/llm/test_llm_utils.py +++ b/tests/llm/test_llm_utils.py @@ -242,9 +242,38 @@ def hello_world(): assert llm_utils.extract_code_blocks(text) == expected_output +def test_message_merge_only_text(): + content = [ + {"type": "text", "text": "Hello, world!"}, + {"type": "text", "text": "This is a test."}, + ] + message = llm_utils.BaseMessage(role="system", content=content) + message.merge() + assert len(message["content"]) == 1 + assert message["content"][0]["text"] == "Hello, world!\nThis is a test." 
+ + +def test_message_merge_text_image(): + content = [ + {"type": "text", "text": "Hello, world!"}, + {"type": "text", "text": "This is a test."}, + {"type": "image_url", "image_url": "this is a base64 image"}, + {"type": "text", "text": "This is another test."}, + {"type": "text", "text": "Goodbye, world!"}, + ] + message = llm_utils.BaseMessage(role="system", content=content) + message.merge() + assert len(message["content"]) == 3 + assert message["content"][0]["text"] == "Hello, world!\nThis is a test." + assert message["content"][1]["image_url"] == "this is a base64 image" + assert message["content"][2]["text"] == "This is another test.\nGoodbye, world!" + + if __name__ == "__main__": # test_retry_parallel() # test_rate_limit_max_wait_time() # test_successful_parse_before_max_retries() # test_unsuccessful_parse_before_max_retries() - test_extract_code_blocks() + # test_extract_code_blocks() + # test_message_merge_only_text() + test_message_merge_text_image() From 59ad7cc129585a3c62f468ace2314b4fae1fe327 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Tue, 22 Oct 2024 16:17:00 -0400 Subject: [PATCH 13/47] Minor revert (#86) * minor revert * revert tests too --- src/agentlab/agents/generic_agent/generic_agent.py | 2 +- src/agentlab/agents/generic_agent/generic_agent_prompt.py | 3 +-- tests/agents/test_generic_prompt.py | 5 +---- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index c9746d8c3..5ef8a4cc2 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -89,7 +89,6 @@ def get_action(self, obs): main_prompt = MainPrompt( action_set=self.action_set, obs_history=self.obs_history, - goal_object=obs["goal_object"], actions=self.actions, memories=self.memories, thoughts=self.thoughts, @@ -270,3 +269,4 @@ def 
get_action_post_hoc(agent: GenericAgent, obs: dict, ans_dict: dict): return system_prompt, instruction_prompt, output return system_prompt, instruction_prompt, output + return system_prompt, instruction_prompt, output diff --git a/src/agentlab/agents/generic_agent/generic_agent_prompt.py b/src/agentlab/agents/generic_agent/generic_agent_prompt.py index 7f60cd04d..c1b4b6346 100644 --- a/src/agentlab/agents/generic_agent/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent/generic_agent_prompt.py @@ -51,7 +51,6 @@ def __init__( self, action_set: AbstractActionSet, obs_history: list[dict], - goal_object: list[dict], actions: list[str], memories: list[str], thoughts: list[str], @@ -72,7 +71,7 @@ def __init__( "Agent is in goal mode, but multiple user messages are present in the chat. Consider switching to `enable_chat=True`." ) self.instructions = dp.GoalInstructions( - goal_object, extra_instructions=flags.extra_instructions + obs_history[-1]["goal_object"], extra_instructions=flags.extra_instructions ) self.obs = dp.Observation(obs_history[-1], self.flags.obs) diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index b3aa6db0a..a579c2611 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -50,7 +50,6 @@ "last_action_error": "Hey, there is an error now", }, ] -GOAL_OBJECT = [{"type": "text", "text": "do this and that"}] ACTIONS = ["click('41')", "click('42')"] MEMORIES = ["memory A", "memory B"] THOUGHTS = ["thought A", "thought B"] @@ -168,7 +167,6 @@ def test_shrinking_observation(): prompt_maker = MainPrompt( action_set=dp.HighLevelActionSet(), obs_history=OBS_HISTORY, - goal_object=GOAL_OBJECT, actions=ACTIONS, memories=MEMORIES, thoughts=THOUGHTS, @@ -213,7 +211,6 @@ def test_main_prompt_elements_gone_one_at_a_time(flag_name: str, expected_prompt MainPrompt( action_set=flags.action.action_set.make_action_set(), obs_history=OBS_HISTORY, - goal_object=GOAL_OBJECT, 
actions=ACTIONS, memories=memories, thoughts=THOUGHTS, @@ -236,7 +233,6 @@ def test_main_prompt_elements_present(): MainPrompt( action_set=dp.HighLevelActionSet(), obs_history=OBS_HISTORY, - goal_object=GOAL_OBJECT, actions=ACTIONS, memories=MEMORIES, thoughts=THOUGHTS, @@ -262,3 +258,4 @@ def test_main_prompt_elements_present(): test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) From 0e831330bf8110b5504494a6855d8af9363ffe6c Mon Sep 17 00:00:00 2001 From: Alexandre Lacoste Date: Tue, 22 Oct 2024 16:18:51 -0400 Subject: [PATCH 14/47] Add tabs (#84) * add tabs * make sure it's not computed if not visible --- src/agentlab/agents/dynamic_prompting.py | 38 ++++++++++++++++++- .../generic_agent/generic_agent_prompt.py | 6 ++- .../experiments/test_reproducibility_util.py | 2 +- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index e1ee4ca85..ac49089c9 100644 --- a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -366,16 +366,50 @@ def __init__(self, bid, visible: bool = True, prefix="") -> None: """ +class Tabs(PromptElement): + def __init__(self, obs, visible: bool = True, prefix="") -> None: + super().__init__(visible=visible) + self.obs = obs + self.prefix = prefix + + @property + def _prompt(self) -> str: + # by implementing this as a property, it's only coputed if visible + prompt_pieces = [f"\n{self.prefix}Currently open tabs:"] + for page_index, (page_url, page_title) in enumerate( + zip(self.obs["open_pages_urls"], self.obs["open_pages_titles"]) + ): + active_or_not = " (active tab)" if page_index == self.obs["active_page_index"] else "" + prompt_piece = f"""\ +Tab {page_index}{active_or_not}: + Title: 
{page_title} + URL: {page_url} +""" + prompt_pieces.append(prompt_piece) + self._prompt = "\n".join(prompt_pieces) + + +def has_tab_action(action_set: bgym.HighLevelActionSetArgs): + return "tab" in action_set.subsets + + class Observation(Shrinkable): """Observation of the current step. Contains the html, the accessibility tree and the error logs. """ - def __init__(self, obs, flags: ObsFlags) -> None: + def __init__(self, obs, flags: ObsFlags, use_tabs=False) -> None: super().__init__() self.flags = flags self.obs = obs + + self.tabs = Tabs( + obs, + visible=use_tabs, + prefix="## ", + ) + self.html = HTML( obs[flags.html_type], visible_elements_only=flags.filter_visible_elements_only, @@ -409,7 +443,7 @@ def shrink(self): def _prompt(self) -> str: return f""" # Observation of current step: -{self.html.prompt}{self.ax_tree.prompt}{self.focused_element.prompt}{self.error.prompt} +{self.tabs}{self.html.prompt}{self.ax_tree.prompt}{self.focused_element.prompt}{self.error.prompt} """ diff --git a/src/agentlab/agents/generic_agent/generic_agent_prompt.py b/src/agentlab/agents/generic_agent/generic_agent_prompt.py index c1b4b6346..eb45ba59f 100644 --- a/src/agentlab/agents/generic_agent/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent/generic_agent_prompt.py @@ -74,7 +74,11 @@ def __init__( obs_history[-1]["goal_object"], extra_instructions=flags.extra_instructions ) - self.obs = dp.Observation(obs_history[-1], self.flags.obs) + self.obs = dp.Observation( + obs_history[-1], + self.flags.obs, + use_tabs=dp.has_tab_action(self.flags.action.action_set), + ) self.action_prompt = dp.ActionPrompt(action_set, action_flags=flags.action) diff --git a/tests/experiments/test_reproducibility_util.py b/tests/experiments/test_reproducibility_util.py index 57299ae3e..6008bb30e 100644 --- a/tests/experiments/test_reproducibility_util.py +++ b/tests/experiments/test_reproducibility_util.py @@ -11,7 +11,7 @@ @pytest.mark.parametrize( "benchmark_name", - ["miniwob_all", 
"workarena_l1", "webarena", "visualwebarena"], + ["miniwob", "workarena_l1", "webarena", "visualwebarena"], ) def test_get_reproducibility_info(benchmark_name): From 86fe572772d1b153ceaa82655c0cbef66dda1b28 Mon Sep 17 00:00:00 2001 From: Alexandre Lacoste Date: Wed, 23 Oct 2024 07:58:53 -0400 Subject: [PATCH 15/47] Fix reproduce study (#87) * add tabs * this workaround is worst * bug fix * fix reproduce study * make sure it's not computed if not visible --- .../generic_agent/reproducibility_agent.py | 36 ++++++++++++++----- src/agentlab/analyze/agent_xray.py | 30 +++++++++------- .../experiments/reproducibility_util.py | 8 ++--- src/agentlab/experiments/study.py | 25 +++++++++---- 4 files changed, 64 insertions(+), 35 deletions(-) diff --git a/src/agentlab/agents/generic_agent/reproducibility_agent.py b/src/agentlab/agents/generic_agent/reproducibility_agent.py index 091cf6cf5..ffec1111a 100644 --- a/src/agentlab/agents/generic_agent/reproducibility_agent.py +++ b/src/agentlab/agents/generic_agent/reproducibility_agent.py @@ -17,6 +17,7 @@ from dataclasses import dataclass from pathlib import Path +import bgym from browsergym.experiments.agent import AgentInfo from browsergym.experiments.loop import ExpArgs, ExpResult, yield_all_exp_results from bs4 import BeautifulSoup @@ -24,6 +25,7 @@ from langchain_community.adapters.openai import convert_message_to_dict from agentlab.agents.agent_args import AgentArgs +from agentlab.agents.dynamic_prompting import ActionFlags from agentlab.experiments.study import Study from agentlab.llm.chat_api import make_assistant_message from agentlab.llm.llm_utils import Discussion, messages_to_dict @@ -141,18 +143,29 @@ def _format_messages(messages: list[dict]): return "\n".join(f"{m['role']} message:\n{m['content']}\n" for m in messages) +def _make_backward_compatible(agent_args: GenericAgentArgs): + action_set = agent_args.flags.action.action_set + if isinstance(action_set, (str, list)): + if isinstance(action_set, str): + action_set 
= action_set.split("+") + + agent_args.flags.action.action_set = bgym.HighLevelActionSetArgs( + subsets=action_set, + multiaction=agent_args.flags.action.multi_actions, + ) + + return agent_args + + def reproduce_study(original_study_dir: Path | str, log_level=logging.INFO): """Reproduce a study by running the same experiments with the same agent.""" original_study_dir = Path(original_study_dir) - study = Study.load(original_study_dir) - study.dir = None - study.make_dir() - exp_args_list: list[ExpArgs] = [] for exp_result in yield_all_exp_results(original_study_dir, progress_fn=None): - agent_args = make_repro_agent(exp_result.exp_args.agent_args, exp_dir=exp_result.exp_dir) + agent_args = _make_backward_compatible(exp_result.exp_args.agent_args) + agent_args = make_repro_agent(agent_args, exp_dir=exp_result.exp_dir) exp_args_list.append( ExpArgs( agent_args=agent_args, @@ -160,13 +173,18 @@ def reproduce_study(original_study_dir: Path | str, log_level=logging.INFO): logging_level=log_level, ) ) + + # infer benchmark name from task list for backward compatible benchmark_name = exp_args_list[0].env_args.task_name.split(".")[0] - return Study( - exp_args_list=exp_args_list, - benchmark_name=benchmark_name, - agent_names=[agent_args.agent_name], + study = Study( + benchmark=benchmark_name, + agent_args=[agent_args], ) + # this exp_args_list has a different agent_args for each experiment as repro_agent takes the exp_dir as argument + # so we overwrite exp_args_list with the one we created above + study.exp_args_list = exp_args_list + return study def make_repro_agent(agent_args: AgentArgs, exp_dir: Path | str): diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 8274ed262..38968fd6c 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -153,12 +153,6 @@ def filter_agent_id(self, agent_id: list[tuple]): white-space: normal !important; word-wrap: break-word !important; } -#task_table { - 
height: 500px !important; -} -#seed_table { - height: 500px !important; -} """ @@ -227,7 +221,7 @@ def run_gradio(results_dir: Path): content. You have to sort back with the Idx column to align the click with the order.""" ) - agent_table = gr.DataFrame(height=500, show_label=False, interactive=False) + agent_table = gr.DataFrame(max_height=500, show_label=False, interactive=False) with gr.Tab("Select Task and Seed", id="Select Task"): with gr.Row(): with gr.Column(scale=4): @@ -244,7 +238,10 @@ def run_gradio(results_dir: Path): refresh_results_button = gr.Button("↺", scale=0, size="sm") task_table = gr.DataFrame( - height=500, show_label=False, interactive=False, elem_id="task_table" + max_height=500, + show_label=False, + interactive=False, + elem_id="task_table", ) with gr.Column(scale=2): @@ -259,7 +256,10 @@ def run_gradio(results_dir: Path): ) seed_table = gr.DataFrame( - height=500, show_label=False, interactive=False, elem_id="seed_table" + max_height=500, + show_label=False, + interactive=False, + elem_id="seed_table", ) with gr.Tab("Constants and Variables"): @@ -272,7 +272,9 @@ def run_gradio(results_dir: Path): **all** agents. They are displayed as a table with the name and value of the constant.""" ) - constants = gr.DataFrame(height=500, show_label=False, interactive=False) + constants = gr.DataFrame( + max_height=500, show_label=False, interactive=False + ) with gr.Column(scale=2): with gr.Accordion("Variables", open=False): gr.Markdown( @@ -281,9 +283,11 @@ def run_gradio(results_dir: Path): They are displayed as a table with the name, value and count of unique values. 
A maximum of 3 different values are displayed.""" ) - variables = gr.DataFrame(height=500, show_label=False, interactive=False) + variables = gr.DataFrame( + max_height=500, show_label=False, interactive=False + ) with gr.Tab("Global Stats"): - global_stats = gr.DataFrame(height=500, show_label=False, interactive=False) + global_stats = gr.DataFrame(max_height=500, show_label=False, interactive=False) with gr.Row(): episode_info = gr.Markdown(label="Episode Info", elem_classes="my-markdown") @@ -356,7 +360,7 @@ def run_gradio(results_dir: Path): logs = gr.Code(language=None, **code_args) with gr.Tab("Stats") as tab_stats: - stats = gr.DataFrame(height=500, show_label=False, interactive=False) + stats = gr.DataFrame(max_height=500, show_label=False, interactive=False) with gr.Tab("Agent Info HTML") as tab_agent_info_html: with gr.Row(): diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index e9b9dd90f..754347953 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -262,14 +262,10 @@ def _verify_report(report_df: pd.DataFrame, agent_names=list[str], strict_reprod unique_agent_names = report_df["agent.agent_name"].unique() if set(agent_names) != set(unique_agent_names): raise ValueError( - f"Agent names in the report {unique_agent_names} do not match the agent names {agent_names}.", - raise_error=strict_reproducibility, + f"Agent names in the report {unique_agent_names} do not match the agent names {agent_names}." 
) if len(set(agent_names)) != len(agent_names): - raise ValueError( - f"Duplicate agent names {agent_names}.", - raise_error=strict_reproducibility, - ) + raise ValueError(f"Duplicate agent names {agent_names}.") report_df = report_df.set_index("agent.agent_name", inplace=False) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 60f2166bb..c83019524 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -23,8 +23,9 @@ class Study: This is part of the high level API to help keep experiments organized and reproducible. Attributes: - benchmark: Benchmark - The benchmark to evaluate the agents on. + benchmark: Benchmark | str + The benchmark to evaluate the agents on. If a string is provided, it will be + converted to the corresponding benchmark using bgym.BENCHMARKS. agent_args: list[AgentArgs] The list of agents to evaluate. @@ -43,7 +44,7 @@ class Study: """ agent_args: list[AgentArgs] = None - benchmark: Benchmark = None + benchmark: Benchmark | str = None dir: Path = None suffix: str = "" # used for adding a personnal comment to the study name uuid: str = None @@ -157,10 +158,20 @@ def get_report(self, ignore_cache=False, ignore_stale=False): @staticmethod def load(dir: Path) -> "Study": - with gzip.open(dir / "study.pkl.gz", "rb") as f: - study = pickle.load(f) # type: Study - - study.dir = dir + dir = Path(dir) + study_path = dir / "study.pkl.gz" + if not study_path.exists() and dir.is_dir(): + # For backward compatibility + first_result = next( + inspect_results.yield_all_exp_results(savedir_base=dir, progress_fn=None) + ) + benchmark_name = first_result.exp_args.env_args.task_name.split(".")[0] + agent_args = first_result.exp_args.agent_args + study = Study(agent_args=agent_args, benchmark=benchmark_name, dir=dir) + else: + with gzip.open(dir / "study.pkl.gz", "rb") as f: + study = pickle.load(f) # type: Study + study.dir = dir return study @staticmethod From 
682e0f4cd86e2ca71052790169c3f4dec9d2c2b4 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Wed, 23 Oct 2024 08:14:32 -0400 Subject: [PATCH 16/47] upgrading gradio dependency (#88) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5d7fcb8f5..1d00e25c2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,7 +16,7 @@ contexttimer ipython pyyaml>=6 pandas -gradio<5 +gradio>=5 gitpython # for the reproducibility script requests matplotlib From 176fe8a4edc18436ef6d94770c6bb3cdf083910f Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Wed, 23 Oct 2024 11:59:24 -0400 Subject: [PATCH 17/47] bgym update (#90) --- src/agentlab/experiments/study.py | 16 ++++++++-------- tests/experiments/test_reproducibility_util.py | 14 ++++++++------ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index c83019524..f1b823af4 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -1,19 +1,19 @@ -from dataclasses import dataclass -from datetime import datetime import gzip import logging -from pathlib import Path import pickle +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path -from bgym import ExpArgs, EnvArgs, Benchmark import bgym +from bgym import Benchmark, EnvArgs, ExpArgs from agentlab.agents.agent_args import AgentArgs from agentlab.analyze import inspect_results from agentlab.experiments import args -from agentlab.experiments.launch_exp import run_experiments, find_incomplete -from agentlab.experiments.exp_utils import RESULTS_DIR from agentlab.experiments import reproducibility_util as repro +from agentlab.experiments.exp_utils import RESULTS_DIR +from agentlab.experiments.launch_exp import find_incomplete, run_experiments @dataclass @@ -25,7 
+25,7 @@ class Study: Attributes: benchmark: Benchmark | str The benchmark to evaluate the agents on. If a string is provided, it will be - converted to the corresponding benchmark using bgym.BENCHMARKS. + converted to the corresponding benchmark using bgym.DEFAULT_BENCHMARKS. agent_args: list[AgentArgs] The list of agents to evaluate. @@ -54,7 +54,7 @@ class Study: def __post_init__(self): self.uuid = str(datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) if isinstance(self.benchmark, str): - self.benchmark = bgym.BENCHMARKS[self.benchmark]() + self.benchmark = bgym.DEFAULT_BENCHMARKS[self.benchmark]() if isinstance(self.dir, str): self.dir = Path(self.dir) self.make_exp_args_list() diff --git a/tests/experiments/test_reproducibility_util.py b/tests/experiments/test_reproducibility_util.py index 6008bb30e..aa10ff47f 100644 --- a/tests/experiments/test_reproducibility_util.py +++ b/tests/experiments/test_reproducibility_util.py @@ -1,12 +1,14 @@ -from pathlib import Path +import json import tempfile import time +from pathlib import Path + +import bgym +import pytest + +from agentlab.agents.generic_agent import AGENT_4o_MINI from agentlab.analyze import inspect_results from agentlab.experiments import reproducibility_util -from agentlab.agents.generic_agent import AGENT_4o_MINI -import pytest -import json -import bgym @pytest.mark.parametrize( @@ -15,7 +17,7 @@ ) def test_get_reproducibility_info(benchmark_name): - benchmark = bgym.BENCHMARKS[benchmark_name]() + benchmark = bgym.DEFAULT_BENCHMARKS[benchmark_name]() info = reproducibility_util.get_reproducibility_info( "test_agent", benchmark, "test_id", ignore_changes=True From 605c5039fde4ec4f9ccce4ef3d522fa1d98e664e Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Wed, 23 Oct 2024 22:05:55 -0400 Subject: [PATCH 18/47] Workarena TMLR experiments (#89) * new entry * adding llm configs * new journal entries --- reproducibility_journal.csv | 6 ++++++ 
src/agentlab/llm/llm_configs.py | 14 ++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index f1d937bc8..aa28461bd 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -15,3 +15,9 @@ ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.8.1,2024-10-17_10-50- ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.4.1,2024-10-17_17-30-43,,0.258,0.024,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.2,7bba275c004f1f90dfd83eaaab963ab5066e2baf,,0.8.1,None, ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.4.1,2024-10-17_18-30-28,,0.273,0.025,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.2,8b2b3f39a2bdb9efafad97791536a0b8cff4e708,,0.8.1,None, ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_all,0.9.0,2024-10-20_01-54-16,2024-10-20_01-54-02,0.588,0.014,0,1250/1250,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.2,1770eba87fabfe1e32cdf6078d71032fe00db736,,0.9.0,None, +ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,workarena_l1,0.4.1,2024-10-22_18-41-55,2024-10-22_15-24-53,0.215,0.023,1,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,a6c1f93c59fb7a838d06ca02ef6c62abe2ce278c,,0.9.0,None, +ThibaultLSDC,GenericAgent-gpt-4o-mini,workarena_l1,0.4.1,2024-10-23_12-17-24,2024-10-23_02-00-49,0.252,0.024,1,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,d364ce7d5f566889830cdc0ef58b320d2093694e,,0.9.0,None, +ThibaultLSDC,GenericAgent-gpt-4o,workarena_l1,0.4.1,2024-10-23_12-17-24,2024-10-23_02-00-49,0.488,0.028,1,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,d364ce7d5f566889830cdc0ef58b320d2093694e,,0.9.0,None, 
+ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,workarena_l1,0.4.1,2024-10-23_12-17-24,2024-10-23_02-00-49,0.579,0.027,2,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,d364ce7d5f566889830cdc0ef58b320d2093694e,,0.9.0,None, +ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,workarena_l1,0.4.1,2024-10-23_12-17-24,2024-10-23_02-00-49,0.309,0.025,2,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,d364ce7d5f566889830cdc0ef58b320d2093694e,,0.9.0,None, +ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,workarena_l1,0.4.1,2024-10-23_12-17-24,2024-10-23_02-00-49,0.527,0.027,2,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,d364ce7d5f566889830cdc0ef58b320d2093694e,,0.9.0,None, diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index 30889be3d..13cb69b41 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -77,6 +77,13 @@ max_input_tokens=40_000, max_new_tokens=4_000, ), + "azure/gpt-4o-mini-2024-07-18": AzureModelArgs( + model_name="gpt-4o-mini", + deployment_name="gpt-4o-mini-2024-07-18", + max_total_tokens=128_000, + max_input_tokens=40_000, + max_new_tokens=4_000, + ), # ---------------- OSS LLMs ----------------# "meta-llama/Meta-Llama-3-70B-Instruct": SelfHostedModelArgs( model_name="meta-llama/Meta-Llama-3-70B-Instruct", @@ -152,4 +159,11 @@ max_new_tokens=2_000, temperature=1e-1, ), + "openrouter/openai/o1-mini-2024-09-12": OpenRouterModelArgs( + model_name="openai/o1-mini-2024-09-12", + max_total_tokens=128_000, + max_input_tokens=40_000, + max_new_tokens=4000, + temperature=1e-1, + ), } From 96b5cd6fc52f9bccb5757876d47f6a5253827e1e Mon Sep 17 00:00:00 2001 From: Alexandre Lacoste Date: Wed, 23 Oct 2024 22:24:43 -0400 Subject: [PATCH 19/47] handling sequntial in VWA (#91) * handling sequntial in VWA * enable comments * format --------- Co-authored-by: ThibaultLSDC --- 
src/agentlab/experiments/launch_exp.py | 27 ++++++++++++++- src/agentlab/experiments/study.py | 48 ++++++++++++++++++++++---- 2 files changed, 68 insertions(+), 7 deletions(-) diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index d62de9efa..f151f7ecb 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -56,6 +56,15 @@ def run_experiments( if parallel_backend == "joblib": from joblib import Parallel, delayed + # split sequential + sequential_exp_args, exp_args_list = _split_sequential_exp(exp_args_list) + + logging.info( + f"Running {len(sequential_exp_args)} in sequential first. The remaining {len(exp_args_list)} will be run in parallel." + ) + for exp_args in sequential_exp_args: + exp_args.run() + Parallel(n_jobs=n_jobs, prefer="processes")( delayed(exp_args.run)() for exp_args in exp_args_list ) @@ -98,9 +107,12 @@ def find_incomplete(study_dir: str | Path, relaunch_mode="incomplete_only"): ) exp_args_list = list(_yield_incomplete_experiments(study_dir, relaunch_mode=relaunch_mode)) + # sort according to exp_args.order + exp_args_list.sort(key=lambda exp_args: exp_args.order if exp_args.order is not None else 0) + if len(exp_args_list) == 0: logging.info(f"No incomplete experiments found in {study_dir}.") - return [], study_dir + return exp_args_list message = f"Make sure the processes that were running are all stopped. 
Otherwise, " f"there will be concurrent writing in the same directories.\n" @@ -140,3 +152,16 @@ def split_path(path: str): path = path.replace("/", ".") module_name, obj_name = path.rsplit(".", 1) return module_name, obj_name + + +def _split_sequential_exp(exp_args_list: list[ExpArgs]) -> tuple[list[ExpArgs], list[ExpArgs]]: + """split exp_args that are flagged as sequential from those that are not""" + sequential_exp_args = [] + parallel_exp_args = [] + for exp_args in exp_args_list: + if getattr(exp_args, "sequential", False): + sequential_exp_args.append(exp_args) + else: + parallel_exp_args.append(exp_args) + + return sequential_exp_args, parallel_exp_args diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index f1b823af4..2da9771fb 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -68,19 +68,23 @@ def find_incomplete(self, relaunch_mode="incomplete_or_error"): """Find incomplete or errored experiments in the study directory for relaunching.""" self.exp_args_list = find_incomplete(self.dir, relaunch_mode=relaunch_mode) - def set_reproducibility_info(self, strict_reproducibility=False): + def set_reproducibility_info(self, strict_reproducibility=False, comment=None): """Gather relevant information that may affect the reproducibility of the experiment e.g.: versions of BrowserGym, benchmark, AgentLab...""" agent_names = [a.agent_name for a in self.agent_args] info = repro.get_reproducibility_info( - agent_names, self.benchmark, self.uuid, ignore_changes=not strict_reproducibility + agent_names, + self.benchmark, + self.uuid, + ignore_changes=not strict_reproducibility, + comment=comment, ) if self.reproducibility_info is not None: repro.assert_compatible(self.reproducibility_info, info) self.reproducibility_info = info - def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False): + def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False, 
comment=None): """Run all experiments in the study in parallel when possible. Args: @@ -98,7 +102,9 @@ def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False) if self.exp_args_list is None: raise ValueError("exp_args_list is None. Please set exp_args_list before running.") - self.set_reproducibility_info(strict_reproducibility=strict_reproducibility) + self.set_reproducibility_info( + strict_reproducibility=strict_reproducibility, comment=comment + ) self.save() run_experiments(n_jobs, self.exp_args_list, self.dir, parallel_backend=parallel_backend) @@ -172,6 +178,12 @@ def load(dir: Path) -> "Study": with gzip.open(dir / "study.pkl.gz", "rb") as f: study = pickle.load(f) # type: Study study.dir = dir + + # just a check + for i, exp_args in enumerate(study.exp_args_list): + if exp_args.order != i: + logging.warning("The order of the experiments is not correct.") + return study @staticmethod @@ -270,6 +282,12 @@ def _agents_on_benchmark( if not isinstance(agents, (list, tuple)): agents = [agents] + if benchmark.name.startswith("visualwebarena") or benchmark.name.startswith("webarena"): + if len(agents) > 1: + raise ValueError( + f"Only one agent can be run on {benchmark.name} since the instance requires manual reset after each evaluation." + ) + for agent in agents: agent.set_benchmark(benchmark, demo_mode) # the agent can adapt (lightly?) 
to the benchmark @@ -277,13 +295,31 @@ def _agents_on_benchmark( if demo_mode: set_demo_mode(env_args_list) - return args.expand_cross_product( + exp_args_list = args.expand_cross_product( ExpArgs( agent_args=args.CrossProd(agents), env_args=args.CrossProd(env_args_list), logging_level=logging_level, ) - ) + ) # type: list[ExpArgs] + + for i, exp_args in enumerate(exp_args_list): + exp_args.order = i + + _flag_sequential_exp(exp_args_list, benchmark) + + return exp_args_list + + +def _flag_sequential_exp(exp_args_list: list[ExpArgs], benchmark: Benchmark): + if benchmark.name.startswith("visualwebarena"): + sequential_subset = benchmark.subset_from_glob("requires_reset", "True") + sequential_subset = set( + [env_args.task_name for env_args in sequential_subset.env_args_list] + ) + for exp_args in exp_args_list: + if exp_args.env_args.task_name in sequential_subset: + exp_args.sequential = True # def ablation_study(start_agent: AgentArgs, changes, benchmark: str, demo_mode=False): From 13840fca87807dfb83f9999b939c34d2b953399f Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Wed, 23 Oct 2024 22:25:06 -0400 Subject: [PATCH 20/47] Tmlr workarena (#92) * adding llm configs * new L1 entries --- reproducibility_journal.csv | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index aa28461bd..2075f2d91 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -21,3 +21,8 @@ ThibaultLSDC,GenericAgent-gpt-4o,workarena_l1,0.4.1,2024-10-23_12-17-24,2024-10- ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,workarena_l1,0.4.1,2024-10-23_12-17-24,2024-10-23_02-00-49,0.579,0.027,2,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,d364ce7d5f566889830cdc0ef58b320d2093694e,,0.9.0,None, 
ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,workarena_l1,0.4.1,2024-10-23_12-17-24,2024-10-23_02-00-49,0.309,0.025,2,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,d364ce7d5f566889830cdc0ef58b320d2093694e,,0.9.0,None, ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,workarena_l1,0.4.1,2024-10-23_12-17-24,2024-10-23_02-00-49,0.527,0.027,2,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,d364ce7d5f566889830cdc0ef58b320d2093694e,,0.9.0,None, +ThibaultLSDC,GenericAgent-gpt-4o-mini,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-23_14-17-40,0.27,0.024,1,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,4cd1e2d4189ddfbeb94129f7b0c9a00c3400ebac,,0.9.0,f25bdcd6b946fc4a79cdbee5fbcad53548af8724, +ThibaultLSDC,GenericAgent-gpt-4o,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-23_14-17-40,0.455,0.027,1,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,4cd1e2d4189ddfbeb94129f7b0c9a00c3400ebac,,0.9.0,f25bdcd6b946fc4a79cdbee5fbcad53548af8724, +ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-23_14-17-40,0.564,0.027,1,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,4cd1e2d4189ddfbeb94129f7b0c9a00c3400ebac,,0.9.0,f25bdcd6b946fc4a79cdbee5fbcad53548af8724, +ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-23_14-17-40,0.279,0.025,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,4cd1e2d4189ddfbeb94129f7b0c9a00c3400ebac,,0.9.0,f25bdcd6b946fc4a79cdbee5fbcad53548af8724, +ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-23_14-17-40,0.567,0.027,4,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 
2024),3.12.7,1.39.0,0.2.3,4cd1e2d4189ddfbeb94129f7b0c9a00c3400ebac,,0.9.0,f25bdcd6b946fc4a79cdbee5fbcad53548af8724, From 024481a398d1f29260921a130f961f2e651b46f0 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 24 Oct 2024 16:48:42 +0000 Subject: [PATCH 21/47] tmp --- src/agentlab/experiments/study.py | 34 +++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 2da9771fb..17d44cd44 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -14,6 +14,7 @@ from agentlab.experiments import reproducibility_util as repro from agentlab.experiments.exp_utils import RESULTS_DIR from agentlab.experiments.launch_exp import find_incomplete, run_experiments +logger = logging.getLogger("agentlab_"+__name__) @dataclass @@ -50,6 +51,7 @@ class Study: uuid: str = None reproducibility_info: dict = None logging_level: int = logging.INFO + logging_level_stdout: int = logging.INFO def __post_init__(self): self.uuid = str(datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) @@ -61,13 +63,17 @@ def __post_init__(self): def make_exp_args_list(self): self.exp_args_list = _agents_on_benchmark( - self.agent_args, self.benchmark, logging_level=self.logging_level + self.agent_args, self.benchmark, logging_level=self.logging_level, logging_level_stdout=self.logging_level_stdout ) def find_incomplete(self, relaunch_mode="incomplete_or_error"): """Find incomplete or errored experiments in the study directory for relaunching.""" self.exp_args_list = find_incomplete(self.dir, relaunch_mode=relaunch_mode) + def load_exp_args_list(self): + logger.info(f"Loading experiments from {self.dir}") + self.exp_args_list = list(inspect_results.yield_all_exp_results(savedir_base=self.dir)) + def set_reproducibility_info(self, strict_reproducibility=False, comment=None): """Gather relevant information that may affect the reproducibility of the experiment @@ -102,15 +108,17 
@@ def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False, if self.exp_args_list is None: raise ValueError("exp_args_list is None. Please set exp_args_list before running.") - self.set_reproducibility_info( - strict_reproducibility=strict_reproducibility, comment=comment - ) + + logger.info("Preparing backends...") + self.benchmark.prepare_backends() + logger.info("Backends ready.") + self.set_reproducibility_info(strict_reproducibility=strict_reproducibility, comment=comment) self.save() run_experiments(n_jobs, self.exp_args_list, self.dir, parallel_backend=parallel_backend) report_df = self.get_report(ignore_cache=True) - logging.info(f"Study {self.name} finished.") - logging.info("\n" + str(report_df)) + logger.info(f"Study {self.name} finished.") + logger.info("\n" + str(report_df)) def append_to_journal(self, strict_reproducibility=True): """Append the study to the journal. @@ -162,7 +170,9 @@ def get_report(self, ignore_cache=False, ignore_stale=False): self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale ) - @staticmethod + + + def load(dir: Path) -> "Study": dir = Path(dir) study_path = dir / "study.pkl.gz" @@ -179,10 +189,10 @@ def load(dir: Path) -> "Study": study = pickle.load(f) # type: Study study.dir = dir - # just a check - for i, exp_args in enumerate(study.exp_args_list): - if exp_args.order != i: - logging.warning("The order of the experiments is not correct.") + # # just a check + # for i, exp_args in enumerate(study.exp_args_list): + # if exp_args.order != i: + # logging.warning(f"The order of the experiments is not correct. {exp_args.order} != {i}") return study @@ -262,6 +272,7 @@ def _agents_on_benchmark( benchmark: bgym.Benchmark, demo_mode=False, logging_level: int = logging.INFO, + logging_level_stdout: int = logging.INFO, ): """Run one or multiple agents on a benchmark. 
@@ -300,6 +311,7 @@ def _agents_on_benchmark( agent_args=args.CrossProd(agents), env_args=args.CrossProd(env_args_list), logging_level=logging_level, + logging_level_stdout=logging_level_stdout, ) ) # type: list[ExpArgs] From 519f51e3959033448af257e6236e261e7cf3edc1 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 24 Oct 2024 13:14:18 -0400 Subject: [PATCH 22/47] reformat --- src/agentlab/experiments/study.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 17d44cd44..2a9f08e68 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -14,7 +14,8 @@ from agentlab.experiments import reproducibility_util as repro from agentlab.experiments.exp_utils import RESULTS_DIR from agentlab.experiments.launch_exp import find_incomplete, run_experiments -logger = logging.getLogger("agentlab_"+__name__) + +logger = logging.getLogger("agentlab_" + __name__) @dataclass @@ -63,7 +64,10 @@ def __post_init__(self): def make_exp_args_list(self): self.exp_args_list = _agents_on_benchmark( - self.agent_args, self.benchmark, logging_level=self.logging_level, logging_level_stdout=self.logging_level_stdout + self.agent_args, + self.benchmark, + logging_level=self.logging_level, + logging_level_stdout=self.logging_level_stdout, ) def find_incomplete(self, relaunch_mode="incomplete_or_error"): @@ -108,11 +112,12 @@ def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False, if self.exp_args_list is None: raise ValueError("exp_args_list is None. 
Please set exp_args_list before running.") - logger.info("Preparing backends...") self.benchmark.prepare_backends() logger.info("Backends ready.") - self.set_reproducibility_info(strict_reproducibility=strict_reproducibility, comment=comment) + self.set_reproducibility_info( + strict_reproducibility=strict_reproducibility, comment=comment + ) self.save() run_experiments(n_jobs, self.exp_args_list, self.dir, parallel_backend=parallel_backend) @@ -170,9 +175,6 @@ def get_report(self, ignore_cache=False, ignore_stale=False): self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale ) - - - def load(dir: Path) -> "Study": dir = Path(dir) study_path = dir / "study.pkl.gz" From 8f235f8a6117d7bc9445134f3248a9910a669614 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Thu, 24 Oct 2024 15:54:05 -0400 Subject: [PATCH 23/47] adding assistantbench to reproducibility_util.py --- src/agentlab/experiments/reproducibility_util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index 754347953..52e4e62a3 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -38,6 +38,8 @@ def _get_benchmark_version(benchmark: bgym.Benchmark) -> str: return metadata.distribution("weblinx_browsergym").version except metadata.PackageNotFoundError: return "0.0.1rc1" + elif benchmark_name.startswith("assistantbench"): + return metadata.distribution("browsergym.assistantbench").version else: raise ValueError(f"Unknown benchmark {benchmark_name}") From 6e18fb818a64ec1e3f379c1a6480411d2fd0628b Mon Sep 17 00:00:00 2001 From: Maxime Gasse Date: Wed, 30 Oct 2024 15:09:26 -0400 Subject: [PATCH 24/47] gitignore (#97) --- .gitignore | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index d0037afc9..aa26dc9dc 100644 --- a/.gitignore +++ b/.gitignore @@ -161,35 +161,10 @@ 
cython_debug/ **/.DS_Store .vscode -allowed_selenium.json -# Torchtune -finetuning/torchtune - -# PyLLMD repo for finetuning -pyllmd_tune/research-pyllmd/ -pyllmd_tune/data/ - - -datasets/* _sandbox.py -node_modules/ -/test-results/ -/playwright-report/ -/blob-report/ -/playwright/.cache/ -/test-results/ -/playwright-report/ -/blob-report/ -/playwright/.cache/ - results/ -# personal (optimass) -ICML_deadline/ -mass_utils/ -pyllmd_tune/ - -# don't ignore the miniwob_tasks_all.csv file -!miniwob_tasks_all.csv +# gradio +.gradio/ \ No newline at end of file From 05448cf22d0c0a4ab29d4ab2a65751bdcb9eea76 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Tue, 5 Nov 2024 09:07:20 -0500 Subject: [PATCH 25/47] Vision fix (#105) * changing content name * Update src/agentlab/llm/llm_utils.py --------- Co-authored-by: Maxime Gasse --- src/agentlab/llm/llm_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/agentlab/llm/llm_utils.py b/src/agentlab/llm/llm_utils.py index dec6b7f7a..eaa2a5e02 100644 --- a/src/agentlab/llm/llm_utils.py +++ b/src/agentlab/llm/llm_utils.py @@ -8,7 +8,8 @@ import time from copy import deepcopy from functools import cache -from typing import TYPE_CHECKING, Any, Union +from typing import TYPE_CHECKING +from typing import Any, Union from warnings import warn import numpy as np @@ -353,9 +354,9 @@ def add_image(self, image: np.ndarray | Image.Image | str, detail: str = None): else: image_url = image if detail: - self.add_content("image", {"url": image_url, "detail": detail}) + self.add_content("image_url", {"url": image_url, "detail": detail}) else: - self.add_content("image", image_url) + self.add_content("image_url", image_url) def to_markdown(self): if isinstance(self["content"], str): From f6ac587b316ecaab90ed8c896cf7de408b08b271 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:43:14 -0500 Subject: 
[PATCH 26/47] L2 tmlr (#93) * adding llm configs * L2 entries * claude L3 * claude vision support * miniwob results * 405b L1 entry --- reproducibility_journal.csv | 12 ++++++++++++ src/agentlab/llm/llm_configs.py | 1 + 2 files changed, 13 insertions(+) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index 2075f2d91..5403a4d7d 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -26,3 +26,15 @@ ThibaultLSDC,GenericAgent-gpt-4o,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10- ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-23_14-17-40,0.564,0.027,1,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,4cd1e2d4189ddfbeb94129f7b0c9a00c3400ebac,,0.9.0,f25bdcd6b946fc4a79cdbee5fbcad53548af8724, ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-23_14-17-40,0.279,0.025,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,4cd1e2d4189ddfbeb94129f7b0c9a00c3400ebac,,0.9.0,f25bdcd6b946fc4a79cdbee5fbcad53548af8724, ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-23_14-17-40,0.567,0.027,4,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,4cd1e2d4189ddfbeb94129f7b0c9a00c3400ebac,,0.9.0,f25bdcd6b946fc4a79cdbee5fbcad53548af8724, +ThibaultLSDC,GenericAgent-gpt-4o-mini,workarena_l2_agent_curriculum_eval,0.4.1,2024-10-24_17-08-53,2024-10-23_17-10-46,0.013,0.007,2,235/235,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,827d847995f19dc337f3899427340bdddbd81cd5,,0.10.0,None, +ThibaultLSDC,GenericAgent-gpt-4o,workarena_l2_agent_curriculum_eval,0.4.1,2024-10-24_17-08-53,2024-10-23_17-10-46,0.085,0.018,3,233/235,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,827d847995f19dc337f3899427340bdddbd81cd5,,0.10.0,None, 
+ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,workarena_l2_agent_curriculum_eval,0.4.1,2024-10-24_17-08-53,2024-10-23_17-10-46,0.391,0.032,3,235/235,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,827d847995f19dc337f3899427340bdddbd81cd5,,0.10.0,None, +ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,workarena_l2_agent_curriculum_eval,0.4.1,2024-10-24_17-08-53,2024-10-23_17-10-46,0.021,0.009,2,235/235,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,827d847995f19dc337f3899427340bdddbd81cd5,,0.10.0,None, +ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,workarena_l2_agent_curriculum_eval,0.4.1,2024-10-24_17-08-53,2024-10-23_17-10-46,0.149,0.023,1,235/235,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,827d847995f19dc337f3899427340bdddbd81cd5,,0.10.0,None, +ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,workarena_l3_agent_curriculum_eval,0.4.1,2024-10-24_23-03-30,2024-10-24_18-06-57,0.004,0.004,1,235/235,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,de67ed8ad4321740ff05cf26ab889978be706460,,0.10.2,a9e44a88139798543ba53fc8c45d44997665ccca, +ThibaultLSDC,GenericAgent-gpt-4o-mini,miniwob,0.10.2,2024-10-25_17-16-23,2024-10-25_06-08-16,0.566,0.02,0,625/625,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,f12887f776525bcad6a0c42cb49651ff4f65af43,,0.10.2,a9e44a88139798543ba53fc8c45d44997665ccca, +ThibaultLSDC,GenericAgent-gpt-4o,miniwob,0.10.2,2024-10-25_17-16-23,2024-10-25_06-08-16,0.638,0.019,0,625/625,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,f12887f776525bcad6a0c42cb49651ff4f65af43,,0.10.2,a9e44a88139798543ba53fc8c45d44997665ccca, +ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,miniwob,0.10.2,2024-10-25_17-16-23,2024-10-25_06-08-16,0.698,0.018,0,625/625,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 
2024),3.12.7,1.39.0,0.2.3,f12887f776525bcad6a0c42cb49651ff4f65af43,,0.10.2,a9e44a88139798543ba53fc8c45d44997665ccca, +ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob,0.10.2,2024-10-25_17-16-23,2024-10-25_06-08-16,0.576,0.02,0,625/625,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,f12887f776525bcad6a0c42cb49651ff4f65af43,,0.10.2,a9e44a88139798543ba53fc8c45d44997665ccca, +ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,miniwob,0.10.2,2024-10-25_17-16-23,2024-10-25_06-08-16,0.678,0.019,0,625/625,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,f12887f776525bcad6a0c42cb49651ff4f65af43,,0.10.2,a9e44a88139798543ba53fc8c45d44997665ccca, +ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,workarena_l1,0.4.1,2024-10-25_20-32-26,2024-10-25_17-34-45,0.433,0.027,1,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,177ba72a7469e5610e6b615adf1bdcde58cb0298,,0.10.2,a9e44a88139798543ba53fc8c45d44997665ccca, diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index 13cb69b41..8376b5c25 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -151,6 +151,7 @@ max_input_tokens=40_000, max_new_tokens=4000, temperature=1e-1, + vision_support=True, ), "openrouter/qwen/qwen-2-72b-instruct": OpenRouterModelArgs( model_name="qwen/qwen-2-72b-instruct", From f8d1e47ed6aa40275a27cd368edfdeacf3fe9318 Mon Sep 17 00:00:00 2001 From: Alexandre Lacoste Date: Wed, 6 Nov 2024 09:58:09 -0500 Subject: [PATCH 27/47] Replacing Dask with Ray (#100) * dask-dependencies * minor * replace with ray * adjust tests and move a few things * markdown report * automatic relaunch * add dependencies * reformat * fix unit-test * catch timeout * fixing bugs and making things work * adress comments and black format * new dependencies viewer * Update benchmark to use visualwebarena instead of webarena * Fix import and uncomment code in 
get_ray_url.py * Add ignore_dependencies option to Study and _agents_on_benchmark functions * Update load_most_recent method to include contains parameter * Update load_most_recent method to accept contains parameter and add warning for ignored dependencies in _agents_on_benchmark * Refactor backend preparation in Study class and improve logging for ignored dependencies * finallly some results with claude on webarena * Add warnings for Windows timeouts and clarify parallel backend options; update get_results method to conditionally save outputs * black * ensure timeout is int (For the 3rd time?) * Refactor timeout handling in context manager; update test to reduce avg_step_timeout and rename test function * black * Change parallel backend from "joblib" to "ray" in run_experiments function * Update src/agentlab/experiments/study.py Co-authored-by: Maxime Gasse * Update src/agentlab/analyze/inspect_results.py Co-authored-by: Maxime Gasse * Refactor logging initialization and update layout configurations in dependency graph plotting; adjust node size and font size for better visualization --------- Co-authored-by: Maxime Gasse --- main.py | 18 +- reproducibility_journal.csv | 1 + requirements.txt | 1 + src/agentlab/analyze/inspect_results.ipynb | 8 +- src/agentlab/analyze/inspect_results.py | 61 +++- src/agentlab/experiments/exp_utils.py | 160 ++++++++- src/agentlab/experiments/get_ray_url.py | 5 + src/agentlab/experiments/graph_execution.py | 96 ------ .../experiments/graph_execution_dask.py | 64 ++++ .../experiments/graph_execution_ray.py | 39 +++ src/agentlab/experiments/launch_exp.py | 136 +++++--- src/agentlab/experiments/study.py | 163 ++++++--- src/agentlab/experiments/view_dep_graph.py | 322 ++++++++++++++++++ src/agentlab/llm/chat_api.py | 10 +- tests/experiments/test_dask.py | 41 +++ tests/experiments/test_launch_exp.py | 60 +++- .../{test_graph_execution.py => test_ray.py} | 46 +-- 17 files changed, 943 insertions(+), 288 deletions(-) create mode 100644 
src/agentlab/experiments/get_ray_url.py delete mode 100644 src/agentlab/experiments/graph_execution.py create mode 100644 src/agentlab/experiments/graph_execution_dask.py create mode 100644 src/agentlab/experiments/graph_execution_ray.py create mode 100644 src/agentlab/experiments/view_dep_graph.py create mode 100644 tests/experiments/test_dask.py rename tests/experiments/{test_graph_execution.py => test_ray.py} (63%) diff --git a/main.py b/main.py index 1d0ba4464..4f9f57c84 100644 --- a/main.py +++ b/main.py @@ -6,7 +6,6 @@ repository. """ -import bgym import logging from agentlab.agents.generic_agent import ( RANDOM_SEARCH_AGENT, @@ -26,7 +25,7 @@ # ## select the benchmark to run on benchmark = "miniwob_tiny_test" -# benchmark = "miniwob_all" +# benchmark = "miniwob" # benchmark = "workarena_l1" # benchmark = "workarena_l2" # benchmark = "workarena_l3" @@ -53,13 +52,18 @@ if relaunch: # relaunch an existing study - study = Study.load_most_recent() - study.find_incomplete(relaunch_mode="incomplete_or_error") + study = Study.load_most_recent(contains=None) + study.find_incomplete(include_errors=True) else: - study = Study(agent_args, benchmark) - - study.run(n_jobs=n_jobs, parallel_backend="joblib", strict_reproducibility=reproducibility_mode) + study = Study(agent_args, benchmark, logging_level_stdout=logging.WARNING) + + study.run( + n_jobs=n_jobs, + parallel_backend="ray", + strict_reproducibility=reproducibility_mode, + n_relaunch=3, + ) if reproducibility_mode: study.append_to_journal(strict_reproducibility=True) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index 5403a4d7d..ebeeacb51 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -26,6 +26,7 @@ ThibaultLSDC,GenericAgent-gpt-4o,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10- ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-23_14-17-40,0.564,0.027,1,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 
13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,4cd1e2d4189ddfbeb94129f7b0c9a00c3400ebac,,0.9.0,f25bdcd6b946fc4a79cdbee5fbcad53548af8724, ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-23_14-17-40,0.279,0.025,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,4cd1e2d4189ddfbeb94129f7b0c9a00c3400ebac,,0.9.0,f25bdcd6b946fc4a79cdbee5fbcad53548af8724, ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-23_14-17-40,0.567,0.027,4,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,4cd1e2d4189ddfbeb94129f7b0c9a00c3400ebac,,0.9.0,f25bdcd6b946fc4a79cdbee5fbcad53548af8724, +recursix,GenericAgent-anthropic_claude-3.5-sonnet:beta,webarena,0.11.3,2024-11-02_23-50-17,22a9d3f5-9d86-455e-b451-3ea17690ce8a,0.329,0.016,0,812/812,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.3,418a05d90c74800cd66371b7846ef861185b8c47,,0.11.3,160167ff0d2631826f0131e8e30b92ef448d6881, ThibaultLSDC,GenericAgent-gpt-4o-mini,workarena_l2_agent_curriculum_eval,0.4.1,2024-10-24_17-08-53,2024-10-23_17-10-46,0.013,0.007,2,235/235,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,827d847995f19dc337f3899427340bdddbd81cd5,,0.10.0,None, ThibaultLSDC,GenericAgent-gpt-4o,workarena_l2_agent_curriculum_eval,0.4.1,2024-10-24_17-08-53,2024-10-23_17-10-46,0.085,0.018,3,233/235,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,827d847995f19dc337f3899427340bdddbd81cd5,,0.10.0,None, ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,workarena_l2_agent_curriculum_eval,0.4.1,2024-10-24_17-08-53,2024-10-23_17-10-46,0.391,0.032,3,235/235,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,827d847995f19dc337f3899427340bdddbd81cd5,,0.10.0,None, diff --git a/requirements.txt b/requirements.txt index 1d00e25c2..56ef81216 100644 --- a/requirements.txt 
+++ b/requirements.txt @@ -20,3 +20,4 @@ gradio>=5 gitpython # for the reproducibility script requests matplotlib +ray[default] diff --git a/src/agentlab/analyze/inspect_results.ipynb b/src/agentlab/analyze/inspect_results.ipynb index e1281f141..b4b3828ae 100644 --- a/src/agentlab/analyze/inspect_results.ipynb +++ b/src/agentlab/analyze/inspect_results.ipynb @@ -151,7 +151,11 @@ "metadata": {}, "outputs": [], "source": [ - "print(inspect_results.error_report(result_df, max_stack_trace=1))" + "from IPython.display import Markdown, display\n", + "\n", + "report = inspect_results.error_report(result_df, max_stack_trace=2, use_log=True)\n", + "# display(Markdown(report))\n", + "print(report)" ] }, { @@ -166,7 +170,7 @@ ], "metadata": { "kernelspec": { - "display_name": "ui-copilot", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index f304250b3..9ea2aac3e 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -581,10 +581,12 @@ def set_wrap_style(df): # ------------ -def map_err_key(err_msg): +def map_err_key(err_msg: str): if err_msg is None: return err_msg + # remove logs from the message if any + err_msg = err_msg[: err_msg.find("=== logs ===")].rstrip() regex_replacements = [ ( r"your messages resulted in \d+ tokens", @@ -601,7 +603,7 @@ def map_err_key(err_msg): return err_msg -def error_report(df: pd.DataFrame, max_stack_trace=10): +def error_report(df: pd.DataFrame, max_stack_trace=10, use_log=False): """Report the error message for each agent.""" if "err_key" not in df: @@ -611,35 +613,62 @@ def error_report(df: pd.DataFrame, max_stack_trace=10): report = [] for err_key, count in unique_counts.items(): report.append("-------------------") - report.append(f"{count}x : {err_key}\n") + report.append(f"## {count}x : " + err_key.replace("\n", "
") + "\n") + # find sub_df with this error message sub_df = df[df["err_key"] == err_key] idx = 0 exp_result_list = [get_exp_result(row.exp_dir) for _, row in sub_df.iterrows()] - task_names = [exp_result.exp_args.env_args.task_name for exp_result in exp_result_list] - - # count unique using numpy - unique_task_names, counts = np.unique(task_names, return_counts=True) - task_and_count = sorted(zip(unique_task_names, counts), key=lambda x: x[1], reverse=True) - for task_name, count in task_and_count: - report.append(f"{count:2d} {task_name}") + exp_result_list = sorted(exp_result_list, key=lambda x: x.exp_args.env_args.task_name) + for exp_result in exp_result_list: + report.append( + f"* {exp_result.exp_args.env_args.task_name} seed: {exp_result.exp_args.env_args.task_seed}" + ) report.append(f"\nShowing Max {max_stack_trace} stack traces:\n") for exp_result in exp_result_list: if idx >= max_stack_trace: break - # print task name and stack trace - stack_trace = exp_result.summary_info.get("stack_trace", "") - report.append(f"Task Name: {exp_result.exp_args.env_args.task_name}\n") - report.append(f"exp_dir: {exp_result.exp_dir}\n") - report.append(f"Stack Trace: \n {stack_trace}\n") - report.append("\n") + + if not use_log: + # print task name and stack trace + stack_trace = exp_result.summary_info.get("stack_trace", "") + report.append(f"Task Name: {exp_result.exp_args.env_args.task_name}\n") + report.append(f"exp_dir: {exp_result.exp_dir}\n") + report.append(f"Stack Trace: \n {stack_trace}\n") + report.append("\n") + else: + report.append(f"```bash\n{_format_log(exp_result)}\n```") + idx += 1 return "\n".join(report) +def _format_log(exp_result: ExpResult, head_lines=10, tail_lines=50): + """Extract head and tail of the log. 
Try to find the traceback.""" + log = exp_result.logs + if log is None: + return "No log found" + + log_lines = log.split("\n") + if len(log_lines) <= head_lines + tail_lines: + return log + + # first 10 lines: + log_head = "\n".join(log_lines[:head_lines]) + + try: + traceback_idx = log.rindex("Traceback (most recent call last):") + tail_idx = log.rindex("action:", 0, traceback_idx) + log_tail = log[tail_idx:] + except ValueError: + log_tail = "\n".join(log_lines[-tail_lines:]) + + return log_head + "\n...\n...truncated middle of the log\n...\n" + log_tail + + def categorize_error(row): if pd.isna(row.get("err_msg", None)): return None diff --git a/src/agentlab/experiments/exp_utils.py b/src/agentlab/experiments/exp_utils.py index 3ae88deff..c2864ce00 100644 --- a/src/agentlab/experiments/exp_utils.py +++ b/src/agentlab/experiments/exp_utils.py @@ -4,6 +4,12 @@ from tqdm import tqdm import logging from browsergym.experiments.loop import ExpArgs +from contextlib import contextmanager +import signal +import sys +from time import time, sleep + +logger = logging.getLogger(__name__) # Get logger based on module name # TODO move this to a more appropriate place @@ -19,8 +25,148 @@ RESULTS_DIR.mkdir(parents=True, exist_ok=True) +def run_exp(exp_arg: ExpArgs, *dependencies, avg_step_timeout=60): + """Run exp_args.run() with a timeout and handle dependencies.""" + episode_timeout = _episode_timeout(exp_arg, avg_step_timeout=avg_step_timeout) + with timeout_manager(seconds=episode_timeout): + return exp_arg.run() + + +def _episode_timeout(exp_arg: ExpArgs, avg_step_timeout=60): + """Some logic to determine the episode timeout.""" + max_steps = getattr(exp_arg.env_args, "max_steps", None) + if max_steps is None: + episode_timeout_global = 10 * 60 * 60 # 10 hours + else: + episode_timeout_global = exp_arg.env_args.max_steps * avg_step_timeout + + episode_timeout_exp = getattr(exp_arg, "episode_timeout", episode_timeout_global) + + return min(episode_timeout_global, 
episode_timeout_exp) + + +@contextmanager +def timeout_manager(seconds: int = None): + """Context manager to handle timeouts.""" + + if isinstance(seconds, float): + seconds = max(1, int(seconds)) # make sure seconds is at least 1 + + if seconds is None or sys.platform == "win32": + try: + logger.warning("Timeouts are not supported on Windows.") + yield + finally: + pass + return + + def alarm_handler(signum, frame): + + logger.warning( + f"Operation timed out after {seconds}s, sending SIGINT and raising TimeoutError." + ) + # send sigint + os.kill(os.getpid(), signal.SIGINT) + + # Still raise TimeoutError for immediate handling + raise TimeoutError(f"Operation timed out after {seconds} seconds") + + previous_handler = signal.signal(signal.SIGALRM, alarm_handler) + signal.alarm(seconds) + + try: + yield + finally: + signal.alarm(0) + signal.signal(signal.SIGALRM, previous_handler) + + +def add_dependencies(exp_args_list: list[ExpArgs], task_dependencies: dict[str, list[str]] = None): + """Add dependencies to a list of ExpArgs. + + Args: + exp_args_list: list[ExpArgs] + A list of experiments to run. + task_dependencies: dict + A dictionary mapping task names to a list of task names that they + depend on. If None or empty, no dependencies are added. + + Returns: + list[ExpArgs] + The modified exp_args_list with dependencies added. + """ + + if task_dependencies is None or all([len(dep) == 0 for dep in task_dependencies.values()]): + # nothing to be done + return exp_args_list + + for exp_args in exp_args_list: + exp_args.make_id() # makes sure there is an exp_id + + exp_args_map = {exp_args.env_args.task_name: exp_args for exp_args in exp_args_list} + if len(exp_args_map) != len(exp_args_list): + raise ValueError( + ( + "Task names are not unique in exp_args_map, " + "you can't run multiple seeds with task dependencies." 
+ ) + ) + + for task_name in exp_args_map.keys(): + if task_name not in task_dependencies: + raise ValueError(f"Task {task_name} is missing from task_dependencies") + + # turn dependencies from task names to exp_ids + for task_name, exp_args in exp_args_map.items(): + exp_args.depends_on = tuple( + exp_args_map[dep_name].exp_id for dep_name in task_dependencies[task_name] + ) + + return exp_args_list + + +# Mock implementation of the ExpArgs class with timestamp checks for unit testing +class MockedExpArgs: + def __init__(self, exp_id, depends_on=None): + self.exp_id = exp_id + self.depends_on = depends_on if depends_on else [] + self.start_time = None + self.end_time = None + self.env_args = None + + def run(self): + self.start_time = time() + + # # simulate playright code, (this was causing issues due to python async loop) + # import playwright.sync_api + + # pw = playwright.sync_api.sync_playwright().start() + # pw.selectors.set_test_id_attribute("mytestid") + sleep(3) # Simulate task execution time + self.end_time = time() + return self + + +def make_seeds(n, offset=42): + raise DeprecationWarning("This function will be removed. Comment out this error if needed.") + return [seed + offset for seed in range(n)] + + +def order(exp_args_list: list[ExpArgs]): + raise DeprecationWarning("This function will be removed. Comment out this error if needed.") + """Store the order of the list of experiments to be able to sort them back. + + This is important for progression or ablation studies. + """ + for i, exp_args in enumerate(exp_args_list): + exp_args.order = i + return exp_args_list + + +# This was an old function for filtering some issue with the experiments. def hide_some_exp(base_dir, filter: callable, just_test): """Move all experiments that match the filter to a new name.""" + raise DeprecationWarning("This function will be removed. 
Comment out this error if needed.") exp_list = list(yield_all_exp_results(base_dir, progress_fn=None)) msg = f"Searching {len(exp_list)} experiments to move to _* expriments where `filter(exp_args)` is True." @@ -38,17 +184,3 @@ def hide_some_exp(base_dir, filter: callable, just_test): _move_old_exp(exp.exp_dir) filtered_out.append(exp) return filtered_out - - -def make_seeds(n, offset=42): - return [seed + offset for seed in range(n)] - - -def order(exp_args_list: list[ExpArgs]): - """Store the order of the list of experiments to be able to sort them back. - - This is important for progression or ablation studies. - """ - for i, exp_args in enumerate(exp_args_list): - exp_args.order = i - return exp_args_list diff --git a/src/agentlab/experiments/get_ray_url.py b/src/agentlab/experiments/get_ray_url.py new file mode 100644 index 000000000..b652254cb --- /dev/null +++ b/src/agentlab/experiments/get_ray_url.py @@ -0,0 +1,5 @@ +import ray + +context = ray.init(address="auto", ignore_reinit_error=True) + +print(context) diff --git a/src/agentlab/experiments/graph_execution.py b/src/agentlab/experiments/graph_execution.py deleted file mode 100644 index c12a1048b..000000000 --- a/src/agentlab/experiments/graph_execution.py +++ /dev/null @@ -1,96 +0,0 @@ -from dask import compute, delayed -from browsergym.experiments.loop import ExpArgs -from distributed import LocalCluster, Client - - -def _run(exp_arg: ExpArgs, *dependencies): - return exp_arg.run() - - -def make_dask_client(n_worker): - """Create a Dask client with a LocalCluster backend. - - I struggled to find an appropriate configuration. - I believe it has to do with the interplay of playwright async loop (even if - used in sync mode) and the fact that dask uses asyncio under the hood. - Making sure we use processes and 1 thread per worker seems to work. - - Args: - n_worker: int - Number of workers to create. - - Returns: - A Dask client object. 
- """ - cluster = LocalCluster( - n_workers=n_worker, - processes=True, - threads_per_worker=1, - ) - - return Client(cluster) - - -def execute_task_graph(exp_args_list: list[ExpArgs]): - """Execute a task graph in parallel while respecting dependencies.""" - exp_args_map = {exp_args.exp_id: exp_args for exp_args in exp_args_list} - - tasks = {} - - def get_task(exp_arg: ExpArgs): - if exp_arg.exp_id not in tasks: - dependencies = [get_task(exp_args_map[dep_key]) for dep_key in exp_arg.depends_on] - tasks[exp_arg.exp_id] = delayed(_run)(exp_arg, *dependencies) - return tasks[exp_arg.exp_id] - - for exp_arg in exp_args_list: - get_task(exp_arg) - - task_ids, task_list = zip(*tasks.items()) - results = compute(*task_list) - - return {task_id: result for task_id, result in zip(task_ids, results)} - - -def add_dependencies(exp_args_list: list[ExpArgs], task_dependencies: dict[list] = None): - """Add dependencies to a list of ExpArgs. - - Args: - exp_args_list: list[ExpArgs] - A list of experiments to run. - task_dependencies: dict - A dictionary mapping task names to a list of task names that they - depend on. If None or empty, no dependencies are added. - - Returns: - list[ExpArgs] - The modified exp_args_list with dependencies added. - """ - - if task_dependencies is None or all([len(dep) == 0 for dep in task_dependencies.values()]): - # nothing to be done - return exp_args_list - - exp_args_map = {exp_args.env_args.task_name: exp_args for exp_args in exp_args_list} - if len(exp_args_map) != len(exp_args_list): - raise ValueError( - ( - "Task names are not unique in exp_args_map, " - "you can't run multiple seeds with task dependencies." 
- ) - ) - - for task_name in exp_args_map.keys(): - if task_name not in task_dependencies: - raise ValueError(f"Task {task_name} is missing from task_dependencies") - - # turn dependencies from task names to exp_ids - for task_name, exp_args in exp_args_map.items(): - - exp_args.depends_on = tuple( - exp_args_map[dep_name].exp_id - for dep_name in task_dependencies[task_name] - if dep_name in exp_args_map # ignore dependencies that are not to be run - ) - - return exp_args_list diff --git a/src/agentlab/experiments/graph_execution_dask.py b/src/agentlab/experiments/graph_execution_dask.py new file mode 100644 index 000000000..dc51dd518 --- /dev/null +++ b/src/agentlab/experiments/graph_execution_dask.py @@ -0,0 +1,64 @@ +from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError + +from contextlib import contextmanager +import threading +from dask import compute, delayed +from bgym import ExpArgs +from distributed import LocalCluster, Client +from agentlab.experiments.exp_utils import _episode_timeout + +# from agentlab.experiments.exp_utils import run_exp + + +def run_exp(exp_arg: ExpArgs, *dependencies, avg_step_timeout=60): + """Run exp_args.run() with a timeout and handle dependencies.""" + # dask can't use the timeout_manager define in exp_utils.py + # ValueError: signal only works in main thread of the main interpreter + # most alternative I try doesn't work + episode_timeout = _episode_timeout(exp_arg, avg_step_timeout=avg_step_timeout) + return exp_arg.run() + + +def make_dask_client(n_worker): + """Create a Dask client with a LocalCluster backend. + + I struggled to find an appropriate configuration. + I believe it has to do with the interplay of playwright async loop (even if + used in sync mode) and the fact that dask uses asyncio under the hood. + Making sure we use processes and 1 thread per worker seems to work. + + Args: + n_worker: int + Number of workers to create. + + Returns: + A Dask client object. 
+ """ + cluster = LocalCluster( + n_workers=n_worker, + processes=True, + threads_per_worker=1, + ) + + return Client(cluster) + + +def execute_task_graph(exp_args_list: list[ExpArgs]): + """Execute a task graph in parallel while respecting dependencies.""" + exp_args_map = {exp_args.exp_id: exp_args for exp_args in exp_args_list} + + tasks = {} + + def get_task(exp_arg: ExpArgs): + if exp_arg.exp_id not in tasks: + dependencies = [get_task(exp_args_map[dep_key]) for dep_key in exp_arg.depends_on] + tasks[exp_arg.exp_id] = delayed(run_exp)(exp_arg, *dependencies) + return tasks[exp_arg.exp_id] + + for exp_arg in exp_args_list: + get_task(exp_arg) + + task_ids, task_list = zip(*tasks.items()) + results = compute(*task_list) + + return {task_id: result for task_id, result in zip(task_ids, results)} diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py new file mode 100644 index 000000000..3e01be31e --- /dev/null +++ b/src/agentlab/experiments/graph_execution_ray.py @@ -0,0 +1,39 @@ +# import os + +# # Disable Ray log deduplication +# os.environ["RAY_DEDUP_LOGS"] = "0" + +import ray +import bgym +from agentlab.experiments.exp_utils import run_exp + + +run_exp = ray.remote(run_exp) + + +def execute_task_graph(exp_args_list: list[bgym.ExpArgs], avg_step_timeout=60): + """Execute a task graph in parallel while respecting dependencies using Ray.""" + + exp_args_map = {exp_args.exp_id: exp_args for exp_args in exp_args_list} + tasks = {} + + def get_task(exp_arg: bgym.ExpArgs): + if exp_arg.exp_id not in tasks: + # Get all dependency tasks first + dependency_tasks = [get_task(exp_args_map[dep_key]) for dep_key in exp_arg.depends_on] + + # Create new task that depends on the dependency results + tasks[exp_arg.exp_id] = run_exp.remote( + exp_arg, *dependency_tasks, avg_step_timeout=avg_step_timeout + ) + return tasks[exp_arg.exp_id] + + # Build task graph + for exp_arg in exp_args_list: + get_task(exp_arg) + + # 
Execute all tasks and gather results + task_ids = list(tasks.keys()) + results = ray.get(list(tasks.values())) + + return {task_id: result for task_id, result in zip(task_ids, results)} diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index f151f7ecb..49a778e37 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -2,24 +2,17 @@ from importlib import import_module from pathlib import Path +import bgym from browsergym.experiments.loop import ExpArgs, yield_all_exp_results - - -def import_object(path: str): - module_name, obj_name = split_path(path) - try: - module = import_module(module_name) - obj = getattr(module, obj_name) - except (ImportError, AttributeError) as e: - raise ImportError(f"Error importing {path}: {e}") - return obj +from agentlab.experiments.exp_utils import run_exp def run_experiments( n_jobs, exp_args_list: list[ExpArgs], study_dir, - parallel_backend="joblib", + parallel_backend="ray", + avg_step_timeout=60, ): """Run a list of ExpArgs in parallel. @@ -34,7 +27,10 @@ def run_experiments( exp_dir: Path Directory where the experiments will be saved. parallel_backend: str - Parallel backend to use. Either "joblib", "dask" or "sequential". + Parallel backend to use. Either "joblib", "ray" or "sequential". + The only backend that supports webarena graph dependencies correctly is ray or sequential. + avg_step_timeout: int + Will raise a TimeoutError if the episode is not finished after env_args.max_steps * avg_step_timeout seconds. """ if len(exp_args_list) == 0: @@ -56,27 +52,40 @@ def run_experiments( if parallel_backend == "joblib": from joblib import Parallel, delayed - # split sequential + # split sequential (should be no longer needed with dependencies) sequential_exp_args, exp_args_list = _split_sequential_exp(exp_args_list) logging.info( f"Running {len(sequential_exp_args)} in sequential first. The remaining {len(exp_args_list)} will be run in parallel." 
) for exp_args in sequential_exp_args: - exp_args.run() + run_exp(exp_args, avg_step_timeout=avg_step_timeout) Parallel(n_jobs=n_jobs, prefer="processes")( - delayed(exp_args.run)() for exp_args in exp_args_list + delayed(run_exp)(exp_args, avg_step_timeout=avg_step_timeout) + for exp_args in exp_args_list ) - elif parallel_backend == "dask": - from agentlab.experiments.graph_execution import execute_task_graph, make_dask_client - - with make_dask_client(n_worker=n_jobs): - execute_task_graph(exp_args_list) + # dask will be deprecated, as there was issues. use ray instead + # elif parallel_backend == "dask": + # from agentlab.experiments.graph_execution_dask import ( + # execute_task_graph, + # make_dask_client, + # ) + + # with make_dask_client(n_worker=n_jobs): + # execute_task_graph(exp_args_list) + elif parallel_backend == "ray": + from agentlab.experiments.graph_execution_ray import execute_task_graph, ray + + ray.init(num_cpus=n_jobs) + try: + execute_task_graph(exp_args_list, avg_step_timeout=avg_step_timeout) + finally: + ray.shutdown() elif parallel_backend == "sequential": for exp_args in exp_args_list: - exp_args.run() + run_exp(exp_args, avg_step_timeout=avg_step_timeout) else: raise ValueError(f"Unknown parallel_backend: {parallel_backend}") finally: @@ -88,13 +97,16 @@ def run_experiments( logging.info("Experiment finished.") -def find_incomplete(study_dir: str | Path, relaunch_mode="incomplete_only"): - """Return incomplete experiments +def find_incomplete(study_dir: str | Path, include_errors=True): + """Find all incomplete experiments for relaunching. + + Note: completed experiments are kept but are replaced by dummy exp_args + with nothing to run. This help keeping the dependencies between tasks. Args: study_dir: Path The directory where the experiments are saved. - relaunch_mode: str + include_errors: str Find all incomplete experiments and relaunch them. - "incomplete_only": relaunch only the incomplete experiments. 
- "incomplete_or_error": relaunch incomplete or errors. @@ -105,14 +117,19 @@ def find_incomplete(study_dir: str | Path, relaunch_mode="incomplete_only"): raise ValueError( f"You asked to relaunch an existing experiment but {study_dir} does not exist." ) - exp_args_list = list(_yield_incomplete_experiments(study_dir, relaunch_mode=relaunch_mode)) + exp_result_list = list(yield_all_exp_results(study_dir, progress_fn=None)) + exp_args_list = [_hide_completed(exp_result, include_errors) for exp_result in exp_result_list] # sort according to exp_args.order exp_args_list.sort(key=lambda exp_args: exp_args.order if exp_args.order is not None else 0) - if len(exp_args_list) == 0: + job_count = non_dummy_count(exp_args_list) + + if job_count == 0: logging.info(f"No incomplete experiments found in {study_dir}.") return exp_args_list + else: + logging.info(f"Found {job_count} incomplete experiments in {study_dir}.") message = f"Make sure the processes that were running are all stopped. Otherwise, " f"there will be concurrent writing in the same directories.\n" @@ -122,38 +139,39 @@ def find_incomplete(study_dir: str | Path, relaunch_mode="incomplete_only"): return exp_args_list -def _yield_incomplete_experiments(exp_root, relaunch_mode="incomplete_only"): - """Find all incomplete experiments and relaunch them.""" - # TODO(make relanch_mode a callable, for flexibility) - for exp_result in yield_all_exp_results(exp_root, progress_fn=None): # type: ExpArgs - try: - # TODO implement has_finished instead of dealing with FileNotFoundError - summary_info = exp_result.summary_info +def non_dummy_count(exp_args_list: list[ExpArgs]) -> int: + return sum([not exp_args.is_dummy for exp_args in exp_args_list]) - except FileNotFoundError: - yield exp_result.exp_args - continue - if relaunch_mode == "incomplete_only": - continue +def noop(*args, **kwargs): + pass - err_msg = summary_info.get("err_msg", None) - if err_msg is not None: - if relaunch_mode == "incomplete_or_error": - yield 
exp_result.exp_args - else: - raise ValueError(f"Unknown relaunch_mode: {relaunch_mode}") +def _hide_completed(exp_result: bgym.ExpResult, include_errors: bool = True): + """Hide completed experiments from the list. + This little hack, allows an elegant way to keep the task dependencies for e.g. webarena + while skipping the tasks that are completed when relaunching. + """ -def split_path(path: str): - """Split a path into a module name and an object name.""" - if "/" in path: - path = path.replace("/", ".") - module_name, obj_name = path.rsplit(".", 1) - return module_name, obj_name + hide = False + if exp_result.status == "done": + hide = True + if exp_result.status == "error" and (not include_errors): + hide = True + exp_args = exp_result.exp_args + exp_args.is_dummy = hide # just to keep track + exp_args.status = exp_result.status + if hide: + # make those function do nothing since they are finished. + exp_args.run = noop + exp_args.prepare = noop + return exp_args + + +# TODO remove this function once ray backend is stable def _split_sequential_exp(exp_args_list: list[ExpArgs]) -> tuple[list[ExpArgs], list[ExpArgs]]: """split exp_args that are flagged as sequential from those that are not""" sequential_exp_args = [] @@ -165,3 +183,21 @@ def _split_sequential_exp(exp_args_list: list[ExpArgs]) -> tuple[list[ExpArgs], parallel_exp_args.append(exp_args) return sequential_exp_args, parallel_exp_args + + +def _split_path(path: str): + """Split a path into a module name and an object name.""" + if "/" in path: + path = path.replace("/", ".") + module_name, obj_name = path.rsplit(".", 1) + return module_name, obj_name + + +def import_object(path: str): + module_name, obj_name = _split_path(path) + try: + module = import_module(module_name) + obj = getattr(module, obj_name) + except (ImportError, AttributeError) as e: + raise ImportError(f"Error importing {path}: {e}") + return obj diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 
2a9f08e68..b319eb877 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from datetime import datetime from pathlib import Path +import uuid import bgym from bgym import Benchmark, EnvArgs, ExpArgs @@ -12,10 +13,10 @@ from agentlab.analyze import inspect_results from agentlab.experiments import args from agentlab.experiments import reproducibility_util as repro -from agentlab.experiments.exp_utils import RESULTS_DIR -from agentlab.experiments.launch_exp import find_incomplete, run_experiments +from agentlab.experiments.exp_utils import RESULTS_DIR, add_dependencies +from agentlab.experiments.launch_exp import find_incomplete, run_experiments, non_dummy_count -logger = logging.getLogger("agentlab_" + __name__) +logger = logging.getLogger(__name__) @dataclass @@ -51,11 +52,13 @@ class Study: suffix: str = "" # used for adding a personnal comment to the study name uuid: str = None reproducibility_info: dict = None - logging_level: int = logging.INFO - logging_level_stdout: int = logging.INFO + logging_level: int = logging.DEBUG + logging_level_stdout: int = logging.WARNING + comment: str = None # Extra comments from the authors of this study + ignore_dependencies: bool = False def __post_init__(self): - self.uuid = str(datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) + self.uuid = uuid.uuid4() if isinstance(self.benchmark, str): self.benchmark = bgym.DEFAULT_BENCHMARKS[self.benchmark]() if isinstance(self.dir, str): @@ -68,11 +71,26 @@ def make_exp_args_list(self): self.benchmark, logging_level=self.logging_level, logging_level_stdout=self.logging_level_stdout, + ignore_dependencies=self.ignore_dependencies, ) - def find_incomplete(self, relaunch_mode="incomplete_or_error"): - """Find incomplete or errored experiments in the study directory for relaunching.""" - self.exp_args_list = find_incomplete(self.dir, relaunch_mode=relaunch_mode) + def find_incomplete(self, include_errors=True): + 
"""Find incomplete or errored experiments in the study directory for relaunching. + + Args: + include_errors: bool + If True, include errored experiments in the list. + + Returns: + list[ExpArgs]: The list of all experiments with completed ones replaced by a + dummy exp_args to keep the task dependencies. + """ + self.exp_args_list = find_incomplete(self.dir, include_errors=include_errors) + n_incomplete = non_dummy_count(self.exp_args_list) + n_error = [ + getattr(exp_args, "status", "incomplete") == "error" for exp_args in self.exp_args_list + ].count(True) + return n_incomplete, n_error def load_exp_args_list(self): logger.info(f"Loading experiments from {self.dir}") @@ -91,10 +109,57 @@ def set_reproducibility_info(self, strict_reproducibility=False, comment=None): comment=comment, ) if self.reproducibility_info is not None: - repro.assert_compatible(self.reproducibility_info, info) + repro.assert_compatible( + self.reproducibility_info, info, raise_if_incompatible=strict_reproducibility + ) self.reproducibility_info = info - def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False, comment=None): + def run( + self, + n_jobs=1, + parallel_backend="joblib", + strict_reproducibility=False, + n_relaunch=3, + relaunch_errors=True, + ): + + self.set_reproducibility_info( + strict_reproducibility=strict_reproducibility, comment=self.comment + ) + self.save() + + n_exp = len(self.exp_args_list) + last_error_count = None + + for i in range(n_relaunch): + logger.info(f"Launching study {self.name} - trial {i + 1} / {n_relaunch}") + self._run(n_jobs, parallel_backend, strict_reproducibility) + + suffix = f"trial_{i + 1}_of_{n_relaunch}" + _, summary_df, error_report = self.get_results(suffix=suffix) + logger.info("\n" + str(summary_df)) + + n_incomplete, n_error = self.find_incomplete(include_errors=relaunch_errors) + + if n_error / n_exp > 0.3: + logger.warning(f"More than 30% of the experiments errored. 
Stopping the study.") + return + + if last_error_count is not None and n_error >= last_error_count: + logger.warning( + f"Last trial did not reduce the number of errors. Stopping the study." + ) + return + + if n_incomplete == 0: + logger.info(f"Study {self.name} finished.") + return + + logger.warning( + f"Study {self.name} did not finish after {n_relaunch} trials. There are {n_incomplete} incomplete experiments." + ) + + def _run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False): """Run all experiments in the study in parallel when possible. Args: @@ -115,15 +180,8 @@ def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False, logger.info("Preparing backends...") self.benchmark.prepare_backends() logger.info("Backends ready.") - self.set_reproducibility_info( - strict_reproducibility=strict_reproducibility, comment=comment - ) - self.save() run_experiments(n_jobs, self.exp_args_list, self.dir, parallel_backend=parallel_backend) - report_df = self.get_report(ignore_cache=True) - logger.info(f"Study {self.name} finished.") - logger.info("\n" + str(report_df)) def append_to_journal(self, strict_reproducibility=True): """Append the study to the journal. 
@@ -142,6 +200,19 @@ def append_to_journal(self, strict_reproducibility=True): strict_reproducibility=strict_reproducibility, ) + def get_results(self, suffix="", also_save=True): + result_df = inspect_results.load_result_df(self.dir) + error_report = inspect_results.error_report(result_df, max_stack_trace=3, use_log=True) + summary_df = inspect_results.summarize_study(result_df) + + if also_save: + suffix = f"_{suffix}" if suffix else "" + result_df.to_csv(self.dir / f"result_df{suffix}.csv") + summary_df.to_csv(self.dir / f"summary_df{suffix}.csv") + (self.dir / f"error_report{suffix}.md").write_text(error_report) + + return result_df, summary_df, error_report + @property def name(self): agent_names = [a.agent_name for a in self.agent_args] @@ -199,8 +270,8 @@ def load(dir: Path) -> "Study": return study @staticmethod - def load_most_recent(root_dir: Path = None): - return Study.load(get_most_recent_study(root_dir)) + def load_most_recent(root_dir: Path = None, contains=None) -> "Study": + return Study.load(get_most_recent_study(root_dir, contains=contains)) def get_most_recent_study( @@ -238,28 +309,6 @@ def get_most_recent_study( return most_recent_folder -# def make_relaunch_study(study_dir, relaunch_mode="incomplete_or_error"): -# """Create a study from an existing study directory. - -# It will search for all experiments that needs to be relaunched depending on -# `relaunch_mode`. - -# Args: -# study_dir: Path -# The directory where the experiments are saved. -# relaunch_mode: str -# Find all incomplete experiments and relaunch them. -# - "incomplete_only": relaunch only the incomplete experiments. -# - "incomplete_or_error": relaunch incomplete or errors. 
-# """ -# study = Study(dir=study_dir) -# study.exp_args_list, _ = find_incomplete(study.dir, relaunch_mode=relaunch_mode) -# info = study.load_reproducibility_info() -# study.benchmark_name = info["benchmark"] -# study.agent_names = info["agent_names"] -# return study - - def set_demo_mode(env_args_list: list[EnvArgs]): for env_args in env_args_list: @@ -275,6 +324,7 @@ def _agents_on_benchmark( demo_mode=False, logging_level: int = logging.INFO, logging_level_stdout: int = logging.INFO, + ignore_dependencies=False, ): """Run one or multiple agents on a benchmark. @@ -320,20 +370,29 @@ def _agents_on_benchmark( for i, exp_args in enumerate(exp_args_list): exp_args.order = i - _flag_sequential_exp(exp_args_list, benchmark) + # not required with ray, but keeping around if we would need it for visualwebareana on joblib + # _flag_sequential_exp(exp_args_list, benchmark) + + if not ignore_dependencies: + # populate the depends_on field based on the task dependencies in the benchmark + exp_args_list = add_dependencies(exp_args_list, benchmark.dependency_graph_over_tasks()) + else: + logger.warning( + f"Ignoring dependencies for benchmark {benchmark.name}. This could lead to different results." 
+ ) return exp_args_list -def _flag_sequential_exp(exp_args_list: list[ExpArgs], benchmark: Benchmark): - if benchmark.name.startswith("visualwebarena"): - sequential_subset = benchmark.subset_from_glob("requires_reset", "True") - sequential_subset = set( - [env_args.task_name for env_args in sequential_subset.env_args_list] - ) - for exp_args in exp_args_list: - if exp_args.env_args.task_name in sequential_subset: - exp_args.sequential = True +# def _flag_sequential_exp(exp_args_list: list[ExpArgs], benchmark: Benchmark): +# if benchmark.name.startswith("visualwebarena"): +# sequential_subset = benchmark.subset_from_glob("requires_reset", "True") +# sequential_subset = set( +# [env_args.task_name for env_args in sequential_subset.env_args_list] +# ) +# for exp_args in exp_args_list: +# if exp_args.env_args.task_name in sequential_subset: +# exp_args.sequential = True # def ablation_study(start_agent: AgentArgs, changes, benchmark: str, demo_mode=False): diff --git a/src/agentlab/experiments/view_dep_graph.py b/src/agentlab/experiments/view_dep_graph.py new file mode 100644 index 000000000..0639507bc --- /dev/null +++ b/src/agentlab/experiments/view_dep_graph.py @@ -0,0 +1,322 @@ +import math +import bgym +import matplotlib.pyplot as plt + +import networkx as nx +import numpy as np + + +def clean_dict(dependency_dict: dict[str, list[str]]) -> dict[str, list[str]]: + new_dep = {} + for key, deps in dependency_dict.items(): + new_key = key.split(".")[-1] + + new_dep[new_key] = [dep.split(".")[-1] for dep in deps] + return new_dep + + +def dict_to_networkx(dependency_dict: dict[str, list[str]]) -> nx.DiGraph: + + G = nx.DiGraph() + i = 0 + # Add edges from each node to its dependencies + for node, dependencies in dependency_dict.items(): + i += 1 + if i > 20: + pass + + print(node, dependencies) + # Add edges from the node to each of its dependencies + for dep in dependencies: + G.add_edge(dep, node) + return G + + +def plot_graph(G, ax, title=None, 
node_color="lightblue", node_size=40, font_size=8): + """ + Plot a single graph component on the given matplotlib axis. + + Args: + G: NetworkX graph (should be a single connected component) + ax: Matplotlib axis to plot on + title: Optional title for the subplot + node_color: Color for the nodes + node_size: Size of the nodes + font_size: Size of the node labels + """ + # Use a simple layout for better performance + # pos = nx.spring_layout(G, k=0.1, iterations=100) + + pos = nx.kamada_kawai_layout(G) + + # pos = nx.spectral_layout(G) + + def name_to_size(name): + if "-" in name: + start, end = name.split("-") + + n_nodes = int(end) - int(start) + 1 + else: + n_nodes = 1 + size_factor = node_size / 10 + return n_nodes * size_factor + + # compute size based on name + sizes = [name_to_size(name) for name in G.nodes] + + nx.draw( + G, + pos, + ax=ax, + with_labels=True, + node_color=node_color, + node_size=sizes, + font_size=font_size, + font_weight="normal", + arrows=True, + arrowsize=15, + ) + + if title: + ax.set_title(title) + ax.axis("off") + + +def plot_components_grid( + components, max_cols=4, node_color="lightblue", node_size=2000, font_size=10 +): + """ + Plot components in a grid layout. 
+ + Args: + components: List of NetworkX graphs, one per component + max_cols: Maximum number of columns in the grid + node_color: Color for the nodes + node_size: Size of the nodes + font_size: Size of the node labels + + Returns: + matplotlib figure + """ + n_components = len(components) + + if n_components == 0: + print("No components found") + return None + + # Calculate grid dimensions + ncols = min(n_components, max_cols) + nrows = math.ceil(n_components / ncols) + + # Create figure with a reasonable size per subplot + fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 4 * nrows)) + fig.suptitle("Dependency Graph Components", size=16) + + # Make axes iterable even if there's only one + if n_components == 1: + axes = np.array([[axes]]) + elif nrows == 1: + axes = np.array([axes]) + elif ncols == 1: + axes = axes.reshape(-1, 1) + + # Plot each component + for idx, component in enumerate(components): + i, j = divmod(idx, ncols) + title = f"Component {idx+1} ({component.number_of_nodes()} nodes)" + plot_graph( + component, + axes[i, j], + title, + node_color=node_color, + node_size=node_size, + font_size=font_size, + ) + + # Remove empty subplots + for idx in range(n_components, nrows * ncols): + i, j = divmod(idx, ncols) + axes[i, j].remove() + + plt.tight_layout() + return fig + + +def compress_sequential_chains(dep_dict: dict[str, list[str]]) -> dict[str, list[str]]: + """ + Compress chains of sequential numbers in a dependency dictionary. + Returns a new dictionary with compressed chains using range notation. 
+ + Args: + dep_dict: Dictionary mapping string numbers to list of string number dependencies + + Returns: + Dictionary with compressed chains using range notation + """ + # Convert to integers for easier processing + int_dict = {int(k): [int(x) for x in v] for k, v in dep_dict.items()} + + # Find chains + chains = [] + current_chain = [] + + # Sort nodes for sequential processing + nodes = sorted(int_dict.keys()) + + i = 0 + while i < len(nodes): + node = nodes[i] + + # Start new chain + if not current_chain: + current_chain = [node] + i += 1 + continue + + # Check if this node continues the chain + last_node = current_chain[-1] + + # Conditions for chain continuation: + # 1. Numbers are consecutive + # 2. Current node has exactly one dependency + # 3. That dependency is the previous node in chain + # 4. The previous node has exactly one successor + is_consecutive = node == last_node + 1 + has_single_dep = len(int_dict[node]) == 1 + deps_on_last = has_single_dep and int_dict[node][0] == last_node + last_has_single_successor = sum(1 for k, v in int_dict.items() if last_node in v) == 1 + + if is_consecutive and deps_on_last and last_has_single_successor: + current_chain.append(node) + else: + if len(current_chain) > 1: + chains.append(current_chain) + current_chain = [node] + + i += 1 + + # Add last chain if it exists + if len(current_chain) > 1: + chains.append(current_chain) + + # Create compressed dictionary + compressed_dict = {} + processed_nodes = set() + + # Add compressed chains + for chain in chains: + chain_name = f"{chain[0]}-{chain[-1]}" + # Find dependencies of first node in chain + deps = int_dict[chain[0]] + compressed_dict[chain_name] = [str(d) for d in deps] + processed_nodes.update(chain) + + # Add remaining non-chain nodes + for node in nodes: + if node not in processed_nodes: + compressed_dict[str(node)] = [str(d) for d in int_dict[node]] + + # Update dependencies to use compressed names + for k in compressed_dict: + deps = compressed_dict[k] + 
new_deps = [] + for dep in deps: + dep_int = int(dep) + # Find if this dependency is part of a chain + chain_found = False + for chain in chains: + if dep_int in chain: + new_deps.append(f"{chain[0]}-{chain[-1]}") + chain_found = True + break + if not chain_found: + new_deps.append(dep) + compressed_dict[k] = new_deps + + return compressed_dict + + +def compress_chains(G): + """ + Compress chains in a directed graph by merging nodes that have single parent and single child. + + Args: + G: NetworkX directed graph + + Returns: + NetworkX directed graph with compressed chains + """ + G_compressed = G.copy() + processed_nodes = set() + + while True: + # Find nodes with exactly one parent and one child + nodes_to_compress = [] + for node in list( + G_compressed.nodes() + ): # Create a list to avoid modification during iteration + if node in processed_nodes: + continue + + predecessors = list(G_compressed.predecessors(node)) + successors = list(G_compressed.successors(node)) + + if len(predecessors) == 1 and len(successors) == 1: + pred = predecessors[0] + succ = successors[0] + + # Skip if any node in the chain is already processed + if pred in processed_nodes or succ in processed_nodes: + continue + + # Only compress if middle node has single parent/child + pred_preds = list(G_compressed.predecessors(pred)) + succ_succs = list(G_compressed.successors(succ)) + + if len(pred_preds) <= 1 and len(succ_succs) <= 1: + nodes_to_compress.append((pred, node, succ)) + processed_nodes.update([pred, node, succ]) + + if not nodes_to_compress: + break + + # Process each chain + for pred, mid, succ in nodes_to_compress: + if not all(G_compressed.has_node(n) for n in [pred, mid, succ]): + continue + + # Create new merged node name + new_node = ",".join(str(n) for n in [pred, mid, succ]) + + # Add the new node + G_compressed.add_node(new_node) + + # Add edges from all predecessors of first node + for p in list(G_compressed.predecessors(pred)): + G_compressed.add_edge(p, new_node) + + # 
Add edges to all successors of last node + for s in list(G_compressed.successors(succ)): + G_compressed.add_edge(new_node, s) + + # Remove the old nodes + G_compressed.remove_nodes_from([pred, mid, succ]) + + return G_compressed + + +# benchmark = bgym.DEFAULT_BENCHMARKS["webarena"]() +benchmark = bgym.DEFAULT_BENCHMARKS["visualwebarena"]() + +dep_graph = benchmark.dependency_graph_over_tasks() +dep_graph = clean_dict(dep_graph) + +dep_graph = compress_sequential_chains(dep_graph) +graph = dict_to_networkx(dep_graph) + +# graph = compress_chains(graph) + +components = nx.weakly_connected_components(graph) +components = [graph.subgraph(component).copy() for component in components] +plot_components_grid(components) +plt.show() diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py index fa0acf23e..084c854bd 100644 --- a/src/agentlab/llm/chat_api.py +++ b/src/agentlab/llm/chat_api.py @@ -31,7 +31,14 @@ def make_assistant_message(content: str) -> dict: class CheatMiniWoBLLM(AbstractChatModel): """For unit-testing purposes only. 
It only work with miniwob.click-test task.""" + def __init__(self, wait_time=0) -> None: + self.wait_time = wait_time + def __call__(self, messages) -> str: + if self.wait_time > 0: + print(f"Waiting for {self.wait_time} seconds") + time.sleep(self.wait_time) + if isinstance(messages, Discussion): prompt = messages.to_string() else: @@ -58,9 +65,10 @@ class CheatMiniWoBLLMArgs: max_total_tokens = 10240 max_input_tokens = 8000 max_new_tokens = 128 + wait_time: int = 0 def make_model(self): - return CheatMiniWoBLLM() + return CheatMiniWoBLLM(self.wait_time) def prepare_server(self): pass diff --git a/tests/experiments/test_dask.py b/tests/experiments/test_dask.py new file mode 100644 index 000000000..39822634f --- /dev/null +++ b/tests/experiments/test_dask.py @@ -0,0 +1,41 @@ +from agentlab.experiments.graph_execution_dask import execute_task_graph, make_dask_client +from agentlab.experiments.exp_utils import MockedExpArgs + +TASK_TIME = 3 + + +def test_execute_task_graph(): + # Define a list of ExpArgs with dependencies + exp_args_list = [ + MockedExpArgs(exp_id="task1", depends_on=[]), + MockedExpArgs(exp_id="task2", depends_on=["task1"]), + MockedExpArgs(exp_id="task3", depends_on=["task1"]), + MockedExpArgs(exp_id="task4", depends_on=["task2", "task3"]), + ] + + with make_dask_client(n_worker=5): + results = execute_task_graph(exp_args_list) + + exp_args_list = [results[task_id] for task_id in ["task1", "task2", "task3", "task4"]] + + # Verify that all tasks were executed in the proper order + assert exp_args_list[0].start_time < exp_args_list[1].start_time + assert exp_args_list[0].start_time < exp_args_list[2].start_time + assert exp_args_list[1].end_time < exp_args_list[3].start_time + assert exp_args_list[2].end_time < exp_args_list[3].start_time + + # # Verify that parallel tasks (task2 and task3) started within a short time of each other + # parallel_start_diff = abs(exp_args_list[1].start_time - exp_args_list[2].start_time) + # 
print(f"parallel_start_diff: {parallel_start_diff}") + # assert parallel_start_diff < 1.5 # Allow for a small delay + + # Ensure that the entire task graph took the expected amount of time + total_time = exp_args_list[-1].end_time - exp_args_list[0].start_time + assert ( + total_time >= TASK_TIME * 3 + ) # Since the critical path involves at least 1.5 seconds of work + + +if __name__ == "__main__": + test_execute_task_graph() + # test_add_dependencies() diff --git a/tests/experiments/test_launch_exp.py b/tests/experiments/test_launch_exp.py index f37869fd6..91a7c2dba 100644 --- a/tests/experiments/test_launch_exp.py +++ b/tests/experiments/test_launch_exp.py @@ -7,34 +7,41 @@ from agentlab.agents.generic_agent.agent_configs import FLAGS_GPT_3_5, AGENT_4o_MINI from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs from agentlab.analyze import inspect_results -from agentlab.experiments.launch_exp import find_incomplete, run_experiments +from agentlab.experiments.launch_exp import find_incomplete, run_experiments, non_dummy_count from agentlab.experiments.study import Study from agentlab.llm.chat_api import CheatMiniWoBLLMArgs def test_relaunch_study(): study_dir = Path(__file__).parent.parent / "data" / "test_study" - exp_args_list = find_incomplete(study_dir, relaunch_mode="incomplete_only") + exp_args_list = find_incomplete(study_dir, include_errors=False) - assert len(exp_args_list) == 1 + assert non_dummy_count(exp_args_list) == 1 assert exp_args_list[0].env_args.task_name == "miniwob.ascending-numbers" - exp_args_list = find_incomplete(study_dir, relaunch_mode="incomplete_or_error") + exp_args_list = find_incomplete(study_dir, include_errors=True) - assert len(exp_args_list) == 2 + assert non_dummy_count(exp_args_list) == 2 -@pytest.mark.repeat(3) # there was stochastic bug caused by asyncio loop not started -def test_launch_system(backend="dask"): +def _test_launch_system(backend="ray", cause_timeout=False): + + if cause_timeout: + wait_time = 
10 + avg_step_timeout = 0.5 + else: + wait_time = 0 + avg_step_timeout = 10 + exp_args_list = [] for seed in range(3): exp_args_list.append( ExpArgs( agent_args=GenericAgentArgs( - chat_model_args=CheatMiniWoBLLMArgs(), + chat_model_args=CheatMiniWoBLLMArgs(wait_time=wait_time), flags=FLAGS_GPT_3_5, ), - env_args=EnvArgs(task_name="miniwob.click-test", task_seed=seed), + env_args=EnvArgs(task_name="miniwob.click-test", task_seed=seed, max_steps=5), ) ) @@ -42,7 +49,11 @@ def test_launch_system(backend="dask"): study_dir = Path(tmp_dir) / "generic_agent_test" run_experiments( - n_jobs=2, exp_args_list=exp_args_list, study_dir=study_dir, parallel_backend=backend + n_jobs=2, + exp_args_list=exp_args_list, + study_dir=study_dir, + parallel_backend=backend, + avg_step_timeout=avg_step_timeout, ) results_df = inspect_results.load_result_df(study_dir, progress_fn=None) @@ -51,22 +62,37 @@ def test_launch_system(backend="dask"): for _, row in results_df.iterrows(): if row.stack_trace is not None: print(row.stack_trace) - assert row.err_msg is None - assert row.cum_reward == 1.0 + if cause_timeout: + assert row.err_msg is not None + assert "Timeout" in row.err_msg + assert row.cum_reward == 0 + else: + assert row.err_msg is None + assert row.cum_reward == 1.0 study_summary = inspect_results.summarize_study(results_df) assert len(study_summary) == 1 assert study_summary.std_err.iloc[0] == 0 assert study_summary.n_completed.iloc[0] == "3/3" - assert study_summary.avg_reward.iloc[0] == 1.0 + + if not cause_timeout: + assert study_summary.avg_reward.iloc[0] == 1.0 def test_launch_system_joblib(): - test_launch_system(backend="joblib") + _test_launch_system(backend="joblib") def test_launch_system_sequntial(): - test_launch_system(backend="sequential") + _test_launch_system(backend="sequential") + + +def test_launch_system_ray(): + _test_launch_system(backend="ray") + + +def _test_timeout_ray(): + _test_launch_system(backend="ray", cause_timeout=True) @pytest.mark.pricy @@ -94,7 
+120,7 @@ def test_4o_mini_on_miniwob_tiny_test(): if __name__ == "__main__": - test_relaunch_study() + _test_timeout_ray() # test_4o_mini_on_miniwob_tiny_test() - # test_launch_system() + # test_launch_system_ray() # test_launch_system_sequntial() diff --git a/tests/experiments/test_graph_execution.py b/tests/experiments/test_ray.py similarity index 63% rename from tests/experiments/test_graph_execution.py rename to tests/experiments/test_ray.py index 9235358db..9af5959a1 100644 --- a/tests/experiments/test_graph_execution.py +++ b/tests/experiments/test_ray.py @@ -1,36 +1,12 @@ +import bgym import pytest -from agentlab.experiments.graph_execution import ( - execute_task_graph, - add_dependencies, - make_dask_client, -) -from time import time, sleep -from browsergym.experiments.loop import ExpArgs, EnvArgs +import ray +from agentlab.experiments.graph_execution_ray import execute_task_graph +from agentlab.experiments.exp_utils import MockedExpArgs, add_dependencies TASK_TIME = 3 -# Mock implementation of the ExpArgs class with timestamp checks -class MockedExpArgs: - def __init__(self, exp_id, depends_on=None): - self.exp_id = exp_id - self.depends_on = depends_on if depends_on else [] - self.start_time = None - self.end_time = None - - def run(self): - self.start_time = time() - - # # simulate playright code, (this was causing issues due to python async loop) - # import playwright.sync_api - - # pw = playwright.sync_api.sync_playwright().start() - # pw.selectors.set_test_id_attribute("mytestid") - sleep(TASK_TIME) # Simulate task execution time - self.end_time = time() - return self - - def test_execute_task_graph(): # Define a list of ExpArgs with dependencies exp_args_list = [ @@ -40,8 +16,9 @@ def test_execute_task_graph(): MockedExpArgs(exp_id="task4", depends_on=["task2", "task3"]), ] - with make_dask_client(n_worker=5): - results = execute_task_graph(exp_args_list) + ray.init(num_cpus=4) + results = execute_task_graph(exp_args_list) + ray.shutdown() 
exp_args_list = [results[task_id] for task_id in ["task1", "task2", "task3", "task4"]] @@ -52,8 +29,9 @@ def test_execute_task_graph(): assert exp_args_list[2].end_time < exp_args_list[3].start_time # Verify that parallel tasks (task2 and task3) started within a short time of each other - # parallel_start_diff = abs(exp_args_list[1].start_time - exp_args_list[2].start_time) - # assert parallel_start_diff < 1.5 # Allow for a small delay + parallel_start_diff = abs(exp_args_list[1].start_time - exp_args_list[2].start_time) + print(f"parallel_start_diff: {parallel_start_diff}") + assert parallel_start_diff < 1.5 # Allow for a small delay # Ensure that the entire task graph took the expected amount of time total_time = exp_args_list[-1].end_time - exp_args_list[0].start_time @@ -66,7 +44,9 @@ def test_add_dependencies(): # Prepare a simple list of ExpArgs def make_exp_args(task_name, exp_id): - return ExpArgs(agent_args=None, env_args=EnvArgs(task_name=task_name), exp_id=exp_id) + return bgym.ExpArgs( + agent_args=None, env_args=bgym.EnvArgs(task_name=task_name), exp_id=exp_id + ) exp_args_list = [ make_exp_args("task1", "1"), From 6684e3dac9a1330775a51190e06e591abe69d1b2 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Wed, 6 Nov 2024 11:30:09 -0500 Subject: [PATCH 28/47] switching to 2 for loops in _agents_on_benchmark (#107) --- src/agentlab/experiments/study.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index b319eb877..cf53db3a2 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -358,14 +358,26 @@ def _agents_on_benchmark( if demo_mode: set_demo_mode(env_args_list) - exp_args_list = args.expand_cross_product( - ExpArgs( - agent_args=args.CrossProd(agents), - env_args=args.CrossProd(env_args_list), - logging_level=logging_level, - 
logging_level_stdout=logging_level_stdout, - ) - ) # type: list[ExpArgs] + # exp_args_list = args.expand_cross_product( + # ExpArgs( + # agent_args=args.CrossProd(agents), + # env_args=args.CrossProd(env_args_list), + # logging_level=logging_level, + # logging_level_stdout=logging_level_stdout, + # ) + # ) # type: list[ExpArgs] + + exp_args_list = [] + + for agent in agents: + for env_args in env_args_list: + exp_args = ExpArgs( + agent_args=agent, + env_args=env_args, + logging_level=logging_level, + logging_level_stdout=logging_level_stdout, + ) + exp_args_list.append(exp_args) for i, exp_args in enumerate(exp_args_list): exp_args.order = i From dab1a486bf5c11041d2345546e274544c9f613c9 Mon Sep 17 00:00:00 2001 From: Alexandre Lacoste Date: Wed, 6 Nov 2024 16:13:37 -0500 Subject: [PATCH 29/47] yet another way to kill timedout jobs (#108) --- src/agentlab/experiments/exp_utils.py | 15 +++-- .../experiments/graph_execution_ray.py | 66 ++++++++++++++++--- tests/experiments/test_launch_exp.py | 12 ++-- 3 files changed, 71 insertions(+), 22 deletions(-) diff --git a/src/agentlab/experiments/exp_utils.py b/src/agentlab/experiments/exp_utils.py index c2864ce00..97ce527d9 100644 --- a/src/agentlab/experiments/exp_utils.py +++ b/src/agentlab/experiments/exp_utils.py @@ -27,9 +27,11 @@ def run_exp(exp_arg: ExpArgs, *dependencies, avg_step_timeout=60): """Run exp_args.run() with a timeout and handle dependencies.""" - episode_timeout = _episode_timeout(exp_arg, avg_step_timeout=avg_step_timeout) - with timeout_manager(seconds=episode_timeout): - return exp_arg.run() + # episode_timeout = _episode_timeout(exp_arg, avg_step_timeout=avg_step_timeout) + # logger.warning(f"Running {exp_arg.exp_id} with timeout of {episode_timeout} seconds.") + # with timeout_manager(seconds=episode_timeout): + # this timeout method is not robust enough. 
using ray.cancel instead + return exp_arg.run() def _episode_timeout(exp_arg: ExpArgs, avg_step_timeout=60): @@ -62,13 +64,12 @@ def timeout_manager(seconds: int = None): def alarm_handler(signum, frame): - logger.warning( - f"Operation timed out after {seconds}s, sending SIGINT and raising TimeoutError." - ) + logger.warning(f"Operation timed out after {seconds}s, raising TimeoutError.") # send sigint - os.kill(os.getpid(), signal.SIGINT) + # os.kill(os.getpid(), signal.SIGINT) # this doesn't seem to do much I don't know why # Still raise TimeoutError for immediate handling + # This works, but it doesn't seem enough to kill the job raise TimeoutError(f"Operation timed out after {seconds} seconds") previous_handler = signal.signal(signal.SIGALRM, alarm_handler) diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py index 3e01be31e..703f59e6c 100644 --- a/src/agentlab/experiments/graph_execution_ray.py +++ b/src/agentlab/experiments/graph_execution_ray.py @@ -2,11 +2,14 @@ # # Disable Ray log deduplication # os.environ["RAY_DEDUP_LOGS"] = "0" - +import time import ray import bgym -from agentlab.experiments.exp_utils import run_exp +from agentlab.experiments.exp_utils import run_exp, _episode_timeout +from ray.util import state +import logging +logger = logging.getLogger(__name__) run_exp = ray.remote(run_exp) @@ -15,25 +18,70 @@ def execute_task_graph(exp_args_list: list[bgym.ExpArgs], avg_step_timeout=60): """Execute a task graph in parallel while respecting dependencies using Ray.""" exp_args_map = {exp_args.exp_id: exp_args for exp_args in exp_args_list} - tasks = {} + task_map = {} def get_task(exp_arg: bgym.ExpArgs): - if exp_arg.exp_id not in tasks: + if exp_arg.exp_id not in task_map: # Get all dependency tasks first dependency_tasks = [get_task(exp_args_map[dep_key]) for dep_key in exp_arg.depends_on] # Create new task that depends on the dependency results - tasks[exp_arg.exp_id] = run_exp.remote( + 
task_map[exp_arg.exp_id] = run_exp.remote( exp_arg, *dependency_tasks, avg_step_timeout=avg_step_timeout ) - return tasks[exp_arg.exp_id] + return task_map[exp_arg.exp_id] # Build task graph for exp_arg in exp_args_list: get_task(exp_arg) - # Execute all tasks and gather results + max_timeout = max([_episode_timeout(exp_args, avg_step_timeout) for exp_args in exp_args_list]) + return poll_for_timeout(task_map, max_timeout, poll_interval=max_timeout * 0.1) + + +def poll_for_timeout(tasks: dict[str, ray.ObjectRef], timeout: float, poll_interval: float = 1.0): + """Cancel tasks that exceed the timeout + + I tried various different methods for killing a job that hangs. so far it's + the only one that seems to work reliably (hopefully) + """ + task_list = list(tasks.values()) task_ids = list(tasks.keys()) + logger.warning(f"Any task exceeding {timeout} seconds will be cancelled.") + + while True: + ready, not_ready = ray.wait(task_list, num_returns=len(task_list), timeout=poll_interval) + for task in not_ready: + elapsed_time = get_elapsed_time(task) + # print(f"Task {task.task_id().hex()} elapsed time: {elapsed_time}") + if elapsed_time is not None and elapsed_time > timeout: + msg = f"Task {task.task_id().hex()} has been running for {elapsed_time}s, more than the timeout: {timeout}s." 
+ if elapsed_time < timeout + 60: + logger.warning(msg + " Cancelling task.") + ray.cancel(task, force=False, recursive=False) + else: + logger.warning(msg + " Force killing.") + ray.cancel(task, force=True, recursive=False) + if len(ready) == len(task_list): + results = [] + for task in ready: + try: + result = ray.get(task) + except Exception as e: + result = e + results.append(result) + + return {task_id: result for task_id, result in zip(task_ids, results)} + + +def get_elapsed_time(task_ref: ray.ObjectRef): + task_id = task_ref.task_id().hex() + task_info = state.get_task(task_id, address="auto") + if task_info and task_info.start_time_ms is not None: + start_time_s = task_info.start_time_ms / 1000.0 # Convert ms to s + current_time_s = time.time() + elapsed_time = current_time_s - start_time_s + return elapsed_time + else: + return None # Task has not started yet diff --git a/tests/experiments/test_launch_exp.py b/tests/experiments/test_launch_exp.py index 91a7c2dba..782a9edca 100644 --- a/tests/experiments/test_launch_exp.py +++ b/tests/experiments/test_launch_exp.py @@ -1,3 +1,4 @@ +import math import tempfile from pathlib import Path @@ -63,9 +64,8 @@ def _test_launch_system(backend="ray", cause_timeout=False): if row.stack_trace is not None: print(row.stack_trace) if cause_timeout: - assert row.err_msg is not None - assert "Timeout" in row.err_msg - assert row.cum_reward == 0 + # assert row.err_msg is not None + assert math.isnan(row.cum_reward) or row.cum_reward == 0 else: assert row.err_msg is None assert row.cum_reward == 1.0 @@ -73,9 +73,9 @@ def _test_launch_system(backend="ray", cause_timeout=False): study_summary = inspect_results.summarize_study(results_df) assert len(study_summary) == 1 assert study_summary.std_err.iloc[0] == 0 - assert study_summary.n_completed.iloc[0] == "3/3" if not cause_timeout: + assert study_summary.n_completed.iloc[0] == "3/3" assert study_summary.avg_reward.iloc[0] == 1.0 @@ -91,7 +91,7 @@ def test_launch_system_ray(): 
_test_launch_system(backend="ray") -def _test_timeout_ray(): +def test_timeout_ray(): _test_launch_system(backend="ray", cause_timeout=True) @@ -120,7 +120,7 @@ def test_4o_mini_on_miniwob_tiny_test(): if __name__ == "__main__": - _test_timeout_ray() + test_timeout_ray() # test_4o_mini_on_miniwob_tiny_test() # test_launch_system_ray() # test_launch_system_sequntial() From aa59a4aba96055c34af40fa93ac9e0a9c7bd5e2d Mon Sep 17 00:00:00 2001 From: Alexandre Lacoste Date: Thu, 7 Nov 2024 11:53:57 -0500 Subject: [PATCH 30/47] Fix prompt formatting in Observation and add static method to Study class (#110) --- src/agentlab/agents/dynamic_prompting.py | 2 +- src/agentlab/experiments/study.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index ac49089c9..1ddbf3be7 100644 --- a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -443,7 +443,7 @@ def shrink(self): def _prompt(self) -> str: return f""" # Observation of current step: -{self.tabs}{self.html.prompt}{self.ax_tree.prompt}{self.focused_element.prompt}{self.error.prompt} +{self.tabs.prompt}{self.html.prompt}{self.ax_tree.prompt}{self.focused_element.prompt}{self.error.prompt} """ diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index cf53db3a2..4e3bcceac 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -246,6 +246,7 @@ def get_report(self, ignore_cache=False, ignore_stale=False): self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale ) + @staticmethod def load(dir: Path) -> "Study": dir = Path(dir) study_path = dir / "study.pkl.gz" From feda734560bce00ccbc9f094950b13d59fcfccb7 Mon Sep 17 00:00:00 2001 From: Alexandre Lacoste Date: Thu, 7 Nov 2024 15:06:35 -0500 Subject: [PATCH 31/47] Bug fix (#111) * Fix prompt formatting in Observation and add static method to Study class * Update gradio 
version to 5.5 to fix DataFrame scrolling issue --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 56ef81216..ded115e6e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,7 +16,7 @@ contexttimer ipython pyyaml>=6 pandas -gradio>=5 +gradio>=5.5 # issue with DataFrame scrolling before 5.5 gitpython # for the reproducibility script requests matplotlib From 7a5b91e62056fa8fb26efdd2f64f5b25a92b817c Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Thu, 7 Nov 2024 16:31:37 -0500 Subject: [PATCH 32/47] Fixing openrouter pricing rate limit (#112) * Update unit_tests.yml (#101) * request is done once and then reused * Patching minor stuff (#69) * fixing sample_std for single experience * making gradio shared server non default * missing requirement for xray * Improve agent xray app (#70) * 0.2.2 Release (#67) * downgrading ubuntu version for github tests (#62) * Llm api update (#59) * getting rid of .invoke() * adding an AbstractChatModel * changing chat_api structure * Reproducibility again (#61) * core functions * switch to dask * removing joblib dependency and adding dask * fixing imports * handles multiple backends * ensure asyncio loop creation * more tests * setting dashboard address to None * minor * Finally found a way to make it work * initial reproducibility files * Seems to be superflus * adding a reproducibility journal * minor update * more robust * adding reproducibility tools * fix white listing * minor * minor * minor * minor * minor fix * more tests * more results yay * disabling this test * update * update * black * maybe fixing github workflow ? 
* make get_git_username great again * trigger change * new browsergym * GPT-4o result (and new comment column) * Seems like there was a change to 4o flags, trying these * minor comment * better xray * minor fix * addming a comment field * new agent * another test with GPT-4o * adding llama3 from openrouter * fix naming * unused import * new summary tools and remove "_args" from columns in results * add Llama * initial code for reproducibility agent * adjust inspect results * infer from benchmark * fix reproducibility agent * prevent the repro_dir to be an index variable * updating repro agent stats * Reproducibility agent * instructions to setup workarena * fixing tests * handles better a few edge cases * default progress function to None * minor formatting * minor * initial commit * refactoring with Study class * refactor to adapt for study class * minor * fix pricy test * fixing tests * tmp * print report * minor fix * refine little details about reproducibility * minor * no need for set_temp anymore * sanity check before running main * minor update * minor * new results with 4o on workarena.l1 * sharing is caring * add llama to main.py * new hournal entry * lamma 3 70B * minor * typo * black fix (wasn't configured) --------- Co-authored-by: Thibault Le Sellier de Chezelles * version bump --------- Co-authored-by: Alexandre Lacoste * Make share=TRue into a environment variable, disabled by default for security * fix floating point issue with std_reward in agent xray * Update src/agentlab/analyze/inspect_results.py * Update src/agentlab/analyze/agent_xray.py --------- Co-authored-by: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Co-authored-by: Alexandre Lacoste * added tmlr definitive config (#71) * downgrading gradio version (#77) * Study refactor (#73) * adapting to new Benchmark class * fixing tests * fix tests * typo * not ready for gradio 5 * study id and a few fixes * fixing pricy tests --------- Co-authored-by: ThibaultLSDC * adding 
message class and updating generic agent accordingly (#68) * adding message class and updating generic agent accordingly * updating tests * Reproducibility test before message class * Adding inspect_result.ipynb to reprod white list * Reproducibility test after message class * L1 before message class * L1 after message class * added append as method to the Discussion class, to make it totally similar to a list * changed to_markdown behavior * updated most_basic_agent * updated ReproAgent * Update src/agentlab/analyze/agent_xray.py * format * new journal entry * immutable as default kwarg * removing __add__ and __radd__ * added deprecation warning * updating tests * version bump * Updating generic_agent to fit use BGym's goal_object (#83) * updating generic agent to goal_object * fixing image markdown display * updating tests * fixing intruction BaseMessage * added merge text in discussion * added merge to discussion class * added tests * Minor revert (#86) * minor revert * revert tests too * Add tabs (#84) * add tabs * make sure it's not computed if not visible * Fix reproduce study (#87) * add tabs * this workaround is worst * bug fix * fix reproduce study * make sure it's not computed if not visible * upgrading gradio dependency (#88) * bgym update (#90) * Workarena TMLR experiments (#89) * new entry * adding llm configs * new journal entries * handling sequntial in VWA (#91) * handling sequntial in VWA * enable comments * format --------- Co-authored-by: ThibaultLSDC * Tmlr workarena (#92) * adding llm configs * new L1 entries * tmp * reformat * adding assistantbench to reproducibility_util.py * gitignore (#97) * Vision fix (#105) * changing content name * Update src/agentlab/llm/llm_utils.py --------- Co-authored-by: Maxime Gasse * L2 tmlr (#93) * adding llm configs * L2 entries * claude L3 * claude vision support * miniwob results * 405b L1 entry * Replacing Dask with Ray (#100) * dask-dependencies * minor * replace with ray * adjust tests and move a few 
things * markdown report * automatic relaunch * add dependencies * reformat * fix unit-test * catch timeout * fixing bugs and making things work * adress comments and black format * new dependencies viewer * Update benchmark to use visualwebarena instead of webarena * Fix import and uncomment code in get_ray_url.py * Add ignore_dependencies option to Study and _agents_on_benchmark functions * Update load_most_recent method to include contains parameter * Update load_most_recent method to accept contains parameter and add warning for ignored dependencies in _agents_on_benchmark * Refactor backend preparation in Study class and improve logging for ignored dependencies * finallly some results with claude on webarena * Add warnings for Windows timeouts and clarify parallel backend options; update get_results method to conditionally save outputs * black * ensure timeout is int (For the 3rd time?) * Refactor timeout handling in context manager; update test to reduce avg_step_timeout and rename test function * black * Change parallel backend from "joblib" to "ray" in run_experiments function * Update src/agentlab/experiments/study.py Co-authored-by: Maxime Gasse * Update src/agentlab/analyze/inspect_results.py Co-authored-by: Maxime Gasse * Refactor logging initialization and update layout configurations in dependency graph plotting; adjust node size and font size for better visualization --------- Co-authored-by: Maxime Gasse * switching to 2 for loops in _agents_on_benchmark (#107) * yet another way to kill timedout jobs (#108) * request is done once and then reused * switched to caching original function bc it doesnt break to tests * added a catch for some openrouter under-the-hood error --------- Co-authored-by: Maxime Gasse Co-authored-by: Xing Han Lu <21180505+xhluca@users.noreply.github.com> Co-authored-by: Alexandre Lacoste --- .github/workflows/unit_tests.yml | 5 ++++- src/agentlab/llm/chat_api.py | 10 ++++++++++ src/agentlab/llm/tracking.py | 2 ++ 3 files 
changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index a6b44f873..7a0e312a6 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -38,6 +38,9 @@ jobs: - name: Install Playwright run: playwright install chromium --with-deps + - name: Download WebArena / VisualWebArena ressource files + run: python -c 'import nltk; nltk.download("punkt_tab")' + - name: Fetch MiniWob uses: actions/checkout@v4 with: @@ -58,4 +61,4 @@ jobs: - name: Run AgentLab Unit Tests env: MINIWOB_URL: "http://localhost:8080/miniwob/" - run: pytest -n 5 --durations=10 -m 'not pricy' -v tests/ \ No newline at end of file + run: pytest -n 5 --durations=10 -m 'not pricy' -v tests/ diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py index 084c854bd..2ed8f0d6e 100644 --- a/src/agentlab/llm/chat_api.py +++ b/src/agentlab/llm/chat_api.py @@ -208,6 +208,10 @@ def handle_error(error, itr, min_retry_wait_time, max_retry): return error_type +class OpenRouterError(openai.OpenAIError): + pass + + class ChatModel(AbstractChatModel): def __init__( self, @@ -274,6 +278,12 @@ def __call__(self, messages: list[dict]) -> dict: temperature=self.temperature, max_tokens=self.max_tokens, ) + + if completion.usage is None: + raise OpenRouterError( + "The completion object does not contain usage information. This is likely a bug in the OpenRouter API." 
+ ) + self.success = True break except openai.OpenAIError as e: diff --git a/src/agentlab/llm/tracking.py b/src/agentlab/llm/tracking.py index 7e2761ac6..8e3d812a0 100644 --- a/src/agentlab/llm/tracking.py +++ b/src/agentlab/llm/tracking.py @@ -1,3 +1,4 @@ +from functools import cache import os import threading from contextlib import contextmanager @@ -61,6 +62,7 @@ def wrapper(self, obs): return wrapper +@cache def get_pricing_openrouter(): api_key = os.getenv("OPENROUTER_API_KEY") assert api_key, "OpenRouter API key is required" From 3e94570f7a6f8a796f28728fc21c60d3cfb43dc1 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:00:03 -0500 Subject: [PATCH 33/47] updating max prompt configs, vision support (#109) --- .../agents/generic_agent/agent_configs.py | 13 ++-- .../agents/generic_agent/tmlr_config.py | 2 +- src/agentlab/llm/llm_configs.py | 59 ++++++++++--------- 3 files changed, 39 insertions(+), 35 deletions(-) diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py index 2d4f9bc39..a5db8c906 100644 --- a/src/agentlab/agents/generic_agent/agent_configs.py +++ b/src/agentlab/agents/generic_agent/agent_configs.py @@ -1,4 +1,5 @@ import bgym + from agentlab.agents import dynamic_prompting as dp from agentlab.experiments import args from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT @@ -41,7 +42,7 @@ use_abstract_example=True, use_hints=True, enable_chat=False, - max_prompt_tokens=None, + max_prompt_tokens=40_000, be_cautious=True, extra_instructions=None, ) @@ -89,7 +90,7 @@ use_abstract_example=True, # useful use_hints=True, # useful enable_chat=False, - max_prompt_tokens=None, + max_prompt_tokens=40_000, be_cautious=True, extra_instructions=None, ) @@ -136,7 +137,7 @@ use_abstract_example=True, use_hints=True, enable_chat=False, - max_prompt_tokens=None, + max_prompt_tokens=40_000, be_cautious=True, extra_instructions=None, 
add_missparsed_messages=True, @@ -186,7 +187,7 @@ use_abstract_example=True, use_hints=True, enable_chat=False, - max_prompt_tokens=None, + max_prompt_tokens=40_000, be_cautious=True, extra_instructions=None, add_missparsed_messages=True, @@ -234,7 +235,7 @@ use_abstract_example=True, use_hints=True, enable_chat=False, - max_prompt_tokens=None, + max_prompt_tokens=40_000, be_cautious=True, extra_instructions=None, ) @@ -298,7 +299,7 @@ use_hints=args.Choice([True, False], p=[0.7, 0.3]), be_cautious=args.Choice([True, False]), enable_chat=False, - max_prompt_tokens=None, + max_prompt_tokens=40_000, extra_instructions=None, ) diff --git a/src/agentlab/agents/generic_agent/tmlr_config.py b/src/agentlab/agents/generic_agent/tmlr_config.py index 11860e691..48a28c682 100644 --- a/src/agentlab/agents/generic_agent/tmlr_config.py +++ b/src/agentlab/agents/generic_agent/tmlr_config.py @@ -40,7 +40,7 @@ use_abstract_example=True, use_hints=True, enable_chat=False, - max_prompt_tokens=None, + max_prompt_tokens=40_000, be_cautious=True, extra_instructions=None, ) diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index 8376b5c25..feb4d1c84 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -20,28 +20,28 @@ "openai/gpt-4o-mini-2024-07-18": OpenAIModelArgs( model_name="gpt-4o-mini-2024-07-18", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, vision_support=True, ), "openai/gpt-4-1106-preview": OpenAIModelArgs( model_name="gpt-4-1106-preview", max_total_tokens=128_000, - max_input_tokens=40_000, # make sure we don't bust budget - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, ), "openai/gpt-4-vision-preview": OpenAIModelArgs( model_name="gpt-4-vision-preview", max_total_tokens=128_000, - max_input_tokens=40_000, # make sure we don't bust budget - max_new_tokens=4000, # I think this model has very small default 
value if we don't set max_new_tokens + max_input_tokens=100_000, + max_new_tokens=28_000, # I think this model has very small default value if we don't set max_new_tokens vision_support=True, ), "openai/gpt-4o-2024-05-13": OpenAIModelArgs( model_name="gpt-4o-2024-05-13", max_total_tokens=128_000, - max_input_tokens=40_000, # make sure we don't bust budget - max_new_tokens=4000, # I think this model has very small default value if we don't set max_new_tokens + max_input_tokens=100_000, + max_new_tokens=28_000, # I think this model has very small default value if we don't set max_new_tokens vision_support=True, ), "openai/gpt-3.5-turbo-0125": OpenAIModelArgs( @@ -67,22 +67,25 @@ model_name="gpt-4o", deployment_name="gpt-4o-2024-05-13", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4_000, + max_input_tokens=100_000, + max_new_tokens=28_000, + vision_support=True, ), "azure/gpt-4o-2024-08-06": AzureModelArgs( model_name="gpt-4o", deployment_name="gpt-4o-2024-08-06", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4_000, + max_input_tokens=100_000, + max_new_tokens=28_000, + vision_support=True, ), "azure/gpt-4o-mini-2024-07-18": AzureModelArgs( model_name="gpt-4o-mini", deployment_name="gpt-4o-mini-2024-07-18", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4_000, + max_input_tokens=100_000, + max_new_tokens=28_000, + vision_support=True, ), # ---------------- OSS LLMs ----------------# "meta-llama/Meta-Llama-3-70B-Instruct": SelfHostedModelArgs( @@ -113,43 +116,43 @@ "openrouter/meta-llama/llama-3.1-405b-instruct": OpenRouterModelArgs( model_name="meta-llama/llama-3.1-405b-instruct", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/meta-llama/llama-3.1-70b-instruct": OpenRouterModelArgs( model_name="meta-llama/llama-3.1-70b-instruct", max_total_tokens=128_000, - max_input_tokens=40_000, - 
max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/meta-llama/llama-3-70b-instruct": OpenRouterModelArgs( model_name="meta-llama/llama-3-70b-instruct", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/meta-llama/llama-3.1-8b-instruct:free": OpenRouterModelArgs( model_name="meta-llama/llama-3.1-8b-instruct:free", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/meta-llama/llama-3.1-8b-instruct": OpenRouterModelArgs( model_name="meta-llama/llama-3.1-8b-instruct", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), "openrouter/anthropic/claude-3.5-sonnet:beta": OpenRouterModelArgs( model_name="anthropic/claude-3.5-sonnet:beta", max_total_tokens=200_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=160_000, + max_new_tokens=40_000, temperature=1e-1, vision_support=True, ), @@ -163,8 +166,8 @@ "openrouter/openai/o1-mini-2024-09-12": OpenRouterModelArgs( model_name="openai/o1-mini-2024-09-12", max_total_tokens=128_000, - max_input_tokens=40_000, - max_new_tokens=4000, + max_input_tokens=100_000, + max_new_tokens=28_000, temperature=1e-1, ), } From 1ebb896cd527e7000b87346af6a71987424f6de4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9o=20Boisvert?= Date: Fri, 8 Nov 2024 16:01:11 -0500 Subject: [PATCH 34/47] Cross-product deepcopy fix (#106) Co-authored-by: Maxime Gasse --- .github/workflows/unit_tests.yml | 3 +++ src/agentlab/experiments/args.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 7a0e312a6..3342ba540 100644 --- a/.github/workflows/unit_tests.yml +++ 
b/.github/workflows/unit_tests.yml @@ -58,6 +58,9 @@ jobs: - name: Check MiniWob availability run: curl -I "http://localhost:8080/miniwob/" || echo "MiniWob not reachable" + - name: Pre-download nltk ressources + run: python -c "import nltk; nltk.download('punkt_tab')" + - name: Run AgentLab Unit Tests env: MINIWOB_URL: "http://localhost:8080/miniwob/" diff --git a/src/agentlab/experiments/args.py b/src/agentlab/experiments/args.py index bbbb3b7b4..6a4fa804e 100644 --- a/src/agentlab/experiments/args.py +++ b/src/agentlab/experiments/args.py @@ -105,13 +105,19 @@ def expand_cross_product(obj: Any | list[Any]): for obj in obj_list: cprod_paths = _find_cprod_with_paths(obj) if not cprod_paths: - return [copy.deepcopy(obj)] + result.append(copy.deepcopy(obj)) + continue paths, cprod_objects = zip(*cprod_paths) combinations = product(*[cprod_obj.elements for cprod_obj in cprod_objects]) + # create a base object with empty fields to make fast deep copies from + base_obj = copy.deepcopy(obj) + for path in paths: + _set_value(base_obj, path, None) + for combo in combinations: - new_obj = copy.deepcopy(obj) + new_obj = copy.deepcopy(base_obj) for path, value in zip(paths, combo): _set_value(new_obj, path, value) result.append(new_obj) From f35dea0000f6dd2ac9112f4d64ed86e990f4ade0 Mon Sep 17 00:00:00 2001 From: Maxime Gasse Date: Mon, 11 Nov 2024 08:49:01 -0500 Subject: [PATCH 35/47] slugify study_name (#114) --- requirements.txt | 1 + src/agentlab/experiments/study.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index ded115e6e..453f312d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,3 +21,4 @@ gitpython # for the reproducibility script requests matplotlib ray[default] +python-slugify diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 4e3bcceac..81baecf30 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -1,20 
+1,26 @@ import gzip import logging import pickle +import re +import uuid from dataclasses import dataclass from datetime import datetime from pathlib import Path -import uuid import bgym from bgym import Benchmark, EnvArgs, ExpArgs +from slugify import slugify from agentlab.agents.agent_args import AgentArgs from agentlab.analyze import inspect_results from agentlab.experiments import args from agentlab.experiments import reproducibility_util as repro from agentlab.experiments.exp_utils import RESULTS_DIR, add_dependencies -from agentlab.experiments.launch_exp import find_incomplete, run_experiments, non_dummy_count +from agentlab.experiments.launch_exp import ( + find_incomplete, + non_dummy_count, + run_experiments, +) logger = logging.getLogger(__name__) @@ -220,6 +226,9 @@ def name(self): study_name = f"{agent_names[0]}_on_{self.benchmark.name}" else: study_name = f"{len(agent_names)}_agents_on_{self.benchmark.name}" + + study_name = slugify(study_name, max_length=100, allow_unicode=True) + if self.suffix: study_name += f"_{self.suffix}" return study_name From c5dfb17a319437e8d43652b030513b0be38d26cf Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 6 Nov 2024 21:29:28 +0000 Subject: [PATCH 36/47] Improve timeout handling in task polling logic --- src/agentlab/experiments/graph_execution_ray.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py index 703f59e6c..46b96bd88 100644 --- a/src/agentlab/experiments/graph_execution_ray.py +++ b/src/agentlab/experiments/graph_execution_ray.py @@ -36,6 +36,7 @@ def get_task(exp_arg: bgym.ExpArgs): get_task(exp_arg) max_timeout = max([_episode_timeout(exp_args, avg_step_timeout) for exp_args in exp_args_list]) + return poll_for_timeout(task_map, max_timeout, poll_interval=max_timeout * 0.1) @@ -57,7 +58,7 @@ def poll_for_timeout(tasks: dict[str, ray.ObjectRef], timeout: float, poll_inter # print(f"Task 
{task.task_id().hex()} elapsed time: {elapsed_time}") if elapsed_time is not None and elapsed_time > timeout: msg = f"Task {task.task_id().hex()} hase been running for {elapsed_time}s, more than the timeout: {timeout}s." - if elapsed_time < timeout + 60: + if elapsed_time < timeout + 60 + poll_interval: logger.warning(msg + " Cancelling task.") ray.cancel(task, force=False, recursive=False) else: From cf05bc69ad6756da05fed613a3c8f33e1bebe3f3 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 7 Nov 2024 22:05:40 +0000 Subject: [PATCH 37/47] Add method to override max_steps in Study class --- src/agentlab/experiments/study.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 81baecf30..f301541a1 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -254,6 +254,10 @@ def get_report(self, ignore_cache=False, ignore_stale=False): return inspect_results.get_study_summary( self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale ) + + def override_max_steps(self, max_steps): + for exp_args in self.exp_args_list: + exp_args.env_args.max_steps = max_steps @staticmethod def load(dir: Path) -> "Study": From 5297157602dedde17614d1531e80eab9a6795425 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 18:32:15 +0000 Subject: [PATCH 38/47] add support for tab visibility in observation flags and update related components --- src/agentlab/agents/dynamic_prompting.py | 13 ++--- .../agents/generic_agent/generic_agent.py | 3 +- .../generic_agent/generic_agent_prompt.py | 1 - tests/agents/test_generic_prompt.py | 47 +++++++++---------- 4 files changed, 26 insertions(+), 38 deletions(-) diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index 1ddbf3be7..3b4fcd73b 100644 --- a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -10,8 +10,6 @@ import bgym from 
browsergym.core.action.base import AbstractActionSet -from browsergym.core.action.highlevel import HighLevelActionSet -from browsergym.core.action.python import PythonActionSet from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, overlay_som, prune_html from agentlab.llm.llm_utils import ( @@ -71,6 +69,7 @@ class ObsFlags(Flags): use_html: bool = True use_ax_tree: bool = False + use_tabs: bool = False use_focused_element: bool = False use_error_logs: bool = False use_history: bool = False @@ -386,11 +385,7 @@ def _prompt(self) -> str: URL: {page_url} """ prompt_pieces.append(prompt_piece) - self._prompt = "\n".join(prompt_pieces) - - -def has_tab_action(action_set: bgym.HighLevelActionSetArgs): - return "tab" in action_set.subsets + return "\n".join(prompt_pieces) class Observation(Shrinkable): @@ -399,14 +394,14 @@ class Observation(Shrinkable): Contains the html, the accessibility tree and the error logs. """ - def __init__(self, obs, flags: ObsFlags, use_tabs=False) -> None: + def __init__(self, obs, flags: ObsFlags) -> None: super().__init__() self.flags = flags self.obs = obs self.tabs = Tabs( obs, - visible=use_tabs, + visible=lambda: flags.use_tabs, prefix="## ", ) diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index 5ef8a4cc2..98026dc1f 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -32,6 +32,7 @@ def set_benchmark(self, benchmark: bgym.Benchmark, demo_mode): if benchmark.name.startswith("miniwob"): self.flags.obs.use_html = True + self.flags.obs.use_tabs = benchmark.is_multi_tab self.flags.action.action_set = deepcopy(benchmark.high_level_action_set_args) # for backward compatibility with old traces @@ -268,5 +269,3 @@ def get_action_post_hoc(agent: GenericAgent, obs: dict, ans_dict: dict): output += f"\n\n{action}\n" return system_prompt, instruction_prompt, output - return system_prompt, 
instruction_prompt, output - return system_prompt, instruction_prompt, output diff --git a/src/agentlab/agents/generic_agent/generic_agent_prompt.py b/src/agentlab/agents/generic_agent/generic_agent_prompt.py index eb45ba59f..67899f182 100644 --- a/src/agentlab/agents/generic_agent/generic_agent_prompt.py +++ b/src/agentlab/agents/generic_agent/generic_agent_prompt.py @@ -77,7 +77,6 @@ def __init__( self.obs = dp.Observation( obs_history[-1], self.flags.obs, - use_tabs=dp.has_tab_action(self.flags.action.action_set), ) self.action_prompt = dp.ActionPrompt(action_set, action_flags=flags.action) diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index a579c2611..66b173a2a 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -20,33 +20,28 @@ """ +base_obs = { + "goal": "do this and that", + "goal_object": [{"type": "text", "text": "do this and that"}], + "chat_messages": [{"role": "user", "message": "do this and that"}], + "axtree_txt": "[1] Click me", + "focused_element_bid": "45-256", + "open_pages_urls": ["https://example.com"], + "open_pages_titles": ["Example"], + "active_page_index": 0, +} OBS_HISTORY = [ - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": [{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(1), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "", }, - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": [{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(2), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "Hey, this is an error in the past", }, - { - "goal": "do this and that", - "goal_object": [{"type": "text", "text": "do this and that"}], - "chat_messages": 
[{"role": "user", "message": "do this and that"}], + base_obs | { "pruned_html": html_template.format(3), - "axtree_txt": "[1] Click me", - "focused_element_bid": "45-256", "last_action_error": "Hey, there is an error now", }, ] @@ -58,6 +53,7 @@ obs=dp.ObsFlags( use_html=True, use_ax_tree=True, + use_tabs=True, use_focused_element=True, use_error_logs=True, use_history=True, @@ -104,6 +100,10 @@ "obs.use_ax_tree", ("AXTree:", "Click me"), ), + ( + "obs.use_tabs", + ("Currently open tabs:","(active tab)"), + ), ( "obs.use_focused_element", ("Focused element:", "bid='45-256'"), @@ -251,11 +251,6 @@ def test_main_prompt_elements_present(): # for debugging test_shrinking_observation() test_main_prompt_elements_present() - for flag, expected_prompts in FLAG_EXPECTED_PROMPT: - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + # for flag, expected_prompts in FLAG_EXPECTED_PROMPT: + # test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) + \ No newline at end of file From 4279d5ca4e943adc0395b33b7132d4369ab1412b Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 19:59:45 +0000 Subject: [PATCH 39/47] fix tests --- src/agentlab/agents/dynamic_prompting.py | 9 ++++++-- .../experiments/graph_execution_ray.py | 10 +++++---- src/agentlab/experiments/study.py | 2 +- tests/agents/test_generic_prompt.py | 21 ++++++++++++------- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index 3b4fcd73b..07dcc58d9 100644 --- 
a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -10,7 +10,12 @@ import bgym from browsergym.core.action.base import AbstractActionSet -from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, overlay_som, prune_html +from browsergym.utils.obs import ( + flatten_axtree_to_str, + flatten_dom_to_str, + overlay_som, + prune_html, +) from agentlab.llm.llm_utils import ( BaseMessage, @@ -385,7 +390,7 @@ def _prompt(self) -> str: URL: {page_url} """ prompt_pieces.append(prompt_piece) - return "\n".join(prompt_pieces) + return "\n".join(prompt_pieces) class Observation(Shrinkable): diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py index 46b96bd88..5dd18d4ae 100644 --- a/src/agentlab/experiments/graph_execution_ray.py +++ b/src/agentlab/experiments/graph_execution_ray.py @@ -2,12 +2,14 @@ # # Disable Ray log deduplication # os.environ["RAY_DEDUP_LOGS"] = "0" +import logging import time -import ray + import bgym -from agentlab.experiments.exp_utils import run_exp, _episode_timeout +import ray from ray.util import state -import logging + +from agentlab.experiments.exp_utils import _episode_timeout, run_exp logger = logging.getLogger(__name__) @@ -36,7 +38,7 @@ def get_task(exp_arg: bgym.ExpArgs): get_task(exp_arg) max_timeout = max([_episode_timeout(exp_args, avg_step_timeout) for exp_args in exp_args_list]) - + return poll_for_timeout(task_map, max_timeout, poll_interval=max_timeout * 0.1) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index f301541a1..2139ce7b8 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -254,7 +254,7 @@ def get_report(self, ignore_cache=False, ignore_stale=False): return inspect_results.get_study_summary( self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale ) - + def override_max_steps(self, max_steps): for exp_args in self.exp_args_list: 
exp_args.env_args.max_steps = max_steps diff --git a/tests/agents/test_generic_prompt.py b/tests/agents/test_generic_prompt.py index 66b173a2a..cc1f9036d 100644 --- a/tests/agents/test_generic_prompt.py +++ b/tests/agents/test_generic_prompt.py @@ -5,7 +5,10 @@ from agentlab.agents import dynamic_prompting as dp from agentlab.agents.generic_agent.agent_configs import FLAGS_GPT_3_5 -from agentlab.agents.generic_agent.generic_agent_prompt import GenericPromptFlags, MainPrompt +from agentlab.agents.generic_agent.generic_agent_prompt import ( + GenericPromptFlags, + MainPrompt, +) from agentlab.llm.llm_utils import count_tokens html_template = """ @@ -32,15 +35,18 @@ } OBS_HISTORY = [ - base_obs | { + base_obs + | { "pruned_html": html_template.format(1), "last_action_error": "", }, - base_obs | { + base_obs + | { "pruned_html": html_template.format(2), "last_action_error": "Hey, this is an error in the past", }, - base_obs | { + base_obs + | { "pruned_html": html_template.format(3), "last_action_error": "Hey, there is an error now", }, @@ -102,7 +108,7 @@ ), ( "obs.use_tabs", - ("Currently open tabs:","(active tab)"), + ("Currently open tabs:", "(active tab)"), ), ( "obs.use_focused_element", @@ -165,7 +171,7 @@ def test_shrinking_observation(): flags.obs.use_html = True prompt_maker = MainPrompt( - action_set=dp.HighLevelActionSet(), + action_set=bgym.HighLevelActionSet(), obs_history=OBS_HISTORY, actions=ACTIONS, memories=MEMORIES, @@ -231,7 +237,7 @@ def test_main_prompt_elements_present(): # Initialize MainPrompt prompt = str( MainPrompt( - action_set=dp.HighLevelActionSet(), + action_set=bgym.HighLevelActionSet(), obs_history=OBS_HISTORY, actions=ACTIONS, memories=MEMORIES, @@ -253,4 +259,3 @@ def test_main_prompt_elements_present(): test_main_prompt_elements_present() # for flag, expected_prompts in FLAG_EXPECTED_PROMPT: # test_main_prompt_elements_gone_one_at_a_time(flag, expected_prompts) - \ No newline at end of file From 
ad374fc54e7a7b1d9cb0a79fe0da49251d37bdcd Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 22:23:46 +0000 Subject: [PATCH 40/47] Fix sorting bug. improve directory content retrieval with summary statistics --- src/agentlab/analyze/agent_xray.py | 67 ++++++++++++++++--------- src/agentlab/analyze/inspect_results.py | 1 + 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 38968fd6c..310ebd227 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -184,8 +184,6 @@ def run_gradio(results_dir: Path): 2. **Select Task**: Select the task you want to analyze, this will trigger an update of the available seeds. - **IMPORTANT NOTE**: Due to a gradio bug, if you sort the columns of the table, the task - selection will not correspond to the right one. 3. **Select the Seed**: You might have multiple repetition for a given task, you will be able to select the seed you want to analyze. @@ -216,10 +214,9 @@ def run_gradio(results_dir: Path): """\ Click on a row to select an agent. It will trigger the update of other fields. - - **GRADIO BUG**: If you sort the columns the click will not match the - content. You have to sort back with the Idx column to align the click with - the order.""" + + The update mechanism is somewhat flacky, please help figure out why (or is it just gradio?). + """ ) agent_table = gr.DataFrame(max_height=500, show_label=False, interactive=False) with gr.Tab("Select Task and Seed", id="Select Task"): @@ -231,9 +228,8 @@ def run_gradio(results_dir: Path): """\ Click on a row to select a task. It will trigger the update of other fields. - **GRADIO BUG**: If you sort the columns the click will not match the - content. You have to sort back with the Idx column to align the click with - the order.""" + The update mechanism is somewhat flacky, please help figure out why (or is it just gradio?). 
+ """ ) refresh_results_button = gr.Button("↺", scale=0, size="sm") @@ -250,9 +246,8 @@ def run_gradio(results_dir: Path): """\ Click on a row to select a seed. It will trigger the update of other fields. - **GRADIO BUG**: If you sort the columns the click will not match the - content. You have to sort back with the Idx column to align the click with - the order.""" + The update mechanism is somewhat flacky, please help figure out why (or is it just gradio?). + """ ) seed_table = gr.DataFrame( @@ -824,22 +819,22 @@ def extract_columns(row: pd.Series): ) seed_df = result_df.apply(extract_columns, axis=1) - seed_df["Idx"] = seed_df.index return seed_df def on_select_agent(evt: gr.SelectData, df: pd.DataFrame): - global info + # TODO try to find a clever way to solve the sort bug here return info.get_agent_id(df.iloc[evt.index[0]]) def on_select_task(evt: gr.SelectData, df: pd.DataFrame, agent_id: list[tuple]): - return (agent_id, df.iloc[evt.index[0]][TASK_NAME_KEY]) + # get col index + col_idx = df.columns.get_loc(TASK_NAME_KEY) + return (agent_id, evt.row_value[col_idx]) def update_seeds(agent_task_id: tuple): agent_id, task_name = agent_task_id - global info seed_df = get_seeds_df(info.agent_df, task_name) first_seed = seed_df.iloc[0]["seed"] return seed_df, EpisodeId(agent_id=agent_id, task_name=task_name, seed=first_seed) @@ -847,7 +842,8 @@ def update_seeds(agent_task_id: tuple): def on_select_seed(evt: gr.SelectData, df: pd.DataFrame, agent_task_id: tuple): agent_id, task_name = agent_task_id - seed = df.iloc[evt.index[0]]["seed"] + col_idx = df.columns.get_loc("seed") + seed = evt.row_value[col_idx] # seed should be the first column return EpisodeId(agent_id=agent_id, task_name=task_name, seed=seed) @@ -933,6 +929,7 @@ def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False): if exp_dir == select_dir_instructions: return None, None + exp_dir = exp_dir.split(" - ")[0] global info if len(exp_dir) == 0: @@ -943,10 +940,13 @@ def new_exp_dir(exp_dir, 
progress=gr.Progress(), just_refresh=False): info.result_df = inspect_results.load_result_df(info.exp_list_dir, progress_fn=progress.tqdm) info.result_df = remove_args_from_col(info.result_df) - agent_report = display_table(get_agent_report(info.result_df)) + study_summary = inspect_results.summarize_study(info.result_df) + # save study_summary + study_summary.to_csv(info.exp_list_dir / "summary_df.csv", index=False) + agent_report = display_table(study_summary) + info.agent_id_keys = agent_report.index.names agent_report.reset_index(inplace=True) - agent_report["Idx"] = agent_report.index agent_id = info.get_agent_id(agent_report.iloc[0]) @@ -960,7 +960,6 @@ def new_agent_id(agent_id: list[tuple]): info.tasks_df = inspect_results.reduce_episodes(info.agent_df).reset_index() info.tasks_df = info.tasks_df.drop(columns=["std_err"]) - info.tasks_df["Idx"] = info.tasks_df.index # task name of first element task_name = info.tasks_df.iloc[0][TASK_NAME_KEY] @@ -968,10 +967,30 @@ def new_agent_id(agent_id: list[tuple]): def get_directory_contents(results_dir: Path): - directories = sorted( - [str(file.name) for file in results_dir.iterdir() if file.is_dir()], reverse=True - ) - return [select_dir_instructions] + directories + exp_descriptions = [] + for dir in results_dir.iterdir(): + if not dir.is_dir(): + continue + + exp_description = dir.name + # get summary*.csv files and find the most recent + summary_files = list(dir.glob("summary*.csv")) + if len(summary_files) != 0: + most_recent_summary = max(summary_files, key=os.path.getctime) + summary_df = pd.read_csv(most_recent_summary) + + # get row with max avg_reward + max_reward_row = summary_df.loc[summary_df["avg_reward"].idxmax()] + reward = max_reward_row["avg_reward"] * 100 + completed = max_reward_row["n_completed"] + n_err = max_reward_row["n_err"] + exp_description += ( + f" - avg-reward: {reward:.1f}% - completed: {completed} - errors: {n_err}" + ) + + exp_descriptions.append(exp_description) + + return 
[select_dir_instructions] + sorted(exp_descriptions, reverse=True) def most_recent_folder(results_dir: Path): diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 9ea2aac3e..cf0387734 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -295,6 +295,7 @@ def summarize(sub_df, use_bootstrap=False): avg_steps=sub_df["n_steps"].mean(skipna=True).round(3), n_completed=f"{n_completed}/{len(sub_df)}", n_err=err.sum(skipna=True), + cum_cost=sub_df["stats.cum_cost"].sum(skipna=True).round(4), ) return pd.Series(record) From ef33f1f76cafb9fb4712da9fb2d41449030c249a Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 22:27:38 +0000 Subject: [PATCH 41/47] fix test --- src/agentlab/analyze/inspect_results.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index cf0387734..4cb4ccf7a 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -295,8 +295,9 @@ def summarize(sub_df, use_bootstrap=False): avg_steps=sub_df["n_steps"].mean(skipna=True).round(3), n_completed=f"{n_completed}/{len(sub_df)}", n_err=err.sum(skipna=True), - cum_cost=sub_df["stats.cum_cost"].sum(skipna=True).round(4), ) + if "stats.cum_cost" in sub_df: + record["cum_cost"]=sub_df["stats.cum_cost"].sum(skipna=True).round(4), return pd.Series(record) From f86b5057febcbe3bbec51a23b16603ace7ab5049 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 8 Nov 2024 22:27:50 +0000 Subject: [PATCH 42/47] black --- src/agentlab/analyze/inspect_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 4cb4ccf7a..8df2c1ef6 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -297,7 +297,7 @@ def summarize(sub_df, 
use_bootstrap=False): n_err=err.sum(skipna=True), ) if "stats.cum_cost" in sub_df: - record["cum_cost"]=sub_df["stats.cum_cost"].sum(skipna=True).round(4), + record["cum_cost"] = (sub_df["stats.cum_cost"].sum(skipna=True).round(4),) return pd.Series(record) From 11964553b0942c9a10ac2e8dd2492c57932eb576 Mon Sep 17 00:00:00 2001 From: Maxime Gasse Date: Tue, 12 Nov 2024 11:14:39 -0500 Subject: [PATCH 43/47] Weblinx results (#104) * adding weblinx results * adding old weblinx results --------- Co-authored-by: ThibaultLSDC --- reproducibility_journal.csv | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index ebeeacb51..ad2bfaa81 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -39,3 +39,10 @@ ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,miniwob,0.10.2,2024-1 ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob,0.10.2,2024-10-25_17-16-23,2024-10-25_06-08-16,0.576,0.02,0,625/625,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,f12887f776525bcad6a0c42cb49651ff4f65af43,,0.10.2,a9e44a88139798543ba53fc8c45d44997665ccca, ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,miniwob,0.10.2,2024-10-25_17-16-23,2024-10-25_06-08-16,0.678,0.019,0,625/625,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,f12887f776525bcad6a0c42cb49651ff4f65af43,,0.10.2,a9e44a88139798543ba53fc8c45d44997665ccca, ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,workarena_l1,0.4.1,2024-10-25_20-32-26,2024-10-25_17-34-45,0.433,0.027,1,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,177ba72a7469e5610e6b615adf1bdcde58cb0298,,0.10.2,a9e44a88139798543ba53fc8c45d44997665ccca, +Maxime Gasse,GenericAgent-gpt-4o-2024-05-13,weblinx_test,0.0.1.dev13,2024-11-04_16-01-14,2024-11-04_15-59-12,0.123,0.006,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 
2024),3.12.0,1.39.0,0.2.3,6e18fb818a64ec1e3f379c1a6480411d2fd0628b,,0.11.3,3ab1843edb14bfce7d39485f0106d0dc0c2d7486, +ThibaultLSDC,GenericAgent-gpt-4o-mini,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.116,0.006,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a, +ThibaultLSDC,GenericAgent-gpt-4o,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.125,0.006,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a, +ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.137,0.006,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a, +ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.089,0.005,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a, +ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.125,0.006,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a, +ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.079,0.005,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 
2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a, From 5c8d6278a161d7617564511f737ccbfd315576bc Mon Sep 17 00:00:00 2001 From: Maxime Gasse Date: Tue, 12 Nov 2024 11:33:40 -0500 Subject: [PATCH 44/47] Max new tokens fix (#118) * Lower max_new_tokens for OpenAI models * updating configs --------- Co-authored-by: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Co-authored-by: ThibaultLSDC --- src/agentlab/llm/llm_configs.py | 42 ++++++++++++++++----------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index feb4d1c84..ec6086868 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -20,41 +20,41 @@ "openai/gpt-4o-mini-2024-07-18": OpenAIModelArgs( model_name="gpt-4o-mini-2024-07-18", max_total_tokens=128_000, - max_input_tokens=100_000, - max_new_tokens=28_000, + max_input_tokens=128_000, + max_new_tokens=16_384, vision_support=True, ), "openai/gpt-4-1106-preview": OpenAIModelArgs( model_name="gpt-4-1106-preview", max_total_tokens=128_000, - max_input_tokens=100_000, - max_new_tokens=28_000, + max_input_tokens=128_000, + max_new_tokens=4_096, ), "openai/gpt-4-vision-preview": OpenAIModelArgs( model_name="gpt-4-vision-preview", max_total_tokens=128_000, - max_input_tokens=100_000, - max_new_tokens=28_000, # I think this model has very small default value if we don't set max_new_tokens + max_input_tokens=128_000, + max_new_tokens=16_384, # I think this model has very small default value if we don't set max_new_tokens vision_support=True, ), "openai/gpt-4o-2024-05-13": OpenAIModelArgs( model_name="gpt-4o-2024-05-13", max_total_tokens=128_000, - max_input_tokens=100_000, - max_new_tokens=28_000, # I think this model has very small default value if we don't set max_new_tokens + max_input_tokens=128_000, + max_new_tokens=4_096, # I think this model has very small default 
value if we don't set max_new_tokens vision_support=True, ), "openai/gpt-3.5-turbo-0125": OpenAIModelArgs( model_name="gpt-3.5-turbo-0125", max_total_tokens=16_384, - max_input_tokens=15_000, - max_new_tokens=1_000, + max_input_tokens=16_384, + max_new_tokens=4096, ), "openai/gpt-3.5-turbo-1106": OpenAIModelArgs( model_name="gpt-3.5-turbo-1106", max_total_tokens=16_384, - max_input_tokens=15_000, - max_new_tokens=1_000, + max_input_tokens=16_384, + max_new_tokens=4096, ), "azure/gpt-35-turbo/gpt-35-turbo": AzureModelArgs( model_name="gpt-35-turbo", @@ -68,23 +68,23 @@ deployment_name="gpt-4o-2024-05-13", max_total_tokens=128_000, max_input_tokens=100_000, - max_new_tokens=28_000, + max_new_tokens=16_384, vision_support=True, ), "azure/gpt-4o-2024-08-06": AzureModelArgs( model_name="gpt-4o", deployment_name="gpt-4o-2024-08-06", max_total_tokens=128_000, - max_input_tokens=100_000, - max_new_tokens=28_000, + max_input_tokens=128_000, + max_new_tokens=16_384, vision_support=True, ), "azure/gpt-4o-mini-2024-07-18": AzureModelArgs( model_name="gpt-4o-mini", deployment_name="gpt-4o-mini-2024-07-18", max_total_tokens=128_000, - max_input_tokens=100_000, - max_new_tokens=28_000, + max_input_tokens=128_000, + max_new_tokens=16_384, vision_support=True, ), # ---------------- OSS LLMs ----------------# @@ -151,8 +151,8 @@ "openrouter/anthropic/claude-3.5-sonnet:beta": OpenRouterModelArgs( model_name="anthropic/claude-3.5-sonnet:beta", max_total_tokens=200_000, - max_input_tokens=160_000, - max_new_tokens=40_000, + max_input_tokens=200_000, + max_new_tokens=8_192, temperature=1e-1, vision_support=True, ), @@ -166,8 +166,8 @@ "openrouter/openai/o1-mini-2024-09-12": OpenRouterModelArgs( model_name="openai/o1-mini-2024-09-12", max_total_tokens=128_000, - max_input_tokens=100_000, - max_new_tokens=28_000, + max_input_tokens=128_000, + max_new_tokens=64_000, temperature=1e-1, ), } From b644474c4998e94845ab11278bcdc8f7e1198d1a Mon Sep 17 00:00:00 2001 From: Thibault LSDC 
<78021491+ThibaultLSDC@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:24:46 -0500 Subject: [PATCH 45/47] version bump (#119) --- src/agentlab/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/agentlab/__init__.py b/src/agentlab/__init__.py index d31c31eae..ecad3607d 100644 --- a/src/agentlab/__init__.py +++ b/src/agentlab/__init__.py @@ -1 +1,2 @@ -__version__ = "0.2.3" +__version__ = "0.3.0" + From e695e11c67b414d7c5417077c2d3a9278953a982 Mon Sep 17 00:00:00 2001 From: Thibault LSDC <78021491+ThibaultLSDC@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:32:31 -0500 Subject: [PATCH 46/47] fix format (#120) --- src/agentlab/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/agentlab/__init__.py b/src/agentlab/__init__.py index ecad3607d..493f7415d 100644 --- a/src/agentlab/__init__.py +++ b/src/agentlab/__init__.py @@ -1,2 +1 @@ __version__ = "0.3.0" - From 16e752629700c112fe76beb26243d93b0d934d48 Mon Sep 17 00:00:00 2001 From: Alexandre Lacoste Date: Wed, 13 Nov 2024 11:59:34 -0500 Subject: [PATCH 47/47] Clean pipeline (#117) * yet another way to kill timedout jobs * Improve timeout handling in task polling logic * Add method to override max_steps in Study class * add support for tab visibility in observation flags and update related components * fix tests * black * Improve timeout handling in task polling logic * yet another way to kill timedout jobs (#108) * Add method to override max_steps in Study class * add support for tab visibility in observation flags and update related components * fix tests * black * black * Fix sorting bug. 
improve directory content retrieval with summary statistics * fix test * black * tmp * add error report, add cum cost to summary and ray backend by default * black * fix test (chaing to joblib backend) * black --------- Co-authored-by: Maxime Gasse --- src/agentlab/agents/dynamic_prompting.py | 7 +----- src/agentlab/analyze/agent_xray.py | 27 ++++++++++++++++++++---- src/agentlab/analyze/inspect_results.py | 2 +- src/agentlab/experiments/launch_exp.py | 6 +++--- src/agentlab/experiments/study.py | 2 +- tests/agents/test_agent.py | 25 ++++++++++++++++------ 6 files changed, 47 insertions(+), 22 deletions(-) diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index 07dcc58d9..73688f0f4 100644 --- a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -10,12 +10,7 @@ import bgym from browsergym.core.action.base import AbstractActionSet -from browsergym.utils.obs import ( - flatten_axtree_to_str, - flatten_dom_to_str, - overlay_som, - prune_html, -) +from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, overlay_som, prune_html from agentlab.llm.llm_utils import ( BaseMessage, diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 310ebd227..da7e98d39 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -142,6 +142,10 @@ def filter_agent_id(self, agent_id: list[tuple]): max-height: 400px; overflow-y: auto; } +.error-report { + max-height: 700px; + overflow-y: auto; +} .my-code-view { max-height: 300px; overflow-y: auto; @@ -284,6 +288,8 @@ def run_gradio(results_dir: Path): with gr.Tab("Global Stats"): global_stats = gr.DataFrame(max_height=500, show_label=False, interactive=False) + with gr.Tab("Error Report"): + error_report = gr.Markdown(elem_classes="error-report", show_copy_button=True) with gr.Row(): episode_info = gr.Markdown(label="Episode Info", elem_classes="my-markdown") action_info 
= gr.Markdown(label="Action Info", elem_classes="my-markdown") @@ -411,7 +417,7 @@ def run_gradio(results_dir: Path): exp_dir_choice.change( fn=new_exp_dir, inputs=exp_dir_choice, - outputs=[agent_table, agent_id, constants, variables, global_stats], + outputs=[agent_table, agent_id, constants, variables, global_stats, error_report], ) agent_table.select(fn=on_select_agent, inputs=agent_table, outputs=[agent_id]) @@ -918,19 +924,25 @@ def get_agent_report(result_df: pd.DataFrame): def update_global_stats(): - global info stats = inspect_results.global_report(info.result_df, reduce_fn=inspect_results.summarize_stats) stats.reset_index(inplace=True) return stats +def update_error_report(): + report_files = list(info.exp_list_dir.glob("error_report*.md")) + if len(report_files) == 0: + return "No error report found" + report_files = sorted(report_files, key=os.path.getctime, reverse=True) + return report_files[0].read_text() + + def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False): if exp_dir == select_dir_instructions: return None, None exp_dir = exp_dir.split(" - ")[0] - global info if len(exp_dir) == 0: info.exp_list_dir = None @@ -951,7 +963,14 @@ def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False): agent_id = info.get_agent_id(agent_report.iloc[0]) constants, variables = format_constant_and_variables() - return agent_report, agent_id, constants, variables, update_global_stats() + return ( + agent_report, + agent_id, + constants, + variables, + update_global_stats(), + update_error_report(), + ) def new_agent_id(agent_id: list[tuple]): diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 8df2c1ef6..09ba23a05 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -297,7 +297,7 @@ def summarize(sub_df, use_bootstrap=False): n_err=err.sum(skipna=True), ) if "stats.cum_cost" in sub_df: - record["cum_cost"] = 
(sub_df["stats.cum_cost"].sum(skipna=True).round(4),) + record["cum_cost"] = sub_df["stats.cum_cost"].sum(skipna=True).round(4) return pd.Series(record) diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index 49a778e37..cb331a99f 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -40,9 +40,9 @@ def run_experiments( study_dir = Path(study_dir) study_dir.mkdir(parents=True, exist_ok=True) - if n_jobs == 1 and parallel_backend != "sequential": - logging.warning("Only 1 job, switching to sequential backend.") - parallel_backend = "sequential" + # if n_jobs == 1 and parallel_backend != "sequential": + # logging.warning("Only 1 job, switching to sequential backend.") + # parallel_backend = "sequential" logging.info(f"Saving experiments to {study_dir}") for exp_args in exp_args_list: diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 2139ce7b8..b42f0bb5d 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -123,7 +123,7 @@ def set_reproducibility_info(self, strict_reproducibility=False, comment=None): def run( self, n_jobs=1, - parallel_backend="joblib", + parallel_backend="ray", strict_reproducibility=False, n_relaunch=3, relaunch_errors=True, diff --git a/tests/agents/test_agent.py b/tests/agents/test_agent.py index ec0b4d28e..2632f66b5 100644 --- a/tests/agents/test_agent.py +++ b/tests/agents/test_agent.py @@ -25,7 +25,9 @@ def test_generic_agent(): with tempfile.TemporaryDirectory() as tmp_dir: - launch_exp.run_experiments(1, [exp_args], Path(tmp_dir) / "generic_agent_test") + launch_exp.run_experiments( + 1, [exp_args], Path(tmp_dir) / "generic_agent_test", parallel_backend="joblib" + ) result_record = inspect_results.load_result_df(tmp_dir, progress_fn=None) @@ -144,9 +146,12 @@ def test_generic_agent_parse_retry(): ) with tempfile.TemporaryDirectory() as tmp_dir: - launch_exp.run_experiments(1, 
[exp_args], Path(tmp_dir) / "generic_agent_test") + # TODO why these tests don't work with ray backend? + launch_exp.run_experiments( + 1, [exp_args], Path(tmp_dir) / "generic_agent_test", parallel_backend="joblib" + ) result_record = inspect_results.load_result_df(tmp_dir, progress_fn=None) - + print(result_record) target = { "stats.cum_n_retry": 2, "stats.cum_busted_retry": 0, @@ -169,7 +174,9 @@ def test_bust_parse_retry(): ) with tempfile.TemporaryDirectory() as tmp_dir: - launch_exp.run_experiments(1, [exp_args], Path(tmp_dir) / "generic_agent_test") + launch_exp.run_experiments( + 1, [exp_args], Path(tmp_dir) / "generic_agent_test", parallel_backend="joblib" + ) result_record = inspect_results.load_result_df(tmp_dir, progress_fn=None) target = { @@ -195,7 +202,9 @@ def test_llm_error_success(): ) with tempfile.TemporaryDirectory() as tmp_dir: - launch_exp.run_experiments(1, [exp_args], Path(tmp_dir) / "generic_agent_test") + launch_exp.run_experiments( + 1, [exp_args], Path(tmp_dir) / "generic_agent_test", parallel_backend="joblib" + ) result_record = inspect_results.load_result_df(tmp_dir, progress_fn=None) target = { @@ -220,7 +229,9 @@ def test_llm_error_no_success(): ) with tempfile.TemporaryDirectory() as tmp_dir: - launch_exp.run_experiments(1, [exp_args], Path(tmp_dir) / "generic_agent_test") + launch_exp.run_experiments( + 1, [exp_args], Path(tmp_dir) / "generic_agent_test", parallel_backend="joblib" + ) result_record = inspect_results.load_result_df(tmp_dir, progress_fn=None) target = { @@ -236,4 +247,4 @@ def test_llm_error_no_success(): if __name__ == "__main__": # test_generic_agent() - test_llm_error_success() + test_generic_agent_parse_retry()