Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
d76e5c3
downgrading ubuntu version for github tests (#62)
TLSDC Oct 15, 2024
3697c4d
Llm api update (#59)
TLSDC Oct 15, 2024
9f58f15
Reproducibility again (#61)
recursix Oct 16, 2024
79b9202
version bump
TLSDC Oct 16, 2024
b09ea93
Patching minor stuff (#69)
TLSDC Oct 16, 2024
3ceaa0f
Improve agent xray app (#70)
xhluca Oct 17, 2024
7bba275
added tmlr definitive config (#71)
TLSDC Oct 17, 2024
1b6b217
downgrading gradio version (#77)
TLSDC Oct 19, 2024
f95df4a
Merge remote-tracking branch 'origin/main' into dev
TLSDC Oct 19, 2024
98acd0c
Study refactor (#73)
recursix Oct 20, 2024
98e5a22
adding message class and updating generic agent accordingly (#68)
TLSDC Oct 21, 2024
a6c1f93
version bump
TLSDC Oct 21, 2024
d085e81
Updating generic_agent to use BGym's goal_object (#83)
TLSDC Oct 22, 2024
59ad7cc
Minor revert (#86)
TLSDC Oct 22, 2024
0e83133
Add tabs (#84)
recursix Oct 22, 2024
86fe572
Fix reproduce study (#87)
recursix Oct 23, 2024
682e0f4
upgrading gradio dependency (#88)
TLSDC Oct 23, 2024
176fe8a
bgym update (#90)
TLSDC Oct 23, 2024
605c503
Workarena TMLR experiments (#89)
TLSDC Oct 24, 2024
96b5cd6
handling sequential in VWA (#91)
recursix Oct 24, 2024
13840fc
Tmlr workarena (#92)
TLSDC Oct 24, 2024
024481a
tmp
recursix Oct 24, 2024
519f51e
reformat
recursix Oct 24, 2024
8f235f8
adding assistantbench to reproducibility_util.py
TLSDC Oct 24, 2024
6e18fb8
gitignore (#97)
gasse Oct 30, 2024
05448cf
Vision fix (#105)
TLSDC Nov 5, 2024
f6ac587
L2 tmlr (#93)
TLSDC Nov 5, 2024
f8d1e47
Replacing Dask with Ray (#100)
recursix Nov 6, 2024
6684e3d
switching to 2 for loops in _agents_on_benchmark (#107)
TLSDC Nov 6, 2024
dab1a48
yet another way to kill timedout jobs (#108)
recursix Nov 6, 2024
aa59a4a
Fix prompt formatting in Observation and add static method to Study c…
recursix Nov 7, 2024
feda734
Bug fix (#111)
recursix Nov 7, 2024
7a5b91e
Fixing openrouter pricing rate limit (#112)
TLSDC Nov 7, 2024
3e94570
updating max prompt configs, vision support (#109)
TLSDC Nov 8, 2024
1ebb896
Cross-product deepcopy fix (#106)
jardinetsouffleton Nov 8, 2024
f35dea0
slugify study_name (#114)
gasse Nov 11, 2024
c5dfb17
Improve timeout handling in task polling logic
recursix Nov 6, 2024
cf05bc6
Add method to override max_steps in Study class
recursix Nov 7, 2024
5297157
add support for tab visibility in observation flags and update relate…
recursix Nov 8, 2024
4279d5c
fix tests
recursix Nov 8, 2024
ad374fc
Fix sorting bug.
recursix Nov 8, 2024
ef33f1f
fix test
recursix Nov 8, 2024
f86b505
black
recursix Nov 8, 2024
1196455
Weblinx results (#104)
gasse Nov 12, 2024
5c8d627
Max new tokens fix (#118)
gasse Nov 12, 2024
6e61f84
Merge branch 'main' into dev
TLSDC Nov 12, 2024
b644474
version bump (#119)
TLSDC Nov 12, 2024
e695e11
fix format (#120)
TLSDC Nov 12, 2024
16e7526
Clean pipeline (#117)
recursix Nov 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ jobs:
- name: Check MiniWob availability
run: curl -I "http://localhost:8080/miniwob/" || echo "MiniWob not reachable"

- name: Pre-download nltk resources
run: python -c "import nltk; nltk.download('punkt_tab')"

- name: Run AgentLab Unit Tests
env:
MINIWOB_URL: "http://localhost:8080/miniwob/"
Expand Down
29 changes: 2 additions & 27 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -161,35 +161,10 @@ cython_debug/
**/.DS_Store

.vscode
allowed_selenium.json

# Torchtune
finetuning/torchtune

# PyLLMD repo for finetuning
pyllmd_tune/research-pyllmd/
pyllmd_tune/data/


datasets/*
_sandbox.py
node_modules/
/test-results/
/playwright-report/
/blob-report/
/playwright/.cache/
/test-results/
/playwright-report/
/blob-report/
/playwright/.cache/


results/

# personal (optimass)
ICML_deadline/
mass_utils/
pyllmd_tune/

# don't ignore the miniwob_tasks_all.csv file
!miniwob_tasks_all.csv
# gradio
.gradio/
28 changes: 16 additions & 12 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,28 @@
"""

import logging

from agentlab.agents.generic_agent import (
RANDOM_SEARCH_AGENT,
AGENT_4o,
AGENT_4o_MINI,
AGENT_LLAMA3_70B,
AGENT_LLAMA31_70B,
)
from agentlab.analyze.inspect_results import get_most_recent_folder
from agentlab.experiments import study_generators
from agentlab.experiments.study import Study

logging.getLogger().setLevel(logging.INFO)

# choose your agent or provide a new agent
agent_args = [AGENT_4o_MINI]
# agent_args = [AGENT_4o]

## select the benchmark to run on

# ## select the benchmark to run on
benchmark = "miniwob_tiny_test"
# benchmark = "miniwob"
# benchmark = "workarena.l1"
# benchmark = "workarena.l2"
# benchmark = "workarena.l3"
# benchmark = "workarena_l1"
# benchmark = "workarena_l2"
# benchmark = "workarena_l3"
# benchmark = "webarena"

# Set reproducibility_mode = True for reproducibility
Expand All @@ -53,13 +52,18 @@

if relaunch:
# relaunch an existing study
study_dir = get_most_recent_folder()
study = study_generators.make_relaunch_study(study_dir, relaunch_mode="incomplete_or_error")
study = Study.load_most_recent(contains=None)
study.find_incomplete(include_errors=True)

else:
study = study_generators.run_agents_on_benchmark(agent_args, benchmark)

study.run(n_jobs=n_jobs, parallel_backend="joblib", strict_reproducibility=reproducibility_mode)
study = Study(agent_args, benchmark, logging_level_stdout=logging.WARNING)

study.run(
n_jobs=n_jobs,
parallel_backend="ray",
strict_reproducibility=reproducibility_mode,
n_relaunch=3,
)

if reproducibility_mode:
study.append_to_journal(strict_reproducibility=True)
58 changes: 47 additions & 11 deletions reproducibility_journal.csv

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ contexttimer
ipython
pyyaml>=6
pandas
gradio
gradio>=5.5 # issue with DataFrame scrolling before 5.5
gitpython # for the reproducibility script
requests
requests
matplotlib
ray[default]
python-slugify
2 changes: 1 addition & 1 deletion src/agentlab/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.2"
__version__ = "0.3.0"
3 changes: 2 additions & 1 deletion src/agentlab/agents/agent_args.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from bgym import AbstractAgentArgs
import bgym


class AgentArgs(AbstractAgentArgs):

def set_benchmark(self, benchmark: str, demo_mode: bool):
def set_benchmark(self, benchmark: bgym.Benchmark, demo_mode: bool):
"""Optional method to set benchmark specific flags.

This allows the agent to have minor adjustments based on the benchmark.
Expand Down
121 changes: 82 additions & 39 deletions src/agentlab/agents/dynamic_prompting.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import abc
import difflib
import logging
import platform
import time
Expand All @@ -9,12 +8,12 @@
from typing import Literal
from warnings import warn

import bgym
from browsergym.core.action.base import AbstractActionSet
from browsergym.core.action.highlevel import HighLevelActionSet
from browsergym.core.action.python import PythonActionSet
from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, overlay_som, prune_html

from agentlab.llm.llm_utils import (
BaseMessage,
ParseError,
count_tokens,
extract_code_blocks,
Expand Down Expand Up @@ -70,6 +69,7 @@ class ObsFlags(Flags):

use_html: bool = True
use_ax_tree: bool = False
use_tabs: bool = False
use_focused_element: bool = False
use_error_logs: bool = False
use_history: bool = False
Expand All @@ -94,13 +94,14 @@ class ObsFlags(Flags):

@dataclass
class ActionFlags(Flags):
multi_actions: bool = False
action_set: str = "bid"
is_strict: bool = False
demo_mode: Literal["off", "default", "all_blue", "only_visible_elements"] = "off"
action_set: bgym.HighLevelActionSetArgs = None # should be set by the set_benchmark method
long_description: bool = True
individual_examples: bool = False

# for backward compatibility
multi_actions: bool = None
is_strict: bool = None


class PromptElement:
"""Base class for all prompt elements. Prompt elements can be hidden."""
Expand All @@ -121,7 +122,7 @@ def __init__(self, visible: bool = True) -> None:
self._visible = visible

@property
def prompt(self):
def prompt(self) -> str | BaseMessage:
"""Avoid overriding this method. Override _prompt instead."""
if self.is_visible:
return self._prompt
Expand Down Expand Up @@ -252,7 +253,14 @@ def fit_tokens(
if isinstance(prompt, str):
prompt_str = prompt
elif isinstance(prompt, list):
# warn deprecated
warn(
"Using list of prompts is deprecated. Use a Discussion object instead.",
DeprecationWarning,
)
prompt_str = "\n".join([p["text"] for p in prompt if p["type"] == "text"])
elif isinstance(prompt, BaseMessage):
prompt_str = str(prompt)
else:
raise ValueError(f"Unrecognized type for prompt: {type(prompt)}")
n_token = count_tokens(prompt_str, model=model_name)
Expand Down Expand Up @@ -357,6 +365,29 @@ def __init__(self, bid, visible: bool = True, prefix="") -> None:
"""


class Tabs(PromptElement):
"""Prompt element that lists the browser tabs currently open.

Reads ``open_pages_urls``, ``open_pages_titles`` and ``active_page_index``
from the observation dict and renders one entry per tab (index, title,
URL), marking the active tab. ``prefix`` is prepended to the section
heading (e.g. a markdown heading marker).
"""

def __init__(self, obs, visible: bool = True, prefix="") -> None:
super().__init__(visible=visible)
self.obs = obs
self.prefix = prefix

@property
def _prompt(self) -> str:
# by implementing this as a property, it's only computed if visible
prompt_pieces = [f"\n{self.prefix}Currently open tabs:"]
for page_index, (page_url, page_title) in enumerate(
zip(self.obs["open_pages_urls"], self.obs["open_pages_titles"])
):
active_or_not = " (active tab)" if page_index == self.obs["active_page_index"] else ""
prompt_piece = f"""\
Tab {page_index}{active_or_not}:
Title: {page_title}
URL: {page_url}
"""
prompt_pieces.append(prompt_piece)
return "\n".join(prompt_pieces)


class Observation(Shrinkable):
"""Observation of the current step.

Expand All @@ -367,6 +398,13 @@ def __init__(self, obs, flags: ObsFlags) -> None:
super().__init__()
self.flags = flags
self.obs = obs

self.tabs = Tabs(
obs,
visible=lambda: flags.use_tabs,
prefix="## ",
)

self.html = HTML(
obs[flags.html_type],
visible_elements_only=flags.filter_visible_elements_only,
Expand Down Expand Up @@ -400,25 +438,18 @@ def shrink(self):
def _prompt(self) -> str:
return f"""
# Observation of current step:
{self.html.prompt}{self.ax_tree.prompt}{self.focused_element.prompt}{self.error.prompt}
{self.tabs.prompt}{self.html.prompt}{self.ax_tree.prompt}{self.focused_element.prompt}{self.error.prompt}

"""

def add_screenshot(self, prompt):
def add_screenshot(self, prompt: BaseMessage) -> BaseMessage:
if self.flags.use_screenshot:
if isinstance(prompt, str):
prompt = [{"type": "text", "text": prompt}]
if self.flags.use_som:
screenshot = self.obs["screenshot_som"]
else:
screenshot = self.obs["screenshot"]
img_url = image_to_jpg_base64_url(screenshot)
prompt.append(
{
"type": "image_url",
"image_url": {"url": img_url, "detail": self.flags.openai_vision_detail},
}
)
prompt.add_image(img_url, detail=self.flags.openai_vision_detail)
return prompt


Expand All @@ -441,24 +472,36 @@ def __init__(self, visible: bool = True) -> None:


class GoalInstructions(PromptElement):
def __init__(self, goal, visible: bool = True, extra_instructions=None) -> None:
def __init__(self, goal_object, visible: bool = True, extra_instructions=None) -> None:
super().__init__(visible)
self._prompt = f"""\
self._prompt = [
dict(
type="text",
text=f"""\
# Instructions
Review the current state of the page and all other information to find the best
possible next action to accomplish your goal. Your answer will be interpreted
and executed by a program, make sure to follow the formatting instructions.

## Goal:
{goal}
"""
""",
)
]

self._prompt += goal_object

if extra_instructions:
self._prompt += f"""
self._prompt += [
dict(
type="text",
text=f"""

## Extra instructions:

{extra_instructions}
"""
""",
)
]


class ChatInstructions(PromptElement):
Expand Down Expand Up @@ -592,24 +635,24 @@ def _parse_answer(self, text_answer):
return ans_dict


def make_action_set(action_flags: ActionFlags) -> AbstractActionSet:
# def make_action_set(action_flags: ActionFlags) -> AbstractActionSet:

if action_flags.action_set == "python":
action_set = PythonActionSet(strict=action_flags.is_strict)
if action_flags.demo_mode != "off":
warn(
f'Action_set "python" is incompatible with demo_mode={repr(action_flags.demo_mode)}.'
)
return action_set
# if action_flags.action_set == "python":
# action_set = PythonActionSet(strict=action_flags.is_strict)
# if action_flags.demo_mode != "off":
# warn(
# f'Action_set "python" is incompatible with demo_mode={repr(action_flags.demo_mode)}.'
# )
# return action_set

action_set = HighLevelActionSet(
subsets=list(set(["chat"] + ["infeas"] + action_flags.action_set.split("+"))),
multiaction=action_flags.multi_actions,
strict=action_flags.is_strict,
demo_mode=action_flags.demo_mode,
)
# action_set = HighLevelActionSet(
# subsets=list(set(["chat"] + ["infeas"] + action_flags.action_set.split("+"))),
# multiaction=action_flags.multi_actions,
# strict=action_flags.is_strict,
# demo_mode=action_flags.demo_mode,
# )

return action_set
# return action_set


class Think(PromptElement):
Expand Down
Loading