Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 73 additions & 12 deletions agentq/core/agent/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
import os
from typing import Callable, List, Optional, Tuple, Type
from typing import Callable, Dict, List, Literal, Optional, Tuple, Type, get_args

import instructor
import instructor.patch
Expand All @@ -13,6 +13,57 @@
from agentq.utils.function_utils import get_function_schema
from agentq.utils.logger import logger

# ── Typed LLM-provider selection seam (Piece 1: DeepSeek wiring) ───────────
# agent-q's LLM client is an OpenAI-API-compatible client wrapped by
# ``instructor`` (Mode.JSON). Provider selection is a single typed seam:
# a ``ProviderKind`` discriminator, a provider→client-factory registry, and
# a provider→default-model map. This keeps all agents source-agnostic (one
# seam, not a ``client=`` threaded through every subclass) and makes provider
# validity / key presence decidable-in-code via ``ProviderConfigError`` rather
# than a silent ``AttributeError`` / bare ``KeyError``.
ProviderKind = Literal["openai", "together", "deepseek"]


class ProviderConfigError(ValueError):
"""Raised when the selected LLM provider is unknown, or its required API
key is absent. A named, decidable failure — never a bare KeyError or a
silent fall-through that leaves ``self.client`` unset."""


def _make_openai_client() -> openai.OpenAI:
return openai.Client()


def _make_together_client() -> openai.OpenAI:
key = os.environ.get("TOGETHER_API_KEY")
if not key:
raise ProviderConfigError(
"Provider 'together' selected but TOGETHER_API_KEY is not set."
)
return openai.OpenAI(base_url="https://api.together.xyz/v1", api_key=key)


def _make_deepseek_client() -> openai.OpenAI:
key = os.environ.get("DEEPSEEK_API_KEY")
if not key:
raise ProviderConfigError(
"Provider 'deepseek' selected but DEEPSEEK_API_KEY is not set."
)
return openai.OpenAI(base_url="https://api.deepseek.com/v1", api_key=key)


_PROVIDER_CLIENT_FACTORIES: Dict[ProviderKind, Callable[[], openai.OpenAI]] = {
"openai": _make_openai_client,
"together": _make_together_client,
"deepseek": _make_deepseek_client,
}

_PROVIDER_MODEL_MAP: Dict[ProviderKind, str] = {
"openai": "gpt-4o-2024-08-06",
"together": "gpt-4o-2024-08-06",
"deepseek": "deepseek-chat",
}


class BaseAgent:
def __init__(
Expand All @@ -23,7 +74,7 @@ def __init__(
output_format: Type[BaseModel],
tools: Optional[List[Tuple[Callable, str]]] = None,
keep_message_history: bool = True,
client: str = "openai",
client: Optional[ProviderKind] = None,
):
# Metdata
self.agent_name = name
Expand All @@ -44,16 +95,22 @@ def __init__(
litellm.logging = True
litellm.set_verbose = True

# Llm client
if client == "openai":
self.client = openai.Client()
elif client == "together":
self.client = openai.OpenAI(
base_url="https://api.together.xyz/v1",
api_key=os.environ["TOGETHER_API_KEY"],
# LLM client — typed, env-driven provider selection at ONE seam.
# Explicit ``client=`` wins; otherwise read AGENTQ_LLM_PROVIDER (so the
# zero-arg orchestrated agents all pick up the selected provider).
provider = (client or os.environ.get("AGENTQ_LLM_PROVIDER", "openai")).strip().lower()
if provider not in get_args(ProviderKind):
raise ProviderConfigError(
f"Unknown LLM provider {provider!r}; "
f"must be one of {list(get_args(ProviderKind))}."
)

self.client = instructor.from_openai(self.client, mode=Mode.JSON)
self.provider: ProviderKind = provider # type: ignore[assignment]
# Registry lookup + named fail-fast (never a silent self.client-unset path).
self.client = instructor.from_openai(
_PROVIDER_CLIENT_FACTORIES[provider](), mode=Mode.JSON
)
# Provider-bound default model, resolved once from typed state.
self._default_model: str = _PROVIDER_MODEL_MAP[provider]

# Tools
self.tools_list = []
Expand All @@ -76,8 +133,12 @@ async def run(
screenshot: str = None,
session_id: str = None,
# model: str = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
model: str = "gpt-4o-2024-08-06",
model: Optional[str] = None,
) -> BaseModel:
# Default to the provider-bound model chosen at construction time
# (self._default_model); an explicit model= override still wins.
if model is None:
model = self._default_model
if not isinstance(input_data, self.input_format):
raise ValueError(f"Input data must be of type {self.input_format.__name__}")

Expand Down
40 changes: 8 additions & 32 deletions agentq/core/mcts/browser_mcts.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from agentq.core.mcts.core.mcts import MCTS, MCTSResult
from agentq.core.mcts.visualization.visualizer_client import visualize
from agentq.core.models.models import (
ActionType,
AgentQActorInput,
AgentQActorOutput,
AgentQCriticInput,
Expand All @@ -29,13 +28,10 @@
VisionInput,
VisionOutput,
)
from agentq.core.skills.click_using_selector import click
from agentq.core.skills.enter_text_and_click import enter_text_and_click
from agentq.core.skills.enter_text_using_selector import EnterTextEntry, entertext
from agentq.core.skills.dispatch import dispatch_action
from agentq.core.skills.get_dom_with_content_type import get_dom_with_content_type
from agentq.core.skills.get_screenshot import get_screenshot
from agentq.core.skills.get_url import geturl
from agentq.core.skills.open_url import openurl
from agentq.core.web_driver.playwright import PlaywrightManager

# ANSI color codes
Expand Down Expand Up @@ -103,33 +99,13 @@ async def execute_browser_action(
action = browser_action.task_with_action.actions_to_be_performed[0]
print(f"{YELLOW}[DEBUG] Executing browser action: {action.type}{RESET}")

if action.type == ActionType.GOTO_URL:
print(f"{CYAN}[DEBUG] Trying to go to url{RESET}")
await openurl(url=action.website, timeout=action.timeout or 1)
print(f"{CYAN}[DEBUG] Went to url{RESET}")
elif action.type == ActionType.TYPE:
entry = EnterTextEntry(
query_selector=f"[mmid='{action.mmid}']",
text=action.content,
)
await entertext(entry)
# await wait_for_navigation()
print(f"{CYAN}[DEBUG] Typed text into element{RESET}")
elif action.type == ActionType.CLICK:
await click(
selector=f"[mmid='{action.mmid}']",
wait_before_execution=action.wait_before_execution or 2,
)
print(f"{CYAN}[DEBUG] Clicked element{RESET}")
elif action.type == ActionType.ENTER_TEXT_AND_CLICK:
await enter_text_and_click(
text_selector=f"[mmid='{action.text_element_mmid}']",
text_to_enter=action.text_to_enter,
click_selector=f"[mmid='{action.click_element_mmid}']",
wait_before_click_execution=action.wait_before_click_execution or 2,
)
# await wait_for_navigation()
print(f"{CYAN}[DEBUG] Entered text and clicked element{RESET}")
# All browser-skill dispatch lives in the shared dispatch_action (skills/dispatch.py). This is
# the MCTS search-rollout execution path, so it passes the SEARCH waits (2/2/2). An unhandled
# ActionType raises UnhandledActionTypeError (a ValueError) — preserving the prior else-raise.
await dispatch_action(
action, click_wait=2, enter_text_and_click_wait=2, captcha_wait=2
)
print(f"{CYAN}[DEBUG] Executed {action.type} via dispatch_action{RESET}")

try:
new_dom = await self.get_current_dom()
Expand Down
52 changes: 12 additions & 40 deletions agentq/core/orchestrator/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,10 @@
Task,
TaskWithActions,
)
from agentq.core.skills.click_using_selector import click
from agentq.core.skills.enter_text_and_click import enter_text_and_click
from agentq.core.skills.enter_text_using_selector import EnterTextEntry, entertext
from agentq.core.skills.dispatch import dispatch_action, UnhandledActionTypeError
from agentq.core.skills.get_dom_with_content_type import get_dom_with_content_type
from agentq.core.skills.get_screenshot import get_screenshot
from agentq.core.skills.get_url import geturl
from agentq.core.skills.open_url import openurl
from agentq.core.skills.solve_captcha import solve_captcha
from agentq.core.web_driver.playwright import PlaywrightManager

init(autoreset=True)
Expand Down Expand Up @@ -395,46 +391,22 @@ async def handle_agentq_actions(self, actions: List[Action]):

for attempt in range(max_retries):
try:
# Every action dispatches identically (real-loop waits 1/1.5/1); only GOTO needs the
# extra post-navigation settle. dispatch_action stays inside this retry `try`, so a
# transient failure still retries. openurl already waits for networkidle internally
# (skills/open_url.py); the wait below is defensive redundancy, scoped to GOTO.
result = await dispatch_action(
action, click_wait=1, enter_text_and_click_wait=1.5, captcha_wait=1
)
if action.type == ActionType.GOTO_URL:
result = await openurl(
url=action.website, timeout=action.timeout or 1
)
await page.wait_for_load_state("networkidle", timeout=10000)
print("Action - GOTO")
elif action.type == ActionType.TYPE:
entry = EnterTextEntry(
query_selector=f"[mmid='{action.mmid}']",
text=action.content,
)
result = await entertext(entry)
print("Action - TYPE")
elif action.type == ActionType.CLICK:
result = await click(
selector=f"[mmid='{action.mmid}']",
wait_before_execution=action.wait_before_execution or 1,
)
print("Action - CLICK")
elif action.type == ActionType.ENTER_TEXT_AND_CLICK:
result = await enter_text_and_click(
text_selector=f"[mmid='{action.text_element_mmid}']",
text_to_enter=action.text_to_enter,
click_selector=f"[mmid='{action.click_element_mmid}']",
wait_before_click_execution=action.wait_before_click_execution
or 1.5,
)
print("Action - ENTER TEXT AND CLICK")
elif action.type == ActionType.SOLVE_CAPTCHA:
result = await solve_captcha(
text_selector=f"[mmid='{action.text_element_mmid}']",
click_selector=f"[mmid='{action.click_element_mmid}']",
wait_before_click_execution=action.wait_before_click_execution
or 1,
)
else:
result = f"Unsupported action type: {action.type}"

results.append(result)
break # If successful, break out of the retry loop
except UnhandledActionTypeError:
# An unknown/future ActionType is a coverage error, not a transient failure —
# surface it immediately instead of retrying 3x and swallowing it into a string.
raise
except Exception as e:
print(f"Error during action {action.type}: {e}")
if attempt < max_retries - 1:
Expand Down
72 changes: 72 additions & 0 deletions agentq/core/skills/dispatch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""Single shared typed-`Action` → browser-skill dispatch map for agent-q.

Both LIVE executors route through this one function:
- execute_browser_action (agentq/core/mcts/browser_mcts.py) — MCTS search rollouts
- handle_agentq_actions (agentq/core/orchestrator/orchestrator.py) — orchestrator real loop

The dispatch MAP is shared; each caller passes its OWN wait timing, because an MCTS search rollout and
a real orchestrator execution are distinct execution contexts (the search waits are world-model
parameters; the real-loop waits are infra configuration — they are deliberately NOT unified).

`dispatch_action` is TOTAL over ActionType: an unhandled/future type raises `UnhandledActionTypeError`
rather than silently no-opping.
"""
from typing import Any

from agentq.core.models.models import Action, ActionType
from agentq.core.skills.click_using_selector import click
from agentq.core.skills.enter_text_and_click import enter_text_and_click
from agentq.core.skills.enter_text_using_selector import EnterTextEntry, entertext
from agentq.core.skills.open_url import openurl
from agentq.core.skills.solve_captcha import solve_captcha


class UnhandledActionTypeError(ValueError):
"""Raised by `dispatch_action` for an ActionType with no branch.

A `ValueError` subclass (so existing ``except ValueError`` sites still catch it), but a DISTINCT
type so a caller with a retry loop can re-raise it immediately — an unknown action type is a
programming/coverage error, not a transient failure to retry.
"""


async def dispatch_action(
action: Action,
*,
click_wait: float,
enter_text_and_click_wait: float,
captcha_wait: float,
) -> Any:
"""Dispatch one typed `Action` to its browser skill and return the skill's result.

The wait values are supplied by the CALLER (not defaulted here) so the MCTS search path and the
orchestrator real-loop path each keep their own timing.
"""
if action.type == ActionType.GOTO_URL:
return await openurl(url=action.website, timeout=action.timeout or 1)
elif action.type == ActionType.TYPE:
return await entertext(
EnterTextEntry(query_selector=f"[mmid='{action.mmid}']", text=action.content)
)
elif action.type == ActionType.CLICK:
return await click(
selector=f"[mmid='{action.mmid}']",
wait_before_execution=action.wait_before_execution or click_wait,
)
elif action.type == ActionType.ENTER_TEXT_AND_CLICK:
return await enter_text_and_click(
text_selector=f"[mmid='{action.text_element_mmid}']",
text_to_enter=action.text_to_enter,
click_selector=f"[mmid='{action.click_element_mmid}']",
wait_before_click_execution=action.wait_before_click_execution or enter_text_and_click_wait,
)
elif action.type == ActionType.SOLVE_CAPTCHA:
return await solve_captcha(
text_selector=f"[mmid='{action.text_element_mmid}']",
click_selector=f"[mmid='{action.click_element_mmid}']",
wait_before_click_execution=action.wait_before_click_execution or captcha_wait,
)
else:
raise UnhandledActionTypeError(
f"Unhandled ActionType in dispatch_action: {action.type}"
)
Loading