// src/toolsProvider.ts
import { text, tool, type ToolsProviderController } from "@lmstudio/sdk";
import { z } from "zod";
import { join, isAbsolute } from "path";
import { configSchematics } from "./configSchematics";
import {
runAgentBrowser,
summarize,
tryParseJson,
type AgentBrowserSettings,
} from "./agentBrowser";
/**
 * Builds the set of browser-automation tools exposed by this plugin.
 *
 * Every tool defined in the body forwards its work to `runAgentBrowser`,
 * passing a command-line style argument list together with the settings read
 * from the plugin configuration.
 *
 * @param ctl - Controller used to look up the plugin configuration.
 * @returns The array of tool definitions created in this function.
 */
export async function toolsProvider(ctl: ToolsProviderController) {
// Configuration handle; individual values are read from it on demand.
const config = ctl.getPluginConfig(configSchematics);
/**
 * Assembles the settings object handed to `runAgentBrowser`.
 *
 * The values are read from the plugin config on every call instead of being
 * captured once when the tools are created.
 */
const getSettings = (): AgentBrowserSettings => {
  const current: AgentBrowserSettings = {
    binCommand: config.get("binCommand"),
    session: config.get("session"),
    headed: config.get("headed"),
    timeoutMs: config.get("timeoutMs"),
    screenshotDir: config.get("screenshotDir"),
    userAgent: config.get("userAgent"),
    viewport: config.get("viewport"),
    acceptLanguage: config.get("acceptLanguage"),
    colorScheme: config.get("colorScheme"),
    extraBrowserArgs: config.get("extraBrowserArgs"),
  };
  return current;
};
/**
 * Tool `browser_open`: runs the `open` command, optionally followed by a URL.
 *
 * An empty-string `url` is treated exactly like an omitted one, both for the
 * command arguments and for the fallback message, so the message can never
 * read "Opened ." when no URL was actually passed to the command.
 *
 * NOTE(review): the second argument of `summarize` is presumably the text used
 * when the command produced no output of its own — confirm in ./agentBrowser.
 */
const browserOpenTool = tool({
  name: "browser_open",
  description: text`
Open the browser, optionally navigating to a URL. Starts an agent-browser session if one is
not already running. Use this before any other browser tool. Aliases: goto, navigate.
`,
  parameters: { url: z.string().optional() },
  implementation: async ({ url }, { signal, status }) => {
    status("Launching browser...");
    // Single source of truth for "was a usable URL supplied?".
    const destination = url !== undefined && url !== "" ? url : undefined;
    const args = ["open"];
    if (destination !== undefined) {
      args.push(destination);
    }
    const result = await runAgentBrowser(getSettings(), args, { signal });
    return summarize(
      result,
      destination !== undefined ? `Opened ${destination}.` : "Browser opened.",
    );
  },
});
/**
 * Tool `browser_snapshot`: runs the `snapshot` command.
 *
 * Optional parameters map onto command flags, appended in this order:
 * `include_urls` -> `-u`, `compact` -> `-c`, `max_depth` -> `-d <n>`,
 * `selector` -> `-s <selector>` (an empty selector is ignored).
 */
const browserSnapshotTool = tool({
  name: "browser_snapshot",
  description: text`
Capture an accessibility-tree snapshot of the current page. Returns a compact representation
with @eN element references that can be passed as selectors to other tools (e.g. @e2).
Prefer this over screenshots for understanding page structure — it uses far less context.
`,
  parameters: {
    include_urls: z.boolean().optional(),
    compact: z.boolean().optional(),
    max_depth: z.number().int().min(1).max(50).optional(),
    selector: z.string().optional(),
  },
  implementation: async (
    { include_urls: includeUrls, compact: useCompact, max_depth: maxDepth, selector: scope },
    { signal, status },
  ) => {
    status("Capturing snapshot...");
    const wantsScope = scope !== undefined && scope !== "";
    const cliArgs: string[] = [
      "snapshot",
      ...(includeUrls === true ? ["-u"] : []),
      ...(useCompact === true ? ["-c"] : []),
      ...(maxDepth !== undefined ? ["-d", String(maxDepth)] : []),
      ...(wantsScope ? ["-s", scope as string] : []),
    ];
    const outcome = await runAgentBrowser(getSettings(), cliArgs, { signal });
    return summarize(outcome, "(empty snapshot)");
  },
});
/**
 * Tool `browser_screenshot`: runs the `screenshot` command with a target path
 * and reports that path on success.
 *
 * Path resolution:
 * - A missing or blank `filename` falls back to `screenshot-<timestamp>.png`.
 *   Blank names must be caught here because `??` only covers `undefined`;
 *   otherwise `join(dir, "")` would resolve to the directory itself.
 * - A relative name is placed under the configured screenshot directory when
 *   that directory is non-empty; an absolute name is used as given.
 *
 * On a non-zero exit code the result of `summarize` is returned instead.
 */
const browserScreenshotTool = tool({
  name: "browser_screenshot",
  description: text`
Save a screenshot of the current page to disk and return the file path. Use sparingly — the
accessibility snapshot is usually a better choice. Set full_page to capture beyond the
viewport.
`,
  parameters: {
    filename: z.string().optional(),
    full_page: z.boolean().optional(),
  },
  implementation: async ({ filename, full_page }, { signal, status }) => {
    status("Taking screenshot...");
    const settings = getSettings();
    // Treat "" and whitespace-only names as "not provided".
    const name =
      filename !== undefined && filename.trim() !== ""
        ? filename
        : `screenshot-${Date.now()}.png`;
    const dir = settings.screenshotDir.trim();
    const target = dir === "" || isAbsolute(name) ? name : join(dir, name);
    const args = ["screenshot", target];
    if (full_page === true) args.push("--full");
    const result = await runAgentBrowser(settings, args, { signal });
    if (result.exitCode !== 0) {
      return summarize(result, "Screenshot failed.");
    }
    return `Screenshot saved to ${target}.`;
  },
});
/**
 * Tool `browser_click`: runs `click <selector>`, adding `--new-tab` when
 * `new_tab` is exactly `true`.
 */
const browserClickTool = tool({
  name: "browser_click",
  description: text`
Click an element. The selector accepts CSS selectors, text= queries, or @eN references from
a recent snapshot (e.g. "@e3").
`,
  parameters: {
    selector: z.string(),
    new_tab: z.boolean().optional(),
  },
  implementation: async ({ selector, new_tab: openInNewTab }, { signal, status }) => {
    status(`Clicking ${selector}...`);
    const cliArgs =
      openInNewTab === true ? ["click", selector, "--new-tab"] : ["click", selector];
    const outcome = await runAgentBrowser(getSettings(), cliArgs, { signal });
    return summarize(outcome, `Clicked ${selector}.`);
  },
});
/**
 * Tool `browser_type`: runs `fill <selector> <text>`.
 *
 * The `text` parameter is renamed locally so it does not shadow the imported
 * `text` template tag.
 */
const browserTypeTool = tool({
  name: "browser_type",
  description: text`
Clear an input element and type text into it. Use browser_press afterwards if you need to
submit (e.g. press "Enter").
`,
  parameters: {
    selector: z.string(),
    text: z.string(),
  },
  implementation: async ({ selector, text: content }, { signal, status }) => {
    status(`Filling ${selector}...`);
    const cliArgs = ["fill", selector, content];
    const outcome = await runAgentBrowser(getSettings(), cliArgs, { signal });
    return summarize(outcome, `Filled ${selector}.`);
  },
});
/** Tool `browser_press`: runs `press <key>` with the key string passed through unchanged. */
const browserPressTool = tool({
  name: "browser_press",
  description: text`
Press a keyboard key globally on the page. Valid keys include "Enter", "Tab", "Escape",
"ArrowDown", "Control+a", etc.
`,
  parameters: { key: z.string() },
  implementation: async ({ key }, { signal, status }) => {
    status(`Pressing ${key}...`);
    const cliArgs = ["press", key];
    const outcome = await runAgentBrowser(getSettings(), cliArgs, { signal });
    return summarize(outcome, `Pressed ${key}.`);
  },
});
/**
 * Tool `browser_get_text`: runs `get text <selector>`.
 *
 * Unlike most sibling tools, this one emits no status update.
 */
const browserGetTextTool = tool({
  name: "browser_get_text",
  description: text`
Read the visible text content of an element. Pass a CSS selector or an @eN reference.
`,
  parameters: { selector: z.string() },
  implementation: async ({ selector }, { signal }) => {
    const cliArgs = ["get", "text", selector];
    const outcome = await runAgentBrowser(getSettings(), cliArgs, { signal });
    return summarize(outcome, "(no text)");
  },
});
/**
 * Tool `browser_page_info`: runs `get url` and `get title` concurrently and
 * returns `{ url, title }`.
 *
 * A field is `null` when its command exits with a non-zero code; otherwise it
 * is the command's trimmed standard output.
 */
const browserPageInfoTool = tool({
  name: "browser_page_info",
  description: "Return the current page URL and title as JSON.",
  parameters: {},
  implementation: async (_params, { signal }) => {
    const settings = getSettings();
    // Reads a single page property, mapping a failed command to null.
    const readField = async (field: "url" | "title"): Promise<string | null> => {
      const outcome = await runAgentBrowser(settings, ["get", field], { signal });
      return outcome.exitCode === 0 ? outcome.stdout.trim() : null;
    };
    // Both commands are started before either one is awaited.
    const [url, title] = await Promise.all([readField("url"), readField("title")]);
    return { url, title };
  },
});
/**
 * Tool `browser_wait`: runs the `wait` command for the given target.
 *
 * The target string is translated into the argument form the command expects:
 * - `url:<pattern>` (non-empty pattern)  -> `wait --url <pattern>`
 * - `load` / `domcontentloaded` / `networkidle` -> `wait --load <state>`
 * - anything else (selector, @eN ref, milliseconds) -> `wait <target>`
 *
 * Without this translation the "url:" and load-state forms promised in the
 * description would be passed as a plain positional argument.
 * NOTE(review): the `--url` / `--load` flag names come from the agent-browser
 * CLI documentation — confirm against the installed CLI version.
 */
const browserWaitTool = tool({
  name: "browser_wait",
  description: text`
Wait for a condition before proceeding. The target may be:
- a CSS selector or @eN ref (waits for the element to appear)
- a number of milliseconds (e.g. "1500")
- a "url:" prefix followed by a URL pattern (e.g. "url:**/dashboard")
- a "load", "domcontentloaded", or "networkidle" load state
`,
  parameters: { target: z.string() },
  implementation: async ({ target }, { signal, status }) => {
    status(`Waiting for ${target}...`);
    const urlPrefix = "url:";
    const loadStates = ["load", "domcontentloaded", "networkidle"];
    const urlPattern = target.startsWith(urlPrefix)
      ? target.slice(urlPrefix.length).trim()
      : "";

    let args: string[];
    if (urlPattern !== "") {
      args = ["wait", "--url", urlPattern];
    } else if (loadStates.includes(target)) {
      args = ["wait", "--load", target];
    } else {
      // Selectors, @eN refs, millisecond counts, and a bare "url:" pass through.
      args = ["wait", target];
    }

    const result = await runAgentBrowser(getSettings(), args, { signal });
    return summarize(result, `Wait satisfied: ${target}.`);
  },
});
/**
 * Tool `browser_find`: runs `find <kind> <target> <action>` with optional
 * extras appended in this order: `value` as a positional argument,
 * `--name <name>`, and `--exact` when `exact` is exactly `true`.
 */
const browserFindTool = tool({
  name: "browser_find",
  description: text`
Find an element by semantic role, text, label, placeholder, or test id, then optionally
perform an action on it. Useful when CSS selectors are brittle.
Examples:
- kind="text", target="Sign in", action="click" — click the link/button labeled "Sign in".
- kind="role", target="button", action="click", name="Submit" — click button named "Submit".
- kind="label", target="Email", action="fill", value="user@example.com" — fill labeled input.
`,
  parameters: {
    kind: z.enum(["role", "text", "label", "placeholder", "alt", "title", "testid"]),
    target: z.string(),
    action: z.string(),
    value: z.string().optional(),
    name: z.string().optional(),
    exact: z.boolean().optional(),
  },
  implementation: async (
    { kind, target, action, value: actionValue, name: accessibleName, exact: exactMatch },
    { signal, status },
  ) => {
    status(`Finding ${kind}=${target}...`);
    const cliArgs: string[] = [
      "find",
      kind,
      target,
      action,
      ...(actionValue !== undefined ? [actionValue] : []),
      ...(accessibleName !== undefined ? ["--name", accessibleName] : []),
      ...(exactMatch === true ? ["--exact"] : []),
    ];
    const outcome = await runAgentBrowser(getSettings(), cliArgs, { signal });
    return summarize(outcome, `Found and ran ${action} on ${kind}=${target}.`);
  },
});
/**
 * Tool `browser_navigate`: the chosen action (`back`, `forward`, or `reload`)
 * is itself the command name passed to `runAgentBrowser`.
 */
const browserNavigateTool = tool({
  name: "browser_navigate",
  description: "Navigate within history: 'back', 'forward', or 'reload'.",
  parameters: { action: z.enum(["back", "forward", "reload"]) },
  implementation: async ({ action }, { signal, status }) => {
    status(`Navigating ${action}...`);
    const cliArgs = [action];
    const outcome = await runAgentBrowser(getSettings(), cliArgs, { signal });
    return summarize(outcome, `${action} complete.`);
  },
});
/**
 * Tool `browser_eval`: runs `eval <expression>` and returns the parsed output.
 *
 * - Non-zero exit code: returns the result of `summarize`.
 * - `tryParseJson` yields a non-null value, or the trimmed output is literally
 *   `null`: returns the parsed value.
 * - Otherwise: emits a warning and returns the trimmed raw output.
 *
 * NOTE(review): this relies on `tryParseJson` returning `null` for input it
 * cannot parse — confirm in ./agentBrowser.
 */
const browserEvalTool = tool({
  name: "browser_eval",
  description: text`
Run a JavaScript expression in the page context and return its result. Use for extraction
that the other tools cannot handle (e.g. computing values from many DOM nodes). The
expression should evaluate to JSON-serializable data.
`,
  parameters: { expression: z.string() },
  implementation: async ({ expression }, { signal, status, warn }) => {
    status("Evaluating script...");
    const outcome = await runAgentBrowser(getSettings(), ["eval", expression], { signal });
    if (outcome.exitCode !== 0) {
      return summarize(outcome, "eval failed.");
    }

    const decoded = tryParseJson(outcome.stdout);
    const rawText = outcome.stdout.trim();
    // A null parse result is only trustworthy when the page really printed "null".
    if (decoded !== null || rawText === "null") {
      return decoded;
    }

    warn("eval output was not JSON; returning raw text.");
    return rawText;
  },
});
/** Tool `browser_close`: runs `close`, adding `--all` when `all` is exactly `true`. */
const browserCloseTool = tool({
  name: "browser_close",
  description: "Close the current agent-browser session.",
  parameters: { all: z.boolean().optional() },
  implementation: async ({ all: closeEverything }, { signal, status }) => {
    status("Closing browser...");
    const cliArgs = closeEverything === true ? ["close", "--all"] : ["close"];
    const outcome = await runAgentBrowser(getSettings(), cliArgs, { signal });
    return summarize(outcome, "Browser closed.");
  },
});
// Hand back every tool defined above as a single array.
return [
browserOpenTool,
browserSnapshotTool,
browserScreenshotTool,
browserClickTool,
browserTypeTool,
browserPressTool,
browserGetTextTool,
browserPageInfoTool,
browserWaitTool,
browserFindTool,
browserNavigateTool,
browserEvalTool,
browserCloseTool,
];
}