// src/toolsProvider.ts
import {
text,
tool,
type LLM,
type LLMDynamicHandle,
type LLMInstanceInfo,
type LLMInfo,
type ToolsProviderController,
type Tool,
} from "@lmstudio/sdk";
import { mkdir, stat } from "node:fs/promises";
import { homedir } from "node:os";
import { basename, dirname, extname, isAbsolute, join, resolve } from "node:path";
import sharp from "sharp";
import { z } from "zod";
// Identifier a user-preloaded VLM is expected to be loaded under; ensureVlmModel looks it up first.
const preloadedModelIdentifier = "vlm-tools-vlm";
// Model key used in the recommended preload command below.
const recommendedPreloadModel = "google/gemma-4-26b-a4b";
// CLI command included in the error raised when no VLM could be loaded.
const recommendedPreloadCommand =
`lms load ${recommendedPreloadModel} --identifier ${preloadedModelIdentifier}`;
// Model keys tried in order for the "default" and "gemma" modes; earlier entries are preferred.
const vlmModelPriority = [
"google/gemma-4-26b-a4b",
"google/gemma-4-e4b",
"google/gemma-4-26b-a4b-qat",
"gemma-4-31b-it-mlx",
"google/gemma-3-4b",
] as const;
// Model keys tried first, in order, for the "qwen" mode (fuzzy-ranked Qwen models follow them).
const qwenVlmModelPriority = [
"qwen/qwen3.6-27b",
"lmstudio-community/Qwen3-VL-4B-Instruct-MLX-8bit",
"lmstudio-community/Qwen3-VL-4B-Instruct-MLX-4bit",
"lmstudio-community/Qwen3-VL-8B-Instruct-MLX-4bit",
"lmstudio-community/Qwen3-VL-30B-A3B-Instruct-MLX-4bit",
"lmstudio-community/Qwen3-VL-2B-Instruct-GGUF",
] as const;
// Lower-cased file extensions accepted by resolveImagePath.
const supportedImageExtensions = new Set([".jpg", ".jpeg", ".png", ".webp"]);
// maxTokens passed to the VLM by the analyze_image tool.
const defaultAnalyzeMaxTokens = 1200;
// maxTokens passed to the VLM by the bounding_boxes tool.
const defaultBoundingBoxMaxTokens = 3600;
// Maximum width and height, in pixels, of the image handed to the model for box detection.
const boundingBoxModelInputMaxSide = 2200;
// Directory, joined onto the working directory, under which generated PNGs are written.
const generatedImagesDirectory = ".generated/images";
// Coordinate conventions supported by the bounding_boxes tool.
const boundingBoxModes = ["gemma", "qwen"] as const;
// Union of the mode names listed in boundingBoxModes.
type BoundingBoxMode = typeof boundingBoxModes[number];
// JSON schema for Gemma-mode structured output: an array of { label, box_2d } objects where
// box_2d holds four integers in the 0-1000 range.
const gemmaBoundingBoxesJsonSchema = {
type: "array",
items: {
type: "object",
additionalProperties: false,
properties: {
label: { type: "string" },
box_2d: {
type: "array",
description: "Gemma box coordinates in [y1, x1, y2, x2] order on a 0-1000 grid.",
minItems: 4,
maxItems: 4,
items: { type: "integer", minimum: 0, maximum: 1000 },
},
},
required: ["label", "box_2d"],
},
};
// JSON schema for Qwen-mode structured output: an array of { label, bbox_2d } objects.
// Unlike the Gemma schema, the coordinates have no upper bound because they may be pixels.
const qwenBoundingBoxesJsonSchema = {
type: "array",
items: {
type: "object",
additionalProperties: false,
properties: {
label: { type: "string" },
bbox_2d: {
type: "array",
description: "Qwen box coordinates in [x1, y1, x2, y2] order. Prefer absolute pixels in the provided image; 0-1000 normalized coordinates are also accepted.",
minItems: 4,
maxItems: 4,
items: { type: "integer", minimum: 0 },
},
},
required: ["label", "bbox_2d"],
},
};
/** Result of validating a user-supplied image path. */
interface ResolvedImagePath {
/** Absolute path of the image file. */
absolutePath: string;
}
/** A usable VLM handle together with its metadata and how it was obtained. */
interface VlmModelResolution {
model: LLMDynamicHandle;
modelInfo: LLMInstanceInfo;
/**
 * "loaded_identifier": found under the preloaded identifier;
 * "loaded_compatible": another already-loaded model matching the requested mode;
 * "priority_load": loaded from the candidate list.
 */
source: "loaded_identifier" | "loaded_compatible" | "priority_load";
}
/** Raw box as returned in Gemma mode. */
interface GemmaBoundingBox {
/** Coordinates in [y1, x1, y2, x2] order. */
box_2d: [number, number, number, number];
label: string;
}
/** Raw box as returned in Qwen mode. */
interface QwenBoundingBox {
/** Coordinates in [x1, y1, x2, y2] order. */
bbox_2d: [number, number, number, number];
label: string;
}
/** Rectangle in pixels; passed to sharp's extract() when cropping the model input. */
interface ImageRegion {
height: number;
left: number;
top: number;
width: number;
}
/** Describes the preprocessed image sent to the model and how it maps back to the source image. */
interface BoundingBoxImagePreparation {
/** Path of the generated PNG that is sent to the model. */
modelImagePath: string;
/** Dimensions of the generated model-input PNG. */
modelInputSize: {
height: number;
width: number;
};
/** Dimensions of the source image. */
originalSize: {
height: number;
width: number;
};
/** Region of the source image that the model-input PNG was cropped from. */
sourceRegion: ImageRegion;
}
/** A model box converted back into coordinates of the source image. */
interface ProcessedBoundingBox {
/** [x1, y1, x2, y2] on a 0-1000 grid relative to the source image. */
bbox_2d: [number, number, number, number];
/** [y1, x1, y2, x2] on a 0-1000 grid relative to the source image. */
box_2d: [number, number, number, number];
/** How the model's raw coordinates were interpreted. */
coordinate_format: "normalized_0_1000" | "model_input_pixels";
label: string;
/** [x1, y1, x2, y2] in model-input space (0-1000 grid in Gemma mode, pixels in Qwen mode). */
model_input_bbox_2d: [number, number, number, number];
/** Same box as model_input_bbox_2d in [y1, x1, y2, x2] order. */
model_input_box_2d: [number, number, number, number];
/** Box expressed as fractions of the source image width and height. */
normalized: {
height: number;
width: number;
x: number;
y: number;
};
/** Box expressed in source-image pixels. */
pixels: {
bottom: number;
height: number;
left: number;
right: number;
top: number;
width: number;
};
}
/**
 * Expands a leading `~` into the current user's home directory.
 *
 * Only the bare `~` and the `~/...` forms are expanded; every other input
 * (including `~user/...`) is returned unchanged.
 *
 * @param path - Path that may start with `~`.
 * @returns The expanded path, or `path` itself when no expansion applies.
 */
function expandHomePath(path: string): string {
  const isBareHome = path === "~";
  const isHomeRelative = path.startsWith("~/");
  if (!isBareHome && !isHomeRelative) {
    return path;
  }
  // Skip the "~/" prefix so the remainder resolves relative to the home directory.
  return isBareHome ? homedir() : resolve(homedir(), path.slice(2));
}
/**
 * Validates a user-supplied image path and turns it into an absolute path.
 *
 * @param imagePath - Raw path; surrounding whitespace is ignored and a leading `~` is expanded.
 * @param workingDirectory - Base directory for relative paths.
 * @returns The absolute path of an existing file with a supported image extension.
 * @throws Error when the path is empty, has an unsupported extension, or is not a regular file.
 *   Errors from `stat` (for example a missing file) propagate unchanged.
 */
async function resolveImagePath(imagePath: string, workingDirectory: string): Promise<ResolvedImagePath> {
  const candidate = imagePath.trim();
  if (candidate.length === 0) {
    throw new Error("image_path must not be empty.");
  }

  const expanded = expandHomePath(candidate);
  let absolutePath: string;
  if (isAbsolute(expanded)) {
    absolutePath = resolve(expanded);
  } else {
    absolutePath = resolve(workingDirectory, expanded);
  }

  // The extension is checked before touching the filesystem so unsupported types fail fast.
  const hasSupportedExtension = supportedImageExtensions.has(extname(absolutePath).toLowerCase());
  if (!hasSupportedExtension) {
    throw new Error("image_path must point to a JPEG, PNG, or WebP image.");
  }

  const fileStats = await stat(absolutePath);
  if (!fileStats.isFile()) {
    throw new Error("image_path must point to a file.");
  }
  return { absolutePath };
}
/**
 * Projects model metadata onto the snake_case summary object returned by the tools.
 *
 * @param modelInfo - Metadata of the model instance that handled the request.
 * @returns A plain object; `architecture` is `null` when the model does not report one.
 */
function formatModelInfo(modelInfo: LLMInstanceInfo) {
  const {
    architecture,
    contextLength,
    displayName,
    identifier,
    maxContextLength,
    modelKey,
    path,
  } = modelInfo;
  return {
    architecture: architecture ?? null,
    display_name: displayName,
    identifier,
    context_length: contextLength,
    max_context_length: maxContextLength,
    model_key: modelKey,
    path,
  };
}
/**
 * Picks the name shown in status messages: the display name when present,
 * otherwise the identifier, otherwise the model key.
 *
 * @param modelInfo - Metadata of the model instance.
 * @returns The first of those names that is neither `null` nor `undefined`.
 */
function formatModelStatusName(modelInfo: LLMInstanceInfo): string {
  const { displayName, identifier, modelKey } = modelInfo;
  if (displayName !== undefined && displayName !== null) {
    return displayName;
  }
  if (identifier !== undefined && identifier !== null) {
    return identifier;
  }
  return modelKey;
}
/**
 * Builds the output path for a generated PNG derived from a source image.
 *
 * The file name is `<stem>-<suffix>-<timestamp>.png`, where the stem is the source file name
 * without its extension (runs of characters outside `a-z0-9._-` collapsed to `-`, falling back
 * to `image`) and the timestamp is the current ISO time with `:` and `.` replaced by `-`.
 *
 * @param workingDirectory - Base directory of the generated-images tree.
 * @param directoryName - Sub-directory below the generated-images directory.
 * @param imagePath - Source image the output is derived from.
 * @param suffix - Tag inserted between the stem and the timestamp.
 * @returns The joined output path; nothing is created on disk.
 */
function getTimestampedPath(workingDirectory: string, directoryName: string, imagePath: string, suffix: string): string {
  const rawStem = basename(imagePath, extname(imagePath));
  const safeStem = rawStem.replace(/[^a-z0-9._-]+/giu, "-");
  const stem = safeStem === "" ? "image" : safeStem;
  const timestamp = new Date().toISOString().replace(/[:.]/gu, "-");
  const fileName = `${stem}-${suffix}-${timestamp}.png`;
  return join(workingDirectory, generatedImagesDirectory, directoryName, fileName);
}
/**
 * Looks for a model that is already loaded under the preloaded identifier.
 *
 * @param ctl - Tools provider controller giving access to the LM Studio client.
 * @returns The model resolution, or `null` when the model info cannot be fetched
 *   (any error from `getModelInfo` is treated as "not loaded").
 */
async function getPreloadedVlmModel(ctl: ToolsProviderController): Promise<VlmModelResolution | null> {
  const handle = ctl.client.llm.createDynamicHandle(preloadedModelIdentifier);
  let info: LLMInstanceInfo | undefined;
  try {
    info = await handle.getModelInfo();
  } catch {
    info = undefined;
  }
  if (info === undefined) {
    return null;
  }
  return { model: handle, modelInfo: info, source: "loaded_identifier" };
}
/**
 * Lists the downloaded LLMs and indexes them by model key.
 *
 * @param ctl - Tools provider controller giving access to the LM Studio client.
 * @returns A map from model key to model info, or `null` when the listing fails.
 */
async function getDownloadedModelsByKey(ctl: ToolsProviderController): Promise<Map<string, LLMInfo> | null> {
  const modelsByKey = new Map<string, LLMInfo>();
  try {
    const downloaded = await ctl.client.system.listDownloadedModels("llm");
    for (const entry of downloaded) {
      // Later entries overwrite earlier ones with the same key, as with Map construction.
      modelsByKey.set(entry.modelKey, entry);
    }
  } catch {
    return null;
  }
  return modelsByKey;
}
/**
 * Builds one lower-cased string from the descriptive fields of a model so that
 * family, size and format hints can be searched with plain substring checks.
 *
 * @param model - Downloaded or loaded model metadata.
 * @returns The space-joined, lower-cased fields.
 */
function getModelSearchText(model: LLMInfo | LLMInstanceInfo): string {
  return [
    model.modelKey,
    model.displayName,
    "identifier" in model ? model.identifier : "",
    model.path,
    model.architecture ?? "",
    model.paramsString ?? "",
    model.format,
  ].join(" ").toLowerCase();
}
/**
 * Scores how well a model suits Qwen bounding-box mode; lower is better.
 *
 * Starting from 100, the score is reduced for the qwen3.6 family, MLX builds,
 * the parameter size and instruct variants, and increased for GGUF builds.
 *
 * Parameter sizes are matched as whole tokens. A plain substring test would read
 * the quantization suffix "8bit" as an 8B model, "4bit" as a 4B model, and "32b"
 * as a 2B model, mis-ordering the candidates.
 *
 * @param model - Downloaded or loaded model metadata.
 * @returns The rank, or `null` when the model has no vision support or is not a Qwen model.
 */
function getQwenModelRank(model: LLMInfo | LLMInstanceInfo): number | null {
  if (!model.vision) {
    return null;
  }
  const searchText = getModelSearchText(model);
  if (!searchText.includes("qwen")) {
    return null;
  }
  // A size token must not be glued to other letters, digits or a decimal point on
  // either side, so "8b" matches "-8b-" but neither "8bit" nor "28b" nor "1.8b".
  const hasSizeToken = (size: string): boolean =>
    new RegExp(`(?<![a-z0-9.])${size}(?![a-z0-9])`, "u").test(searchText);
  let rank = 100;
  if (searchText.includes("qwen3.6")) {
    rank -= 40;
  }
  if (searchText.includes("mlx")) {
    rank -= 20;
  }
  if (hasSizeToken("27b")) {
    rank -= 18;
  } else if (hasSizeToken("30b")) {
    rank -= 16;
  } else if (hasSizeToken("8b")) {
    rank -= 8;
  } else if (hasSizeToken("4b")) {
    rank -= 4;
  } else if (hasSizeToken("2b")) {
    rank -= 2;
  }
  if (searchText.includes("instruct")) {
    rank -= 4;
  }
  if (searchText.includes("gguf")) {
    rank += 6;
  }
  return rank;
}
/**
 * Scores how well a model suits Gemma bounding-box mode; lower is better.
 *
 * Models whose key appears in the priority list rank by their position in that list.
 * Other Gemma vision models start at 100 and are rewarded for the Gemma 4 family,
 * the parameter size and MLX builds.
 *
 * Parameter sizes are matched as whole tokens so that a quantization suffix such as
 * "4bit" is not mistaken for a 4B model.
 *
 * @param model - Downloaded or loaded model metadata.
 * @returns The rank, or `null` when the model has no vision support or is not a Gemma model.
 */
function getGemmaModelRank(model: LLMInfo | LLMInstanceInfo): number | null {
  if (!model.vision) {
    return null;
  }
  const searchText = getModelSearchText(model);
  if (!searchText.includes("gemma")) {
    return null;
  }
  const exactPriorityIndex = vlmModelPriority.findIndex(modelKey => model.modelKey === modelKey);
  if (exactPriorityIndex !== -1) {
    return exactPriorityIndex;
  }
  // A size token must not be glued to other letters, digits or a decimal point on
  // either side, so "4b" matches "-4b" but neither "4bit" nor "24b".
  const hasSizeToken = (size: string): boolean =>
    new RegExp(`(?<![a-z0-9.])${size}(?![a-z0-9])`, "u").test(searchText);
  let rank = 100;
  if (searchText.includes("gemma-4") || searchText.includes("gemma4")) {
    rank -= 30;
  }
  if (hasSizeToken("26b")) {
    rank -= 20;
  } else if (hasSizeToken("31b")) {
    rank -= 18;
  } else if (hasSizeToken("4b") || hasSizeToken("e4b")) {
    // "e4b" is listed separately because its leading letter fails the plain "4b" token test.
    rank -= 8;
  }
  if (searchText.includes("mlx")) {
    rank -= 4;
  }
  return rank;
}
/**
 * Ranks a model with the scorer that belongs to the requested bounding-box mode.
 *
 * @param model - Downloaded or loaded model metadata.
 * @param preferredMode - Mode whose scorer should be used.
 * @returns The mode-specific rank, or `null` when the model does not fit the mode.
 */
function getModelRankForMode(
  model: LLMInfo | LLMInstanceInfo,
  preferredMode: BoundingBoxMode,
): number | null {
  if (preferredMode === "qwen") {
    return getQwenModelRank(model);
  }
  return getGemmaModelRank(model);
}
/**
 * Names the bounding-box mode a model appears to belong to.
 *
 * Qwen is checked before Gemma, so a model accepted by both scorers is labelled "qwen".
 *
 * @param model - Downloaded or loaded model metadata.
 * @returns "qwen", "gemma", or "unknown" when neither scorer accepts the model.
 */
function getModelModeLabel(model: LLMInfo | LLMInstanceInfo): BoundingBoxMode | "unknown" {
  const looksLikeQwen = getQwenModelRank(model) !== null;
  if (looksLikeQwen) {
    return "qwen";
  }
  return getGemmaModelRank(model) !== null ? "gemma" : "unknown";
}
/**
 * Searches the already-loaded models for the best fit for a bounding-box mode.
 *
 * Candidates are ordered by ascending rank; equal ranks prefer the larger model.
 *
 * @param ctl - Tools provider controller giving access to the LM Studio client.
 * @param preferredMode - Requested mode; "default" never matches a loaded model here.
 * @returns The best loaded model, or `null` when the mode is "default", the loaded
 *   models cannot be listed, or none of them fits the mode.
 */
async function getLoadedCompatibleVlmModel(
  ctl: ToolsProviderController,
  preferredMode: BoundingBoxMode | "default",
): Promise<VlmModelResolution | null> {
  if (preferredMode === "default") {
    return null;
  }

  let loadedModels: LLM[];
  try {
    loadedModels = await ctl.client.llm.listLoaded();
  } catch {
    return null;
  }

  // Fetch all model infos concurrently; a model whose info cannot be read is skipped.
  const inspected = await Promise.all(
    loadedModels.map(async model => ({
      model,
      modelInfo: await model.getModelInfo().catch(() => null),
    })),
  );

  const ranked: { model: LLM; modelInfo: LLMInstanceInfo; rank: number }[] = [];
  for (const { model, modelInfo } of inspected) {
    if (modelInfo === null) {
      continue;
    }
    const rank = getModelRankForMode(modelInfo, preferredMode);
    if (rank !== null) {
      ranked.push({ model, modelInfo, rank });
    }
  }

  ranked.sort((left, right) =>
    left.rank - right.rank || right.modelInfo.sizeBytes - left.modelInfo.sizeBytes);
  const winner = ranked[0];
  if (winner === undefined) {
    return null;
  }
  return { model: winner.model, modelInfo: winner.modelInfo, source: "loaded_compatible" };
}
/**
 * Produces the ordered list of model keys to try loading for a mode.
 *
 * When the downloaded models are unknown, the whole hard-coded priority list is returned.
 * Otherwise only downloaded priority entries are kept. In Qwen mode, every other
 * downloaded Qwen vision model follows, ordered by rank and then by ascending size.
 *
 * @param downloadedModelsByKey - Downloaded models by key, or `null` when they could not be listed.
 * @param preferredMode - Requested mode; "default" uses the Gemma priority list.
 * @returns Model keys in the order they should be attempted, without duplicates.
 */
function getModelCandidates(
  downloadedModelsByKey: Map<string, LLMInfo> | null,
  preferredMode: BoundingBoxMode | "default",
): string[] {
  const wantsQwen = preferredMode === "qwen";
  const priority: readonly string[] = wantsQwen ? qwenVlmModelPriority : vlmModelPriority;
  if (downloadedModelsByKey === null) {
    return [...priority];
  }

  const exactCandidates = priority.filter(modelKey => downloadedModelsByKey.has(modelKey));
  if (!wantsQwen) {
    return exactCandidates;
  }

  const rankedQwenModels: { model: LLMInfo; rank: number }[] = [];
  for (const model of downloadedModelsByKey.values()) {
    const rank = getQwenModelRank(model);
    if (rank !== null) {
      rankedQwenModels.push({ model, rank });
    }
  }
  rankedQwenModels.sort((left, right) =>
    left.rank - right.rank || left.model.sizeBytes - right.model.sizeBytes);

  // A Set keeps first-insertion order, so exact priority matches stay ahead of fuzzy ones.
  const orderedKeys = new Set<string>(exactCandidates);
  for (const { model } of rankedQwenModels) {
    orderedKeys.add(model.modelKey);
  }
  return [...orderedKeys];
}
/**
 * Describes the downloaded models whose metadata mentions "qwen", for error messages.
 *
 * @param downloadedModelsByKey - Downloaded models by key, or `null` when they could not be listed.
 * @returns An empty string when the models are unknown, otherwise a bulleted summary
 *   (or a "none" note) of the Qwen-like models.
 */
function getDownloadedQwenSummary(downloadedModelsByKey: Map<string, LLMInfo> | null): string {
  if (downloadedModelsByKey === null) {
    return "";
  }
  const bulletLines: string[] = [];
  for (const model of downloadedModelsByKey.values()) {
    if (getModelSearchText(model).includes("qwen")) {
      bulletLines.push(`- ${model.modelKey} (${model.displayName}, vision=${model.vision})`);
    }
  }
  return bulletLines.length === 0
    ? "Downloaded Qwen-like models: none."
    : `Downloaded Qwen-like models:\n${bulletLines.join("\n")}`;
}
/**
 * Resolves a vision-language model to use, loading one if necessary.
 *
 * Resolution order:
 * 1. the model loaded under the preloaded identifier, when it fits the requested mode
 *    (always accepted in "default" mode);
 * 2. any other already-loaded model that fits the requested mode;
 * 3. the candidate list for the mode, loaded one by one until a load succeeds.
 *
 * @param ctl - Tools provider controller giving access to the LM Studio client.
 * @param signal - Abort signal of the tool call; an abort is rethrown, never reported as a load failure.
 * @param status - Callback used to report progress to the user.
 * @param preferredMode - Bounding-box mode the model must fit; "default" accepts the preloaded model as is.
 * @returns The resolved model, its info, and how it was obtained.
 * @throws The abort reason when `signal` is aborted; otherwise an Error listing every failed
 *   load attempt when no candidate could be loaded.
 */
async function ensureVlmModel(
  ctl: ToolsProviderController,
  signal: AbortSignal,
  status: (statusText: string) => void,
  preferredMode: BoundingBoxMode | "default" = "default",
): Promise<VlmModelResolution> {
  const preloadedModel = await getPreloadedVlmModel(ctl);
  if (preloadedModel !== null) {
    if (
      preferredMode === "default" ||
      getModelRankForMode(preloadedModel.modelInfo, preferredMode) !== null
    ) {
      status(`Using preloaded VLM ${preloadedModel.modelInfo.identifier}`);
      return preloadedModel;
    }
    status(
      `Ignoring preloaded VLM ${preloadedModel.modelInfo.identifier} because it looks like ${getModelModeLabel(preloadedModel.modelInfo)}, not ${preferredMode}`,
    );
  }
  const loadedCompatibleModel = await getLoadedCompatibleVlmModel(ctl, preferredMode);
  if (loadedCompatibleModel !== null) {
    status(`Using already-loaded VLM ${loadedCompatibleModel.modelInfo.identifier}`);
    return loadedCompatibleModel;
  }
  const downloadedModelsByKey = await getDownloadedModelsByKey(ctl);
  const candidates = getModelCandidates(downloadedModelsByKey, preferredMode);
  const errors: string[] = [];
  for (const modelKey of candidates) {
    signal.throwIfAborted();
    try {
      // Request the model's full context window when it is known.
      const maxContextLength = downloadedModelsByKey?.get(modelKey)?.maxContextLength;
      status(maxContextLength === undefined
        ? `Loading VLM ${modelKey}`
        : `Loading VLM ${modelKey} with ${maxContextLength} context tokens`);
      const model = await ctl.client.llm.model(modelKey, {
        config: maxContextLength === undefined ? undefined : {
          contextLength: maxContextLength,
        },
        signal,
        onProgress: progress => status(`Loading VLM ${modelKey} (${Math.round(progress * 100)}%)`),
      });
      const modelInfo = await model.getModelInfo();
      return {
        model,
        modelInfo,
        source: "priority_load",
      };
    } catch (error) {
      // A cancelled load is not a load failure. Without this check an abort during the
      // last candidate would surface as "No VLM model could be loaded".
      signal.throwIfAborted();
      const message = error instanceof Error ? error.message : String(error);
      errors.push(`${modelKey}: ${message}`);
    }
  }
  const downloadedNote = downloadedModelsByKey === null
    ? "Could not inspect downloaded models; tried the hardcoded priority list."
    : `Downloaded priority candidates found: ${candidates.length}.`;
  throw new Error(text`
No VLM model could be loaded. ${downloadedNote}
Recommended preload command:
${recommendedPreloadCommand}
Load attempts:
${errors.length === 0 ? "No priority candidates were installed." : errors.join("\n")}
${preferredMode === "qwen" ? getDownloadedQwenSummary(downloadedModelsByKey) : ""}
`);
}
/**
 * Runs one prompt against an image with a resolved VLM.
 *
 * Steps, in order: validate the image path, resolve a model, optionally preprocess the
 * image, upload the (preprocessed) image, then request a response.
 *
 * @param options.ctl - Tools provider controller.
 * @param options.actionDescription - Verb phrase used in the "Using <model> to ..." status.
 * @param options.imagePath - User-supplied image path.
 * @param options.imagePreprocessor - Optional step producing the image that is sent to the model.
 * @param options.maxTokens - Response token limit.
 * @param options.modelMode - Bounding-box mode the model must fit, if any.
 * @param options.prompt - Prompt text sent with the image.
 * @param options.signal - Abort signal of the tool call.
 * @param options.status - Callback used to report progress.
 * @param options.structured - Optional structured-output request.
 * @param options.temperature - Sampling temperature.
 * @returns The absolute image path, preprocessing details (if any), model info and source,
 *   and the model result.
 */
async function runImagePrompt(options: {
  ctl: ToolsProviderController;
  actionDescription: string;
  imagePath: string;
  imagePreprocessor?: (absoluteImagePath: string, workingDirectory: string) => Promise<BoundingBoxImagePreparation>;
  maxTokens: number;
  modelMode?: BoundingBoxMode | "default";
  prompt: string;
  signal: AbortSignal;
  status: (statusText: string) => void;
  structured?: { type: "json"; jsonSchema: unknown };
  temperature: number;
}) {
  const {
    ctl,
    actionDescription,
    imagePath,
    imagePreprocessor,
    maxTokens,
    modelMode,
    prompt,
    signal,
    status,
    structured,
    temperature,
  } = options;

  const { absolutePath } = await resolveImagePath(imagePath, ctl.getWorkingDirectory());
  const resolution = await ensureVlmModel(ctl, signal, status, modelMode);
  status(`Using ${formatModelStatusName(resolution.modelInfo)} to ${actionDescription}`);

  let imagePreparation: BoundingBoxImagePreparation | undefined;
  if (imagePreprocessor !== undefined) {
    imagePreparation = await imagePreprocessor(absolutePath, ctl.getWorkingDirectory());
  }

  status("Preparing image");
  // The model sees the preprocessed image when one exists, otherwise the original file.
  const uploadPath = imagePreparation?.modelImagePath ?? absolutePath;
  const image = await ctl.client.files.prepareImage(uploadPath);
  signal.throwIfAborted();

  status("Running VLM");
  const messages = [{ role: "user", content: prompt, images: [image] }];
  const result = await resolution.model.respond(messages, {
    maxTokens,
    signal,
    structured,
    temperature,
  });

  return {
    absoluteImagePath: absolutePath,
    imagePreparation,
    modelInfo: resolution.modelInfo,
    modelSource: resolution.source,
    result,
  };
}
/**
 * Parses the model's structured output as JSON.
 *
 * @param content - Raw text returned by the model.
 * @returns The parsed value, left as `unknown` for the caller to validate.
 * @throws Error carrying the parser message and the raw content when it is not valid JSON.
 */
function parseStructuredJson(content: string): unknown {
  let parsed: unknown;
  try {
    parsed = JSON.parse(content);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`The VLM returned invalid JSON despite structured output: ${reason}\n${content}`);
  }
  return parsed;
}
/**
 * Converts backslash path separators to forward slashes.
 *
 * @param path - Path that may use Windows separators.
 * @returns The path with every backslash replaced by `/`.
 */
function toForwardSlashPath(path: string): string {
  return path.replace(/\\/gu, "/");
}
/**
 * Builds a markdown image link (`![alt](path)`) for a file on disk.
 *
 * The alt text has `]` and line breaks replaced by spaces so it cannot end the
 * bracket early, and falls back to "annotated image" when empty. Paths containing
 * whitespace, parentheses or angle brackets are wrapped in `<...>`, with any angle
 * brackets inside percent-encoded so the destination stays well-formed.
 *
 * @param altText - Alternative text for the image.
 * @param absolutePath - Absolute path of the image file.
 * @returns The markdown image link.
 */
function buildMarkdownImageLink(altText: string, absolutePath: string): string {
  const safeAltText = altText.replace(/[\]\r\n]/gu, " ").trim() || "annotated image";
  const markdownPath = toForwardSlashPath(absolutePath);
  const linkPath = /[\s()<>]/u.test(markdownPath)
    ? `<${markdownPath.replace(/</gu, "%3C").replace(/>/gu, "%3E")}>`
    : markdownPath;
  return `![${safeAltText}](${linkPath})`;
}
/**
 * Escapes text for safe inclusion in XML/SVG content and double-quoted attributes.
 *
 * `&` is replaced first so the ampersands introduced by the other entities are
 * not escaped a second time.
 *
 * @param value - Raw text.
 * @returns The text with `&`, `<`, `>` and `"` replaced by their XML entities.
 */
function escapeXml(value: string): string {
  return value
    .replace(/&/gu, "&amp;")
    .replace(/</gu, "&lt;")
    .replace(/>/gu, "&gt;")
    .replace(/"/gu, "&quot;");
}
/**
 * Cleans a model-provided box label for display.
 *
 * Everything from the first `<|` or `<channel` marker onward is dropped, line breaks
 * and tabs become spaces, whitespace runs collapse to one space, and the result is
 * trimmed and limited to 120 characters.
 *
 * @param label - Raw label text from the model.
 * @returns The cleaned label, or "object" when nothing remains.
 */
function sanitizeBoxLabel(label: string): string {
  const markerStart = label.search(/<\||<channel/iu);
  const visiblePart = markerStart === -1 ? label : label.slice(0, markerStart);
  const singleLine = visiblePart.replace(/[\r\n\t]+/gu, " ");
  const collapsed = singleLine.replace(/\s{2,}/gu, " ").trim();
  if (collapsed.length === 0) {
    return "object";
  }
  return collapsed.slice(0, 120);
}
/**
 * Checks whether the top-left pixel of the auto-rotated image looks like an outer matte:
 * either nearly transparent (alpha <= 16) or nearly black (red, green and blue <= 24).
 *
 * @param imagePath - Path of the image to inspect.
 * @returns `true` when the corner pixel is nearly transparent or nearly black; `false`
 *   otherwise, including when fewer than four channels are returned.
 */
async function hasTrimmableTopLeftPixel(imagePath: string): Promise<boolean> {
  const cornerPixel = sharp(imagePath)
    .rotate()
    .ensureAlpha()
    .extract({ height: 1, left: 0, top: 0, width: 1 })
    .raw();
  const { data, info } = await cornerPixel.toBuffer({ resolveWithObject: true });
  if (info.channels < 4) {
    return false;
  }
  const [red, green, blue, alpha] = data;
  const isNearlyTransparent = alpha <= 16;
  const isNearlyBlack = red <= 24 && green <= 24 && blue <= 24;
  return isNearlyTransparent || (alpha > 16 && isNearlyBlack);
}
/**
 * Finds the region left after trimming an outer matte from the image.
 *
 * The whole image is returned when the top-left pixel does not look like a matte, when
 * trimming changes nothing, when the trimmed region is empty, or when it would keep
 * less than a quarter of the original area.
 *
 * @param imagePath - Path of the image to inspect.
 * @param originalWidth - Width of the image in pixels.
 * @param originalHeight - Height of the image in pixels.
 * @returns The region to keep, in pixels.
 */
async function getOuterMatteTrimRegion(
  imagePath: string,
  originalWidth: number,
  originalHeight: number,
): Promise<ImageRegion> {
  const wholeImage: ImageRegion = {
    height: originalHeight,
    left: 0,
    top: 0,
    width: originalWidth,
  };
  if (!(await hasTrimmableTopLeftPixel(imagePath))) {
    return wholeImage;
  }

  const { info: trimInfo } = await sharp(imagePath)
    .rotate()
    .flatten({ background: "#ffffff" })
    .trim({ threshold: 12 })
    .png()
    .toBuffer({ resolveWithObject: true });

  // The absolute value makes the offsets usable whichever sign sharp reports them with.
  const left = Math.max(0, Math.round(Math.abs(trimInfo.trimOffsetLeft ?? 0)));
  const top = Math.max(0, Math.round(Math.abs(trimInfo.trimOffsetTop ?? 0)));
  const width = Math.min(originalWidth - left, trimInfo.width);
  const height = Math.min(originalHeight - top, trimInfo.height);

  const isEmpty = width <= 0 || height <= 0;
  const keepsTooLittle = (width * height) / (originalWidth * originalHeight) < 0.25;
  const isUnchanged =
    left === 0 && top === 0 && width === originalWidth && height === originalHeight;
  if (isEmpty || keepsTooLittle || isUnchanged) {
    return wholeImage;
  }
  return { height, left, top, width };
}
/**
 * Writes the PNG that is sent to the model for bounding-box detection.
 *
 * The source image is auto-rotated, cropped to the region left after matte trimming,
 * flattened onto white, and shrunk (never enlarged) to fit the maximum model-input side.
 *
 * `originalSize` is reported in the auto-rotated orientation, because every later
 * pipeline step and every returned box coordinate lives in that orientation.
 *
 * @param imagePath - Absolute path of the source image.
 * @param workingDirectory - Base directory under which the generated PNG is written.
 * @returns Path and size of the model-input image plus the source size and crop region.
 * @throws Error when the dimensions of the source or the generated image cannot be read.
 */
async function prepareBoundingBoxModelImage(
  imagePath: string,
  workingDirectory: string,
): Promise<BoundingBoxImagePreparation> {
  const metadata = await sharp(imagePath).metadata();
  if (metadata.width === undefined || metadata.height === undefined) {
    throw new Error("Could not read image dimensions for VLM preprocessing.");
  }
  // metadata() reports the stored dimensions and ignores EXIF orientation, while the
  // pipelines below call rotate(). Orientations 5-8 turn the image by 90 degrees, so
  // width and height trade places once it is auto-rotated.
  const swapsAxes = (metadata.orientation ?? 1) >= 5;
  const originalSize = swapsAxes
    ? { height: metadata.width, width: metadata.height }
    : { height: metadata.height, width: metadata.width };
  const sourceRegion = await getOuterMatteTrimRegion(imagePath, originalSize.width, originalSize.height);
  const modelImagePath = getTimestampedPath(workingDirectory, "inputs", imagePath, "model-input");
  await mkdir(dirname(modelImagePath), { recursive: true });
  await sharp(imagePath)
    .rotate()
    .extract(sourceRegion)
    .flatten({ background: "#ffffff" })
    .resize({
      fit: "inside",
      height: boundingBoxModelInputMaxSide,
      width: boundingBoxModelInputMaxSide,
      withoutEnlargement: true,
    })
    .png()
    .toFile(modelImagePath);
  // Read the size back from disk so it reflects what the resize actually produced.
  const modelInputMetadata = await sharp(modelImagePath).metadata();
  if (modelInputMetadata.width === undefined || modelInputMetadata.height === undefined) {
    throw new Error("Could not read normalized model-input image dimensions.");
  }
  return {
    modelImagePath,
    modelInputSize: {
      height: modelInputMetadata.height,
      width: modelInputMetadata.width,
    },
    originalSize,
    sourceRegion,
  };
}
/**
 * Validates parsed structured output as a list of Gemma boxes.
 *
 * Accepts either a bare array or an object with a `boxes` array. Each entry needs a
 * non-empty string label and a `box_2d` array of four finite numbers.
 *
 * @param parsedJson - Value produced by parsing the model's JSON output.
 * @returns The boxes with sanitized labels.
 * @throws Error naming the first entry (1-based) that fails validation, or when the
 *   input is not a box array.
 */
function asGemmaBoundingBoxes(parsedJson: unknown): GemmaBoundingBox[] {
  let entries: unknown[] | null = null;
  if (Array.isArray(parsedJson)) {
    entries = parsedJson;
  } else if (typeof parsedJson === "object" && parsedJson !== null) {
    const wrapped = (parsedJson as { boxes?: unknown }).boxes;
    if (Array.isArray(wrapped)) {
      entries = wrapped;
    }
  }
  if (entries === null) {
    throw new Error("Structured output must be a Gemma bounding-box array.");
  }

  return entries.map((entry, position) => {
    const ordinal = position + 1;
    if (typeof entry !== "object" || entry === null) {
      throw new Error(`Box ${ordinal} must be an object.`);
    }
    const { box_2d: coordinates, label } = entry as { box_2d?: unknown; label?: unknown };
    if (typeof label !== "string" || label.trim().length === 0) {
      throw new Error(`Box ${ordinal} must have a non-empty label.`);
    }
    const isCoordinateQuad =
      Array.isArray(coordinates) &&
      coordinates.length === 4 &&
      coordinates.every(value => typeof value === "number" && Number.isFinite(value));
    if (!isCoordinateQuad) {
      throw new Error(`Box ${ordinal} must have box_2d as [y1, x1, y2, x2].`);
    }
    return {
      box_2d: coordinates as [number, number, number, number],
      label: sanitizeBoxLabel(label),
    };
  });
}
/**
 * Validates parsed structured output as a list of Qwen boxes.
 *
 * Accepts either a bare array or an object with a `boxes` array. Each entry needs a
 * non-empty string label and a `bbox_2d` array of four finite numbers.
 *
 * @param parsedJson - Value produced by parsing the model's JSON output.
 * @returns The boxes with sanitized labels.
 * @throws Error naming the first entry (1-based) that fails validation, or when the
 *   input is not a box array.
 */
function asQwenBoundingBoxes(parsedJson: unknown): QwenBoundingBox[] {
  let entries: unknown[] | null = null;
  if (Array.isArray(parsedJson)) {
    entries = parsedJson;
  } else if (typeof parsedJson === "object" && parsedJson !== null) {
    const wrapped = (parsedJson as { boxes?: unknown }).boxes;
    if (Array.isArray(wrapped)) {
      entries = wrapped;
    }
  }
  if (entries === null) {
    throw new Error("Structured output must be a Qwen bounding-box array.");
  }

  return entries.map((entry, position) => {
    const ordinal = position + 1;
    if (typeof entry !== "object" || entry === null) {
      throw new Error(`Box ${ordinal} must be an object.`);
    }
    const { bbox_2d: coordinates, label } = entry as { bbox_2d?: unknown; label?: unknown };
    if (typeof label !== "string" || label.trim().length === 0) {
      throw new Error(`Box ${ordinal} must have a non-empty label.`);
    }
    const isCoordinateQuad =
      Array.isArray(coordinates) &&
      coordinates.length === 4 &&
      coordinates.every(value => typeof value === "number" && Number.isFinite(value));
    if (!isCoordinateQuad) {
      throw new Error(`Box ${ordinal} must have bbox_2d as [x1, y1, x2, y2].`);
    }
    return {
      bbox_2d: coordinates as [number, number, number, number],
      label: sanitizeBoxLabel(label),
    };
  });
}
/**
 * Rounds a coordinate to the nearest integer and limits it to the 0-1000 grid.
 *
 * @param value - Coordinate on (or near) the 0-1000 grid.
 * @returns The rounded value clamped to [0, 1000]; `NaN` stays `NaN`.
 */
function clampGemmaCoordinate(value: number): number {
  const rounded = Math.round(value);
  // "<=" also maps negative zero to a plain 0.
  if (rounded <= 0) {
    return 0;
  }
  if (rounded > 1000) {
    return 1000;
  }
  return rounded;
}
/**
 * Converts a Gemma box (0-1000 grid over the model-input image) into coordinates of the
 * source image.
 *
 * Corners are clamped to the grid and reordered so the first corner is top-left, mapped
 * through the crop region into source pixels, and re-expressed on a 0-1000 grid and as
 * fractions of the source image.
 *
 * @param box - Validated Gemma box in [y1, x1, y2, x2] order.
 * @param imageWidth - Width of the source image in pixels.
 * @param imageHeight - Height of the source image in pixels.
 * @param sourceRegion - Region of the source image that the model saw.
 * @returns The box in every supported coordinate representation.
 */
function processGemmaBox(
  box: GemmaBoundingBox,
  imageWidth: number,
  imageHeight: number,
  sourceRegion: ImageRegion,
): ProcessedBoundingBox {
  const [firstY, firstX, secondY, secondX] = box.box_2d;
  const gridYa = clampGemmaCoordinate(firstY);
  const gridXa = clampGemmaCoordinate(firstX);
  const gridYb = clampGemmaCoordinate(secondY);
  const gridXb = clampGemmaCoordinate(secondX);

  // The model may return corners in either order; normalize to top-left / bottom-right.
  const y1 = Math.min(gridYa, gridYb);
  const y2 = Math.max(gridYa, gridYb);
  const x1 = Math.min(gridXa, gridXb);
  const x2 = Math.max(gridXa, gridXb);

  const toSourceX = (gridX: number): number =>
    Math.round(sourceRegion.left + (gridX / 1000) * sourceRegion.width);
  const toSourceY = (gridY: number): number =>
    Math.round(sourceRegion.top + (gridY / 1000) * sourceRegion.height);
  const left = toSourceX(x1);
  const top = toSourceY(y1);
  const right = toSourceX(x2);
  const bottom = toSourceY(y2);

  const originalY1 = clampGemmaCoordinate((top / imageHeight) * 1000);
  const originalX1 = clampGemmaCoordinate((left / imageWidth) * 1000);
  const originalY2 = clampGemmaCoordinate((bottom / imageHeight) * 1000);
  const originalX2 = clampGemmaCoordinate((right / imageWidth) * 1000);

  const pixelWidth = Math.max(0, right - left);
  const pixelHeight = Math.max(0, bottom - top);

  return {
    box_2d: [originalY1, originalX1, originalY2, originalX2],
    bbox_2d: [originalX1, originalY1, originalX2, originalY2],
    coordinate_format: "normalized_0_1000",
    label: box.label,
    model_input_box_2d: [y1, x1, y2, x2],
    model_input_bbox_2d: [x1, y1, x2, y2],
    normalized: {
      height: pixelHeight / imageHeight,
      width: pixelWidth / imageWidth,
      x: left / imageWidth,
      y: top / imageHeight,
    },
    pixels: {
      bottom,
      height: pixelHeight,
      left,
      right,
      top,
      width: pixelWidth,
    },
  };
}
/**
 * Converts a Qwen box into coordinates of the source image.
 *
 * The raw coordinates are read as model-input pixels when they all fit inside the
 * model-input image, unless they also fit a 0-1000 grid while the model input is larger
 * than 1000 pixels on some side; in that case, and whenever they do not fit the image,
 * they are read as a 0-1000 grid.
 *
 * @param box - Validated Qwen box in [x1, y1, x2, y2] order.
 * @param imageWidth - Width of the source image in pixels.
 * @param imageHeight - Height of the source image in pixels.
 * @param sourceRegion - Region of the source image that the model saw.
 * @param modelInputSize - Size of the image that was sent to the model.
 * @returns The box in every supported coordinate representation.
 */
function processQwenBox(
  box: QwenBoundingBox,
  imageWidth: number,
  imageHeight: number,
  sourceRegion: ImageRegion,
  modelInputSize: { height: number; width: number },
): ProcessedBoundingBox {
  const [rawX1, rawY1, rawX2, rawY2] = box.bbox_2d;
  const { width: inputWidth, height: inputHeight } = modelInputSize;

  const fitsModelInput =
    rawX1 <= inputWidth &&
    rawX2 <= inputWidth &&
    rawY1 <= inputHeight &&
    rawY2 <= inputHeight;
  const fitsNormalizedGrid = Math.max(rawX1, rawY1, rawX2, rawY2) <= 1000;
  // Values within 0-1000 on an image larger than 1000px are ambiguous; treat them as a grid.
  const ambiguousButLikelyGrid = fitsNormalizedGrid && (inputWidth > 1000 || inputHeight > 1000);
  const usesPixels = fitsModelInput && !ambiguousButLikelyGrid;
  const coordinateFormat: ProcessedBoundingBox["coordinate_format"] = usesPixels
    ? "model_input_pixels"
    : "normalized_0_1000";

  const toPixelX = (value: number): number => {
    const pixel = usesPixels ? value : (value / 1000) * inputWidth;
    return Math.max(0, Math.min(inputWidth, Math.round(pixel)));
  };
  const toPixelY = (value: number): number => {
    const pixel = usesPixels ? value : (value / 1000) * inputHeight;
    return Math.max(0, Math.min(inputHeight, Math.round(pixel)));
  };

  // The model may return corners in either order; normalize to top-left / bottom-right.
  const pixelXa = toPixelX(rawX1);
  const pixelXb = toPixelX(rawX2);
  const pixelYa = toPixelY(rawY1);
  const pixelYb = toPixelY(rawY2);
  const x1 = Math.min(pixelXa, pixelXb);
  const x2 = Math.max(pixelXa, pixelXb);
  const y1 = Math.min(pixelYa, pixelYb);
  const y2 = Math.max(pixelYa, pixelYb);

  const toSourceX = (inputX: number): number =>
    Math.round(sourceRegion.left + (inputX / inputWidth) * sourceRegion.width);
  const toSourceY = (inputY: number): number =>
    Math.round(sourceRegion.top + (inputY / inputHeight) * sourceRegion.height);
  const left = toSourceX(x1);
  const top = toSourceY(y1);
  const right = toSourceX(x2);
  const bottom = toSourceY(y2);

  const originalY1 = clampGemmaCoordinate((top / imageHeight) * 1000);
  const originalX1 = clampGemmaCoordinate((left / imageWidth) * 1000);
  const originalY2 = clampGemmaCoordinate((bottom / imageHeight) * 1000);
  const originalX2 = clampGemmaCoordinate((right / imageWidth) * 1000);

  const pixelWidth = Math.max(0, right - left);
  const pixelHeight = Math.max(0, bottom - top);

  return {
    box_2d: [originalY1, originalX1, originalY2, originalX2],
    bbox_2d: [originalX1, originalY1, originalX2, originalY2],
    coordinate_format: coordinateFormat,
    label: box.label,
    model_input_box_2d: [y1, x1, y2, x2],
    model_input_bbox_2d: [x1, y1, x2, y2],
    normalized: {
      height: pixelHeight / imageHeight,
      width: pixelWidth / imageWidth,
      x: left / imageWidth,
      y: top / imageHeight,
    },
    pixels: {
      bottom,
      height: pixelHeight,
      left,
      right,
      top,
      width: pixelWidth,
    },
  };
}
/**
 * Builds the output path for the annotated copy of an image.
 *
 * @param workingDirectory - Base directory of the generated-images tree.
 * @param imagePath - Source image that is being annotated.
 * @returns A timestamped PNG path in the "boxes" sub-directory.
 */
function getAnnotatedImagePath(workingDirectory: string, imagePath: string): string {
  // The same tag names both the sub-directory and the file-name suffix.
  const annotationTag = "boxes";
  return getTimestampedPath(workingDirectory, annotationTag, imagePath, annotationTag);
}
/**
 * Renders the boxes as an SVG overlay with the same size as the image.
 *
 * Each box with a positive width and height gets an outline plus a filled label tag
 * placed above its top edge (or at the top of the image when there is no room).
 * Colors cycle through a fixed palette.
 *
 * @param boxes - Boxes in source-image pixel coordinates.
 * @param imageWidth - Width of the image in pixels.
 * @param imageHeight - Height of the image in pixels.
 * @returns The SVG document as a string.
 */
function buildBoxesSvgOverlay(
  boxes: ProcessedBoundingBox[],
  imageWidth: number,
  imageHeight: number,
): string {
  const colors = ["#f43f5e", "#2563eb", "#16a34a", "#f59e0b", "#7c3aed", "#0891b2"];
  const renderedBoxes = boxes
    .filter(box => box.pixels.width > 0 && box.pixels.height > 0)
    .map((box, index) => {
      const color = colors[index % colors.length];
      const label = escapeXml(box.label);
      // Size the tag from the visible label, not the escaped one: an entity such as
      // "&amp;" renders as a single character and would otherwise widen the tag.
      const labelWidth = Math.min(imageWidth - box.pixels.left, Math.max(56, box.label.length * 9 + 18));
      const labelY = Math.max(0, box.pixels.top - 24);
      return `
<rect x="${box.pixels.left}" y="${box.pixels.top}" width="${box.pixels.width}" height="${box.pixels.height}" fill="none" stroke="${color}" stroke-width="4"/>
<rect x="${box.pixels.left}" y="${labelY}" width="${labelWidth}" height="24" rx="4" fill="${color}"/>
<text x="${box.pixels.left + 8}" y="${labelY + 17}" font-family="Arial, Helvetica, sans-serif" font-size="16" font-weight="700" fill="white">${label}</text>
`;
    })
    .join("\n");
  return `
<svg width="${imageWidth}" height="${imageHeight}" viewBox="0 0 ${imageWidth} ${imageHeight}" xmlns="http://www.w3.org/2000/svg">
${renderedBoxes}
</svg>
`;
}
/**
 * Draws the boxes onto a PNG copy of the image.
 *
 * The image is auto-rotated before compositing because the box coordinates were
 * computed on the auto-rotated image; without that step the boxes would land in the
 * wrong place on images that carry an EXIF orientation.
 *
 * @param options.boxes - Boxes in source-image pixel coordinates.
 * @param options.imagePath - Absolute path of the source image.
 * @param options.outputPath - Path of the PNG to write; parent directories are created.
 * @throws Error when the image dimensions cannot be read.
 */
async function renderAnnotatedImage({
  boxes,
  imagePath,
  outputPath,
}: {
  boxes: ProcessedBoundingBox[];
  imagePath: string;
  outputPath: string;
}): Promise<void> {
  const imageMetadata = await sharp(imagePath).metadata();
  if (imageMetadata.width === undefined || imageMetadata.height === undefined) {
    throw new Error("Could not read image dimensions for annotation.");
  }
  // metadata() ignores EXIF orientation; orientations 5-8 turn the image by 90 degrees,
  // so the overlay must use the transposed size to match the auto-rotated image.
  const swapsAxes = (imageMetadata.orientation ?? 1) >= 5;
  const overlayWidth = swapsAxes ? imageMetadata.height : imageMetadata.width;
  const overlayHeight = swapsAxes ? imageMetadata.width : imageMetadata.height;
  const overlay = Buffer.from(buildBoxesSvgOverlay(boxes, overlayWidth, overlayHeight));
  await mkdir(dirname(outputPath), { recursive: true });
  await sharp(imagePath)
    .rotate()
    .composite([{ input: overlay, left: 0, top: 0 }])
    .png()
    .toFile(outputPath);
}
/**
 * Builds the tools exposed by this plugin.
 *
 * Two tools are registered:
 * - `analyze_image`: sends a free-form prompt about an image to the VLM and returns its text.
 * - `bounding_boxes`: asks the VLM for structured boxes in Gemma or Qwen coordinates,
 *   converts them to source-image coordinates, and optionally draws them on a PNG copy.
 *
 * Both implementations catch every error and return it as an `Error: ...` string
 * instead of throwing.
 *
 * @param ctl - Tools provider controller supplied by the host.
 * @returns The list of registered tools.
 */
export async function toolsProvider(ctl: ToolsProviderController) {
const tools: Tool[] = [];
// Tool 1: free-form image analysis.
tools.push(tool({
name: "analyze_image",
description: text`
Analyze an image with a local vision-language model. Use this for free-form visual
descriptions, OCR-like summaries, UI inspection, visual QA, and other natural-language image
analysis.
`,
parameters: {
image_path: z.string().min(1).describe("Path to a JPEG, PNG, or WebP image. Relative paths resolve from the LM Studio working directory."),
prompt: z.string().min(1).max(8000).describe("Prompt to send to the VLM about the image."),
},
implementation: async ({ image_path, prompt }, { signal, status }) => {
try {
// No modelMode is passed, so model resolution runs in its "default" mode.
const { absoluteImagePath, modelInfo, modelSource, result } = await runImagePrompt({
ctl,
actionDescription: "analyze image",
imagePath: image_path,
maxTokens: defaultAnalyzeMaxTokens,
prompt,
signal,
status,
temperature: 0.2,
});
return {
analysis: result.content,
image_path: absoluteImagePath,
model: formatModelInfo(modelInfo),
model_status: `Used ${formatModelStatusName(modelInfo)} to analyze image.`,
model_source: modelSource,
};
} catch (error) {
// Failures are reported to the caller as text rather than thrown.
return `Error: ${error instanceof Error ? error.message : String(error)}`;
}
},
}));
// Tool 2: structured bounding-box detection with optional annotation rendering.
tools.push(tool({
name: "bounding_boxes",
description: text`
Detect objects or regions in an image and return bounding boxes. This tool also draws the
boxes on a PNG copy of the image by default and returns a markdown image link for showing the
annotated image in the chat body. Use this when the user asks where something is, asks to
locate objects, or wants visual bounding boxes.
For broad UI screenshot annotation, pass this exact prompt:
You are a precise UI annotation expert. Examine this entire image carefully and identify
every single distinct visual element with exact bounding boxes. Be extremely thorough.
Locate all: text blocks, headings, paragraphs, buttons, icons, images, containers, panels,
navigation elements, input fields, labels. Draw tight bounding boxes around every element
you can see. Do not merge regions together - each distinct visual element gets its own box.
Return as many accurate bounding boxes as possible with clear descriptive labels.
Use mode="gemma" for Gemma models. This prompts for box_2d as [y1, x1, y2, x2].
Use mode="qwen" for Qwen-VL models. This prompts for bbox_2d as [x1, y1, x2, y2]
coordinates and accepts either model-input pixels or a 0-1000 grid. Both modes convert boxes
to normalized and pixel coordinates automatically.
`,
parameters: {
image_path: z.string().min(1).describe("Path to a JPEG, PNG, or WebP image. Relative paths resolve from the LM Studio working directory."),
prompt: z.string().min(1).max(8000).describe(text`
Prompt describing what objects or regions to locate. For broad UI screenshot annotation,
use: You are a precise UI annotation expert. Examine this entire image carefully and
identify every single distinct visual element with exact bounding boxes. Be extremely
thorough. Locate all: text blocks, headings, paragraphs, buttons, icons, images,
containers, panels, navigation elements, input fields, labels. Draw tight bounding boxes
around every element you can see. Do not merge regions together - each distinct visual
element gets its own box. Return as many accurate bounding boxes as possible with clear
descriptive labels.
`),
mode: z
.enum(boundingBoxModes)
.describe("Required bounding-box mode. Use qwen when the user asks for Qwen mode or when vlm-tools-vlm is a loaded Qwen model. Use gemma for Gemma models."),
draw_boxes: z
.boolean()
.optional()
.describe("Draw labeled boxes on a PNG copy of the image. Default: true."),
},
implementation: async ({ image_path, prompt, mode, draw_boxes }, { signal, status }) => {
const boundingBoxMode = mode;
// The user's prompt is embedded in a mode-specific instruction that fixes the output format.
const structuredPrompt = boundingBoxMode === "gemma"
? text`
For each requested object, identify the topmost, leftmost, bottommost, and rightmost visible
points. Detect ${prompt}.
Return only JSON as an array of {"box_2d":[y1,x1,y2,x2],"label":"object name"}.
Coordinates are integers from 0 to 1000 in Gemma order: y1, x1, y2, x2.
Return only boxes that match the request.
`
: text`
Locate ${prompt}.
Return only JSON as an array of {"bbox_2d":[x1,y1,x2,y2],"label":"object name"}.
Coordinates are integer pixel positions relative to the provided image. Do not normalize
coordinates to a 0-1000 grid.
Use Qwen order: x1, y1, x2, y2, where x1,y1 is the top-left corner and x2,y2 is the bottom-right corner.
Return only boxes that match the request.
`;
try {
// Temperature 0 and a JSON schema are used for the structured detection request.
const { absoluteImagePath, imagePreparation, modelInfo, modelSource, result } = await runImagePrompt({
ctl,
actionDescription: `detect bounding boxes in ${boundingBoxMode} mode`,
imagePreprocessor: prepareBoundingBoxModelImage,
imagePath: image_path,
maxTokens: defaultBoundingBoxMaxTokens,
modelMode: boundingBoxMode,
prompt: structuredPrompt,
signal,
status,
structured: {
type: "json",
jsonSchema: boundingBoxMode === "gemma"
? gemmaBoundingBoxesJsonSchema
: qwenBoundingBoxesJsonSchema,
},
temperature: 0,
});
signal.throwIfAborted();
// A preprocessor is always passed above, so this guard mainly narrows the type.
if (imagePreparation === undefined) {
return "Error: missing bounding-box image preprocessing metadata.";
}
const imageWidth = imagePreparation.originalSize.width;
const imageHeight = imagePreparation.originalSize.height;
if (imageWidth === undefined || imageHeight === undefined) {
return "Error: could not read image dimensions for bounding-box conversion.";
}
const parsedJson = parseStructuredJson(result.content);
// Only the list matching the active mode is populated; the other stays null.
const rawGemmaBoxes = boundingBoxMode === "gemma" ? asGemmaBoundingBoxes(parsedJson) : null;
const rawQwenBoxes = boundingBoxMode === "qwen" ? asQwenBoundingBoxes(parsedJson) : null;
// Convert model-space boxes back into coordinates of the source image.
const boxes = boundingBoxMode === "gemma"
? rawGemmaBoxes?.map(box => processGemmaBox(
box,
imageWidth,
imageHeight,
imagePreparation.sourceRegion,
)) ?? []
: rawQwenBoxes?.map(box => processQwenBox(
box,
imageWidth,
imageHeight,
imagePreparation.sourceRegion,
imagePreparation.modelInputSize,
)) ?? [];
// Drawing is on unless the caller explicitly passes draw_boxes: false.
const shouldDrawBoxes = draw_boxes ?? true;
let annotatedImagePath: string | null = null;
let annotatedImageMarkdown: string | null = null;
if (shouldDrawBoxes) {
status("Rendering bounding boxes");
annotatedImagePath = getAnnotatedImagePath(ctl.getWorkingDirectory(), absoluteImagePath);
await renderAnnotatedImage({
boxes,
imagePath: absoluteImagePath,
outputPath: annotatedImagePath,
});
annotatedImageMarkdown = buildMarkdownImageLink("annotated bounding boxes", annotatedImagePath);
}
// The annotated image path and markdown are returned under several aliases.
return {
markdown: annotatedImageMarkdown,
annotated_image_markdown: annotatedImageMarkdown,
annotated_image_path: annotatedImagePath,
path: annotatedImagePath,
absolute_path: annotatedImagePath,
mime_type: annotatedImagePath === null ? null : "image/png",
bbox_mode: boundingBoxMode,
boxes,
boxes_raw_gemma: rawGemmaBoxes,
boxes_raw_qwen: rawQwenBoxes,
image_path: absoluteImagePath,
image_preprocessing: {
model_input_max_side: boundingBoxModelInputMaxSide,
model_input_path: imagePreparation.modelImagePath,
model_input_size: imagePreparation.modelInputSize,
original_size: imagePreparation.originalSize,
source_region: imagePreparation.sourceRegion,
},
image_size: {
height: imageHeight,
width: imageWidth,
},
model: formatModelInfo(modelInfo),
model_status: `Used ${formatModelStatusName(modelInfo)} to detect bounding boxes in ${boundingBoxMode} mode.`,
model_source: modelSource,
// Guidance for the calling model on how to present the result.
$hint:
annotatedImageMarkdown === null
? `The tool already used ${formatModelStatusName(modelInfo)} in ${boundingBoxMode} mode. Use the returned boxes data. Do not ask the user to load another model.`
: `The tool already used ${formatModelStatusName(modelInfo)} in ${boundingBoxMode} mode. Present the annotated image to the user by including the markdown image link exactly as returned. Do not rewrite it as a relative path, do not wrap it in a code block, and do not ask the user to load another model.`,
};
} catch (error) {
// Failures are reported to the caller as text rather than thrown.
return `Error: ${error instanceof Error ? error.message : String(error)}`;
}
},
}));
return tools;
}