Project Files
src / tools / analyse_image.ts
import { tool, type Tool, type ToolsProviderController, type ToolCallContext } from "@lmstudio/sdk";
import { z } from "zod";
import path from "path";
import fs from "fs";
import { formatToolMetaBlock, readState, type ChatMediaState } from "../core-bundle.mjs";
import { syncAttachmentsToState } from "../core-bundle.mjs";
import { readPngGenerationMeta, formatGenerationMeta } from "../helpers/readPngMetadata.js";
import { reportToolStatus, reportToolStep } from "../helpers/toolProgress.js";
import { defaultPluginSettings, globalConfigSchematics } from "../config.js";
import {
analyzeLmStudioVisionBatch,
ensureLmStudioVisionInstanceReady,
type LmStudioVisionAnalyzerConfig,
type VisionAnalysisItem,
} from "../services/lmStudioVisionAnalyzer.js";
/**
 * Produces the meta block that is appended to the tool description.
 * Delegates entirely to `formatToolMetaBlock` from the core bundle.
 */
function formatPluginMeta(): string {
  const metaBlock = formatToolMetaBlock();
  return metaBlock;
}
/**
 * Looks up the global plugin configuration on the controller.
 *
 * The controller is probed for `getGlobalPluginConfig` first and
 * `getGlobalConfig` second; whichever exists is invoked with
 * `globalConfigSchematics`.
 *
 * @param ctl Tools provider controller to query.
 * @returns The value returned by the accessor, or `null` when neither accessor
 *          exists or the call throws.
 */
function getGlobalConfig(ctl: ToolsProviderController): any | null {
  const looseCtl = ctl as any;
  const accessor = looseCtl.getGlobalPluginConfig || looseCtl.getGlobalConfig;
  if (accessor) {
    try {
      return accessor.call(ctl, globalConfigSchematics);
    } catch {
      // Accessor failed — treated the same as "no config available".
    }
  }
  return null;
}
/**
 * Reads a string setting from the global config object.
 *
 * @param gcfg Config object exposing `get(key)`, or `null`.
 * @param key Setting name passed to `get`.
 * @param fallback Value returned when the setting is not a string or the
 *                 lookup throws.
 * @returns The stored string (including the empty string) or `fallback`.
 */
function getGlobalString(gcfg: any | null, key: string, fallback: string): string {
  let candidate: unknown;
  try {
    candidate = gcfg?.get(key);
  } catch {
    return fallback;
  }
  if (typeof candidate === "string") {
    return candidate;
  }
  return fallback;
}
/**
 * Reads a numeric setting from the global config object.
 *
 * @param gcfg Config object exposing `get(key)`, or `null`.
 * @param key Setting name passed to `get`.
 * @param fallback Value returned when the setting is not a finite number or
 *                 the lookup throws.
 * @returns The stored finite number or `fallback`.
 */
function getGlobalNumber(gcfg: any | null, key: string, fallback: number): number {
  let candidate: unknown;
  try {
    candidate = gcfg?.get(key);
  } catch {
    return fallback;
  }
  const usable = typeof candidate === "number" && Number.isFinite(candidate);
  return usable ? (candidate as number) : fallback;
}
/**
 * Schema for the `targets` argument: an array of strings OR a single string.
 * A string is scanned for notations (one of a/i/v/p followed by digits,
 * case-insensitive) and the lowercased matches become the list, so model
 * output like "a1, v2" is tolerated instead of ["a1", "v2"].
 * The resulting list must contain between 1 and 16 entries.
 */
const FlexibleTargetsList = z
  .union([
    z.string().transform((text) => {
      const found = text.match(/[aivp]\d+/gi);
      return found === null ? [] : found.map((token) => token.toLowerCase());
    }),
    z.array(z.string()),
  ])
  .refine((list) => list.length >= 1, "targets must contain at least one notation")
  .refine((list) => list.length <= 16, "targets must contain at most 16 notations (MLX API limit)");
/**
 * Parameter shape of the `analyse_image` tool:
 * - `targets`: validated by `FlexibleTargetsList`.
 * - `prompt`: optional string carrying the vision prompt.
 */
const AnalyseImageParamsShape = {
targets: FlexibleTargetsList,
prompt: z.string().optional().describe("Optional prompt for the vision model. Empty = model default."),
} satisfies Record<string, z.ZodTypeAny>;
/** Notation numbers grouped by kind letter (a, v, i, p). */
type ParsedTargets = { a: number[]; v: number[]; i: number[]; p: number[] };

/**
 * Splits raw target notations into per-kind number lists.
 *
 * A valid notation is, after trimming, a single kind letter (a/v/i/p,
 * case-insensitive) followed by digits that form a positive number. Anything
 * else — including non-string entries and a zero index — is reported in
 * `invalid` using its original, untrimmed string form.
 *
 * @param targets Raw notations as received from the caller.
 * @returns `parsed` with de-duplicated, ascending numbers per kind, and
 *          `invalid` with the rejected entries in input order.
 */
function parseTargets(targets: string[]): {
  parsed: ParsedTargets;
  invalid: string[];
} {
  const buckets: ParsedTargets = { a: [], v: [], i: [], p: [] };
  const rejected: string[] = [];
  const notationPattern = /^([avip])(\d+)$/i;

  for (const entry of targets) {
    const text = typeof entry === "string" ? entry.trim() : "";
    const match = notationPattern.exec(text);
    const index = match ? parseInt(match[2], 10) : NaN;
    if (match && Number.isFinite(index) && index > 0) {
      const kind = match[1].toLowerCase() as keyof ParsedTargets;
      buckets[kind].push(index);
    } else {
      rejected.push(String(entry));
    }
  }

  // De-duplicate and order ascending so downstream behavior is stable.
  for (const kind of Object.keys(buckets) as Array<keyof ParsedTargets>) {
    const unique = Array.from(new Set(buckets[kind]));
    unique.sort((x, y) => x - y);
    buckets[kind] = unique;
  }

  return { parsed: buckets, invalid: rejected };
}
/**
 * Lists the notation numbers present in the chat media state.
 *
 * For each collection (`attachments`, `variants`, `images`, `pictures`) the
 * numeric field `a` / `v` / `i` / `p` of every record is collected; values
 * that are not positive numbers are dropped and the rest is sorted ascending.
 * A missing collection yields an empty list.
 *
 * @param state Chat media state to inspect.
 * @returns The four number lists as `availableA`, `availableV`, `availableI`
 *          and `availableP`.
 */
function getAvailable(state: ChatMediaState) {
  const collect = (records: any, field: string): any[] =>
    (records || [])
      .map((rec: any) => (typeof rec?.[field] === "number" ? rec[field] : undefined))
      .filter((value: any) => typeof value === "number" && value > 0)
      .sort((x: number, y: number) => x - y);

  const availableA = collect(state.attachments, "a");
  const availableV = collect(state.variants, "v");
  const availableI = collect(state.images, "i");
  const availableP = collect(state.pictures, "p");
  return { availableA, availableV, availableI, availableP };
}
/**
 * Checks that a preview file exists on disk.
 *
 * @param chatWd Chat working directory the relative path is joined onto.
 * @param previewRel Preview path relative to `chatWd`.
 * @returns Resolves when the file is present; rejects with the error from
 *          `fs.promises.access` otherwise.
 */
async function ensurePreviewExists(chatWd: string, previewRel: string): Promise<void> {
  await fs.promises.access(path.join(chatWd, previewRel), fs.constants.F_OK);
}
/**
 * Turns a raw vision-call error message into a user-facing explanation.
 *
 * The patterns are tried in order (503 status, timeout/abort, network
 * failure); the first match wins. An unmatched message is returned unchanged
 * when it already mentions "Vision API", otherwise it is prefixed with
 * "Vision API error: ".
 *
 * @param errMsg Raw error message.
 * @returns The explanation text.
 */
function classifyVisionError(errMsg: string): string {
  const knownFailures: Array<[RegExp, string]> = [
    [
      /\b503\b/,
      "The Vision API is reachable, but the configured vision model is not available for inference. Check the configured vision model key and loaded model state.",
    ],
    [
      /aborted|aborterror|timed out|timeout/i,
      "The Vision API request timed out before the model returned.",
    ],
    [
      /ECONNREFUSED|ENOTFOUND|ECONNRESET|network socket|fetch failed/i,
      "The Vision API is not reachable. Check the configured vision/embedding API base URL and try again.",
    ],
  ];

  for (const [pattern, explanation] of knownFailures) {
    if (pattern.test(errMsg)) {
      return explanation;
    }
  }

  if (/Vision API/i.test(errMsg)) {
    return errMsg;
  }
  return `Vision API error: ${errMsg}`;
}
/**
 * Computes the request timeout for a vision call.
 *
 * Allows 60 seconds per item, but never less than 3 minutes and never more
 * than 10 minutes.
 *
 * @param itemCount Number of items in the request.
 * @returns Timeout in milliseconds.
 */
function analyzeTimeoutMs(itemCount: number): number {
  const perItemMs = 60_000;
  const floorMs = 180_000;
  const ceilingMs = 600_000;
  const scaled = Math.max(floorMs, itemCount * perItemMs);
  return Math.min(ceilingMs, scaled);
}
/**
 * Creates the `analyse_image` tool.
 *
 * The tool implementation:
 * 1. normalizes the `targets` argument into notations (aN/vN/iN/pN),
 * 2. syncs attachments into the chat media state (failure is logged, not fatal)
 *    and reads that state,
 * 3. resolves every notation to its preview file, skipping unknown notations
 *    and missing previews,
 * 4. when an effective prompt is present, runs the vision analyzer once per item,
 * 5. unless `INCLUDE_GENERATION_METADATA` is "false", appends the generation
 *    metadata read from the original PNG file,
 * 6. returns a plain-text report. Targets that were skipped are listed in a
 *    note so the caller can tell them apart from analyzed ones.
 *
 * The implementation never throws: unexpected errors are returned as an
 * "analyse_image failed: …" string.
 *
 * @param ctl Tools provider controller supplying the working directory and
 *            the global plugin configuration.
 * @returns The tool definition produced by the SDK's `tool()` factory.
 */
export function createAnalyseImageTool(ctl: ToolsProviderController): Tool {
  /**
   * Normalizes the raw `targets` argument into a list of notation strings.
   * Arrays pass through unchanged. A JSON-array string is decoded. Any other
   * string (e.g. "a1, v2", or malformed JSON like "[a1, v2]") is scanned for
   * notations, mirroring what the parameter schema accepts.
   */
  const coerceTargets = (raw: unknown): string[] => {
    if (Array.isArray(raw)) {
      return raw as string[];
    }
    if (typeof raw !== "string") {
      return [];
    }
    const text = raw.trim();
    if (text.startsWith("[")) {
      try {
        const decoded: unknown = JSON.parse(text);
        if (Array.isArray(decoded)) {
          return decoded.map((entry) => String(entry).trim()).filter(Boolean);
        }
      } catch {
        // Not valid JSON — fall through to the notation scan below.
      }
    }
    return (text.match(/[aivp]\d+/gi) ?? []).map((token) => token.toLowerCase());
  };

  /** Renders the "Available: a=[…] v=[…] i=[…] p=[…]" hint for error messages. */
  const formatAvailableHint = (available: ReturnType<typeof getAvailable>): string => {
    const group = (prefix: string, numbers: number[]): string =>
      `${prefix}=[${numbers.map((n) => `${prefix}${n}`).join(", ") || "(none)"}]`;
    return (
      `Available: ` +
      [
        group("a", available.availableA),
        group("v", available.availableV),
        group("i", available.availableI),
        group("p", available.availableP),
      ].join(" ")
    );
  };

  /** Extracts a printable message from any thrown value, including null/undefined. */
  const toErrorMessage = (err: unknown): string => {
    const message = (err as { message?: unknown } | null | undefined)?.message;
    return String(message || err);
  };

  return tool({
    name: "analyse_image",
    description: `Inspect existing media items (images, pictures, variants, attachments) for generation metadata and, optionally, visual content.
PRIMARY use — generation metadata:
Call this tool whenever the user asks how an image was generated, what settings were used, or wants to reuse generation parameters (prompt, model, sampler, seed, steps, guidance scale, LoRA, source images, …). Those parameters are embedded in the PNG file and are returned automatically — no vision prompt needed.
SECONDARY use — visual description (on demand only):
Only request a visual description when the user explicitly asks you to describe or analyze the image content. Pass a prompt in the 'prompt' parameter. Without a prompt, no vision model is invoked and no description is returned.
Parameters:
- targets: Field in the JSON argument object. Pass a JSON array of notations, e.g. analyse_image({"targets":["a1", "v2"]}). Notation: aN=attachment, vN=variant, iN=image, pN=picture.
- prompt: (optional) Vision prompt for visual description — omit unless explicitly requested.
${formatPluginMeta()}`,
    parameters: AnalyseImageParamsShape,
    implementation: async (args: any, ctx: ToolCallContext) => {
      try {
        // Tolerant mode: drop invalid items instead of failing
        const strict = false;

        const targets = coerceTargets(args?.targets);
        const prompt = typeof args?.prompt === "string" ? args.prompt : "";

        const globalConfig = getGlobalConfig(ctl);
        const configuredVisionPrompt = getGlobalString(
          globalConfig,
          "visionPrompt",
          process.env.VISION_PROMPT || defaultPluginSettings.visionPrompt
        );
        // NOTE(review): a configured/default vision prompt is used when the caller
        // passes none, so vision may run even without an explicit 'prompt' argument —
        // confirm this matches the wording of the tool description.
        const effectivePrompt = (prompt || configuredVisionPrompt || "").trim();

        // Parse explicit notations (aN/vN/iN/pN)
        const { parsed, invalid: invalidRaw } = parseTargets(targets);

        const workingDir = ctl.getWorkingDirectory();
        if (typeof workingDir !== "string" || !workingDir.trim()) {
          return "analyse_image failed: working directory not available.";
        }
        const chatWd = workingDir;

        // Sync attachments from conversation.json → chat_media_state.json.
        // maxPreviewAttachments=MAX: importAttachmentBatch generates previews for all attachments
        // and writes them to state atomically — the canonical path via draw-things-chat-core.
        try {
          await syncAttachmentsToState(chatWd, false, Number.MAX_SAFE_INTEGER);
        } catch (syncErr: any) {
          console.warn("[analyse_image] attachment sync failed (non-fatal):", syncErr?.message ?? syncErr);
        }

        const state = await readState(chatWd);
        const available = getAvailable(state);

        // If any invalid notations and strict=true, reject
        if (invalidRaw.length > 0 && strict) {
          return `analyse_image failed: invalid targets: ${invalidRaw
            .map((s) => JSON.stringify(s))
            .join(", ")}`;
        }

        // Collect analysis items (id + preview file path)
        const analysisItems: VisionAnalysisItem[] = [];
        // Map from notation id → absolute path of original file (for XMP reading)
        const originalFilePaths = new Map<string, string>();
        // Map from notation id → human-readable filename for the result
        const displayNames = new Map<string, string>();
        // Notations that could not be resolved, with the reason where known
        const missingDetails: string[] = [];

        // Helper: resolve & validate preview for a notation
        const addItem = async (
          notation: string,
          rec: any,
          previewField: string,
          originalAbsPath?: string,
          displayName?: string
        ): Promise<boolean> => {
          if (!rec) {
            missingDetails.push(notation);
            return false;
          }
          const previewRel =
            typeof rec[previewField] === "string" ? String(rec[previewField]) : "";
          if (!previewRel.trim()) {
            missingDetails.push(`${notation} (missing preview)`);
            return false;
          }
          try {
            await ensurePreviewExists(chatWd, previewRel);
          } catch {
            missingDetails.push(`${notation} (preview file missing)`);
            return false;
          }
          analysisItems.push({
            id: notation,
            filePath: path.join(chatWd, previewRel),
          });
          if (originalAbsPath) {
            originalFilePaths.set(notation, originalAbsPath);
          }
          const dn = displayName || (originalAbsPath ? path.basename(originalAbsPath) : undefined);
          if (dn) {
            displayNames.set(notation, dn);
          }
          return true;
        };

        // Process attachments (aN)
        // Preview generation + state write already handled by syncAttachmentsToState above.
        for (const n of parsed.a) {
          const rec = (state.attachments || []).find((x: any) => x?.a === n);
          const origAbs: string | undefined =
            (rec?.originAbs as string | undefined) ?? (rec?.filename ? path.join(chatWd, rec.filename as string) : undefined);
          // originalName holds the real user-visible filename (e.g. "Katze.png")
          const origName: string | undefined =
            typeof rec?.originalName === "string" && rec.originalName ? rec.originalName as string : undefined;
          await addItem(`a${n}`, rec, "preview", origAbs, origName);
        }

        // Process variants (vN)
        for (const n of parsed.v) {
          const rec = (state.variants || []).find((x: any) => x?.v === n);
          const origAbs: string | undefined = rec?.filename ? path.join(chatWd, rec.filename as string) : undefined;
          await addItem(`v${n}`, rec, "preview", origAbs);
        }

        // Process images (iN)
        for (const n of parsed.i) {
          const rec = (state.images || []).find((x: any) => x?.i === n);
          const origAbs: string | undefined = rec?.filename ? path.join(chatWd, rec.filename as string) : undefined;
          await addItem(`i${n}`, rec, "preview", origAbs);
        }

        // Process pictures (pN)
        for (const n of parsed.p) {
          const rec = (state.pictures || []).find((x: any) => x?.p === n);
          const origAbs: string | undefined = rec?.filename ? path.join(chatWd, rec.filename as string) : undefined;
          await addItem(`p${n}`, rec, "preview", origAbs);
        }

        // If strict, fail on any missing items
        if (missingDetails.length > 0 && strict) {
          return `analyse_image failed: unknown/invalid targets: ${missingDetails.join(", ")}. ${formatAvailableHint(available)}`;
        }

        // If no valid items after filtering, return early
        if (analysisItems.length === 0) {
          return `analyse_image: no valid targets found. ${formatAvailableHint(available)}`;
        }

        const itemCount = analysisItems.length;
        const plural = itemCount === 1 ? "" : "s";

        let visionError: string | null = null;
        const visionResults = new Map<string, string>();
        let totalInferenceTimeMs: number | null = null;

        if (effectivePrompt) {
          const envServerMaxTokens = Number.parseInt(process.env.SERVER_MAX_TOKENS || "", 10);
          const envServerTemperature = Number.parseFloat(process.env.SERVER_TEMPERATURE || "");
          const configuredMaxTokens = Math.floor(getGlobalNumber(
            globalConfig,
            "serverMaxTokens",
            Number.isFinite(envServerMaxTokens) && envServerMaxTokens > 0 ? envServerMaxTokens : defaultPluginSettings.serverMaxTokens
          ));
          const configuredTemperature = getGlobalNumber(
            globalConfig,
            "serverTemperature",
            Number.isFinite(envServerTemperature) ? envServerTemperature : defaultPluginSettings.serverTemperature
          );
          const lmStudioConfig: LmStudioVisionAnalyzerConfig = {
            baseUrl: getGlobalString(globalConfig, "embeddingBaseUrl", process.env.LMSTUDIO_VISION_API_BASE_URL || defaultPluginSettings.embeddingBaseUrl),
            apiKey: getGlobalString(globalConfig, "embeddingApiKey", process.env.LMSTUDIO_VISION_API_KEY || defaultPluginSettings.embeddingApiKey),
            model: getGlobalString(globalConfig, "qwen3VlModelPath", process.env.LMSTUDIO_VISION_MODEL_KEY || defaultPluginSettings.qwen3VlModelPath),
            prompt: effectivePrompt,
            maxTokens: configuredMaxTokens,
            temperature: configuredTemperature,
            // Each request below carries exactly one item, hence the single-item timeout.
            timeoutMs: analyzeTimeoutMs(1),
          };

          try {
            // Steps: 1 = prepare, 2..N+1 = one per item, N+2 = format results.
            const totalSteps = itemCount + 2;
            reportToolStatus(ctx, `Analyzing ${itemCount} image${plural}...`);
            reportToolStep(ctx, 1, totalSteps, `Preparing ${itemCount} image${plural} for visual analysis...`);

            const ready = await ensureLmStudioVisionInstanceReady({
              baseUrl: lmStudioConfig.baseUrl,
              apiKey: lmStudioConfig.apiKey,
              modelKey: lmStudioConfig.model || "",
              status: (message) => { try { ctx.status(message); } catch {} },
            });
            if (!ready.ok) {
              throw new Error(ready.error);
            }

            let totalMs = 0;
            for (let idx = 0; idx < itemCount; idx++) {
              const item = analysisItems[idx];
              reportToolStep(ctx, idx + 2, totalSteps, `Analyzing ${item.id} (${idx + 1}/${itemCount})...`);
              const batchResult = await analyzeLmStudioVisionBatch([item], lmStudioConfig);
              for (const r of batchResult.results) {
                visionResults.set(r.id, r.text.trim() || "(no description)");
              }
              totalMs += batchResult.totalInferenceTimeMs;
            }
            totalInferenceTimeMs = totalMs;
            reportToolStep(ctx, totalSteps, totalSteps, "Formatting analysis results...");
          } catch (e) {
            // Any failure disables visual output for the whole call; metadata is still reported.
            visionError = classifyVisionError(toErrorMessage(e));
          }
        }

        const includeGenMeta = process.env.INCLUDE_GENERATION_METADATA !== "false";

        // Targets dropped in tolerant mode, surfaced so the caller knows they were not analyzed
        const skipped = [
          ...invalidRaw.map((s) => `${JSON.stringify(s)} (invalid notation)`),
          ...missingDetails,
        ];

        // Build structured result
        const lines: string[] = [];
        lines.push(`Analysis results (${itemCount} image${plural}):`);
        lines.push("");

        if (skipped.length > 0) {
          lines.push(`Note: Skipped targets (invalid or not found): ${skipped.join(", ")}`);
          lines.push("");
        }

        if (visionError) {
          lines.push(`Note: Visual analysis unavailable — ${visionError}`);
          lines.push("");
        }

        for (const item of analysisItems) {
          const { id } = item;
          const displayName = displayNames.get(id);
          const origPath = originalFilePaths.get(id);
          const header = displayName ? `${id} — ${displayName}` : id;
          lines.push(`- ${header}`);

          if (effectivePrompt) {
            if (visionError) {
              lines.push(` Visual: (not available)`);
            } else {
              lines.push(` Visual: ${visionResults.get(id) ?? "(no description)"}`);
            }
          }

          if (includeGenMeta) {
            if (origPath && origPath.toLowerCase().endsWith(".png")) {
              const meta = readPngGenerationMeta(origPath);
              if (meta) {
                lines.push(formatGenerationMeta(meta));
              } else {
                lines.push(` (No embedded generation metadata)`);
              }
            } else if (origPath) {
              lines.push(` (No embedded generation metadata — not a PNG file)`);
            }
          }
          lines.push("");
        }

        if (totalInferenceTimeMs !== null) {
          lines.push(`Total inference time: ${Math.round(totalInferenceTimeMs)}ms`);
        }

        // Status update: done
        try {
          const statusSuffix = visionError ? " (vision unavailable)" : " successfully";
          ctx.status(`Analyzed ${itemCount} image${plural}${statusSuffix}`);
        } catch {
          // best-effort
        }

        return lines.join("\n");
      } catch (e) {
        return `analyse_image failed: ${toErrorMessage(e)}`;
      }
    },
  });
}