// src/tools/analyse_image.ts
import { tool, type Tool, type ToolsProviderController, type ToolCallContext } from "@lmstudio/sdk";
import { z } from "zod";
import path from "path";
import fs from "fs";
import { readState, type ChatMediaState } from "../state.js";
import { syncAttachmentsToState } from "../core-bundle.mjs";
import { readPngGenerationMeta, formatGenerationMeta } from "../helpers/readPngMetadata.js";
import {
analyzeMlxVisionBatch,
type MlxAnalysisItem,
type MlxVisionAnalyzerConfig,
} from "../services/mlxVisionAnalyzer.js";
import { ensureFastvlmServerRunning } from "../fastvlm-server-manager.js";
/**
 * Builds a short provenance footer ("Plugin-Identifier" + version) from
 * package.json / manifest.json in the current working directory.
 * Falls back to a hard-coded identifier when either file is unreadable.
 */
function formatPluginMeta(): string {
  try {
    const baseDir = process.cwd();
    // Both files must parse; any failure drops us into the fallback branch.
    const readJson = (file: string) =>
      JSON.parse(fs.readFileSync(path.join(baseDir, file), "utf-8"));
    const pkg = readJson("package.json");
    const mf = readJson("manifest.json");
    const id =
      mf?.owner && mf?.name ? `${mf.owner}/${mf.name}` : pkg?.name || "ceveyne/analyse-image";
    return `Plugin-Identifier: ${id}\nPlugin version: ${pkg?.version || ""}`;
  } catch {
    return "Plugin-Identifier: ceveyne/analyse-image";
  }
}
/**
 * Flexible targets schema: accepts array OR comma/space-separated string.
 * Reduces parse errors when models output "a1, v2" instead of ["a1", "v2"].
 */
// String form: split on any run of commas/whitespace, dropping empty pieces.
const targetsFromString = z.string().transform((raw) =>
  raw
    .split(/[\s,]+/)
    .map((piece) => piece.trim())
    .filter((piece) => piece.length > 0)
);
// Array form: passed through unchanged.
const targetsFromArray = z.array(z.string());
const FlexibleTargetsList = z
  .union([targetsFromString, targetsFromArray])
  .refine((list) => list.length >= 1, "targets must contain at least one notation")
  .refine((list) => list.length <= 16, "targets must contain at most 16 notations (MLX API limit)");
/**
 * Parameter shape for the analyse_image tool: the flexible targets list plus
 * an optional free-form vision prompt (empty string means "model default").
 */
const AnalyseImageParamsShape = {
  targets: FlexibleTargetsList,
  prompt: z
    .string()
    .optional()
    .describe("Optional prompt for the vision model. Empty = model default."),
} satisfies Record<string, z.ZodTypeAny>;
// Parsed notation ids, bucketed by media kind (attachment/variant/image/picture).
type ParsedTargets = { a: number[]; v: number[]; i: number[]; p: number[] };
/**
 * Parses explicit media notations ("a1", "v2", "i3", "p4", case-insensitive)
 * into per-kind id buckets. Entries that do not match the pattern, or whose
 * number is not a positive integer, are collected in `invalid` verbatim.
 * Each bucket is de-duplicated and sorted ascending for stable behavior.
 */
function parseTargets(targets: string[]): {
  parsed: ParsedTargets;
  invalid: string[];
} {
  const parsed: ParsedTargets = { a: [], v: [], i: [], p: [] };
  const invalid: string[] = [];
  const notationRe = /^([avip])(\d+)$/i;
  for (const entry of targets) {
    const trimmed = typeof entry === "string" ? entry.trim() : "";
    const match = notationRe.exec(trimmed);
    if (match) {
      const kind = match[1].toLowerCase() as "a" | "v" | "i" | "p";
      const num = parseInt(match[2], 10);
      if (Number.isFinite(num) && num > 0) {
        parsed[kind].push(num);
        continue;
      }
    }
    // Either no pattern match or a non-positive number → report as invalid.
    invalid.push(String(entry));
  }
  // Unique + sort each bucket for stable, order-independent output.
  for (const key of Object.keys(parsed) as Array<keyof ParsedTargets>) {
    parsed[key] = Array.from(new Set(parsed[key])).sort((x, y) => x - y);
  }
  return { parsed, invalid };
}
/** Extracts the positive numeric ids found at `records[n][field]`, sorted ascending. */
function collectIds(records: any[] | undefined | null, field: string): number[] {
  return (records || [])
    .map((rec: any) => (typeof rec?.[field] === "number" ? rec[field] : undefined))
    .filter((id: any): id is number => typeof id === "number" && id > 0)
    .sort((a, b) => a - b);
}
/**
 * Lists the media ids currently present in chat state, one sorted array per
 * media kind: attachments (`a`), variants (`v`), images (`i`), pictures (`p`).
 * Missing/empty collections yield empty arrays rather than throwing.
 */
function getAvailable(state: ChatMediaState) {
  return {
    availableA: collectIds(state.attachments, "a"),
    availableV: collectIds(state.variants, "v"),
    availableI: collectIds(state.images, "i"),
    availableP: collectIds(state.pictures, "p"),
  };
}
/**
 * Verifies that a preview file exists under the chat working directory.
 * Resolves when the file is accessible; rejects otherwise (caller handles it).
 */
async function ensurePreviewExists(chatWd: string, previewRel: string): Promise<void> {
  const absolutePath = path.join(chatWd, previewRel);
  await fs.promises.access(absolutePath, fs.constants.F_OK);
}
/**
 * Maps a raw vision-API error message to a user-actionable explanation.
 * HTTP 503 → server up but no model loaded; network-level failures →
 * server unreachable; anything else is passed through with a prefix.
 */
function classifyVisionError(errMsg: string): string {
  const noModelLoaded = /\b503\b/;
  const serverUnreachable = /ECONNREFUSED|ENOTFOUND|ECONNRESET|network socket|fetch failed/i;
  if (noModelLoaded.test(errMsg)) {
    return "The FastVLM server is running, but no vision model is currently loaded. Load a model in LM Studio and try again.";
  }
  if (serverUnreachable.test(errMsg)) {
    return "The FastVLM server is not reachable. Make sure it is running (default: http://localhost:8765) and try again.";
  }
  return `Vision API error: ${errMsg}`;
}
/**
 * Creates the `analyse_image` tool.
 *
 * Per call: resolves explicit media notations (aN/vN/iN/pN) against the
 * chat media state, validates that each item's preview file exists, sends
 * the previews to a local FastVLM/MLX vision server for description, and —
 * when the original file is a PNG — appends any embedded Draw Things
 * generation metadata to the textual result.
 *
 * @param ctl Tools-provider controller; supplies the chat working directory.
 * @returns The configured SDK `Tool` instance.
 */
export function createAnalyseImageTool(ctl: ToolsProviderController): Tool {
  return tool({
    name: "analyse_image",
    // NOTE: formatPluginMeta() is interpolated once, at tool-creation time,
    // not per tool call. The description text is model-facing contract.
    description: `Analyze existing media items (images, pictures, variants, attachments) using a vision model.
Returns text descriptions for each requested item.
Use this when the user asks you to describe, analyze, or understand image content.
Also use this to retrieve generation parameters (prompt, model, sampler, seed, steps, guidance scale, LoRA, source images, …) for images generated by Draw Things. Those parameters are embedded in the PNG file and are automatically included in the result alongside the visual analysis. Call this tool whenever the user asks about how an image was generated, what settings were used, or wants to reuse the generation parameters.
Parameters:
- targets: Array of explicit notations: 'aN' (attachment), 'vN' (variant), 'iN' (image), 'pN' (picture)
- prompt: Optional vision prompt (empty = model default) — MUST be written in English
${formatPluginMeta()}`,
    parameters: AnalyseImageParamsShape,
    implementation: async (args: any, ctx: ToolCallContext) => {
      try {
        // Tolerant mode: drop invalid items instead of failing.
        // (Compile-time switch; the strict branches below are currently
        // unreachable but kept so strict mode can be re-enabled easily.)
        const strict = false;
        const targets = Array.isArray(args?.targets) ? args.targets : [];
        const prompt = typeof args?.prompt === "string" ? args.prompt : "";
        // Parse explicit notations (aN/vN/iN/pN)
        const { parsed, invalid: invalidRaw } = parseTargets(targets);
        const workingDir = ctl.getWorkingDirectory();
        if (typeof workingDir !== "string" || !workingDir.trim()) {
          return "analyse_image failed: working directory not available.";
        }
        const chatWd = workingDir;
        // Sync attachments from conversation.json → chat_media_state.json.
        // maxPreviewAttachments=MAX: importAttachmentBatch generates previews for all attachments
        // and writes them to state atomically — the canonical path via draw-things-chat-core.
        try {
          await syncAttachmentsToState(chatWd, false, Number.MAX_SAFE_INTEGER);
        } catch (syncErr: any) {
          // Best-effort: analysis can still proceed with whatever state
          // already exists on disk, so a sync failure is only logged.
          console.warn("[analyse_image] attachment sync failed (non-fatal):", syncErr?.message ?? syncErr);
        }
        const state = await readState(chatWd);
        const { availableA, availableV, availableI, availableP } = getAvailable(state);
        // If any invalid notations and strict=true, reject
        if (invalidRaw.length > 0 && strict) {
          return `analyse_image failed: invalid targets: ${invalidRaw
            .map((s) => JSON.stringify(s))
            .join(", ")}`;
        }
        // Collect analysis items (id + preview file path)
        const analysisItems: MlxAnalysisItem[] = [];
        // Map from notation id → absolute path of original file (for XMP reading)
        const originalFilePaths = new Map<string, string>();
        // Map from notation id → human-readable filename for the result
        const displayNames = new Map<string, string>();
        // NOTE(review): missingNotations is written below but never read;
        // missingDetails is what actually feeds the error message — candidate
        // for removal.
        const missingNotations = new Set<string>();
        const missingDetails: string[] = [];
        // Helper: resolve & validate preview for a notation.
        // Returns true when the item was queued for analysis, false when it
        // was recorded as missing (no record, no preview field, or file gone).
        const addItem = async (
          notation: string,
          rec: any,
          previewField: string,
          originalAbsPath?: string,
          displayName?: string
        ): Promise<boolean> => {
          if (!rec) {
            missingNotations.add(notation);
            missingDetails.push(notation);
            return false;
          }
          const previewRel =
            typeof rec[previewField] === "string" ? String(rec[previewField]) : "";
          if (!previewRel.trim()) {
            missingNotations.add(notation);
            missingDetails.push(`${notation} (missing preview)`);
            return false;
          }
          try {
            await ensurePreviewExists(chatWd, previewRel);
            analysisItems.push({
              id: notation,
              filePath: path.join(chatWd, previewRel),
            });
            if (originalAbsPath) {
              originalFilePaths.set(notation, originalAbsPath);
            }
            // Prefer the explicit display name; fall back to the basename of
            // the original file when available.
            const dn = displayName || (originalAbsPath ? path.basename(originalAbsPath) : undefined);
            if (dn) {
              displayNames.set(notation, dn);
            }
            return true;
          } catch {
            missingNotations.add(notation);
            missingDetails.push(`${notation} (preview file missing)`);
            return false;
          }
        };
        // Process attachments (aN)
        // Preview generation + state write already handled by syncAttachmentsToState above.
        for (const n of parsed.a) {
          const rec = (state.attachments || []).find((x: any) => x?.a === n);
          // Prefer originAbs when present; otherwise reconstruct from the
          // state-relative filename.
          const origAbs: string | undefined =
            (rec?.originAbs as string | undefined) ?? (rec?.filename ? path.join(chatWd, rec.filename as string) : undefined);
          // originalName holds the real user-visible filename (e.g. "Katze.png")
          const origName: string | undefined =
            typeof rec?.originalName === "string" && rec.originalName ? rec.originalName as string : undefined;
          await addItem(`a${n}`, rec, "preview", origAbs, origName);
        }
        // Process variants (vN)
        for (const n of parsed.v) {
          const rec = (state.variants || []).find((x: any) => x?.v === n);
          const origAbs: string | undefined = rec?.filename ? path.join(chatWd, rec.filename as string) : undefined;
          await addItem(`v${n}`, rec, "preview", origAbs);
        }
        // Process images (iN)
        for (const n of parsed.i) {
          const rec = (state.images || []).find((x: any) => x?.i === n);
          const origAbs: string | undefined = rec?.filename ? path.join(chatWd, rec.filename as string) : undefined;
          await addItem(`i${n}`, rec, "preview", origAbs);
        }
        // Process pictures (pN)
        for (const n of parsed.p) {
          const rec = (state.pictures || []).find((x: any) => x?.p === n);
          const origAbs: string | undefined = rec?.filename ? path.join(chatWd, rec.filename as string) : undefined;
          await addItem(`p${n}`, rec, "preview", origAbs);
        }
        // If strict, fail on any missing items
        if (missingDetails.length > 0 && strict) {
          const hint =
            `Available: ` +
            `a=[${availableA.map((x) => `a${x}`).join(", ") || "(none)"}] ` +
            `v=[${availableV.map((x) => `v${x}`).join(", ") || "(none)"}] ` +
            `i=[${availableI.map((x) => `i${x}`).join(", ") || "(none)"}] ` +
            `p=[${availableP.map((x) => `p${x}`).join(", ") || "(none)"}]`;
          return `analyse_image failed: unknown/invalid targets: ${missingDetails.join(", ")}. ${hint}`;
        }
        // If no valid items after filtering, return early with a hint listing
        // every id the model could have referenced.
        if (analysisItems.length === 0) {
          const hint =
            `Available: ` +
            `a=[${availableA.map((x) => `a${x}`).join(", ") || "(none)"}] ` +
            `v=[${availableV.map((x) => `v${x}`).join(", ") || "(none)"}] ` +
            `i=[${availableI.map((x) => `i${x}`).join(", ") || "(none)"}] ` +
            `p=[${availableP.map((x) => `p${x}`).join(", ") || "(none)"}]`;
          return `analyse_image: no valid targets found. ${hint}`;
        }
        // Start managed server unless TTL=0 (external server mode).
        // NOTE(review): a non-numeric SERVER_TTL yields NaN, which !== 0 and
        // therefore still starts the managed server — presumably intended as
        // the safe default; confirm.
        const serverTTL = parseInt(process.env.SERVER_TTL ?? "1440", 10);
        if (serverTTL !== 0) {
          try {
            await ensureFastvlmServerRunning(
              {
                port: parseInt(process.env.MLX_VISION_PORT ?? "8765", 10),
                modelPath: process.env.MLX_VISION_MODEL_PATH ?? "",
                mlxVisionEnabled: process.env.MLX_VISION_ENABLED !== "false",
                backend: process.env.FASTVLM_BACKEND || "mlx",
                maxTokens: process.env.MLX_VISION_MAX_TOKENS ? parseInt(process.env.MLX_VISION_MAX_TOKENS, 10) : undefined,
                temperature: process.env.MLX_VISION_TEMPERATURE ? parseFloat(process.env.MLX_VISION_TEMPERATURE) : undefined,
                florence2ModelPath: process.env.FLORENCE2_MODEL_PATH || process.env.DETECT_MODEL_PATH || "",
              },
              // Status callbacks are best-effort; never let them throw.
              (msg) => { try { ctx.status(msg); } catch {} }
            );
          } catch (e) {
            // Re-thrown so the outer catch formats the failure uniformly.
            throw new Error(`Failed to start FastVLM server: ${(e as Error).message || String(e)}`);
          }
        }
        // Status update: analyzing N items
        try {
          ctx.status(
            `Analyzing ${analysisItems.length} image${
              analysisItems.length > 1 ? "s" : ""
            }...`
          );
        } catch {
          // best-effort
        }
        // Call MLX Vision API
        const port = parseInt(process.env.MLX_VISION_PORT ?? "8765", 10);
        const mlxConfig: MlxVisionAnalyzerConfig = {
          endpoint: process.env.MLX_VISION_ENDPOINT || `http://localhost:${port}/analyze`,
          // Explicit tool-call prompt wins over the env default.
          prompt: prompt || process.env.MLX_VISION_PROMPT || undefined,
          timeoutMs: 30_000,
        };
        // A vision failure is non-fatal: the result still includes generation
        // metadata, with a note that visual analysis was unavailable.
        let visionError: string | null = null;
        const visionResults = new Map<string, string>();
        let totalInferenceTimeMs: number | null = null;
        try {
          const batchResult = await analyzeMlxVisionBatch(analysisItems, mlxConfig);
          for (const r of batchResult.results) {
            visionResults.set(r.id, r.text.trim() || "(no description)");
          }
          totalInferenceTimeMs = batchResult.totalInferenceTimeMs;
        } catch (e) {
          visionError = classifyVisionError((e as Error).message || String(e));
        }
        const includeGenMeta = process.env.INCLUDE_GENERATION_METADATA !== "false";
        // Build structured result: one bullet per analyzed item, followed by
        // optional generation metadata and a total-time footer.
        const lines: string[] = [];
        lines.push(`Analysis results (${analysisItems.length} image${analysisItems.length !== 1 ? "s" : ""}):`)
        lines.push("");
        if (visionError) {
          lines.push(`Note: Visual analysis unavailable — ${visionError}`);
          lines.push("");
        }
        for (const item of analysisItems) {
          const { id } = item;
          const displayName = displayNames.get(id);
          const origPath = originalFilePaths.get(id);
          const header = displayName ? `${id} — ${displayName}` : id;
          lines.push(`- ${header}`);
          if (visionError) {
            lines.push(`  Visual: (not available)`);
          } else {
            lines.push(`  Visual: ${visionResults.get(id) ?? "(no description)"}`);
          }
          if (includeGenMeta) {
            // Generation metadata is only ever embedded in PNG originals.
            if (origPath && origPath.toLowerCase().endsWith(".png")) {
              const meta = readPngGenerationMeta(origPath);
              if (meta) {
                lines.push(formatGenerationMeta(meta));
              } else {
                lines.push(`  (No embedded generation metadata)`);
              }
            } else if (origPath) {
              lines.push(`  (No embedded generation metadata — not a PNG file)`);
            }
          }
          lines.push("");
        }
        if (totalInferenceTimeMs !== null) {
          lines.push(`Total inference time: ${Math.round(totalInferenceTimeMs)}ms`);
        }
        // Status update: done
        try {
          const statusSuffix = visionError ? " (vision unavailable)" : " successfully";
          ctx.status(`Analyzed ${analysisItems.length} image${analysisItems.length !== 1 ? "s" : ""}${statusSuffix}`);
        } catch {
          // best-effort
        }
        return lines.join("\n");
      } catch (e) {
        // All failures are reported as plain text back to the model rather
        // than thrown, so the tool call itself never errors out.
        return `analyse_image failed: ${String((e as any)?.message || e)}`;
      }
    },
  });
}