// Project file: src/tools/generate_image.ts
import { tool, type Tool, type ToolsProviderController, type ToolCallContext } from "@lmstudio/sdk";
import { z } from "zod";
import {
GenerateToolParamsShapeMinimal,
setActiveChatContext,
formatToolMetaBlock,
} from "../core-bundle.mjs";
import type { ProgressCallback } from "../services/imageBackend.js";
import path from "path";
// The `quality` parameter is deliberately omitted from the schema the agent
// sees; the backend and core handler still accept it internally.
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const { quality: _excludedQuality, ...GenerateToolParamsShapeAgent } =
  GenerateToolParamsShapeMinimal;
/**
 * Builds the `generate_image` tool exposed to the agent.
 *
 * The implementation delegates the actual generation to the core handler
 * (`handleGenerateImage` in ../core/tools.js); this wrapper is responsible for:
 *  - refreshing the active chat context on every call (TTL safety),
 *  - optionally unloading the agent model before long video renders,
 *  - translating backend progress callbacks into UI status updates.
 *
 * @param ctl - Controller provided by the LM Studio SDK; used here to resolve
 *              the plugin's working directory (whose basename encodes the chat id).
 * @returns The configured `generate_image` Tool.
 */
export function createGenerateImageTool(ctl: ToolsProviderController): Tool {
  return tool({
    name: "generate_image",
    description: `Generate an image or video using Draw Things.
All parameters have sensible defaults. Only override them when there's a good reason OR the user explicitly requests it.
Parameters:
- prompt: Image/video description (mode: 'text2image'/'text2video') OR description of desired changes (mode: 'image2image'/'edit'/'image2video').
- mode:
- 'text2image': generate a NEW image from the prompt. No sources needed.
- 'image2image': uses source image(s) as visual context. Supports moodboard via gRPC if model allows. Prompting: focus on desired refinement.
- 'edit': Supports MULTIPLE reference images (canvas + moodboard). Prompting: apply only the requested changes, be short, concrete, action-oriented.
- 'text2video': generate a video from the prompt. No sources needed. Uses 25 fps.
- 'image2video': generate a video from a source image + prompt. Uses 25 fps.
- model: 'auto' (default) | 'z-image' | 'qwen-image' | 'flux' | 'ltx' | 'custom'. Default 'auto' selects best model for the mode.
- imageFormat: 'square' | 'landscape' | 'portrait' | '16:9'. Override if context suggests. '16:9' yields 1024×576 (video-optimized).
- variants: 1-4. Number of images. Default is 1. Note: text2video/image2video support only 1 variant; higher values are silently ignored by the backend.
- numFrames: Number of video frames (text2video/image2video only). Must be ×32+1 (e.g. 33, 65, 97). Invalid values are auto-corrected.
- canvas: Primary source image for image2image/edit/image2video. REQUIRED for these modes.
- Notation: 'a1' (attachment 1), 'v2' (variant 2), 'p1' (picture 1)
- Shorthand: 'a'→'a1', 'v'→'v1', 'p'→'p1'
- Digit-only (e.g., '2') allowed only when a single pool is populated
- moodboard: Additional style references for image2image/edit modes. Array of same notations as canvas.
- Note: moodboard for image2image requires gRPC transport. HTTP supports single-source only.
Critical rules:
- HTTP transport: image2image supports only ONE source. Use gRPC or mode='edit' for multiple references.
- Always specify canvas explicitly for image2image/edit/image2video modes.
Returns: inline preview(s), JSON with file URLs, and clickable links.
${formatToolMetaBlock()}`,
    parameters: GenerateToolParamsShapeAgent as Record<string, z.ZodTypeAny>,
    implementation: async (args: any, ctx: ToolCallContext) => {
      try {
        // CRITICAL: Refresh chat context on every tool call to prevent TTL expiry.
        // The Generator sets context once per turn, but tool execution can take longer
        // than the 60s TTL (model thinking + Draw Things generation time).
        try {
          const workingDir = ctl.getWorkingDirectory();
          if (typeof workingDir === "string" && workingDir.trim().length > 0) {
            // Working directory basename is expected to be the numeric chat id;
            // anything else (e.g. a dev checkout path) is ignored.
            const chatId = path.basename(workingDir);
            if (/^\d+$/.test(chatId)) {
              setActiveChatContext({
                chatId,
                workingDir,
                requestId: `tool-${Date.now()}`,
              });
            }
          }
        } catch {
          // non-fatal: fallback to existing context or heuristic
        }

        // Unload agent model before long gRPC renders to free VRAM.
        // Guard 0: feature toggle (env UNLOAD_AGENT_MODEL_DURING_RENDER, default true)
        // Guard 1: mode must be one of text2video | image2video
        // Guard 2: baseUrl must be local (127.0.0.1 or localhost)
        // ensureAgentModelLoaded() at the start of the next Generator turn reloads it.
        const UNLOAD_MODES = new Set(["text2video", "image2video"]);
        const unloadFeatureEnabled = process.env.UNLOAD_AGENT_MODEL_DURING_RENDER !== "false";
        if (unloadFeatureEnabled && UNLOAD_MODES.has(args?.mode)) {
          const unloadCtx = (globalThis as any).__dtc_agentModelUnloadCtx as
            | { modelKey: string; baseUrl: string; apiKey?: string }
            | undefined;
          const isLocalInstance = unloadCtx?.baseUrl
            ? /(127\.0\.0\.1|localhost)/i.test(unloadCtx.baseUrl)
            : false;
          if (isLocalInstance && unloadCtx?.modelKey) {
            // Clear immediately so concurrent/re-entrant calls don't double-unload.
            delete (globalThis as any).__dtc_agentModelUnloadCtx;
            const { unloadAgentModel } = await import("../core-bundle.mjs");
            if (typeof unloadAgentModel === "function") {
              // Fire-and-forget: tool execution must not block on this.
              // `void` marks the deliberately-floating promise; failures are ignored.
              void unloadAgentModel(unloadCtx).catch(() => {});
            }
          }
        }

        // Create progress callback that updates the tool status in the UI.
        // Note: Only gRPC backend invokes this callback (streaming).
        // HTTP backend has no progress streaming, so it stays silent.
        const onProgress: ProgressCallback = (step, totalSteps, message) => {
          try {
            // step === -1 is the sentinel for a free-form status message.
            if (step === -1 && message) {
              ctx.status(message);
              return;
            }
            if (totalSteps && totalSteps > 0) {
              // FIX: previously divided by (totalSteps + 1), so the percentage
              // never matched the displayed fraction (e.g. "Step 20/20 (95%)").
              // Clamp so rounding/overshoot can never display outside 0-100%.
              const pct = Math.min(100, Math.max(0, Math.round((step / totalSteps) * 100)));
              ctx.status(`Step ${step}/${totalSteps} (${pct}%)`);
            } else {
              ctx.status(`Step ${step}...`);
            }
          } catch {
            // Non-fatal: UI update failed
          }
        };

        const mod = await import("../core/tools.js");
        if (typeof (mod as any).handleGenerateImage !== "function") {
          return "generate_image core handler not available.";
        }
        const resp = await (mod as any).handleGenerateImage(args, onProgress);

        // Return structured content so LM Studio can auto-save images and render markdown
        if (resp && Array.isArray((resp as any).content))
          return (resp as any).content;
        return typeof resp === "string" ? resp : JSON.stringify(resp);
      } catch (e) {
        return `generate_image failed: ${String((e as any)?.message || e)}`;
      }
    },
  });
}