// Project file: src/strategies/GeminiImageThinkingStrategy.ts
import { BaseGeminiStrategy } from "./BaseGeminiStrategy";
import { GenerationContext } from "./ModelStrategy";
import { GoogleGenerativeAI, HarmCategory, HarmBlockThreshold } from '@google/generative-ai';
import { detectCapabilities, shouldUseFilesApiForModel } from "../capabilities";
import { buildGeminiTools } from "../tools";
import fs from "fs";
import path from "path";
import { encodeJpegFromBuffer, toIsoLikeTimestamp, resizeMaxDimJpegFromFile, fileUriToPath } from "../image";
import { buildPromotionPartsForMode } from "../visionModeSelector";
import { snapshotHistoryMediaState } from "../history-state";
import { recordVariantsProvision, readChatMediaState, type ChatMediaState } from "../chat-media-state";
import { findAllAttachmentsFromLastTurn, importAttachmentBatch } from "../attachments";
import {
safeStringify,
toGeminiMessages,
getLastUserText,
collectSystemText,
pad2,
streamTextFragments,
parseAttachmentWrappers,
stableJsonStringify
} from "../generator-utils";
import { computeContentHash, loadThoughtState, saveThoughtState } from "../thought-state";
import { getReasoningPolicy, ImageLightweightReasoningPolicy } from "../reasoningState";
/**
 * Strategy for Gemini image-generation models that also emit "thinking" output
 * (explicitly gated in places to "gemini-3-pro-image-preview").
 *
 * Responsibilities visible in this class:
 *  - Builds request contents from chat history (optionally via
 *    ImageLightweightReasoningPolicy for the gated model), including stored
 *    thought signatures.
 *  - Reconciles user attachments into chat media state before generation.
 *  - Streams (or batch-reads) the response: routes thought parts as reasoning
 *    fragments, plain text as answer fragments, saves inline images to the
 *    chat working directory, and emits at most one tool call per invocation.
 *  - Persists captured thought signatures keyed by content hashes, plus
 *    fallback keys, for replay on subsequent turns.
 *  - Records the last generated image variant into chat media state for
 *    vision promotion on later turns.
 */
export class GeminiImageThinkingStrategy extends BaseGeminiStrategy {
  /**
   * Writes a base64-encoded image to `wd/filename`.
   *
   * @param wd       - Chat working directory (absolute path).
   * @param data     - Base64 payload (no data-URI prefix).
   * @param filename - Target file name (relative to `wd`).
   * @returns The filename on success (or on recovery), `null` if the write
   *          failed and no usable file exists on disk.
   */
  private async saveImage(wd: string, data: string, filename: string): Promise<string | null> {
    const abs = path.join(wd, filename);
    try {
      const buf = Buffer.from(data, "base64");
      await fs.promises.writeFile(abs, buf);
      return filename;
    } catch (e) {
      // NOTE(review): the literal "$(unknown)" below looks like a redacted
      // "${...}" interpolation (likely "${filename}") — it prints verbatim
      // at runtime. Confirm against the original source before shipping.
      console.error(`[GeminiImageThinkingStrategy] Failed to save image $(unknown):`, e);
      // Fallback: Check if file exists and has content (maybe saved by parallel process or race condition?)
      try {
        const stats = await fs.promises.stat(abs);
        if (stats.size > 0) {
          // NOTE(review): same "$(unknown)" redaction artifact here (likely "${abs}").
          console.warn(`[GeminiImageThinkingStrategy] Write failed but file exists $(unknown). Recovering.`);
          return filename;
        }
      } catch { }
      return null;
    }
  }

  // Fully isolated implementation of generate for this strategy
  /**
   * Runs one generation turn against the Gemini API.
   *
   * Side effects: emits fragments/tool calls via `ctl`, writes image files and
   * analysis previews into the chat working directory, and persists thought
   * signatures and media state there. Returns early (after a single emitted
   * tool call) so the host can execute the tool and re-invoke generate().
   *
   * @param context - Generation context (controller, history, model, config).
   * @throws Re-throws any API/processing error after logging it.
   */
  public override async generate(context: GenerationContext): Promise<void> {
    const { ctl, history, model, apiKey, globalConfig, pluginConfig, debugChunks, logRequests } = context;
    const visionPromotionPersistent = globalConfig.get("visionPromotionPersistent");
    const useFilesApiForVision = pluginConfig.get("useFilesApiForVision");
    const showOnlyLastImageVariant = pluginConfig.get("showOnlyLastImageVariant");
    const redactSecrets: string[] = [];
    const genAI = new GoogleGenerativeAI(apiKey);
    const caps = detectCapabilities(model);
    if (debugChunks) console.info("[GeminiImageThinkingStrategy] Isolated generate call. model=", model);
    const lastUserText = getLastUserText(history);
    const supportsFunctionCalling = caps.supportsTools;
    const systemText = collectSystemText(history);
    const chatWd = ctl.getWorkingDirectory();
    await snapshotHistoryMediaState(ctl, history, chatWd, model);
    const shouldUseFilesApi = shouldUseFilesApiForModel(model, useFilesApiForVision);
    // Reconcile Attachments (Duplicated logic from Base)
    await this.localReconcileAttachments(context, shouldUseFilesApi);
    await this.localBackfillAnalysisPreviews(context, shouldUseFilesApi);
    // When tools are unsupported, fall back to empty tool state so the
    // name-mapping lookups below still work.
    const { tools: geminiTools, originalToSafe, safeToOriginal } = supportsFunctionCalling
      ? buildGeminiTools(ctl, lastUserText || "")
      : { tools: undefined, originalToSafe: new Map<string, string>(), safeToOriginal: new Map<string, string>() } as any;
    // Manage Thought Signatures (via optional policy, gated to gemini-3-pro-image-preview)
    let contents: any[];
    if (model === "gemini-3-pro-image-preview" && caps.supportsThinking) {
      // Gated model: the reasoning policy owns contents construction and may
      // wrap/modify the base messages built from stored signatures.
      const policy = new ImageLightweightReasoningPolicy();
      contents = policy.buildContents({
        history,
        context,
        baseContentsBuilder: () => {
          const sigState = loadThoughtState(chatWd);
          const signaturesArray = Object.entries(sigState.signatures).map(([contentHash, signature]) => ({ contentHash, signature }));
          return toGeminiMessages(history, originalToSafe, signaturesArray);
        },
      });
    } else {
      // Non-gated path: inject stored signatures only if the model thinks at all.
      let signaturesArray: Array<{ contentHash: string; signature: string }> = [];
      if (caps.supportsThinking) {
        const sigState = loadThoughtState(chatWd);
        signaturesArray = Object.entries(sigState.signatures).map(([contentHash, signature]) => ({ contentHash, signature }));
      }
      contents = toGeminiMessages(history, originalToSafe, signaturesArray);
    }
    this.modifyContents(contents, caps);
    // Vision promotion (Duplicated logic)
    try {
      // Auto-detect turns that are pure tool/functionResponse replays and
      // suppress vision promotion for them to avoid mismatched thought_signature
      // requirements on implicitly promoted images.
      if (typeof context.suppressVisionPromotionForThisTurn === "undefined") {
        try {
          const msgs = Array.from(history as any);
          if (msgs.length > 0) {
            const last: any = msgs[msgs.length - 1];
            const role = typeof last.getRole === "function" ? last.getRole() : undefined;
            const results = typeof last.getToolCallResults === "function" ? last.getToolCallResults() : undefined;
            if (role === "tool" && Array.isArray(results) && results.length > 0) {
              if (debugChunks) console.info("[ImageThinking Vision Suppress] Last history message is a tool result; suppressing vision promotion for this turn.");
              context.suppressVisionPromotionForThisTurn = true;
            }
          }
        } catch { /* best-effort only */ }
      }
      let promoParts: any[] = [];
      const res = await buildPromotionPartsForMode({
        ctl,
        history,
        apiKey,
        chatWd,
        debugChunks,
        shouldUseFilesApi,
        model,
        visionPromotionPersistent,
        useFilesApiForVision: !!useFilesApiForVision,
        suppressVisionPromotionForThisTurn: !!context.suppressVisionPromotionForThisTurn,
      } as any);
      promoParts = res.promoParts || [];
      if (promoParts.length) {
        // Prepend promoted media to the most recent user message so the model
        // sees it alongside the current request; create one if none exists.
        let lastUserMessage = contents.slice().reverse().find(m => m.role === 'user');
        if (lastUserMessage) {
          lastUserMessage.parts = [...promoParts, ...(lastUserMessage.parts || [])];
        } else {
          contents.push({ role: "user", parts: promoParts });
        }
      }
    } catch (e) { if (debugChunks) console.error("Promotion parts error:", (e as Error).message); }
    // System Instruction
    let systemInstruction: any | undefined;
    {
      const parts: Array<{ text: string }> = [];
      if (systemText) parts.push({ text: systemText });
      const now = new Date();
      const tz = "UTC";
      parts.push({ text: `Current date/time: ${now.toISOString()} (${tz})` });
      if (!supportsFunctionCalling) {
        parts.push({ text: "You are an image-capable model without tool support." });
      } else if (geminiTools) {
        parts.push({ text: "Tool use policy: Use available tools only when necessary." });
      }
      if (parts.length) systemInstruction = { parts };
    }
    const generateContent: any = { contents };
    if (geminiTools && supportsFunctionCalling) {
      generateContent.tools = geminiTools;
      generateContent.toolConfig = { functionCallingConfig: { mode: "AUTO" } } as any;
    }
    if (systemInstruction) generateContent.systemInstruction = systemInstruction;
    this.modifyGenerationConfig(generateContent, context, caps);
    try {
      // logRequests: raw request/response data only, no telemetry prefixes
      if (logRequests) console.info(safeStringify({ direction: "request", model, payload: generateContent }, redactSecrets));
      const generativeModel = genAI.getGenerativeModel({
        model: model,
        // snake_case "system_instruction" mirrors the REST field name;
        // passed at model level rather than per request.
        ...(systemInstruction ? { system_instruction: systemInstruction } : {}),
        safetySettings: ([{ category: HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, threshold: HarmBlockThreshold.BLOCK_NONE }] as any),
      });
      let capturedSignature: string | undefined;
      let collectedText = "";
      let collectedFullText = ""; // Includes thoughts, for hashing
      let collectedToolCalls: any[] = [];
      // Image handling state
      const ts = toIsoLikeTimestamp(new Date());
      let imageCount = 0;
      const collectedImages: Array<{ filename: string; preview: string }> = [];
      if (caps.supportsStreaming) {
        const stream = await generativeModel.generateContentStream({
          contents: generateContent.contents,
          tools: generateContent.tools,
          toolConfig: generateContent.toolConfig,
          generationConfig: generateContent.generationConfig,
        } as any);
        let toolCallEmitted = false;
        const allowedSafe = new Set<string>(safeToOriginal ? Array.from(safeToOriginal.keys()) : []);
        for await (const item of stream.stream) {
          if (debugChunks) console.info("[Chunk]", safeStringify(item));
          const scands = (item as any)?.candidates as any[] | undefined;
          if (!Array.isArray(scands) || !scands.length) continue;
          const candidate = scands[0];
          const parts = candidate?.content?.parts;
          if (!Array.isArray(parts)) continue;
          if (debugChunks) {
            try {
              // NOTE(review): partKinds is computed but never logged or used —
              // either dead debug code or a redacted console call. Verify.
              const partKinds = parts.map((p: any) => ({
                hasInline: !!(p?.inline_data || p?.inlineData),
                hasThought: !!p?.thought,
                hasSig: !!(p?.thought_signature || p?.thoughtSignature),
                hasText: typeof p?.text === "string",
              }));
            } catch { /* ignore */ }
          }
          let textBuf = "";
          const toolCalls: Array<{ name: string; args: any }> = [];
          for (const part of parts) {
            const p = part as any;
            // Markdown image links inside text parts are echoes, not answers.
            const isImageLink = typeof p.text === 'string' && /!\[Image\]\(.*?\)/.test(p.text);
            const hasSig = p.thought_signature || p.thoughtSignature;
            // Handle inline images (Thinking images or final images)
            // API may use snake_case or camelCase depending on transport.
            const b64 = p.inline_data?.data || p.inlineData?.data;
            if (b64) {
              imageCount++;
              const mime = p.inline_data?.mime_type || p.inlineData?.mimeType || "image/png";
              const ext = mime.includes("jpeg") ? ".jpg" : ".png";
              // Use standard naming convention: image-TIMESTAMP-vCOUNT.ext
              const filename = `image-${ts}-v${imageCount}${ext}`;
              const savedName = await this.saveImage(chatWd, b64, filename);
              if (savedName) {
                let preview = savedName;
                // ALWAYS create analysis preview for variants (for local display & Base64 fallback)
                try {
                  const analysisName = `analysis-generated-image-${ts}-v${imageCount}.jpg`;
                  const buf = Buffer.from(b64, "base64");
                  const jpeg = await encodeJpegFromBuffer(buf, 85);
                  await fs.promises.writeFile(path.join(chatWd, analysisName), jpeg);
                  preview = analysisName;
                  if (debugChunks) console.info("[Streaming] Created analysis preview:", analysisName);
                } catch (e) {
                  if (debugChunks) console.warn("[Streaming] Failed to create analysis preview:", e);
                }
                // Always collect every physical variant for media-state,
                // regardless of showOnlyLastImageVariant (which is a pure UI/promotion flag).
                collectedImages.push({ filename: savedName, preview });
                if (debugChunks) console.info(`[Streaming] Collected image ${imageCount}: ${savedName} (Total: ${collectedImages.length})`);
                // Injection into the chat stream is controlled by showOnlyLastImageVariant,
                // but this must NOT affect what we persist in chat_media_state.json.
                if (!showOnlyLastImageVariant) {
                  // NOTE(review): this template literal contains only newlines —
                  // the image markdown (e.g. "![Image](file)") appears to have
                  // been stripped/redacted from the source. Confirm original.
                  const md = `\n\n\n\n`;
                  ctl.fragmentGenerated(md);
                  collectedText += md;
                  collectedFullText += md;
                  if (debugChunks) console.info("[Streaming] Image saved and injected:", savedName);
                } else {
                  if (debugChunks) console.info("[Streaming] Image saved but injection suppressed (showOnlyLastImageVariant=true):", savedName);
                }
              } else {
                if (debugChunks) console.error("[Streaming] Failed to save image, skipping collection:", filename);
              }
            }
            if (hasSig) {
              // Last signature wins; persisted after the turn completes.
              capturedSignature = hasSig;
              if (debugChunks) console.info("[GeminiImageThinkingStrategy] Captured thought signature (truncated):", hasSig.slice(0, 50) + "...");
            }
            // Trust the API: thoughts are explicitly flagged with thought=true.
            const isThought = !isImageLink && !!p.thought;
            if (p.text && !isImageLink && !isThought) {
              collectedText += p.text;
            }
            if (isThought) {
              // p.thought may carry the thought text itself, or be a boolean
              // flag with the text in p.text.
              const content = typeof p.thought === 'string' ? p.thought : (p.text || "");
              if (content) {
                if (debugChunks) console.info("[Streaming] Thought update:", content.slice(0, 120));
                ctl.fragmentGenerated(content, { reasoningType: "reasoning" });
                collectedFullText += content;
              }
            } else if (p.text) {
              // unflagged text is the final answer (not a thought)
              textBuf += (textBuf ? "\n" : "") + p.text;
            }
            const fcall = part?.functionCall || part?.function_call;
            if (fcall && fcall.name) {
              // Function calls signal the end of the pure thinking phase,
              // but only affect signature handling, not isThought.
              let args = fcall.args;
              if (typeof args === "string") { try { args = JSON.parse(args); } catch { } }
              toolCalls.push({ name: String(fcall.name), args });
              collectedToolCalls.push({ name: String(fcall.name), args });
              if (debugChunks) console.info("[Streaming] Tool call detected:", JSON.stringify({ name: fcall.name, args }));
            }
          }
          if (textBuf.trim().length) {
            if (debugChunks) console.info("[Streaming] Text update:", textBuf.slice(0, 120));
            ctl.fragmentGenerated(textBuf);
            collectedFullText += textBuf;
          }
          if (!toolCallEmitted && toolCalls.length) {
            // Emit only the first tool call whose (sanitized) name is known.
            const streamingToolCall = toolCalls.find(tc => allowedSafe.has(tc.name));
            if (streamingToolCall) {
              const originalName = safeToOriginal.get(streamingToolCall.name) || streamingToolCall.name;
              const argsJson = typeof streamingToolCall.args === "string" ? streamingToolCall.args : JSON.stringify(streamingToolCall.args ?? {});
              const callId = `gemini-fc-${Date.now()}-0`;
              ctl.toolCallGenerationStarted();
              ctl.toolCallGenerationNameReceived(originalName);
              ctl.toolCallGenerationArgumentFragmentGenerated(argsJson);
              ctl.toolCallGenerationEnded({ type: "function", name: originalName, arguments: streamingToolCall.args ?? {}, id: callId });
              toolCallEmitted = true;
              // CRITICAL (LM Studio tool-call loop): once we emit a tool call, end this
              // generate() invocation immediately so LM Studio can run the tool and
              // re-invoke generate() with the tool result.
              ctl.fragmentGenerated("");
              return;
            }
          }
        }
        // Flush an empty fragment to finalize streamed output.
        ctl.fragmentGenerated("");
        const response = stream.response;
        if (logRequests) console.info(safeStringify({ direction: "response", model, payload: response }, redactSecrets));
        const candidates = (response as any)?.candidates as any[] | undefined;
        if (Array.isArray(candidates) && candidates.length > 0) {
          // Vision Promotion is always ON
          const { markdown, savedImages } = await this.localProcessCandidates(candidates, context, safeToOriginal, true, shouldUseFilesApi, caps, true);
          collectedText += markdown;
          collectedFullText += markdown;
          // In streaming, localProcessCandidates ignores images, so savedImages will be empty.
          // Robustness: if the final response snapshot contains additional
          // non-thinking text that was never streamed, render it once here.
          try {
            let responseText = "";
            for (const cand of candidates) {
              const parts = cand?.content?.parts;
              if (!Array.isArray(parts)) continue;
              for (const p of parts) {
                const isImageLink = typeof p.text === "string" && /!\[Image\]\(.*?\)/.test(p.text);
                const isThought = !isImageLink && !!p.thought;
                if (p.text && !isImageLink && !isThought) {
                  responseText += (responseText ? "\n" : "") + p.text;
                }
              }
            }
            if (responseText.trim().length) {
              if (debugChunks) console.info("[Streaming] Final snapshot text update:", responseText.slice(0, 200));
              ctl.fragmentGenerated(responseText);
              collectedText += responseText;
              collectedFullText += responseText;
            }
          } catch { /* best-effort only */ }
        }
        // If showOnlyLastImageVariant is enabled, inject the LAST image now
        if (showOnlyLastImageVariant && collectedImages.length > 0) {
          const lastImage = collectedImages[collectedImages.length - 1];
          // NOTE(review): empty markdown template again — see note above.
          // lastImage is also unused here, reinforcing the redaction theory.
          const md = `\n\n\n\n`;
          ctl.fragmentGenerated(md);
          collectedText += md;
          collectedFullText += md;
        }
      } else {
        // Non-streaming fallback for models without streaming support.
        // NOTE(review): generationConfig (incl. thinkingConfig) is NOT passed
        // here, unlike the streaming call above — confirm this is intentional.
        const result = await generativeModel.generateContent({
          contents: generateContent.contents,
          tools: generateContent.tools,
          toolConfig: generateContent.toolConfig,
        } as any);
        const candidates = result.response?.candidates as any[] | undefined;
        if (logRequests) console.info(safeStringify({ direction: "response", model, payload: result.response }, redactSecrets));
        if (candidates) {
          // For non-streaming, we need to extract signature and text from candidates
          for (const cand of candidates) {
            const parts = cand?.content?.parts;
            if (Array.isArray(parts)) {
              for (const p of parts) {
                if (p.thought_signature || p.thoughtSignature) capturedSignature = p.thought_signature || p.thoughtSignature;
                if (p.text) {
                  collectedText += p.text;
                  collectedFullText += p.text;
                }
                const fcall = p.functionCall || p.function_call;
                if (fcall && fcall.name) {
                  let args = fcall.args;
                  if (typeof args === "string") { try { args = JSON.parse(args); } catch { } }
                  collectedToolCalls.push({ name: String(fcall.name), args });
                }
              }
            }
          }
          // Vision Promotion is always ON
          const { markdown, savedImages } = await this.localProcessCandidates(candidates, context, safeToOriginal, true, shouldUseFilesApi, caps, false);
          collectedText += markdown;
          collectedFullText += markdown;
          collectedImages.push(...savedImages);
        }
      }
      // Save Thought Signature (Post-Turn)
      if (capturedSignature && (collectedFullText.trim() || collectedToolCalls.length > 0)) {
        if (model === "gemini-3-pro-image-preview" && caps.supportsThinking) {
          // Gated model: delegate persistence to the reasoning policy.
          const policy = new ImageLightweightReasoningPolicy();
          policy.updateFromResponse({
            response: undefined,
            history,
            context,
            collectedFullText,
            collectedToolCalls,
            capturedSignature,
          });
          if (debugChunks) console.info("[ReasoningPolicy] Updated thought state for gemini-3-pro-image-preview.");
        } else {
          const thoughtState = loadThoughtState(chatWd);
          // 1. Hash full text (including thoughts)
          if (collectedFullText) {
            const hash = computeContentHash(collectedFullText);
            thoughtState.signatures[hash] = capturedSignature;
            // 2. Hash trimmed text (robustness)
            if (collectedFullText.trim() !== collectedFullText) {
              const trimmedHash = computeContentHash(collectedFullText.trim());
              thoughtState.signatures[trimmedHash] = capturedSignature;
            }
          }
          // 3. ALWAYS Save as LATEST_TEXT_SIG for immediate fallback (robustness against hash mismatches)
          thoughtState.signatures["LATEST_TEXT_SIG"] = capturedSignature;
          // 4. Fallback for tool-only responses (if no text at all)
          if (collectedToolCalls.length > 0 && !collectedFullText.trim()) {
            const emptyHash = computeContentHash("");
            thoughtState.signatures[emptyHash] = capturedSignature;
          }
          // 5. Save for tool calls (CRITICAL for mixed text/tool turns)
          if (collectedToolCalls.length > 0) {
            for (const tc of collectedToolCalls) {
              // Must match getSignatureForFunctionCall in generator-utils.ts
              const id = `${tc.name}:${stableJsonStringify(tc.args ?? {})}`;
              const hash = computeContentHash(id);
              thoughtState.signatures[hash] = capturedSignature;
            }
          }
          saveThoughtState(chatWd, thoughtState);
          if (debugChunks) console.info("Saved thought signature for turn.");
        }
      }
      // Update Media State (for Vision Promotion)
      // OLD/CONSISTENT BEHAVIOR: keep only the LAST visible variant in state,
      // so both first and subsequent runs behave identically.
      if (collectedImages.length > 0) {
        const lastImage = collectedImages[collectedImages.length - 1];
        await recordVariantsProvision(chatWd, [lastImage]);
      } else {
        // NOTE(review): empty debug branch — a console call appears to have
        // been removed/redacted here. Harmless, but verify against original.
        if (debugChunks && imageCount > 0) {
        }
      }
    } catch (error: any) {
      console.error("Gemini Isolated Strategy Error:", error);
      throw error;
    }
  }

  /**
   * Extracts text and (non-streaming only) inline images from response
   * candidates, emits them via `ctl`, and returns what was produced.
   *
   * @param candidates - Response candidates from the Gemini API.
   * @param context    - Generation context (controller, config).
   * @param safeToOriginal - Sanitized-to-original tool-name map (unused here,
   *                    kept for call-site symmetry).
   * @param _allowVisionPromotion - deprecated; Vision Promotion is now always ON.
   *                    Kept in signature for call-site compat; ignored internally.
   * @param shouldUseFilesApi - Files-API flag (currently unused in this body).
   * @param caps        - Model capabilities (only supportsThinking is read).
   * @param isStreaming - When true, images are skipped (handled in real time
   *                    by the streaming loop) and text is not re-emitted.
   * @returns `{ markdown, savedImages }` — markdown injected for images and
   *          the list of saved image files with their preview names.
   */
  private async localProcessCandidates(candidates: any[], context: GenerationContext, safeToOriginal: Map<string, string>, _allowVisionPromotion: boolean, shouldUseFilesApi: boolean, caps?: any, isStreaming: boolean = false) {
    const { ctl, debugChunks, pluginConfig } = context;
    const showOnlyLastImageVariant = pluginConfig.get("showOnlyLastImageVariant");
    const chatWd = ctl.getWorkingDirectory();
    const mimeToExt = (mime: string): string => {
      if (mime.includes("jpeg")) return ".jpg";
      if (mime.includes("png")) return ".png";
      return ".png";
    };
    // Refactoring to return generated markdown and saved images
    let generatedMarkdown = "";
    const savedImages: Array<{ filename: string; preview: string }> = [];
    for (const candidate of candidates) {
      const parts = candidate?.content?.parts;
      if (Array.isArray(parts)) {
        let textBuf = "";
        // NOTE(review): isThinking is assigned but never read below — likely
        // leftover from an earlier revision.
        let isThinking = caps?.supportsThinking;
        const images: Array<{ data: string; mimeType: string }> = [];
        for (const part of parts) {
          const hasSig = part.thought_signature || part.thoughtSignature;
          const isImageLink = typeof part.text === 'string' && /!\[Image\]\(.*?\)/.test(part.text);
          const isThought = !isImageLink && !!part.thought;
          if (part?.text && !isThought) {
            textBuf += (textBuf ? "\n" : "") + part.text;
          }
          // Only process images here if NOT streaming (streaming handles them in real-time)
          if (!isStreaming) {
            const b64 = part?.inline_data?.data || part?.inlineData?.data;
            if (b64) {
              images.push({ data: b64, mimeType: part?.inline_data?.mime_type || "image/png" });
            }
          }
        }
        if (textBuf.trim().length && !isStreaming) {
          await streamTextFragments(ctl, textBuf);
        }
        if (images.length > 0) {
          const wd = ctl.getWorkingDirectory();
          const fileNames: string[] = [];
          const ts = toIsoLikeTimestamp(new Date());
          let idx = 0;
          for (const img of images) {
            // Only suffix -vN when there are multiple images in this batch.
            const baseName = images.length > 1 ? `image-${ts}-v${++idx}` : `image-${ts}`;
            const ext = mimeToExt(img.mimeType);
            const fileName = `${baseName}${ext}`;
            const abs = path.join(wd, fileName);
            try {
              const buf = Buffer.from(img.data, "base64");
              await fs.promises.writeFile(abs, buf);
              fileNames.push(fileName);
              // For non-streaming, we don't generate separate analysis previews yet, so use the file itself
              savedImages.push({ filename: fileName, preview: fileName });
            } catch { }
          }
          if (fileNames.length > 0) {
            if (showOnlyLastImageVariant) {
              // Only inject the LAST image
              // NOTE(review): empty/newline-only markdown templates below —
              // the image markdown appears stripped from the source (lastFn
              // and fn are unused). Confirm against the original.
              const lastFn = fileNames[fileNames.length - 1];
              const md = `\n\n\n\n`;
              ctl.fragmentGenerated(md);
              generatedMarkdown += md;
            } else {
              const md = fileNames.map(fn => ``).join("\n\n");
              const fragment = "\n\n" + md + "\n";
              ctl.fragmentGenerated(fragment);
              generatedMarkdown += fragment;
            }
          }
        }
      }
    }
    return { markdown: generatedMarkdown, savedImages };
  }

  /**
   * Imports attachments referenced by the last history turn into the chat
   * media state (idempotent, stable n-numbering, no file copies).
   *
   * @param context - Generation context.
   * @param shouldUseFilesApi - Currently unused here; kept for call symmetry.
   */
  private async localReconcileAttachments(context: GenerationContext, shouldUseFilesApi: boolean) {
    const { ctl, history, debugChunks } = context;
    const chatWd = ctl.getWorkingDirectory();
    try {
      // Use unified SSOT scan from attachments.ts (last turn only for reconcile)
      const ssotPaths = await findAllAttachmentsFromLastTurn(chatWd, !!debugChunks);
      if (ssotPaths.length === 0) {
        if (debugChunks) console.info('[Image Strategy Attachment Reconcile] No attachments in history; preserving existing state');
        return;
      }
      // Read current state (or initialize empty)
      const state = await readChatMediaState(chatWd).catch(() => ({
        attachments: [],
        variants: [],
        counters: { nextN: 1, nextV: 1 }
      } as ChatMediaState));
      // Use importAttachmentBatch for stable n-numbering, idempotent, no copies
      const result = await importAttachmentBatch(
        chatWd,
        state,
        ssotPaths,
        { maxDim: 1024, quality: 85 },
        2, // max 2 attachments
        !!debugChunks
      );
      if (result.changed && debugChunks) {
        console.info(`[Image Strategy Attachment Reconcile] Imported attachments from SSOT`);
      }
    } catch (e) {
      // Best-effort: reconcile failures must not abort generation.
      if (debugChunks) console.warn('[Image Strategy Attachment Reconcile] Error:', (e as Error).message);
    }
  }

  /**
   * @deprecated Preview generation is now handled by importAttachmentBatch in
   * localReconcileAttachments. Kept for compatibility; intentionally a no-op.
   * New preview naming: preview-<origin> (e.g., preview-1766100380042 - 811.jpg).
   */
  private async localBackfillAnalysisPreviews(context: GenerationContext, shouldUseFilesApi: boolean) {
    // DEPRECATED: Preview generation is now handled by importAttachmentBatch in localReconcileAttachments
    // This method is kept for compatibility but does nothing
    // The new preview naming is: preview-<origin> (e.g., preview-1766100380042 - 811.jpg)
  }

  /**
   * Intentional no-op: signature injection is handled in generate() by
   * passing the stored signatures into toGeminiMessages(), so there is
   * nothing left to do on the built contents here.
   */
  protected override modifyContents(contents: any[], caps: any) {
    // No-op by design — see doc comment above.
  }

  /**
   * Enables thought streaming (includeThoughts) and, when the model exposes
   * discrete thinking levels, applies the configured "thinkingLevel".
   * Also forwards responseModalities from capabilities when present.
   */
  protected override modifyGenerationConfig(generateContent: any, context: GenerationContext, caps: any) {
    const { pluginConfig } = context;
    const thinkingConfig: any = {
      includeThoughts: true,
    };
    // Only add thinkingLevel if the model supports specific levels (gemini-3-pro-preview does, gemini-3-pro-image-preview does not)
    if (caps.thinking?.levels && caps.thinking.levels.length > 0) {
      const thinkingLevel = pluginConfig.get("thinkingLevel");
      thinkingConfig.thinkingLevel = thinkingLevel;
    }
    // Merge, preserving any generationConfig already set upstream.
    generateContent.generationConfig = {
      ...(generateContent.generationConfig || {}),
      thinkingConfig
    };
    if (caps.responseModalities) {
      generateContent.generationConfig.responseModalities = caps.responseModalities;
    }
  }
}