// src/vlClient.ts
// Picks a vision-language model loaded in LM Studio and runs per-page transcription.
import type { LMStudioClient, LLM } from "@lmstudio/sdk";
/**
 * System message sent with every page request. It fixes the model's role
 * (verbatim transcription of a page image to Markdown), lists the output
 * rules, and forbids any text other than the page's Markdown.
 *
 * The text is assembled from three sections separated by a blank line:
 * the role statement, the rule list (one array entry per prompt line), and
 * the closing instruction.
 */
export const SYSTEM_PROMPT = [
  // Role statement: fragments are joined with single spaces into one paragraph.
  [
    "You are a meticulous document-transcription engine. Your only job is to",
    "look at an image of a document page and reproduce its visible textual",
    "content as clean Markdown. You DO NOT summarise, paraphrase, translate,",
    "or comment. You DO NOT invent content that is not visible. If a region",
    "is illegible, write `[illisible]` and continue.",
  ].join(" "),
  // Rule list: each entry is one line of the prompt; continuation lines keep
  // their leading space so the emitted text is unchanged.
  [
    "Output rules:",
    " • Use Markdown (#, ##, **bold**, *italic*, lists, > quotes, tables).",
    " • Preserve the reading order: top-to-bottom, then left column → right column.",
    " • Tables: try to render them with `|` syntax. If too complex, fall back",
    " to a description in italics like *(tableau 4 colonnes × 8 lignes)*",
    " followed by row-by-row text.",
    " • Mathematical formulas: render in LaTeX between `$...$` (inline) or",
    " `$$...$$` (block).",
    " • Page numbers, running headers, running footers: include them on first",
    " occurrence, then a tag `<!-- header-repeat -->` on subsequent pages.",
    " • Hand-written annotations: transcribe between `<!-- handwritten: ... -->`.",
    " • If the image is blank or contains no readable text: output exactly the",
    " string `[page vide]`.",
  ].join("\n"),
  // Closing instruction: no framing text around the transcription.
  [
    "Output ONLY the markdown of the page. No preamble, no explanation, no",
    '"Voici la transcription :".',
  ].join(" "),
].join("\n\n");
/** Values interpolated into the per-page user message. */
export type BuildPromptArgs = {
  /** Page number printed in the "Page X of Y." line. */
  pageNumber: number;
  /** Page count printed in the "Page X of Y." line. */
  totalPages: number;
  /** Text printed after "Document language:". */
  language: string;
  /** Text printed after "Style:". */
  style: string;
  /** Optional free-form hint; omitted from the prompt when blank. */
  hint?: string;
};

/**
 * Builds the user message that accompanies a page image.
 *
 * Layout: an instruction line, a blank line, the language / style / page
 * lines, an optional "Additional hint:" line (only when the trimmed hint is
 * non-empty), a blank line, and a closing "(Image attached.)" marker.
 *
 * @param args Values to interpolate; see {@link BuildPromptArgs}.
 * @returns The prompt text, lines separated by "\n", with no trailing newline.
 */
export function buildUserPrompt(args: BuildPromptArgs): string {
  const { pageNumber, totalPages, language, style, hint } = args;

  const details = [
    `Document language: ${language}`,
    `Style: ${style}`,
    `Page ${pageNumber} of ${totalPages}.`,
  ];

  // A missing, empty, or whitespace-only hint contributes no line at all.
  const cleanedHint = hint?.trim() ?? "";
  if (cleanedHint !== "") {
    details.push(`Additional hint: ${cleanedHint}`);
  }

  return `Transcribe this page into Markdown.\n\n${details.join("\n")}\n\n(Image attached.)`;
}
/** A model handle paired with the identifier it should be reported under. */
export type PickedModel = {
  /** Handle used to run predictions. */
  model: LLM;
  /** The trimmed override, or the identifier reported by the auto-selected model. */
  identifier: string;
};

/**
 * Chooses the model used for transcription.
 *
 * - When `override` contains a non-blank identifier, that model is requested
 *   through `client.llm.model(...)` and returned as-is: the user's choice is
 *   trusted and its vision capability is NOT checked.
 * - Otherwise the currently loaded LLMs are listed and the first one whose
 *   `vision` flag is `true` is returned.
 *
 * Expected failures (override lookup rejected, nothing loaded, no loaded
 * model with vision) are reported as `{ error }` with a user-facing message.
 * Rejections from `listLoaded()` or `getModelInfo()` are not caught here and
 * propagate to the caller.
 *
 * @param client LM Studio client used to look up models.
 * @param override Model identifier from the settings; blank means auto-select.
 * @returns The picked model, or `{ error }` describing why none was picked.
 */
export async function pickVisionModel(
  client: LMStudioClient,
  override: string,
): Promise<PickedModel | { error: string }> {
  // Trim once so the lookup, the returned identifier, and the error message
  // all refer to the same string (the message previously quoted the raw value).
  const requestedId = override ? override.trim() : "";

  if (requestedId !== "") {
    try {
      const model = await client.llm.model(requestedId);
      return { model, identifier: requestedId };
    } catch (e: unknown) {
      const reason = e instanceof Error ? e.message : String(e);
      return {
        error:
          `Failed to access the configured vision model "${requestedId}". ` +
          `Check the identifier in the plugin settings or load the model in LM Studio. ` +
          `(${reason})`,
      };
    }
  }

  const loaded = await client.llm.listLoaded();
  if (loaded.length === 0) {
    return {
      error:
        "No LLM is currently loaded in LM Studio. Load a vision-language model " +
        "(e.g. Qwen3-VL, Mistral-Small-VL, Llama-3.2-Vision) and try again.",
    };
  }

  // First match wins; the order is whatever listLoaded() returned.
  const visionModel = loaded.find((candidate: LLM) => candidate.vision === true);
  if (!visionModel) {
    return {
      error:
        "None of the currently loaded LLMs has vision capabilities. Load a " +
        "vision-language model (e.g. Qwen3-VL, Mistral-Small-VL, Llama-3.2-Vision) and try again.",
    };
  }

  // Prefer the instance identifier, then the model key, then a fixed placeholder.
  const info = await visionModel.getModelInfo();
  return { model: visionModel, identifier: info.identifier ?? info.modelKey ?? "(loaded VL)" };
}
/** Inputs for transcribing a single page image. */
export type TranscribeArgs = {
  /** LM Studio client; used here to register the page image. */
  client: LMStudioClient;
  /** Model handle that produces the transcription. */
  model: LLM;
  /** Page image, base64-encoded; registered under a `.png` file name. */
  pngBase64: string;
  /** Page number; used in the image file name and in the prompt. */
  pageNumber: number;
  /** Page count shown in the prompt. */
  totalPages: number;
  /** Document language shown in the prompt. */
  language: string;
  /** Transcription style shown in the prompt. */
  style: string;
  /** Optional free-form hint forwarded to the prompt builder. */
  hint?: string;
  /** Passed to the prediction call as its `signal` option. */
  abortSignal: AbortSignal;
  /** Sampling temperature; defaults to 0.1 when omitted. */
  temperature?: number;
  /** Cap on generated tokens; defaults to 4096 when omitted. */
  maxTokens?: number;
};

/**
 * Transcribes one page image to Markdown.
 *
 * Steps: register the base64 image with the client as `page-<n>.png`, build
 * the user prompt, send a system + user (with image) conversation to the
 * model, and return the response content with surrounding whitespace removed.
 *
 * `temperature` and `maxTokens` may be supplied by the caller (for example to
 * raise the token cap for a dense page); when omitted the previous fixed
 * values, 0.1 and 4096, are used.
 *
 * Rejections from the image registration or the prediction call are not
 * caught here and propagate to the caller.
 *
 * @param args See {@link TranscribeArgs}.
 * @returns The trimmed response text.
 */
export async function transcribePage(args: TranscribeArgs): Promise<string> {
  // `??` rather than `||` so an explicit temperature of 0 is honoured.
  const temperature = args.temperature ?? 0.1;
  const maxTokens = args.maxTokens ?? 4096;

  const fileName = `page-${args.pageNumber}.png`;
  const handle = await args.client.files.prepareImageBase64(fileName, args.pngBase64);

  const userText = buildUserPrompt({
    pageNumber: args.pageNumber,
    totalPages: args.totalPages,
    language: args.language,
    style: args.style,
    hint: args.hint,
  });

  const result = await args.model.respond(
    [
      { role: "system", content: SYSTEM_PROMPT },
      { role: "user", content: userText, images: [handle] },
    ],
    {
      temperature,
      maxTokens,
      signal: args.abortSignal,
    },
  );
  return result.content.trim();
}