Project Files
src
configSchematics.ts
index.ts
markdown.ts
pdfRender.ts
toolsProvider.ts
vlClient.ts
.gitignore
LICENSE.md
manifest.json
package-lock.json
package.json
README.md
tsconfig.json
src / vlClient.ts
// Picks a vision-language model loaded in LM Studio and runs per-page transcription.

import type { LMStudioClient, LLM } from "@lmstudio/sdk";

/**
 * System prompt used for page transcription.
 *
 * Layout of the text: one instruction paragraph, a blank line, the
 * "Output rules:" bullet list (continuation lines are indented four spaces),
 * a blank line, and a closing "output only the markdown" sentence.
 *
 * The literal markers the model is told to emit (`[illisible]`,
 * `[page vide]`, `<!-- header-repeat -->`, `<!-- handwritten: ... -->`) are
 * part of the prompt text and must be kept verbatim.
 */
export const SYSTEM_PROMPT = [
  // Instruction paragraph: sentences are joined by single spaces.
  [
    "You are a meticulous document-transcription engine.",
    "Your only job is to look at an image of a document page and reproduce its visible textual content as clean Markdown.",
    "You DO NOT summarise, paraphrase, translate, or comment.",
    "You DO NOT invent content that is not visible.",
    "If a region is illegible, write `[illisible]` and continue.",
  ].join(" "),
  "",
  "Output rules:",
  "  • Use Markdown (#, ##, **bold**, *italic*, lists, > quotes, tables).",
  "  • Preserve the reading order: top-to-bottom, then left column → right column.",
  "  • Tables: try to render them with `|` syntax. If too complex, fall back",
  "    to a description in italics like *(tableau 4 colonnes × 8 lignes)*",
  "    followed by row-by-row text.",
  "  • Mathematical formulas: render in LaTeX between `$...$` (inline) or",
  "    `$$...$$` (block).",
  "  • Page numbers, running headers, running footers: include them on first",
  "    occurrence, then a tag `<!-- header-repeat -->` on subsequent pages.",
  "  • Hand-written annotations: transcribe between `<!-- handwritten: ... -->`.",
  "  • If the image is blank or contains no readable text: output exactly the",
  "    string `[page vide]`.",
  "",
  // Closing sentence: forbids any preamble around the transcription.
  [
    "Output ONLY the markdown of the page.",
    "No preamble, no explanation, no",
    '"Voici la transcription :".',
  ].join(" "),
].join("\n");

/** Inputs for the per-page user message built by `buildUserPrompt`. */
export type BuildPromptArgs = {
  /** Page number printed in the "Page X of Y." line. */
  pageNumber: number;
  /** Page count printed in the "Page X of Y." line. */
  totalPages: number;
  /** Value printed after "Document language:". */
  language: string;
  /** Value printed after "Style:". */
  style: string;
  /** Optional free-text hint; left out when missing, empty or whitespace-only. */
  hint?: string;
};

/**
 * Builds the text part of the user message for one page.
 *
 * The result is a newline-separated block: the instruction line, a blank
 * line, the language / style / page-position lines, an optional
 * "Additional hint:" line (hint is trimmed first), a blank line, and the
 * "(Image attached.)" marker.
 *
 * @param args - Page position, language, style and optional hint.
 * @returns The assembled prompt text.
 */
export function buildUserPrompt(args: BuildPromptArgs): string {
  const { pageNumber, totalPages, language, style } = args;
  const trimmedHint = args.hint?.trim() ?? "";
  // Zero or one extra line, so it can be spread straight into the list below.
  const hintLines = trimmedHint.length > 0 ? [`Additional hint: ${trimmedHint}`] : [];

  return [
    "Transcribe this page into Markdown.",
    "",
    `Document language: ${language}`,
    `Style: ${style}`,
    `Page ${pageNumber} of ${totalPages}.`,
    ...hintLines,
    "",
    "(Image attached.)",
  ].join("\n");
}

/** A resolved model handle together with the identifier used to report it. */
export type PickedModel = {
  /** Handle used to run predictions. */
  model: LLM;
  /** Identifier of the chosen model (the trimmed override, or the one read from model info). */
  identifier: string;
};

/** Turns a caught value into text suitable for embedding in an error message. */
function describeError(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/**
 * Returns the LLM to use, or `{ error }` if no suitable model is available.
 *
 * - If `override` is non-empty after trimming, the user is trusted: the model
 *   is requested by that identifier and no vision check is performed.
 * - Otherwise the loaded LLMs are listed and the first one whose `vision`
 *   flag is `true` is taken.
 *
 * Every failure is reported through the `{ error }` branch, including
 * failures while listing loaded models or reading model info, so callers
 * only have one failure path to handle.
 *
 * @param client - LM Studio client used to look up models.
 * @param override - Model identifier configured by the user; empty or
 *   whitespace-only means "auto-detect".
 * @returns The picked model and its identifier, or an object carrying a
 *   user-facing error message.
 */
export async function pickVisionModel(
  client: LMStudioClient,
  override: string,
): Promise<PickedModel | { error: string }> {
  // Trim once so the lookup, the returned identifier and the error message
  // all talk about the same string.
  const requested = override ? override.trim() : "";
  if (requested) {
    try {
      const m = await client.llm.model(requested);
      return { model: m, identifier: requested };
    } catch (e: unknown) {
      return {
        error:
          `Failed to access the configured vision model "${requested}". ` +
          `Check the identifier in the plugin settings or load the model in LM Studio. ` +
          `(${describeError(e)})`,
      };
    }
  }

  try {
    const loaded = await client.llm.listLoaded();
    if (loaded.length === 0) {
      return {
        error:
          "No LLM is currently loaded in LM Studio. Load a vision-language model " +
          "(e.g. Qwen3-VL, Mistral-Small-VL, Llama-3.2-Vision) and try again.",
      };
    }
    const visionModel = loaded.find((m) => m.vision === true);
    if (!visionModel) {
      return {
        error:
          "None of the currently loaded LLMs has vision capabilities. Load a " +
          "vision-language model (e.g. Qwen3-VL, Mistral-Small-VL, Llama-3.2-Vision) and try again.",
      };
    }
    const info = await visionModel.getModelInfo();
    return { model: visionModel, identifier: info.identifier ?? info.modelKey ?? "(loaded VL)" };
  } catch (e: unknown) {
    // Mirror the override branch: surface lookup failures as { error }
    // instead of letting them escape as exceptions.
    return {
      error:
        `Failed to query the models loaded in LM Studio. ` +
        `Check that LM Studio is running and try again. ` +
        `(${describeError(e)})`,
    };
  }
}

/** Everything `transcribePage` needs to transcribe one page image. */
export type TranscribeArgs = {
  /** Client used to register the page image before prediction. */
  client: LMStudioClient;
  /** Model handle that produces the transcription. */
  model: LLM;
  /** Base64 content of the page image; it is registered under a `.png` file name. */
  pngBase64: string;
  /** Page number, used in the image file name and in the prompt. */
  pageNumber: number;
  /** Page count, forwarded to the prompt. */
  totalPages: number;
  /** Document language, forwarded to the prompt. */
  language: string;
  /** Transcription style, forwarded to the prompt. */
  style: string;
  /** Optional free-text hint, forwarded to the prompt. */
  hint?: string;
  /** Signal passed to the prediction call as its `signal` option. */
  abortSignal: AbortSignal;
};

/**
 * Transcribes a single page image to Markdown.
 *
 * Steps: register the base64 image with the client as `page-<n>.png`, build
 * the user prompt, then ask the model to respond to a two-message chat
 * (system prompt + user text with the image attached).
 *
 * @param args - Client, model, image data, prompt inputs and abort signal.
 * @returns The model's response content with surrounding whitespace trimmed.
 */
export async function transcribePage(args: TranscribeArgs): Promise<string> {
  const { client, model, pngBase64, pageNumber, totalPages, language, style, hint, abortSignal } =
    args;

  const imageHandle = await client.files.prepareImageBase64(`page-${pageNumber}.png`, pngBase64);
  const prompt = buildUserPrompt({ pageNumber, totalPages, language, style, hint });

  // Messages stay inline so the `role` literals keep their contextual typing.
  const { content } = await model.respond(
    [
      { role: "system", content: SYSTEM_PROMPT },
      { role: "user", content: prompt, images: [imageHandle] },
    ],
    // Low sampling temperature (0.1) and a 4096-token cap on the response.
    { temperature: 0.1, maxTokens: 4096, signal: abortSignal },
  );
  return content.trim();
}
read-pdf