// src/toolsProvider.ts
import { tool, text, type ToolsProviderController } from "@lmstudio/sdk";
import { z } from "zod";
import * as fs from "node:fs";
import * as path from "node:path";
import { configSchematics } from "./configSchematics";
import { openPdf, parsePageRange } from "./pdfRender";
import { pickVisionModel, transcribePage } from "./vlClient";
import { concatPages } from "./markdown";
// Tag placed at the start of every console.log line emitted by this module.
const LOG_PREFIX = "[read-pdf]";
/**
 * Tools provider for the PDF-reading plugin.
 *
 * Reads the plugin configuration once per call and returns a single tool,
 * `read_pdf_to_markdown`. The tool rasterises the requested pages of a PDF,
 * sends each page image to a vision-language model for transcription, and
 * concatenates the per-page results into one Markdown string.
 *
 * Tool result contract:
 * - failure: `{ error: string }`
 * - success (possibly partial): `{ markdown, pages_processed, pages_failed,
 *   aborted, total_chars, ms_total, ms_per_page, warnings }`
 *
 * @param ctl - Controller handed in by the host. Used for the plugin config,
 *   the client passed to the model helpers, and the plugin-level abort signal.
 * @returns A one-element array containing the `read_pdf_to_markdown` tool.
 */
export async function toolsProvider(ctl: ToolsProviderController) {
  // Settings are captured here, outside the tool implementation, so every
  // invocation of the tool created by this call sees the same values.
  const config = ctl.getPluginConfig(configSchematics);
  const defaultRenderScale = config.get("defaultRenderScale");
  const maxPages = config.get("maxPages");
  const maxFileSizeMb = config.get("maxFileSizeMb");
  const vlModelOverride = config.get("vlModelOverride");
  const defaultLanguage = config.get("defaultLanguage");
  const transcriptionStyle = config.get("transcriptionStyle");
  const verboseLogging = config.get("verboseLogging");

  // Files above this size are still processed, but a warning is emitted.
  const LARGE_FILE_WARN_MB = 50;

  const readPdfToMarkdown = tool({
    name: "read_pdf_to_markdown",
    description: text`
      Read a PDF file and transcribe its visible content into Markdown by
      rasterising each page to an image and asking the currently-loaded
      vision-language model (VL) to transcribe it.
      Use this tool when the user asks you to:
      • read / open / résumer / analyser / fiche / réviser un PDF
      • extract text from a scanned document
      • convert a PDF lesson, contract, invoice, or article to markdown
      How it works (so you set expectations correctly):
      • Each page becomes a PNG, then is sent to the VL with a transcription prompt.
      • Quality scales with the VL: a 2B vision model gives rough text;
      a 7-8B vision model approximates a real OCR; a 30B+ vision model
      rivals dedicated tools.
      • Speed: ~5-30 seconds per page on consumer hardware. A 30-page lesson
      can take several minutes — warn the user before launching.
      • Tables and complex layouts may be approximated, not perfect.
      • Everything stays local — no network call leaves the machine.
      Typical workflow: call read_pdf_to_markdown(path), then if the user wants
      the result saved, hand the resulting markdown to the filesystem plugin's
      write_file tool. You don't need to read it all back to the user verbatim
      — a one-paragraph summary plus "saved to <path>" is usually right.
      Hard limits: maxPages and maxFileSizeMb (configured in plugin settings).
      PDFs that are encrypted with a password will be refused.
    `,
    parameters: {
      path: z
        .string()
        .min(1)
        .describe("Absolute path to the .pdf file on the user's machine."),
      pages: z
        .string()
        .optional()
        .describe(
          "Optional page range, e.g. '1-3', '5', '1,3,5-7'. Defaults to all pages.",
        ),
      page_render_scale: z
        .number()
        .min(0.5)
        .max(4)
        .optional()
        .describe(
          "Rasterisation scale factor passed to pdf.js. Defaults to the configured value (~2.0). Lower = faster but worse OCR; higher = slower but cleaner.",
        ),
      transcription_hint: z
        .string()
        .optional()
        .describe(
          "Free-form hint added to the per-page prompt. Use it to bias the VL: 'this is a French contract, preserve clause numbering' or 'these are math lecture notes, render formulas in LaTeX'.",
        ),
      include_page_separators: z
        .boolean()
        .optional()
        .describe(
          "Insert a '# Page N' separator between pages. Defaults to true.",
        ),
    },
    implementation: async (args, ctx) => {
      const t0 = Date.now();

      // Warnings are both surfaced live through ctx.warn and returned in the
      // final result so the caller can report them afterwards.
      const warnings: string[] = [];
      const pushWarn = (msg: string) => {
        warnings.push(msg);
        ctx.warn(msg);
      };

      // Cancellation may come from this tool call or from the plugin
      // controller; both signals are consulted at every checkpoint.
      const isAborted = () => ctx.signal.aborted || ctl.abortSignal.aborted;

      ctx.status("Reading PDF…");

      // 8.1 — entry guards
      if (!path.isAbsolute(args.path)) {
        return { error: `Path must be absolute, received: ${args.path}` };
      }
      if (!fs.existsSync(args.path)) {
        return { error: `File not found: ${args.path}` };
      }
      if (path.extname(args.path).toLowerCase() !== ".pdf") {
        return { error: `Expected a .pdf file, got: ${args.path}` };
      }

      // The file can disappear or be unreadable between the existence check
      // and these calls, so filesystem errors are reported as tool errors
      // instead of escaping as exceptions.
      let stat;
      try {
        stat = fs.statSync(args.path);
      } catch (e: unknown) {
        return { error: `Cannot access file: ${errorMessage(e)}` };
      }
      if (!stat.isFile()) {
        // e.g. a directory whose name happens to end in ".pdf"
        return { error: `Not a regular file: ${args.path}` };
      }

      const sizeMb = stat.size / (1024 * 1024);
      if (sizeMb > maxFileSizeMb) {
        return {
          error:
            `PDF is ${sizeMb.toFixed(1)} MB, exceeds the configured cap of ${maxFileSizeMb} MB. ` +
            `Raise 'Max file size' in the plugin settings to process it.`,
        };
      }
      if (sizeMb > LARGE_FILE_WARN_MB) {
        pushWarn(`Large file: ${sizeMb.toFixed(1)} MB — rendering may be slow.`);
      }

      let buffer;
      try {
        buffer = fs.readFileSync(args.path);
      } catch (e: unknown) {
        return { error: `Failed to read file: ${errorMessage(e)}` };
      }

      // 8.2 — open PDF
      ctx.status(`Opening PDF (${sizeMb.toFixed(1)} MB)…`);
      let doc;
      try {
        doc = await openPdf(buffer);
      } catch (e: unknown) {
        return { error: `Failed to open PDF: ${errorMessage(e)}` };
      }

      // The document is open from here on. Every exit below, whether an early
      // return, the normal return, or a thrown exception, passes through the
      // `finally` so the document is always destroyed exactly once.
      try {
        if (doc.isEncrypted) {
          return { error: "Encrypted PDFs are not supported in this iteration." };
        }
        if (doc.numPages === 0) {
          return { error: "PDF contains zero pages." };
        }

        // Resolve which pages to render. An unparseable spec is not fatal:
        // the tool warns and processes the whole document instead.
        let targetPages: number[];
        if (args.pages) {
          const parsed = parsePageRange(args.pages, doc.numPages);
          if (parsed === null) {
            pushWarn(`Could not parse 'pages' spec "${args.pages}" — falling back to all pages.`);
            targetPages = range(1, doc.numPages);
          } else {
            targetPages = parsed;
          }
        } else {
          targetPages = range(1, doc.numPages);
        }

        // A non-positive maxPages disables the cap.
        if (maxPages > 0 && targetPages.length > maxPages) {
          pushWarn(
            `Truncated from ${targetPages.length} to ${maxPages} pages (plugin 'maxPages' cap).`,
          );
          targetPages = targetPages.slice(0, maxPages);
        }
        const totalPages = targetPages.length;

        // 8.4 — pick VL model
        ctx.status("Selecting vision-language model…");
        const picked = await pickVisionModel(ctl.client, vlModelOverride);
        if ("error" in picked) {
          return { error: picked.error };
        }
        if (verboseLogging) {
          console.log(`${LOG_PREFIX} using model: ${picked.identifier}`);
        }

        // 8.3 + 8.4 — render and transcribe each page sequentially.
        const renderScale = args.page_render_scale ?? defaultRenderScale;
        const includeSeparators = args.include_page_separators ?? true;
        const transcriptions: Array<{ pageNumber: number; markdown: string }> = [];
        const failed: number[] = [];
        const msPerPage: number[] = [];
        let aborted = false;

        for (const [i, pageNumber] of targetPages.entries()) {
          if (isAborted()) {
            aborted = true;
            pushWarn(`Aborted by user at page ${i + 1}/${totalPages}.`);
            break;
          }

          const tPage = Date.now();
          ctx.status(
            `Page ${i + 1}/${totalPages} (PDF p.${pageNumber}) — rendering @ scale ${renderScale}…`,
          );

          try {
            const rendered = await doc.renderPage(pageNumber, renderScale);
            if (verboseLogging) {
              console.log(
                `${LOG_PREFIX} page ${pageNumber}: rendered ${rendered.width}x${rendered.height}, calling VL…`,
              );
            }

            // Re-check between rendering and the model call so a cancel
            // during rendering does not start a transcription.
            if (isAborted()) {
              aborted = true;
              pushWarn(`Aborted by user at page ${i + 1}/${totalPages}.`);
              break;
            }

            ctx.status(
              `Page ${i + 1}/${totalPages} (PDF p.${pageNumber}) — transcribing via ${picked.identifier}…`,
            );
            const md = await transcribePage({
              client: ctl.client,
              model: picked.model,
              pngBase64: rendered.pngBase64,
              pageNumber,
              totalPages,
              language: defaultLanguage,
              style: transcriptionStyle,
              hint: args.transcription_hint,
              abortSignal: ctx.signal,
            });
            transcriptions.push({ pageNumber, markdown: md });

            const dt = Date.now() - tPage;
            msPerPage.push(dt);
            if (verboseLogging) {
              console.log(`${LOG_PREFIX} page ${pageNumber}: ${md.length} chars in ${dt} ms`);
            }
          } catch (e: unknown) {
            msPerPage.push(Date.now() - tPage);

            // An error raised while a cancel is pending is treated as the
            // cancellation itself, not as a page failure.
            if (isAborted()) {
              aborted = true;
              pushWarn(`Aborted by user at page ${i + 1}/${totalPages}.`);
              break;
            }

            // A single bad page does not stop the run; it is recorded and
            // the loop moves on to the next page.
            failed.push(pageNumber);
            const msg = errorMessage(e);
            pushWarn(`Page ${pageNumber} failed: ${msg}`);
            console.log(`${LOG_PREFIX} page ${pageNumber} FAILED: ${msg}`);
          }
        }

        if (transcriptions.length === 0) {
          return {
            error: aborted
              ? "Aborted before any page could be transcribed."
              : failed.length > 0
                ? `All ${failed.length} pages failed transcription. Check that the loaded model is a vision-language model.`
                : "No pages were transcribed.",
          };
        }

        // 8.5 — concat
        ctx.status(
          aborted
            ? `Assembling partial markdown (${transcriptions.length}/${totalPages} pages)…`
            : `Assembling markdown (${transcriptions.length} pages)…`,
        );
        const markdown = concatPages({
          pages: transcriptions,
          includeSeparators,
          cleanRepeatedHeaders: transcriptionStyle === "clean",
        });

        const msTotal = Date.now() - t0;
        ctx.status(
          aborted
            ? `Aborted — ${transcriptions.length}/${totalPages} pages, ${failed.length} failed (${msTotal} ms).`
            : `Done — ${transcriptions.length}/${totalPages} pages, ${failed.length} failed (${msTotal} ms).`,
        );

        return {
          markdown,
          pages_processed: transcriptions.length,
          pages_failed: failed,
          aborted,
          total_chars: markdown.length,
          ms_total: msTotal,
          ms_per_page: msPerPage,
          warnings,
        };
      } finally {
        await doc.destroy();
      }
    },
  });

  return [readPdfToMarkdown];
}

/** Extracts a printable message from a caught value of unknown type. */
function errorMessage(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}
/**
 * Produces the sequence `start, start + 1, …` up to and including `end`.
 * The result is empty when `end` lies below `start`.
 */
function range(start: number, end: number): number[] {
  const count = Math.max(0, end - start + 1);
  return Array.from({ length: count }, (_unused, offset) => start + offset);
}