// src/toolsProvider.ts
/**
* Document Parser Plugin — toolsProvider
*
* Tools:
* parse_document — extract full text from PDF/DOCX/spreadsheet/txt
* search_document — keyword/regex scan through a document
*/
import { text, tool, type Tool, type ToolCallContext, type ToolsProvider } from "@lmstudio/sdk";
import { readFile, stat } from "fs/promises";
import { resolve, extname } from "path";
import { z } from "zod";
import { pluginConfigSchematics } from "./config";
import { detectRagPeer } from "./peers";
import pdfParse from "pdf-parse";
import mammoth from "mammoth";
import * as XLSX from "xlsx";
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Serialises a value as pretty-printed JSON with a two-space indent.
 *
 * @param obj - The value to serialise.
 * @returns The indented JSON text produced by `JSON.stringify`.
 */
function json(obj: unknown): string {
  const indentWidth = 2;
  const serialised = JSON.stringify(obj, null, indentWidth);
  return serialised;
}
/**
 * Wraps a tool implementation so that failures are returned as JSON text
 * instead of being thrown.
 *
 * - If the call's abort signal is already set when the wrapper is invoked,
 *   `fn` is not run and a compact `{ tool_error, tool, error: "cancelled" }`
 *   payload is returned.
 * - If `fn` throws or rejects, an indented payload carrying the error message
 *   (or the stringified thrown value) is returned.
 *
 * @param name - Tool name echoed in the `tool` field of error payloads.
 * @param fn - The implementation to guard.
 * @returns A function with the same signature as `fn`.
 */
function safe_impl<T extends Record<string, unknown>>(
  name: string,
  fn: (params: T, ctx: ToolCallContext) => Promise<string>
): (params: T, ctx: ToolCallContext) => Promise<string> {
  // Non-Error throwables (strings, objects, ...) are reported via String().
  const describeFailure = (cause: unknown): string =>
    cause instanceof Error ? cause.message : String(cause);

  const guarded = async (params: T, ctx: ToolCallContext): Promise<string> => {
    const alreadyCancelled = ctx.signal.aborted;
    if (alreadyCancelled) {
      return JSON.stringify({ tool_error: true, tool: name, error: "cancelled" });
    }

    try {
      const outcome = await fn(params, ctx);
      return outcome;
    } catch (failure: unknown) {
      const payload = { tool_error: true, tool: name, error: describeFailure(failure) };
      return JSON.stringify(payload, null, 2);
    }
  };

  return guarded;
}
/**
 * Resolves a caller-supplied path and validates that it points at a regular
 * file no larger than the configured limit.
 *
 * When `workspace` is non-empty, `filePath` is resolved against it; otherwise
 * it is resolved on its own via `path.resolve`. An absolute `filePath` wins in
 * either case, which is how absolute paths are supported.
 *
 * @param filePath - Relative or absolute path supplied by the tool caller.
 * @param workspace - Base directory for relative paths; empty means "none".
 * @param maxMb - Maximum permitted size, in units of 1024 * 1024 bytes.
 * @returns The resolved absolute path.
 * @throws Error if the path is not a regular file or the file exceeds `maxMb`.
 *   Errors raised by `stat` (for example a missing file) propagate unchanged.
 */
async function resolvePath(filePath: string, workspace: string, maxMb: number): Promise<string> {
  const fullPath = workspace ? resolve(workspace, filePath) : resolve(filePath);
  const info = await stat(fullPath);

  // Directories (including an empty path that resolves to the workspace itself)
  // and other non-file entries would otherwise slip through the size check and
  // fail later inside a parser with a far less helpful error.
  if (!info.isFile()) {
    throw new Error(`Not a regular file: ${fullPath}`);
  }

  const sizeMb = info.size / (1024 * 1024);
  if (sizeMb > maxMb) {
    throw new Error(`File is ${sizeMb.toFixed(1)} MB, exceeds limit of ${maxMb} MB`);
  }
  return fullPath;
}
/**
 * Renders a worksheet as a Markdown table.
 *
 * The first row becomes the table header, followed by a `---` separator row
 * and one table row per remaining sheet row. Cell values are stringified with
 * `String()`. Characters that would break the table layout are neutralised:
 * `|` is escaped as `\|` and in-cell line breaks become `<br>`.
 *
 * @param sheet - The worksheet to render.
 * @returns The Markdown table, or `"(empty sheet)"` when the sheet has no rows.
 */
function sheetToMarkdown(sheet: XLSX.WorkSheet): string {
  // `header: 1` asks for an array of row arrays; `defval: ""` supplies a value
  // for blank cells. Cells may hold numbers, booleans, etc., hence `unknown`.
  const rows = XLSX.utils.sheet_to_json(sheet, { header: 1, defval: "" }) as unknown[][];
  if (rows.length === 0) return "(empty sheet)";

  // A raw "|" would add a column and a raw newline would split the row, so
  // both are rewritten before the cell is placed into the table.
  const escapeCell = (value: unknown): string =>
    String(value).replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, "<br>");
  const toRow = (cells: readonly unknown[]): string =>
    "| " + cells.map(escapeCell).join(" | ") + " |";

  const header = rows[0];
  const sep = header.map(() => "---");
  const body = rows.slice(1);
  return [toRow(header), toRow(sep), ...body.map(toRow)].join("\n");
}
/**
 * Reads a file and returns its text according to `format`.
 *
 * - `"pdf"`: the file is read into a buffer and parsed with `pdfParse`; the
 *   result is prefixed with a `Pages: N` line.
 * - `"docx"`: raw text extracted by `mammoth`.
 * - `"spreadsheet"`: one `## Sheet: <name>` section per sheet, each rendered
 *   by `sheetToMarkdown`, joined with `---` rules.
 * - anything else (`"txt"`): the file decoded as UTF-8.
 *
 * @param fullPath - Path of the file to read.
 * @param format - Which extractor to use.
 * @param sheetName - Spreadsheets only: blank = all sheets, non-blank = that
 *   single sheet only.
 * @returns The extracted text.
 * @throws Error if `sheetName` names a sheet that is not in the workbook.
 */
async function extractText(
  fullPath: string,
  format: "pdf" | "docx" | "spreadsheet" | "txt",
  sheetName = ""
): Promise<string> {
  switch (format) {
    case "pdf": {
      const bytes = await readFile(fullPath);
      const parsed = await pdfParse(bytes);
      return `Pages: ${parsed.numpages}\n\n${parsed.text}`;
    }

    case "docx": {
      const extracted = await mammoth.extractRawText({ path: fullPath });
      return extracted.value;
    }

    case "spreadsheet": {
      const workbook = XLSX.readFile(fullPath);
      const wanted = sheetName ? [sheetName] : workbook.SheetNames;
      const rendered: string[] = [];
      for (const title of wanted) {
        const sheet = workbook.Sheets[title];
        if (!sheet) {
          throw new Error(`Sheet "${title}" not found. Available: ${workbook.SheetNames.join(", ")}`);
        }
        rendered.push(`## Sheet: ${title}\n\n${sheetToMarkdown(sheet)}`);
      }
      return rendered.join("\n\n---\n\n");
    }

    default:
      // Plain text fallback.
      return readFile(fullPath, "utf-8");
  }
}
/**
 * Maps a file extension to the extractor format used for it.
 *
 * The comparison is exact, so callers are expected to pass the extension
 * with its leading dot and already lower-cased.
 *
 * @param ext - File extension including the dot, e.g. `".pdf"`.
 * @returns `"pdf"`, `"docx"` or `"spreadsheet"` for known extensions;
 *   `"txt"` for everything else.
 */
function detectFormat(ext: string): "pdf" | "docx" | "spreadsheet" | "txt" {
  switch (ext) {
    case ".pdf":
      return "pdf";
    case ".docx":
      return "docx";
    case ".xlsx":
    case ".xls":
    case ".ods":
    case ".csv":
      return "spreadsheet";
    default:
      return "txt";
  }
}
// ---------------------------------------------------------------------------
// Tools
// ---------------------------------------------------------------------------
/**
 * Tools provider for the document parser plugin.
 *
 * Runs RAG-peer detection, reads the plugin configuration and returns two tools:
 *
 * - `parse_document`: extracts a file's text (optionally a single spreadsheet
 *   sheet) and truncates it to `max_chars`.
 * - `search_document`: extracts a file's text and scans it line by line for a
 *   literal string or a `/regex/flags` pattern, returning each hit with
 *   surrounding context lines.
 *
 * Both implementations are wrapped in `safe_impl`, so failures come back to the
 * caller as JSON error payloads rather than thrown exceptions.
 */
export const toolsProvider: ToolsProvider = async (ctl) => {
  await detectRagPeer(ctl as unknown as { client: any });
  const cfg = ctl.getPluginConfig(pluginConfigSchematics);

  return [
    tool({
      name: "parse_document",
      description: text`
Extract text content from a local file.
format: "auto" (detect from extension), "pdf", "docx", "spreadsheet", "txt"
Returns plain text. Spreadsheets return one Markdown table per sheet.
Set sheet_name to target a single sheet (spreadsheet only).
Set max_chars to truncate output (default 80000 — roughly 60 pages).
`,
      parameters: {
        path: z.string().describe("Relative (within workspace) or absolute file path"),
        format: z.enum(["auto", "pdf", "docx", "spreadsheet", "txt"]).default("auto"),
        sheet_name: z.string().default("").describe("Target sheet name (spreadsheets only; blank = all sheets)"),
        max_chars: z.coerce.number().int().min(1000).max(500000).default(80000),
      },
      implementation: safe_impl("parse_document", async ({ path, format, sheet_name, max_chars }, ctx) => {
        ctx.status(`Resolving ${path}`);
        const fullPath = await resolvePath(path, cfg.get("workspacePath"), cfg.get("maxFileSizeMb"));
        const ext = extname(fullPath).toLowerCase();
        const detected = format === "auto" ? detectFormat(ext) : format;

        ctx.status(`Parsing as ${detected}`);
        let output = await extractText(fullPath, detected, sheet_name);
        if (output.length > max_chars) {
          // The notice is built from the untruncated length before `output` is reassigned.
          output = output.slice(0, max_chars) + `\n\n[truncated — ${output.length} total chars, showing first ${max_chars}]`;
        }
        // `chars` is the length of the returned content, including any truncation notice.
        return json({ path: fullPath, format: detected, chars: output.length, content: output });
      }),
    }),

    tool({
      name: "search_document",
      description: text`
Search for a keyword or regex pattern inside a document (PDF/DOCX/spreadsheet/txt).
Returns matching lines with surrounding context.
pattern: literal string or JavaScript regex (e.g. "/\\d{4}-\\d{2}-\\d{2}/i")
context_lines: number of lines before/after each match to include (default 2)
max_matches: stop after this many matches (default 50)
`,
      parameters: {
        path: z.string().describe("File path (relative to workspace or absolute)"),
        pattern: z.string().describe("Search string or /regex/flags"),
        format: z.enum(["auto", "pdf", "docx", "spreadsheet", "txt"]).default("auto"),
        context_lines: z.coerce.number().int().min(0).max(10).default(2),
        max_matches: z.coerce.number().int().min(1).max(500).default(50),
      },
      implementation: safe_impl("search_document", async ({ path, pattern, format, context_lines, max_matches }, ctx) => {
        ctx.status(`Loading ${path}`);
        const fullPath = await resolvePath(path, cfg.get("workspacePath"), cfg.get("maxFileSizeMb"));
        const ext = extname(fullPath).toLowerCase();
        const detected = format === "auto" ? detectFormat(ext) : format;
        const rawText = await extractText(fullPath, detected, "");

        // Split on LF or CRLF. Splitting on "\n" alone leaves a trailing "\r" on
        // every line of a CRLF file, which defeats `$`-anchored patterns and
        // leaks "\r" into the returned text and context.
        const lines = rawText.split(/\r?\n/);

        // "/body/flags" is compiled as a regex; anything else is matched literally.
        // A regex given without flags falls back to "gi", same as literal patterns.
        // NOTE(review): the pattern is caller-supplied and compiled as-is, so a
        // pathological regex can be slow on long lines.
        let regex: RegExp;
        const reMatch = pattern.match(/^\/(.+)\/([gimsuy]*)$/);
        if (reMatch) {
          regex = new RegExp(reMatch[1], reMatch[2] || "gi");
        } else {
          regex = new RegExp(pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "gi");
        }

        ctx.status(`Scanning ${lines.length} lines for "${pattern}"`);
        const matches: Array<{ line: number; text: string; context: string[] }> = [];
        for (let i = 0; i < lines.length && matches.length < max_matches; i++) {
          // With the g/y flags `test` is stateful; start every line from offset 0.
          regex.lastIndex = 0;
          if (!regex.test(lines[i])) continue;

          const start = Math.max(0, i - context_lines);
          const end = Math.min(lines.length - 1, i + context_lines);
          matches.push({
            line: i + 1, // 1-based line number
            text: lines[i],
            context: lines.slice(start, end + 1),
          });
        }
        return json({ path: fullPath, pattern, total_lines: lines.length, matches_found: matches.length, matches });
      }),
    }),
  ];
};