Project Files
src
config.ts
index.ts
peers.ts
promptPreprocessor.ts
toolsProvider.ts
config.js
index.js
manifest.json
package-lock.json
package.json
peers.js
promptPreprocessor.js
README.md
toolsProvider.js
tsconfig.json
src / toolsProvider.ts
/**
 * Document Parser Plugin — toolsProvider
 *
 * Tools:
 *   parse_document  — extract full text from PDF/DOCX/spreadsheet/txt
 *   search_document — keyword/regex scan through a document
 */

import { text, tool, type Tool, type ToolCallContext, type ToolsProvider } from "@lmstudio/sdk";
import { readFile, stat } from "fs/promises";
import { resolve, extname } from "path";
import { z } from "zod";
import { pluginConfigSchematics } from "./config";
import { detectRagPeer } from "./peers";
import pdfParse from "pdf-parse";
import mammoth from "mammoth";
import * as XLSX from "xlsx";

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Serialize a value as human-readable JSON (two-space indentation).
 *
 * @param obj Any JSON-serializable value.
 * @returns The indented JSON text.
 */
function json(obj: unknown): string {
  const indentWidth = 2;
  const serialized = JSON.stringify(obj, null, indentWidth);
  return serialized;
}

/**
 * Wrap a tool implementation so that it reports problems as a JSON string
 * instead of letting the error escape.
 *
 * - If the call context is already aborted, `fn` is not invoked and a compact
 *   `{ tool_error, tool, error: "cancelled" }` payload is returned.
 * - If `fn` throws or rejects, the error message (or the stringified thrown
 *   value) is returned in an indented payload of the same shape.
 *
 * @param name Tool name echoed back in error payloads.
 * @param fn   The real implementation.
 * @returns A function with the same signature as `fn`.
 */
function safe_impl<T extends Record<string, unknown>>(
  name: string,
  fn: (params: T, ctx: ToolCallContext) => Promise<string>
): (params: T, ctx: ToolCallContext) => Promise<string> {
  const describe = (cause: unknown): string =>
    cause instanceof Error ? cause.message : String(cause);

  async function guarded(params: T, ctx: ToolCallContext): Promise<string> {
    if (!ctx.signal.aborted) {
      try {
        return await fn(params, ctx);
      } catch (cause: unknown) {
        return JSON.stringify({ tool_error: true, tool: name, error: describe(cause) }, null, 2);
      }
    }
    // Already cancelled before we started: skip the implementation entirely.
    return JSON.stringify({ tool_error: true, tool: name, error: "cancelled" });
  }

  return guarded;
}

/**
 * Resolve a caller-supplied path to an absolute path and check that it points
 * at something small enough to parse.
 *
 * @param filePath  Path as given by the caller. It is resolved against
 *                  `workspace` when that is non-empty, otherwise against the
 *                  current working directory.
 * @param workspace Base directory for relative paths; an empty string disables it.
 * @param maxMb     Largest accepted size, in units of 1024 * 1024 bytes.
 * @returns The absolute path.
 * @throws If the path cannot be stat'ed, is a directory, or is larger than `maxMb`.
 */
async function resolvePath(filePath: string, workspace: string, maxMb: number): Promise<string> {
  const fullPath = workspace ? resolve(workspace, filePath) : resolve(filePath);
  const info = await stat(fullPath);
  // A directory has a small `size` and would pass the limit below, only to fail
  // later inside the parser with a far less helpful error.
  if (info.isDirectory()) {
    throw new Error(`Path is a directory, not a file: ${fullPath}`);
  }
  const sizeMb = info.size / (1024 * 1024);
  if (sizeMb > maxMb) {
    throw new Error(`File is ${sizeMb.toFixed(1)} MB, exceeds limit of ${maxMb} MB`);
  }
  return fullPath;
}

/**
 * Render a worksheet as a Markdown table.
 *
 * The first row is used as the table header; every following row becomes a
 * body row. Cell values are stringified, pipes are backslash-escaped and line
 * breaks are replaced by a space so that one sheet row is always exactly one
 * table row.
 *
 * @param sheet Worksheet to render.
 * @returns The Markdown table, or "(empty sheet)" when the sheet has no rows.
 */
function sheetToMarkdown(sheet: XLSX.WorkSheet): string {
  // header: 1 yields an array of arrays; cells may be strings, numbers, booleans, ...
  const rows = XLSX.utils.sheet_to_json(sheet, { header: 1, defval: "" }) as unknown[][];
  if (rows.length === 0) return "(empty sheet)";

  // An unescaped "|" would add a column and a raw newline would split the row.
  const formatCell = (cell: unknown): string =>
    String(cell).replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, " ");
  const toRow = (cells: unknown[]): string => "| " + cells.map(formatCell).join(" | ") + " |";

  const header = rows[0];
  const sep = header.map(() => "---");
  const body = rows.slice(1);
  return [toRow(header), toRow(sep), ...body.map(toRow)].join("\n");
}

// sheetName: blank = all sheets, non-blank = single sheet only
/**
 * Read a file and return its textual content according to `format`.
 *
 * - "pdf": page count line followed by the extracted text.
 * - "docx": raw text extracted by mammoth.
 * - "spreadsheet": one "## Sheet: <name>" section per selected sheet, each
 *   rendered as a Markdown table, sections separated by a horizontal rule.
 * - anything else: the file decoded as UTF-8.
 *
 * @param fullPath  Absolute path of the file to read.
 * @param format    How to interpret the file.
 * @param sheetName Spreadsheet only: restrict output to this sheet.
 * @throws If `sheetName` is given but the workbook has no such sheet, or if
 *         reading/parsing the file fails.
 */
async function extractText(
  fullPath: string,
  format: "pdf" | "docx" | "spreadsheet" | "txt",
  sheetName = ""
): Promise<string> {
  switch (format) {
    case "pdf": {
      const bytes = await readFile(fullPath);
      const parsed = await pdfParse(bytes);
      return `Pages: ${parsed.numpages}\n\n${parsed.text}`;
    }
    case "docx": {
      const { value } = await mammoth.extractRawText({ path: fullPath });
      return value;
    }
    case "spreadsheet": {
      const workbook = XLSX.readFile(fullPath);
      const wanted = sheetName ? [sheetName] : workbook.SheetNames;
      const rendered: string[] = [];
      for (const title of wanted) {
        const sheet = workbook.Sheets[title];
        if (!sheet) {
          throw new Error(`Sheet "${title}" not found. Available: ${workbook.SheetNames.join(", ")}`);
        }
        rendered.push(`## Sheet: ${title}\n\n${sheetToMarkdown(sheet)}`);
      }
      return rendered.join("\n\n---\n\n");
    }
    default:
      return readFile(fullPath, "utf-8");
  }
}

/**
 * Map a file extension (including the leading dot) to a parser format.
 *
 * The comparison is exact, so callers are expected to pass a lower-cased
 * extension. Unrecognized extensions are treated as plain text.
 *
 * @param ext Extension such as ".pdf" or ".xlsx".
 * @returns The format to parse the file as.
 */
function detectFormat(ext: string): "pdf" | "docx" | "spreadsheet" | "txt" {
  switch (ext) {
    case ".pdf":
      return "pdf";
    case ".docx":
      return "docx";
    case ".xlsx":
    case ".xls":
    case ".ods":
    case ".csv":
      return "spreadsheet";
    default:
      return "txt";
  }
}

// ---------------------------------------------------------------------------
// Tools
// ---------------------------------------------------------------------------

/** One hit reported by search_document: 1-based line number, the line itself, and its surrounding lines. */
type LineMatch = { line: number; text: string; context: string[] };

/** Compile a search pattern: "/body/flags" becomes that regex, anything else is matched literally. */
function compileSearchPattern(pattern: string): RegExp {
  const reMatch = pattern.match(/^\/(.+)\/([gimsuy]*)$/);
  if (reMatch) {
    // No explicit flags: fall back to global + case-insensitive.
    return new RegExp(reMatch[1], reMatch[2] || "gi");
  }
  // Escape every regex metacharacter so the text is matched verbatim.
  return new RegExp(pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "gi");
}

/** Scan lines in order and collect up to `maxMatches` matching lines with `contextLines` of context on each side. */
function collectLineMatches(
  lines: readonly string[],
  regex: RegExp,
  contextLines: number,
  maxMatches: number
): LineMatch[] {
  const matches: LineMatch[] = [];
  for (let i = 0; i < lines.length && matches.length < maxMatches; i++) {
    // With the g or y flag, test() resumes from lastIndex; reset it so every
    // line is examined from its first character.
    regex.lastIndex = 0;
    if (!regex.test(lines[i])) continue;
    const start = Math.max(0, i - contextLines);
    const end = Math.min(lines.length - 1, i + contextLines);
    matches.push({
      line: i + 1,
      text: lines[i],
      context: lines.slice(start, end + 1),
    });
  }
  return matches;
}

/**
 * Tools provider for the document parser plugin.
 *
 * Awaits `detectRagPeer`, reads the plugin configuration once, and returns two
 * tools:
 *   - parse_document:  extract (optionally truncated) text from a file.
 *   - search_document: line-based keyword/regex search with context lines.
 *
 * Both implementations are wrapped in `safe_impl`, so failures are returned to
 * the caller as a JSON error payload rather than thrown.
 */
export const toolsProvider: ToolsProvider = async (ctl) => {
  // NOTE(review): the double cast bypasses type checking of what detectRagPeer
  // expects — confirm against its signature in ./peers.
  await detectRagPeer(ctl as unknown as { client: any });
  const cfg = ctl.getPluginConfig(pluginConfigSchematics);

  return [
    tool({
      name: "parse_document",
      description: text`
        Extract text content from a local file.

        format: "auto" (detect from extension), "pdf", "docx", "spreadsheet", "txt"

        Returns plain text. Spreadsheets return one Markdown table per sheet.
        Set sheet_name to target a single sheet (spreadsheet only).
        Set max_chars to truncate output (default 80000 — roughly 60 pages).
      `,
      parameters: {
        path: z.string().describe("Relative (within workspace) or absolute file path"),
        format: z.enum(["auto", "pdf", "docx", "spreadsheet", "txt"]).default("auto"),
        sheet_name: z.string().default("").describe("Target sheet name (spreadsheets only; blank = all sheets)"),
        max_chars: z.coerce.number().int().min(1000).max(500000).default(80000),
      },
      implementation: safe_impl("parse_document", async ({ path, format, sheet_name, max_chars }, ctx) => {
        ctx.status(`Resolving ${path}`);
        const fullPath = await resolvePath(path, cfg.get("workspacePath"), cfg.get("maxFileSizeMb"));
        const ext = extname(fullPath).toLowerCase();
        const detected = format === "auto" ? detectFormat(ext) : format;

        ctx.status(`Parsing as ${detected}`);
        let output = await extractText(fullPath, detected, sheet_name);

        if (output.length > max_chars) {
          // The notice reports the pre-truncation length; `chars` below counts
          // the truncated text plus this notice.
          output = output.slice(0, max_chars) + `\n\n[truncated — ${output.length} total chars, showing first ${max_chars}]`;
        }

        return json({ path: fullPath, format: detected, chars: output.length, content: output });
      }),
    }),

    tool({
      name: "search_document",
      description: text`
        Search for a keyword or regex pattern inside a document (PDF/DOCX/spreadsheet/txt).
        Returns matching lines with surrounding context.

        pattern: literal string or JavaScript regex (e.g. "/\\d{4}-\\d{2}-\\d{2}/i")
        context_lines: number of lines before/after each match to include (default 2)
        max_matches: stop after this many matches (default 50)
      `,
      parameters: {
        path: z.string().describe("File path (relative to workspace or absolute)"),
        pattern: z.string().describe("Search string or /regex/flags"),
        format: z.enum(["auto", "pdf", "docx", "spreadsheet", "txt"]).default("auto"),
        context_lines: z.coerce.number().int().min(0).max(10).default(2),
        max_matches: z.coerce.number().int().min(1).max(500).default(50),
      },
      implementation: safe_impl("search_document", async ({ path, pattern, format, context_lines, max_matches }, ctx) => {
        ctx.status(`Loading ${path}`);
        const fullPath = await resolvePath(path, cfg.get("workspacePath"), cfg.get("maxFileSizeMb"));
        const ext = extname(fullPath).toLowerCase();
        const detected = format === "auto" ? detectFormat(ext) : format;

        const rawText = await extractText(fullPath, detected, "");
        // Accept both LF and CRLF line endings; a leftover "\r" would leak into
        // the results and stop "$"-anchored patterns from ever matching.
        const lines = rawText.split(/\r?\n/);

        // An invalid regex throws here and is reported by safe_impl.
        const regex = compileSearchPattern(pattern);

        ctx.status(`Scanning ${lines.length} lines for "${pattern}"`);

        const matches = collectLineMatches(lines, regex, context_lines, max_matches);

        return json({ path: fullPath, pattern, total_lines: lines.length, matches_found: matches.length, matches });
      }),
    }),
  ];
};
document-parser

document-parser