Project Files
src / tools / analysis.ts
import { tool, type Tool } from "@lmstudio/sdk";
import { z } from "zod";
import * as fs from "fs";
import * as path from "path";
import ExcelJS from "exceljs";
import * as XLSX from "xlsx";
import { loadWorkbook, saveWorkbook, getSheet, worksheetToJson, jsonToWorksheet, preview, clearWorksheet } from "../utils/workbook";
import { checkReadable, ensureDir, resolvePath } from "../utils/config";
export function getAnalysisTools(workingDir: string): Tool[] {
return [
// ── 29. excel_summarize ──────────────────────────────────────────────────
tool({
name: "excel_summarize",
description: "Full statistical summary per column: min/max/avg/sum/stddev, unique count, null count, outliers (±3σ), duplicate rows, empty columns.",
parameters: {
filePath: z.string(),
sheet: z.string().optional(),
},
implementation: async ({ filePath, sheet }) => {
filePath = resolvePath(filePath, workingDir);
const err = checkReadable(filePath);
if (err) return `Error: ${err}`;
try {
const wb = await loadWorkbook(filePath);
const ws = getSheet(wb, sheet);
if (typeof ws === "string") return `Error: ${ws}`;
const rows = worksheetToJson(ws);
if (!rows.length) return JSON.stringify({ file: filePath, empty: true });
const dups = rows.length - new Set(rows.map(r => JSON.stringify(r))).size;
const headers = Object.keys(rows[0]);
const empty: string[] = [];
const stats: Record<string, unknown> = {};
for (const col of headers) {
const vals = rows.map(r => r[col]).filter(v => v !== null && v !== "");
if (!vals.length) { empty.push(col); continue; }
const nums = vals.filter(v => typeof v === "number") as number[];
if (nums.length > 0) {
const sum = nums.reduce((a, b) => a + b, 0);
const avg = sum / nums.length;
const stddev = Math.sqrt(nums.reduce((a, b) => a + (b - avg) ** 2, 0) / nums.length);
const outliers = nums.filter(n => Math.abs(n - avg) > 3 * stddev);
stats[col] = {
type: "numeric", count: nums.length, nulls: rows.length - nums.length,
min: Math.min(...nums), max: Math.max(...nums),
avg: +avg.toFixed(4), sum: +sum.toFixed(4), stddev: +stddev.toFixed(4),
outliers: outliers.length ? outliers.slice(0, 10) : "none",
};
} else {
const unique = new Set(vals.map(String));
stats[col] = {
type: "text", count: vals.length, nulls: rows.length - vals.length,
unique_values: unique.size, sample: [...unique].slice(0, 5),
};
}
}
return JSON.stringify({
file: filePath, sheet: ws.name,
total_rows: rows.length, total_columns: headers.length,
duplicate_rows: dups, empty_columns: empty.length ? empty : "none",
column_stats: stats,
}, null, 2);
} catch (e) { return `Error: ${e}`; }
},
}),
// ── 30. excel_pivot ──────────────────────────────────────────────────────
tool({
name: "excel_pivot",
description: "Create a pivot table: group by one column, aggregate one or more numeric columns (sum/avg/count/min/max). Saves result to a new sheet.",
parameters: {
filePath: z.string(),
groupBy: z.string().describe("Column to group by"),
valueCols: z.array(z.object({
column: z.string(),
aggregation: z.enum(["sum","avg","count","min","max"]),
})).describe("Columns and aggregation functions"),
outputSheet: z.string().optional().describe("Sheet name for result (default: Pivot)"),
sheet: z.string().optional(),
},
implementation: async ({ filePath, groupBy, valueCols, outputSheet, sheet }) => {
filePath = resolvePath(filePath, workingDir);
const err = checkReadable(filePath);
if (err) return `Error: ${err}`;
try {
const wb = await loadWorkbook(filePath);
const ws = getSheet(wb, sheet);
if (typeof ws === "string") return `Error: ${ws}`;
const rows = worksheetToJson(ws);
// Build case-insensitive column lookup to tolerate header name mismatches
const headers0 = rows.length > 0 ? Object.keys(rows[0]) : [];
const colLookup = (name: string): string => {
const exact = headers0.find(h => h === name);
if (exact) return exact;
const ci = headers0.find(h => h.toLowerCase() === name.toLowerCase().trim());
return ci ?? name;
};
const resolvedGroupBy = colLookup(groupBy);
const resolvedCols = valueCols.map(v => ({ ...v, column: colLookup(v.column) }));
// Robust numeric parse: handle "1 234,56" (FR) and "1,234.56" (EN)
const toNum = (v: unknown): number | null => {
if (typeof v === "number") return isNaN(v) ? null : v;
if (v === null || v === undefined || v === "") return null;
const s = String(v).trim()
.replace(/\s/g, "") // remove spaces (FR thousands)
.replace(/,(?=\d{3})/g, "") // remove EN thousands comma: 1,234 → 1234
.replace(",", "."); // FR decimal comma → dot
const n = parseFloat(s);
return isNaN(n) ? null : n;
};
const groups = new Map<string, Record<string, number[]>>();
for (const row of rows) {
const key = String(row[resolvedGroupBy] ?? "(blank)");
if (!groups.has(key)) groups.set(key, Object.fromEntries(resolvedCols.map(v => [v.column, []])));
for (const { column } of resolvedCols) {
const n = toNum(row[column]);
if (n !== null) groups.get(key)![column].push(n);
}
}
const agg = (nums: number[], fn: string): number => {
if (!nums.length) return 0;
switch (fn) {
case "sum": return nums.reduce((a,b) => a+b, 0);
case "avg": return nums.reduce((a,b) => a+b, 0) / nums.length;
case "count": return nums.length;
case "min": return Math.min(...nums);
case "max": return Math.max(...nums);
default: return 0;
}
};
const pivotRows = [...groups.entries()].map(([key, colNums]) => {
const obj: Record<string, unknown> = { [resolvedGroupBy]: key };
for (const { column, aggregation } of resolvedCols)
obj[`${aggregation}(${column})`] = +agg(colNums[column], aggregation).toFixed(4);
return obj;
});
const pivotName = outputSheet ?? "Pivot";
const pivotWs = wb.getWorksheet(pivotName) ?? wb.addWorksheet(pivotName);
clearWorksheet(pivotWs);
jsonToWorksheet(pivotWs, pivotRows);
await saveWorkbook(wb, filePath);
return JSON.stringify({ success: true, file: filePath, pivot_sheet: pivotName,
group_by: groupBy, groups: groups.size, preview: preview(pivotRows, 10) }, null, 2);
} catch (e) { return `Error: ${e}`; }
},
}),
// ── 31. excel_diff ───────────────────────────────────────────────────────
tool({
name: "excel_diff",
description: "Compare two files and return added rows, removed rows, and modified rows (with before/after values per changed cell).",
parameters: {
fileA: z.string().describe("Original file"),
fileB: z.string().describe("Modified file"),
keyColumn: z.string().describe("Column that uniquely identifies each row (primary key)"),
sheetA: z.string().optional(), sheetB: z.string().optional(),
exportDiff: z.string().optional().describe("Optional .xlsx path to export a color-coded diff file"),
},
implementation: async ({ fileA, fileB, keyColumn, sheetA, sheetB, exportDiff }) => {
fileA = resolvePath(fileA, workingDir);
fileB = resolvePath(fileB, workingDir);
if (exportDiff) exportDiff = resolvePath(exportDiff, workingDir);
const e1 = checkReadable(fileA); if (e1) return `Error: ${e1}`;
const e2 = checkReadable(fileB); if (e2) return `Error: ${e2}`;
try {
const wbA = await loadWorkbook(fileA); const wbB = await loadWorkbook(fileB);
const wsA = getSheet(wbA, sheetA); if (typeof wsA === "string") return `Error: ${wsA}`;
const wsB = getSheet(wbB, sheetB); if (typeof wsB === "string") return `Error: ${wsB}`;
const rowsA = worksheetToJson(wsA); const rowsB = worksheetToJson(wsB);
const mapA = new Map(rowsA.map(r => [String(r[keyColumn]), r]));
const mapB = new Map(rowsB.map(r => [String(r[keyColumn]), r]));
const added = rowsB.filter(r => !mapA.has(String(r[keyColumn])));
const removed = rowsA.filter(r => !mapB.has(String(r[keyColumn])));
const modified: { key: string; changes: Record<string, {from:unknown;to:unknown}> }[] = [];
for (const [key, rowB] of mapB) {
const rowA = mapA.get(key); if (!rowA) continue;
const changes: Record<string, {from:unknown;to:unknown}> = {};
for (const col of new Set([...Object.keys(rowA),...Object.keys(rowB)]))
if (String(rowA[col]??"") !== String(rowB[col]??"")) changes[col] = { from: rowA[col], to: rowB[col] };
if (Object.keys(changes).length) modified.push({ key, changes });
}
// Optional color-coded export
if (exportDiff) {
ensureDir(exportDiff);
const ewb = new ExcelJS.Workbook();
const addSheet = (name: string, data: Record<string, unknown>[], color: string) => {
if (!data.length) return;
const ws = ewb.addWorksheet(name);
jsonToWorksheet(ws, data);
ws.eachRow((row, i) => { if (i > 1) row.fill = { type:"pattern", pattern:"solid", fgColor:{argb:color} }; });
};
addSheet("Added", added, "FFC6EFCE");
addSheet("Removed", removed, "FFFFC7CE");
if (modified.length) {
const modRows = modified.map(m => ({ key: m.key, changes: JSON.stringify(m.changes) }));
addSheet("Modified", modRows, "FFFFFFCC");
}
await saveWorkbook(ewb, exportDiff);
}
return JSON.stringify({ fileA, fileB, key_column: keyColumn,
summary: { added: added.length, removed: removed.length, modified: modified.length },
added: added.slice(0,20), removed: removed.slice(0,20), modified: modified.slice(0,20),
...(exportDiff ? { diff_file: exportDiff } : {}),
}, null, 2);
} catch (e) { return `Error: ${e}`; }
},
}),
// ── 32. excel_validate_data ──────────────────────────────────────────────
tool({
name: "excel_validate_data",
description: "Check data quality rules: required columns not null, values in allowed list, numeric range, regex pattern, unique constraint. Returns all violations.",
parameters: {
filePath: z.string(),
rules: z.array(z.object({
column: z.string(),
type: z.enum(["required","allowedValues","numericRange","regex","unique"]),
allowedValues: z.array(z.string()).optional(),
min: z.number().optional(), max: z.number().optional(),
pattern: z.string().optional().describe("Regex pattern"),
})),
sheet: z.string().optional(),
},
implementation: async ({ filePath, rules, sheet }) => {
filePath = resolvePath(filePath, workingDir);
const err = checkReadable(filePath);
if (err) return `Error: ${err}`;
try {
const wb = await loadWorkbook(filePath);
const ws = getSheet(wb, sheet);
if (typeof ws === "string") return `Error: ${ws}`;
const rows = worksheetToJson(ws);
const violations: { row: number; column: string; rule: string; value: unknown }[] = [];
const seenValues: Record<string, Set<string>> = {};
rows.forEach((row, i) => {
for (const rule of rules) {
const val = row[rule.column];
const v = String(val ?? "");
const rowNum = i + 2; // 1-based + header
switch (rule.type) {
case "required":
if (val === null || val === undefined || v === "") violations.push({ row: rowNum, column: rule.column, rule: "required", value: val }); break;
case "allowedValues":
if (v && rule.allowedValues && !rule.allowedValues.includes(v)) violations.push({ row: rowNum, column: rule.column, rule: `allowedValues: ${rule.allowedValues.join(",")}`, value: val }); break;
case "numericRange":
if (v) { const n = Number(val); if (isNaN(n) || (rule.min !== undefined && n < rule.min) || (rule.max !== undefined && n > rule.max)) violations.push({ row: rowNum, column: rule.column, rule: `numericRange [${rule.min}–${rule.max}]`, value: val }); } break;
case "regex":
if (v && rule.pattern && !new RegExp(rule.pattern).test(v)) violations.push({ row: rowNum, column: rule.column, rule: `regex: ${rule.pattern}`, value: val }); break;
case "unique":
if (!seenValues[rule.column]) seenValues[rule.column] = new Set();
if (v && seenValues[rule.column].has(v)) violations.push({ row: rowNum, column: rule.column, rule: "unique", value: val });
seenValues[rule.column].add(v); break;
}
}
});
return JSON.stringify({ file: filePath, sheet: ws.name, rows_checked: rows.length,
violations_found: violations.length, violations: violations.slice(0, 100),
status: violations.length === 0 ? "PASS" : "FAIL" }, null, 2);
} catch (e) { return `Error: ${e}`; }
},
}),
// ── 33. excel_query ──────────────────────────────────────────────────────
tool({
name: "excel_query",
description: "Run a SQL-like query on a sheet. Syntax: SELECT col1, col2 FROM sheetName WHERE col3 > 100 ORDER BY col1 DESC LIMIT 50. Use * to select all columns.",
parameters: {
filePath: z.string(),
query: z.string().describe("SQL-like query string"),
},
implementation: async ({ filePath, query }) => {
filePath = resolvePath(filePath, workingDir);
const err = checkReadable(filePath);
if (err) return `Error: ${err}`;
try {
const sel = query.match(/SELECT\s+(.*?)\s+FROM\s+["']?([^"'\s]+)["']?/i);
if (!sel) return "Error: Query must start with SELECT ... FROM ...";
const cols = sel[1].trim() === "*" ? null : sel[1].split(",").map(s => s.trim());
const sheetName = sel[2];
const whereM = query.match(/WHERE\s+(.*?)(?:\s+ORDER\s+BY|\s+LIMIT|$)/i);
const orderM = query.match(/ORDER\s+BY\s+(.*?)(?:\s+LIMIT|$)/i);
const limitM = query.match(/LIMIT\s+(\d+)/i);
const wb = await loadWorkbook(filePath);
const ws = getSheet(wb, sheetName);
if (typeof ws === "string") return `Error: ${ws}`;
let rows = worksheetToJson(ws);
// WHERE
if (whereM) {
const cond = whereM[1].trim();
rows = rows.filter(row => {
try {
// Safe eval: replace column names with row values
const expr = cond.replace(/(\w+)/g, (m) => row[m] !== undefined ? JSON.stringify(row[m]) : m);
// eslint-disable-next-line no-new-func
return new Function(`return (${expr})`)();
} catch { return false; }
});
}
// ORDER BY
if (orderM) {
const parts = orderM[1].split(",").map(s => {
const t = s.trim().split(/\s+/);
return { col: t[0], dir: (t[1]?.toUpperCase() === "DESC" ? -1 : 1) };
});
rows.sort((a, b) => {
for (const { col, dir } of parts) {
const av = a[col]; const bv = b[col];
if (av === bv) continue;
if (typeof av === "number" && typeof bv === "number") return (av - bv) * dir;
return String(av??"").localeCompare(String(bv??"")) * dir;
}
return 0;
});
}
// LIMIT
if (limitM) rows = rows.slice(0, parseInt(limitM[1]));
// SELECT columns
const result = cols ? rows.map(r => Object.fromEntries(cols.map(c => [c, r[c]]))) : rows;
return JSON.stringify({ query, sheet: ws.name, total_matched: rows.length,
columns: result[0] ? Object.keys(result[0]) : [], data: result }, null, 2);
} catch (e) { return `Error: ${e}`; }
},
}),
// ── 34. excel_split_sheet ────────────────────────────────────────────────
tool({
name: "excel_split_sheet",
description: "Split a sheet into multiple files (one per unique value of a column). E.g. split a sales sheet by Region → one file per region.",
parameters: {
filePath: z.string(),
splitColumn: z.string().describe("Column whose unique values become separate files"),
outputDir: z.string().describe("Directory to write output files into"),
sheet: z.string().optional(),
},
implementation: async ({ filePath, splitColumn, outputDir, sheet }) => {
filePath = resolvePath(filePath, workingDir);
const err = checkReadable(filePath);
if (err) return `Error: ${err}`;
try {
const wb = await loadWorkbook(filePath);
const ws = getSheet(wb, sheet);
if (typeof ws === "string") return `Error: ${ws}`;
const rows = worksheetToJson(ws);
const groups = new Map<string, Record<string, unknown>[]>();
for (const row of rows) {
const key = String(row[splitColumn] ?? "blank");
if (!groups.has(key)) groups.set(key, []);
groups.get(key)!.push(row);
}
ensureDir(path.join(outputDir, "placeholder"));
const files: { value: string; file: string; rows: number }[] = [];
for (const [key, groupRows] of groups) {
const safe = key.replace(/[^a-zA-Z0-9_\-]/g, "_");
const outPath = path.join(outputDir, `${safe}.xlsx`);
const outWb = new ExcelJS.Workbook();
const outWs = outWb.addWorksheet(key.slice(0, 31));
jsonToWorksheet(outWs, groupRows);
await saveWorkbook(outWb, outPath);
files.push({ value: key, file: outPath, rows: groupRows.length });
}
return JSON.stringify({ success: true, source: filePath, split_by: splitColumn,
files_created: files.length, files }, null, 2);
} catch (e) { return `Error: ${e}`; }
},
}),
// ── 35. excel_info ───────────────────────────────────────────────────────
tool({
name: "excel_info",
description: "Get complete metadata about a file: format, size, sheets, row/column counts, named ranges, whether VBA is present, last modified date.",
parameters: {
filePath: z.string(),
},
implementation: async ({ filePath }) => {
filePath = resolvePath(filePath, workingDir);
const err = checkReadable(filePath);
if (err) return `Error: ${err}`;
try {
const stat = fs.statSync(filePath);
const ext = path.extname(filePath).toLowerCase();
const wb = await loadWorkbook(filePath);
const sheets = wb.worksheets.map(ws => {
const rows = worksheetToJson(ws);
return { name: ws.name, rows: rows.length,
columns: rows[0] ? Object.keys(rows[0]).length : 0,
column_names: rows[0] ? Object.keys(rows[0]) : [],
has_formulas: ws.rowCount > 0,
};
});
// Check VBA (only for xlsm via SheetJS)
let hasVba = false;
if (ext === ".xlsm") {
try {
const sjWb = XLSX.readFile(filePath, { bookVBA: true });
hasVba = !!sjWb.vbaraw;
} catch {}
}
return JSON.stringify({
file: filePath, format: ext, size_bytes: stat.size,
size_kb: +(stat.size / 1024).toFixed(1),
last_modified: stat.mtime.toISOString(),
sheet_count: sheets.length, sheets,
has_vba: hasVba,
total_rows: sheets.reduce((a, s) => a + s.rows, 0),
}, null, 2);
} catch (e) { return `Error: ${e}`; }
},
}),
];
}