// toolsProvider.js (forked from altra/ai-lab)
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.toolsProvider = void 0;
const sdk_1 = require("@lmstudio/sdk");
const zod_1 = require("zod");
const config_1 = require("./config");
const db_1 = require("./db");
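// Serialize a value as pretty-printed JSON; this is the return format for every tool below.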
function json(obj) {
return JSON.stringify(obj, null, 2);
}
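// Wrap a tool implementation so any thrown error is returned to the model as a structured
// JSON payload ({ tool_error, tool, error, hint }) instead of failing the tool call.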
function safe_impl(name, fn) {
return async (params) => {
try {
return await fn(params);
}
catch (err) {
const msg = err instanceof Error ? err.message : String(err);
return JSON.stringify({
tool_error: true,
tool: name,
error: msg,
hint: "Read the error above, fix the parameter causing the issue, and retry the tool call.",
}, null, 2);
}
};
}
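// Naive positional line diff: lines are compared index-by-index (no LCS alignment).
// Output lines are prefixed with "- " (removed), "+ " (added), or a single space (unchanged).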
function diffStrings(a, b) {
const aLines = a.split("\n");
const bLines = b.split("\n");
const result = [];
const max = Math.max(aLines.length, bLines.length);
for (let i = 0; i < max; i++) {
const la = aLines[i];
const lb = bLines[i];
if (la === undefined) {
result.push(`+ ${lb}`);
}
else if (lb === undefined) {
result.push(`- ${la}`);
}
else if (la !== lb) {
result.push(`- ${la}`, `+ ${lb}`);
}
else {
result.push(` ${la}`);
}
}
return result;
}
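// Collect the unique {{variable}} names appearing in a template string.
// e.g. extractVariables("Summarize {{text}} in {{language}}") -> ["text", "language"]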
function extractVariables(template) {
const matches = template.match(/\{\{(\w+)\}\}/g) ?? [];
return [...new Set(matches.map((m) => m.slice(2, -2)))];
}
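// Substitute {{variable}} placeholders with the provided values; placeholders with no
// matching key are left intact so unfilled variables stay visible in the rendered output.
// e.g. applyTemplate("Hi {{name}}, topic: {{topic}}", { name: "Ada" }) -> "Hi Ada, topic: {{topic}}"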
function applyTemplate(template, vars) {
return template.replace(/\{\{(\w+)\}\}/g, (_, key) => vars[key] ?? `{{${key}}}`);
}
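// Tools provider entry point: reads the plugin config, resolves the database lazily
// (db() re-reads the configured dataPath on each call), and returns the prompt
// management and evaluation tools.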
const toolsProvider = async (ctl) => {
const cfg = ctl.getPluginConfig(config_1.pluginConfigSchematics);
const db = () => (0, db_1.getDb)((0, db_1.getDataDir)(cfg.get("dataPath")));
const tools = [
// =========================================================================
// PROMPT MANAGEMENT
// =========================================================================
(0, sdk_1.tool)({
name: "save_prompt",
description: (0, sdk_1.text) `
Save a new version of a prompt by name.
Each call creates a new version — previous versions are never overwritten.
Returns the prompt ID and version number.
Variables in the template use {{variable_name}} syntax and are auto-detected.
Call whenever a prompt is refined, tuned, or updated.
`,
parameters: {
name: zod_1.z.string().min(2).describe("Prompt name — unique identifier, use snake_case"),
content: zod_1.z.string().min(1).describe("Full prompt content. Use {{variable}} for template variables."),
description: zod_1.z.string().default("").describe("What this prompt does and when to use it"),
tags: zod_1.z.array(zod_1.z.string()).default([]).describe("Tags for filtering — task type, model family, etc."),
},
implementation: safe_impl("save_prompt", async ({ name, content, description, tags }) => {
const variables = extractVariables(content);
const prompt = (0, db_1.savePrompt)(db(), name, content, description, tags, variables);
return json({
success: true,
name: prompt.name,
version: prompt.version,
id: prompt.id,
variables,
description,
});
}),
}),
(0, sdk_1.tool)({
name: "get_prompt",
description: (0, sdk_1.text) `
Get a prompt by name, optionally at a specific version.
Returns full content, description, variables, and version metadata.
If no version is specified, the latest version is returned.
Call before diff_prompts or run_prompt_template.
`,
parameters: {
name: zod_1.z.string().describe("Prompt name"),
version: zod_1.z.coerce.number().int().optional().describe("Specific version number. Omit for latest."),
},
implementation: safe_impl("get_prompt", async ({ name, version }) => {
const prompt = (0, db_1.getPrompt)(db(), name, version);
return json({
...prompt,
tags: JSON.parse(prompt.tags),
variables: JSON.parse(prompt.variables),
});
}),
}),
(0, sdk_1.tool)({
name: "list_prompts",
description: (0, sdk_1.text) `
List all prompts with their latest version number and description.
Returns prompt name, latest version, description, and tags.
Call at session start to see what prompts exist before creating duplicates.
`,
parameters: {},
implementation: safe_impl("list_prompts", async () => {
const prompts = (0, db_1.listPromptNames)(db());
return json({
total: prompts.length,
prompts: prompts.map((p) => ({
name: p.name,
latestVersion: p.latestVersion,
description: p.description,
tags: JSON.parse(p.tags),
})),
});
}),
}),
(0, sdk_1.tool)({
name: "list_prompt_versions",
description: (0, sdk_1.text) `
List all saved versions of a specific prompt, newest first.
Returns version number, description, and createdAt for each.
Call before diff_prompts to see which versions to compare.
`,
parameters: {
name: zod_1.z.string().describe("Prompt name"),
},
implementation: safe_impl("list_prompt_versions", async ({ name }) => {
const versions = (0, db_1.listPromptVersions)(db(), name);
if (versions.length === 0)
throw new Error(`Prompt '${name}' not found.`);
return json({
name,
versionCount: versions.length,
versions: versions.map((v) => ({
version: v.version,
description: v.description,
variables: JSON.parse(v.variables),
createdAt: v.createdAt,
})),
});
}),
}),
(0, sdk_1.tool)({
name: "diff_prompts",
description: (0, sdk_1.text) `
Show a line-by-line diff between two versions of a prompt.
Returns changed, added, and removed lines.
Use to understand what changed between iterations and whether a change helped.
Requires at least 2 saved versions of the prompt.
`,
parameters: {
name: zod_1.z.string().describe("Prompt name"),
versionA: zod_1.z.coerce.number().int().describe("Older version number"),
versionB: zod_1.z.coerce.number().int().describe("Newer version number"),
},
implementation: safe_impl("diff_prompts", async ({ name, versionA, versionB }) => {
const a = (0, db_1.getPrompt)(db(), name, versionA);
const b = (0, db_1.getPrompt)(db(), name, versionB);
const diff = diffStrings(a.content, b.content);
const additions = diff.filter((l) => l.startsWith("+ ")).length;
const removals = diff.filter((l) => l.startsWith("- ")).length;
const unchanged = diff.filter((l) => l.startsWith(" ")).length;
return json({
name,
versionA,
versionB,
summary: { additions, removals, unchanged },
diff,
});
}),
}),
(0, sdk_1.tool)({
name: "run_prompt_template",
description: (0, sdk_1.text) `
Fill a prompt template's {{variable}} placeholders with provided values
and return the rendered prompt. No side effects — does not run a model.
Use to preview the final prompt before passing it to an LLM manually.
Returns the filled prompt text and any unfilled variables.
`,
parameters: {
name: zod_1.z.string().describe("Prompt name"),
version: zod_1.z.coerce.number().int().optional().describe("Version to use. Omit for latest."),
variables: zod_1.z.record(zod_1.z.string(), zod_1.z.string()).default({})
.describe("Variable values as key-value pairs matching {{variable}} names in the template"),
},
implementation: safe_impl("run_prompt_template", async ({ name, version, variables }) => {
const prompt = (0, db_1.getPrompt)(db(), name, version);
const allVars = JSON.parse(prompt.variables);
const filled = applyTemplate(prompt.content, variables);
const unfilled = allVars.filter((v) => !(v in variables));
return json({
name: prompt.name,
version: prompt.version,
renderedPrompt: filled,
variablesProvided: Object.keys(variables),
variablesUnfilled: unfilled,
warning: unfilled.length > 0 ? `${unfilled.length} variable(s) not filled: ${unfilled.join(", ")}` : null,
});
}),
}),
// =========================================================================
// EVAL DATASETS
// =========================================================================
(0, sdk_1.tool)({
name: "create_eval_dataset",
description: (0, sdk_1.text) `
Create a named eval dataset for organizing test cases.
A dataset groups cases that test the same task or capability.
Returns the dataset ID.
Call once per task type — e.g. "summarization", "code_review", "classification".
`,
parameters: {
name: zod_1.z.string().min(2).describe("Dataset name — unique, snake_case"),
description: zod_1.z.string().default("").describe("What capability or task this dataset tests"),
tags: zod_1.z.array(zod_1.z.string()).default([]).describe("Tags for filtering"),
},
implementation: safe_impl("create_eval_dataset", async ({ name, description, tags }) => {
const dataset = (0, db_1.createDataset)(db(), name, description, tags);
return json({
success: true,
id: dataset.id,
name: dataset.name,
description,
});
}),
}),
(0, sdk_1.tool)({
name: "add_eval_case",
description: (0, sdk_1.text) `
Add a test case to an eval dataset.
Each case has an input (what you send to the model) and optional expected output.
Returns the case ID.
Build datasets with at least 5–10 cases before running comparisons.
`,
parameters: {
datasetName: zod_1.z.string().describe("Dataset to add the case to"),
input: zod_1.z.string().min(1).describe("Input to the model — the exact prompt or message"),
expectedOutput: zod_1.z.string().default("").describe("Reference output (optional) — used for scoring"),
tags: zod_1.z.array(zod_1.z.string()).default([]).describe("Tags — difficulty, topic, edge case type"),
notes: zod_1.z.string().default("").describe("Notes about this case — why it's interesting, what it tests"),
},
implementation: safe_impl("add_eval_case", async ({ datasetName, input, expectedOutput, tags, notes }) => {
const dataset = (0, db_1.getDataset)(db(), datasetName);
const evalCase = (0, db_1.addEvalCase)(db(), dataset.id, input, expectedOutput, tags, notes);
return json({
success: true,
caseId: evalCase.id,
datasetId: dataset.id,
datasetName,
});
}),
}),
(0, sdk_1.tool)({
name: "list_eval_datasets",
description: (0, sdk_1.text) `
List all eval datasets with case counts.
Call at session start to see what's already been set up.
`,
parameters: {},
implementation: safe_impl("list_eval_datasets", async () => {
const datasets = (0, db_1.listDatasets)(db());
return json({
total: datasets.length,
datasets: datasets.map((d) => ({
id: d.id,
name: d.name,
description: d.description,
caseCount: d.caseCount,
tags: JSON.parse(d.tags),
})),
});
}),
}),
(0, sdk_1.tool)({
name: "get_eval_dataset",
description: (0, sdk_1.text) `
Get a dataset with all its test cases.
Returns full input, expected output, and notes for each case.
Call before log_model_result to get the case IDs you need.
`,
parameters: {
name: zod_1.z.string().describe("Dataset name"),
},
implementation: safe_impl("get_eval_dataset", async ({ name }) => {
const dataset = (0, db_1.getDataset)(db(), name);
const cases = (0, db_1.getDatasetCases)(db(), dataset.id);
return json({
dataset: { ...dataset, tags: JSON.parse(dataset.tags) },
caseCount: cases.length,
cases: cases.map((c) => ({
id: c.id,
input: c.input,
expectedOutput: c.expectedOutput,
tags: JSON.parse(c.tags),
notes: c.notes,
})),
});
}),
}),
// =========================================================================
// MODEL EVALUATION
// =========================================================================
(0, sdk_1.tool)({
name: "log_model_result",
description: (0, sdk_1.text) `
Log a model's output for a specific eval case.
Records which model, which prompt (and version), what it output, and an optional score.
Score is 0–1 (0=completely wrong, 1=perfect). Leave null if not yet scored.
Call once per model × prompt × case combination.
Do NOT call for every token — log the final output for a case.
`,
parameters: {
caseId: zod_1.z.coerce.number().int().describe("Eval case ID"),
model: zod_1.z.string().describe("Model identifier — e.g. 'llama-3-8b', 'mistral-7b', 'qwen2.5-14b'"),
output: zod_1.z.string().describe("The model's actual output for this case"),
promptName: zod_1.z.string().default("").describe("Prompt name used (empty if no versioned prompt was used)"),
promptVersion: zod_1.z.coerce.number().int().default(0).describe("Prompt version used (0 if unversioned)"),
score: zod_1.z.coerce.number().min(0).max(1).nullable().default(null)
.describe("Quality score 0–1. Null if not scored yet."),
scoreLabel: zod_1.z.string().default("").describe("Label for score basis: 'exact_match', 'rubric', 'human', 'llm_judge'"),
notes: zod_1.z.string().default("").describe("Observations — what was good, what was wrong, failure mode"),
},
implementation: safe_impl("log_model_result", async ({ caseId, model, output, promptName, promptVersion, score, scoreLabel, notes }) => {
const result = (0, db_1.logModelResult)(db(), caseId, model, promptName, promptVersion, output, score, scoreLabel, notes);
return json({ success: true, resultId: result.id, caseId, model, score });
}),
}),
(0, sdk_1.tool)({
name: "compare_models",
description: (0, sdk_1.text) `
Compare model results across all cases in a dataset.
Returns: per-model average score, case count, and a side-by-side view of each case.
Use to identify which model + prompt combination performs best on a task.
Requires log_model_result to have been called for at least 2 models.
`,
parameters: {
datasetName: zod_1.z.string().describe("Dataset to compare results for"),
},
implementation: safe_impl("compare_models", async ({ datasetName }) => {
const dataset = (0, db_1.getDataset)(db(), datasetName);
const casesWithResults = (0, db_1.getResultsForDataset)(db(), dataset.id);
const stats = (0, db_1.getModelStats)(db(), dataset.id);
const models = [...new Set(stats.map((s) => s.model))]; // distinct models, ignoring prompt variants, for the modelsCompared count
return json({
datasetName,
caseCount: casesWithResults.length,
leaderboard: stats.map((s) => ({
model: s.model,
prompt: s.promptName ? `${s.promptName}@v${s.promptVer}` : "(none)",
caseCount: s.caseCount,
avgScore: s.avgScore !== null ? Math.round(s.avgScore * 1000) / 1000 : null,
scoredCount: s.scoredCount,
})),
modelsCompared: models.length,
caseComparison: casesWithResults.map((c) => ({
caseId: c.id,
input: c.input.slice(0, 200) + (c.input.length > 200 ? "…" : ""),
expectedOutput: c.expectedOutput.slice(0, 150),
results: c.results.map((r) => ({
model: r.model,
prompt: r.promptName ? `${r.promptName}@v${r.promptVer}` : "(none)",
output: r.output.slice(0, 300) + (r.output.length > 300 ? "…" : ""),
score: r.score,
scoreLabel: r.scoreLabel,
notes: r.notes,
})),
})),
});
}),
}),
(0, sdk_1.tool)({
name: "generate_eval_report",
description: (0, sdk_1.text) `
Scaffold a structured evaluation report for a dataset.
Loads all results and returns a prompt payload for the LLM to write
a comprehensive analysis: what works, what fails, patterns, and recommendations.
No side effects — returns the report payload.
Call after compare_models when you need a narrative summary of findings.
`,
parameters: {
datasetName: zod_1.z.string().describe("Dataset to generate a report for"),
focusQuestion: zod_1.z.string().default("").describe("Optional: a specific question the report should answer"),
},
implementation: safe_impl("generate_eval_report", async ({ datasetName, focusQuestion }) => {
const dataset = (0, db_1.getDataset)(db(), datasetName);
const casesWithResults = (0, db_1.getResultsForDataset)(db(), dataset.id);
const stats = (0, db_1.getModelStats)(db(), dataset.id);
if (stats.length === 0) {
return json({ action: "no_results", message: "No model results logged yet. Call log_model_result first." });
}
return json({
action: "generate_report",
datasetName,
description: dataset.description,
focusQuestion: focusQuestion || null,
leaderboard: stats,
caseCount: casesWithResults.length,
cases: casesWithResults.map((c) => ({
id: c.id,
input: c.input,
expectedOutput: c.expectedOutput,
results: c.results.map((r) => ({
model: r.model,
prompt: r.promptName ? `${r.promptName}@v${r.promptVer}` : "(none)",
output: r.output,
score: r.score,
scoreLabel: r.scoreLabel,
notes: r.notes,
})),
})),
instructions: [
focusQuestion
? `Answer this question using the eval data above: "${focusQuestion}"`
: `Write a structured evaluation report for the "${datasetName}" dataset.`,
"Structure:",
"1. LEADERBOARD SUMMARY — rank models by avg score with brief explanation of the gap",
"2. FAILURE PATTERNS — what kinds of cases does each model struggle with? Be specific.",
"3. SUCCESS PATTERNS — what does each model do well? What cases does it consistently handle?",
"4. PROMPT IMPACT — if multiple prompt versions were tested, what changed and did it help?",
"5. EDGE CASES — highlight the most interesting or surprising individual case results",
"6. RECOMMENDATION — which model + prompt to use for this task and why",
"7. NEXT STEPS — what additional cases or models should be tested?",
"Cite specific cases by caseId. Do not generalize beyond what the data shows.",
].join("\n"),
});
}),
}),
];
return tools;
};
exports.toolsProvider = toolsProvider;