// src/toolsProvider.ts
import { text, tool, type Tool, type ToolsProvider } from "@lmstudio/sdk";
import { join } from "path";
import { homedir } from "os";
import { z } from "zod";
import { pluginConfigSchematics } from "./config";
import { openDb, upsertEntity, insertClaim, upsertSource, insertTimelineEvent, insertReport, listReports, getEntityByName, getEntityIdsByNames } from "./db";
import { webSearch, type TimeRange } from "./search";
import { fetchPage, fetchWikipediaSummary } from "./fetch";
import { scoreSource } from "./scorer";
import { checkGoogleFactCheck } from "./factcheck";
function json(obj: unknown): string {
return JSON.stringify(obj, null, 2);
}
function safe_impl<T extends Record<string, unknown>>(
name: string,
fn: (params: T) => Promise<string>,
): (params: T) => Promise<string> {
return async (params: T) => {
try {
return await fn(params);
} catch (err: unknown) {
const msg = err instanceof Error ? err.message : String(err);
return JSON.stringify({
tool_error: true, tool: name, error: msg,
hint: "Read the error, fix the parameter causing the issue, and retry.",
}, null, 2);
}
};
}
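// Illustrative use of safe_impl (the "demo" tool and its params are made up):
// a wrapped implementation never throws; failures come back as JSON the
// calling model can read and retry on.
//
//   const impl = safe_impl("demo", async ({ n }: { n: number }) => json({ doubled: n * 2 }));
//   await impl({ n: 2 });   // pretty-printed '{ "doubled": 4 }'
//   // If the handler throws, the result is { tool_error: true, tool: "demo", error, hint }.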
const _dbCache = new Map<string, ReturnType<typeof openDb>>();
function getDb(path: string): ReturnType<typeof openDb> {
let db = _dbCache.get(path);
if (!db) { db = openDb(path); _dbCache.set(path, db); }
return db;
}
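// Repeated calls with the same path reuse one connection, since the Map above
// keys connections by path. Illustrative, with a made-up path:
//   getDb("/tmp/research.sqlite") === getDb("/tmp/research.sqlite"); // true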
export const toolsProvider: ToolsProvider = async (ctl) => {
const cfg = ctl.getPluginConfig(pluginConfigSchematics);
const dataPath = () => cfg.get("dataPath").trim() || join(homedir(), "research-data");
const searxngUrl = () => cfg.get("searxngUrl").trim() || undefined;
const maxResultsCfg = () => cfg.get("maxResultsPerQuery");
const searchWindow = (): TimeRange | undefined => {
const v = cfg.get("searchRecencyWindow").trim().toLowerCase();
return (["day", "week", "month", "year"].includes(v) ? v : undefined) as TimeRange | undefined;
};
const wikiEnabled = () => cfg.get("enableWikipediaGrounding") === "true";
const factCheckKey = () => cfg.get("googleFactCheckApiKey").trim() || undefined;
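// Note: plan_research, extract_claims, compare_evidence, and generate_report do
// not compute an answer themselves. Each returns a "scaffold" payload whose
// `instructions` field tells the calling model what to produce next, roughly
// (shape illustrative, values elided):
//
//   { "action": "plan_research", "question": "...", "depth": "standard",
//     "instructions": "Break the research question into 5 sub-questions ... Output ONLY valid JSON." }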
const tools: Tool[] = [
tool({
name: "plan_research",
description: text`
Decompose a research question into sub-questions, identify knowns vs. unknowns,
and produce concrete search queries. Returns a scaffold payload — follow the
instructions field to generate the plan JSON.
Call this FIRST on any new research topic before calling search_sources.
Do NOT call for simple factual lookups that need only one search.
`,
parameters: {
question: z.string().describe("The research question to decompose"),
depth: z.enum(["quick", "standard", "deep"]).default("standard")
.describe("quick=3 sub-questions, standard=5, deep=8"),
},
implementation: safe_impl("plan_research", async ({ question, depth }) => {
const counts = { quick: 3, standard: 5, deep: 8 };
const n = counts[depth];
return json({
action: "plan_research",
question,
depth,
instructions: `Break the research question into ${n} specific sub-questions that together would fully answer it. Identify what is already known vs. unknown. Generate 2 concrete search queries per sub-question.
Return a JSON object:
{
"subQuestions": ["...", ...],
"knownFacts": ["..."],
"unknowns": ["..."],
"suggestedSearchQueries": ["...", ...],
"estimatedSources": ${n * 2}
}
Output ONLY valid JSON.`,
});
}),
}),
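// Illustrative search_sources arguments (the query strings are made up;
// maxResults and recencyWindow are optional and fall back to the plugin config):
//   { "queries": ["solid-state battery commercialization 2024", "solid-state battery manufacturers"],
//     "maxResults": 8, "recencyWindow": "year" }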
tool({
name: "search_sources",
description: text`
Execute web searches across multiple queries and return deduplicated candidate
URLs with title, snippet, and domain. Returns a flat results array.
Call after plan_research with all suggested queries in one call.
Do NOT call before plan_research on a new topic.
`,
parameters: {
queries: z.array(z.string()).describe("Search query strings, e.g. [\"query1\", \"query2\"]"),
maxResults: z.coerce.number().int().min(1).max(20).optional()
.describe("Max results per query. Defaults to config value."),
recencyWindow: z.enum(["day", "week", "month", "year", "any"]).optional()
.describe("Recency filter. Defaults to config value."),
},
implementation: safe_impl("search_sources", async ({ queries, maxResults, recencyWindow }) => {
const max = maxResults ?? maxResultsCfg();
const window = recencyWindow === "any" ? undefined : (recencyWindow ?? searchWindow());
const seen = new Set<string>();
const allResults: Array<{ url: string; title: string; snippet: string; domain: string; query: string }> = [];
for (const q of queries) {
const results = await webSearch(q, max, 10_000, searxngUrl(), window);
for (const r of results) {
if (seen.has(r.url)) continue;
seen.add(r.url);
try {
const domain = new URL(r.url).hostname.replace(/^www\./, "");
allResults.push({ url: r.url, title: r.title, snippet: r.snippet, domain, query: q });
} catch { /* skip malformed URLs */ }
}
}
return json({ results: allResults, totalFound: allResults.length });
}),
}),
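// read_source resolves to a JSON string shaped like the object built below,
// e.g. (values illustrative):
//   { "url": "...", "title": "...", "bodyText": "...", "wordCount": 1240,
//     "publishedDate": "2024-03-15", "namedEntities": ["..."],
//     "hasSchemaMarkup": true, "hasHeadings": true,
//     "wikipediaGrounding": [{ "entity": "...", "summary": "..." }] }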
tool({
name: "read_source",
description: text`
Fetch a URL and return clean readable body text, word count, published date,
named entities, and structural signals. Optionally enriches named entities with
Wikipedia summaries. Returns data for extract_claims and score_source.
Call once per URL from search_sources. Do NOT call the same URL twice.
`,
parameters: {
url: z.string().describe("URL to fetch and read"),
enrichWithWikipedia: z.boolean().optional()
.describe("Fetch Wikipedia summaries for named entities. Defaults to config.enableWikipediaGrounding."),
},
implementation: safe_impl("read_source", async ({ url, enrichWithWikipedia }) => {
const page = await fetchPage(url);
const doWiki = enrichWithWikipedia ?? wikiEnabled();
const wikipediaGrounding: Array<{ entity: string; summary: string }> = [];
if (doWiki && page.namedEntities.length > 0) {
for (const entity of page.namedEntities.slice(0, 5)) {
const w = await fetchWikipediaSummary(entity);
if (w) wikipediaGrounding.push(w);
}
}
return json({
url: page.url,
title: page.title,
bodyText: page.bodyText,
wordCount: page.wordCount,
publishedDate: page.publishedDate,
namedEntities: page.namedEntities,
hasSchemaMarkup: page.hasSchemaMarkup,
hasHeadings: page.hasHeadings,
wikipediaGrounding,
});
}),
}),
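// A single claim object produced by following extract_claims' instructions
// looks like this (values illustrative):
//   { "claim": "...", "type": "statistical", "confidence": 0.9,
//     "verbatimQuote": "...", "sourceUrl": "https://..." }
// Arrays of these objects feed compare_evidence and save_entity downstream.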
tool({
name: "extract_claims",
description: text`
Extract specific verifiable claims from a source's body text. Returns a scaffold
payload — follow the instructions field to produce a JSON array of claim objects.
Pass the output to compare_evidence and save_entity.
Call once per source after read_source. Do NOT pass the full HTML — use bodyText only.
`,
parameters: {
sourceText: z.string().describe("The source body text (from read_source.bodyText)"),
url: z.string().describe("URL of the source"),
focusTopic: z.string().optional().describe("Optional: filter claims to those relevant to this topic"),
},
implementation: safe_impl("extract_claims", async ({ sourceText, url, focusTopic }) => {
return json({
action: "extract_claims",
sourceText: sourceText.slice(0, 8_000),
url,
focusTopic: focusTopic ?? null,
instructions: `Extract specific verifiable claims from the source text above.${focusTopic ? ` Focus on claims related to: ${focusTopic}` : ""}
For each claim return a JSON object with:
- "claim": the specific factual assertion (not opinion)
- "type": "factual" | "statistical" | "opinion" | "citation"
- "confidence": 0.0–1.0 (how clearly stated vs. implied)
- "verbatimQuote": the exact quote from the source
- "sourceUrl": "${url}"
Focus on: specific dates, numbers, named entities, causal claims, statistics.
Skip vague claims like "experts say" with no attribution.
Return ONLY a JSON array of claim objects. No other text.`,
});
}),
}),
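// score_source returns scoreSource()'s result as-is. Based on how it is used
// below, the result carries at least { score, verdict, signals }, e.g.
// (numbers illustrative): { "score": 72, "verdict": "moderate", "signals": { ... } }.
// The exact signal keys are defined in ./scorer and are not assumed here.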
tool({
name: "score_source",
description: text`
Score a source on 6 heuristic signals and return a 0–100 reliability score with
per-signal breakdown and a verdict (high/moderate/low/very_low). Persists the
score to the local knowledge graph. Score measures reliability proxies, NOT truth.
Call once per source after read_source, before compare_evidence.
`,
parameters: {
url: z.string().describe("URL of the source"),
bodyText: z.string().describe("Body text of the source (from read_source.bodyText)"),
publishedDate: z.string().optional()
.describe("ISO date string from read_source.publishedDate. Leave blank if unknown."),
hasSchemaMarkup: z.boolean().optional().describe("From read_source.hasSchemaMarkup"),
hasHeadings: z.boolean().optional().describe("From read_source.hasHeadings"),
otherSourceUrls: z.array(z.string()).optional()
.describe("Other source URLs fetched this session, for cross-source corroboration"),
},
implementation: safe_impl("score_source", async ({ url, bodyText, publishedDate, hasSchemaMarkup, hasHeadings, otherSourceUrls }) => {
const result = scoreSource(url, bodyText, publishedDate ?? "", hasSchemaMarkup ?? false, hasHeadings ?? false, otherSourceUrls ?? []);
try {
upsertSource(
getDb(dataPath()),
url,
new URL(url).hostname.replace(/^www\./, ""),
result.score,
result.verdict,
result.signals,
);
} catch { /* non-fatal: persist failure shouldn't break scoring */ }
return json(result);
}),
}),
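// check_fact forwards the claim to ./factcheck unchanged; per the description
// below, the result includes covered: false when no professional fact-checker
// has reviewed the claim, so a miss on niche claims is expected rather than an error.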
tool({
name: "check_fact",
description: text`
Query the Google Fact Check Tools API for professional fact-checks on a specific claim.
Returns verdicts from PolitiFact, Snopes, Reuters, AFP, and others.
Returns covered:false for niche or technical claims not covered by fact-checkers.
Call when the user asks if a specific claim is true/false, or after compare_evidence
surfaces a contradicted claim. Works without an API key (unauthenticated requests are rate-limited).
`,
parameters: {
claim: z.string().describe("The specific claim to fact-check"),
},
implementation: safe_impl("check_fact", async ({ claim }) => {
const result = await checkGoogleFactCheck(claim, factCheckKey());
return json(result);
}),
}),
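// compare_evidence expects claimsJson to be the claim objects gathered via
// extract_claims from two or more sources, concatenated into one JSON array,
// e.g. (URLs illustrative):
//   [{ "claim": "...", "type": "factual", "sourceUrl": "https://site-a.example/report", "verbatimQuote": "..." },
//    { "claim": "...", "type": "factual", "sourceUrl": "https://site-b.example/analysis", "verbatimQuote": "..." }]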
tool({
name: "compare_evidence",
description: text`
Compare claims from multiple sources for corroboration, contradiction, and single-source
risk. Returns a scaffold payload — follow the instructions field to produce a JSON
object with corroborated/unverified/contradicted claim sets and a synthesis summary.
Call after collecting claims from at least 2 sources via extract_claims.
Do NOT call with claims from a single source — use check_fact instead.
`,
parameters: {
claimsJson: z.string()
.describe("JSON array of claim objects from extract_claims across multiple sources"),
topic: z.string().describe("The research topic these claims relate to"),
},
implementation: safe_impl("compare_evidence", async ({ claimsJson, topic }) => {
const claims = JSON.parse(claimsJson) as Array<{ claim: string; type: string; sourceUrl: string; verbatimQuote: string }>;
return json({
action: "compare_evidence",
topic,
claims,
instructions: `Analyze the ${claims.length} claims above from multiple sources about "${topic}".
For each claim:
1. If 2+ independent sources support it → mark corroborated: true
2. If only 1 source mentions it → mark unverified: true
3. If sources directly contradict each other → mark both contradicted: true, add contradiction_note
Return a JSON object:
{
"corroborated": [...claims with corroborated:true],
"unverified": [...claims with unverified:true],
"contradicted": [...pairs with contradicted:true and contradiction_note],
"summary": "2-3 sentence synthesis of overall evidence quality"
}
Output ONLY valid JSON.`,
});
}),
}),
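// Illustrative save_entity arguments (the entity and tags are made up; note that
// claimsJson, sourceUrlsJson, and tags are JSON arrays passed as strings):
//   { "entityName": "Acme Robotics", "entityType": "company",
//     "claimsJson": "[{\"claim\":\"...\",\"confidence\":0.8,\"sourceUrl\":\"https://...\"}]",
//     "sourceUrlsJson": "[\"https://...\"]",
//     "tags": "[\"robotics\",\"startup\"]" }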
tool({
name: "save_entity",
description: text`
Persist a named entity (company, person, technology, concept, or event) and its
associated claims to the local SQLite knowledge graph. Returns the entityId.
Entities are upserted — calling again with the same name updates rather than duplicates.
Call once per key entity after compare_evidence and before generate_report.
Do NOT skip this — without it the knowledge graph stays empty across sessions.
`,
parameters: {
entityName: z.string().describe("Name of the entity (company, person, technology, concept, event)"),
entityType: z.enum(["company", "person", "technology", "concept", "event"])
.describe("Entity type"),
claimsJson: z.string().describe("JSON array of claim objects to associate with this entity"),
sourceUrlsJson: z.string().describe("JSON array of source URL strings"),
tags: z.string().optional().describe("JSON array of tag strings, e.g. [\"AI\",\"startup\"]"),
},
implementation: safe_impl("save_entity", async ({ entityName, entityType, claimsJson, sourceUrlsJson, tags }) => {
const db = getDb(dataPath());
const parsedTags: string[] = tags ? JSON.parse(tags) : [];
const entityId = upsertEntity(db, entityName, entityType, parsedTags);
const claims = JSON.parse(claimsJson) as Array<{
claim: string; type?: string; confidence?: number; sourceUrl?: string; verbatimQuote?: string;
}>;
for (const c of claims) {
insertClaim(db, entityId, c.claim, c.type ?? "factual", c.confidence ?? 0.5, c.sourceUrl ?? "", c.verbatimQuote ?? "");
}
const sourceUrls: string[] = JSON.parse(sourceUrlsJson);
for (const url of sourceUrls) {
try {
upsertSource(db, url, new URL(url).hostname.replace(/^www\./, ""), 0, "unknown", {});
} catch { /* skip malformed URL */ }
}
return json({ entityId, entityName, saved: true, claimsSaved: claims.length });
}),
}),
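// generate_report stores report metadata (topic, format, linked entity ids,
// source URLs) in the knowledge graph before returning its writing scaffold.
// Illustrative arguments, reusing a name from an earlier save_entity call:
//   { "topic": "Acme Robotics funding history", "format": "briefing",
//     "claimsJson": "[...]", "sourcesJson": "[...]",
//     "entityNamesJson": "[\"Acme Robotics\"]" }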
tool({
name: "generate_report",
description: text`
Produce a structured research report from collected evidence. Returns a scaffold
payload — follow the instructions field to write the final report.
Call after compare_evidence. Pass entityNamesJson with entity names from any
save_entity calls made this session so they are linked in the knowledge graph.
Do NOT call before you have scored sources and compared evidence.
`,
parameters: {
topic: z.string().describe("Research topic title"),
claimsJson: z.string().describe("JSON array of all claim objects collected"),
sourcesJson: z.string().describe("JSON array of scored source objects (from score_source results)"),
entityNamesJson: z.string().optional().default("[]")
.describe("JSON array of entity name strings saved via save_entity this session, e.g. [\"OpenAI\",\"Sam Altman\"]"),
format: z.enum(["briefing", "dossier", "market_map", "literature_review", "competitor_comparison"])
.describe("Report format"),
includeConfidenceMarkers: z.boolean().optional().default(true)
.describe("Add ✓/⚠/✗ confidence markers to claims"),
},
implementation: safe_impl("generate_report", async ({ topic, claimsJson, sourcesJson, entityNamesJson, format, includeConfidenceMarkers }) => {
const claims = JSON.parse(claimsJson);
const sources = JSON.parse(sourcesJson);
const sourceUrls = (sources as Array<{ url: string }>).map(s => s.url);
const entityNames: string[] = JSON.parse(entityNamesJson ?? "[]");
try {
const db = getDb(dataPath());
const entityIds = getEntityIdsByNames(db, entityNames);
insertReport(db, topic, format, entityIds, sourceUrls);
} catch { /* non-fatal */ }
const formatGuide: Record<string, string> = {
briefing: "400–600 words. Executive summary. Key findings + implications.",
dossier: "800–1200 words. Structured profile with named sections.",
market_map: "Player landscape with positioning. Use tables or lists.",
literature_review: "Academic synthesis. Note methodologies, gaps, consensus.",
competitor_comparison: "Side-by-side strengths/weaknesses/differentiators.",
};
return json({
action: "generate_report",
topic,
format,
formatGuide: formatGuide[format],
sources,
claims,
instructions: `Write a ${format} on "${topic}" using only the evidence above.
Rules:
- Cite sources inline as [domain] e.g. [reuters.com]
${includeConfidenceMarkers ? "- Mark each key claim: ✓ (2+ sources agree) ⚠ (single source) ✗ (contradicted)" : ""}
- Do not introduce facts not in the evidence above
- Flag information gaps explicitly if critical data is missing
- ${formatGuide[format]}
Write the report directly. No preamble.`,
});
}),
}),
tool({
name: "list_prior_reports",
description: text`
List previously saved research reports from the local knowledge graph.
Returns id, topic, format, createdAt, entityCount, and sourceCount per report.
Does NOT return full report content — it returns metadata only.
Call when the user asks "what have we researched?" or "show me past reports".
Pass a topic keyword to filter results.
`,
parameters: {
topic: z.string().optional().describe("Optional keyword filter for report topic"),
limit: z.coerce.number().int().min(1).max(100).optional()
.describe("Max number of reports to return. Default 20."),
},
implementation: safe_impl("list_prior_reports", async ({ topic, limit }) => {
const db = getDb(dataPath());
const reports = listReports(db, topic, limit ?? 20);
return json({ reports, count: reports.length });
}),
}),
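// Illustrative update_entity_timeline arguments (the entity must already exist;
// otherwise the wrapped error below tells the model to call save_entity first):
//   { "entityName": "Acme Robotics", "event": "Announced a Series B round",
//     "date": "2024-03-15", "sourceUrl": "https://..." }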
tool({
name: "update_entity_timeline",
description: text`
Append a timestamped event to an existing entity's history in the knowledge graph.
Events are INSERT-only — this never overwrites existing timeline entries.
Requires the entity to already exist (call save_entity first).
Call when the user says "add that X happened" or when a source reveals a new
dated event for a tracked entity.
`,
parameters: {
entityName: z.string().describe("Name of the entity (must already exist — call save_entity first)"),
event: z.string().describe("Description of the event"),
date: z.string().describe("ISO date string, e.g. 2024-03-15"),
sourceUrl: z.string().describe("URL of the source for this event"),
},
implementation: safe_impl("update_entity_timeline", async ({ entityName, event, date, sourceUrl }) => {
const db = getDb(dataPath());
const entity = getEntityByName(db, entityName);
if (!entity) {
throw new Error(`Entity "${entityName}" not found. Call save_entity first to create it.`);
}
insertTimelineEvent(db, entity.id, event, date, sourceUrl);
return json({ updated: true, entityId: entity.id, entityName, event, date });
}),
}),
];
return tools;
};