// src/toolsProvider.ts
import { text, tool, type Tool, type ToolsProvider } from "@lmstudio/sdk";
import { join } from "path";
import { homedir } from "os";
import { z } from "zod";
import { pluginConfigSchematics } from "./config";
import { openDb, upsertEntity, insertClaim, upsertSource, insertTimelineEvent, insertReport, listReports, getEntityByName, getEntityIdsByNames } from "./db";
import { webSearch, type TimeRange } from "./search";
import { fetchPage, fetchWikipediaSummary } from "./fetch";
import { scoreSource } from "./scorer";
import { checkGoogleFactCheck } from "./factcheck";
function json(obj: unknown): string {
return JSON.stringify(obj, null, 2);
}
function safe_impl<T extends Record<string, unknown>>(
name: string,
fn: (params: T) => Promise<string>,
): (params: T) => Promise<string> {
return async (params: T) => {
try {
return await fn(params);
} catch (err: unknown) {
const msg = err instanceof Error ? err.message : String(err);
return JSON.stringify({
tool_error: true, tool: name, error: msg,
hint: "Read the error, fix the parameter causing the issue, and retry.",
}, null, 2);
}
};
}
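// Illustrative use of safe_impl (the "demo" tool and its params are made up):
// a wrapped implementation never throws; failures come back as JSON the
// calling model can read and retry on.
//
//   const impl = safe_impl("demo", async ({ n }: { n: number }) => json({ doubled: n * 2 }));
//   await impl({ n: 2 });   // pretty-printed '{ "doubled": 4 }'
//   // If the handler throws, the result is { tool_error: true, tool: "demo", error, hint }.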
const _dbCache = new Map<string, ReturnType<typeof openDb>>();
function getDb(path: string): ReturnType<typeof openDb> {
let db = _dbCache.get(path);
if (!db) { db = openDb(path); _dbCache.set(path, db); }
return db;
}
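// Repeated calls with the same path reuse one connection, since the Map above
// keys connections by path. Illustrative, with a made-up path:
//   getDb("/tmp/research.sqlite") === getDb("/tmp/research.sqlite"); // true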
export const toolsProvider: ToolsProvider = async (ctl) => {
const cfg = ctl.getPluginConfig(pluginConfigSchematics);
const dataPath = () => cfg.get("dataPath").trim() || join(homedir(), "research-data");
const searxngUrl = () => cfg.get("searxngUrl").trim() || undefined;
const maxResultsCfg = () => cfg.get("maxResultsPerQuery");
const searchWindow = (): TimeRange | undefined => {
const v = cfg.get("searchRecencyWindow").trim().toLowerCase();
return (["day", "week", "month", "year"].includes(v) ? v : undefined) as TimeRange | undefined;
};
const wikiEnabled = () => cfg.get("enableWikipediaGrounding") === "true";
const factCheckKey = () => cfg.get("googleFactCheckApiKey").trim() || undefined;
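// Note: plan_research, extract_claims, compare_evidence, and generate_report do
// not compute an answer themselves. Each returns a "scaffold" payload whose
// `instructions` field tells the calling model what to produce next, roughly
// (shape illustrative, values elided):
//
//   { "action": "plan_research", "question": "...", "depth": "standard",
//     "instructions": "Break the research question into 5 sub-questions ... Output ONLY valid JSON." }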
const tools: Tool[] = [
tool({
name: "plan_research",
description: text`
Decompose a research question into sub-questions, identify knowns vs. unknowns,
and produce concrete search queries. Returns a scaffold payload — follow the
instructions field to generate the plan JSON.
Call this FIRST on any new research topic before calling search_sources.
Do NOT call for simple factual lookups that need only one search.
`,
parameters: {
question: z.string().describe("The research question to decompose"),
depth: z.enum(["quick", "standard", "deep"]).default("standard")
.describe("quick=3 sub-questions, standard=5, deep=8"),
},
implementation: safe_impl("plan_research", async ({ question, depth }) => {
const counts = { quick: 3, standard: 5, deep: 8 };
const n = counts[depth];
return json({
action: "plan_research",
question,
depth,
instructions: `Break the research question into ${n} specific sub-questions that together would fully answer it. Identify what is already known vs. unknown. Generate 2 concrete search queries per sub-question.
Return a JSON object:
{
"subQuestions": ["...", ...],
"knownFacts": ["..."],
"unknowns": ["..."],
"suggestedSearchQueries": ["...", ...],
"estimatedSources": ${n * 2}
}
Output ONLY valid JSON.`,
});
}),
}),
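// Illustrative search_sources arguments (the query strings are made up;
// maxResults and recencyWindow are optional and fall back to the plugin config):
//   { "queries": ["solid-state battery commercialization 2024", "solid-state battery manufacturers"],
//     "maxResults": 8, "recencyWindow": "year" }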
tool({
name: "search_sources",
description: text`
Execute web searches across multiple queries and return deduplicated candidate
URLs with title, snippet, and domain. Returns a flat results array.
Call after plan_research with all suggested queries in one call.
Do NOT call before plan_research on a new topic.
`,
parameters: {
queries: z.array(z.string()).describe("Search query strings, e.g. [\"query1\", \"query2\"]"),
maxResults: z.coerce.number().int().min(1).max(20).optional()
.describe("Max results per query. Defaults to config value."),
recencyWindow: z.enum(["day", "week", "month", "year", "any"]).optional()
.describe("Recency filter. Defaults to config value."),
},
implementation: safe_impl("search_sources", async ({ queries, maxResults, recencyWindow }) => {
const max = maxResults ?? maxResultsCfg();
const window = recencyWindow === "any" ? undefined : (recencyWindow ?? searchWindow());
const seen = new Set<string>();
const allResults: Array<{ url: string; title: string; snippet: string; domain: string; query: string }> = [];
for (const q of queries) {
const results = await webSearch(q, max, 10_000, searxngUrl(), window);
for (const r of results) {
if (seen.has(r.url)) continue;
seen.add(r.url);
try {
const domain = new URL(r.url).hostname.replace(/^www\./, "");
allResults.push({ url: r.url, title: r.title, snippet: r.snippet, domain, query: q });
} catch { /* skip malformed URLs */ }
}
}
return json({ results: allResults, totalFound: allResults.length });
}),
}),
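// read_source resolves to a JSON string shaped like the object built below,
// e.g. (values illustrative):
//   { "url": "...", "title": "...", "bodyText": "...", "wordCount": 1240,
//     "publishedDate": "2024-03-15", "namedEntities": ["..."],
//     "hasSchemaMarkup": true, "hasHeadings": true,
//     "wikipediaGrounding": [{ "entity": "...", "summary": "..." }] }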
tool({
name: "read_source",
description: text`
Fetch a URL and return clean readable body text, word count, published date,
named entities, and structural signals. Optionally enriches named entities with
Wikipedia summaries. Returns data for extract_claims and score_source.
Call once per URL from search_sources. Do NOT call the same URL twice.
`,
parameters: {
url: z.string().describe("URL to fetch and read"),
enrichWithWikipedia: z.boolean().optional()
.describe("Fetch Wikipedia summaries for named entities. Defaults to config.enableWikipediaGrounding."),
},
implementation: safe_impl("read_source", async ({ url, enrichWithWikipedia }) => {
const page = await fetchPage(url);
const doWiki = enrichWithWikipedia ?? wikiEnabled();
const wikipediaGrounding: Array<{ entity: string; summary: string }> = [];
if (doWiki && page.namedEntities.length > 0) {
for (const entity of page.namedEntities.slice(0, 5)) {
const w = await fetchWikipediaSummary(entity);
if (w) wikipediaGrounding.push(w);
}
}
return json({
url: page.url,
title: page.title,
bodyText: page.bodyText,
wordCount: page.wordCount,
publishedDate: page.publishedDate,
namedEntities: page.namedEntities,
hasSchemaMarkup: page.hasSchemaMarkup,
hasHeadings: page.hasHeadings,
wikipediaGrounding,
});
}),
}),
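// A single claim object produced by following extract_claims' instructions
// looks like this (values illustrative):
//   { "claim": "...", "type": "statistical", "confidence": 0.9,
//     "verbatimQuote": "...", "sourceUrl": "https://..." }
// Arrays of these objects feed compare_evidence and save_entity downstream.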
tool({
name: "extract_claims",
description: text`
Extract specific verifiable claims from a source's body text. Returns a scaffold
payload — follow the instructions field to produce a JSON array of claim objects.
Pass the output to compare_evidence and save_entity.
Call once per source after read_source. Do NOT pass the full HTML — use bodyText only.
`,
parameters: {
sourceText: z.string().describe("The source body text (from read_source.bodyText)"),
url: z.string().describe("URL of the source"),
focusTopic: z.string().optional().describe("Optional: filter claims to those relevant to this topic"),
},
implementation: safe_impl("extract_claims", async ({ sourceText, url, focusTopic }) => {
return json({
action: "extract_claims",
sourceText: sourceText.slice(0, 8_000),
url,
focusTopic: focusTopic ?? null,
instructions: `Extract specific verifiable claims from the source text above.${focusTopic ? ` Focus on claims related to: ${focusTopic}` : ""}
For each claim return a JSON object with:
- "claim": the specific factual assertion (not opinion)
- "type": "factual" | "statistical" | "opinion" | "citation"
- "confidence": 0.0–1.0 (how clearly stated vs. implied)
- "verbatimQuote": the exact quote from the source
- "sourceUrl": "${url}"
Focus on: specific dates, numbers, named entities, causal claims, statistics.
Skip vague claims like "experts say" with no attribution.
Return ONLY a JSON array of claim objects. No other text.`,
});
}),
}),
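// score_source returns scoreSource()'s result as-is. Based on how it is used
// below, the result carries at least { score, verdict, signals }, e.g.
// (numbers illustrative): { "score": 72, "verdict": "moderate", "signals": { ... } }.
// The exact signal keys are defined in ./scorer and are not assumed here.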
tool({
name: "score_source",
description: text`
Score a source on 6 heuristic signals and return a 0–100 reliability score with
per-signal breakdown and a verdict (high/moderate/low/very_low). Persists the
score to the local knowledge graph. Score measures reliability proxies, NOT truth.
Call once per source after read_source, before compare_evidence.
`,
parameters: {
url: z.string().describe("URL of the source"),
bodyText: z.string().describe("Body text of the source (from read_source.bodyText)"),
publishedDate: z.string().optional()
.describe("ISO date string from read_source.publishedDate. Leave blank if unknown."),
hasSchemaMarkup: z.boolean().optional().describe("From read_source.hasSchemaMarkup"),
hasHeadings: z.boolean().optional().describe("From read_source.hasHeadings"),
otherSourceUrls: z.array(z.string()).optional()
.describe("Other source URLs fetched this session, for cross-source corroboration"),
},
implementation: safe_impl("score_source", async ({ url, bodyText, publishedDate, hasSchemaMarkup, hasHeadings, otherSourceUrls }) => {
const result = scoreSource(url, bodyText, publishedDate ?? "", hasSchemaMarkup ?? false, hasHeadings ?? false, otherSourceUrls ?? []);
try {
upsertSource(
getDb(dataPath()),
url,
new URL(url).hostname.replace(/^www\./, ""),
result.score,
result.verdict,
result.signals,
);
} catch { /* non-fatal: persist failure shouldn't break scoring */ }
return json(result);
}),
}),
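// check_fact forwards the claim to ./factcheck unchanged; per the description
// below, the result includes covered: false when no professional fact-checker
// has reviewed the claim, so a miss on niche claims is expected rather than an error.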
tool({
name: "check_fact",
description: text`
Query the Google Fact Check Tools API for professional fact-checks on a specific claim.
Returns verdicts from PolitiFact, Snopes, Reuters, AFP, and others.
Returns covered:false for niche or technical claims not covered by fact-checkers.
Call when the user asks if a specific claim is true/false, or after compare_evidence
surfaces a contradicted claim. Works without an API key (unauthenticated requests are rate-limited).
`,
parameters: {
claim: z.string().describe("The specific claim to fact-check"),
},
implementation: safe_impl("check_fact", async ({ claim }) => {
const result = await checkGoogleFactCheck(claim, factCheckKey());
return json(result);
}),
}),
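// compare_evidence expects claimsJson to be the claim objects gathered via
// extract_claims from two or more sources, concatenated into one JSON array,
// e.g. (URLs illustrative):
//   [{ "claim": "...", "type": "factual", "sourceUrl": "https://site-a.example/report", "verbatimQuote": "..." },
//    { "claim": "...", "type": "factual", "sourceUrl": "https://site-b.example/analysis", "verbatimQuote": "..." }]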
tool({
name: "compare_evidence",
description: text`
Compare claims from multiple sources for corroboration, contradiction, and single-source
risk. Returns a scaffold payload — follow the instructions field to produce a JSON
object with corroborated/unverified/contradicted claim sets and a synthesis summary.
Call after collecting claims from at least 2 sources via extract_claims.
Do NOT call with claims from a single source — use check_fact instead.
`,
parameters: {
claimsJson: z.string()
.describe("JSON array of claim objects from extract_claims across multiple sources"),
topic: z.string().describe("The research topic these claims relate to"),
},
implementation: safe_impl("compare_evidence", async ({ claimsJson, topic }) => {
const claims = JSON.parse(claimsJson) as Array<{ claim: string; type: string; sourceUrl: string; verbatimQuote: string }>;
return json({
action: "compare_evidence",
topic,
claims,
instructions: `Analyze the ${claims.length} claims above from multiple sources about "${topic}".
For each claim:
1. If 2+ independent sources support it → mark corroborated: true
2. If only 1 source mentions it → mark unverified: true
3. If sources directly contradict each other → mark both contradicted: true, add contradiction_note
Return a JSON object:
{
"corroborated": [...claims with corroborated:true],
"unverified": [...claims with unverified:true],
"contradicted": [...pairs with contradicted:true and contradiction_note],
"summary": "2-3 sentence synthesis of overall evidence quality"
}
Output ONLY valid JSON.`,
});
}),
}),
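// Illustrative save_entity arguments (the entity and tags are made up; note that
// claimsJson, sourceUrlsJson, and tags are JSON arrays passed as strings):
//   { "entityName": "Acme Robotics", "entityType": "company",
//     "claimsJson": "[{\"claim\":\"...\",\"confidence\":0.8,\"sourceUrl\":\"https://...\"}]",
//     "sourceUrlsJson": "[\"https://...\"]",
//     "tags": "[\"robotics\",\"startup\"]" }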
tool({
name: "save_entity",
description: text`
Persist a named entity (company, person, technology, concept, or event) and its
associated claims to the local SQLite knowledge graph. Returns the entityId.
Entities are upserted — calling again with the same name updates rather than duplicates.
Call once per key entity after compare_evidence and before generate_report.
Do NOT skip this — without it the knowledge graph stays empty across sessions.
`,
parameters: {
entityName: z.string().describe("Name of the entity (company, person, technology, concept, event)"),
entityType: z.enum(["company", "person", "technology", "concept", "event"])
.describe("Entity type"),
claimsJson: z.string().describe("JSON array of claim objects to associate with this entity"),
sourceUrlsJson: z.string().describe("JSON array of source URL strings"),
tags: z.string().optional().describe("JSON array of tag strings, e.g. [\"AI\",\"startup\"]"),
},
implementation: safe_impl("save_entity", async ({ entityName, entityType, claimsJson, sourceUrlsJson, tags }) => {
const db = getDb(dataPath());
const parsedTags: string[] = tags ? JSON.parse(tags) : [];
const entityId = upsertEntity(db, entityName, entityType, parsedTags);
const claims = JSON.parse(claimsJson) as Array<{
claim: string; type?: string; confidence?: number; sourceUrl?: string; verbatimQuote?: string;
}>;
for (const c of claims) {
insertClaim(db, entityId, c.claim, c.type ?? "factual", c.confidence ?? 0.5, c.sourceUrl ?? "", c.verbatimQuote ?? "");
}
const sourceUrls: string[] = JSON.parse(sourceUrlsJson);
for (const url of sourceUrls) {
try {
upsertSource(db, url, new URL(url).hostname.replace(/^www\./, ""), 0, "unknown", {});
} catch { /* skip malformed URL */ }
}
return json({ entityId, entityName, saved: true, claimsSaved: claims.length });
}),
}),
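// generate_report stores report metadata (topic, format, linked entity ids,
// source URLs) in the knowledge graph before returning its writing scaffold.
// Illustrative arguments, reusing a name from an earlier save_entity call:
//   { "topic": "Acme Robotics funding history", "format": "briefing",
//     "claimsJson": "[...]", "sourcesJson": "[...]",
//     "entityNamesJson": "[\"Acme Robotics\"]" }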
tool({
name: "generate_report",
description: text`
Produce a structured research report from collected evidence. Returns a scaffold
payload — follow the instructions field to write the final report.
Call after compare_evidence. Pass entityNamesJson with entity names from any
save_entity calls made this session so they are linked in the knowledge graph.
Do NOT call before you have scored sources and compared evidence.
`,
parameters: {
topic: z.string().describe("Research topic title"),
claimsJson: z.string().describe("JSON array of all claim objects collected"),
sourcesJson: z.string().describe("JSON array of scored source objects (from score_source results)"),
entityNamesJson: z.string().optional().default("[]")
.describe("JSON array of entity name strings saved via save_entity this session, e.g. [\"OpenAI\",\"Sam Altman\"]"),
format: z.enum(["briefing", "dossier", "market_map", "literature_review", "competitor_comparison"])
.describe("Report format"),
includeConfidenceMarkers: z.boolean().optional().default(true)
.describe("Add ✓/⚠/✗ confidence markers to claims"),
},
implementation: safe_impl("generate_report", async ({ topic, claimsJson, sourcesJson, entityNamesJson, format, includeConfidenceMarkers }) => {
const claims = JSON.parse(claimsJson);
const sources = JSON.parse(sourcesJson);
const sourceUrls = (sources as Array<{ url: string }>).map(s => s.url);
const entityNames: string[] = JSON.parse(entityNamesJson ?? "[]");
try {
const db = getDb(dataPath());
const entityIds = getEntityIdsByNames(db, entityNames);
insertReport(db, topic, format, entityIds, sourceUrls);
} catch { /* non-fatal */ }
const formatGuide: Record<string, string> = {
briefing: "400–600 words. Executive summary. Key findings + implications.",
dossier: "800–1200 words. Structured profile with named sections.",
market_map: "Player landscape with positioning. Use tables or lists.",
literature_review: "Academic synthesis. Note methodologies, gaps, consensus.",
competitor_comparison: "Side-by-side strengths/weaknesses/differentiators.",
};
return json({
action: "generate_report",
topic,
format,
formatGuide: formatGuide[format],
sources,
claims,
instructions: `Write a ${format} on "${topic}" using only the evidence above.
Rules:
- Cite sources inline as [domain] e.g. [reuters.com]
${includeConfidenceMarkers ? "- Mark each key claim: ✓ (2+ sources agree) ⚠ (single source) ✗ (contradicted)" : ""}
- Do not introduce facts not in the evidence above
- Flag information gaps explicitly if critical data is missing
- ${formatGuide[format]}
Write the report directly. No preamble.`,
});
}),
}),
tool({
name: "list_prior_reports",
description: text`
List previously saved research reports from the local knowledge graph.
Returns id, topic, format, createdAt, entityCount, and sourceCount per report.
Does NOT return full report content — it returns metadata only.
Call when the user asks "what have we researched?" or "show me past reports".
Pass a topic keyword to filter results.
`,
parameters: {
topic: z.string().optional().describe("Optional keyword filter for report topic"),
limit: z.coerce.number().int().min(1).max(100).optional()
.describe("Max number of reports to return. Default 20."),
},
implementation: safe_impl("list_prior_reports", async ({ topic, limit }) => {
const db = getDb(dataPath());
const reports = listReports(db, topic, limit ?? 20);
return json({ reports, count: reports.length });
}),
}),
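// Illustrative update_entity_timeline arguments (the entity must already exist;
// otherwise the wrapped error below tells the model to call save_entity first):
//   { "entityName": "Acme Robotics", "event": "Announced a Series B round",
//     "date": "2024-03-15", "sourceUrl": "https://..." }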
tool({
name: "update_entity_timeline",
description: text`
Append a timestamped event to an existing entity's history in the knowledge graph.
Events are INSERT-only — this never overwrites existing timeline entries.
Requires the entity to already exist (call save_entity first).
Call when the user says "add that X happened" or when a source reveals a new
dated event for a tracked entity.
`,
parameters: {
entityName: z.string().describe("Name of the entity (must already exist — call save_entity first)"),
event: z.string().describe("Description of the event"),
date: z.string().describe("ISO date string, e.g. 2024-03-15"),
sourceUrl: z.string().describe("URL of the source for this event"),
},
implementation: safe_impl("update_entity_timeline", async ({ entityName, event, date, sourceUrl }) => {
const db = getDb(dataPath());
const entity = getEntityByName(db, entityName);
if (!entity) {
throw new Error(`Entity "${entityName}" not found. Call save_entity first to create it.`);
}
insertTimelineEvent(db, entity.id, event, date, sourceUrl);
return json({ updated: true, entityId: entity.id, entityName, event, date });
}),
}),
];
return tools;
};