// src/index.ts
import { tool, Tool, ToolsProviderController, type PluginContext } from "@lmstudio/sdk";
import { z } from "zod";
import * as cheerio from "cheerio";
import { randomUUID } from "crypto";
// --- GLOBAL CONFIGURATION ---
// All lengths below are measured in characters of the cleaned page text.
const MAX_TEXT_LENGTH = 20000; // Max length for stored content; fetchAndExtractFullContent truncates to this before caching
const MAX_CHUNK_LENGTH = 15000; // The size of each "scroll" segment passed to the model per crawl/scroll call
const SEARCH_CRAWL_LIMIT = 5; // Max number of search results that searchWeb crawls automatically
const AUTO_CRAWL_CONTENT_LIMIT = 1000; // The amount of content kept in the summary of each auto-crawled page
const MAX_LINKS_TO_RETURN = 10; // Limit on the number of same-site links suggested for manual following
// --- STATE MANAGEMENT: Cache for content that can be scrolled and suggested links ---
// Keyed by the session ID (a random UUID) handed back to the model by the crawl tools.
// Each entry stores the full cleaned text, the current scroll offset (index of the next
// unread character), and the map of indexed link IDs ("L1", "L2", ...) to absolute URLs.
// NOTE(review): entries are only removed by the tools themselves (scroll end, follow, or a
// crawl with nothing left to scroll/follow); no size cap or expiry is visible in this file.
const pageCache = new Map<string, { fullText: string, currentOffset: number, suggestedLinks: Record<string, string> }>();
// --- COMMON EXCLUSIONS FOR DEEPER SEARCH (Filtering Low-Value URLs) ---
// Every pattern is tested against a lower-cased URL by normalizeAndFilterLink.
const COMMON_URL_EXCLUSIONS = [
  // 1. Media/Files
  /\.pdf$/, /\.jpg$/, /\.png$/, /\.gif$/, /\.svg$/,
  /\.zip$/, /\.rar$/, /\.tar$/, /\.gz$/,
  /\.mp3$/, /\.mp4$/, /\.mov$/, /\.avi$/,
  // 2. Forms/Auth/Social
  /\/login\/?$/, /\/register\/?$/, /\/forgot-password\/?$/,
  /:\/\/(www\.)?twitter\.com/, /:\/\/(www\.)?facebook\.com/, /:\/\/(www\.)?instagram\.com/,
  // 3. Query Parameter Traps
  /\?.*(sort|filter|order|limit|sessionid|phpsessid|cart|checkout|wishlist|compare|src)=/i,
];

// --- Helper Functions ---

/**
 * Normalizes a URL, makes it absolute, and filters it against common exclusions.
 *
 * @param baseUrl Absolute URL that relative `href` values are resolved against.
 * @param href Raw link target (may be relative, empty, or undefined).
 * @returns The absolute URL with its fragment removed, or `null` when the link is
 *   missing, unparsable, not http(s), or matches one of COMMON_URL_EXCLUSIONS.
 */
function normalizeAndFilterLink(baseUrl: string, href: string | undefined): string | null {
  if (!href) return null;

  let url: URL;
  try {
    url = new URL(href, baseUrl);
  } catch {
    // Malformed href or base: treat as "no link".
    return null;
  }

  // Compare the parsed scheme exactly; a prefix test would also accept e.g. "httpx:".
  if (url.protocol !== "http:" && url.protocol !== "https:") return null;

  const normalizedUrl = url.href.replace(/#.*$/, "");

  // The extension and auth patterns are anchored with `$`, so they must also be tested
  // against the URL without its query string; otherwise "report.pdf?download=1" or
  // "/login?next=/" would slip through. The query-trap pattern needs the full URL.
  const lowered = normalizedUrl.toLowerCase();
  const loweredWithoutQuery = `${url.origin}${url.pathname}`.toLowerCase();
  const isExcluded = COMMON_URL_EXCLUSIONS.some(
    (pattern) => pattern.test(lowered) || pattern.test(loweredWithoutQuery),
  );

  return isExcluded ? null : normalizedUrl;
}
/**
 * Fetches, strips boilerplate, extracts structured data, and compresses the content of a single URL.
 * Used exclusively by searchWeb for quick, summarized results.
 *
 * This function never throws: HTTP errors and unexpected failures are reported inside
 * `summary` as a string starting with "[Crawl Failed".
 *
 * @param url Absolute URL of the page to fetch.
 * @param contentLimit Maximum number of characters of cleaned content kept in the summary.
 * @param discoverLink When true, also look for the first same-site link on the page.
 * @returns `summary` in the form "T: <title> | U: <url> | C: <content>" and, when
 *   `discoverLink` is set and a candidate exists, `deepLink` (otherwise `null`).
 */
async function fetchAndSummarizeUrl(url: string, contentLimit: number, discoverLink: boolean = false): Promise<{ summary: string, deepLink: string | null }> {
  let deepLink: string | null = null;
  try {
    const httpResponse = await fetch(url);
    if (!httpResponse.ok) return { summary: `[Crawl Failed: Status ${httpResponse.status}]`, deepLink };
    const html = await httpResponse.text();
    const $ = cheerio.load(html);

    // --- STRUCTURAL COMPRESSION (Stricter Stripping) ---
    $("script, style, noscript, footer, nav, aside, header, form, .ad, .sidebar, .comments, #cookie-banner, [role='navigation'], .wp-block-group, blockquote, figcaption").remove();
    const title = $("title").text().trim() || "No title found";

    // --- LINK DISCOVERY for Deep Crawl (used by searchWeb's deepCrawl=true) ---
    if (discoverLink) {
      const baseUrlObject = new URL(url);
      // Fragment-only links ("#content") and self links normalize to the page's own
      // URL; following them would just fetch the same page a second time.
      const pageHref = baseUrlObject.href.replace(/#.*$/, "");
      for (const el of $("a").toArray()) {
        const candidate = normalizeAndFilterLink(url, $(el).attr("href"));
        if (!candidate || candidate === pageHref) continue;
        // `candidate` is the serialization of an already-parsed URL, so re-parsing is safe.
        const candidateHost = new URL(candidate).hostname;
        const isSameSite =
          candidateHost === baseUrlObject.hostname ||
          candidateHost.endsWith(`.${baseUrlObject.hostname}`);
        if (isSameSite) {
          deepLink = candidate;
          break; // only the first same-site link is wanted
        }
      }
    }

    // --- CONTENT COMPRESSION (Explicit Links - Highly Compressed for Search) ---
    // Replaces HTML <a> tags with explicit, plain text links. URLs are cut to 30
    // characters, and the ellipsis is only added when something was actually cut.
    // NOTE(review): cheerio parses string arguments to replaceWith as HTML — confirm that
    // link text containing "<" is handled acceptably.
    const shortenUrl = (raw: string): string =>
      raw.length > 30 ? `${raw.substring(0, 30)}...` : raw;
    $("a").each((_i, el) => {
      const linkText = $(el).text().trim();
      const linkUrl = $(el).attr("href");
      if (linkText && linkUrl) {
        $(el).replaceWith(`${linkText} (${shortenUrl(linkUrl)})`);
      } else if (linkUrl) {
        $(el).replaceWith(`Link: (${shortenUrl(linkUrl)})`);
      } else {
        $(el).remove();
      }
    });

    // --- STRUCTURAL PRIORITIZATION ---
    // Headings, lists and tables are emitted first so they survive truncation.
    let structuredContent = "";
    // Headers
    $("h1, h2, h3").each((_i, el) => {
      structuredContent += `[H${el.tagName.charAt(1)}]: ${$(el).text().trim()} | `;
    });
    // Lists (Condensed)
    $("ul, ol").each((_i, el) => {
      const listItems = $(el).find("li").map((_j, li) => $(li).text().trim()).get().join("; ");
      if (listItems.length > 0) {
        structuredContent += `[LIST]: ${listItems} | `;
      }
    });
    // Tables (Summarized): header cells plus the first body row only
    $("table").each((_i, el) => {
      const headers = $(el).find("thead th").map((_j, th) => $(th).text().trim()).get().join(" / ");
      const firstRow = $(el).find("tbody tr").eq(0).find("td").map((_j, td) => $(td).text().trim()).get().join(" / ");
      if (headers.length > 0) {
        structuredContent += `[TABLE Headers]: ${headers} [First Row]: ${firstRow} | `;
      }
    });

    // Prefer the main/article region; fall back to the whole body.
    const cleanedText = $("main, article").text().trim() || $("body").text().trim();

    // --- CHARACTER COMPRESSION (Whitespace Minimization) ---
    const fullCleanedText = (structuredContent + cleanedText)
      .replace(/\s+/g, " ") // Reduce all whitespace to a single space
      .replace(/[.,;:]\s*[.,;:]+/g, (match) => match[0] + " ") // Clean up double punctuation
      .trim();

    let contentSummary = fullCleanedText.substring(0, contentLimit);
    if (fullCleanedText.length > contentLimit) {
      contentSummary += " [Content Truncated]";
    }

    const summary = `T: ${title} | U: ${url} | C: ${contentSummary}`;
    return { summary, deepLink };
  } catch (error: unknown) {
    // Best-effort crawl: report the failure in-band, but keep the reason so it can be diagnosed.
    const reason = error instanceof Error ? error.message : String(error);
    return { summary: `[Crawl Failed: Unexpected Error: ${reason}]`, deepLink };
  }
}
/**
 * Fetches, strips boilerplate, extracts structured data, and compresses the content of a single URL,
 * returning the full text for cache storage (used by the crawlWebPage and followSuggestedLink tools).
 *
 * @param url Absolute URL of the page to fetch.
 * @param filterQuery Optional term; sentences containing it (case-insensitive) are copied
 *   to the front of the returned text under a "[QUERY MATCH: ...]" marker.
 * @returns `fullText` (at most MAX_TEXT_LENGTH characters), the page `title`, and
 *   `rawDiscoveredLinks`, a map of link label -> absolute URL holding at most
 *   MAX_LINKS_TO_RETURN same-site links.
 * @throws Error with message "Status <code>" on a non-OK response; errors raised by
 *   `fetch` itself propagate unchanged.
 */
async function fetchAndExtractFullContent(url: string, filterQuery?: string): Promise<{ fullText: string, title: string, rawDiscoveredLinks: Map<string, string> }> {
  const httpResponse = await fetch(url);
  if (!httpResponse.ok) throw new Error(`Status ${httpResponse.status}`);
  const html = await httpResponse.text();
  const $ = cheerio.load(html);

  // Apply Structural Compression filters
  $("script, style, noscript, footer, nav, aside, header, form, .ad, .sidebar, .comments, #cookie-banner, [role='navigation'], .wp-block-group, blockquote, figcaption").remove();
  const title = $("title").text().trim() || "No title";

  // --- Link Collection for Further Crawling ---
  const rawDiscoveredLinks = new Map<string, string>(); // Link label -> Full URL
  const baseUrlObject = new URL(url); // parsed once, not once per anchor
  // Seeding the set with the page's own URL drops self links and fragment-only links,
  // which would otherwise use up slots of the small suggestion list.
  const seenUrls = new Set<string>([baseUrlObject.href.replace(/#.*$/, "")]);
  for (const el of $("a").toArray()) {
    if (rawDiscoveredLinks.size >= MAX_LINKS_TO_RETURN) break;

    const normalizedLink = normalizeAndFilterLink(url, $(el).attr("href"));
    if (!normalizedLink || seenUrls.has(normalizedLink)) continue;

    // Check for same-domain or relevant subdomain. `normalizedLink` is the
    // serialization of an already-parsed URL, so re-parsing it is safe.
    const linkHost = new URL(normalizedLink).hostname;
    const isSameSite =
      linkHost === baseUrlObject.hostname || linkHost.endsWith(`.${baseUrlObject.hostname}`);
    if (!isSameSite) continue;
    seenUrls.add(normalizedLink);

    // Collapse inner whitespace so a label never spans several lines.
    const linkText = $(el).text().replace(/\s+/g, " ").trim() || "Link";
    // Several links may share a label ("Read more", image links that fall back to
    // "Link"); suffix a counter instead of letting later ones overwrite earlier ones.
    let label = linkText;
    for (let n = 2; rawDiscoveredLinks.has(label); n++) {
      label = `${linkText} (${n})`;
    }
    rawDiscoveredLinks.set(label, normalizedLink);
  }

  // --- Content Processing (Structural) ---
  // Headings, lists and tables are emitted first so they survive truncation.
  let structuredContent = "";
  $("h1, h2, h3").each((_i, el) => {
    structuredContent += `[H${el.tagName.charAt(1)}]: ${$(el).text().trim()} | `;
  });
  $("ul, ol").each((_i, el) => {
    const listItems = $(el).find("li").map((_j, li) => $(li).text().trim()).get().join("; ");
    if (listItems.length > 0) {
      structuredContent += `[LIST]: ${listItems} | `;
    }
  });
  $("table").each((_i, el) => {
    const headers = $(el).find("thead th").map((_j, th) => $(th).text().trim()).get().join(" / ");
    const firstRow = $(el).find("tbody tr").eq(0).find("td").map((_j, td) => $(td).text().trim()).get().join(" / ");
    if (headers.length > 0) {
      structuredContent += `[TABLE Headers]: ${headers} [First Row]: ${firstRow} | `;
    }
  });

  // --- LINK COMPRESSION FOR CHUNK CONTENT ---
  // Replaces full links with just the text or nothing.
  $("a").each((_i, el) => {
    const linkText = $(el).text().trim();
    if (linkText) {
      $(el).replaceWith(`[Link: ${linkText}]`); // Use a shorter tag format
    } else {
      $(el).remove();
    }
  });

  // Prefer the main/article region; fall back to the whole body.
  const cleanedText = $("main, article").text().trim() || $("body").text().trim();

  // Aggressive Whitespace and Punctuation Cleanup
  let fullText = (structuredContent + cleanedText)
    .replace(/\s+/g, " ")
    .replace(/[.,;:]\s*[.,;:]+/g, (match) => match[0] + " ")
    .trim();

  // --- Query-based Prioritization (Lossy Semantic Compression) ---
  // A whitespace-only query is ignored: after whitespace collapsing it would match
  // almost every sentence.
  const query = filterQuery?.trim() ?? "";
  if (query) {
    const needle = query.toLowerCase();
    const sentences = fullText.split(/\.|\?|\!/g);
    let bestParagraphs = "";
    for (const sentence of sentences) {
      if (sentence.toLowerCase().includes(needle)) {
        bestParagraphs += sentence.trim() + ". ";
        if (bestParagraphs.length > 1000) break; // cap the prioritized section
      }
    }
    if (bestParagraphs.length > 0) {
      fullText = `[QUERY MATCH: ${query}] ${bestParagraphs.trim()} | ${fullText}`;
    }
  }

  if (fullText.length > MAX_TEXT_LENGTH) {
    fullText = fullText.substring(0, MAX_TEXT_LENGTH);
  }
  return { fullText, title, rawDiscoveredLinks };
}
// --- 1. TOOL DEFINITION FUNCTION ---
/**
 * Builds the four tools exposed by this plugin: searchWeb, crawlWebPage,
 * followSuggestedLink and scrollPage. The crawl/follow/scroll tools share state through
 * the module-level `pageCache`, keyed by the session ID they hand back to the model.
 *
 * @param ctl Controller supplied by the host; not used by this implementation.
 * @returns The list of tool definitions.
 */
async function toolsProvider(ctl: ToolsProviderController): Promise<Tool[]> {
  const tools: Tool[] = [];

  /** Turns an unknown thrown value into a printable message. */
  function describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Shared tail of crawlWebPage and followSuggestedLink: indexes the discovered links
   * as L1..Ln, caches the page under a new session ID when there is anything left to
   * scroll or follow, and formats the first-chunk response.
   */
  function openPageSession(
    successPrefix: string,
    page: { fullText: string, title: string, rawDiscoveredLinks: Map<string, string> },
  ): string {
    const sessionId = randomUUID();

    // Prepare link index for compressed output, while storing full URLs
    const suggestedLinksMap: Record<string, string> = {};
    const suggestedLinksOutput: string[] = [];
    for (const [text, linkUrl] of page.rawDiscoveredLinks) {
      const id = `L${suggestedLinksOutput.length + 1}`;
      suggestedLinksMap[id] = linkUrl;
      suggestedLinksOutput.push(`${id}: ${text} (${new URL(linkUrl).hostname})`);
    }

    const firstChunk = page.fullText.substring(0, MAX_CHUNK_LENGTH);
    const remainingLength = page.fullText.length - firstChunk.length;

    // Only keep a session when scrolling or link following is still possible.
    if (remainingLength > 0 || suggestedLinksOutput.length > 0) {
      pageCache.set(sessionId, {
        fullText: page.fullText,
        currentOffset: firstChunk.length,
        suggestedLinks: suggestedLinksMap,
      });
    }

    let outputMessage = `${successPrefix}${page.title} | First Chunk (HIGHLY COMPRESSED, ${firstChunk.length} chars): ${firstChunk}`;
    if (remainingLength > 0) {
      outputMessage += ` | MORE CONTENT EXISTS (Remaining: ${remainingLength} chars). Use scrollPage with ID ${sessionId}`;
    }
    if (suggestedLinksOutput.length > 0) {
      outputMessage += ` | ACTION: Use followSuggestedLink(sessionId='${sessionId}', linkId='L#') to follow a link.`;
    }
    const linksOutput = suggestedLinksOutput.length > 0
      ? ` | Suggested Links (Indexed): [${suggestedLinksOutput.join(' | ')}]`
      : " | No relevant links suggested.";
    return outputMessage + linksOutput;
  }

  /**
   * Formats the links of a session that is being closed. The cache holds only
   * id -> URL (the link text is not stored) and the IDs stop resolving once the
   * session is deleted, so full URLs are printed for use with crawlWebPage.
   */
  function formatClosingLinks(suggestedLinks: Record<string, string>): string {
    const entries = Object.entries(suggestedLinks);
    if (entries.length === 0) return "";
    const listed = entries.map(([id, linkUrl]) => `${id}: ${linkUrl}`).join(' | ');
    return ` | FINAL SUGGESTED LINKS (session closed; pass a full URL to crawlWebPage): [${listed}]`;
  }

  // 1. SEARCH WEB TOOL 🌎
  const searchWebTool = tool({
    name: "searchWeb",
    description: "Performs a global search, crawling the top 5 results (depth 1). Optionally use deepCrawl=true to also crawl the first internal link on each page (depth 2), providing more comprehensive coverage.",
    parameters: {
      query: z.string().describe("The complete search phrase or terms."),
      deepCrawl: z.boolean().optional().describe("Set to true to perform a secondary crawl on the first internal link found on each of the top 5 results (Depth 2).")
    },
    implementation: async ({ query, deepCrawl }: { query: string, deepCrawl?: boolean }) => {
      try {
        // Using DuckDuckGo for search results
        const ddgOrganicUrl = new URL("https://duckduckgo.com/html/");
        ddgOrganicUrl.searchParams.set("q", query);
        const response = await fetch(ddgOrganicUrl.toString());
        if (!response.ok) return "Search Failed: Network.";
        const html = (await response.text()).replace(/\s+/g, " ");
        const $ = cheerio.load(html);

        // Extract search result URLs. Result links may be redirect URLs that carry the
        // real target in the `uddg` query parameter; fall back to the raw href otherwise.
        const urlsToCrawl: string[] = [];
        $("#links .result").each((_i, el) => {
          const href = $(el).find(".result__title .result__a").attr("href");
          if (href) {
            const cleanUrl = new URL(href, "https://duckduckgo.com").searchParams.get('uddg') || href;
            urlsToCrawl.push(cleanUrl);
          }
        });

        const primaryUrls = urlsToCrawl
          .filter(url => normalizeAndFilterLink("https://duckduckgo.com/", url) !== null)
          .slice(0, SEARCH_CRAWL_LIMIT);
        if (primaryUrls.length === 0) return `No relevant results found after filtering.`;

        // Depth 1 Crawl (Primary Results). fetchAndSummarizeUrl reports failures
        // in-band, so Promise.all does not reject on a single bad page.
        const primaryResults = await Promise.all(
          primaryUrls.map(url => fetchAndSummarizeUrl(url, AUTO_CRAWL_CONTENT_LIMIT, deepCrawl))
        );
        const summaries = primaryResults.map(result => result.summary);

        // Depth 2 Crawl (Secondary Links)
        let deepCrawlSuccesses = 0;
        if (deepCrawl) {
          // Skip links already crawled at depth 1 and links shared by several results.
          const deepLinks = new Set<string>();
          for (const { deepLink } of primaryResults) {
            if (deepLink && !primaryUrls.includes(deepLink)) deepLinks.add(deepLink);
          }
          const secondaryResults = await Promise.all(
            [...deepLinks].map(async (link) => ({
              link,
              result: await fetchAndSummarizeUrl(link, AUTO_CRAWL_CONTENT_LIMIT, false),
            }))
          );
          for (const { link, result } of secondaryResults) {
            // Count pages that were actually crawled, not merely attempted.
            if (!result.summary.startsWith("[Crawl Failed")) deepCrawlSuccesses += 1;
            summaries.push(`[SECONDARY CRAWL from ${link}] ${result.summary}`);
          }
        }

        // Format the final output
        const formattedResults = summaries
          .map((summary, index) => `R${index + 1} | ${summary}`)
          .join('\n');
        let header = `Search and Auto-Crawl Success (Top ${primaryUrls.length} results, Links explicit and compressed)`;
        if (deepCrawl) {
          header += ` - DEEP CRAWL (Depth 2) completed for ${deepCrawlSuccesses} pages`;
        }
        return `${header}:\n${formattedResults}`;
      } catch (error: unknown) {
        return `Search Failed: Critical Error during auto-crawl. (${describeError(error)})`;
      }
    }
  });
  tools.push(searchWebTool);

  // 2. CRAWL WEB PAGE TOOL (Manual Deep Dive) 📜
  const crawlWebPageTool = tool({
    name: "crawlWebPage",
    description: "Fetches and extracts the main text content from a single URL. Returns the first compressed chunk, a list of suggested internal links (indexed for compression), and a Session ID if more content exists. Supports optional filterQuery to prioritize specific content.",
    parameters: {
      url: z.string().url().describe("The absolute URL of the webpage to crawl (e.g., https://example.com)."),
      filterQuery: z.string().optional().describe("An optional term (e.g., 'system requirements') to prioritize relevant paragraph content in the first chunk.")
    },
    implementation: async ({ url, filterQuery }: { url: string, filterQuery?: string }) => {
      try {
        const page = await fetchAndExtractFullContent(url, filterQuery);
        return openPageSession("Crawl Success: ", page);
      } catch (error: unknown) {
        return `Crawl Failed: Unexpected error: ${describeError(error)}`;
      }
    }
  });
  tools.push(crawlWebPageTool);

  // 3. FOLLOW SUGGESTED LINK 🧭 (NEW TOOL for Deeper Search)
  const followSuggestedLinkTool = tool({
    name: "followSuggestedLink",
    description: "Resolves a compressed link ID (e.g., 'L1') from a previous crawl session into its full URL and then crawls that new URL. REQUIRES both the Session ID and the Link ID.",
    parameters: {
      sessionId: z.string().describe("The unique Session ID returned by 'crawlWebPage' that contains the link index."),
      linkId: z.string().describe("The compressed link index (e.g., 'L1', 'L2') from the suggested links list.")
    },
    implementation: async ({ sessionId, linkId }: { sessionId: string, linkId: string }) => {
      const session = pageCache.get(sessionId);
      if (!session) {
        return `Follow Link Failed: Invalid or expired Session ID. Start a new crawl.`;
      }

      // Accept "l1" and "1" as well as "L1". The own-property check keeps inherited
      // names such as "constructor" from resolving to a non-URL value.
      const trimmedId = linkId.trim();
      const linkKey = /^\d+$/.test(trimmedId) ? `L${trimmedId}` : trimmedId.toUpperCase();
      const fullUrl = Object.prototype.hasOwnProperty.call(session.suggestedLinks, linkKey)
        ? session.suggestedLinks[linkKey]
        : undefined;
      if (!fullUrl) {
        return `Follow Link Failed: Link ID '${linkId}' not found in Session ID '${sessionId}'.`;
      }

      // Perform the crawl on the resolved URL
      try {
        const page = await fetchAndExtractFullContent(fullUrl);
        // Clean up the old session to manage memory, but only after the new page
        // loaded, so a failed fetch leaves the session usable for another link.
        pageCache.delete(sessionId);
        return openPageSession("Follow Success: Crawled ", page);
      } catch (error: unknown) {
        return `Follow Link Crawl Failed for ${fullUrl}: ${describeError(error)}`;
      }
    }
  });
  tools.push(followSuggestedLinkTool);

  // 4. SCROLL PAGE TOOL (Pagination) 🖱️
  const scrollPageTool = tool({
    name: "scrollPage",
    description: "Scrolls and returns the next chunk of content (which is compressed and explicit) from a previously crawled page session. Takes a Session ID from 'crawlWebPage' or 'followSuggestedLink'.",
    parameters: {
      sessionId: z.string().describe("The unique Session ID returned by 'crawlWebPage' or 'followSuggestedLink'."),
    },
    implementation: async ({ sessionId }: { sessionId: string }) => {
      const session = pageCache.get(sessionId);
      if (!session) {
        return `Scroll Failed: Invalid or expired Session ID. Start a new crawl.`;
      }
      const { fullText, currentOffset, suggestedLinks } = session;

      const nextChunk = fullText.substring(currentOffset, currentOffset + MAX_CHUNK_LENGTH);
      if (nextChunk.length === 0) {
        // Nothing left to read (e.g. the session was kept only for its links).
        pageCache.delete(sessionId);
        return `Scroll End: End of content reached. Session ID ${sessionId} deleted.${formatClosingLinks(suggestedLinks)}`;
      }

      const newOffset = currentOffset + nextChunk.length;
      const newRemaining = fullText.length - newOffset;
      session.currentOffset = newOffset;

      let response = `Scroll Success (Chunk ${Math.ceil(newOffset / MAX_CHUNK_LENGTH)}, HIGHLY COMPRESSED): ${nextChunk}`;
      if (newRemaining > 0) {
        response += ` | MORE CONTENT EXISTS (Remaining: ${newRemaining} chars). Use scrollPage again with ID ${sessionId}`;
      } else {
        // Last chunk: close the session and re-include the links for final reference.
        pageCache.delete(sessionId);
        response += ` | Scroll End: No more content. Session ID ${sessionId} deleted.${formatClosingLinks(suggestedLinks)}`;
      }
      return response;
    }
  });
  tools.push(scrollPageTool);

  return tools;
}
// --- 2. PLUGIN ENTRY POINT ---
/**
 * Plugin entry point: registers `toolsProvider` with the host through the given context.
 *
 * @param context Plugin context supplied by the host.
 */
export async function main(context: PluginContext): Promise<void> {
  // An async function already returns a promise; no explicit Promise.resolve() needed.
  context.withToolsProvider(toolsProvider);
}