// Project files:
//   src/
//     index.ts
//   .gitignore
//   LICENSE
//   manifest.json
//   package-lock.json
//   package.json
//   README.md
//   tsconfig.json
//
// File: src/index.ts
import { tool, Tool, ToolsProviderController, type PluginContext } from "@lmstudio/sdk";
import { z } from "zod";
import * as cheerio from "cheerio";
import { randomUUID } from "crypto";

// --- GLOBAL CONFIGURATION ---

/** Max length (in characters) for stored page content. */
const MAX_TEXT_LENGTH = 20000;

/** Size (in characters) of each "scroll" segment passed to the model. */
const MAX_CHUNK_LENGTH = 15000;

/** Max number of search results that are crawled automatically. */
const SEARCH_CRAWL_LIMIT = 5;

/** Amount of content (in characters) extracted from each auto-crawled page. */
const AUTO_CRAWL_CONTENT_LIMIT = 1000;

/** Max number of suggested links returned for manual following. */
const MAX_LINKS_TO_RETURN = 10;

// --- STATE MANAGEMENT: Cache for content that can be scrolled and suggested links ---

/** One cached crawl session. */
interface CachedPage {
    /** The full, compressed page text. */
    fullText: string;
    /** The current scroll offset into `fullText`. */
    currentOffset: number;
    /** Indexed links (L1, L2, etc.) mapped to their full URLs. */
    suggestedLinks: Record<string, string>;
}

/** Cached sessions, keyed by session ID. */
const pageCache: Map<string, CachedPage> = new Map();

// --- COMMON EXCLUSIONS FOR DEEPER SEARCH (Filtering Low-Value URLs) ---
// Patterns are matched against the lower-cased URL, once with and once without its query string,
// so the `$`-anchored patterns below still apply to URLs such as `/report.pdf?download=1`.
const COMMON_URL_EXCLUSIONS = [
    // 1. Media/Files
    /\.pdf$/, /\.jpg$/, /\.png$/, /\.gif$/, /\.svg$/,
    /\.zip$/, /\.rar$/, /\.tar$/, /\.gz$/, 
    /\.mp3$/, /\.mp4$/, /\.mov$/, /\.avi$/, 
    
    // 2. Forms/Auth/Social
    /\/login\/?$/, /\/register\/?$/, /\/forgot-password\/?$/,
    /:\/\/(www\.)?twitter\.com/, /:\/\/(www\.)?facebook\.com/, /:\/\/(www\.)?instagram\.com/,
    
    // 3. Query Parameter Traps
    /\?.*(sort|filter|order|limit|sessionid|phpsessid|cart|checkout|wishlist|compare|src)=/i,
];

// --- Helper Functions ---

/**
 * Normalizes a URL, makes it absolute, and filters it against common exclusions.
 *
 * @param baseUrl - Absolute URL that a relative `href` is resolved against.
 * @param href - Raw link target; may be relative, absolute, or missing.
 * @returns The absolute URL without its fragment, or `null` when the link is missing,
 *          unparsable, not http(s), or matches an entry of COMMON_URL_EXCLUSIONS.
 */
function normalizeAndFilterLink(baseUrl: string, href: string | undefined): string | null {
    if (!href) return null;

    let url: URL;
    try {
        url = new URL(href, baseUrl);
    } catch {
        // Malformed href or base: treat the link as unusable.
        return null;
    }

    // Only web pages can be crawled; this rejects mailto:, javascript:, tel:, etc.
    if (url.protocol !== "http:" && url.protocol !== "https:") return null;

    const normalizedUrl = url.href.replace(/#.*$/, "");

    // Test the URL with and without its query string: the file-extension and auth patterns are
    // anchored to the end of the string and would otherwise be bypassed by any query parameter.
    const lowered = normalizedUrl.toLowerCase();
    const queryStart = lowered.indexOf("?");
    const candidates = queryStart === -1 ? [lowered] : [lowered, lowered.slice(0, queryStart)];

    const excluded = COMMON_URL_EXCLUSIONS.some(pattern =>
        candidates.some(candidate => pattern.test(candidate))
    );

    return excluded ? null : normalizedUrl;
}


/**
 * Fetches a single URL and condenses it into a one-line summary.
 * Used exclusively by searchWeb for quick, summarized results.
 *
 * Boilerplate elements are stripped, anchors are rewritten as "text (short url)", and headings,
 * lists and tables are prepended as tagged segments in front of the page's main text.
 * This function does not throw: failures are reported inside `summary` as "[Crawl Failed: ...]".
 *
 * @param url - Absolute URL of the page to fetch.
 * @param contentLimit - Maximum number of characters of compressed content kept in the summary.
 * @param discoverLink - When true, also report the first same-site link found on the page.
 * @returns `summary` (title, URL and compressed content) and `deepLink` (the discovered
 *          same-site link, or `null` when none was requested or found).
 */
async function fetchAndSummarizeUrl(url: string, contentLimit: number, discoverLink: boolean = false): Promise<{ summary: string, deepLink: string | null }> {
    let deepLink: string | null = null;
    try {
        const httpResponse = await fetch(url);
        if (!httpResponse.ok) return { summary: `[Crawl Failed: Status ${httpResponse.status}]`, deepLink };

        const html = await httpResponse.text();
        const $ = cheerio.load(html);

        // --- STRUCTURAL COMPRESSION (Stricter Stripping) ---
        $("script, style, noscript, footer, nav, aside, header, form, .ad, .sidebar, .comments, #cookie-banner, [role='navigation'], .wp-block-group, blockquote, figcaption").remove();

        const title = $("title").text().trim() || "No title found";

        // --- LINK DISCOVERY for Deep Crawl (used by searchWeb's deepCrawl=true) ---
        if (discoverLink) {
            const page = new URL(url);
            const pageWithoutFragment = page.href.replace(/#.*$/, "");

            for (const el of $("a").toArray()) {
                const candidate = normalizeAndFilterLink(url, $(el).attr("href"));

                // Skip unusable links and links that point back at this very page
                // (e.g. href="#top"): following those would only re-crawl the same content.
                if (!candidate || candidate === pageWithoutFragment) continue;

                // Accept the first link on the same host or one of its subdomains.
                const host = new URL(candidate).hostname;
                if (host === page.hostname || host.endsWith(`.${page.hostname}`)) {
                    deepLink = candidate;
                    break;
                }
            }
        }

        // --- CONTENT COMPRESSION (Explicit Links - Highly Compressed for Search) ---
        // Rewrites each anchor's text in place. `.text(value)` inserts a plain text node, whereas
        // `.replaceWith(string)` would re-parse the already-decoded text as HTML (turning "&copy="
        // in a URL into "©=", or "<" in a label into markup).
        for (const el of $("a").toArray()) {
            const anchor = $(el);
            const linkText = anchor.text().trim();
            const linkUrl = anchor.attr("href");

            if (!linkUrl) {
                // No target to show: keep any visible text as ordinary content, drop empty anchors.
                if (!linkText) anchor.remove();
                continue;
            }

            // Use a short form of the URL for maximum compression in search summary
            const shortUrl = linkUrl.length > 30 ? linkUrl.substring(0, 30) + "..." : linkUrl;
            anchor.text(linkText ? `${linkText} (${shortUrl})` : `Link: (${shortUrl})`);
        }

        // --- STRUCTURAL PRIORITIZATION ---
        let structuredContent = "";

        // Headers
        $("h1, h2, h3").each((_i, el) => {
            structuredContent += `[H${el.tagName.charAt(1)}]: ${$(el).text().trim()} | `;
        });

        // Lists (Condensed)
        $("ul, ol").each((_i, el) => {
            const listItems = $(el).find("li").map((_j, li) => $(li).text().trim()).get().join("; ");
            if (listItems.length > 0) {
                structuredContent += `[LIST]: ${listItems} | `;
            }
        });

        // Tables (Summarized): header cells plus the first body row only
        $("table").each((_i, el) => {
            const headers = $(el).find("thead th").map((_j, th) => $(th).text().trim()).get().join(" / ");
            const firstRow = $(el).find("tbody tr").eq(0).find("td").map((_j, td) => $(td).text().trim()).get().join(" / ");
            if (headers.length > 0) {
                structuredContent += `[TABLE Headers]: ${headers} [First Row]: ${firstRow} | `;
            }
        });

        // Prefer the main/article region; fall back to the whole body.
        const cleanedText = $("main, article").text().trim() || $("body").text().trim();

        // --- CHARACTER COMPRESSION (Whitespace Minimization) ---
        const fullCleanedText = (structuredContent + cleanedText)
            .replace(/\s+/g, " ") // Reduce all whitespace to a single space
            .replace(/[.,;:]\s*[.,;:]+/g, (match) => match[0] + " ") // Clean up double punctuation
            .trim();

        let contentSummary = fullCleanedText.substring(0, contentLimit);
        if (fullCleanedText.length > contentLimit) {
            contentSummary += " [Content Truncated]";
        }

        const summary = `T: ${title} | U: ${url} | C: ${contentSummary}`;

        return { summary, deepLink };

    } catch (error: unknown) {
        // Best-effort crawl: surface the reason instead of throwing so one bad page
        // cannot fail the whole search.
        const reason = error instanceof Error ? error.message : String(error);
        return { summary: `[Crawl Failed: Unexpected Error: ${reason}]`, deepLink };
    }
}


/**
 * Fetches a single URL, strips boilerplate, extracts structured data and compresses the content,
 * returning the full text for cache storage (used by the crawlWebPage and followSuggestedLink tools).
 *
 * @param url - Absolute URL of the page to fetch.
 * @param filterQuery - Optional term; sentences containing it (case-insensitive) are copied to
 *                      the front of the text, up to roughly 1000 characters.
 * @returns `fullText` (at most MAX_TEXT_LENGTH characters), the page `title`, and
 *          `rawDiscoveredLinks`: up to MAX_LINKS_TO_RETURN same-site links, keyed by a unique
 *          display label (the link text, suffixed with " (2)", " (3)", ... when repeated).
 * @throws Error with message "Status <code>" when the response is not OK; errors raised by
 *         `fetch` itself propagate unchanged.
 */
async function fetchAndExtractFullContent(url: string, filterQuery?: string): Promise<{ fullText: string, title: string, rawDiscoveredLinks: Map<string, string> }> {
    const httpResponse = await fetch(url);
    if (!httpResponse.ok) throw new Error(`Status ${httpResponse.status}`);

    const html = await httpResponse.text();
    const $ = cheerio.load(html);

    // Apply Structural Compression filters
    $("script, style, noscript, footer, nav, aside, header, form, .ad, .sidebar, .comments, #cookie-banner, [role='navigation'], .wp-block-group, blockquote, figcaption").remove();

    const title = $("title").text().trim() || "No title";

    // --- Link Collection for Further Crawling ---
    const rawDiscoveredLinks = new Map<string, string>(); // Unique label -> Full URL
    const seenUrls = new Set<string>();
    const page = new URL(url); // parsed once instead of once per anchor
    const pageWithoutFragment = page.href.replace(/#.*$/, "");

    for (const el of $("a").toArray()) {
        if (rawDiscoveredLinks.size >= MAX_LINKS_TO_RETURN) break;

        const normalizedLink = normalizeAndFilterLink(url, $(el).attr("href"));

        // Skip unusable links, links back to this page, and targets already collected,
        // so the limited number of slots is not wasted.
        if (!normalizedLink || normalizedLink === pageWithoutFragment || seenUrls.has(normalizedLink)) continue;

        // Check for same-domain or relevant subdomain
        const linkHost = new URL(normalizedLink).hostname;
        if (linkHost !== page.hostname && !linkHost.endsWith(`.${page.hostname}`)) continue;

        // Labels are the map keys, so a repeated label ("Read more", or the "Link" fallback)
        // must be made unique or it would overwrite the earlier link.
        const baseLabel = $(el).text().trim() || "Link";
        let label = baseLabel;
        for (let n = 2; rawDiscoveredLinks.has(label); n += 1) {
            label = `${baseLabel} (${n})`;
        }

        seenUrls.add(normalizedLink);
        rawDiscoveredLinks.set(label, normalizedLink);
    }

    // --- Content Processing (Structural) ---
    let structuredContent = "";

    $("h1, h2, h3").each((_i, el) => {
        structuredContent += `[H${el.tagName.charAt(1)}]: ${$(el).text().trim()} | `;
    });
    $("ul, ol").each((_i, el) => {
        const listItems = $(el).find("li").map((_j, li) => $(li).text().trim()).get().join("; ");
        if (listItems.length > 0) {
            structuredContent += `[LIST]: ${listItems} | `;
        }
    });
    $("table").each((_i, el) => {
        const headers = $(el).find("thead th").map((_j, th) => $(th).text().trim()).get().join(" / ");
        const firstRow = $(el).find("tbody tr").eq(0).find("td").map((_j, td) => $(td).text().trim()).get().join(" / ");
        if (headers.length > 0) {
            structuredContent += `[TABLE Headers]: ${headers} [First Row]: ${firstRow} | `;
        }
    });

    // --- LINK COMPRESSION FOR CHUNK CONTENT ---
    // Reduces every anchor to a short "[Link: text]" tag, or removes it when it has no text.
    // `.text(value)` inserts a plain text node; `.replaceWith(string)` would re-parse the
    // already-decoded link text as HTML.
    for (const el of $("a").toArray()) {
        const anchor = $(el);
        const linkText = anchor.text().trim();
        if (linkText) {
            anchor.text(`[Link: ${linkText}]`);
        } else {
            anchor.remove();
        }
    }

    // Prefer the main/article region; fall back to the whole body.
    const cleanedText = $("main, article").text().trim() || $("body").text().trim();

    // Aggressive Whitespace and Punctuation Cleanup
    let fullText = (structuredContent + cleanedText)
        .replace(/\s+/g, " ")
        .replace(/[.,;:]\s*[.,;:]+/g, (match) => match[0] + " ")
        .trim();

    // --- Query-based Prioritization (Lossy Semantic Compression) ---
    const query = filterQuery?.trim();
    if (query) {
        const needle = query.toLowerCase();
        let bestSentences = "";

        // Sentences are approximated by splitting on ".", "?" and "!".
        for (const sentence of fullText.split(/[.?!]/)) {
            if (sentence.toLowerCase().includes(needle)) {
                bestSentences += sentence.trim() + ". ";
                if (bestSentences.length > 1000) break;
            }
        }

        if (bestSentences.length > 0) {
            fullText = `[QUERY MATCH: ${query}] ${bestSentences.trim()} | ${fullText}`;
        }
    }

    if (fullText.length > MAX_TEXT_LENGTH) {
        fullText = fullText.substring(0, MAX_TEXT_LENGTH);
    }

    return { fullText, title, rawDiscoveredLinks };
}


// --- 1. TOOL DEFINITION FUNCTION ---

/** Upper bound on simultaneously cached page sessions; the oldest session is evicted first. */
const MAX_CACHED_SESSIONS = 50;

/** Shape of a crawled page as returned by fetchAndExtractFullContent. */
type ExtractedPage = { fullText: string, title: string, rawDiscoveredLinks: Map<string, string> };

/** Turns any thrown value into a printable message. */
function describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
}

/**
 * Builds the tool response for a freshly crawled page and, when the page can still be scrolled
 * or has links to follow, stores it in pageCache under a new session ID.
 * Shared by crawlWebPage and followSuggestedLink so both produce the same output format.
 *
 * @param successPrefix - Text placed in front of the page title (e.g. "Crawl Success: ").
 * @param page - The extracted page content and its discovered links.
 * @returns The message handed back to the model.
 */
function openPageSession(successPrefix: string, page: ExtractedPage): string {
    const { fullText, title, rawDiscoveredLinks } = page;
    const sessionId = randomUUID();

    // Prepare link index for compressed output, while storing full URLs
    const suggestedLinks: Record<string, string> = {};
    const suggestedLinksOutput: string[] = [];
    for (const [text, linkUrl] of rawDiscoveredLinks) {
        const id = `L${suggestedLinksOutput.length + 1}`;
        suggestedLinks[id] = linkUrl;
        suggestedLinksOutput.push(`${id}: ${text} (${new URL(linkUrl).hostname})`);
    }

    const firstChunk = fullText.substring(0, MAX_CHUNK_LENGTH);
    const remainingLength = fullText.length - firstChunk.length;
    const hasLinks = suggestedLinksOutput.length > 0;

    // Only keep a session when scrolling or link-following is actually possible.
    if (remainingLength > 0 || hasLinks) {
        // Map iterates in insertion order, so the first key is the oldest session.
        // Evicting it keeps the cache from growing without bound in a long-running process.
        while (pageCache.size >= MAX_CACHED_SESSIONS) {
            const oldest = pageCache.keys().next();
            if (oldest.done) break;
            pageCache.delete(oldest.value);
        }
        pageCache.set(sessionId, { fullText, currentOffset: firstChunk.length, suggestedLinks });
    }

    let outputMessage = `${successPrefix}${title} | First Chunk (HIGHLY COMPRESSED, ${firstChunk.length} chars): ${firstChunk}`;

    if (remainingLength > 0) {
        outputMessage += ` | MORE CONTENT EXISTS (Remaining: ${remainingLength} chars). Use scrollPage with ID ${sessionId}`;
    }

    if (hasLinks) {
        outputMessage += ` | ACTION: Use followSuggestedLink(sessionId='${sessionId}', linkId='L#') to follow a link.`;
    }

    const linksOutput = hasLinks
        ? ` | Suggested Links (Indexed): [${suggestedLinksOutput.join(' | ')}]`
        : " | No relevant links suggested.";

    return outputMessage + linksOutput;
}

/**
 * Builds the four tools exposed by this plugin: searchWeb, crawlWebPage, followSuggestedLink
 * and scrollPage. Tool implementations report failures as strings instead of throwing.
 *
 * @param ctl - Controller supplied by the host; not used by this provider.
 * @returns The list of tool definitions.
 */
async function toolsProvider(ctl: ToolsProviderController): Promise<Tool[]> {
    const tools: Tool[] = [];

    // 1. SEARCH WEB TOOL 🌎
    const searchWebTool = tool({
        name: "searchWeb",
        description: "Performs a global search, crawling the top 5 results (depth 1). Optionally use deepCrawl=true to also crawl the first internal link on each page (depth 2), providing more comprehensive coverage.",
        parameters: {
            query: z.string().describe("The complete search phrase or terms."),
            deepCrawl: z.boolean().optional().describe("Set to true to perform a secondary crawl on the first internal link found on each of the top 5 results (Depth 2).")
        },

        implementation: async ({ query, deepCrawl }: { query: string, deepCrawl?: boolean }) => {
            try {
                // Using DuckDuckGo for search results
                const ddgOrganicUrl = new URL("https://duckduckgo.com/html/");
                ddgOrganicUrl.searchParams.set("q", query);
                const response = await fetch(ddgOrganicUrl.toString());
                if (!response.ok) return "Search Failed: Network.";

                const $ = cheerio.load(await response.text());

                // Extract, unwrap, filter and de-duplicate search result URLs
                const primaryUrls: string[] = [];
                for (const el of $("#links .result").toArray()) {
                    if (primaryUrls.length >= SEARCH_CRAWL_LIMIT) break;

                    const href = $(el).find(".result__title .result__a").attr("href");
                    if (!href) continue;

                    // Result links may be redirect URLs carrying the real target in `uddg`.
                    // A single unparsable href must not abort the whole search.
                    let target: string;
                    try {
                        target = new URL(href, "https://duckduckgo.com").searchParams.get("uddg") || href;
                    } catch {
                        continue;
                    }

                    // Crawl the normalized absolute URL so relative or protocol-relative
                    // results are never passed to fetch as-is.
                    const absoluteUrl = normalizeAndFilterLink("https://duckduckgo.com/", target);
                    if (absoluteUrl && !primaryUrls.includes(absoluteUrl)) {
                        primaryUrls.push(absoluteUrl);
                    }
                }

                if (primaryUrls.length === 0) return `No relevant results found after filtering.`;

                // Depth 1 Crawl (Primary Results)
                const primaryResults = await Promise.all(
                    primaryUrls.map(url => fetchAndSummarizeUrl(url, AUTO_CRAWL_CONTENT_LIMIT, deepCrawl))
                );

                const allResults = [...primaryResults];
                let deepCrawlCount = 0;

                // Depth 2 Crawl (Secondary Links)
                if (deepCrawl) {
                    const deepLinks = primaryResults
                        .map(res => res.deepLink)
                        .filter((link): link is string => Boolean(link));
                    deepCrawlCount = deepLinks.length;

                    const secondaryResults = await Promise.all(deepLinks.map(async link => {
                        const deepRes = await fetchAndSummarizeUrl(link, AUTO_CRAWL_CONTENT_LIMIT, false);
                        return { summary: `[SECONDARY CRAWL from ${link}] ${deepRes.summary}`, deepLink: null };
                    }));
                    allResults.push(...secondaryResults);
                }

                // Format the final output
                const formattedResults = allResults
                    .map((result, index) => `R${index + 1} | ${result.summary}`)
                    .join('\n');

                let header = `Search and Auto-Crawl Success (Top ${primaryUrls.length} results, Links explicit and compressed)`;
                if (deepCrawl) {
                    header += ` - DEEP CRAWL (Depth 2) completed for ${deepCrawlCount} pages`;
                }

                return `${header}:\n${formattedResults}`;

            } catch (error: unknown) {
                return `Search Failed: Critical Error during auto-crawl. (${describeError(error)})`;
            }
        }
    });
    tools.push(searchWebTool);

    // 2. CRAWL WEB PAGE TOOL (Manual Deep Dive) 📜
    const crawlWebPageTool = tool({
        name: "crawlWebPage",
        description: "Fetches and extracts the main text content from a single URL. Returns the first compressed chunk, a list of suggested internal links (indexed for compression), and a Session ID if more content exists. Supports optional filterQuery to prioritize specific content.",
        parameters: {
            url: z.string().url().describe("The absolute URL of the webpage to crawl (e.g., https://example.com)."),
            filterQuery: z.string().optional().describe("An optional term (e.g., 'system requirements') to prioritize relevant paragraph content in the first chunk.")
        },
        implementation: async ({ url, filterQuery }: { url: string, filterQuery?: string }) => {
            try {
                const page = await fetchAndExtractFullContent(url, filterQuery);
                return openPageSession("Crawl Success: ", page);
            } catch (error: unknown) {
                return `Crawl Failed: Unexpected error: ${describeError(error)}`;
            }
        }
    });
    tools.push(crawlWebPageTool);

    // 3. FOLLOW SUGGESTED LINK 🧭 (NEW TOOL for Deeper Search)
    const followSuggestedLinkTool = tool({
        name: "followSuggestedLink",
        description: "Resolves a compressed link ID (e.g., 'L1') from a previous crawl session into its full URL and then crawls that new URL. REQUIRES both the Session ID and the Link ID.",
        parameters: {
            sessionId: z.string().describe("The unique Session ID returned by 'crawlWebPage' that contains the link index."),
            linkId: z.string().describe("The compressed link index (e.g., 'L1', 'L2') from the suggested links list.")
        },
        implementation: async ({ sessionId, linkId }: { sessionId: string, linkId: string }) => {
            const session = pageCache.get(sessionId);

            if (!session) {
                return `Follow Link Failed: Invalid or expired Session ID. Start a new crawl.`;
            }

            const fullUrl = session.suggestedLinks[linkId];

            if (!fullUrl) {
                return `Follow Link Failed: Link ID '${linkId}' not found in Session ID '${sessionId}'.`;
            }

            // Crawl first and only retire the old session once the new page is in hand,
            // so a failed fetch does not strand the caller without any usable session.
            let page: ExtractedPage;
            try {
                page = await fetchAndExtractFullContent(fullUrl);
            } catch (error: unknown) {
                return `Follow Link Crawl Failed for ${fullUrl}: ${describeError(error)} | Session ID ${sessionId} is still valid.`;
            }

            // Clean up old session to manage memory
            pageCache.delete(sessionId);

            return openPageSession("Follow Success: Crawled ", page);
        }
    });
    tools.push(followSuggestedLinkTool);


    // 4. SCROLL PAGE TOOL (Pagination) 🖱️
    const scrollPageTool = tool({
        name: "scrollPage",
        description: "Scrolls and returns the next chunk of content (which is compressed and explicit) from a previously crawled page session. Takes a Session ID from 'crawlWebPage' or 'followSuggestedLink'.",
        parameters: {
            sessionId: z.string().describe("The unique Session ID returned by 'crawlWebPage' or 'followSuggestedLink'."),
        },
        implementation: async ({ sessionId }: { sessionId: string }) => {
            const session = pageCache.get(sessionId);

            if (!session) {
                return `Scroll Failed: Invalid or expired Session ID. Start a new crawl.`;
            }

            const { fullText, currentOffset, suggestedLinks } = session;
            const nextChunk = fullText.substring(currentOffset, currentOffset + MAX_CHUNK_LENGTH);

            if (nextChunk.length === 0) {
                pageCache.delete(sessionId);
                return `Scroll End: End of content reached. Session ID ${sessionId} deleted.`;
            }

            const newOffset = currentOffset + nextChunk.length;
            const newRemaining = fullText.length - newOffset;

            session.currentOffset = newOffset;

            let response = `Scroll Success (Chunk ${Math.ceil(newOffset / MAX_CHUNK_LENGTH)}, HIGHLY COMPRESSED): ${nextChunk}`;

            if (newRemaining > 0) {
                response += ` | MORE CONTENT EXISTS (Remaining: ${newRemaining} chars). Use scrollPage again with ID ${sessionId}`;
            } else {
                // The session is deleted below, so link IDs can no longer be followed and the
                // cache never stored the link text. Emit the full URLs so they stay usable.
                let linksOutput = "";
                const linkEntries = Object.entries(suggestedLinks);
                if (linkEntries.length > 0) {
                    const finalLinks = linkEntries.map(([id, linkUrl]) => `${id}: ${linkUrl}`);
                    linksOutput = ` | FINAL SUGGESTED LINKS (session closed; use crawlWebPage with the full URL): [${finalLinks.join(' | ')}]`;
                }

                pageCache.delete(sessionId);
                response += ` | Scroll End: No more content. Session ID ${sessionId} deleted. ${linksOutput}`;
            }

            return response;
        }
    });
    tools.push(scrollPageTool);


    return tools;
}

// --- 2. PLUGIN ENTRY POINT ---
/**
 * Plugin entry point: registers `toolsProvider` on the given plugin context.
 *
 * @param context - Plugin context supplied by the host.
 * @returns A promise that resolves once the provider has been registered.
 */
export async function main(context: PluginContext): Promise<void> {
    // An async function already returns a promise; no explicit Promise.resolve() is needed.
    context.withToolsProvider(toolsProvider);
}
// crawler