Project Files
src
config.ts
index.ts
toolsProvider.ts
.gitattributes
.gitignore
bun.lock
eslint.config.mjs
LICENSE
manifest.json
package.json
README.md
tsconfig.json
tsconfig.tsbuildinfo
src / toolsProvider.ts
import { tool, Tool, ToolCallContext, ToolsProviderController } from "@lmstudio/sdk";
import { z } from "zod";
import * as cheerio from "cheerio";
import { fetchTranscript } from "youtube-transcript-plus";
import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";
import TurndownService from "turndown";
import { configSchematics } from "./config";
import { dedent } from 'ts-dedent';

/** One search hit as returned to the model: page title, target URL and result snippet. */
type SearchResult = { title: string; url: string; snippet: string };

// Shared HTML -> Markdown converter: ATX (`#`) headings, fenced code blocks,
// `-` bullets and inline `[text](url)` links.
const td = new TurndownService({
	headingStyle: 'atx',
	codeBlockStyle: 'fenced',
	bulletListMarker: '-',
	linkStyle: 'inlined',
});
// Register <img> and <figure> elements for removal from the converted output.
td.remove(['img', 'figure']);

/**
 * Runs Readability over an HTML page and converts the extracted article to Markdown.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL the HTML was loaded from; passed to JSDOM as the document URL.
 * @returns The article title (or `'Untitled'`) and its Markdown body. The body is an
 *   empty string when Readability cannot parse an article.
 */
function extractContent(html: string, url: string): { title: string, content: string } {
	const pageDocument = new JSDOM(html, { url }).window.document;
	const parsed = new Readability(pageDocument).parse();
	if (!parsed) {
		return { title: 'Untitled', content: '' };
	}

	// Unwrap every <a>: keep its children in place, drop the link element itself.
	const scratchDocument = new JSDOM(parsed.content || undefined).window.document;
	for (const anchor of scratchDocument.querySelectorAll('a')) {
		anchor.replaceWith(...anchor.childNodes);
	}

	const markdown = td.turndown(scratchDocument.body.innerHTML);
	const collapsed = markdown.replace(/\n{3,}/g, '\n\n');
	return {
		title: parsed.title ?? 'Untitled',
		content: collapsed.trim(),
	};
}

// Lazily-loaded `gotScraping` function; stays null until the first fetchPage call.
let gotScrapingInstance: typeof import('got-scraping').gotScraping | null = null;

/**
 * Downloads a page body with got-scraping.
 *
 * @param url - Address to request.
 * @param signal - Abort signal forwarded to the request.
 * @returns The response body.
 */
async function fetchPage(url: string, signal: AbortSignal): Promise<string> {
	if (gotScrapingInstance == null) {
		// Dynamic import so the module is only loaded when a page is actually fetched.
		const gotScrapingModule = await import('got-scraping');
		gotScrapingInstance = gotScrapingModule.gotScraping;
	}
	const requestOptions = {
		url,
		signal,
		timeout: { request: 30000 },
	};
	const { body } = await gotScrapingInstance(requestOptions);
	return body as string;
}

/**
 * Builds the tools exposed by this plugin: `web_search`, `extract_web_page_contents`
 * and `extract_video_transcript`.
 *
 * The rate limiters and the search cache are created inside this call, so they are
 * shared by the returned tools through the closure.
 *
 * @param ctl - Controller used to read the plugin configuration (`pageSize`,
 *   `webSearchEngineCallRetries`, `contentLimit`).
 * @returns The tools, in registration order.
 */
export async function toolsProvider(ctl: ToolsProviderController): Promise<Tool[]> {
	const tools: Tool[] = [];

	// Creates an async gate: each call waits until `interval` ms have elapsed since
	// the previous call finished waiting.
	const makeRateLimiter = (interval: number) => {
		let lastRequestTimestamp = 0;
		return async () => {
			const now = Date.now();
			const waitMs = interval - (now - lastRequestTimestamp);
			if (waitMs > 0) await new Promise(resolve => setTimeout(resolve, waitMs));
			lastRequestTimestamp = Date.now();
		};
	};

	const waitIfNeededSearch = makeRateLimiter(5000);
	const waitIfNeededJina = makeRateLimiter(1000);

	// In-memory search result cache (session-scoped, LRU)
	const searchCache = new Map<string, { results: SearchResult[], timestamp: number }>();
	const CACHE_TTL = 5 * 60 * 1000;
	const MAX_CACHE_SIZE = 100;

	// Drops expired entries, then the least recently used ones until there is room for one more.
	const evictCacheIfNeeded = () => {
		const now = Date.now();
		for (const [key, entry] of searchCache) {
			if (now - entry.timestamp > CACHE_TTL) searchCache.delete(key);
		}
		while (searchCache.size >= MAX_CACHE_SIZE) {
			const oldest = searchCache.keys().next().value;
			// Compare against undefined: an empty-string key is falsy but still a real entry.
			if (oldest === undefined) break;
			searchCache.delete(oldest);
		}
	};

	// Returns the cached results for `query` (at most `pageSize` of them), or null on a miss/expiry.
	const getCachedResults = (query: string, pageSize: number): { results: SearchResult[] } | null => {
		const cached = searchCache.get(query);
		if (!cached) return null;
		if (Date.now() - cached.timestamp > CACHE_TTL) {
			searchCache.delete(query);
			return null;
		}
		// Move to end for LRU (Map iteration order is insertion order)
		searchCache.delete(query);
		searchCache.set(query, cached);
		return { results: cached.results.slice(0, pageSize) };
	};

	const setCachedResults = (query: string, results: SearchResult[]) => {
		evictCacheIfNeeded();
		searchCache.set(query, { results, timestamp: Date.now() });
	};

	// Fetches `targetUrl` through the Jina reader proxy, honouring the Jina rate limit,
	// the caller's abort signal and a 30 s timeout.
	const fetchViaJina = async (targetUrl: string, signal: AbortSignal): Promise<Response> => {
		await waitIfNeededJina();
		return fetch(`https://r.jina.ai/${targetUrl}`, {
			method: "GET",
			signal: AbortSignal.any([signal, AbortSignal.timeout(30000)]),
		});
	};

	// Search: DuckDuckGo only. Scrapes the HTML endpoint and returns de-duplicated, non-ad results.
	const searchDuckDuckGo = async (query: string, pageSize: number, signal: AbortSignal): Promise<SearchResult[]> => {
		const url = new URL("https://html.duckduckgo.com/html/");
		url.searchParams.append("q", query);

		const html = await fetchPage(url.toString(), signal);
		const $ = cheerio.load(html);
		const results: SearchResult[] = [];

		$('.result__body').each((_, el) => {
			// Returning false stops the iteration once enough results were collected.
			if (pageSize > 0 && results.length >= pageSize) return false;
			const $resultItem = $(el).closest('.result');
			const classes = $resultItem.attr('class') || '';
			if (/\bresult--ad\b|\bresult--sponsored\b|\bad\b/i.test(classes)) return;
			if (/^\s*Ad\b|^\s*Sponsored\b/i.test($resultItem.text() || '')) return;
			const $anchor = $(el).find('a.result__a');
			if (!$anchor.length) return;
			let href = $anchor.attr('href') || '';
			// A result without a target URL cannot be visited.
			if (!href) return;

			// Skip ad tracking redirects
			if (/duckduckgo\.com\/(aclick|y\.js)/i.test(href)) return;

			// Extract real URL from DDG redirect
			const uddgMatch = href.match(/[?&]uddg=([^&]+)/);
			if (uddgMatch) {
				try {
					href = decodeURIComponent(uddgMatch[1]);
				} catch {
					// Malformed percent-encoding: drop this result instead of failing the whole search.
					return;
				}
			} else if (href.startsWith('//')) {
				// Protocol-relative link: make it absolute.
				href = 'https:' + href;
			}

			const title = ($anchor.text() || '').replace(/\s+/g, ' ').trim();
			const snippet = $(el).find('.result__snippet').text().replace(/\s+/g, ' ').trim();
			if (!results.some(r => r.url === href)) results.push({ title, url: href, snippet });
		});

		return results;
	};

	const webSearchTool = tool({
		name: "web_search",
		description: dedent(`
			web_search - a tool to search the internet.
			
			When requested to search a web site or internet, do not answer right away from internal knowledge; you must follow the following steps in order:
			1. **Initial Query**: Formulate a search query based on the user input.
			2. **Search**: Call the \`web_search\` tool, and rank result snippets by relevance to the user's input from 1 to 5.
			3. **Resolution Logic**:
			    - **Sufficient Results**: If there are more than 3 pages with **rating 5**, call \`extract_web_page_contents\` tool for each relevant result, and answer based on extracted data.
			    - **Query Refinement**: If the query has not been refined, identify specific terms, keywords, or better search angles, and repeat the steps with refined query starting from **Search**.
			    - **Request User Clarification**: Refine only once, then STOP SEARCH.
		`),
		parameters: {
			query: z.string().describe(dedent(`
				The search query - when constructing search query, use the following syntax:
				- **Semantic Search**: \`~"cats and dogs"\` (Use tilde + quotes for similar concepts)
				- **Exact Phrase**: \`"cats and dogs"\` (Use quotes for specific phrases)
				- **Exclude Keyword**: \`cats -dogs\` (Use minus to remove a keyword)
				- **Focus on Keyword**: \`cats +dogs\` (Use plus to focus on a specific keyword)
				- **Filetype Search**: \`cats filetype:pdf\` (Supported: pdf, doc(x), xls(x), ppt(x), html)
				- **Specific Site**: \`site:example.com cats\` (e.g., \`site:arxiv.org\` or \`site:stackexchange.com\`)
				- **Exclude Site**: \`cats -site:example.com\` (Search pages about cats, excluding example.com)
				- **Page Title**: \`intitle:dogs\` (Search pages where the title includes specific phrase)
				- **Page URL**: \`inurl:cats\` (Search pages where the URL includes specific phrase)
			`)),
		},
		implementation: async ({ query }, { status, warn, signal }) => {
			try {
				// A configured page size of 0 means "auto", which falls back to 5.
				const pageSize = undefinedIfAuto(ctl.getPluginConfig(configSchematics).get("pageSize"), 0) ?? 5;

				// Check cache first (no rate limit for cache hits)
				const cached = getCachedResults(query, pageSize);
				if (cached) {
					status(`Returning cached results for "${query}".`);
					return { results: cached.results, count: cached.results.length, cached: true };
				}

				let results: SearchResult[] = [];
				// Always try at least once, even if the configured retry count is 0.
				const maxAttempts = Math.max(1, ctl.getPluginConfig(configSchematics).get("webSearchEngineCallRetries"));
				// Retry only while nothing was found; stop as soon as any attempt yields results.
				for (let attempt = 1; results.length === 0 && attempt <= maxAttempts; attempt++) {
					const attemptsMsg = attempt > 1 ? ` ${attempt}/${maxAttempts}` : "";
					status(`Search${attemptsMsg} for: "${query}"`);
					try {
						await waitIfNeededSearch();
						results = await searchDuckDuckGo(query, pageSize, signal);
					} catch (err: unknown) {
						// A cancelled call must not be retried; let the outer handler report it.
						if (signal.aborted) throw err;
						if (attempt >= maxAttempts) {
							warn(`Web search engine error: ${err instanceof Error ? err.message : err}`);
							return `No results. Stop.`;
						}
					}
				}
				if (results.length === 0) {
					return `No results.`;
				}

				setCachedResults(query, results);
				status(`Found ${results.length} results.`);
				return { results, count: results.length };
			} catch (error: unknown) {
				// Check the signal as well: not every HTTP client rejects with a DOMException on abort.
				if (signal.aborted || (error instanceof DOMException && error.name === "AbortError")) {
					return "Search was cancelled.";
				}
				const msg = error instanceof Error ? error.message : 'Unknown error';
				console.error(error);
				warn(`Search failed: ${msg}`);
				return `Error: ${msg}`;
			}
		},
	});

	// Shared implementation of both extraction tools: YouTube URLs yield a transcript,
	// PDF-looking URLs go through Jina, everything else tries a direct fetch and then Jina.
	// Returns `{ url, title, content, ... }` on success or an error/cancellation string.
	const extractWebPageContent = async ({ url: originalUrl }: {readonly url: string}, { status, warn, signal }: ToolCallContext) => {
		let url = originalUrl;

		// De-AMP - AMP pages are always worse than the original
		url = url.replace(/\/amp\/?$/, '');
		// Remove the `amp=1` query parameter while keeping the rest of the query string intact
		// (`?amp=1&x=2` -> `?x=2`); `amp=10` and similar are left alone.
		url = url.replace(/([?&])amp=1(&|(?=#)|$)/, (_m, lead: string, trail: string) => (trail === '&' ? lead : ''));
		const ampMatch = url.match(/google\.com\/amp\/s\/(.+)/);
		if (ampMatch) url = 'https://' + ampMatch[1];

		// URL transformations for better content extraction
		url = url.replace(/arxiv\.org\/abs\//, 'arxiv.org/pdf/');
		const isMedium = /(?:www\.)?medium\.com/.test(url);
		url = url.replace(/(?:www\.)?medium\.com/, 'scribe.rip');
		url = url.replace(/(?:www\.)?reddit\.com/, 'old.reddit.com');

		const shortUrl = url.length > 50 ? url.slice(0, 47) + '...' : url;
		status(`Fetching content from: ${shortUrl}`);

		try {
			// A configured content limit of -1 means "auto", which falls back to 8000 characters.
			const contentLimit = undefinedIfAuto(ctl.getPluginConfig(configSchematics).get("contentLimit"), -1) ?? 8000;
			// Deliberately loose: also matches extension-less PDF URLs such as arxiv.org/pdf/<id>.
			const isPdf = /pdf/i.test(url);

			// Handle YouTube URLs - fetch transcript instead
			const ytMatch = url.match(/(?:youtube\.com\/watch\?.*v=|youtu\.be\/)([\w-]+)/);
			if (ytMatch) {
				status(`Fetching YouTube transcript for: ${ytMatch[1]}`);
				try {
					const transcript = await fetchTranscript(url, {"videoDetails": true});
					const text = transcript.segments.map(t => t.text).join(' ').trim();
					const content = smartTruncate(text, contentLimit);
					status(`Retrieved YouTube transcript (${content.length} chars)`);
					return {
						url,
						title: transcript.videoDetails.title,
						description: transcript.videoDetails.description,
						keywords: transcript.videoDetails.keywords,
						content,
					};
				} catch (ytErr: unknown) {
					const msg = ytErr instanceof Error ? ytErr.message : 'unknown';
					warn(`YouTube transcript unavailable: ${msg}`);
					status('Falling back to Jina for YouTube page (content may be limited)');
				}
			}

			// PDFs always use Jina
			if (isPdf) {
				const jinaResponse = await fetchViaJina(url, signal);
				if (jinaResponse.ok) {
					const raw = await jinaResponse.text();
					const content = smartTruncate(raw, contentLimit);
					status(`Retrieved PDF (${content.length} chars)`);
					return { url, title: 'PDF Document', content };
				}
				return `Error: Could not fetch PDF from ${url}`;
			}

			// Helper: fetch via Jina. Returns null when the response is unusable.
			const tryJina = async (): Promise<{ title: string, content: string } | null> => {
				const jinaResponse = await fetchViaJina(url, signal);
				// Any non-2xx status (including 403/451 blocks) counts as a failure.
				if (!jinaResponse.ok) return null;
				const raw = await jinaResponse.text();
				const titleMatch = raw.match(/^Title:\s*(.+)$/m);
				const title = titleMatch ? titleMatch[1].trim() : 'Untitled';
				const content = smartTruncate(cleanMarkdown(raw), contentLimit);
				const jinaWarning = raw.includes('This page maybe not yet fully loaded') || raw.includes('Unavailable For Legal Reasons');
				// Very short output is treated as a failed extraction.
				if (jinaWarning || content.length < 2000) return null;
				return { title, content };
			};

			// Helper: fetch via Readability + Turndown. Any failure yields null so the caller can fall back.
			const tryDirectFetch = async (): Promise<{ title: string, content: string } | null> => {
				try {
					const html = await fetchPage(url, signal);
					const { title, content: extracted } = extractContent(html, url);
					const content = smartTruncate(extracted, contentLimit);
					if (content.length < 2000) return null;
					return { title, content };
				} catch {
					return null;
				}
			};

			// Try direct fetch first, fallback to Jina
			status('Trying direct fetch...');
			let result = await tryDirectFetch();
			if (!result) {
				status('Direct fetch failed, trying Jina...');
				result = await tryJina();
			}

			// Medium fallback: if scribe.rip failed, try original URL via Jina
			if (!result && isMedium && originalUrl) {
				status('Scribe.rip failed, trying original Medium URL via Jina...');
				try {
					const fallbackResponse = await fetchViaJina(originalUrl, signal);
					if (fallbackResponse.ok) {
						const raw = await fallbackResponse.text();
						const titleMatch = raw.match(/^Title:\s*(.+)$/m);
						const title = titleMatch ? titleMatch[1].trim() : 'Untitled';
						const content = smartTruncate(cleanMarkdown(raw), contentLimit);
						if (content.length >= 500) {
							result = { title, content };
						}
					}
				} catch { /* fall through */ }
			}

			if (!result) {
				return `Error: Could not extract content from ${url}`;
			}

			const { title, content } = result;
			status(`Retrieved "${title}" (${content.length} chars)`);
			return { url, title, content };
		} catch (error: unknown) {
			// Check the signal as well: not every HTTP client rejects with a DOMException on abort.
			if (signal.aborted || (error instanceof DOMException && error.name === "AbortError")) {
				return "Website visit was cancelled.";
			}
			const msg = error instanceof Error ? error.message : 'Unknown error';
			console.error(error);
			warn(`Failed to load website: ${msg}`);
			return `Error: ${msg}`;
		}
	};

	tools.push(webSearchTool);
	tools.push(tool({
		name: "extract_web_page_contents",
		description: `It extracts contents from web pages and documents.`,
		parameters: {
			url: z.string().url().describe("The URL of the web page to extract from"),
		},
		implementation: extractWebPageContent,
	}));
	tools.push(tool({
		name: "extract_video_transcript",
		description: `It extracts video transcript by URLs like "https://www.youtube.com/watch?...".`,
		parameters: {
			url: z.string().url().describe("The URL of the video to extract a transcript"),
		},
		implementation: extractWebPageContent,
	}));
	return tools;
}

/**
 * Maps a raw config value to a usable number.
 *
 * @param value - Raw value read from the plugin configuration.
 * @param autoValue - Sentinel number that stands for "auto".
 * @returns `value` when it is a number other than `autoValue`; otherwise `undefined`.
 */
const undefinedIfAuto = (value: unknown, autoValue: number): number | undefined => {
	if (typeof value !== 'number') {
		return undefined;
	}
	if (value === autoValue) {
		return undefined;
	}
	return value;
};

/**
 * Strips Jina reader boilerplate and link/markup noise from Markdown so that mostly
 * readable text remains.
 *
 * The rules are heuristics applied in order: metadata header lines, footer lines,
 * horizontal rules, images, inline and reference links, bare URL lines, HTML tags,
 * long runs of single-word lines, and finally excess blank lines.
 *
 * @param md - Markdown text, typically a Jina reader response body.
 * @returns The cleaned, trimmed text.
 */
function cleanMarkdown(md: string): string {
	let text = md;

	// Remove Jina metadata header lines. Only spaces/tabs may follow the colon: `\s*`
	// would run across the newline and delete the first content line after
	// "Markdown Content:" as well.
	text = text.replace(/^(URL Source|Title|Published|Description|Markdown Content):[ \t]*.*\n?/gm, '');

	// Remove Jina footer noise
	// NOTE(review): this drops ANY line starting with these words (e.g. "Total ..."),
	// which may include real content — confirm this is acceptable.
	text = text.replace(/^(?:Let me know|Scraped|Final URL|Total|To visit).*$/gm, '');
	text = text.replace(/^-{3,}$/gm, '');

	// Convert markdown images ![alt](url) to [Image: alt] (preserve alt text as context)
	text = text.replace(/!\[([^\]]*)\]\([^)]+\)/g, (_match, alt: string) => alt ? `[Image: ${alt}]` : '');

	// Convert markdown links [text](url) to just text (preserve readable text)
	text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');

	// Remove reference-style links [text][ref]
	text = text.replace(/\[([^\]]+)\]\[[^\]]*\]/g, '$1');

	// Remove bare URLs on their own line (nav/footer links)
	text = text.replace(/^https?:\/\/\S+$/gm, '');

	// Remove HTML tags
	text = text.replace(/<[^>]+>/g, '');

	// Remove consecutive short single-word lines (nav items) but keep structural content:
	// only runs with more than 6 non-blank lines are dropped.
	text = text.replace(/^(?:\s*\w{1,20}\s*\n){4,}/gm, (match) => {
		const lines = match.split('\n').filter(l => l.trim());
		return lines.length > 6 ? '' : match;
	});

	// Collapse excessive blank lines
	text = text.replace(/\n{3,}/g, '\n\n');

	return text.trim();
}

/**
 * Shortens `text` to at most `limit` UTF-16 code units, preferring to cut at a
 * paragraph or sentence boundary when one lies in the last 30% of the allowed length.
 *
 * @param text - Text to shorten.
 * @param limit - Maximum length of the result. Zero, negative or NaN yields `''`.
 * @returns `text` unchanged when it already fits, otherwise a prefix of it.
 */
function smartTruncate(text: string, limit: number): string {
	// A non-positive limit leaves no room; without this guard a negative value would
	// reach slice(0, -n) and return almost the whole text.
	if (!(limit > 0)) return '';
	if (text.length <= limit) return text;
	let truncated = text.slice(0, limit);

	// Do not end on a lone high surrogate (half of an astral character such as an emoji).
	const lastUnit = truncated.charCodeAt(truncated.length - 1);
	if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) truncated = truncated.slice(0, -1);

	// Try paragraph boundary first
	const lastPara = truncated.lastIndexOf('\n\n');
	if (lastPara > limit * 0.7) return truncated.slice(0, lastPara).trimEnd();

	// Fall back to sentence boundary (keep the terminating punctuation)
	const lastSentence = Math.max(
		truncated.lastIndexOf('. '),
		truncated.lastIndexOf('! '),
		truncated.lastIndexOf('? '),
	);
	if (lastSentence > limit * 0.7) return truncated.slice(0, lastSentence + 1).trimEnd();

	return truncated;
}
web-search