// src/toolsProvider.ts
import { tool, Tool, ToolCallContext, ToolsProviderController } from "@lmstudio/sdk";
import { z } from "zod";
import * as cheerio from "cheerio";
import { fetchTranscript } from "youtube-transcript-plus";
import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";
import TurndownService from "turndown";
import { configSchematics } from "./config";
import { dedent } from 'ts-dedent';
/** One search hit: the result's title, its target URL, and the snippet text shown with it. */
type SearchResult = { title: string; url: string; snippet: string };
// Shared HTML-to-Markdown converter used by extractContent:
// ATX ("#") headings, fenced code blocks, "-" bullets and inline links.
const td = new TurndownService({
headingStyle: 'atx',
codeBlockStyle: 'fenced',
bulletListMarker: '-',
linkStyle: 'inlined',
});
// Drop <img> and <figure> elements entirely instead of converting them.
td.remove(['img', 'figure']);
/**
 * Extracts the main article from a raw HTML page and converts it to Markdown.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL the HTML came from; handed to JSDOM as the document URL.
 * @returns The article title and its Markdown body. When Readability finds no
 *   article the result is `{ title: 'Untitled', content: '' }`.
 */
function extractContent(html: string, url: string): { title: string, content: string } {
  const dom = new JSDOM(html, { url });
  const article = new Readability(dom.window.document).parse();
  if (!article) return { title: 'Untitled', content: '' };

  // Strip links from article content, keeping only their child nodes (text).
  const tmp = new JSDOM(article.content || undefined);
  tmp.window.document.querySelectorAll('a').forEach(el => {
    el.replaceWith(...el.childNodes);
  });

  const markdown = td.turndown(tmp.window.document.body.innerHTML);
  return {
    // `||` instead of `??`: an empty or whitespace-only title also falls back.
    title: article.title?.trim() || 'Untitled',
    // Collapse runs of three or more newlines into a single blank line.
    content: markdown.replace(/\n{3,}/g, '\n\n').trim(),
  };
}
// Cached `gotScraping` export; stays null until the first fetchPage call loads the module.
let gotScrapingInstance: typeof import('got-scraping').gotScraping | null = null;

/**
 * Downloads a page with got-scraping and resolves with the response body.
 *
 * @param url - Address to request.
 * @param signal - Abort signal forwarded to the request.
 * @returns The response body, treated as a string.
 */
async function fetchPage(url: string, signal: AbortSignal): Promise<string> {
  // Load the module on first use only; later calls reuse the cached function.
  if (gotScrapingInstance == null) {
    const gotScrapingModule = await import('got-scraping');
    gotScrapingInstance = gotScrapingModule.gotScraping;
  }
  const { body } = await gotScrapingInstance({
    url,
    signal,
    timeout: { request: 30000 },
  });
  return body as string;
}
export async function toolsProvider(ctl: ToolsProviderController): Promise<Tool[]> {
const tools: Tool[] = [];
/**
 * Builds an async gate that spaces its callers at least `interval` ms apart.
 *
 * Each call reserves the next free time slot synchronously (before awaiting),
 * so overlapping callers queue one after another instead of all sleeping for
 * the same duration and then firing together.
 *
 * @param interval - Minimum spacing between two releases, in milliseconds.
 * @returns A function that resolves once the caller's slot has been reached.
 */
const makeRateLimiter = (interval: number) => {
  // Earliest timestamp (ms since epoch) at which the next caller may proceed.
  let nextAvailableAt = 0;
  return async (): Promise<void> => {
    const now = Date.now();
    const scheduledAt = Math.max(now, nextAvailableAt);
    // Reserve the slot before yielding so concurrent callers see the update.
    nextAvailableAt = scheduledAt + interval;
    const waitMs = scheduledAt - now;
    if (waitMs > 0) await new Promise<void>(resolve => setTimeout(resolve, waitMs));
  };
};
// Gate awaited before each DuckDuckGo search; created with a 5000 ms interval.
const waitIfNeededSearch = makeRateLimiter(5000);
// Gate awaited before each r.jina.ai request; created with a 1000 ms interval.
const waitIfNeededJina = makeRateLimiter(1000);
// Incremented when web_search serves a cached result; never read in the visible code.
let searchCount = 0;
// NOTE(review): not referenced anywhere else in the visible code.
let visitCount = 0;
// In-memory search result cache (session-scoped, LRU)
// Map insertion order doubles as recency order: the first key is the least recently used.
const searchCache = new Map<string, { results: SearchResult[], timestamp: number }>();
// Entries older than this many milliseconds (5 minutes) are treated as expired.
const CACHE_TTL = 5 * 60 * 1000;
// Upper bound on the number of cached queries.
const MAX_CACHE_SIZE = 100;

/**
 * Makes room for one new entry: drops every expired entry, then removes the
 * least recently used entries until fewer than MAX_CACHE_SIZE remain.
 */
const evictCacheIfNeeded = () => {
  const now = Date.now();
  for (const [key, entry] of searchCache) {
    if (now - entry.timestamp > CACHE_TTL) searchCache.delete(key);
  }
  while (searchCache.size >= MAX_CACHE_SIZE) {
    const oldest = searchCache.keys().next();
    // Test `done` rather than the key's truthiness: "" is a valid cache key.
    if (oldest.done) break;
    searchCache.delete(oldest.value);
  }
};

/**
 * Looks up a query in the cache.
 *
 * @param query - Exact query string used as the cache key.
 * @param pageSize - Maximum number of results to return from the cached list.
 * @returns The cached results cut to `pageSize`, or `null` when the query is
 *   not cached or its entry has expired (expired entries are removed).
 */
const getCachedResults = (query: string, pageSize: number): { results: SearchResult[] } | null => {
  const cached = searchCache.get(query);
  if (!cached) return null;
  if (Date.now() - cached.timestamp > CACHE_TTL) {
    searchCache.delete(query);
    return null;
  }
  // Move to end for LRU (re-insert keeps the original timestamp).
  searchCache.delete(query);
  searchCache.set(query, cached);
  return { results: cached.results.slice(0, pageSize) };
};

/**
 * Stores results for a query, evicting expired/old entries first.
 *
 * @param query - Query string used as the cache key.
 * @param results - Results to remember, stamped with the current time.
 */
const setCachedResults = (query: string, results: SearchResult[]) => {
  evictCacheIfNeeded();
  searchCache.set(query, { results, timestamp: Date.now() });
};
// Search: DuckDuckGo only
/**
 * Runs a query against DuckDuckGo's HTML endpoint and scrapes the result list.
 *
 * Ads and sponsored entries are skipped, DuckDuckGo redirect links are
 * resolved to their real target, and duplicate URLs are dropped.
 *
 * @param query - Search query, sent as the `q` parameter.
 * @param pageSize - Maximum number of results; values <= 0 mean "no cap".
 * @param signal - Abort signal forwarded to the page fetch.
 * @returns The scraped results in page order (possibly empty).
 */
const searchDuckDuckGo = async (query: string, pageSize: number, signal: AbortSignal): Promise<SearchResult[]> => {
  const url = new URL("https://html.duckduckgo.com/html/");
  url.searchParams.append("q", query);
  const html = await fetchPage(url.toString(), signal);
  const $ = cheerio.load(html);
  const results: SearchResult[] = [];
  const seenUrls = new Set<string>();

  $('.result__body').each((_, el) => {
    // Returning false stops cheerio's iteration once enough results are collected.
    if (pageSize > 0 && results.length >= pageSize) return false;

    const $resultItem = $(el).closest('.result');
    const classes = $resultItem.attr('class') || '';
    if (/\bresult--ad\b|\bresult--sponsored\b|\bad\b/i.test(classes)) return;
    if (/^\s*Ad\b|^\s*Sponsored\b/i.test($resultItem.text() || '')) return;

    const $anchor = $(el).find('a.result__a');
    if (!$anchor.length) return;
    let href = $anchor.attr('href') || '';
    // Skip ad tracking redirects
    if (/duckduckgo\.com\/(aclick|y\.js)/i.test(href)) return;

    // Extract real URL from DDG redirect
    const uddgMatch = href.match(/[?&]uddg=([^&]+)/);
    if (uddgMatch?.[1]) {
      try {
        href = decodeURIComponent(uddgMatch[1]);
      } catch {
        // Malformed percent-encoding: skip this result instead of failing the whole search.
        return;
      }
    }
    // A result without a target URL is useless to the caller.
    if (!href) return;
    if (seenUrls.has(href)) return;
    seenUrls.add(href);

    // Results with an empty title are intentionally kept.
    const title = ($anchor.text() || '').replace(/\s+/g, ' ').trim();
    const snippet = $(el).find('.result__snippet').text().replace(/\s+/g, ' ').trim();
    results.push({ title, url: href, snippet });
  });
  return results;
};
/**
 * `web_search` tool: returns DuckDuckGo results for a query.
 *
 * Cached results are served without touching the rate limiter. Otherwise the
 * search is attempted up to the configured number of times, stopping at the
 * first attempt that yields results. Resolves with `{ results, count }`
 * (plus `cached: true` for cache hits) or with a plain message string when
 * nothing was found, the search failed, or the call was cancelled.
 */
const webSearchTool = tool({
  name: "web_search",
  description: dedent(`
    web_search - a tool to search the internet.
    When requested to search a web site or internet, do not answer right away from internal knowledge; you must follow the following steps in order:
    1. **Initial Query**: Formulate a search query based on the user input.
    2. **Search**: Call the \`web_search\` tool, and rank result snippets by relevance to the user's input from 1 to 5.
    3. **Resolution Logic**:
    - **Sufficient Results**: If there are more then 3 pages with **rating 5**, call \`extract_web_page_contents\` tool for each relevant result, and answer based on extracted data.
    - **Query Refinement**: If the query has not been refined, identify specific terms, keywords, or better search angles, and repeat the steps with refined query starting from **Search**.
    - **Request User Clarification**: Refine only once, then STOP SEARCH.
  `),
  parameters: {
    query: z.string().describe(dedent(`
      The search query - when constructing search query, use the following syntax:
      - **Semantic Search**: \`~"cats and dogs"\` (Use tilde + quotes for similar concepts)
      - **Exact Phrase**: \`"cats and dogs"\` (Use quotes for specific phrases)
      - **Exclude Keyword**: \`cats -dogs\` (Use minus to remove a keyword)
      - **Focus on Keyword**: \`cats +dogs\` (Use plus to focus on a specific keyword)
      - **Filetype Search**: \`cats filetype:pdf\` (Supported: pdf, doc(x), xls(x), ppt(x), html)
      - **Specific Site**: \`site:example.com cats\` (e.g., \`site:arxiv.org\` or \`site:stackexchange.com\`)
      - **Exclude Site**: \`cats -site:example.com\` (Search pages about cats, excluding example.com)
      - **Page Title**: \`intitle:dogs\` (Search pages where the title includes specific phrase)
      - **Page URL**: \`inurl:cats\` (Search pages where the URL includes specific phrase)
    `)),
  },
  implementation: async ({ query }, { status, warn, signal }) => {
    try {
      const config = ctl.getPluginConfig(configSchematics);
      // A configured page size of 0 means "auto", which resolves to 5 results.
      const pageSize = undefinedIfAuto(config.get("pageSize"), 0) ?? 5;

      // Check cache first (no rate limit for cache hits)
      const cached = getCachedResults(query, pageSize);
      if (cached) {
        searchCount++;
        status(`Returning cached results for "${query}".`);
        return { results: cached.results, count: cached.results.length, cached: true };
      }

      // Always allow at least one attempt, even if the setting is 0 or not a number.
      const configuredRetries = Number(config.get("webSearchEngineCallRetries"));
      const maxRetries = Number.isFinite(configuredRetries) ? Math.max(1, Math.floor(configuredRetries)) : 1;

      let results: SearchResult[] = [];
      // Retry only while nothing has been found and attempts remain.
      for (let attempt = 1; results.length === 0 && attempt <= maxRetries; attempt++) {
        const attemptsMsg = attempt > 1 ? ` ${attempt}/${maxRetries}` : "";
        status(`Search${attemptsMsg} for: "${query}"`);
        try {
          await waitIfNeededSearch();
          results = await searchDuckDuckGo(query, pageSize, signal);
        } catch (err: unknown) {
          // Cancellation is not a retryable failure; let the outer handler report it.
          if (signal.aborted) throw err;
          if (attempt >= maxRetries) {
            warn(`Web search engine error: ${err instanceof Error ? err.message : String(err)}`);
            return `No results. Stop.`;
          }
        }
      }

      if (results.length === 0) {
        return `No results.`;
      }
      setCachedResults(query, results);
      status(`Found ${results.length} results.`);
      return { results, count: results.length };
    } catch (error: unknown) {
      // `signal.aborted` also covers HTTP clients whose abort error is not a DOMException.
      if (signal.aborted || (error instanceof DOMException && error.name === "AbortError")) {
        return "Search was cancelled.";
      }
      const msg = error instanceof Error ? error.message : 'Unknown error';
      console.error(error);
      warn(`Search failed: ${msg}`);
      return `Error: ${msg}`;
    }
  },
});
/**
 * Rewrites a URL into the variant that is easiest to extract text from:
 * strips AMP markers, unwraps Google AMP links, points arXiv abstracts at the
 * PDF, and swaps medium.com / reddit.com for scribe.rip / old.reddit.com.
 *
 * Host rewrites are anchored to the start of the URL so that query strings,
 * look-alike domains and already-rewritten hosts (e.g. old.reddit.com) are
 * left untouched.
 *
 * @returns The rewritten URL and whether the medium.com rewrite was applied.
 */
const rewriteUrlForExtraction = (originalUrl: string): { url: string; isMedium: boolean } => {
  let url = originalUrl;
  // De-AMP - AMP pages are always worse than the original
  url = url.replace(/\/amp\/?$/, '');
  // Remove an `amp=1` parameter but keep the separator when more parameters follow,
  // so `?amp=1&b=2` becomes `?b=2` rather than the malformed `&b=2`.
  url = url.replace(/([?&])amp=1(?:(&)|(?=#|$))/, (_match: string, lead: string, next?: string) => (next ? lead : ''));
  const ampMatch = url.match(/google\.com\/amp\/s\/(.+)/);
  if (ampMatch?.[1]) url = 'https://' + ampMatch[1];

  // URL transformations for better content extraction
  url = url.replace(/arxiv\.org\/abs\//, 'arxiv.org/pdf/');
  const mediumHost = /^(https?:\/\/)(?:www\.)?medium\.com(?=[\/:?#]|$)/i;
  const isMedium = mediumHost.test(url);
  url = url.replace(mediumHost, '$1scribe.rip');
  url = url.replace(/^(https?:\/\/)(?:www\.)?reddit\.com(?=[\/:?#]|$)/i, '$1old.reddit.com');
  return { url, isMedium };
};

/**
 * Tool implementation shared by `extract_web_page_contents` and
 * `extract_video_transcript`.
 *
 * Order of strategies: YouTube transcript (for YouTube URLs), Jina reader for
 * anything that looks like a PDF, then direct fetch + Readability, then Jina,
 * and finally - for rewritten Medium links - Jina on the original URL.
 *
 * @returns An object with `url`, `title` and `content` (YouTube results also
 *   carry `description` and `keywords`), or a message string on failure or
 *   cancellation.
 */
const extractWebPageContent = async ({ url: requestedUrl }: {readonly url: string}, { status, warn, signal }: ToolCallContext) => {
  type ExtractedPage = { title: string, content: string };
  // Minimum extracted length (before truncation) for a result to count as real content.
  const MIN_CONTENT_LENGTH = 2000;
  const MIN_MEDIUM_FALLBACK_LENGTH = 500;

  const { url, isMedium } = rewriteUrlForExtraction(requestedUrl);
  const shortUrl = url.length > 50 ? url.slice(0, 47) + '...' : url;
  status(`Fetching content from: ${shortUrl}`);

  // Requests `target` through the Jina reader, honouring the Jina rate limiter,
  // the caller's abort signal and a 30 s timeout.
  const fetchViaJina = async (target: string): Promise<Response> => {
    await waitIfNeededJina();
    return fetch(`https://r.jina.ai/${target}`, {
      method: "GET",
      signal: AbortSignal.any([signal, AbortSignal.timeout(30000)]),
    });
  };

  try {
    // A configured limit of -1 means "auto", which resolves to 8000 characters.
    const contentLimit = undefinedIfAuto(ctl.getPluginConfig(configSchematics).get("contentLimit"), -1) ?? 8000;
    const isPdf = /pdf/i.test(url);

    // Splits a Jina response into title and cleaned, truncated content.
    // `fullLength` is the cleaned length before truncation, so quality checks
    // do not depend on the configured content limit.
    const parseJinaText = (raw: string): ExtractedPage & { fullLength: number } => {
      // `[ \t]*` keeps an empty "Title:" line from capturing the following line.
      const titleMatch = raw.match(/^Title:[ \t]*(.+)$/m);
      const cleaned = cleanMarkdown(raw);
      return {
        title: titleMatch?.[1]?.trim() || 'Untitled',
        content: smartTruncate(cleaned, contentLimit),
        fullLength: cleaned.length,
      };
    };

    // Handle YouTube URLs - fetch transcript instead
    const ytMatch = url.match(/(?:youtube\.com\/watch\?.*v=|youtu\.be\/)([\w-]+)/);
    if (ytMatch) {
      status(`Fetching YouTube transcript for: ${ytMatch[1]}`);
      try {
        const transcript = await fetchTranscript(url, {"videoDetails": true});
        const text = transcript.segments.map(t => t.text).join(' ').trim();
        const content = smartTruncate(text, contentLimit);
        status(`Retrieved YouTube transcript (${content.length} chars)`);
        return {
          url,
          title: transcript.videoDetails.title,
          description: transcript.videoDetails.description,
          keywords: transcript.videoDetails.keywords,
          content,
        };
      } catch (ytErr: unknown) {
        // No transcript: continue with the generic page extraction below.
        const msg = ytErr instanceof Error ? ytErr.message : 'unknown';
        warn(`YouTube transcript unavailable: ${msg}`);
        status('Falling back to Jina for YouTube page (content may be limited)');
      }
    }

    // PDFs always use Jina
    if (isPdf) {
      const pdfResponse = await fetchViaJina(url);
      if (!pdfResponse.ok) return `Error: Could not fetch PDF from ${url}`;
      const content = smartTruncate(await pdfResponse.text(), contentLimit);
      status(`Retrieved PDF (${content.length} chars)`);
      return { url, title: 'PDF Document', content };
    }

    // Helper: fetch via Jina
    const tryJina = async (): Promise<ExtractedPage | null> => {
      const jinaResponse = await fetchViaJina(url);
      // Any non-2xx status (including 403/451 blocks) counts as a failed attempt.
      if (!jinaResponse.ok) return null;
      const raw = await jinaResponse.text();
      const jinaWarning = raw.includes('This page maybe not yet fully loaded') || raw.includes('Unavailable For Legal Reasons');
      if (jinaWarning) return null;
      const { title, content, fullLength } = parseJinaText(raw);
      return fullLength < MIN_CONTENT_LENGTH ? null : { title, content };
    };

    // Helper: fetch via Readability + Turndown
    const tryDirectFetch = async (): Promise<ExtractedPage | null> => {
      try {
        const html = await fetchPage(url, signal);
        const { title, content: extracted } = extractContent(html, url);
        if (extracted.length < MIN_CONTENT_LENGTH) return null;
        return { title, content: smartTruncate(extracted, contentLimit) };
      } catch (err: unknown) {
        // Best effort: any failure just hands over to Jina - except cancellation.
        if (signal.aborted) throw err;
        return null;
      }
    };

    // Try direct fetch first, fallback to Jina
    status('Trying direct fetch...');
    let result: ExtractedPage | null = await tryDirectFetch();
    if (!result) {
      status('Direct fetch failed, trying Jina...');
      result = await tryJina();
    }

    // Medium fallback: if scribe.rip failed, try original URL via Jina
    if (!result && isMedium) {
      status('Scribe.rip failed, trying original Medium URL via Jina...');
      try {
        const fallbackResponse = await fetchViaJina(requestedUrl);
        if (fallbackResponse.ok) {
          const { title, content, fullLength } = parseJinaText(await fallbackResponse.text());
          if (fullLength >= MIN_MEDIUM_FALLBACK_LENGTH) {
            result = { title, content };
          }
        }
      } catch (err: unknown) {
        // Last resort is best effort too, but a cancellation must still be reported as such.
        if (signal.aborted) throw err;
      }
    }

    if (!result) {
      return `Error: Could not extract content from ${url}`;
    }
    const { title, content } = result;
    status(`Retrieved "${title}" (${content.length} chars)`);
    return { url, title, content };
  } catch (error: unknown) {
    // `signal.aborted` also covers HTTP clients whose abort error is not a DOMException.
    if (signal.aborted || (error instanceof DOMException && error.name === "AbortError")) {
      return "Website visit was cancelled.";
    }
    const msg = error instanceof Error ? error.message : 'Unknown error';
    console.error(error);
    warn(`Failed to load website: ${msg}`);
    return `Error: ${msg}`;
  }
};
// Register the three tools in their original order. The page and video tools
// share one implementation; only their names and descriptions differ.
tools.push(
  webSearchTool,
  tool({
    name: "extract_web_page_contents",
    description: `It extracts contens from web pages and documents.`,
    parameters: {
      url: z.string().url().describe("The URL of the web page to extract from"),
    },
    implementation: extractWebPageContent,
  }),
  tool({
    name: "extract_video_transcript",
    description: `It extracts video transcript by URLs like "https://www.youtube.com/watch?...").`,
    parameters: {
      url: z.string().url().describe("The URL of the video to extract a transcript"),
    },
    implementation: extractWebPageContent,
  }),
);
return tools;
}
/**
 * Normalises a raw config value: returns the number as-is, or `undefined`
 * when the value is not a number or equals the "auto" sentinel.
 *
 * @param value - Raw value read from the plugin configuration.
 * @param autoValue - Sentinel number that stands for "auto".
 */
const undefinedIfAuto = (value: unknown, autoValue: number): number | undefined => {
  if (typeof value !== 'number') return undefined;
  return value === autoValue ? undefined : value;
};
/**
 * Cleans Markdown returned by the Jina reader into plain, readable text.
 *
 * Removes Jina's metadata header and footer lines, horizontal rules, link
 * targets, bare-URL lines, HTML tags and long runs of one-word lines, then
 * collapses blank lines and trims the result.
 *
 * @param md - Raw Markdown text.
 * @returns The cleaned text.
 */
function cleanMarkdown(md: string): string {
  let text = md;
  // Remove Jina metadata header lines. `[ \t]*` (not `\s*`) keeps the match on
  // one line, so a bare "Markdown Content:" does not swallow the line after it.
  text = text.replace(/^(URL Source|Title|Published|Description|Markdown Content):[ \t]*.*\n?/gm, '');
  // Remove Jina footer noise
  text = text.replace(/^(?:Let me know|Scraped|Final URL|Total|To visit).*$/gm, '');
  text = text.replace(/^-{3,}$/gm, '');
  // Convert markdown images to [Image: alt] (preserve alt text as context)
  text = text.replace(/!\[([^\]]*)\]\([^)]+\)/g, (_match, alt) => alt ? `[Image: ${alt}]` : '');
  // Convert markdown links [text](url) to just text (preserve readable text)
  text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');
  // Remove reference-style links [text][ref]
  text = text.replace(/\[([^\]]+)\]\[[^\]]*\]/g, '$1');
  // Remove bare URLs on their own line (nav/footer links)
  text = text.replace(/^https?:\/\/\S+$/gm, '');
  // Remove HTML tags
  text = text.replace(/<[^>]+>/g, '');
  // Remove consecutive short single-word lines (nav items) but keep structural
  // content: only runs of more than six such lines are dropped.
  text = text.replace(/^(?:\s*\w{1,20}\s*\n){4,}/gm, (match) => {
    const lines = match.split('\n').filter(l => l.trim());
    return lines.length > 6 ? '' : match;
  });
  // Collapse excessive blank lines
  text = text.replace(/\n{3,}/g, '\n\n');
  return text.trim();
}
/**
 * Shortens text to at most `limit` UTF-16 code units, preferring a clean cut.
 *
 * The cut is placed at the last paragraph break, or failing that the last
 * sentence end (". ", "! ", "? "), provided it lies beyond 70% of `limit`;
 * otherwise the text is cut hard at the limit.
 *
 * @param text - Text to shorten.
 * @param limit - Maximum length of the result.
 * @returns `text` unchanged when it already fits, else the shortened text.
 */
function smartTruncate(text: string, limit: number): string {
  if (text.length <= limit) return text;
  let truncated = text.slice(0, limit);
  // A cut between the two halves of a surrogate pair would leave a lone high
  // surrogate at the end (an invalid character); drop it.
  const lastCode = truncated.charCodeAt(truncated.length - 1);
  if (lastCode >= 0xd800 && lastCode <= 0xdbff) truncated = truncated.slice(0, -1);

  const minBoundary = limit * 0.7;
  // Try paragraph boundary first
  const lastPara = truncated.lastIndexOf('\n\n');
  if (lastPara > minBoundary) return truncated.slice(0, lastPara).trimEnd();
  // Fall back to sentence boundary (keep the punctuation mark itself)
  const lastSentence = Math.max(
    truncated.lastIndexOf('. '),
    truncated.lastIndexOf('! '),
    truncated.lastIndexOf('? '),
  );
  if (lastSentence > minBoundary) return truncated.slice(0, lastSentence + 1).trimEnd();
  return truncated;
}