// Forked from vadimfedenko/visit-website-reworked
// src/toolsProvider.ts
import { tool, Tool, ToolsProviderController } from "@lmstudio/sdk";
import { writeFile, access } from "fs/promises";
import { LRUCache } from "lru-cache";
import { parse, HTMLElement as NHTMLElement } from "node-html-parser";
import { join } from "path";
import { z } from "zod";
import { configSchematics } from "./config";
/**
 * Creates the tools this plugin registers: "Visit Website" and "Download Images".
 *
 * @param ctl Controller used below to read the plugin configuration and the working directory.
 * @returns The list of tools built inside this function.
 */
export async function toolsProvider(ctl:ToolsProviderController):Promise<Tool[]> {
// Collected tool definitions; populated at the end of this function.
const tools: Tool[] = [];
// Limits handed to buildImagePresentationPayload when a download result is presented.
const COMPACT_INLINE_LIMIT = 2;
const COMPACT_GALLERY_LIMIT = 6;
// Cache of pages that were fetched directly, keyed by the requested URL.
// Holds at most 20 entries; the TTL is 3 * 60 * 1000 (three minutes, assuming lru-cache's millisecond unit).
const pageCache = new LRUCache<string, FetchHTMLResult>({
max: 20,
ttl: 3 * 60 * 1000,
});
// Outcome of fetching a page, either directly or through the Jina fallback.
type FetchHTMLResult = {
// Raw response body text.
html: string;
// Document tree parsed from `html`.
root: NHTMLElement;
// Which fetch path produced this result.
source: "direct" | "jina";
// The response's URL when it reports one, otherwise the URL that was requested.
finalUrl: string;
// HTTP status code of the response.
statusCode?: number;
// Value of the response's `server` header ("" when the header is absent).
serverHeader?: string;
};
/**
 * Issues a GET request with a per-attempt timeout and retries on failure.
 *
 * The timeout only covers the time until the response (headers) arrives; it is
 * cleared as soon as `fetch` settles. A timeout aborts the request with a
 * `TimeoutError` DOMException rather than a plain `AbortError`, so callers can
 * tell "the server was too slow" apart from "the caller cancelled".
 *
 * @param targetUrl URL to request.
 * @param signal Caller's cancellation signal; once it is aborted no retry is made.
 * @param headers Request headers to send.
 * @param attempts Maximum number of attempts (default 2).
 * @param timeoutMs Per-attempt timeout in milliseconds (default 12 000).
 * @returns The first response that `fetch` resolves with (any HTTP status).
 * @throws The error of the last failed attempt, or immediately the abort error
 *         when `signal` was aborted.
 */
const fetchWithRetries = async (
  targetUrl: string,
  signal: AbortSignal,
  headers: Record<string, string>,
  attempts = 2,
  timeoutMs = 12_000,
): Promise<Response> => {
  // Never throw `null`: if the loop does not run at all, report that explicitly.
  let lastError: unknown = new Error("Request was not attempted (attempts must be at least 1)");
  for (let attempt = 0; attempt < attempts; attempt++) {
    const timeoutController = new AbortController();
    const timeoutId = setTimeout(
      () => timeoutController.abort(
        new DOMException(`Request timed out after ${timeoutMs} ms`, "TimeoutError"),
      ),
      timeoutMs,
    );
    // Combine caller cancellation with the timeout when the runtime supports it;
    // otherwise only the timeout can abort the request.
    const combined = AbortSignal.any
      ? AbortSignal.any([signal, timeoutController.signal])
      : timeoutController.signal;
    try {
      return await fetch(targetUrl, {
        method: "GET",
        signal: combined,
        headers,
        redirect: "follow",
      });
    } catch (error) {
      lastError = error;
      // The caller cancelled: retrying would be pointless.
      if (signal.aborted) throw error;
    } finally {
      clearTimeout(timeoutId);
    }
    // Linear backoff between attempts, but no pointless wait after the final one.
    if (attempt < attempts - 1) {
      await new Promise(resolve => setTimeout(resolve, 350 * (attempt + 1)));
    }
  }
  throw lastError;
};
/**
 * Heuristic for "the site refused or challenged this request".
 *
 * @param statusCode HTTP status of the response.
 * @param _serverHeader Accepted for call-site symmetry; not consulted.
 * @param html Response body, scanned for common block/challenge phrases.
 * @returns true for status 401/403/429/503 or when the body contains a known marker.
 */
const looksBlocked = (statusCode: number, _serverHeader: string, html: string) => {
  const refusalStatuses = [401, 403, 429, 503];
  if (refusalStatuses.includes(statusCode)) {
    return true;
  }
  const challengeMarkers = /access denied|captcha|cf-chl|checking your browser|attention required|challenge-platform/i;
  return challengeMarkers.test(html);
};
/**
 * Fetches a page through the Jina reader proxy (r.jina.ai) as a fallback.
 *
 * The target URL is passed through unchanged, so an https page is requested
 * over https instead of being downgraded to http.
 *
 * @param url Page to retrieve.
 * @param signal Caller's cancellation signal.
 * @param warn Receives a human-readable reason whenever the fallback fails.
 * @returns The parsed result, or null when the fallback did not produce content.
 * @throws Re-throws `AbortError` so cancellation propagates to the caller.
 */
const fetchViaJina = async (
  url: string,
  signal: AbortSignal,
  warn: (msg: string) => void,
): Promise<FetchHTMLResult | null> => {
  const jinaUrl = `https://r.jina.ai/${url}`;
  try {
    const response = await fetchWithRetries(jinaUrl, signal, {
      "User-Agent": "Mozilla/5.0",
      "Accept": "text/plain,text/html;q=0.9,*/*;q=0.8",
    });
    if (!response.ok) {
      warn(`Jina fallback failed: ${response.status} ${response.statusText}`);
      return null;
    }
    const text = await response.text();
    if (!text.trim()) {
      warn("Jina fallback returned empty content.");
      return null;
    }
    return {
      html: text,
      root: parse(text),
      source: "jina",
      // Note: this is the proxy URL, not the URL of the proxied page.
      finalUrl: response.url || jinaUrl,
      statusCode: response.status,
      serverHeader: response.headers.get("server") || "",
    };
  } catch (error: unknown) {
    if (error instanceof DOMException && error.name === "AbortError") throw error;
    // Best effort: a failed fallback is reported, not raised.
    const reason = error instanceof Error && error.message ? error.message : String(error);
    warn(`Jina fallback request failed: ${reason}`);
    return null;
  }
};
/**
 * Retrieves and parses a page, preferring the cache, then a direct request,
 * then (when enabled in the plugin settings) the Jina fallback.
 *
 * Only direct results are stored in the page cache.
 *
 * @param url Page to fetch; also the cache key.
 * @param signal Caller's cancellation signal.
 * @param warn Receives non-fatal diagnostics.
 * @returns The parsed page together with fetch metadata.
 * @throws `AbortError` on cancellation, otherwise an Error describing the failure.
 */
const fetchHTML = async (
  url: string,
  signal: AbortSignal,
  warn: (msg: string) => void,
): Promise<FetchHTMLResult> => {
  const hit = pageCache.get(url);
  if (hit) return hit;

  const jinaEnabled = ctl.getPluginConfig(configSchematics).get("jinaFallback") ?? true;
  const requestHeaders = spoofHeaders(url);
  // Resolves to null without any request when the fallback is switched off.
  const viaJina = async (): Promise<FetchHTMLResult | null> =>
    jinaEnabled ? fetchViaJina(url, signal, warn) : null;

  let response: Response;
  try {
    response = await fetchWithRetries(url, signal, requestHeaders);
  } catch (error: any) {
    const aborted = error instanceof DOMException && error.name === "AbortError";
    if (aborted) throw error;
    const rescued = await viaJina();
    if (rescued) return rescued;
    throw new Error(`Failed to fetch website: network error (${error?.message || String(error)})`);
  }

  const html = await response.text();
  const statusCode = response.status;
  const serverHeader = response.headers.get("server") || "";

  const usable = response.ok && !looksBlocked(statusCode, serverHeader, html);
  if (usable) {
    const direct: FetchHTMLResult = {
      html,
      root: parse(html),
      source: "direct",
      finalUrl: response.url || url,
      statusCode,
      serverHeader,
    };
    pageCache.set(url, direct);
    return direct;
  }

  // The direct response was an error or looked like a block page.
  const rescued = await viaJina();
  if (rescued) return rescued;
  if (!jinaEnabled) {
    throw new Error(`Failed to fetch website: ${statusCode} ${response.statusText} (Jina fallback is disabled in plugin settings)`);
  }
  warn(`Failed to fetch website: ${statusCode} ${response.statusText}`);
  throw new Error(`Failed to fetch website: ${statusCode} ${response.statusText} (server: ${serverHeader || "unknown"})`);
};
/**
 * Collects the most relevant http(s) links of a page.
 *
 * Every non-absolute href (root-relative, document-relative, protocol-relative,
 * query-only) is resolved against `baseUrl`; absolute http(s) hrefs are kept
 * verbatim. Empty and fragment-only hrefs as well as non-http schemes
 * (mailto:, javascript:, ...) are skipped. Duplicate targets keep their first
 * occurrence.
 *
 * Ranking: shorter label + URL and an earlier position score higher; each
 * search term contained in the label (case-insensitive) adds 1000.
 *
 * @param root Parsed document to scan for `a[href]` elements.
 * @param baseUrl URL used to resolve relative hrefs.
 * @param maxLinks Maximum number of links to return.
 * @param searchTerms Optional terms that boost links whose label contains them.
 * @returns `[label, url]` pairs, best score first.
 */
const extractLinks = (
  root: NHTMLElement,
  baseUrl: string,
  maxLinks: number,
  searchTerms?: string[],
): Array<[string, string]> => {
  const anchors = root.querySelectorAll("a[href]");
  const loweredTerms = (searchTerms ?? []).map(term => term.toLowerCase());
  type Entry = { label: string; link: string; score: number };
  const entries: Entry[] = [];
  const seenLinks = new Set<string>();

  anchors.forEach((anchor, index) => {
    const rawHref = (anchor.getAttribute("href") ?? "").trim();
    if (!rawHref || rawHref.startsWith("#")) return;

    let link: string;
    if (/^https?:\/\//i.test(rawHref)) {
      link = rawHref;
    } else {
      try {
        const resolved = new URL(rawHref, baseUrl);
        if (resolved.protocol !== "http:" && resolved.protocol !== "https:") return;
        link = resolved.href;
      } catch {
        return; // Unparseable href: skip it.
      }
    }
    if (seenLinks.has(link)) return;
    seenLinks.add(link);

    const label = anchor.text.replace(/\s+/g, " ").trim();
    // NOTE(review): the previous code also computed a digit-based weight
    // (`1 / Math.min(1, /\d/g.exec(link)?.length || 1)`), but that expression
    // always evaluated to 1, so it never influenced the score. The effective
    // formula is kept here; confirm whether digit weighting was intended.
    let score = 100 - (label.length + link.length + (20 * index / anchors.length));
    if (loweredTerms.length) {
      const loweredLabel = label.toLowerCase();
      for (const term of loweredTerms) {
        if (loweredLabel.includes(term)) score += 1000;
      }
    }
    entries.push({ label, link, score });
  });

  return entries
    .sort((a, b) => b.score - a.score)
    .slice(0, maxLinks)
    .map(({ label, link }): [string, string] => [label, link]);
};
/**
 * Collects the most relevant http(s) images of a page.
 *
 * For each `<img>` the first of `src`, `data-src`, `data-original` and the
 * first `srcset` candidate that resolves to an http(s) URL is used, so a
 * `data:` placeholder in `src` no longer hides a lazy-loaded real image and
 * relative sources are resolved against `baseUrl`. Markdown image syntax found
 * in the document text (as produced by the Jina reader) is picked up as well.
 *
 * Ranking: alt-text length, plus 1000 per search term contained in the alt
 * text (case-insensitive). The best `maxImages` are returned in document order.
 *
 * @param root Parsed document to scan.
 * @param baseUrl URL used to resolve relative image sources.
 * @param maxImages Maximum number of images to return.
 * @param searchTerms Optional terms that boost images whose alt text contains them.
 * @returns `[alt, url]` pairs in document order.
 */
const extractImages = (
  root: NHTMLElement,
  baseUrl: string,
  maxImages: number,
  searchTerms?: string[],
): Array<[string, string]> => {
  type Entry = { index: number; alt: string; src: string; score: number };
  const entries: Entry[] = [];
  const seenSources = new Set<string>();
  const loweredTerms = (searchTerms ?? []).map(term => term.toLowerCase());

  // Absolute http(s) URL for a candidate, or null when it is not usable.
  const toHttpUrl = (candidate: string): string | null => {
    const trimmed = candidate.trim();
    if (!trimmed) return null;
    if (/^https?:\/\//i.test(trimmed)) return trimmed;
    try {
      const resolved = new URL(trimmed, baseUrl);
      return resolved.protocol === "http:" || resolved.protocol === "https:" ? resolved.href : null;
    } catch {
      return null;
    }
  };

  const scoreAlt = (alt: string): number => {
    const loweredAlt = alt.toLowerCase();
    return loweredTerms.reduce(
      (total, term) => total + (loweredAlt.includes(term) ? 1000 : 0),
      alt.length,
    );
  };

  const addEntry = (index: number, alt: string, src: string): void => {
    if (seenSources.has(src)) return; // Keep only the first occurrence of a source.
    seenSources.add(src);
    entries.push({ index, alt, src, score: scoreAlt(alt) });
  };

  const imgElements = root.querySelectorAll("img");
  imgElements.forEach((img, index) => {
    const candidates = [
      img.getAttribute("src"),
      img.getAttribute("data-src"),
      img.getAttribute("data-original"),
      img.getAttribute("srcset")?.split(",")[0]?.trim()?.split(/\s+/)[0],
    ];
    for (const candidate of candidates) {
      const resolved = candidate ? toHttpUrl(candidate) : null;
      if (resolved) {
        addEntry(index, img.getAttribute("alt") ?? "", resolved);
        return;
      }
    }
  });

  // Markdown images are ordered after all <img> elements so their positions
  // cannot collide with element indices.
  const markdownImages = [...(root.text ?? "").matchAll(/!\[([^\]]*)\]\((https?:\/\/[^)\s]+)\)/g)];
  markdownImages.forEach((match, offset) => {
    addEntry(imgElements.length + offset, match[1] ?? "", match[2]);
  });

  return entries
    .sort((a, b) => b.score - a.score)
    .slice(0, maxImages)
    .sort((a, b) => a.index - b.index)
    .map(({ alt, src }): [string, string] => [alt, src]);
};
/**
 * "Download Images" tool: stores images in the working directory and returns
 * markdown that references the local copies.
 *
 * Image references may be given explicitly (`imageURLs`) and/or discovered on
 * a page (`websiteURL`). Downloads run in batches of three with a short random
 * pause in between. Cancellation is checked before every batch and once more
 * after the last one, so an aborted call stops promptly and reports the abort
 * instead of an empty result.
 *
 * Returns `[]` when there is nothing to do, a presentation payload otherwise,
 * or a plain string describing an abort/error.
 */
const downloadImagesTool = tool({
  name: "Download Images",
  description: "Use with remote HTTP(S) URLs or websiteURL.",
  parameters: {
    imageURLs: z.array(z.string()).optional().describe("List of image references to view. Supports HTTP(S) URLs, local file paths, and markdown image links."),
    websiteURL: z.string().url().optional().describe("The URL of the website, whose images to view."),
    maxImages: z.number().int().min(1).max(200).optional().describe("Maximum number of images to view when websiteURL is provided."),
  },
  implementation: async ({ imageURLs, websiteURL, maxImages }, { status, warn, signal }) => {
    const ABORTED_MESSAGE = "Image download aborted by user.";
    try {
      // A value set in the plugin settings wins over the tool argument; -1 means "auto".
      maxImages = undefinedIfAuto(ctl.getPluginConfig(configSchematics).get("maxImages"), -1)
        ?? maxImages
        ?? 10;
      const imageURLsToDownload = (imageURLs || [])
        .map(normalizeImageReference)
        .filter((x): x is string => !!x);
      if (imageURLsToDownload.length === 0 && !websiteURL) {
        return [];
      }
      if (websiteURL) {
        status("Fetching image URLs from website...");
        const { root } = await fetchHTML(websiteURL, signal, warn);
        const images = extractImages(root, websiteURL, maxImages).map(x => x[1]);
        imageURLsToDownload.push(...images);
      }
      status("Downloading images...");
      const workingDirectory = ctl.getWorkingDirectory();
      const timestamp = Date.now();
      const BATCH_SIZE = 3;
      const downloadedImageEntries: Array<{ fullPath: string; thumbPath: string }> = [];
      for (let i = 0; i < imageURLsToDownload.length; i += BATCH_SIZE) {
        // Individual downloads swallow aborts, so stop the loop explicitly.
        if (signal.aborted) return ABORTED_MESSAGE;
        const batch = imageURLsToDownload.slice(i, i + BATCH_SIZE);
        const batchResults = await Promise.all(
          batch.map((url, batchIndex) =>
            downloadSingleImage(url, i + batchIndex + 1, timestamp, workingDirectory, signal, warn)
          )
        );
        downloadedImageEntries.push(
          ...batchResults.filter((x): x is { fullPath: string; thumbPath: string } => x !== null)
        );
        if (i + BATCH_SIZE < imageURLsToDownload.length) {
          // Small jittered pause between batches.
          await new Promise(resolve => setTimeout(resolve, 300 + Math.floor(Math.random() * 200)));
        }
      }
      if (signal.aborted) return ABORTED_MESSAGE;
      if (downloadedImageEntries.length === 0) {
        // Avoid returning remote URLs as if they were downloaded.
        return {
          count: 0,
          images: [],
          compactGalleryMarkdown: "",
          hint: "No images were downloaded successfully. The listed source URLs may be blocked or unavailable.",
        };
      }
      status(`Downloaded ${downloadedImageEntries.length} images successfully.`);
      return buildImagePresentationPayload(downloadedImageEntries, COMPACT_INLINE_LIMIT, COMPACT_GALLERY_LIMIT);
    } catch (error: unknown) {
      if (error instanceof DOMException && error.name === "AbortError") {
        return ABORTED_MESSAGE;
      }
      console.error(error);
      const message = error instanceof Error ? error.message : String(error);
      warn(`Error during image download: ${message}`);
      return `Error: ${message}`;
    }
  }
});
/** Escapes regular-expression metacharacters so a search term is matched literally. */
const escapeSearchTerm = (term: string): string =>
  term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

/**
 * Builds an excerpt made of the text surrounding the first occurrence of each
 * search term. Each term gets an equal share of `contentLimit` as context.
 * Returns "" when no term occurs in the text.
 */
const collectTermSnippets = (allContent: string, searchTerms: string[], contentLimit: number): string => {
  // The quantifier must be an integer; a fractional bound would be read as literal text.
  const contextLength = Math.floor(contentLimit / (searchTerms.length * 2));
  const padding = `.{0,${contextLength}}`;
  const matches = searchTerms
    .map(term => new RegExp(padding + escapeSearchTerm(term) + padding, "i").exec(allContent))
    .filter((match): match is RegExpExecArray => match !== null)
    .sort((a, b) => a.index - b.index); // Document order.
  let excerpt = "";
  let nextMinIndex = 0;
  for (const match of matches) {
    const matchEnd = match.index + match[0].length;
    if (match.index >= nextMinIndex) {
      // No overlap with what was already emitted.
      excerpt += match[0];
    } else if (matchEnd > nextMinIndex) {
      // Partial overlap: append only the part that is new.
      excerpt += match[0].slice(nextMinIndex - match.index);
    }
    // A match fully inside the emitted text must not move the cursor backwards.
    nextMinIndex = Math.max(nextMinIndex, matchEnd);
  }
  return excerpt;
};

/**
 * "Visit Website" tool: fetches a page and returns its title, headings,
 * ranked links, downloaded images and a text excerpt.
 *
 * Limits configured in the plugin settings win over the tool arguments
 * (-1 in the settings means "auto"). When search terms are given and the page
 * text exceeds `contentLimit`, the excerpt is built around the terms; if none
 * of them occurs, the beginning of the text is returned instead.
 *
 * Returns a result object, or a plain string describing an abort/error.
 */
const visitWebsiteTool = tool({
  name: "Visit Website",
  description: "Visit a website and return its title, headings, links, images, and text content. Images are automatically downloaded and viewable.",
  parameters: {
    url: z.string().url().describe("The URL of the website to visit"),
    findInPage: z.array(z.string()).optional().describe("Highly recommended! Optional search terms to prioritize which links, images, and content to return."),
    maxLinks: z.number().int().min(0).max(200).optional().describe("Maximum number of links to extract from the page."),
    maxImages: z.number().int().min(0).max(200).optional().describe("Maximum number of images to extract from the page."),
    contentLimit: z.number().int().min(0).max(10_000).optional().describe("Maximum text content length to extract from the page."),
  },
  implementation: async ({ url, maxLinks, maxImages, contentLimit, findInPage: searchTerms }, context) => {
    const { status, warn, signal } = context;
    status("Visiting website...");
    try {
      maxLinks = undefinedIfAuto(ctl.getPluginConfig(configSchematics).get("maxLinks"), -1)
        ?? maxLinks
        ?? 40;
      maxImages = undefinedIfAuto(ctl.getPluginConfig(configSchematics).get("maxImages"), -1)
        ?? maxImages
        ?? 10;
      contentLimit = undefinedIfAuto(ctl.getPluginConfig(configSchematics).get("contentLimit"), -1)
        ?? contentLimit
        ?? 2000;
      const { root, source, finalUrl, statusCode, serverHeader } = await fetchHTML(url, signal, warn);
      status("Website visited successfully.");

      // After a redirect, relative references belong to the final URL. For the
      // Jina path `finalUrl` is the proxy address, so the requested URL is used.
      const baseUrl = source === "direct" ? finalUrl : url;

      const title = root.querySelector("title")?.text?.trim() ?? "";
      const firstHeadings = (selector: string): string[] =>
        root.querySelectorAll(selector)
          .slice(0, 5)
          .map(node => node.text.replace(/\s+/g, " ").trim())
          .filter(Boolean);
      const headings = {
        h1: firstHeadings("h1"),
        h2: firstHeadings("h2"),
        h3: firstHeadings("h3"),
      };

      const links = maxLinks ? extractLinks(root, baseUrl, maxLinks, searchTerms) : undefined;
      const imagesToFetch = maxImages ? extractImages(root, baseUrl, maxImages, searchTerms) : [];
      const imagesPayload = maxImages
        ? await downloadImagesTool.implementation({ imageURLs: imagesToFetch.map(x => x[1]) }, context) as any
        : undefined;
      const imageMarkdowns: string[] = Array.isArray(imagesPayload)
        ? imagesPayload
        : imagesPayload?.images || [];
      // NOTE(review): alt texts are paired by position; if some downloads fail,
      // the remaining markdown entries may be paired with the wrong alt text.
      const images = maxImages
        ? imageMarkdowns.map((markdown, index) => [imagesToFetch[index]?.[0] || "", markdown] as [string, string])
        : undefined;

      // Work on a copy so the cached document tree is not mutated.
      const textRoot = parse(root.toString());
      for (const tag of textRoot.querySelectorAll("script,style,nav,footer,header,aside")) {
        tag.remove();
      }
      const main = textRoot.querySelector("article") ?? textRoot.querySelector("main") ?? textRoot;
      const allContent = contentLimit
        ? main.text.replace(/\s{3,}/g, "\n\n").replace(/\t/g, " ").trim()
        : "";

      let content = "";
      if (searchTerms?.length && contentLimit < allContent.length) {
        content = collectTermSnippets(allContent, searchTerms, contentLimit);
      }
      if (!content) {
        // No search terms, short page, or no term found: return the leading text.
        content = allContent.slice(0, contentLimit);
      }

      return {
        url, title, headings,
        fetch: { source, finalUrl, statusCode, server: serverHeader || undefined },
        ...(links ? { links } : {}),
        ...(images ? { images } : {}),
        ...(imagesPayload?.compactGalleryMarkdown ? { compactGalleryMarkdown: imagesPayload.compactGalleryMarkdown } : {}),
        ...(imagesPayload?.hint ? { hint: imagesPayload.hint } : {}),
        ...(content ? { content } : {}),
      };
    } catch (error: unknown) {
      if (error instanceof DOMException && error.name === "AbortError") {
        return "Website visit aborted by user.";
      }
      console.error(error);
      const message = error instanceof Error ? error.message : String(error);
      warn(`Error during website visit: ${message}`);
      return `Error: ${message}`;
    }
  },
});
// Register both tools in one call; the website tool stays first in the list.
tools.push(visitWebsiteTool, downloadImagesTool);
return tools;
}
/**
 * Maps the "auto" sentinel of a setting to `undefined` so it can be chained
 * with `??` fallbacks; any other value is returned unchanged.
 *
 * @param value The configured value.
 * @param autoValue The sentinel that means "not configured" (compared with `===`).
 * @returns `undefined` when `value` is the sentinel, otherwise `value`.
 */
const undefinedIfAuto = <T>(value: T, autoValue: unknown): T | undefined =>
  value === autoValue ? undefined : value;
/** Browser user-agent strings; one is picked at random for each request. */
const spoofedUserAgents = [
  // Chrome 124 - Windows
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  // Chrome 124 - macOS
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  // Chrome 124 - Linux
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  // Chrome 124 - Android
  "Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.82 Mobile Safari/537.36",
  // Edge 124 - Windows
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
  // Firefox 125 - Windows
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
  // Firefox 125 - macOS
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.4; rv:125.0) Gecko/20100101 Firefox/125.0",
  // Firefox 125 - Linux
  "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0",
  // Safari 17 - macOS
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15",
  // Safari 17 - iPhone
  "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Mobile/15E148 Safari/604.1",
];

/**
 * Builds browser-like request headers for a URL.
 *
 * The `Referer` is the root of the URL's own origin (scheme, host and port),
 * which keeps it consistent with `Sec-Fetch-Site: same-origin` for http URLs
 * and non-default ports as well.
 *
 * @param url Absolute URL that is about to be requested.
 * @returns A header map with a randomly chosen user agent.
 * @throws TypeError when `url` cannot be parsed.
 */
function spoofHeaders(url:string) {
  const { origin, hostname } = new URL(url);
  // Opaque origins serialise as "null"; fall back to the host in that case.
  const referer = origin !== "null" ? origin + '/' : 'https://' + hostname + '/';
  return {
    'User-Agent': spoofedUserAgents[Math.floor(Math.random() * spoofedUserAgents.length)],
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': referer,
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Cache-Control': 'max-age=0',
  };
}
/**
 * Turns downloaded images into the payload handed back to the model.
 *
 * Each image becomes a markdown image that points at its thumbnail path, so
 * the entries in `images` (and the gallery built from them) are directly
 * embeddable.
 *
 * NOTE(review): `inlineLimit` and `galleryLimit` are accepted but not applied
 * to the output — confirm whether they were meant to cap it.
 *
 * @param localImages Downloaded images with their full-size and thumbnail paths.
 * @param inlineLimit Currently unused.
 * @param galleryLimit Currently unused.
 * @returns Image count, markdown per image, a two-column gallery table and a usage hint.
 */
function buildImagePresentationPayload(
  localImages: Array<{ fullPath: string; thumbPath: string }>,
  inlineLimit = 2,
  galleryLimit = 6,
) {
  const thumbImages = localImages.map((image, i) => `![Image ${i + 1}](${image.thumbPath})`);
  const hint = "These images are already downloaded and renderable (stored locally). Use compactGalleryMarkdown when you want a compact table gallery; otherwise, please, embed images where they fit with ![alt text](path). The user will not see images unless you embed them. Only standard Markdown (no HTML <img>). Do NOT call Download Images as these are already downloaded local files";
  return {
    count: localImages.length,
    images: thumbImages,
    compactGalleryMarkdown: buildCompactGalleryMarkdown(thumbImages),
    hint,
    thumbnailsUsed: true,
  };
}
/**
 * Downloads one image into the working directory and creates a WebP thumbnail.
 *
 * Local references are not downloaded; they are only checked for existence.
 * Every failure (HTTP error, non-image content type, empty body, network
 * error) is reported through `warn` and results in `null`; an abort results
 * in `null` without a warning.
 *
 * @param url Remote image URL or local path.
 * @param index 1-based position, used in file names and warnings.
 * @param timestamp Prefix shared by all files of one download run.
 * @param workingDirectory Directory that receives the files.
 * @param signal Caller's cancellation signal.
 * @param warn Receives a message for every image that is skipped.
 * @returns Paths of the image and its thumbnail (file names relative to the
 *          working directory for downloads), or `null` when the image was skipped.
 */
async function downloadSingleImage(
  url: string,
  index: number,
  timestamp: number,
  workingDirectory: string,
  signal: AbortSignal,
  warn: (msg: string) => void,
): Promise<{ fullPath: string; thumbPath: string } | null> {
  if (isLocalImagePath(url)) {
    const safe = await resolveLocalImagePathForCurrentWorkingDirectory(url, workingDirectory);
    return safe ? { fullPath: safe, thumbPath: safe } : null;
  }
  try {
    const headers = spoofHeaders(url);
    const imageResponse = await fetch(url, { method: "GET", signal, headers });
    if (!imageResponse.ok) {
      warn(`Failed to fetch image ${index}: ${imageResponse.statusText}`);
      return null;
    }
    const contentType = imageResponse.headers.get("content-type") ?? "";
    if (!contentType.startsWith("image/")) {
      warn(`Image ${index} has unexpected content-type "${contentType}", skipping.`);
      return null;
    }
    // arrayBuffer() is available on every fetch implementation, unlike the newer bytes().
    const bytes = new Uint8Array(await imageResponse.arrayBuffer());
    if (bytes.length === 0) {
      warn(`Image ${index} is empty: ${url}`);
      return null;
    }
    // Extension: from the content type, else from the URL, else "jpg".
    const ext = /image\/([\w]+)/.exec(contentType)?.[1]
      || /\.([\w]+)(?:\?.*)?$/.exec(url)?.[1]
      || "jpg";
    const fileName = `${timestamp}-${index}.${ext}`;
    await writeFile(join(workingDirectory, fileName), bytes);
    const thumbFileName = `${timestamp}-${index}-thumb.webp`;
    const thumbCreated = await createThumbnailWebp(bytes, join(workingDirectory, thumbFileName));
    // Without a thumbnail, the full-size file doubles as the thumbnail.
    return { fullPath: fileName, thumbPath: thumbCreated ? thumbFileName : fileName };
  } catch (error: unknown) {
    if (error instanceof DOMException && error.name === "AbortError") return null;
    const message = error instanceof Error ? error.message : String(error);
    warn(`Error fetching image ${index}: ${message}`);
    return null;
  }
}
/**
 * Reduces an image reference to a bare URL or path.
 *
 * Accepts a plain reference or markdown image syntax (`![alt](target)`), in
 * which case the target is used. Only http(s) URLs, paths starting with "/"
 * and Windows drive paths are accepted.
 *
 * @param input Raw reference as supplied by the caller.
 * @returns The cleaned reference, or null when it is empty or of an unsupported form.
 */
function normalizeImageReference(input: string): string | null {
  const cleaned = input.trim();
  if (cleaned.length === 0) {
    return null;
  }

  const embedded = /!\[[^\]]*\]\(([^)]+)\)/.exec(cleaned);
  const target = embedded?.[1]?.trim() || cleaned;

  const acceptedForms = [/^https?:\/\//i, /^\//, /^[a-zA-Z]:[\\/]/];
  return acceptedForms.some(form => form.test(target)) ? target : null;
}
/**
 * Lays out image markdown as a two-column markdown table.
 *
 * @param imageMarkdowns Markdown snippets, one per image.
 * @returns The table (header, separator, one row per pair), or "" for no images.
 *          An odd last image gets an empty right-hand cell.
 */
function buildCompactGalleryMarkdown(imageMarkdowns: string[]): string {
  if (!imageMarkdowns.length) {
    return "";
  }
  const rowCount = Math.ceil(imageMarkdowns.length / 2);
  const bodyRows = Array.from({ length: rowCount }, (_, row) => {
    const first = imageMarkdowns[row * 2] || "";
    const second = imageMarkdowns[row * 2 + 1] || "";
    return "| " + first + " | " + second + " |";
  });
  return ["| 1 | 2 |", "| --- | --- |", ...bodyRows].join("\n");
}
// Width passed to the thumbnail resize step in createThumbnailWebp.
const THUMB_MAX_WIDTH = 360;
// Quality value passed to the WebP encoder in createThumbnailWebp.
const THUMB_WEBP_QUALITY = 60;
/**
 * Tells whether a reference is written like a filesystem path.
 *
 * @param value Reference to classify.
 * @returns true for values starting with "/", "./", "../" or a Windows drive prefix such as "C:\".
 */
function isLocalImagePath(value: string): boolean {
  const pathPrefixes = ["/", "./", "../"];
  if (pathPrefixes.some(prefix => value.startsWith(prefix))) {
    return true;
  }
  return /^[a-zA-Z]:[\\/]/.test(value);
}
/**
 * Writes a WebP thumbnail of an image, on a best-effort basis.
 *
 * `sharp` is loaded lazily; if it cannot be loaded, or anything else goes
 * wrong, the function reports failure instead of throwing.
 *
 * @param inputBytes Encoded source image.
 * @param outputPath Destination file for the thumbnail.
 * @returns true when the thumbnail was written, false otherwise.
 */
async function createThumbnailWebp(
  inputBytes: Uint8Array,
  outputPath: string,
): Promise<boolean> {
  try {
    const imported = await import("sharp");
    const sharp = (imported as any).default || imported;
    const pipeline = sharp(Buffer.from(inputBytes))
      .resize({ width: THUMB_MAX_WIDTH, fit: "inside", withoutEnlargement: true })
      .webp({ quality: THUMB_WEBP_QUALITY });
    const encoded = await pipeline.toBuffer();
    if (encoded?.length) {
      await writeFile(outputPath, encoded, "binary");
      return true;
    }
    return false;
  } catch {
    // Missing dependency, undecodable input or write failure: no thumbnail.
    return false;
  }
}
/**
 * Checks whether a local image reference points at an existing file.
 *
 * Three spellings are probed in order: the path as given, the path with
 * backslashes turned into forward slashes, and the bare file name inside the
 * working directory.
 *
 * @param inputPath Local path as supplied by the caller.
 * @param workingDirectory Directory that relative paths and bare file names are looked up in.
 * @returns The bare file name when that probe matched, the forward-slash form
 *          when one of the path probes matched, or null when nothing exists.
 */
async function resolveLocalImagePathForCurrentWorkingDirectory(
  inputPath: string,
  workingDirectory: string,
): Promise<string | null> {
  if (!inputPath) return null;

  const forwardSlashed = inputPath.replace(/\\/g, "/");
  const baseName = forwardSlashed.split("/").pop() || "";
  const attempts = [inputPath, forwardSlashed, baseName].filter(Boolean);

  for (const attempt of attempts) {
    const isBareName = attempt === baseName;
    const probePath = isBareName
      ? join(workingDirectory, baseName)
      : toAbsolutePath(attempt, workingDirectory);
    try {
      await access(probePath);
    } catch {
      continue; // Not accessible: move on to the next spelling.
    }
    return isBareName ? baseName : forwardSlashed;
  }
  return null;
}
/**
 * Maps a path-like reference to the location that is probed on disk.
 *
 * - Windows drive paths are returned unchanged.
 * - Paths starting with "/Users/" lose their leading slash and have every "/"
 *   replaced by a backslash.
 *   NOTE(review): the result is not an absolute path — confirm the intent.
 * - Everything else, including other "/"-prefixed paths, is joined onto the
 *   working directory.
 *
 * @param pathLike Reference to convert.
 * @param workingDirectory Base directory for joined paths.
 * @returns The converted path.
 */
function toAbsolutePath(pathLike: string, workingDirectory: string): string {
  const hasDrivePrefix = /^[a-zA-Z]:[\\/]/.test(pathLike);
  if (hasDrivePrefix) {
    return pathLike;
  }
  if (pathLike.startsWith("/Users/")) {
    return pathLike.slice(1).split("/").join("\\");
  }
  return join(workingDirectory, pathLike);
}