// src/toolsProvider.ts
/**
 * lms-plugin-deep-search — toolsProvider.ts
 *
 * Exposes ONE tool: deep_search
 *
 * Workflow:
 * 1. Search Phase — query the DuckDuckGo HTML endpoint; fall back to Google if the DuckDuckGo search fails.
 * 2. Scrape Phase — visit each of the top N URLs in parallel using axios + cheerio.
 * 3. Report Phase — combine cleaned extracts into a single token-safe string (≤ MAX_TOTAL_CHARS, currently 32000 chars).
 *
 * Fixes shipped:
 * • dns.setServers(['8.8.8.8', '8.8.4.4']) — avoids ENOTFOUND on restrictive hosts
 *   (NOTE(review): per the Node.js docs, dns.setServers() does not affect dns.lookup(),
 *   which http/https requests use by default — confirm this has the intended effect)
 * • axios replaces fetch() — honours the custom DNS resolver reliably
 * • Cheerio strips scripts/styles/nav/footer — clean text only
 * • User-Agent rotation (19 real UA strings)
 * • DuckDuckGo → Google fallback search
 * • Hard cap: total output ≤ MAX_TOTAL_CHARS (32000) chars
 */
import dns from "dns";
dns.setServers(["8.8.8.8", "8.8.4.4"]);
import axios, { AxiosRequestConfig } from "axios";
import * as cheerio from "cheerio";
import { tool, Tool, ToolsProviderController } from "@lmstudio/sdk";
import { z } from "zod";
import { configSchematics } from "./config";
// ─── Constants ───────────────────────────────────────────────────────────────
/** Upper bound, in characters, on the report string produced by buildReport. */
const MAX_TOTAL_CHARS = 32_000;

/** Per-request timeout handed to axios, in milliseconds. */
const REQUEST_TIMEOUT_MS = 12000;

/** Minimum spacing, in milliseconds, enforced between consecutive deep_search runs. */
const SEARCH_DELAY_MS = 1500;

/**
 * Pool of browser User-Agent strings; one is drawn at random for each request
 * that goes through buildHeaders. Order is irrelevant to callers.
 */
const SPOOFED_USER_AGENTS: string[] = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.4; rv:124.0) Gecko/20100101 Firefox/124.0",
  "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0",
  "Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.82 Mobile Safari/537.36",
  "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Mobile/15E148 Safari/604.1",
  "Mozilla/5.0 (Linux; Android 13; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.230 Mobile Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
  "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.62",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36",
  "Mozilla/5.0 (Linux; Android 10; SM-M515F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36",
  "Mozilla/5.0 (Linux; Android 9; POT-LX1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36",
  "Opera/9.80 (Android 7.0; Opera Mini/36.2.2254/119.132; U; id) Presto/2.12.423 Version/12.16",
];
// ─── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Draws one User-Agent string from SPOOFED_USER_AGENTS using Math.random().
 *
 * @returns A pseudo-randomly selected entry of the pool.
 */
function randomUA(): string {
  const index = Math.floor(Math.random() * SPOOFED_USER_AGENTS.length);
  return SPOOFED_USER_AGENTS[index];
}
/**
 * Assembles a browser-like set of request headers.
 *
 * @param referer Value sent in the `Referer` header.
 * @returns A new header map; the User-Agent is re-drawn via randomUA() on every call.
 */
function buildHeaders(referer: string): Record<string, string> {
  const headers: Record<string, string> = {};

  // Keys are added in the same order the request should list them.
  headers["User-Agent"] = randomUA();
  headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
  headers["Accept-Language"] = "en-US,en;q=0.9";
  // NOTE(review): advertising `br` assumes the installed axios build can decode Brotli responses — confirm.
  headers["Accept-Encoding"] = "gzip, deflate, br";
  headers["Referer"] = referer;
  headers["Connection"] = "keep-alive";
  headers["Upgrade-Insecure-Requests"] = "1";
  headers["Sec-Fetch-Dest"] = "document";
  headers["Sec-Fetch-Mode"] = "navigate";
  headers["Sec-Fetch-Site"] = "none";
  headers["Cache-Control"] = "max-age=0";

  return headers;
}
/**
 * Produces the axios request options shared by every outgoing request in this file.
 *
 * @param referer Forwarded to buildHeaders() for the `Referer` header.
 * @returns Options with the module timeout, spoofed headers, at most 5 redirects,
 *          a text response body, and a status validator that accepts 200–299 only.
 */
function axiosCfg(referer: string): AxiosRequestConfig {
  const config: AxiosRequestConfig = {
    timeout: REQUEST_TIMEOUT_MS,
    headers: buildHeaders(referer),
    maxRedirects: 5,
    // Deliver the body as a string rather than parsed JSON.
    responseType: "text",
    // Anything outside 2xx makes axios reject the request.
    validateStatus: (status) => 200 <= status && status < 300,
  };
  return config;
}
/**
 * Promise-based delay.
 *
 * @param ms Delay in milliseconds, passed straight to setTimeout.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
// ─── Phase 1: Search ─────────────────────────────────────────────────────────
/** One hit extracted from a search-engine results page. */
interface SearchResult {
/** Visible link text of the hit; the DuckDuckGo regex fallback stores "" here. */
title: string;
/** Destination URL of the hit; the producers in this file only store values that start with "http". */
url: string;
}
/**
 * Runs the search phase: DuckDuckGo first, Google as a fallback.
 *
 * Any error thrown by the DuckDuckGo search (request failure, non-2xx status,
 * or "no results") triggers the Google attempt. `safeSearch` is forwarded to
 * DuckDuckGo only.
 *
 * @param query      Search query text.
 * @param numResults Maximum number of results to return.
 * @param safeSearch Safe-search level passed to DuckDuckGo.
 * @param warn       Receives a human-readable notice when the fallback is used.
 * @returns Results from whichever engine succeeded.
 * @throws Error when both engines fail; the message contains both failure reasons.
 */
async function searchWeb(
  query: string,
  numResults: number,
  safeSearch: "strict" | "moderate" | "off",
  warn: (msg: string) => void,
): Promise<SearchResult[]> {
  // A thrown value is not guaranteed to be an Error, so never read `.message` blindly.
  const describe = (thrown: unknown): string =>
    thrown instanceof Error ? thrown.message : String(thrown);

  try {
    return await searchDuckDuckGo(query, numResults, safeSearch);
  } catch (ddgErr: unknown) {
    const ddgMessage = describe(ddgErr);
    warn(`DuckDuckGo failed (${ddgMessage}), falling back to Google…`);
    try {
      return await searchGoogle(query, numResults);
    } catch (googleErr: unknown) {
      throw new Error(
        `Both search engines failed. DDG: ${ddgMessage} | Google: ${describe(googleErr)}`,
      );
    }
  }
}
/**
 * Queries the DuckDuckGo HTML endpoint and extracts up to `numResults` distinct result URLs.
 *
 * Links are read from `a.result__a` anchors; if none are found, a regex over the
 * raw HTML is used as a fallback (those results have an empty title).
 *
 * @param query      Search query text.
 * @param numResults Maximum number of results to return.
 * @param safeSearch "strict" and "off" add a `p` query parameter; "moderate" adds nothing.
 * @returns Distinct results in page order.
 * @throws Error "No results returned by DuckDuckGo" when nothing could be extracted;
 *         axios errors (timeout, non-2xx status, network failure) propagate unchanged.
 */
async function searchDuckDuckGo(
  query: string,
  numResults: number,
  safeSearch: "strict" | "moderate" | "off",
): Promise<SearchResult[]> {
  const url = new URL("https://duckduckgo.com/html/");
  url.searchParams.set("q", query);
  // NOTE(review): DuckDuckGo's documented safe-search URL parameter is `kp`; confirm that `p` is honoured.
  if (safeSearch === "strict") url.searchParams.set("p", "-1");
  if (safeSearch === "off") url.searchParams.set("p", "1");

  const resp = await axios.get<string>(url.toString(), axiosCfg("https://duckduckgo.com/"));
  const html: string = resp.data;

  const results: SearchResult[] = [];
  const seen = new Set<string>();

  const $ = cheerio.load(html);
  $("a.result__a").each((_i, el) => {
    if (results.length >= numResults) return false; // stop iterating

    const rawHref = $(el).attr("href") || "";
    let finalUrl = rawHref;
    try {
      // Result links are redirects that carry the real target in the "uddg" parameter.
      const parsed = new URL(rawHref, "https://duckduckgo.com");
      const uddg = parsed.searchParams.get("uddg");
      // searchParams.get() already percent-decodes the value. Decoding a second
      // time would corrupt targets that legitimately contain escapes (%20, %26, …)
      // and throw on a literal '%'.
      if (uddg) finalUrl = uddg;
    } catch {
      /* unparseable href: keep rawHref and let the "http" check below decide */
    }

    if (finalUrl.startsWith("http") && !seen.has(finalUrl)) {
      seen.add(finalUrl);
      results.push({ title: $(el).text().trim(), url: finalUrl });
    }
  });

  // Fallback regex if cheerio found nothing (e.g. the result markup changed).
  if (results.length === 0) {
    const regex = /href="[^"]*uddg=(https?[^&"]+)/g;
    let m: RegExpExecArray | null;
    while (results.length < numResults && (m = regex.exec(html))) {
      let finalUrl: string;
      try {
        // Here the capture is raw attribute text, so exactly one decode is required.
        finalUrl = decodeURIComponent(m[1]);
      } catch {
        continue; // malformed escape sequence: skip this link instead of failing the search
      }
      if (!seen.has(finalUrl)) {
        seen.add(finalUrl);
        results.push({ title: "", url: finalUrl });
      }
    }
  }

  if (results.length === 0) throw new Error("No results returned by DuckDuckGo");
  return results;
}
/**
 * Scrapes a Google results page and extracts up to `numResults` distinct external links.
 *
 * @param query      Search query text.
 * @param numResults Maximum number of results to return (the request asks for two extra).
 * @returns Distinct results in page order; any URL containing "google.com" is skipped.
 * @throws Error "No results returned by Google" when no link qualifies;
 *         axios errors propagate unchanged.
 */
async function searchGoogle(query: string, numResults: number): Promise<SearchResult[]> {
  const endpoint = new URL("https://www.google.com/search");
  endpoint.searchParams.set("q", query);
  endpoint.searchParams.set("num", String(numResults + 2)); // ask a few extra
  endpoint.searchParams.set("hl", "en");

  // The shared config is reused, but its headers are replaced wholesale by a
  // fixed desktop-Chrome identity (no rotation, no Referer).
  const requestConfig: AxiosRequestConfig = {
    ...axiosCfg("https://www.google.com/"),
    headers: {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.9",
    },
  };
  const response = await axios.get<string>(endpoint.toString(), requestConfig);

  // Google wraps outbound links as /url?q=<target>&...; anything else is returned untouched.
  const unwrap = (href: string): string => {
    if (!href.startsWith("/url?")) return href;
    try {
      const redirect = new URL("https://www.google.com" + href);
      return redirect.searchParams.get("q") || href;
    } catch {
      return href;
    }
  };

  const $ = cheerio.load(response.data);
  const hits: SearchResult[] = [];

  $("a").each((_index, anchor) => {
    if (hits.length >= numResults) return false; // stop iterating

    const target = unwrap($(anchor).attr("href") || "");
    const isExternalHttp = target.startsWith("http") && !target.includes("google.com");
    if (isExternalHttp && !hits.some((hit) => hit.url === target)) {
      hits.push({ title: $(anchor).text().trim(), url: target });
    }
  });

  if (hits.length === 0) throw new Error("No results returned by Google");
  return hits;
}
// ─── Phase 2: Scrape ─────────────────────────────────────────────────────────
/** Outcome of scraping a single URL; produced by scrapePage and consumed by buildReport. */
interface ScrapeResult {
/** The URL that was requested. */
url: string;
/**
 * Page <title> text, or the URL's hostname when the title is empty.
 * When the request fails or the content type is rejected, this is the URL itself.
 */
title: string;
/** Cleaned, length-limited page text; "" whenever `error` is set. */
text: string;
/** Set when the request failed or the content was rejected; such entries are reported as failures. */
error?: string;
}
/**
 * Downloads one page and reduces it to a cleaned plain-text extract.
 *
 * Never rejects: request errors, unsupported content types and "mostly garbage"
 * content are all reported through the `error` field of the result.
 *
 * @param url      Absolute URL to fetch.
 * @param maxChars Maximum length of the returned `text`.
 * @returns The page title and cleaned text, or an entry with `error` set and `text` "".
 */
async function scrapePage(url: string, maxChars: number): Promise<ScrapeResult> {
  try {
    const resp = await axios.get<string>(url, axiosCfg(new URL(url).origin + "/"));

    // ── MIME type check: only text/html, text/plain and application/json are processed ──
    const contentType = String(resp.headers["content-type"] ?? "").toLowerCase();
    const allowedMimeTypes = ["text/html", "text/plain", "application/json"];
    if (!allowedMimeTypes.some((type) => contentType.includes(type))) {
      return {
        url,
        title: url,
        text: "",
        error: `Unsupported format: ${contentType || "unknown"}. Only text/html, text/plain, or application/json are supported.`,
      };
    }

    const html: string = resp.data;
    const $ = cheerio.load(html);

    // Strip non-content elements before extracting text.
    $(
      "script, style, noscript, iframe, nav, footer, header, " +
        "aside, .sidebar, .ad, .ads, .advertisement, .cookie-banner, " +
        "[class*='cookie'], [id*='cookie'], [class*='popup'], [id*='popup'], " +
        "[class*='newsletter'], [class*='subscribe'], [role='banner'], [role='navigation']",
    ).remove();

    const title = $("title").first().text().trim() || new URL(url).hostname;

    // Prefer <article>, then <main>, else fall back to <body>.
    const article = $("article").first();
    const main = $("main").first();
    const container = article.length ? article : main.length ? main : $("body");

    let rawText = container
      .text()
      .replace(/\s{3,}/g, "\n\n") // long whitespace runs become a paragraph break
      .replace(/\t/g, " ")
      .trim();

    // ── Sanitization ──
    // Drop control characters but KEEP '\n'. The previous pattern also removed
    // newlines, which glued paragraphs together ("end.Next") and turned the
    // newline collapse below into dead code.
    rawText = rawText.replace(/[^\n\x20-\x7E\u00A0-\uFFFF]/g, "");
    rawText = rawText.replace(/\n\n+/g, "\n\n");
    rawText = rawText.replace(/ +/g, " ");

    // ── Safety check: discard if mostly garbage ──
    // Letters, combining marks and digits of ANY script count as legitimate text.
    // An ASCII-only class would classify every Cyrillic, CJK, Arabic, … page as garbage.
    const nonTextChars = (rawText.match(/[^\p{L}\p{M}\p{N}\s.,;:!?\-'"()]/gu) || []).length;
    const garbageRatio = rawText.length > 0 ? nonTextChars / rawText.length : 0;
    if (garbageRatio > 0.3) {
      return {
        url,
        title,
        text: "",
        error: `Content is mostly garbage (${Math.round(garbageRatio * 100)}% non-dictionary characters). Discarded to protect context window.`,
      };
    }

    return { url, title, text: rawText.slice(0, maxChars) };
  } catch (err: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = err instanceof Error ? err.message : String(err);
    return { url, title: url, text: "", error: message };
  }
}
// ─── Phase 3: Report ─────────────────────────────────────────────────────────
/**
 * Renders the scrape results as one Markdown report and enforces a hard length cap.
 *
 * @param query    The original search query, echoed in the report header.
 * @param scrapes  Scrape outcomes, rendered in the given order; entries with `error`
 *                 set become a one-line failure section.
 * @param maxChars Maximum length of the returned string. Defaults to MAX_TOTAL_CHARS.
 * @returns The report, never longer than `maxChars`. When truncated it ends with a
 *          notice, unless `maxChars` is too small to hold the notice, in which case
 *          the report is simply cut.
 */
function buildReport(
  query: string,
  scrapes: ScrapeResult[],
  maxChars: number = MAX_TOTAL_CHARS,
): string {
  const header = `## Deep Search Report\n**Query:** ${query}\n\n`;

  const sections = scrapes.map((s) =>
    s.error
      ? `### ❌ ${s.url}\n_Failed to scrape: ${s.error}_\n`
      : `### ${s.title}\n**URL:** ${s.url}\n\n${s.text}\n`
  );

  const report = header + sections.join("\n---\n\n");
  if (report.length <= maxChars) return report;

  // Reserve exactly the notice's length instead of a fixed guess, so the cap
  // holds for any limit and no budget is wasted.
  const notice = `\n\n_[Report truncated to ${maxChars} characters to protect context window]_`;
  if (notice.length >= maxChars) return report.slice(0, Math.max(0, maxChars));
  return report.slice(0, maxChars - notice.length) + notice;
}
// ─── Tool Registration ───────────────────────────────────────────────────────
/**
 * Plugin entry point: builds the tool list, which consists of the single `deep_search` tool.
 *
 * Each `deep_search` call runs search → parallel scrape → report. Failures are
 * returned to the caller as an "Error: …" string rather than thrown.
 *
 * @param ctl Controller used to read the plugin configuration on every call.
 * @returns An array containing only the `deep_search` tool.
 */
export async function toolsProvider(ctl: ToolsProviderController): Promise<Tool[]> {
  // Timestamp of the most recent search, shared by all calls to the tool created below.
  let lastSearchTimestamp = 0;

  const deepSearchTool = tool({
    name: "deep_search",
    description:
      "Perform a full web research cycle in one call: " +
      "(1) search DuckDuckGo for the query (falls back to Google on network errors), " +
      "(2) automatically visit and scrape the top results, " +
      "(3) return a single cleaned, token-safe report with the combined content. " +
      "Use this whenever you need up-to-date information from the internet.",
    parameters: {
      query: z.string().describe("The research question or search query."),
      // `.coerce` is applied so that non-number inputs are converted before validation.
      numResults: z.coerce
        .number().int().min(1).max(5).optional()
        .describe("Number of websites to visit (1–5). Default: 3."),
      safeSearch: z
        .enum(["strict", "moderate", "off"]).optional()
        .describe("Safe search level. Default: moderate."),
    },
    implementation: async ({ query, numResults, safeSearch }, { status, warn, signal }) => {
      // ── Config overrides: call arguments win over plugin config, which wins over defaults ──
      const cfg = ctl.getPluginConfig(configSchematics);
      // NOTE(review): assumes a configured numResults is a positive integer — confirm against configSchematics.
      numResults = numResults ?? cfg.get("numResults") ?? 3;
      safeSearch = safeSearch
        ?? (cfg.get("safeSearch") === "auto" ? "moderate" : cfg.get("safeSearch") as "strict" | "moderate" | "off")
        ?? "moderate";
      const maxCharsPerSite: number = (cfg.get("maxCharsPerSite") as number | undefined) ?? 8000;

      // ── Rate-limit guard: keep at least SEARCH_DELAY_MS between consecutive searches ──
      const now = Date.now();
      const gap = now - lastSearchTimestamp;
      if (gap < SEARCH_DELAY_MS) await sleep(SEARCH_DELAY_MS - gap);
      lastSearchTimestamp = Date.now();

      // The abort signal is only polled between phases; it is not handed to the HTTP requests.
      if (signal.aborted) return "Search aborted.";

      try {
        // ── Phase 1: Search ──
        status(`🔍 Searching the web for: "${query}"…`);
        const searchResults = await searchWeb(query, numResults, safeSearch, warn);
        status(`✅ Found ${searchResults.length} URL(s). Scraping content…`);
        if (signal.aborted) return "Search aborted.";

        // ── Phase 2: Scrape (parallel) ──
        // scrapePage reports failures via its result, so Promise.all does not reject on a bad page.
        const scrapes = await Promise.all(
          searchResults.map((r) => scrapePage(r.url, maxCharsPerSite)),
        );
        const successCount = scrapes.filter((s) => !s.error).length;
        status(`📄 Scraped ${successCount}/${scrapes.length} page(s) successfully.`);
        if (signal.aborted) return "Search aborted.";

        // ── Phase 3: Build report ──
        const report = buildReport(query, scrapes);
        status(`✔ Deep search complete. Report: ${report.length} chars.`);
        return report;
      } catch (err: unknown) {
        if (err instanceof DOMException && err.name === "AbortError") {
          return "Deep search aborted by user.";
        }
        // Thrown values are not guaranteed to be Error instances.
        const message = err instanceof Error ? err.message : String(err);
        console.error("[deep_search] fatal error:", err);
        warn(`Deep search failed: ${message}`);
        return `Error: ${message}`;
      }
    },
  });

  return [deepSearchTool];
}