// search.js
"use strict";
/**
* Custom web search — no API key, no rate limits.
*
* Priority:
* 1. SearXNG (self-hosted meta-search) — best quality, aggregates many engines
* 2. DuckDuckGo HTML scraper — no API, no account
* 3. Bing HTML scraper — fallback
*
* Pass searxngUrl (e.g. "http://localhost:8080") to enable SearXNG.
* Falls back automatically when any tier fails or returns 0 results.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.webSearch = webSearch;
const UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
/**
 * Named character references understood by decodeHtmlEntities, keyed by the
 * text that appears between "&" and ";". The non-breaking space is mapped to a
 * plain space so callers see ordinary blanks.
 */
const HTML_NAMED_ENTITIES = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'", nbsp: " " };
/**
 * Decode HTML character references in a string.
 *
 * Supports the named references amp, lt, gt, quot, apos and nbsp, plus
 * decimal ("#39") and hexadecimal ("#x27") numeric references. Decoding is a
 * single pass over the input, so the output of one replacement is never
 * decoded again (an escaped "lt" reference stays escaped once).
 * Unknown names and out-of-range numeric references are left untouched.
 *
 * @param {string} s Text that may contain HTML character references.
 * @returns {string} The text with recognised references replaced.
 */
function decodeHtmlEntities(s) {
    return s.replace(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));/g, (whole, dec, hex, name) => {
        if (name !== undefined) {
            // Own-property check so names like "constructor" are not resolved via the prototype.
            return Object.prototype.hasOwnProperty.call(HTML_NAMED_ENTITIES, name)
                ? HTML_NAMED_ENTITIES[name]
                : whole;
        }
        const codePoint = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
        if (codePoint === 0xa0) {
            // Keep numeric and named non-breaking spaces consistent.
            return " ";
        }
        const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
        if (codePoint === 0 || codePoint > 0x10ffff || isSurrogate) {
            // String.fromCodePoint would throw or yield an unusable character.
            return whole;
        }
        return String.fromCodePoint(codePoint);
    });
}
/**
 * Reduce an HTML fragment to plain text.
 *
 * Each tag is replaced by a single space, runs of whitespace are collapsed to
 * one space, the ends are trimmed, and the result is finally passed through
 * decodeHtmlEntities. Tags are removed before decoding, so escaped angle
 * brackets in the text are not mistaken for markup.
 *
 * @param {string} html HTML fragment.
 * @returns {string} Plain-text rendering of the fragment.
 */
function stripTags(html) {
    const withoutTags = html.replace(/<[^>]+>/g, " ");
    const singleSpaced = withoutTags.replace(/\s+/g, " ");
    const trimmed = singleSpaced.trim();
    return decodeHtmlEntities(trimmed);
}
/**
 * GET a URL and return the response body as text.
 *
 * The request is sent with browser-like default headers (User-Agent, Accept,
 * Accept-Language); entries in `headers` are spread last and therefore override
 * the defaults. An AbortController aborts the request once `timeoutMs` has
 * elapsed; the timer is cleared in `finally`, i.e. after the body has been read
 * or the call has failed.
 *
 * @param {string} url Address to fetch.
 * @param {number} timeoutMs Milliseconds before the request is aborted.
 * @param {object} [headers] Extra or overriding request headers.
 * @returns {Promise<string>} The response body.
 * @throws {Error} `HTTP <status>` when the response is not ok; whatever
 *   `fetch` rejects with otherwise (including the abort on timeout).
 */
async function fetchText(url, timeoutMs, headers) {
    const aborter = new AbortController();
    const deadline = setTimeout(() => {
        aborter.abort();
    }, timeoutMs);
    try {
        const requestHeaders = {
            "User-Agent": UA,
            "Accept": "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            ...headers,
        };
        const response = await fetch(url, { signal: aborter.signal, headers: requestHeaders });
        if (response.ok) {
            return await response.text();
        }
        throw new Error(`HTTP ${response.status}`);
    }
    finally {
        clearTimeout(deadline);
    }
}
// ---------------------------------------------------------------------------
// Tier 1 — SearXNG (self-hosted, JSON API)
// Run: docker run -p 8080:8080 searxng/searxng
// ---------------------------------------------------------------------------
/**
 * Query a SearXNG instance through its JSON endpoint.
 *
 * Requests `<baseUrl>/search?q=…&format=json` (one trailing slash on `baseUrl`
 * is dropped; `time_range` is appended only when `timeRange` is truthy), parses
 * the body as JSON and keeps entries whose `url` starts with "http", up to
 * `max` of them. Missing `title` / `content` fields become empty strings.
 *
 * @param {string} baseUrl SearXNG base URL.
 * @param {string} query Search query.
 * @param {number} max Maximum number of results to keep.
 * @param {number} timeoutMs Request timeout forwarded to fetchText.
 * @param {string} [timeRange] Value forwarded as the `time_range` parameter.
 * @returns {Promise<Array<{title: string, url: string, snippet: string}>>}
 * @throws Whatever fetchText or JSON.parse throws.
 */
async function searchSearXNG(baseUrl, query, max, timeoutMs, timeRange) {
    const endpoint = `${baseUrl.replace(/\/$/, "")}/search`;
    const queryString = new URLSearchParams({ q: query, format: "json" });
    if (timeRange) {
        queryString.set("time_range", timeRange);
    }
    const body = await fetchText(`${endpoint}?${queryString}`, timeoutMs, { Accept: "application/json" });
    const payload = JSON.parse(body);
    const isWebLink = (entry) => entry.url?.startsWith("http");
    const toResult = (entry) => ({
        title: entry.title ?? "",
        url: entry.url ?? "",
        snippet: entry.content ?? "",
    });
    const usable = (payload.results ?? []).filter(isWebLink);
    return usable.slice(0, max).map(toResult);
}
// ---------------------------------------------------------------------------
// Tier 2 — DuckDuckGo HTML
// Result format:
// <a class="result__a" href="/l/?uddg=ENCODED_URL&...">Title</a>
// <div class="result__snippet">Snippet</div>
// ---------------------------------------------------------------------------
// DDG df= param: d=past day, w=past week, m=past month, y=past year
const DDG_TIME = { day: "d", week: "w", month: "m", year: "y" };
/**
 * Scrape DuckDuckGo's HTML endpoint.
 *
 * Each `result__a` anchor yields one candidate. When the href carries a `uddg`
 * query parameter its percent-decoded value is used as the URL; otherwise the
 * href itself is used. Candidates whose URL does not start with "http", whose
 * `uddg` value cannot be decoded, or whose title is empty are skipped.
 *
 * The snippet for a result is looked up only in the markup between that
 * result's anchor and the next `result__a` anchor, so a skipped candidate or a
 * result without a snippet cannot shift snippets onto the wrong result.
 *
 * @param {string} query Search query.
 * @param {number} max Maximum number of results to return.
 * @param {number} timeoutMs Request timeout forwarded to fetchText.
 * @param {string} [timeRange] Key of DDG_TIME ("day" | "week" | "month" |
 *   "year"); any other value sends no `df` parameter.
 * @returns {Promise<Array<{url: string, title: string, snippet: string}>>}
 * @throws Whatever fetchText throws.
 */
async function searchDDG(query, max, timeoutMs, timeRange) {
    // Only emit df= for a known key; an unknown one used to produce "df=undefined".
    const df = timeRange ? DDG_TIME[timeRange] : undefined;
    const dfParam = typeof df === "string" ? `&df=${df}` : "";
    const html = await fetchText(`https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}&kl=us-en${dfParam}`, timeoutMs);
    const linkRe = /class="result__a"[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/g;
    const snipRe = /class="result__snippet"[^>]*>([\s\S]*?)<\/(?:div|a)>/;
    // All anchors are collected up front because the next anchor's position
    // bounds the region in which the current result's snippet may appear.
    const anchors = [...html.matchAll(linkRe)];
    const results = [];
    for (let i = 0; i < anchors.length && results.length < max; i++) {
        const anchor = anchors[i];
        const href = anchor[1];
        // Accept uddg anywhere in the query string ("?", "&", or the ";" that
        // ends an HTML-escaped ampersand), not only as the first parameter.
        const uddg = href.match(/[?&;]uddg=([^&"]+)/);
        let url = href;
        if (uddg) {
            try {
                url = decodeURIComponent(uddg[1]);
            }
            catch {
                continue;
            }
        }
        if (!url.startsWith("http"))
            continue;
        const title = stripTags(anchor[2]);
        if (!title)
            continue;
        const regionStart = anchor.index + anchor[0].length;
        const regionEnd = i + 1 < anchors.length ? anchors[i + 1].index : html.length;
        const snip = html.slice(regionStart, regionEnd).match(snipRe);
        results.push({ url, title, snippet: snip ? stripTags(snip[1]) : "" });
    }
    return results;
}
// ---------------------------------------------------------------------------
// Tier 3 — Bing HTML
// ---------------------------------------------------------------------------
// Bing HTML search has no reliable URL-based freshness parameter.
// (tbs=qdr:X is Google's parameter and was silently ignored by Bing.)
// Time filtering is handled by SearXNG and DDG tiers above; Bing is the
// last-resort fallback and returns unfiltered results.
/**
 * Scrape Bing's HTML results page (last-resort tier, no time filtering).
 *
 * Every `<li class="b_algo" …>` item is inspected: the first `<h2><a href>`
 * with an absolute http(s) href supplies the URL and title, and the snippet is
 * the first `<p>` inside `b_caption`, falling back to the first `<p>` in the
 * item. Items without such a link or with an empty title are skipped.
 *
 * The href is an HTML attribute value, so it is passed through
 * decodeHtmlEntities before being returned; otherwise escaped ampersands in the
 * query string would leak into the result URL.
 *
 * @param {string} query Search query.
 * @param {number} max Maximum number of results (also sent as `count`).
 * @param {number} timeoutMs Request timeout forwarded to fetchText.
 * @param {string} [_timeRange] Accepted for signature parity; not used.
 * @returns {Promise<Array<{title: string, url: string, snippet: string}>>}
 * @throws Whatever fetchText throws.
 */
async function searchBing(query, max, timeoutMs, _timeRange) {
    const html = await fetchText(`https://www.bing.com/search?q=${encodeURIComponent(query)}&count=${max}&setlang=en&cc=us`, timeoutMs);
    const results = [];
    // [^>]* tolerates extra attributes after the class (e.g. data-* attributes).
    const liRe = /<li class="b_algo"[^>]*>([\s\S]*?)<\/li>/g;
    let m;
    while ((m = liRe.exec(html)) !== null && results.length < max) {
        const block = m[1];
        const linkM = block.match(/<h2[^>]*>\s*<a[^>]+href="(https?:\/\/[^"]+)"[^>]*>([\s\S]*?)<\/a>/);
        if (!linkM)
            continue;
        const title = stripTags(linkM[2]);
        if (!title)
            continue;
        const snipM = block.match(/<div class="b_caption"[^>]*>[\s\S]*?<p[^>]*>([\s\S]*?)<\/p>/) ??
            block.match(/<p[^>]*>([\s\S]*?)<\/p>/);
        results.push({
            title,
            url: decodeHtmlEntities(linkM[1]),
            snippet: snipM ? stripTags(snipM[1]) : "",
        });
    }
    return results;
}
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
/**
 * Search the web, walking a fallback chain until one tier yields results:
 * SearXNG (only when `searxngUrl` is given) → DuckDuckGo HTML → Bing HTML.
 *
 * A tier that throws or returns no results is skipped silently. Bing is the
 * final tier: its result is returned as-is (possibly empty), and an error
 * there produces an empty array, so this function never rejects because a
 * search tier failed.
 *
 * @param query      Search query string
 * @param max        Max results to return (default 8)
 * @param timeoutMs  Per-request timeout in ms (default 10000)
 * @param searxngUrl Optional SearXNG base URL, e.g. "http://localhost:8080"
 * @param timeRange  Optional recency filter: "day" | "week" | "month" | "year".
 *                   Passed to every tier; the Bing tier does not apply it and
 *                   returns unfiltered results.
 */
async function webSearch(query, max = 8, timeoutMs = 10_000, searxngUrl, timeRange) {
    // Tiers that are abandoned when they come back empty, in priority order.
    const preferredTiers = [];
    if (searxngUrl) {
        preferredTiers.push(() => searchSearXNG(searxngUrl, query, max, timeoutMs, timeRange));
    }
    preferredTiers.push(() => searchDDG(query, max, timeoutMs, timeRange));
    for (const runTier of preferredTiers) {
        try {
            const hits = await runTier();
            if (hits.length > 0) {
                return hits;
            }
        }
        catch {
            /* tier unavailable — fall through to the next one */
        }
    }
    // Last resort: whatever Bing returns is final, even an empty list.
    try {
        return await searchBing(query, max, timeoutMs, timeRange);
    }
    catch {
        return [];
    }
}