Project Files
src / search / providers / SearXNGProvider.ts
import * as cheerio from "cheerio";
import { normalizeUrl } from "../../crawler/normalizeUrl";
import { SEARCH_RESULT_LIMIT, SEARXNG_BASE_URL, SEARXNG_FALLBACK_URLS, REQUEST_TIMEOUT_MS } from "../../utils/constants";
import type { SearchProvider } from "./SearchProvider";
/**
 * One entry of the `results` array in a SearXNG JSON search response,
 * reduced to the fields this module declares.
 */
interface SearXNGResult {
  /** Title of the hit (declared for completeness; not read by the visible code). */
  title: string;
  /** Target URL of the hit; this is the value the JSON extractor collects. */
  url: string;
}
// Header set imitating a desktop Chrome page navigation; spread into every
// request this module sends. Per the original author's note, public SearXNG
// instances expect the complete set and tend to answer 403 when only part of
// it is sent, so keep these entries together.
const BROWSER_HEADERS = {
  // Client identity
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
    "AppleWebKit/537.36 (KHTML, like Gecko) " +
    "Chrome/124.0.0.0 Safari/537.36",

  // Content negotiation
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
  "Accept-Encoding": "gzip, deflate, br",

  // Privacy / connection hints
  "DNT": "1",
  "Connection": "keep-alive",
  "Upgrade-Insecure-Requests": "1",

  // Fetch metadata matching a user-initiated top-level navigation
  "Sec-Fetch-Dest": "document",
  "Sec-Fetch-Mode": "navigate",
  "Sec-Fetch-Site": "none",
  "Sec-Fetch-User": "?1",

  // Caching
  "Cache-Control": "max-age=0",
};
/**
 * Fetch `url` with the given request headers, following redirects, and give
 * up after `REQUEST_TIMEOUT_MS` milliseconds.
 *
 * The deadline is enforced with `AbortSignal.timeout`, whose signal stays
 * attached to the returned response. It therefore bounds the whole exchange,
 * including the caller's later `res.text()` body read, not just the wait for
 * the response headers. (A manually cleared timer stops protecting the
 * request as soon as headers arrive, so a stalled body could hang forever.)
 *
 * @param url     Absolute URL to request.
 * @param headers Request headers to send as-is.
 * @returns The `Response` once headers have been received.
 * @throws Whatever `fetch` rejects with: network failures, or a `DOMException`
 *         named `TimeoutError` when the deadline passes.
 */
async function fetchWithTimeout(url: string, headers: Record<string, string>): Promise<Response> {
  return fetch(url, {
    signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
    redirect: "follow",
    headers,
  });
}
/**
 * Query an instance's `/search` endpoint with `format=json`.
 *
 * @param instance   Base URL of the instance (no trailing slash expected).
 * @param query      Search terms, sent as the `q` parameter.
 * @param maxResults Upper bound handed to the JSON extractor.
 * @returns The extracted URLs, or `null` when the response status is not OK
 *          (e.g. 403/429) or the body cannot be parsed/extracted as JSON.
 *          `null` tells the caller to try the HTML page instead.
 * @throws Errors from the request itself (network failure, timeout) propagate.
 */
async function tryJsonSearch(instance: string, query: string, maxResults: number = SEARCH_RESULT_LIMIT): Promise<string[] | null> {
  const endpoint = new URL(`${instance}/search`);
  endpoint.searchParams.set("q", query);
  endpoint.searchParams.set("format", "json");

  // Same browser-like header set, but ask for JSON instead of HTML.
  const requestHeaders = {
    ...BROWSER_HEADERS,
    "Accept": "application/json,text/plain,*/*",
  };

  const response = await fetchWithTimeout(endpoint.toString(), requestHeaders);
  if (!response.ok) {
    // Blocked or rate-limited: let the caller fall back to HTML.
    return null;
  }

  const body = await response.text();
  let urls: string[] | null;
  try {
    const parsed = JSON.parse(body) as { results?: SearXNGResult[] };
    urls = extractUrlsFromJson(parsed, instance, maxResults);
  } catch {
    // Body was not JSON (or extraction failed): treat as "no JSON support".
    urls = null;
  }
  return urls;
}
/**
 * Request an instance's regular HTML results page and scrape result links
 * from it.
 *
 * @param instance   Base URL of the instance; also sent (with a trailing
 *                   slash) as the `Referer` header.
 * @param query      Search terms, sent as the `q` parameter.
 * @param maxResults Upper bound handed to the HTML extractor.
 * @returns The URLs extracted from the page (possibly empty).
 * @throws Error `HTTP <status> <statusText>` when the response is not OK;
 *         request errors (network failure, timeout) propagate as well.
 */
async function tryHtmlSearch(instance: string, query: string, maxResults: number = SEARCH_RESULT_LIMIT): Promise<string[]> {
  const endpoint = new URL(`${instance}/search`);
  endpoint.searchParams.set("q", query);

  const requestHeaders = {
    ...BROWSER_HEADERS,
    "Referer": `${instance}/`,
  };

  const response = await fetchWithTimeout(endpoint.toString(), requestHeaders);
  if (response.ok) {
    const page = await response.text();
    return extractUrlsFromHtml(page, instance, maxResults);
  }

  // statusText may be empty, hence the trim.
  throw new Error(`HTTP ${response.status} ${response.statusText}`.trim());
}
/**
 * Collect unique, normalized result URLs from a parsed SearXNG JSON response.
 *
 * The payload comes straight from `JSON.parse`, so it is validated at runtime
 * rather than trusted: a missing or non-array `results`, null entries, and
 * entries whose `url` is not a non-empty string are all skipped.
 *
 * @param data       Parsed response body; only `results[].url` is read.
 * @param instance   Instance base URL, passed to `normalizeUrl` as its first
 *                   argument.
 * @param maxResults Maximum number of URLs to return. Values <= 0 yield an
 *                   empty array, matching the HTML extractor's final slice.
 * @returns Up to `maxResults` distinct URLs in response order.
 */
function extractUrlsFromJson(data: { results?: SearXNGResult[] }, instance: string, maxResults: number = SEARCH_RESULT_LIMIT): string[] {
  if (maxResults <= 0) return [];

  const results = data?.results;
  if (!Array.isArray(results)) return [];

  // A Set keeps first-seen order and removes duplicates without rescanning.
  const unique = new Set<string>();
  for (const result of results) {
    // Check the limit before adding so the cap can never be overshot.
    if (unique.size >= maxResults) break;

    const rawUrl: unknown = result?.url;
    if (typeof rawUrl !== "string" || rawUrl === "") continue;

    try {
      const normalized = normalizeUrl(instance, rawUrl);
      if (normalized) unique.add(normalized);
    } catch {
      // Best effort: a URL that normalizeUrl rejects is simply skipped.
    }
  }
  return Array.from(unique);
}
/**
 * Scrape result links out of a SearXNG HTML results page.
 *
 * Selectors are tried in order. Every match of a selector is collected, and
 * once a selector leaves at least `maxResults` URLs collected the remaining
 * selectors are skipped. The list is finally cut to `maxResults`.
 *
 * @param html       Raw HTML of the results page.
 * @param instance   Instance base URL, passed to `normalizeUrl` as its first
 *                   argument.
 * @param maxResults Length the returned list is sliced to.
 * @returns Distinct normalized URLs in first-seen order.
 */
function extractUrlsFromHtml(html: string, instance: string, maxResults: number = SEARCH_RESULT_LIMIT): string[] {
  // Result-link selectors; per the original author's note these cover all
  // known SearXNG themes.
  const resultLinkSelectors = [
    "article.result h3 a",
    ".result-title a",
    "h3.result_header a",
    ".result_header a",
    "h3 a[href]",
  ];

  const $ = cheerio.load(html);
  const seen = new Set<string>();

  // Normalize one href and remember it; non-"http" links are ignored.
  const collect = (href: string | undefined): void => {
    if (href === undefined || !href.startsWith("http")) return;
    try {
      const normalized = normalizeUrl(instance, href);
      if (normalized) seen.add(normalized);
    } catch {
      // Best effort: a link that normalizeUrl rejects is skipped.
    }
  };

  for (const selector of resultLinkSelectors) {
    $(selector).each((_, element) => {
      collect($(element).attr("href"));
    });
    // Checked only after a selector has been fully processed.
    if (seen.size >= maxResults) break;
  }

  return Array.from(seen).slice(0, maxResults);
}
/**
 * `SearchProvider` that queries SearXNG instances.
 *
 * The configured base URL is tried first, then each entry of
 * `SEARXNG_FALLBACK_URLS`. For every instance the JSON API is attempted
 * before falling back to scraping the HTML results page. The first instance
 * that yields at least one URL wins.
 */
export class SearXNGProvider implements SearchProvider {
  private baseUrl: string;

  /**
   * @param baseUrl Preferred instance; surrounding whitespace and trailing
   *                slashes are removed so `${baseUrl}/search` is well-formed.
   */
  constructor(baseUrl: string = SEARXNG_BASE_URL) {
    this.baseUrl = SearXNGProvider.normalizeInstanceUrl(baseUrl);
  }

  /** Trim whitespace and strip every trailing slash from an instance URL. */
  private static normalizeInstanceUrl(url: string): string {
    return url.trim().replace(/\/+$/, "");
  }

  /**
   * Run `query` against the configured instances until one returns results.
   *
   * @param query      Search terms.
   * @param maxResults Maximum number of URLs wanted; a value <= 0 returns an
   *                   empty list without contacting any instance.
   * @returns The URLs from the first instance that produced any.
   * @throws Error when no instance produced results. The message carries the
   *         most recent failure: a request error, or a note that the
   *         instance returned 0 results.
   */
  async search(query: string, maxResults: number = SEARCH_RESULT_LIMIT): Promise<string[]> {
    if (maxResults <= 0) return [];

    // Normalize fallbacks the same way as the base URL before de-duplicating,
    // so "https://x/" and "https://x" are not queried twice and no request is
    // built as "https://x//search". Empty entries are dropped.
    const allInstances = Array.from(
      new Set(
        [this.baseUrl, ...SEARXNG_FALLBACK_URLS]
          .map((u) => SearXNGProvider.normalizeInstanceUrl(u))
          .filter((u) => u !== ""),
      ),
    );

    let lastError: Error = new Error("No instances configured");

    for (const instance of allInstances) {
      try {
        // 1. Try JSON API first (faster, structured)
        const jsonUrls = await tryJsonSearch(instance, query, maxResults);
        if (jsonUrls && jsonUrls.length > 0) {
          console.log(`[SearXNG] JSON hit on ${instance} — ${jsonUrls.length} results`);
          return jsonUrls;
        }

        // 2. Fall back to HTML scraping
        const htmlUrls = await tryHtmlSearch(instance, query, maxResults);
        if (htmlUrls.length > 0) {
          console.log(`[SearXNG] HTML hit on ${instance} — ${htmlUrls.length} results`);
          return htmlUrls;
        }

        // Record the empty outcome so the final error does not claim that
        // no instances were configured.
        lastError = new Error(`${instance} returned 0 results`);
        console.warn(`[SearXNG] ${instance} returned 0 results — trying next`);
      } catch (err) {
        lastError = err instanceof Error ? err : new Error(String(err));
        console.warn(`[SearXNG] ${instance} failed: ${lastError.message} — trying next`);
      }
    }

    throw new Error(
      `[SearXNG] All instances failed for "${query}". ` +
      `Last error: ${lastError.message}. ` +
      `You can set SEARXNG_URL to a working instance from https://searx.space`
    );
  }
}