Project Files
src / net / search-engines.ts
/**
* @file net/search-engines.ts
* Multi-engine search — Brave Search, Google Scholar, SearXNG, Mojeek.
* Brave, Google Scholar and Mojeek are HTML-scraped; SearXNG is queried through its JSON output. No API keys are required.
* Results are combined with DDG to grow the candidate pool 2-4×.
*/
import { SearchHit } from "../types";
import { buildBrowserHeaders, sleep } from "./http";
import { DdgRateLimiter } from "./ddg";
/**
 * Query the Brave Search HTML results page and scrape the hits from it.
 *
 * Best-effort: a non-OK HTTP status, or any error raised while building the
 * request, fetching, or parsing (including an abort that lands while the
 * request is in flight) resolves to an empty list.
 *
 * @param query - Search terms; URL-encoded into the `q` parameter.
 * @param maxResults - Upper bound on hits, forwarded to the parser.
 * @param signal - Abort signal handed to `fetch`.
 * @param limiter - Optional rate limiter, awaited before anything else.
 * @returns The parsed hits, or `[]` on any failure.
 * @throws DOMException named "AbortError" when `signal` is already aborted
 *   once the limiter (if any) has been acquired.
 */
export async function searchBrave(
  query: string,
  maxResults: number,
  signal: AbortSignal,
  limiter?: DdgRateLimiter,
): Promise<ReadonlyArray<SearchHit>> {
  if (limiter) {
    await limiter.acquire();
  }
  if (signal.aborted) {
    throw new DOMException("Aborted", "AbortError");
  }

  try {
    const endpoint = `https://search.brave.com/search?q=${encodeURIComponent(query)}&source=web`;
    // Browser-like headers first, then an explicit Accept override.
    const headers = {
      ...buildBrowserHeaders(endpoint),
      Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    };
    const response = await fetch(endpoint, { signal, headers });
    if (!response.ok) {
      return [];
    }
    const page = await response.text();
    return parseBraveResults(page, maxResults);
  } catch {
    // Deliberate best-effort: one failing engine must not fail the caller.
    return [];
  }
}
/**
 * Extract search hits from a Brave Search HTML page.
 *
 * Primary strategy: split the page into result containers, i.e. `<div>`
 * elements whose class list holds the exact token `snippet`, and take the
 * first absolute link of each container as the hit. The description comes
 * from a nested `snippet-description` div; when none is present the title is
 * reused as the snippet.
 *
 * Fallback strategy (only when the primary one finds nothing): collect
 * anchors whose class contains `heading`, with an empty snippet.
 *
 * @param html - Raw HTML of the results page.
 * @param maxResults - Maximum number of hits to return.
 * @returns Hits in page order, deduplicated by URL, excluding brave.com links.
 */
function parseBraveResults(html: string, maxResults: number): SearchHit[] {
  const hits: SearchHit[] = [];
  const seen = new Set<string>();

  // The container must carry `snippet` as a whole class token. Matching any
  // class that merely contains "snippet" would also stop the block at nested
  // `snippet-description` / `snippet-content` divs, so the description would
  // never be part of the block it belongs to. `$` lets the last container
  // match even when no <footer> follows it.
  const snippetBlockRe =
    /<div[^>]+class="(?:[^"]*\s)?snippet(?:\s[^"]*)?"[^>]*>([\s\S]*?)(?=<div[^>]+class="(?:[^"]*\s)?snippet(?:\s[^"]*)?"|<footer|$)/gi;
  const linkRe = /href="(https?:\/\/[^"]+)"[^>]*>([\s\S]*?)<\/a>/i;
  const descRe =
    /<div[^>]+class="[^"]*snippet-description[^"]*"[^>]*>([\s\S]*?)<\/div>/i;

  // Attribute values are HTML-escaped; undo the ampersand escape so query
  // strings come out as real URLs.
  const decodeHref = (href: string): string =>
    href.replace(/&(?:amp|#38|#x26);/gi, "&");

  let blockMatch: RegExpExecArray | null;
  while (
    hits.length < maxResults &&
    (blockMatch = snippetBlockRe.exec(html)) !== null
  ) {
    const block = blockMatch[1];
    const lm = linkRe.exec(block);
    if (!lm) continue;

    const rawUrl = decodeHref(lm[1].trim());
    if (!rawUrl.startsWith("http") || seen.has(rawUrl)) continue;
    // Skip links that point back at Brave itself.
    if (/brave\.com|search\.brave/i.test(rawUrl)) continue;
    seen.add(rawUrl);

    const title = stripTags(lm[2]).trim();
    const descMatch = descRe.exec(block);
    const snippet = descMatch ? stripTags(descMatch[1]).trim() : title;
    hits.push({ url: rawUrl, title, snippet });
  }

  if (hits.length === 0) {
    // Fallback for pages without recognisable containers: heading anchors.
    const fallbackRe =
      /<a[^>]+href="(https?:\/\/[^"]+)"[^>]*class="[^"]*heading[^"]*"[^>]*>([\s\S]*?)<\/a>/gi;
    let m: RegExpExecArray | null;
    while (hits.length < maxResults && (m = fallbackRe.exec(html)) !== null) {
      const rawUrl = decodeHref(m[1].trim());
      if (seen.has(rawUrl) || /brave\.com/i.test(rawUrl)) continue;
      seen.add(rawUrl);
      hits.push({ url: rawUrl, title: stripTags(m[2]).trim(), snippet: "" });
    }
  }

  return hits;
}
/**
 * Query the Google Scholar HTML results page and scrape the hits from it.
 *
 * The `num` URL parameter is capped at 10 regardless of `maxResults`.
 * Best-effort: a non-OK HTTP status, or any error raised while building the
 * request, fetching, or parsing (including an abort that lands while the
 * request is in flight) resolves to an empty list.
 *
 * @param query - Search terms; URL-encoded into the `q` parameter.
 * @param maxResults - Upper bound on hits, forwarded to the parser.
 * @param signal - Abort signal handed to `fetch`.
 * @param limiter - Optional rate limiter, awaited before anything else.
 * @returns The parsed hits, or `[]` on any failure.
 * @throws DOMException named "AbortError" when `signal` is already aborted
 *   once the limiter (if any) has been acquired.
 */
export async function searchGoogleScholar(
  query: string,
  maxResults: number,
  signal: AbortSignal,
  limiter?: DdgRateLimiter,
): Promise<ReadonlyArray<SearchHit>> {
  if (limiter) {
    await limiter.acquire();
  }
  if (signal.aborted) {
    throw new DOMException("Aborted", "AbortError");
  }

  try {
    const pageSize = Math.min(maxResults, 10);
    const endpoint = `https://scholar.google.com/scholar?q=${encodeURIComponent(query)}&hl=en&num=${pageSize}`;
    const response = await fetch(endpoint, {
      signal,
      headers: buildBrowserHeaders(endpoint),
    });
    if (!response.ok) {
      return [];
    }
    const page = await response.text();
    return parseScholarResults(page, maxResults);
  } catch {
    // Deliberate best-effort: one failing engine must not fail the caller.
    return [];
  }
}
/**
 * Extract search hits from a Google Scholar HTML page.
 *
 * The page is split into result containers, i.e. `<div>` elements whose class
 * list holds the exact token `gs_r`. Within each container the first link
 * inside an `<h3>` is the hit; a `url=` query parameter in that link (redirect
 * style) is unwrapped to the target URL. The snippet comes from the nested
 * `gs_rs` div, falling back to the title.
 *
 * @param html - Raw HTML of the results page.
 * @param maxResults - Maximum number of hits to return.
 * @returns Hits in page order, deduplicated by URL, excluding links that point
 *   back at Google Scholar or are not absolute http(s) URLs.
 */
function parseScholarResults(html: string, maxResults: number): SearchHit[] {
  const hits: SearchHit[] = [];
  const seen = new Set<string>();

  // `gs_r` must be a whole class token. A plain substring match also hits
  // the nested `gs_ri` / `gs_rs` divs, which would cut every block short of
  // its own snippet.
  const blockRe =
    /<div[^>]+class="(?:[^"]*\s)?gs_r(?:\s[^"]*)?"[^>]*>([\s\S]*?)(?=<div[^>]+class="(?:[^"]*\s)?gs_r(?:\s[^"]*)?"|$)/gi;
  const titleLinkRe =
    /<h3[^>]*>[\s\S]*?<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i;
  const snippetRe =
    /<div[^>]+class="(?:[^"]*\s)?gs_rs(?:\s[^"]*)?"[^>]*>([\s\S]*?)<\/div>/i;

  let blockMatch: RegExpExecArray | null;
  while (
    hits.length < maxResults &&
    (blockMatch = blockRe.exec(html)) !== null
  ) {
    const block = blockMatch[1];
    const tm = titleLinkRe.exec(block);
    if (!tm) continue;

    // Attribute values are HTML-escaped; undo the ampersand escape first so
    // the query string can be inspected.
    let rawUrl = tm[1].trim().replace(/&(?:amp|#38|#x26);/gi, "&");

    const urlParam = /[?&]url=(https?[^&]+)/.exec(rawUrl);
    if (urlParam) {
      try {
        rawUrl = decodeURIComponent(urlParam[1]);
      } catch {
        // Malformed percent-encoding: drop this one result instead of
        // letting the URIError discard every hit parsed so far.
        continue;
      }
    }

    if (!rawUrl.startsWith("http") || seen.has(rawUrl)) continue;
    if (/scholar\.google|google\.com\/scholar/i.test(rawUrl)) continue;
    seen.add(rawUrl);

    const title = stripTags(tm[2]).trim();
    const sm = snippetRe.exec(block);
    const snippet = sm ? stripTags(sm[1]).trim() : title;
    hits.push({ url: rawUrl, title, snippet });
  }

  return hits;
}
/**
 * Base URLs of the SearXNG instances queried by `searchSearXNG`.
 * They are tried in the listed order.
 */
const SEARXNG_INSTANCES: readonly string[] = [
  "https://search.sapti.me",
  "https://searx.tiekoetter.com",
  "https://search.bus-hit.me",
  "https://priv.au",
];
/**
 * Query SearXNG instances one after another (JSON output) and return the
 * hits of the first instance that yields at least one usable result.
 *
 * An instance is skipped when it answers with a non-OK status, when its
 * payload has no `results` array, when it produces no usable hit, or when
 * anything throws while fetching/decoding it. The loop also stops as soon as
 * `signal` is observed aborted between instances.
 *
 * @param query - Search terms; URL-encoded into the `q` parameter.
 * @param maxResults - Maximum number of hits to return.
 * @param signal - Abort signal handed to every `fetch`.
 * @param limiter - Optional rate limiter, awaited once before the first request.
 * @returns Hits deduplicated by URL, or `[]` when no instance delivered any.
 * @throws DOMException named "AbortError" when `signal` is already aborted
 *   once the limiter (if any) has been acquired.
 */
export async function searchSearXNG(
  query: string,
  maxResults: number,
  signal: AbortSignal,
  limiter?: DdgRateLimiter,
): Promise<ReadonlyArray<SearchHit>> {
  if (limiter) {
    await limiter.acquire();
  }
  if (signal.aborted) {
    throw new DOMException("Aborted", "AbortError");
  }

  for (const baseUrl of SEARXNG_INSTANCES) {
    if (signal.aborted) break;

    try {
      const endpoint = `${baseUrl}/search?q=${encodeURIComponent(query)}&format=json&categories=general&language=en`;
      const response = await fetch(endpoint, {
        signal,
        headers: { "User-Agent": "Mozilla/5.0", Accept: "application/json" },
      });
      if (!response.ok) continue;

      const payload = (await response.json()) as {
        results?: Array<{ url?: string; title?: string; content?: string }>;
      };
      const rows = payload.results;
      if (!rows || !Array.isArray(rows)) continue;

      const collected: SearchHit[] = [];
      const knownUrls = new Set<string>();
      for (const row of rows) {
        if (collected.length >= maxResults) break;

        const link = row.url;
        if (!link || !link.startsWith("http") || knownUrls.has(link)) continue;

        knownUrls.add(link);
        collected.push({
          url: link,
          title: row.title ?? "",
          // Prefer the content excerpt; fall back to the title, then to "".
          snippet: row.content ?? row.title ?? "",
        });
      }

      if (collected.length > 0) return collected;
    } catch {
      // This instance failed; move on to the next one.
      continue;
    }
  }

  return [];
}
/**
 * Query the Mojeek HTML results page and scrape the hits from it.
 *
 * Best-effort: a non-OK HTTP status, or any error raised while building the
 * request, fetching, or parsing (including an abort that lands while the
 * request is in flight) resolves to an empty list.
 *
 * @param query - Search terms; URL-encoded into the `q` parameter.
 * @param maxResults - Upper bound on hits, forwarded to the parser.
 * @param signal - Abort signal handed to `fetch`.
 * @param limiter - Optional rate limiter, awaited before anything else.
 * @returns The parsed hits, or `[]` on any failure.
 * @throws DOMException named "AbortError" when `signal` is already aborted
 *   once the limiter (if any) has been acquired.
 */
export async function searchMojeek(
  query: string,
  maxResults: number,
  signal: AbortSignal,
  limiter?: DdgRateLimiter,
): Promise<ReadonlyArray<SearchHit>> {
  if (limiter) {
    await limiter.acquire();
  }
  if (signal.aborted) {
    throw new DOMException("Aborted", "AbortError");
  }

  try {
    const endpoint = `https://www.mojeek.com/search?q=${encodeURIComponent(query)}&fmt=html`;
    const response = await fetch(endpoint, {
      signal,
      headers: buildBrowserHeaders(endpoint),
    });
    if (!response.ok) {
      return [];
    }
    const page = await response.text();
    return parseMojeekResults(page, maxResults);
  } catch {
    // Deliberate best-effort: one failing engine must not fail the caller.
    return [];
  }
}
/**
 * Extract search hits from a Mojeek HTML page.
 *
 * Result links are the anchors with `class="ob"`; snippets are the
 * `<p class="s">` paragraphs. The n-th anchor on the page is paired with the
 * n-th snippet paragraph. The pairing uses the anchor's position among ALL
 * `ob` anchors, so a skipped anchor (non-http or pointing at mojeek.com) still
 * consumes its own snippet instead of shifting it onto the next result.
 *
 * NOTE(review): positional pairing presumes one snippet paragraph per `ob`
 * anchor; confirm against current Mojeek markup.
 *
 * @param html - Raw HTML of the results page.
 * @param maxResults - Maximum number of hits to return.
 * @returns Hits in page order, deduplicated by URL. The title is reused as
 *   the snippet when no paragraph exists at the anchor's position.
 */
function parseMojeekResults(html: string, maxResults: number): SearchHit[] {
  const hits: SearchHit[] = [];
  const seen = new Set<string>();
  const linkRe = /<a[^>]+class="ob"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/gi;
  const snippetRe = /<p[^>]+class="s"[^>]*>([\s\S]*?)<\/p>/gi;

  // Every `ob` anchor is recorded, even those rejected later, so that the
  // array index stays aligned with the snippet list.
  const anchors: Array<{ href: string; inner: string }> = [];
  const snippets: string[] = [];

  let m: RegExpExecArray | null;
  while ((m = linkRe.exec(html)) !== null) {
    anchors.push({ href: m[1], inner: m[2] });
  }
  while ((m = snippetRe.exec(html)) !== null) {
    snippets.push(stripTags(m[1]).trim());
  }

  for (let i = 0; i < anchors.length && hits.length < maxResults; i++) {
    const { href, inner } = anchors[i];
    // Attribute values are HTML-escaped; undo the ampersand escape.
    const url = href.trim().replace(/&(?:amp|#38|#x26);/gi, "&");
    if (!url.startsWith("http") || /mojeek\.com/i.test(url)) continue;
    if (seen.has(url)) continue;
    seen.add(url);

    const title = stripTags(inner).trim();
    hits.push({ url, title, snippet: snippets[i] ?? title });
  }

  return hits;
}
/** Identifiers of the search engines a query can be routed to. */
export type SearchEngine =
  | "ddg"
  | "brave"
  | "scholar"
  | "searxng"
  | "mojeek";
/**
 * Fan a query out to several engines concurrently and merge what comes back.
 *
 * `"ddg"` entries in `engines` are skipped here (no request, no limiter).
 * Every other engine receives a limiter obtained from its own `getLimiter()`
 * call. An engine whose promise rejects contributes no hits. The merged list
 * keeps engine order, then per-engine order, and retains only the first hit
 * seen for each URL.
 *
 * @param query - Search terms passed unchanged to every engine.
 * @param maxResultsPerEngine - Per-engine cap on hits.
 * @param engines - Engines to run, in merge-priority order.
 * @param signal - Abort signal passed to every engine.
 * @param getLimiter - Called once per non-DDG engine to obtain its limiter.
 * @returns The URL-deduplicated union of all engine results.
 */
export async function multiEngineSearch(
  query: string,
  maxResultsPerEngine: number,
  engines: ReadonlyArray<SearchEngine>,
  signal: AbortSignal,
  getLimiter: () => DdgRateLimiter,
): Promise<ReadonlyArray<SearchHit>> {
  type EngineRunner = (
    q: string,
    max: number,
    s: AbortSignal,
    l: DdgRateLimiter,
  ) => Promise<ReadonlyArray<SearchHit>>;

  const runners: Record<SearchEngine, EngineRunner> = {
    ddg: async () => [],
    brave: searchBrave,
    scholar: searchGoogleScholar,
    searxng: searchSearXNG,
    mojeek: searchMojeek,
  };

  // Start every requested engine before awaiting any of them.
  const pending: Array<Promise<ReadonlyArray<SearchHit>>> = [];
  for (const engine of engines) {
    if (engine === "ddg") continue;
    const run = runners[engine];
    const limiter = getLimiter();
    pending.push(
      run(query, maxResultsPerEngine, signal, limiter).catch(
        () => [] as SearchHit[],
      ),
    );
  }

  const perEngine = await Promise.all(pending);

  // A Map keeps insertion order, so the first hit per URL wins and the
  // engine-then-rank ordering is preserved.
  const firstByUrl = new Map<string, SearchHit>();
  for (const engineHits of perEngine) {
    for (const hit of engineHits) {
      if (firstByUrl.has(hit.url)) continue;
      firstByUrl.set(hit.url, hit);
    }
  }
  return Array.from(firstByUrl.values());
}
/** Named character references that `stripTags` decodes (case-sensitive). */
const HTML_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", "\u00a0"],
]);

/**
 * Reduce an HTML fragment to plain text.
 *
 * Steps, in order: remove tags, decode character references, collapse every
 * whitespace run (including no-break spaces) to one space, trim.
 *
 * Decoding happens in a single pass, so an escaped ampersand followed by
 * `lt;` yields the literal text of that reference rather than `<`.
 * References that are unknown or denote an invalid code point are left as-is.
 *
 * @param html - HTML fragment, e.g. the inner HTML of a link or snippet.
 * @returns The text content on a single line.
 */
function stripTags(html: string): string {
  return html
    .replace(/<[^>]+>/g, "")
    .replace(
      /&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi,
      (whole: string, ref: string): string => {
        if (ref.charAt(0) !== "#") {
          return HTML_NAMED_ENTITIES.get(ref) ?? whole;
        }
        const isHex = ref.charAt(1).toLowerCase() === "x";
        const code = parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        // Reject NUL, surrogate halves and values beyond the Unicode range.
        const valid =
          code > 0 && code <= 0x10ffff && (code < 0xd800 || code > 0xdfff);
        return valid ? String.fromCodePoint(code) : whole;
      },
    )
    .replace(/\s+/g, " ")
    .trim();
}