// Project Files
// src / net / ddg.ts
/**
* @file net/ddg.ts
* DuckDuckGo search scraper with:
* - Adaptive throttle: shared error counter across all lanes — when DDG
* starts failing, ALL lanes back off exponentially and add jitter
* - Global cooldown: after N consecutive failures, pause everything
* - Per-lane rate limiting with staggered offsets
* - Pagination support
* - Query length enforcement (DDG chokes on 100+ char queries)
*/
import { DDG_RATE_LIMIT_MS } from "../constants";
import { SearchHit } from "../types";
import { buildDDGHeaders, sleep } from "./http";
/** Max query length sent to DDG — longer queries get trimmed. */
const MAX_QUERY_LENGTH = 80;

/**
 * Shorten a query to at most MAX_QUERY_LENGTH characters, preferring to end
 * on a complete word.
 *
 * @param query Raw search query.
 * @returns The query unchanged when it already fits; otherwise a prefix of it.
 */
function trimQuery(query: string): string {
  if (query.length <= MAX_QUERY_LENGTH) return query;
  const cut = query.slice(0, MAX_QUERY_LENGTH);
  // If the first dropped character is a space, the cut already sits on a
  // word boundary — keep the final word instead of discarding it.
  if (query.charAt(MAX_QUERY_LENGTH) === " ") return cut.trimEnd();
  const lastSpace = cut.lastIndexOf(" ");
  // Only back up to the previous space when that leaves more than 20
  // characters; otherwise accept the hard cut.
  return lastSpace > 20 ? cut.slice(0, lastSpace) : cut;
}
/**
 * Failure tracker that turns recent DDG errors into extra waiting time.
 *
 * The error counter goes up by one per reported failure and down by one
 * (never below zero) per reported success. From the fifth error onward a
 * cooldown deadline is recorded as well.
 */
class AdaptiveThrottle {
  private consecutiveErrors = 0;
  private cooldownUntil = 0;

  /** Record a successful DDG response: lowers the error counter by one. */
  reportSuccess(): void {
    if (this.consecutiveErrors > 0) {
      this.consecutiveErrors -= 1;
    }
  }

  /**
   * Record a failed DDG request.
   *
   * @returns Suggested extra delay in ms: exponential (1s, 2s, 4s, 8s) for
   * the first four errors, then a cooldown of 3s per error capped at 30s.
   */
  reportError(): number {
    this.consecutiveErrors += 1;
    const errors = this.consecutiveErrors;
    if (errors < 5) {
      return Math.min(15_000, 1000 * 2 ** (errors - 1));
    }
    const cooldownMs = Math.min(30_000, errors * 3_000);
    this.cooldownUntil = Date.now() + cooldownMs;
    return cooldownMs;
  }

  /**
   * Extra delay callers should add right now.
   *
   * @returns Remaining cooldown if one is active; otherwise 800ms per error
   * (capped at 8s) once there are at least two errors; otherwise 0.
   */
  currentPenalty(): number {
    const remaining = this.cooldownUntil - Date.now();
    if (remaining > 0) return remaining;
    return this.consecutiveErrors >= 2
      ? Math.min(8_000, this.consecutiveErrors * 800)
      : 0;
  }

  /** Current value of the error counter. */
  get errorCount(): number {
    return this.consecutiveErrors;
  }
}

/** Module-wide throttle instance consulted by every limiter in this file. */
const globalThrottle = new AdaptiveThrottle();
/**
 * FIFO rate limiter for one lane.
 *
 * Callers await `acquire()`; queued callers are released one at a time,
 * each at least (base delay + this lane's jitter + current global penalty)
 * ms after the previous release.
 */
export class DdgRateLimiter {
  private readonly baseDelayMs: number;
  private readonly jitterMs: number;
  private lastRequestAt = 0;
  private processing = false;
  private queue: Array<() => void> = [];

  /**
   * @param baseDelayMs Minimum gap between releases, before jitter/penalty.
   */
  constructor(baseDelayMs: number = DDG_RATE_LIMIT_MS) {
    this.baseDelayMs = baseDelayMs;
    // Jitter is drawn once per limiter (0–599 ms) and reused for every wait.
    this.jitterMs = Math.floor(Math.random() * 600);
  }

  /** Join the queue; the returned promise resolves when it is this caller's turn. */
  acquire(): Promise<void> {
    return new Promise<void>((release) => {
      this.queue.push(release);
      if (this.processing) return;
      this.drain();
    });
  }

  /** Release queued callers in order, spacing them by the effective delay. */
  private async drain(): Promise<void> {
    this.processing = true;
    while (this.queue.length !== 0) {
      // The penalty is re-read on every iteration so throttle changes apply
      // to callers that are already queued.
      const minGap =
        this.baseDelayMs + this.jitterMs + globalThrottle.currentPenalty();
      const elapsed = Date.now() - this.lastRequestAt;
      if (minGap > elapsed) await sleep(minGap - elapsed);
      this.lastRequestAt = Date.now();
      this.queue.shift()?.();
    }
    this.processing = false;
  }
}

/** Limiter used when a caller does not supply its own (single lane). */
export const sharedDdgLimiter = new DdgRateLimiter();
/**
 * Round-robin pool of independent rate limiters ("lanes").
 *
 * Lane `i` is created with a base delay of `msPerLane` plus 20% of
 * `msPerLane` per lane index, so lanes do not all tick at the same cadence.
 * Each lane draws its own jitter and consults the global adaptive throttle.
 */
export class DdgLimiterPool {
  private readonly limiters: DdgRateLimiter[];
  private nextIdx = 0;

  /**
   * @param laneCount Number of lanes to create. Values that would produce
   * no lane at all (zero, negative, NaN) yield a single lane instead, so
   * `next()` always has a limiter to return.
   * @param msPerLane Base delay in ms for the first lane.
   */
  constructor(laneCount: number, msPerLane: number) {
    this.limiters = [];
    for (let i = 0; i < laneCount; i++) {
      const stagger = Math.floor(msPerLane * 0.2 * i);
      this.limiters.push(new DdgRateLimiter(msPerLane + stagger));
    }
    // An empty pool would make next() return undefined and crash callers
    // on `.acquire()`; fall back to one lane.
    if (this.limiters.length === 0) {
      this.limiters.push(new DdgRateLimiter(msPerLane));
    }
  }

  /** Return the next lane in round-robin order. */
  next(): DdgRateLimiter {
    const limiter = this.limiters[this.nextIdx];
    // Wrap the cursor instead of letting it grow without bound.
    this.nextIdx = (this.nextIdx + 1) % this.limiters.length;
    return limiter;
  }

  /** Number of lanes actually in the pool. */
  get laneCount(): number {
    return this.limiters.length;
  }
}
/**
 * Clear the global throttle's error counter and cooldown deadline.
 * Intended to be called when a new research session starts.
 */
export function resetThrottle(): void {
  // The fields are private on the class, hence the untyped handle.
  const state = globalThrottle as any;
  state.cooldownUntil = 0;
  state.consecutiveErrors = 0;
}
/**
 * Run one DDG search, trying the lite endpoint first and, for the first
 * page only, falling back to the HTML endpoint.
 *
 * A non-empty result reports success to the global throttle. If no endpoint
 * yields hits, the failure is reported to the throttle, the call sleeps for
 * the suggested penalty (at most 5s) and resolves to an empty array.
 *
 * @param query Search text; trimmed via `trimQuery` before sending.
 * @param maxResults Maximum hits to return; also the page size used to
 * compute the result offset for `page`.
 * @param safeSearch Safe-search level forwarded to the endpoints.
 * @param signal Abort signal for the whole operation.
 * @param limiter Rate limiter to wait on before sending anything.
 * @param page 1-based page number.
 * @returns The hits found, or an empty array when both endpoints came up empty
 * or failed.
 * @throws DOMException with name "AbortError" when `signal` is aborted.
 */
export async function searchDDG(
  query: string,
  maxResults: number,
  safeSearch: "strict" | "moderate" | "off",
  signal: AbortSignal,
  limiter: DdgRateLimiter = sharedDdgLimiter,
  page: number = 1,
): Promise<ReadonlyArray<SearchHit>> {
  // A cancelled request is not a DDG failure: surface it as AbortError
  // rather than letting it fall through to the shared error counter.
  const throwIfAborted = (): void => {
    if (signal.aborted) throw new DOMException("Aborted", "AbortError");
  };

  const trimmed = trimQuery(query);
  await limiter.acquire();
  throwIfAborted();

  const offset = (page - 1) * maxResults;
  try {
    const hits = await tryLiteEndpoint(
      trimmed,
      maxResults,
      safeSearch,
      signal,
      offset,
    );
    if (hits.length > 0) {
      globalThrottle.reportSuccess();
      return hits;
    }
  } catch {
    // Best effort: a lite-endpoint failure falls through to the fallback.
  }
  throwIfAborted();

  // The HTML endpoint is called without an offset, so it is only a valid
  // fallback for the first page.
  if (page <= 1) {
    try {
      const hits = await tryHtmlEndpoint(
        trimmed,
        maxResults,
        safeSearch,
        signal,
      );
      if (hits.length > 0) {
        globalThrottle.reportSuccess();
        return hits;
      }
    } catch {
      // Best effort: handled as a failed attempt below.
    }
    throwIfAborted();
  }

  const penalty = globalThrottle.reportError();
  if (penalty > 0) {
    await sleep(Math.min(penalty, 5_000));
  }
  return [];
}
/**
 * Collect results for one query across up to `pages` consecutive pages.
 *
 * Hits are de-duplicated by URL across pages. Paging stops early when the
 * signal is aborted, when the global throttle has 3+ errors (pages after
 * the first only), or when a page returns fewer than half of
 * `maxResultsPerPage` hits.
 *
 * @returns All unique hits gathered, in the order they were received.
 */
export async function searchDDGPaginated(
  query: string,
  maxResultsPerPage: number,
  pages: number,
  safeSearch: "strict" | "moderate" | "off",
  signal: AbortSignal,
  limiter: DdgRateLimiter = sharedDdgLimiter,
): Promise<ReadonlyArray<SearchHit>> {
  const collected: SearchHit[] = [];
  const seenUrls = new Set<string>();
  const halfPage = maxResultsPerPage * 0.5;

  let p = 1;
  while (p <= pages && !signal.aborted) {
    // Don't keep paging while DDG is already under pressure.
    if (p > 1 && globalThrottle.errorCount >= 3) break;

    const pageHits = await searchDDG(
      query,
      maxResultsPerPage,
      safeSearch,
      signal,
      limiter,
      p,
    );
    for (const hit of pageHits) {
      if (seenUrls.has(hit.url)) continue;
      seenUrls.add(hit.url);
      collected.push(hit);
    }

    // A sparse page is treated as the end of the result set.
    if (pageHits.length < halfPage) break;
    p += 1;
  }
  return collected;
}
/**
 * POST the query to DDG's lite endpoint and parse the returned HTML.
 *
 * The query is sent both as a URL parameter and in the form body; when
 * `offset` is positive the body also carries `s` (offset) and `dc`
 * (offset divided by `maxResults`, floored, plus one).
 *
 * NOTE(review): safe-search is sent as URL parameter `p` ("-1" for strict,
 * "1" for off) — confirm these values against DDG's parameter reference.
 *
 * @throws Error when the response status is not OK.
 */
async function tryLiteEndpoint(
  query: string,
  maxResults: number,
  safeSearch: "strict" | "moderate" | "off",
  signal: AbortSignal,
  offset: number = 0,
): Promise<ReadonlyArray<SearchHit>> {
  const endpoint = new URL("https://lite.duckduckgo.com/lite/");
  endpoint.searchParams.set("q", query);
  switch (safeSearch) {
    case "strict":
      endpoint.searchParams.set("p", "-1");
      break;
    case "off":
      endpoint.searchParams.set("p", "1");
      break;
    default:
      break;
  }

  const formFields = [`q=${encodeURIComponent(query)}`];
  if (offset > 0) {
    formFields.push(
      `s=${offset}`,
      `dc=${Math.floor(offset / maxResults) + 1}`,
    );
  }

  const response = await fetch(endpoint.toString(), {
    method: "POST",
    signal,
    headers: {
      ...buildDDGHeaders(),
      "Content-Type": "application/x-www-form-urlencoded",
    },
    body: formFields.join("&"),
  });
  if (!response.ok) throw new Error(`DDG lite HTTP ${response.status}`);
  return parseLiteResults(await response.text(), maxResults);
}
/**
 * GET the query from DDG's HTML endpoint and parse the returned page.
 * This endpoint is called without any paging parameters.
 *
 * NOTE(review): safe-search is sent as URL parameter `p` ("-1" for strict,
 * "1" for off) — confirm these values against DDG's parameter reference.
 *
 * @throws Error when the response status is not OK.
 */
async function tryHtmlEndpoint(
  query: string,
  maxResults: number,
  safeSearch: "strict" | "moderate" | "off",
  signal: AbortSignal,
): Promise<ReadonlyArray<SearchHit>> {
  const endpoint = new URL("https://duckduckgo.com/html/");
  endpoint.searchParams.set("q", query);

  // "moderate" sends no safe-search parameter at all.
  const safeSearchValue =
    safeSearch === "strict" ? "-1" : safeSearch === "off" ? "1" : undefined;
  if (safeSearchValue !== undefined) {
    endpoint.searchParams.set("p", safeSearchValue);
  }

  const response = await fetch(endpoint.toString(), {
    method: "GET",
    signal,
    headers: buildDDGHeaders(),
  });
  if (!response.ok) throw new Error(`DDG html HTTP ${response.status}`);
  return parseHtmlResults(await response.text(), maxResults);
}
/**
 * Extract search hits from a DDG lite results page.
 *
 * Links are anchors with class "result-link"; snippets are table cells with
 * class "result-snippet". Each link is paired with the snippet that appears
 * after it and before the next link, so a link that is filtered out, or a
 * result without a snippet, cannot shift the snippets of later results.
 *
 * @param html Raw response body.
 * @param maxResults Maximum number of hits to return.
 * @returns Unique hits (by URL) in page order; the title stands in for a
 * missing snippet.
 */
function parseLiteResults(
  html: string,
  maxResults: number,
): ReadonlyArray<SearchHit> {
  const linkRe =
    /<a[^>]+class="result-link"[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>/gi;
  const snippetRe = /<td[^>]+class="result-snippet"[^>]*>([\s\S]*?)<\/td>/gi;

  // decodeURIComponent throws URIError on malformed escapes; one bad href
  // must not discard the whole page, so fall back to the raw value.
  const decodeUrl = (raw: string): string => {
    try {
      return decodeURIComponent(raw);
    } catch {
      return raw;
    }
  };

  // Every link match is recorded (even ones that will be skipped) because
  // its position bounds the region in which its snippet may appear.
  const anchors: Array<{
    pos: number;
    url: string;
    title: string;
    usable: boolean;
  }> = [];
  const snippets: Array<{ pos: number; text: string }> = [];

  let m: RegExpExecArray | null;
  while ((m = linkRe.exec(html)) !== null) {
    const url = decodeUrl(m[1]).trim();
    anchors.push({
      pos: m.index,
      url,
      title: stripTags(m[2]).trim(),
      usable: url.startsWith("http") && !DDG_INTERNAL.test(url),
    });
  }
  while ((m = snippetRe.exec(html)) !== null) {
    snippets.push({
      pos: m.index,
      text: stripTags(m[1]).replace(/\s+/g, " ").trim(),
    });
  }

  const hits: SearchHit[] = [];
  const seen = new Set<string>();
  let s = 0; // cursor into `snippets`, advances monotonically
  for (let i = 0; i < anchors.length && hits.length < maxResults; i++) {
    const anchor = anchors[i];
    const nextPos = i + 1 < anchors.length ? anchors[i + 1].pos : Infinity;

    // Skip snippets located before this link, then claim the first one
    // that sits between this link and the next.
    while (s < snippets.length && snippets[s].pos < anchor.pos) s++;
    let snippet = anchor.title;
    if (s < snippets.length && snippets[s].pos < nextPos) {
      snippet = snippets[s].text;
      s++;
    }

    if (!anchor.usable || seen.has(anchor.url)) continue;
    seen.add(anchor.url);
    hits.push({ url: anchor.url, title: anchor.title, snippet });
  }
  return hits;
}
/**
 * Extract search hits from a DDG HTML results page.
 *
 * The page is split into blocks starting at each `<div>` whose class list
 * contains the word "result"; within a block the "result__a" anchor gives
 * URL and title and "result__snippet" gives the snippet. Redirect links
 * carrying a `uddg` parameter are unwrapped to their target. If nothing is
 * found this way, the page is handed to `parseLegacy`.
 *
 * @param html Raw response body.
 * @param maxResults Maximum number of hits to return.
 * @returns Unique hits (by URL) in page order; the title stands in for a
 * missing snippet.
 */
function parseHtmlResults(
  html: string,
  maxResults: number,
): ReadonlyArray<SearchHit> {
  const hits: SearchHit[] = [];
  const seen = new Set<string>();
  const resultBlockRe =
    /<div[^>]+class="[^"]*\bresult\b[^"]*"[^>]*>([\s\S]*?)(?=<div[^>]+class="[^"]*\bresult\b|$)/gi;
  const linkRe = /class="result__a"[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/i;
  const snippetRe =
    /class="result__snippet"[^>]*>([\s\S]*?)<\/(?:a|span|td|div)>/i;

  // decodeURIComponent throws URIError on malformed escapes; one bad href
  // must not discard the whole page, so fall back to the raw value.
  const decodeUrl = (raw: string): string => {
    try {
      return decodeURIComponent(raw);
    } catch {
      return raw;
    }
  };

  let blockMatch: RegExpExecArray | null;
  while (
    hits.length < maxResults &&
    (blockMatch = resultBlockRe.exec(html)) !== null
  ) {
    const block = blockMatch[1];
    const linkMatch = linkRe.exec(block);
    if (!linkMatch) continue;

    // Prefer the redirect target in `uddg` when present, else the href.
    const href = linkMatch[1];
    const uddgMatch = /[?&]uddg=([^&]+)/.exec(href);
    const rawUrl = decodeUrl(uddgMatch ? uddgMatch[1] : href);

    if (!rawUrl.startsWith("http")) continue;
    if (DDG_INTERNAL.test(rawUrl)) continue;
    if (seen.has(rawUrl)) continue;
    seen.add(rawUrl);

    const title = stripTags(linkMatch[2]).replace(/\s+/g, " ").trim();
    const snippetMatch = snippetRe.exec(block);
    const snippet = snippetMatch
      ? stripTags(snippetMatch[1]).replace(/\s+/g, " ").trim()
      : title;
    hits.push({ url: rawUrl, title, snippet });
  }

  if (hits.length === 0) {
    return parseLegacy(html, maxResults);
  }
  return hits;
}
/**
 * Last-resort parser: scan every `href="…"` attribute for an embedded
 * http(s) URL and use the text following the tag as the title.
 *
 * The captured URL runs from the last "http"/"https" inside the attribute
 * up to the first `?`, `&` or `"`, and is then percent-decoded. Snippets
 * are not available here, so the title is reused as the snippet.
 *
 * @param html Raw response body.
 * @param maxResults Maximum number of hits to return.
 * @returns Unique hits (by URL) in page order.
 */
function parseLegacy(
  html: string,
  maxResults: number,
): ReadonlyArray<SearchHit> {
  const hits: SearchHit[] = [];
  const seen = new Set<string>();
  const re = /\shref="[^"]*(https?[^?&"]+)[^>]*>([^<]*)/gm;

  // decodeURIComponent throws URIError on malformed escapes (e.g. a bare
  // "%"); keep the raw text so one bad link doesn't abort the whole parse.
  const decodeUrl = (raw: string): string => {
    try {
      return decodeURIComponent(raw);
    } catch {
      return raw;
    }
  };

  let m: RegExpExecArray | null;
  while (hits.length < maxResults && (m = re.exec(html)) !== null) {
    const rawUrl = decodeUrl(m[1]);
    const title = m[2].replace(/\s+/g, " ").trim();
    if (DDG_INTERNAL.test(rawUrl)) continue;
    if (seen.has(rawUrl)) continue;
    seen.add(rawUrl);
    hits.push({ url: rawUrl, title, snippet: title });
  }
  return hits;
}

/** URLs matching this pattern are dropped by the result parsers. */
const DDG_INTERNAL = /duckduckgo\.com|bing\.com/;
/**
 * Remove HTML tags from a fragment and decode common character references.
 *
 * Decoded references: the named entities amp, lt, gt, quot, apos and nbsp
 * (nbsp becomes a plain space), plus decimal and hexadecimal numeric
 * references. Decoding happens in a single pass, so text that was escaped
 * twice is only unescaped once. Unrecognised references are left as-is.
 *
 * @param html HTML fragment.
 * @returns Plain text.
 */
function stripTags(html: string): string {
  const named = new Map<string, string>([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", " "],
  ]);
  return html
    .replace(/<[^>]+>/g, "")
    .replace(
      /&(amp|lt|gt|quot|apos|nbsp|#x[0-9a-f]+|#[0-9]+);/gi,
      (whole: string, ref: string): string => {
        const key = ref.toLowerCase();
        if (key.charAt(0) !== "#") return named.get(key) ?? whole;
        const code =
          key.charAt(1) === "x"
            ? parseInt(key.slice(2), 16)
            : parseInt(key.slice(1), 10);
        // String.fromCodePoint throws outside the Unicode range.
        return code > 0 && code <= 0x10ffff
          ? String.fromCodePoint(code)
          : whole;
      },
    )
    // Literal no-break spaces are normalised to plain spaces as well.
    .replace(/\u00a0/g, " ");
}