// Project file: src/search/duckduckgo.ts
import * as cheerio from "cheerio";
import { fetchHtml } from "../crawler/fetchHtml";
import { normalizeUrl } from "../crawler/normalizeUrl";
import { SEARCH_RESULT_LIMIT } from "../utils/constants";
// --- CACHING MECHANISM START (With TTL) ---
/** Lifetime of a cache entry: 30 minutes, expressed in milliseconds. */
const CACHE_TTL_MS = 30 * 60 * 1000;

/**
 * A single cached search.
 */
interface CachedResult {
  /** Promise of the result URLs stored for the query. */
  value: Promise<string[]>;
  /** Millisecond timestamp at or after which the entry is considered expired. */
  expiresAt: number;
}

/**
 * Module-level, in-memory store of search results.
 * Keys are query strings (e.g. "best ai models"); values are CachedResult entries.
 */
const searchCache: Map<string, CachedResult> = new Map();
/**
 * Looks up a non-expired cache entry for a query.
 *
 * The lookup itself is synchronous: the function returns either the stored
 * promise or a plain `null`, never a promise that resolves to `null`. The
 * return type reflects that, so callers can test the result for truthiness
 * and hand the promise straight back to their own callers.
 *
 * An entry whose expiry time has been reached is evicted as a side effect.
 *
 * @param query - The query string used as the cache key.
 * @returns The cached promise of result URLs, or `null` when there is no
 *   entry or the entry has expired.
 */
function getCachedResults(query: string): Promise<string[]> | null {
  const entry = searchCache.get(query);
  if (!entry) {
    return null;
  }

  // An entry is stale as soon as the clock reaches its expiry timestamp.
  if (Date.now() >= entry.expiresAt) {
    console.log(`[CACHE MISS] Cache for query "${query}" expired.`);
    searchCache.delete(query); // Evict so the map does not keep dead entries.
    return null;
  }

  console.log(`[CACHE HIT] Returning cached results for query: ${query}`);
  return entry.value;
}
/**
 * Stores the promise of results for a query, replacing any entry already
 * held under the same key. The entry expires CACHE_TTL_MS after this call.
 *
 * @param query - The query string used as the cache key.
 * @param promise - Promise of the result URLs to cache.
 */
function setCache(query: string, promise: Promise<string[]>): void {
  const expiresAt = Date.now() + CACHE_TTL_MS;
  searchCache.set(query, { value: promise, expiresAt });
}
// --- CACHING MECHANISM END ---
/**
 * Extracts the destination URL from a DuckDuckGo redirect link.
 *
 * The href is resolved against "https://duckduckgo.com", so relative and
 * protocol-relative links parse as well as absolute ones. When the parsed
 * URL carries a non-empty `uddg` query parameter, that parameter's value is
 * returned; otherwise the href is returned unchanged.
 *
 * @param href - The raw `href` attribute value of a result link.
 * @returns The redirect target if present, else the original href.
 */
function decodeDuckGoRedirect(href: string): string {
  try {
    const parsed = new URL(href, "https://duckduckgo.com");

    // URLSearchParams.get() already percent-decodes the value. Decoding it a
    // second time would corrupt targets that legitimately contain escapes
    // (e.g. "%2520" must stay "%20") and would throw on a literal "%".
    const target = parsed.searchParams.get("uddg");

    return target ? target : href;
  } catch {
    // The href could not be parsed as a URL; hand it back untouched.
    return href;
  }
}
/**
* Searches DuckDuckGo and returns the top N unique, normalized URLs.
*/
/**
 * Collects unique, normalized result URLs from a DuckDuckGo HTML results page.
 *
 * Selectors are tried in order, most specific first; later selectors are only
 * consulted while fewer than SEARCH_RESULT_LIMIT URLs have been collected.
 */
function extractResultUrls(html: string): string[] {
  const $ = cheerio.load(html);
  const selectors = [
    ".result__title a[href]", // Title link: the preferred source.
    ".result a[data-testid='result-title-a']",
    ".result a[href]"
  ];

  // A Set de-duplicates while preserving insertion (i.e. page) order.
  const found = new Set<string>();

  for (const selector of selectors) {
    $(selector).each((_, el) => {
      const href = $(el).attr("href");
      if (!href) {
        return; // Anchor without a usable href.
      }

      // Decode before filtering: result links can be relative or
      // protocol-relative redirect links whose real target only becomes
      // visible once the redirect has been unwrapped.
      const decoded = decodeDuckGoRedirect(href);
      if (!/^https?:\/\//i.test(decoded)) {
        return; // Not an absolute web URL (internal link, script, etc.).
      }

      const normalized = normalizeUrl("https://duckduckgo.com", decoded);
      if (normalized) {
        found.add(normalized);
      }
    });

    if (found.size >= SEARCH_RESULT_LIMIT) {
      break;
    }
  }

  return [...found].slice(0, SEARCH_RESULT_LIMIT);
}

/**
 * Performs the network request for a query and parses the response.
 * Any failure is logged and re-thrown as `Error("Search failed: ...")`.
 */
async function fetchSearchResults(query: string): Promise<string[]> {
  const searchUrl = new URL("https://html.duckduckgo.com/html/");
  searchUrl.searchParams.set("q", query);

  try {
    const html = await fetchHtml(searchUrl.toString());
    return extractResultUrls(html);
  } catch (error) {
    console.error("Error during DuckDuckGo search:", error);
    // Re-throw a descriptive error that the calling function can handle gracefully
    throw new Error(`Search failed: ${
      error instanceof Error ? error.message : "Unknown network error"
    }`);
  }
}

/**
 * Searches DuckDuckGo and returns the top N unique, normalized URLs
 * (N = SEARCH_RESULT_LIMIT).
 *
 * Results are cached per query string. The promise of the results is placed
 * in the cache before the request completes, so concurrent calls for the
 * same query share one network request. A failed request is removed from the
 * cache again so that the next call retries.
 *
 * @param query - The search query.
 * @returns The result URLs, at most SEARCH_RESULT_LIMIT of them.
 * @throws Error with a "Search failed: ..." message when fetching or parsing fails.
 */
export async function searchDuckDuckGo(
  query: string
): Promise<string[]> {
  const cached = getCachedResults(query);
  if (cached) {
    return cached; // Live entry: either finished or still in flight.
  }

  console.log(`[NETWORK CALL] Searching DuckDuckGo for query: ${query}`);

  const pending = fetchSearchResults(query);
  setCache(query, pending);

  // Never serve a failure from the cache. Only evict the entry if it is still
  // the one registered here; a newer entry for the same query must survive.
  void pending.catch(() => {
    const current = searchCache.get(query);
    if (current && current.value === pending) {
      searchCache.delete(query);
    }
  });

  return pending;
}