Forked from npacker/web-tools
Project Files
src / enrichment / enrich-search-results.ts
/**
* Enrich web search results with metascraper metadata. Each result's URL is fetched through the
* shared website cache and rate limiter, scraped, and merged onto the base record. Non-HTML pages
* yield no metadata, and per-result failures demote to an unenriched record since enrichment is
* best-effort.
*/
import { isAbortError } from "../errors"
import { createRetryNotifier } from "../http"
import { fetchPage } from "../page"
import { normalizeText } from "../text"
import type { TTLCache } from "../cache"
import type { WebSearchResult } from "../duckduckgo"
import type { EnrichmentMetascraper } from "./metascraper"
import type { RetryOptions } from "../http"
import type { FetchedPage } from "../page"
import type { PerHostRateLimiter } from "../timing"
import type { Impit } from "impit"
/**
 * Settings that every per-result fetch in a single enrichment run has in common.
 */
export interface EnrichSearchResultsOptions {
  /** Abort signal forwarded to each page fetch; aborting it cancels the whole run. */
  signal: AbortSignal
  /** Maximum size, in bytes, of the HTML payload fetched for any one result. */
  maxBytes: number
  /** Retry policy handed to each per-result page fetch. */
  retry: RetryOptions
  /** Sink for one-line status messages: progress counts and retry notifications. */
  status: (message: string) => void
}
/**
 * Fan out metadata enrichment over all search results at once. Every result is handed to
 * `enrichSearchResult` immediately; the per-host limiter spaces out requests that target the same
 * host while different hosts proceed in parallel, and cached pages bypass the limiter altogether.
 * A progress line is emitted through `options.status` each time one result finishes, so the
 * counter advances in completion order rather than input order.
 *
 * @param results - Base search results to enrich.
 * @param metascraper - Shared metascraper instance that extracts metadata from fetched pages.
 * @param impit - Shared HTTP client used for the outbound requests.
 * @param pageCache - URL-keyed cache of recently fetched pages, shared with other tools.
 * @param hostLimiter - Limiter that enforces the minimum gap between requests to one host.
 * @param options - Cancellation, retry, status, and byte-cap settings applied to every fetch.
 * @returns A new array, in input order, of results with any extracted metadata merged in.
 * @throws When `options.signal` aborts mid-run; the rejection is passed through untouched so the
 * caller can report a uniform abort message.
 */
export async function enrichSearchResults(
  results: WebSearchResult[],
  metascraper: EnrichmentMetascraper,
  impit: Impit,
  pageCache: TTLCache<FetchedPage>,
  hostLimiter: PerHostRateLimiter,
  options: EnrichSearchResultsOptions
): Promise<WebSearchResult[]> {
  // Captured up front so the denominator stays fixed for the whole run.
  const expectedCount = results.length
  let finishedCount = 0

  // Enrich one entry, then report how many entries have finished so far.
  const enrichAndReport = async (entry: WebSearchResult): Promise<WebSearchResult> => {
    const outcome = await enrichSearchResult(entry, metascraper, impit, pageCache, hostLimiter, options)
    finishedCount += 1
    options.status(`Enriching metadata... ${finishedCount}/${expectedCount}`)
    return outcome
  }

  // Promise.all keeps input order and rejects as soon as any single task rejects.
  return Promise.all(results.map(entry => enrichAndReport(entry)))
}
/**
 * Enrich one search result with metadata scraped from its page. The page is fetched via
 * `fetchPage` (shared cache, per-host limiter, retry policy); anything other than an HTML page
 * leaves the result as it was. Scraped fields are spread over a copy of the result, and a scraped
 * `description` that normalizes to the existing snippet is removed as a duplicate. Enrichment is
 * best-effort: any failure other than an abort hands back the original result unchanged.
 *
 * @param result - Base result to enrich; it is never mutated.
 * @param metascraper - Shared metascraper instance that extracts the metadata.
 * @param impit - Shared HTTP client used for the outbound request.
 * @param pageCache - URL-keyed cache of recently fetched pages.
 * @param hostLimiter - Limiter that enforces the minimum gap between requests to one host.
 * @param options - Cancellation, retry, status, and byte-cap settings for the fetch.
 * @returns Either the original result or a copy carrying the extracted metadata.
 * @throws When the failure is an abort error, so the caller can fail fast on cancellation.
 */
async function enrichSearchResult(
  result: WebSearchResult,
  metascraper: EnrichmentMetascraper,
  impit: Impit,
  pageCache: TTLCache<FetchedPage>,
  hostLimiter: PerHostRateLimiter,
  options: EnrichSearchResultsOptions
): Promise<WebSearchResult> {
  try {
    const fetched = await fetchPage(impit, pageCache, result.url, {
      signal: options.signal,
      retry: options.retry,
      onFailedAttempt: createRetryNotifier(options.status, "result enrichment"),
      maxBytes: options.maxBytes,
      limiter: hostLimiter,
    })

    // Only HTML pages are scraped; every other page kind is returned unenriched.
    if (fetched.kind !== "html") {
      return result
    }

    const scraped = await metascraper({ html: fetched.html, url: result.url })
    // Scraped fields win over same-named fields on the base record.
    const merged: WebSearchResult = { ...result, ...scraped }

    const { description, snippet } = merged
    const duplicatesSnippet =
      description !== undefined && snippet !== undefined && normalizeText(description) === snippet
    if (duplicatesSnippet) {
      // The description adds nothing beyond the snippet, so drop the key entirely.
      delete merged.description
    }

    return merged
  } catch (failure) {
    if (!isAbortError(failure)) {
      // Best-effort: a failed fetch or scrape demotes to the unenriched record.
      return result
    }
    throw failure
  }
}