// Project Files
// src / parsers / search-results-parser.ts
/**
* HTML parsing for DuckDuckGo web-search result pages.
*/
import { JSDOM } from "jsdom"
import { normalizeText } from "../text"
/**
* CSS selector matching individual result blocks on the DuckDuckGo HTML endpoint.
*
* Applied to the whole parsed document; every matching element is treated as
* one candidate search result.
*/
const SEARCH_RESULT_SELECTOR = ".result"
/**
* CSS selector matching the result link within a result block.
*
* Looked up relative to a single result block; the first match supplies both
* the result's `href` and its label text.
*/
const SEARCH_RESULT_LINK_SELECTOR = ".result__a"
/**
* CSS selector matching the snippet element within a result block.
*
* Looked up relative to a single result block; the element is optional and a
* block without one is still accepted.
*/
const SEARCH_RESULT_SNIPPET_SELECTOR = ".result__snippet"
/**
* A single parsed web search result.
*
* Instances are produced by `parseSearchResults`; all three fields are always
* present as strings.
*/
interface SearchResult {
/** Human-readable title of the result link: the link's normalized text content, never empty. */
label: string
/** Destination URL of the result link, taken verbatim from the link's `href` attribute. */
url: string
/**
* Preview text extracted from the search result page (normalized text of the snippet element).
* NOTE(review): when the block has no snippet element this is whatever `normalizeText`
* returns for `undefined` — presumably an empty string; confirm against `../text`.
*/
snippet: string
}
/**
 * Parse web search results from DuckDuckGo HTML.
 *
 * Visits every element matching `SEARCH_RESULT_SELECTOR` in document order.
 * A block is kept only when it contains a link element that has an `href`
 * attribute and a non-empty normalized label. Results are deduplicated by the
 * raw `href` string, so the first block for a given URL wins.
 *
 * The jsdom window created for parsing is always closed before returning,
 * including when parsing or normalization throws.
 *
 * @param html Raw HTML payload returned by the DuckDuckGo HTML endpoint.
 * @param maxResults Upper bound on the number of results to return. Zero or a
 * negative value yields an empty list without parsing `html`.
 * @returns Deduplicated list of parsed search results, capped at `maxResults`.
 */
export function parseSearchResults(html: string, maxResults: number): SearchResult[] {
  const results: SearchResult[] = []

  // Nothing can be collected, so skip building a DOM altogether.
  if (maxResults <= 0) {
    return results
  }

  const dom = new JSDOM(html)
  try {
    const resultBlocks = dom.window.document.querySelectorAll(SEARCH_RESULT_SELECTOR)
    const seenUrls = new Set<string>()

    for (const block of resultBlocks) {
      if (results.length >= maxResults) {
        break
      }

      const link = block.querySelector(SEARCH_RESULT_LINK_SELECTOR)
      if (link === null) {
        continue
      }

      const url = link.getAttribute("href")
      if (url === null || seenUrls.has(url)) {
        continue
      }

      const label = normalizeText(link.textContent)
      if (label === "") {
        // The URL is deliberately not marked as seen here: a later block
        // pointing at the same URL with a usable label may still be emitted.
        continue
      }

      // The snippet is optional; a block without one is still a valid result.
      const snippetElement = block.querySelector(SEARCH_RESULT_SNIPPET_SELECTOR)
      const snippet = normalizeText(snippetElement?.textContent)

      seenUrls.add(url)
      results.push({ label, url, snippet })
    }
  } finally {
    // Only plain strings leave this function, so the window can be shut down
    // right away to release the resources jsdom holds for it.
    dom.window.close()
  }

  return results
}