Forked from npacker/web-tools
Project Files
src / parsers / page-images.ts
/**
* Extract images from a parsed website document.
*/
import { isHttpUrl, resolveUrl } from "../http"
import { normalizeText } from "../text"
import { hasSupportedImageExtension } from "./image-extensions"
import type { JSDOM } from "jsdom"
/**
 * Descriptor for one image reference collected from a page.
 *
 * All three members are required strings; `alt` and `title` fall back to an
 * empty string rather than being omitted.
 */
export interface PageImage {
  /** Absolute URL of the image source. */
  src: string
  /** Text of the `<img>` element's `alt` attribute; empty string when the attribute is absent. */
  alt: string
  /** Text of the `<img>` element's `title` attribute; empty string when the attribute is absent. */
  title: string
}
/**
 * Extract up to `maxImages` images from the document in document order, deduped by src.
 *
 * Candidates are the elements matching `img[src]`. An element is skipped when its
 * `src` is empty, when `resolveUrl` cannot resolve it against `baseUrl`, when the
 * resolved URL is rejected by `isHttpUrl` or `hasSupportedImageExtension`, or when
 * the same resolved URL was already emitted earlier in the document.
 *
 * @param dom - Parsed website DOM.
 * @param baseUrl - Absolute URL used to resolve relative image sources.
 * @param maxImages - Upper bound on the number of images to return. Zero, negative,
 *   and `NaN` values yield an empty array.
 * @returns Image descriptors in original document order, deduped by src.
 */
export function extractPageImages(dom: JSDOM, baseUrl: string, maxImages: number): PageImage[] {
  // A non-positive limit must return nothing. Checking only `=== 0` would let a
  // negative limit through and return one image (the post-push `>=` check fires
  // only after the first push), and a NaN limit would disable the cap entirely.
  if (Number.isNaN(maxImages) || maxImages <= 0) {
    return []
  }

  const images = dom.window.document.querySelectorAll("img[src]")
  const results: PageImage[] = []
  // Resolved URLs already emitted, so repeated images keep only their first occurrence.
  const seen = new Set<string>()

  for (const image of images) {
    const rawSource = image.getAttribute("src")
    if (rawSource === null || rawSource === "") {
      continue
    }

    const resolved = resolveUrl(rawSource, baseUrl)
    if (resolved === undefined || !isHttpUrl(resolved) || !hasSupportedImageExtension(resolved.href)) {
      continue
    }

    // Dedupe on the resolved absolute URL, not on the raw attribute text.
    if (seen.has(resolved.href)) {
      continue
    }
    seen.add(resolved.href)

    results.push({
      alt: normalizeText(image.getAttribute("alt")),
      title: normalizeText(image.getAttribute("title")),
      src: resolved.href,
    })

    // Stop scanning as soon as the cap is reached.
    if (results.length >= maxImages) {
      break
    }
  }

  return results
}