Forked from npacker/web-tools
Project Files
src / parsers / page / page-images.ts
/**
* Extract images from a parsed website document.
*/
import { normalizeText } from "../../text"
import { hasSupportedImageExtension } from "../image-extensions"
import type { JSDOM } from "jsdom"
/**
 * Descriptor for one `<img>` element collected from a page.
 */
export interface PageImage {
  /** Absolute URL the image is loaded from. */
  src: string
  /** Text of the `<img>` `alt` attribute; an empty string when the attribute is absent. */
  alt: string
  /** Text of the `<img>` `title` attribute; an empty string when the attribute is absent. */
  title: string
}
/**
 * Extract up to `maxImages` images from the document in document order, deduped by src.
 *
 * Each `<img src>` is resolved against `baseUrl`. An image is kept only when its resolved URL
 * uses the `http:` or `https:` scheme and passes `hasSupportedImageExtension`. When the same
 * resolved URL appears more than once, the first occurrence (and its `alt`/`title`) wins.
 *
 * @param dom Parsed website DOM.
 * @param baseUrl Absolute URL used to resolve relative image sources.
 * @param maxImages Upper bound on the number of images to return. Zero, negative, and `NaN`
 *   values yield an empty array.
 * @returns Image descriptors in original document order, deduped by src.
 */
export function extractPageImages(dom: JSDOM, baseUrl: string, maxImages: number): PageImage[] {
  // `!(x > 0)` also rejects negative and NaN limits. A plain `=== 0` check would let a
  // negative limit through, and the loop below would then stop only after pushing one image.
  if (!(maxImages > 0)) {
    return []
  }
  const images = dom.window.document.querySelectorAll("img[src]")
  const results: PageImage[] = []
  const seen = new Set<string>()
  for (const image of images) {
    const rawSource = image.getAttribute("src")
    if (rawSource === null || rawSource === "") {
      continue
    }
    const resolved = resolveUrl(rawSource, baseUrl)
    if (resolved === undefined) {
      continue
    }
    // Match the full scheme prefix: a bare `startsWith("http")` would also accept unrelated
    // schemes that merely begin with those letters (e.g. `httpx:`).
    const isHttp = resolved.startsWith("http://") || resolved.startsWith("https://")
    if (!isHttp || !hasSupportedImageExtension(resolved)) {
      continue
    }
    // Dedupe on the resolved absolute URL so relative and absolute spellings collapse.
    if (seen.has(resolved)) {
      continue
    }
    seen.add(resolved)
    results.push({
      alt: normalizeText(image.getAttribute("alt")),
      title: normalizeText(image.getAttribute("title")),
      src: resolved,
    })
    if (results.length >= maxImages) {
      break
    }
  }
  return results
}
/**
 * Turn a URL reference into an absolute href by resolving it against a base URL.
 *
 * @param rawUrl URL reference to resolve; absolute or relative.
 * @param baseUrl Absolute URL that relative references are resolved against.
 * @returns The absolute href, or `undefined` when the `URL` constructor rejects the inputs.
 */
function resolveUrl(rawUrl: string, baseUrl: string): string | undefined {
  let absoluteHref: string | undefined
  try {
    const parsed = new URL(rawUrl, baseUrl)
    absoluteHref = parsed.href
  } catch {
    // The constructor throws on unparseable input; report that as "no result".
    absoluteHref = undefined
  }
  return absoluteHref
}