Project Files

.claude

settings.local.json

src

bing

build-urls.ts

index.ts

parse-results.ts

search-images.ts

cache

image-search-results-payload.ts

index.ts

search-cache-key.ts

ttl-cache.ts

web-search-results-payload.ts

config

auto-sentinel.ts

config-defaults.ts

config-schematics.ts

resolve-config.ts

duckduckgo

build-urls.ts

index.ts

parse-results.ts

search-web.ts

web-search-result.ts

enrichment

enrich-search-results.ts

index.ts

metascraper-helpers.d.ts

metascraper.ts

errors

abort-error.ts

error-message.ts

index.ts

no-results-error.ts

tool-error.ts

unsupported-content-type-error.ts

index.ts

lmstudio-home.ts

markdown-path.ts

url-filename.ts

http

challenge.ts

decode.ts

fetch-error.ts

fetch-retry.ts

fetch.ts

impit-client.ts

impit-error.ts

index.ts

parse-content-type.ts

redirects.ts

response-body.ts

ssrf.ts

url-schema.ts

url.ts

images

download-image.ts

download-images.ts

index.ts

page

fetch-page.ts

fetched-page.ts

index.ts

page-kind.ts

parsers

image-extensions.ts

index.ts

page-images.ts

page-text.ts

parse-html.ts

pdf-text.ts

renderers

image-results.ts

index.ts

page-result.ts

retrieval

chunking.ts

excerpt.ts

index.ts

relevance.ts

selection.ts

index.ts

safe-search.ts

search-page-parameter.ts

text

escape-markdown.ts

html-to-markdown.ts

html-to-text.ts

index.ts

normalize-blank-lines.ts

normalize-text.ts

timing

index.ts

per-host-rate-limiter.ts

rate-limiter.ts

tools

fetch-images-tool.ts

image-search-tool.ts

visit-website-tool.ts

web-search-tool.ts

index.ts

tools-provider.ts

.gitignore

.prettierrc.json

CLAUDE.md

eslint.config.mjs

knip.json

LICENSE

manifest.json

package-lock.json

package.json

QWEN.md

README.md

thumbnail.png

tsconfig.json

tsdoc.json

src / parsers / page-images.ts

/**
 * Extract images from a parsed website document.
 */

import { isHttpUrl, resolveUrl } from "../http"
import { normalizeText } from "../text"

import { hasSupportedImageExtension } from "./image-extensions"

import type { JSDOM } from "jsdom"

/**
 * A single image reference extracted from the page.
 *
 * Instances are produced by `extractPageImages`, one per unique resolved image URL.
 */
export interface PageImage {
  /**
   * Alternative text from the `<img>` `alt` attribute, passed through `normalizeText`.
   * Expected to be an empty string when the attribute is absent (depends on how
   * `normalizeText` treats `null` — confirm against its implementation).
   */
  alt: string
  /**
   * Advisory text from the `<img>` `title` attribute, passed through `normalizeText`.
   * Expected to be an empty string when the attribute is absent (same caveat as `alt`).
   */
  title: string
  /** Absolute URL of the image source: the `src` attribute resolved against the page's base URL. */
  src: string
}

/**
 * Extract up to `maxImages` images from the document in document order, deduped by src.
 *
 * Only `<img>` elements carrying a non-empty `src` attribute are considered. Each source is
 * resolved against `baseUrl`; an entry is skipped when `resolveUrl` returns `undefined`, when
 * `isHttpUrl` rejects the resolved URL, or when `hasSupportedImageExtension` rejects its href.
 * Deduplication compares the resolved absolute href, so the first occurrence of an image wins.
 *
 * @param dom - Parsed website DOM.
 * @param baseUrl - Absolute URL used to resolve relative image sources.
 * @param maxImages - Upper bound on the number of images to return. Zero or a negative value
 *   yields an empty array.
 * @returns Image descriptors in original document order, deduped by src.
 */
export function extractPageImages(dom: JSDOM, baseUrl: string, maxImages: number): PageImage[] {
  // A non-positive limit admits no images. Checking only `=== 0` would let a negative limit
  // slip through and return one image, because the cap is tested after the first push.
  if (maxImages <= 0) return []

  const images = dom.window.document.querySelectorAll("img[src]")
  const results: PageImage[] = []
  // Resolved hrefs already emitted; used to drop repeated references to the same image.
  const seen = new Set<string>()

  for (const image of images) {
    const rawSource = image.getAttribute("src")

    // The selector guarantees the attribute exists, but it may still be empty.
    if (rawSource === null || rawSource === "") {
      continue
    }

    const resolved = resolveUrl(rawSource, baseUrl)

    if (resolved === undefined || !isHttpUrl(resolved) || !hasSupportedImageExtension(resolved.href)) {
      continue
    }

    // Dedupe on the absolute href so relative and absolute spellings of one URL collapse.
    if (seen.has(resolved.href)) {
      continue
    }

    seen.add(resolved.href)
    results.push({
      alt: normalizeText(image.getAttribute("alt")),
      title: normalizeText(image.getAttribute("title")),
      src: resolved.href,
    })

    // Stop scanning as soon as the cap is reached.
    if (results.length >= maxImages) {
      break
    }
  }

  return results
}

web-tools