Project Files

.claude

settings.local.json

src

cache

cached-search-results.ts

index.ts

search-cache-key.ts

ttl-cache.ts

config

auto-sentinel.ts

config-schematics.ts

resolve-config.ts

duckduckgo

build-urls.ts

fetch-vqd-token.ts

index.ts

safe-search.ts

search-images.ts

search-web.ts

vqd-token-error.ts

errors

index.ts

no-results-error.ts

tool-error.ts

unsupported-content-type-error.ts

index.ts

lmstudio-home.ts

markdown-path.ts

url-filename.ts

http

decode-bytes.ts

fetch-error.ts

fetch-ok.ts

follow-redirects.ts

impit-client.ts

index.ts

parse-content-type.ts

read-limited-body.ts

retry.ts

url-guard.ts

url-schema.ts

images

download-image.ts

download-images.ts

index.ts

parsers

page

page-images.ts

page-text.ts

pdf-text.ts

image-results-parser.ts

index.ts

search-results-parser.ts

vqd-parser.ts

text

escape-markdown.ts

html-to-markdown.ts

html-to-text.ts

index.ts

normalize-blank-lines.ts

normalize-text.ts

timing

index.ts

rate-limiter.ts

sleep.ts

tools

image-search-tool.ts

view-images-tool.ts

visit-website-tool.ts

web-search-tool.ts

website

fetch-website.ts

fetched-page.ts

index.ts

page-kind.ts

render-visit-result.ts

index.ts

tools-provider.ts

.gitignore

.prettierrc.json

CLAUDE.md

eslint.config.mjs

knip.json

LICENSE

manifest.json

package-lock.json

package.json

QWEN.md

README.md

thumbnail.png

tsconfig.json

src / parsers / page / page-images.ts

/**
 * Extract images from a parsed website document.
 */

import { normalizeText } from "../../text"
import { URL_EXTENSION_PATTERN, isSupportedImageExtension } from "../image-results-parser"

import type { JSDOM } from "jsdom"

/**
 * A single image reference extracted from the page.
 *
 * Instances are built by `extractPageImages`, one per distinct resolved `src`.
 */
export interface PageImage {
  /**
   * Alternative text from the `<img>` `alt` attribute, passed through `normalizeText`.
   * Expected to be an empty string when the attribute is absent (assumes `normalizeText`
   * maps `null` to `""` — confirm against its implementation).
   */
  alt: string
  /**
   * Advisory text from the `<img>` `title` attribute, passed through `normalizeText`.
   * Expected to be an empty string when the attribute is absent (same assumption as `alt`).
   */
  title: string
  /** Absolute URL of the image source, i.e. the `src` attribute resolved against the page's base URL. */
  src: string
}

/**
 * Extract up to `maxImages` images from the document in document order, deduped by src.
 *
 * An `<img>` element contributes a result only when its `src` attribute is non-empty, resolves
 * against `baseUrl` to an `http://` or `https://` URL, and carries a supported image extension.
 * Deduplication is keyed on the resolved absolute URL, so the first occurrence wins and supplies
 * the `alt` / `title` text.
 *
 * @param dom Parsed website DOM.
 * @param baseUrl Absolute URL used to resolve relative image sources.
 * @param maxImages Upper bound on the number of images to return; zero or negative yields `[]`.
 * @returns Image descriptors in original document order, deduped by src.
 */
export function extractPageImages(dom: JSDOM, baseUrl: string, maxImages: number): PageImage[] {
  // Treat every non-positive limit as "no images". The cap below is only checked after a push,
  // so without this guard a negative limit would still let one image through.
  if (maxImages <= 0) {
    return []
  }

  const images = dom.window.document.querySelectorAll("img[src]")
  const results: PageImage[] = []
  const seen = new Set<string>()

  for (const image of images) {
    const rawSource = image.getAttribute("src")

    // `img[src]` also matches `src=""`, which would otherwise resolve to the page URL itself.
    if (rawSource === null || rawSource === "") {
      continue
    }

    const resolved = resolveUrl(rawSource, baseUrl)

    if (resolved === undefined) {
      continue
    }

    // Match the scheme exactly: a bare "http" prefix would also accept schemes such as `httpx:`.
    const isHttpUrl = resolved.startsWith("http://") || resolved.startsWith("https://")

    if (!isHttpUrl || !urlHasImageExtension(resolved)) {
      continue
    }

    if (seen.has(resolved)) {
      continue
    }

    seen.add(resolved)
    results.push({
      alt: normalizeText(image.getAttribute("alt")),
      title: normalizeText(image.getAttribute("title")),
      src: resolved,
    })

    // Stop scanning as soon as the cap is reached.
    if (results.length >= maxImages) {
      break
    }
  }

  return results
}

/**
 * Turn a relative or absolute URL into an absolute href, using `baseUrl` as the reference point.
 *
 * Parsing failures (for example an unparsable base) are reported as `undefined` rather than thrown.
 *
 * @param rawUrl URL to resolve; may be absolute or relative.
 * @param baseUrl Absolute URL used as the resolution base.
 * @returns The absolute href, or `undefined` when resolution fails.
 */
function resolveUrl(rawUrl: string, baseUrl: string): string | undefined {
  let absolute: URL

  try {
    // The constructor is the only step that can throw, so keep the guarded region minimal.
    absolute = new URL(rawUrl, baseUrl)
  } catch {
    return undefined
  }

  return absolute.href
}

/**
 * Decide whether a URL points at a supported image type, judged by its file extension.
 *
 * The extension is taken from the first capture group of `URL_EXTENSION_PATTERN` and lower-cased
 * before being checked with `isSupportedImageExtension`.
 *
 * @param url URL to inspect.
 * @returns `true` when the pattern matches and the captured extension is supported, else `false`.
 */
function urlHasImageExtension(url: string): boolean {
  const captured = URL_EXTENSION_PATTERN.exec(url)

  // No match means there is no recognisable extension at all.
  return captured !== null && isSupportedImageExtension(captured[1].toLowerCase())
}

web-tools