Project Files

.claude

settings.local.json

src

bing

build-urls.ts

index.ts

parse-results.ts

search-images.ts

cache

image-search-results-payload.ts

index.ts

search-cache-key.ts

ttl-cache.ts

web-search-result.ts

web-search-results-payload.ts

config

auto-sentinel.ts

config-schematics.ts

resolve-config.ts

duckduckgo

build-urls.ts

index.ts

parse-results.ts

search-web.ts

enrichment

enrich-search-results.ts

index.ts

metascraper-helpers.d.ts

metascraper.ts

errors

abort-error.ts

error-message.ts

index.ts

no-results-error.ts

tool-error.ts

unsupported-content-type-error.ts

index.ts

lmstudio-home.ts

markdown-path.ts

url-filename.ts

http

decode-bytes.ts

fetch-error.ts

fetch-retry.ts

fetch.ts

impit-client.ts

impit-error.ts

index.ts

parse-content-type.ts

redirects.ts

response-body.ts

ssrf.ts

url-schema.ts

images

download-image.ts

download-images.ts

index.ts

page

fetch-page.ts

fetched-page.ts

index.ts

page-kind.ts

render-visit-result.ts

parsers

image-extensions.ts

index.ts

page-images.ts

page-text.ts

pdf-text.ts

index.ts

safe-search.ts

search-page-parameter.ts

text

escape-markdown.ts

html-to-markdown.ts

html-to-text.ts

index.ts

normalize-blank-lines.ts

normalize-text.ts

timing

index.ts

per-host-rate-limiter.ts

rate-limiter.ts

tools

fetch-images-tool.ts

image-search-tool.ts

visit-website-tool.ts

web-search-tool.ts

index.ts

tools-provider.ts

.gitignore

.prettierrc.json

CLAUDE.md

eslint.config.mjs

knip.json

LICENSE

manifest.json

package-lock.json

package.json

QWEN.md

README.md

thumbnail.png

tsconfig.json

tsdoc.json

src / page / fetch-page.ts

/**
 * Fetch arbitrary pages, classify the response into one of the supported content kinds,
 * and return a discriminated `FetchedPage`. Cached on disk by URL with TTL.
 */

import { decodeBytes, fetchOrThrow, readLimitedBytes } from "../http"
import { extractPdfContent } from "../parsers"

import { classifyPage, type PageKind } from "./page-kind"

import type { TTLCache } from "../cache"
import type { RequestOptions } from "../http"
import type { FetchedPage } from "./fetched-page"
import type { Impit } from "impit"

/**
 * Options controlling an outbound page fetch: everything accepted by `RequestOptions`
 * plus a size cap for the buffered response body.
 */
interface FetchPageOptions extends RequestOptions {
  /**
   * Hard upper bound on the response payload, in bytes. `fetchPage` forwards this value
   * to `readLimitedBytes` when buffering the response.
   */
  maxBytes: number
}

/**
 * Resolve `url` to a `FetchedPage`, preferring the cache over the network.
 *
 * On a cache hit the stored page is returned immediately and no request is made. On a
 * miss the response body is buffered (capped at `options.maxBytes`), classified from the
 * `content-type` header and the bytes themselves, converted into the matching
 * `FetchedPage` variant, stored in the cache under `url`, and returned.
 *
 * NOTE(review): any `beforeFetch` hook carried in `options` is presumably awaited inside
 * `fetchOrThrow`; if so it only runs on cache misses, because that call is never reached
 * on a hit — confirm against the `http` module.
 *
 * @param impit - Shared HTTP client used for the request.
 * @param cache - Cache of previously fetched pages, keyed by URL.
 * @param url - Target URL; also used verbatim as the cache key.
 * @param options - Request options, including the `maxBytes` body cap.
 * @returns The cached page when present, otherwise the freshly fetched and classified page.
 * @throws Propagates any rejection from `fetchOrThrow`, `readLimitedBytes`, or
 * `classifyPage` (per the module's contract: non-2xx status, body over the size cap, or a
 * content type outside the whitelist).
 */
export async function fetchPage(
  impit: Impit,
  cache: TTLCache<FetchedPage>,
  url: string,
  options: FetchPageOptions
): Promise<FetchedPage> {
  const cachedPage = await cache.get(url)

  if (cachedPage === undefined) {
    // Cache miss: everything below touches the network or does decoding work.
    const response = await fetchOrThrow(impit, url, options)
    const body = await readLimitedBytes(response, options.maxBytes, url)
    const declaredContentType = response.headers.get("content-type")
    const classification = await classifyPage(body, declaredContentType, url)
    const freshPage = await materializeFetchedPage(
      classification.kind,
      classification.mimeType,
      body,
      declaredContentType
    )

    // Store before returning so the next request for this URL is served from the cache.
    await cache.set(url, freshPage)

    return freshPage
  }

  return cachedPage
}

/**
 * Build the `FetchedPage` variant that corresponds to an already-resolved page kind.
 *
 * - `"html"`: the body is decoded with `decodeBytes` and returned as `html`.
 * - `"pdf"`: the body is handed to `extractPdfContent`, whose `text` and `title` are
 *   returned.
 * - any other kind: the body is decoded with `decodeBytes` and returned as `text` with an
 *   empty `title`.
 *
 * @param kind - Resolved page kind.
 * @param mimeType - Effective MIME type that informed the kind; copied onto the result.
 * @param bytes - Raw response body.
 * @param contentTypeHeader - Raw `content-type` header value, or `null` when absent;
 * passed to `decodeBytes` alongside the body.
 * @returns The kind-specific fetched-page record.
 */
async function materializeFetchedPage(
  kind: PageKind,
  mimeType: string,
  bytes: Buffer,
  contentTypeHeader: string | null
): Promise<FetchedPage> {
  switch (kind) {
    case "html":
      return { kind, html: decodeBytes(bytes, contentTypeHeader), mimeType }

    case "pdf": {
      const pdf = await extractPdfContent(bytes)

      return { kind, text: pdf.text, title: pdf.title, mimeType }
    }

    default:
      // Remaining kinds share the plain decode path and carry no title.
      return { kind, text: decodeBytes(bytes, contentTypeHeader), mimeType, title: "" }
  }
}

web-tools