Project Files

.claude

settings.local.json

src

cache

cached-search-results.ts

index.ts

search-cache-key.ts

ttl-cache.ts

config

auto-sentinel.ts

config-schematics.ts

resolve-config.ts

duckduckgo

build-urls.ts

fetch-vqd-token.ts

index.ts

safe-search.ts

search-images.ts

search-web.ts

vqd-token-error.ts

errors

index.ts

no-results-error.ts

tool-error.ts

unsupported-content-type-error.ts

index.ts

lmstudio-home.ts

markdown-path.ts

url-filename.ts

http

decode-bytes.ts

fetch-error.ts

fetch-ok.ts

follow-redirects.ts

impit-client.ts

index.ts

parse-content-type.ts

read-limited-body.ts

retry.ts

url-guard.ts

url-schema.ts

images

download-image.ts

download-images.ts

index.ts

parsers

page

page-images.ts

page-text.ts

pdf-text.ts

image-results-parser.ts

index.ts

search-results-parser.ts

vqd-parser.ts

text

escape-markdown.ts

html-to-markdown.ts

html-to-text.ts

index.ts

normalize-blank-lines.ts

normalize-text.ts

timing

index.ts

rate-limiter.ts

sleep.ts

tools

image-search-tool.ts

view-images-tool.ts

visit-website-tool.ts

web-search-tool.ts

website

fetch-website.ts

fetched-page.ts

index.ts

page-kind.ts

render-visit-result.ts

index.ts

tools-provider.ts

.gitignore

.prettierrc.json

CLAUDE.md

eslint.config.mjs

knip.json

LICENSE

manifest.json

package-lock.json

package.json

QWEN.md

README.md

thumbnail.png

tsconfig.json

src / website / page-kind.ts

/**
 * Classification of a fetched response into one of the content shapes Visit Website can
 * render into readable text. The declared `Content-Type` header is trusted when it names
 * a recognised MIME type; a generic label (`application/octet-stream` and friends) falls
 * back to magic-number sniffing via `file-type`. A missing header defaults to HTML rather
 * than paying for a sniff, because the sniff library only recognises binary formats.
 */

import { fileTypeFromBuffer } from "file-type"

import { UnsupportedContentTypeError } from "../errors/unsupported-content-type-error"
import { parseContentTypeSafe } from "../http"

/**
 * Content shapes that Visit Website knows how to turn into readable text. Within this
 * module the kinds are assigned from a MIME type as follows:
 *
 * - `"html"` — `text/html` or `application/xhtml+xml`; also the fallback when no MIME type
 *   could be read from the header, or when a generic header could not be sniffed.
 * - `"pdf"` — `application/pdf`.
 * - `"json"` — `application/json`, `text/json`, or any type ending in `+json`.
 * - `"text"` — any other `text/*` type.
 *
 * Consumers are expected to pick a decoding and extraction pipeline per kind; those
 * pipelines live outside this file.
 */
export type PageKind = "html" | "pdf" | "text" | "json"

/**
 * Outcome of classifying a response: the resolved kind together with the MIME type that
 * accompanied the decision.
 */
export interface PageClassification {
  /** Resolved page kind. */
  kind: PageKind
  /**
   * MIME type reported alongside the kind. Depending on the path taken by `classifyPage`
   * this is one of: the lower-cased declared type (specific header), the sniffed type
   * (generic header, successful sniff), the generic declared type itself (generic header,
   * sniff found nothing — `kind` is then `"html"`), or `text/html` when no MIME type could
   * be read from the header.
   */
  mimeType: string
}

/**
 * MIME type reported by `classifyPage` when no type can be read from the `Content-Type`
 * header (for example because the header is absent).
 *
 * @const {string}
 * @default
 */
const DEFAULT_MIME = "text/html"

/**
 * Declared MIME labels that say nothing useful about the payload. `classifyPage` does not
 * trust these and inspects the body's magic number instead.
 *
 * @const {ReadonlySet<string>}
 */
const GENERIC_MIME_TYPES: ReadonlySet<string> = new Set<string>([
  "application/octet-stream",
  "binary/octet-stream"
])

/**
 * Length, in bytes, of the body prefix handed to `file-type` for magic-number sniffing
 * (4 KiB). The intent is a window large enough for signature detection yet small relative
 * to the byte budget used for downstream extraction.
 * NOTE(review): whether 4 KiB covers every signature `file-type` checks is not verifiable
 * from this file — confirm against the library's recommended sample size.
 *
 * @const {number}
 * @default
 */
const SNIFF_BYTES = 4 * 1024

/**
 * Decide which supported kind a response body belongs to and which MIME type backs that
 * decision. Three paths exist:
 *
 * 1. No MIME type can be read from the header — the page is assumed to be HTML and no
 *    sniffing is attempted (`file-type` targets binary formats, so text payloads such as
 *    HTML, plain text, or JSON would yield nothing).
 * 2. The header names a generic type — the first `SNIFF_BYTES` bytes are sniffed; a hit is
 *    resolved like a declared type, a miss falls back to HTML while keeping the declared
 *    generic label as the reported MIME type.
 * 3. Any other declared type is trusted as-is and resolved directly.
 *
 * @param bytes Raw response body buffered in memory.
 * @param contentTypeHeader Raw `content-type` header value, or `null` when absent.
 * @param url Target URL carried on the resulting error for diagnostics.
 * @returns The classified kind and the MIME type the classification is based on.
 * @throws {UnsupportedContentTypeError} When the declared or sniffed MIME type maps to no
 *   supported kind.
 */
export async function classifyPage(
  bytes: Buffer,
  contentTypeHeader: string | null,
  url: string
): Promise<PageClassification> {
  const parsedHeader = parseContentTypeSafe(contentTypeHeader)
  const headerMime = parsedHeader?.type.toLowerCase()

  // Nothing usable in the header: assume HTML outright and skip the sniff.
  if (headerMime === undefined) {
    return { kind: "html", mimeType: DEFAULT_MIME }
  }

  if (GENERIC_MIME_TYPES.has(headerMime)) {
    // Generic label: let the magic number decide, looking only at the leading window.
    const sniffWindow = bytes.subarray(0, SNIFF_BYTES)
    const detected = await fileTypeFromBuffer(sniffWindow)

    if (detected === undefined) {
      // No recognisable signature: treat as HTML but report the label the server sent.
      return { kind: "html", mimeType: headerMime }
    }

    return resolveKind(detected.mime, url)
  }

  // Specific declared type: trust the header.
  return resolveKind(headerMime, url)
}

/**
 * Pair a MIME type with its supported page kind, rejecting types that `mimeToKind` does
 * not recognise.
 *
 * @param mimeType Normalised MIME type to resolve.
 * @param url Target URL carried on the resulting error for diagnostics.
 * @returns The kind-and-MIME pair.
 * @throws {UnsupportedContentTypeError} When the MIME type maps to no supported kind.
 */
function resolveKind(mimeType: string, url: string): PageClassification {
  const resolved = mimeToKind(mimeType)

  if (resolved !== undefined) {
    return { kind: resolved, mimeType }
  }

  throw new UnsupportedContentTypeError(mimeType, url)
}

/**
 * Translate a MIME type into the page kind that handles it. Exact matches are checked
 * first, then the `+json` structured suffix, and finally the generic `text/` family, so a
 * type such as `text/foo+json` is classified as JSON rather than plain text. Matching is
 * case-sensitive.
 *
 * @param mimeType Lower-cased MIME type.
 * @returns The matching kind, or `undefined` when the type is not supported.
 */
function mimeToKind(mimeType: string): PageKind | undefined {
  switch (mimeType) {
    case "text/html":
    case "application/xhtml+xml":
      return "html"
    case "application/pdf":
      return "pdf"
    case "application/json":
    case "text/json":
      return "json"
    default:
      break
  }

  // Structured-suffix JSON types (e.g. `application/ld+json`) take precedence over `text/*`.
  if (mimeType.endsWith("+json")) {
    return "json"
  }

  return mimeType.startsWith("text/") ? "text" : undefined
}

web-tools