Project Files

.claude

settings.local.json

src

cache

cached-search-results.ts

index.ts

search-cache-key.ts

ttl-cache.ts

config

auto-sentinel.ts

config-schematics.ts

resolve-config.ts

duckduckgo

build-urls.ts

fetch-vqd-token.ts

index.ts

safe-search.ts

search-images.ts

search-web.ts

vqd-token-error.ts

errors

index.ts

no-results-error.ts

tool-error.ts

unsupported-content-type-error.ts

index.ts

lmstudio-home.ts

markdown-path.ts

url-filename.ts

http

decode-bytes.ts

fetch-error.ts

fetch-ok.ts

follow-redirects.ts

impit-client.ts

index.ts

parse-content-type.ts

read-limited-body.ts

retry.ts

url-guard.ts

url-schema.ts

images

download-image.ts

download-images.ts

index.ts

parsers

page

page-images.ts

page-text.ts

pdf-text.ts

image-results-parser.ts

index.ts

search-results-parser.ts

vqd-parser.ts

text

escape-markdown.ts

html-to-markdown.ts

html-to-text.ts

index.ts

normalize-blank-lines.ts

normalize-text.ts

timing

index.ts

rate-limiter.ts

sleep.ts

tools

image-search-tool.ts

view-images-tool.ts

visit-website-tool.ts

web-search-tool.ts

website

fetch-website.ts

fetched-page.ts

index.ts

page-kind.ts

render-visit-result.ts

index.ts

tools-provider.ts

.gitignore

.prettierrc.json

CLAUDE.md

eslint.config.mjs

knip.json

LICENSE

manifest.json

package-lock.json

package.json

QWEN.md

README.md

thumbnail.png

tsconfig.json

src / website / fetched-page.ts

/**
 * Shape of a fetched page after classification and per-kind decoding/extraction. Cached
 * and consumed by the Visit Website tool (which narrows on `kind` to select between the
 * HTML-readability pipeline and the pre-extracted-text pipeline) and by the View Images
 * tool (which accepts only `kind === "html"`).
 *
 * Modelled as a discriminated union so the HTML variant carries no stray `title` field:
 * HTML titles are derived from the document at render time, not captured at fetch time.
 */

import type { PageKind } from "./page-kind"

/**
 * Fetched page whose payload is HTML. Holds the unparsed markup so that Readability and
 * heading extraction can run downstream; there is deliberately no `title` field because
 * headings are parsed from the document rather than captured at fetch time.
 */
export interface HtmlFetchedPage {
  /** Union tag: always `"html"` for this variant (HTML or XHTML payloads). */
  kind: "html"
  /** Effective MIME type the payload was classified under. */
  mimeType: string
  /** Unparsed HTML body, still awaiting Mozilla Readability. */
  html: string
}

/**
 * Fetched page of any kind other than HTML. The payload has already been reduced to its
 * final text form (PDF body, raw text, or raw JSON), and any metadata title captured at
 * fetch time travels alongside it.
 */
export interface NonHtmlFetchedPage {
  /** Union tag: every `PageKind` member except `"html"`. */
  kind: Exclude<PageKind, "html">
  /** Effective MIME type the payload was classified under. */
  mimeType: string
  /**
   * Document-level title when the source format exposes one (for example PDF metadata).
   * NOTE(review): the field is required, so formats without a title presumably supply an
   * empty string — confirm against the fetcher.
   */
  title: string
  /** Already-extracted text, ready for the text excerpt pipeline. */
  text: string
}

/**
 * Any fetched page: either the HTML variant or the non-HTML variant. `kind` is the
 * discriminant, so checking `page.kind` narrows a value to the matching shape.
 */
export type FetchedPage =
  | HtmlFetchedPage
  | NonHtmlFetchedPage

web-tools