Project Files

.claude

settings.local.json

src

cache

cached-search-results.ts

index.ts

search-cache-key.ts

ttl-cache.ts

config

auto-sentinel.ts

config-schematics.ts

resolve-config.ts

duckduckgo

build-urls.ts

fetch-vqd-token.ts

index.ts

safe-search.ts

search-images.ts

search-web.ts

vqd-token-error.ts

errors

index.ts

no-results-error.ts

tool-error.ts

unsupported-content-type-error.ts

index.ts

lmstudio-home.ts

markdown-path.ts

url-filename.ts

http

decode-bytes.ts

fetch-error.ts

fetch-ok.ts

follow-redirects.ts

impit-client.ts

index.ts

parse-content-type.ts

read-limited-body.ts

retry.ts

url-guard.ts

url-schema.ts

images

download-image.ts

download-images.ts

index.ts

parsers

page

page-images.ts

page-text.ts

pdf-text.ts

image-results-parser.ts

index.ts

search-results-parser.ts

vqd-parser.ts

text

escape-markdown.ts

html-to-markdown.ts

html-to-text.ts

index.ts

normalize-blank-lines.ts

normalize-text.ts

timing

index.ts

rate-limiter.ts

sleep.ts

tools

image-search-tool.ts

view-images-tool.ts

visit-website-tool.ts

web-search-tool.ts

website

fetch-website.ts

fetched-page.ts

index.ts

page-kind.ts

render-visit-result.ts

index.ts

tools-provider.ts

.gitignore

.prettierrc.json

CLAUDE.md

eslint.config.mjs

knip.json

LICENSE

manifest.json

package-lock.json

package.json

QWEN.md

README.md

thumbnail.png

tsconfig.json

src / text / html-to-text.ts

/**
 * HTML to plain-text conversion that preserves block-level line breaks but drops
 * all markdown syntax. Used by the Visit Website tool when the caller opts out of markdown.
 */

import { convert } from "html-to-text"

import { normalizeBlankLines } from "./normalize-blank-lines"

import type { HtmlToTextOptions } from "html-to-text"

/**
 * Shared `html-to-text` configuration aimed at compact, token-efficient output.
 *
 * - `wordwrap: false` turns off word wrapping, so a paragraph stays on one line.
 * - `<a>` is rendered with `ignoreHref`, so only the link text is kept.
 * - `<img>`, `<noscript>` and `<template>` use the `skip` format and are left out of the output.
 * - `<h1>`–`<h6>` keep their original letter case (`uppercase: false`).
 * - `<ul>` items are prefixed with `"- "`.
 * - `<table>` header cells keep their original letter case (`uppercaseHeaderCells: false`).
 *
 * NOTE(review): `uppercaseHeaderCells` is documented by `html-to-text` as an option of the
 * `dataTable` formatter — confirm it has an effect with the default `table` format.
 */
const CONVERT_OPTIONS: HtmlToTextOptions = {
  wordwrap: false,
  selectors: [
    // Links: keep the anchor text, drop the URL.
    { selector: "a", options: { ignoreHref: true } },
    // Elements whose subtrees are dropped from the output.
    ...["img", "noscript", "template"].map((tag) => ({ selector: tag, format: "skip" })),
    // Headings at every level keep the author's capitalization.
    ...["h1", "h2", "h3", "h4", "h5", "h6"].map((tag) => ({
      selector: tag,
      options: { uppercase: false },
    })),
    { selector: "ul", options: { itemPrefix: "- " } },
    { selector: "table", options: { uppercaseHeaderCells: false } },
  ],
}

/**
 * Render an HTML fragment as plain text.
 *
 * The fragment is first converted by `html-to-text`'s `convert` using the module-level
 * `CONVERT_OPTIONS`, and the converted text is then passed through `normalizeBlankLines`
 * before being returned.
 *
 * @param html HTML fragment to convert.
 * @returns The converted plain text after blank-line normalization.
 */
export function htmlToText(html: string): string {
  const convertedText = convert(html, CONVERT_OPTIONS)

  return normalizeBlankLines(convertedText)
}

web-tools