// Project file: src/page/render-visit-result.ts
/**
* Per-kind rendering that builds the page-visit result by narrowing on the fetched page's
* kind. HTML runs a single jsdom parse that yields both headings and the format-aware excerpt;
* every non-HTML kind (PDF, plain text, JSON) feeds its pre-extracted text straight through the
* text excerpt pipeline.
*/
import { buildTextExcerpt, extractHtmlPage } from "../parsers"
import type { FetchedPage } from "./fetched-page"
import type { ContentFormat } from "../config/resolve-config"
/**
* Structured result of visiting a page, as produced by `renderVisitResult`.
*
* `url`, `kind` and `mimeType` are always present. Every other field is optional and is left off
* the object entirely (rather than being set to `""` or `0`) when it has no value, which keeps
* the payload compact.
*/
export interface VisitWebsiteResult {
/** URL that was visited, echoed back as given by the caller. */
url: string
/** Classified page kind, copied from the fetched page. */
kind: FetchedPage["kind"]
/** Effective MIME type reported by the server or sniffed from the payload. */
mimeType: string
/** Page title when available (HTML `<title>` or PDF metadata `Title`); omitted when empty. */
title?: string
/** First `<h1>` of an HTML page; omitted when empty and never set for non-HTML kinds. */
h1?: string
/** First `<h2>` of an HTML page; omitted when empty and never set for non-HTML kinds. */
h2?: string
/** Excerpt of the page content, truncated to the configured character budget; omitted when empty. */
content?: string
/** Character count of the full extracted content before truncation or windowing; omitted when zero. */
contentLength?: number
}
/**
* Excerpt settings that `renderVisitResult` forwards to whichever excerpt builder handles the
* page: the HTML extractor or the pre-extracted-text excerpt builder.
*/
export interface ExcerptInputs {
/** Character budget for the returned excerpt; passed to both excerpt paths. */
contentLimit: number
/** Optional search terms biasing content selection; passed to both excerpt paths, may be `undefined`. */
findInPage: string[] | undefined
/** Output format applied to HTML content; the pre-extracted-text path never receives it. */
contentFormat: ContentFormat
}
/**
 * Build the user-facing visit result for a fetched page.
 *
 * The page's `kind` decides which excerpt builder runs:
 * - `"html"` pages go through `extractHtmlPage`, which supplies both the headings
 *   (`title`, `h1`, `h2`) and the content excerpt;
 * - every other kind passes its pre-extracted `text` to `buildTextExcerpt` and reuses the
 *   page's own `title`; no headings are reported for these kinds.
 *
 * In both cases the collected fields are handed to `assembleResult`, which strips empty values.
 *
 * @param url - URL that was visited; echoed into the result and passed to the HTML extractor.
 * @param page - Fetched and classified page payload.
 * @param inputs - Excerpt settings shared by both paths (`contentFormat` is used for HTML only).
 * @returns The visit result with content and, for HTML pages, headings populated.
 */
export function renderVisitResult(url: string, page: FetchedPage, inputs: ExcerptInputs): VisitWebsiteResult {
  const { contentLimit, findInPage, contentFormat } = inputs

  // Non-HTML kinds already carry extracted text, so only the text excerpt step is needed.
  if (page.kind !== "html") {
    const textExcerpt = buildTextExcerpt(page.text, contentLimit, findInPage)
    return assembleResult(url, page.kind, page.mimeType, {
      title: page.title,
      content: textExcerpt.content,
      contentLength: textExcerpt.totalLength,
    })
  }

  // HTML: one extraction call yields the headings and the excerpt together.
  const extracted = extractHtmlPage(page.html, url, contentLimit, findInPage, contentFormat)
  const { title, h1, h2 } = extracted.headings
  return assembleResult(url, page.kind, page.mimeType, {
    title,
    h1,
    h2,
    content: extracted.excerpt.content,
    contentLength: extracted.excerpt.totalLength,
  })
}
/**
* Optional fields collected for a Visit Website response before empty values are stripped.
* Each field may be absent, `undefined`, or empty; `assembleResult` decides which ones reach
* the final result.
*/
interface ResultCandidates {
/** Candidate title; dropped when `undefined` or an empty string. */
title?: string
/** Candidate first-level heading; dropped when `undefined` or an empty string. */
h1?: string
/** Candidate second-level heading; dropped when `undefined` or an empty string. */
h2?: string
/** Candidate content excerpt; dropped when `undefined` or an empty string. */
content?: string
/** Candidate content length; dropped when `undefined` or not greater than zero. */
contentLength?: number
}
/**
 * Combine the always-present identity fields with whichever optional candidates carry a value.
 *
 * String candidates (`title`, `h1`, `h2`, `content`) are copied only when they are defined and
 * non-empty; `contentLength` is copied only when it is defined and greater than zero. Fields
 * that fail these checks are left off the result object entirely.
 *
 * @param url - URL that was visited.
 * @param kind - Classified page kind.
 * @param mimeType - Effective MIME type.
 * @param candidates - Optional fields whose empty values should be elided.
 * @returns A new result object containing the identity fields plus the surviving candidates.
 */
function assembleResult(
  url: string,
  kind: FetchedPage["kind"],
  mimeType: string,
  candidates: ResultCandidates
): VisitWebsiteResult {
  const { title, h1, h2, content, contentLength } = candidates
  const hasText = (text: string | undefined): text is string => text !== undefined && text.length > 0

  const assembled: VisitWebsiteResult = { url, kind, mimeType }

  // Assignment order fixes the key order of the resulting object: title, h1, h2, content, contentLength.
  if (hasText(title)) assembled.title = title
  if (hasText(h1)) assembled.h1 = h1
  if (hasText(h2)) assembled.h2 = h2
  if (hasText(content)) assembled.content = content
  if (contentLength !== undefined && contentLength > 0) assembled.contentLength = contentLength

  return assembled
}