Project Files

.claude

settings.local.json

src

cache

cached-search-results.ts

index.ts

search-cache-key.ts

ttl-cache.ts

config

auto-sentinel.ts

config-schematics.ts

resolve-config.ts

duckduckgo

build-urls.ts

fetch-vqd-token.ts

index.ts

safe-search.ts

search-images.ts

search-web.ts

vqd-token-error.ts

errors

index.ts

no-results-error.ts

tool-error.ts

unsupported-content-type-error.ts

index.ts

lmstudio-home.ts

markdown-path.ts

url-filename.ts

http

decode-bytes.ts

fetch-error.ts

fetch-ok.ts

follow-redirects.ts

impit-client.ts

index.ts

parse-content-type.ts

read-limited-body.ts

retry.ts

url-guard.ts

url-schema.ts

images

download-image.ts

download-images.ts

index.ts

parsers

page

page-images.ts

page-text.ts

pdf-text.ts

image-results-parser.ts

index.ts

search-results-parser.ts

vqd-parser.ts

text

escape-markdown.ts

html-to-markdown.ts

html-to-text.ts

index.ts

normalize-blank-lines.ts

normalize-text.ts

timing

index.ts

rate-limiter.ts

sleep.ts

tools

image-search-tool.ts

view-images-tool.ts

visit-website-tool.ts

web-search-tool.ts

website

fetch-website.ts

fetched-page.ts

index.ts

page-kind.ts

render-visit-result.ts

index.ts

tools-provider.ts

.gitignore

.prettierrc.json

CLAUDE.md

eslint.config.mjs

knip.json

LICENSE

manifest.json

package-lock.json

package.json

QWEN.md

README.md

thumbnail.png

tsconfig.json

src / parsers / page / pdf-text.ts

/**
 * Text extraction for PDF payloads via `@opendocsg/pdf2md`. The library wraps `unpdf` (and
 * through it `pdfjs-dist`) with a Markdown converter that does font-size-based heading
 * detection and paragraph grouping, producing structured output that aligns with the Visit
 * Website tool's HTML → turndown pipeline and survives the downstream excerpt windowing.
 *
 * The document `Title` from the PDF metadata dictionary is pulled via `unpdf.getMeta` in a
 * parallel pass — pdf2md's `metadataParsed` callback could supply the same data, but
 * wiring a closure-captured mutable reference through the ESLint ruleset adds more noise
 * than the second cheap metadata pass it would save.
 */

import pdf2md from "@opendocsg/pdf2md"
import { getMeta } from "unpdf"

import { normalizeBlankLines } from "../../text/normalize-blank-lines"

/**
 * Outcome of a PDF extraction: the document's metadata title alongside its Markdown body.
 */
export interface PdfContent {
  /** Title taken from the document metadata; an empty string when it is absent or blank. */
  title: string
  /** Markdown extracted from all pages, concatenated in page order. */
  text: string
}

/**
 * Extract readable Markdown and the document title from a PDF payload.
 *
 * The Markdown conversion (`pdf2md`) and the metadata lookup (`getMeta`) run concurrently.
 * Each one receives its own `Uint8Array` copy of the input: pdf.js documents that typed
 * arrays passed as `data` are generally transferred to its worker, which takes ownership
 * of the array, so a single shared array could be detached by one parse while the other
 * still needs it. Copying also leaves the caller's `Buffer` untouched.
 *
 * @param bytes Raw PDF bytes.
 * @returns The extracted Markdown body (passed through `normalizeBlankLines`) and the
 *   trimmed metadata title, or an empty title when the metadata has no string `Title`.
 * @throws {Error} When `pdf2md` or `unpdf` cannot parse the payload.
 */
export async function extractPdfContent(bytes: Buffer): Promise<PdfContent> {
  // Independent copies: neither parser can invalidate the bytes the other one reads.
  const [markdown, meta] = await Promise.all([
    pdf2md(new Uint8Array(bytes)),
    getMeta(new Uint8Array(bytes)),
  ])
  // Treat the metadata dictionary's values as `unknown` so `Title` must be narrowed
  // before it is used as a string.
  const info = meta.info as Record<string, unknown>
  const rawTitle = info.Title

  return {
    text: normalizeBlankLines(markdown),
    title: typeof rawTitle === "string" ? rawTitle.trim() : "",
  }
}

web-tools