// Project file: src/docxRead.ts
import * as fs from "node:fs/promises";
import mammoth from "mammoth";
import JSZip from "jszip";
// Mammoth ships convertToMarkdown at runtime but doesn't declare it in its
// .d.ts. We narrow the binding here so the rest of the file is fully typed.
/** One diagnostic entry from a mammoth conversion (element type of `MammothResult.messages`). */
interface MammothMessage {
// Severity tag. `readDocx` keeps only "warning" and "error" entries. Note that the
// trailing `| string` makes the compiler treat this union as plain `string`.
type: "warning" | "error" | string;
// Text of the diagnostic; surfaced verbatim in `ReadDocxResult.warnings`.
message: string;
}
/** Shape this file expects `convertToMarkdown` to resolve with. */
interface MammothResult {
// Converted document text; `readDocx` passes it through `postProcess`.
value: string;
// Diagnostics reported alongside the conversion output.
messages: MammothMessage[];
}
/** Call signature assumed for mammoth's `convertToMarkdown`, which its typings do not declare. */
type ConvertToMarkdownFn = (
  input: { buffer: Buffer },
  options?: Record<string, unknown>,
) => Promise<MammothResult>;

// Pull the function off the default export through a narrowed view of the module.
const { convertToMarkdown } = mammoth as unknown as {
  convertToMarkdown: ConvertToMarkdownFn;
};
/**
 * Document properties read from `docProps/core.xml` inside the .docx archive.
 * Every field is optional: a field is left undefined when its element is
 * missing or contains only whitespace.
 */
export interface DocxMetadata {
// Content of the `dc:title` element.
title?: string;
// Content of the `dc:creator` element.
author?: string;
// Content of the `dcterms:modified` element, kept as the raw string from the XML.
modified?: string;
}
/** Value resolved by `readDocx`. */
export interface ReadDocxResult {
// Post-processed Markdown; starts with a YAML front matter block when metadata
// was requested and at least one metadata field is present.
markdown: string;
// Messages of mammoth diagnostics typed "warning" or "error", plus a
// "metadata extraction failed: …" entry if reading the core properties threw.
warnings: string[];
// Set to `markdown.length`, i.e. the size of the returned Markdown (front
// matter included) in UTF-16 code units.
// NOTE(review): the name suggests the size of the source document — confirm
// which one consumers expect.
source_chars: number;
// null when metadata was not requested or its extraction failed; may be an
// empty object when the archive has no `docProps/core.xml`.
metadata: DocxMetadata | null;
}
/** Switches accepted by `readDocx`. */
export interface ReadDocxOptions {
// When true, `EXTRA_STYLE_MAP` is passed to mammoth as `styleMap`; when false
// an empty style map is passed instead.
preserveStyles: boolean;
// When true, core properties are read from the archive, returned as
// `metadata`, and rendered as YAML front matter if any field is set.
includeMetadata: boolean;
}
// Mammoth's default style map covers headings/bold/italic/lists/quotes/links/
// tables. We extend it slightly so that paragraphs using common Word quote
// styles — French "Citation", "Quote" and "Intense Quote" — become
// blockquotes. Only passed to mammoth when `preserveStyles` is enabled.
const EXTRA_STYLE_MAP = [
"p[style-name='Citation'] => blockquote:fresh",
"p[style-name='Quote'] => blockquote:fresh",
"p[style-name='Intense Quote'] => blockquote:fresh",
];
/**
 * Reads a .docx file and converts it to Markdown.
 *
 * @param absPath - Path of the .docx file to read.
 * @param opts - Controls the extra style map and metadata extraction.
 * @returns The cleaned Markdown (with YAML front matter when metadata is
 *   present), collected warnings, the Markdown length and the metadata.
 * @throws Whatever `fs.readFile` or the mammoth conversion rejects with;
 *   only metadata extraction failures are downgraded to warnings.
 */
export async function readDocx(
  absPath: string,
  opts: ReadDocxOptions,
): Promise<ReadDocxResult> {
  const buffer = await fs.readFile(absPath);

  const styleMap = opts.preserveStyles ? EXTRA_STYLE_MAP : [];
  // i1 — drop images silently. i3 will extract them to disk.
  const images = mammoth.images as unknown as {
    imgElement: (fn: (i: unknown) => Promise<{ src: string }>) => unknown;
  };
  const convertImage = images.imgElement(() => Promise.resolve({ src: "" }));

  const { value, messages } = await convertToMarkdown(
    { buffer },
    { styleMap, convertImage },
  );
  const body = postProcess(value);

  // Keep only the diagnostics a caller can act on.
  const warnings: string[] = [];
  for (const { type, message } of messages) {
    if (type === "warning" || type === "error") {
      warnings.push(message);
    }
  }

  let metadata: DocxMetadata | null = null;
  if (opts.includeMetadata) {
    try {
      metadata = await readCoreProperties(buffer);
    } catch (e) {
      // Best effort: a broken properties part must not fail the conversion.
      const reason = e instanceof Error ? e.message : String(e);
      warnings.push(`metadata extraction failed: ${reason}`);
    }
  }

  const markdown =
    metadata && hasAnyMetadata(metadata)
      ? renderFrontmatter(metadata) + "\n" + body
      : body;

  return { markdown, warnings, source_chars: markdown.length, metadata };
}
/**
 * Cleans up mammoth's raw Markdown output.
 *
 * Removes empty bookmark anchors, rewrites `__` to `**`, strips backslash
 * escapes in front of a fixed punctuation set, trims trailing whitespace on
 * every line, collapses runs of blank lines to one, and guarantees exactly one
 * trailing newline.
 *
 * @param md - Markdown text as produced by the converter.
 * @returns The normalised Markdown, always ending in a single "\n".
 */
function postProcess(md: string): string {
  return (
    md
      // Mammoth sometimes emits stray <a id="..."></a> bookmark anchors. Drop them.
      .replace(/<a id="[^"]*"><\/a>/g, "")
      // Mammoth emits bold runs as `__x__`, which is valid CommonMark but many
      // LLMs misread it as HTML <u>. Normalise to `**x**` so the round-trip
      // looks unambiguous to downstream consumers.
      .replace(/__/g, "**")
      // Mammoth aggressively backslash-escapes any character that could be
      // markdown-significant — even when the context wouldn't trigger any
      // markdown structure. Strip those escapes for the "safe" punctuation
      // set: characters whose unescape never creates an inline structure
      // (no `*`, `_`, `` ` ``, `[`, `]`, `\` — these stay protected because
      // dropping their escape would alter the document's meaning).
      // NOTE(review): at the start of a line, unescaping `#`, `-`, `+`, `>` or
      // `1.` can still create block structure (heading, list, quote) — confirm
      // that this is acceptable for consumers.
      .replace(/\\([.\-+!#():;,?<>=])/g, "$1")
      // Trim trailing whitespace on each line. This must run before the
      // blank-line collapse: a line holding only spaces/tabs is not yet
      // "blank" for the next step and would otherwise survive as an extra
      // empty line.
      .replace(/[ \t]+\n/g, "\n")
      // Collapse runs of 3+ newlines (two or more blank lines) to a single
      // blank line.
      .replace(/\n{3,}/g, "\n\n")
      .trim() + "\n"
  );
}
/**
 * Extracts title, author and modification date from the archive's
 * `docProps/core.xml` part.
 *
 * @param buffer - Raw bytes of the .docx (zip) file.
 * @returns The extracted fields, or an empty object when the archive has no
 *   core properties part.
 */
async function readCoreProperties(buffer: Buffer): Promise<DocxMetadata> {
  const archive = await JSZip.loadAsync(buffer);
  const corePart = archive.file("docProps/core.xml");
  if (!corePart) {
    return {};
  }

  const coreXml = await corePart.async("string");
  const title = extractTag(coreXml, "dc:title");
  const author = extractTag(coreXml, "dc:creator");
  const modified = extractTag(coreXml, "dcterms:modified");
  return { title, author, modified };
}
// The five entities predefined by XML. A Map (rather than an object literal)
// keeps names such as "constructor" from resolving to inherited properties.
const XML_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

// Decodes predefined entities and numeric character references in one pass, so
// that already-decoded output is never decoded a second time ("&amp;lt;"
// becomes "&lt;", not "<"). Unknown or out-of-range references are left as-is.
function decodeXmlEntities(text: string): string {
  return text.replace(
    /&(#x[0-9a-fA-F]+|#[0-9]+|[A-Za-z]+);/g,
    (whole: string, body: string) => {
      if (body.startsWith("#")) {
        const code = body.startsWith("#x")
          ? parseInt(body.slice(2), 16)
          : parseInt(body.slice(1), 10);
        return code <= 0x10ffff ? String.fromCodePoint(code) : whole;
      }
      return XML_NAMED_ENTITIES.get(body) ?? whole;
    },
  );
}

// Tiny tag extractor — core.xml is small and well-formed; a full XML parser
// is overkill for three optional fields.
/**
 * Returns the text content of the first `<tag>…</tag>` element in `xml`.
 *
 * @param xml - XML document text to search.
 * @param tag - Qualified element name, e.g. "dc:title".
 * @returns The entity-decoded, trimmed content, or undefined when the element
 *   is absent or empty.
 */
function extractTag(xml: string, tag: string): string | undefined {
  const escaped = tag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // The name must be followed by whitespace (attributes) or `>`, so a longer
  // element name sharing the same prefix is not mistaken for this one.
  const re = new RegExp(
    `<${escaped}(?:\\s[^>]*)?>([\\s\\S]*?)</${escaped}\\s*>`,
  );
  const raw = re.exec(xml)?.[1];
  if (raw === undefined) return undefined;
  // Element content is XML-escaped ("R&amp;D"); decode it before handing the
  // value on as plain text.
  const v = decodeXmlEntities(raw).trim();
  return v.length ? v : undefined;
}
/** Tells whether at least one metadata field holds a non-empty value. */
function hasAnyMetadata(m: DocxMetadata): boolean {
  const { title, author, modified } = m;
  return [title, author, modified].some((field) => Boolean(field));
}
/**
 * Renders the metadata as a YAML front matter block.
 *
 * @param m - Metadata to render; empty or missing fields are skipped.
 * @returns The block from its opening `---` to its closing `---`, without a
 *   trailing newline.
 */
function renderFrontmatter(m: DocxMetadata): string {
  const fields: Array<[string, string | undefined]> = [
    ["title", m.title],
    ["author", m.author],
    ["modified", m.modified],
  ];

  let block = "---\n";
  for (const [key, value] of fields) {
    if (value) {
      block += `${key}: ${yamlScalar(value)}\n`;
    }
  }
  return block + "---";
}
// Words a YAML parser resolves to a boolean or null instead of a string.
const YAML_NON_STRING_WORD = /^(?:true|false|yes|no|on|off|null|~)$/i;
// Plain scalars a YAML parser resolves to a number.
const YAML_NUMBER_LIKE =
  /^[+-]?(?:0x[0-9a-f]+|0o[0-7]+|\d[\d_]*(?:\.\d*)?(?:e[+-]?\d+)?|\.\d+(?:e[+-]?\d+)?|\.inf)$|^\.nan$/i;
// Line breaks and control characters; these cannot appear raw in a one-line scalar.
const YAML_CONTROL_CHAR = /[\u0000-\u001f\u007f\u0085\u2028\u2029]/;

/**
 * Formats a string as a single-line YAML scalar for a `key: value` entry.
 *
 * The value is returned unchanged when it is safe as a plain scalar;
 * otherwise it is double-quoted with backslashes, quotes, line breaks and
 * control characters escaped, so it always stays on one line and always
 * parses back as a string.
 *
 * @param s - Raw value to emit.
 * @returns Text to place after `key: `.
 */
function yamlScalar(s: string): string {
  const needsQuoting =
    // An empty plain scalar would parse as null.
    s.length === 0 ||
    // YAML indicator characters, plus leading/trailing whitespace.
    /[:#\-?&*!|>'"%@`[\]{}]|^\s|\s$/.test(s) ||
    // An embedded line break would end the entry and corrupt the block.
    YAML_CONTROL_CHAR.test(s) ||
    // Values like `true` or `2024` must not change type on parse.
    YAML_NON_STRING_WORD.test(s) ||
    YAML_NUMBER_LIKE.test(s);
  if (!needsQuoting) return s;

  const escaped = s.replace(
    /[\\"\u0000-\u001f\u007f\u0085\u2028\u2029]/g,
    (ch) => {
      switch (ch) {
        case "\\":
          return "\\\\";
        case '"':
          return '\\"';
        case "\n":
          return "\\n";
        case "\r":
          return "\\r";
        case "\t":
          return "\\t";
        default:
          return `\\u${ch.charCodeAt(0).toString(16).padStart(4, "0")}`;
      }
    },
  );
  return `"${escaped}"`;
}