// Project Files
// src/utils/document.ts
import JSZip from "jszip";
import * as fs from "fs";
import { XMLParser } from "fast-xml-parser";
// ─── Types ────────────────────────────────────────────────────────────────────
/**
 * One formatted run of text extracted from a paragraph's `w:r` element.
 * Formatting flags are either `true` or absent; they are never `false`.
 */
export interface DocRun {
/** Concatenated text of the run's `w:t` children. */
text: string;
/** Present (true) when the run's `w:b` property is on. */
bold?: boolean;
/** Present (true) when the run's `w:i` property is on. */
italic?: boolean;
/** Present (true) when the run's `w:u` property is on. */
underline?: boolean;
/** Value of `w:color/@w:val`; omitted when missing or "auto". */
color?: string;
}
/** A body-level paragraph as returned by `DocxDocument.getParagraphs()`. */
export interface DocParagraph {
/** Zero-based position within the body's `w:p` children. */
index: number;
/** Text of all runs joined without a separator. */
text: string;
/** `w:pStyle/@w:val`, or "Normal" when the paragraph has no style. */
style: string;
/** The individual runs that make up `text`. */
runs: DocRun[];
/** True when the paragraph carries a `w:numPr` element. */
isListItem?: boolean;
/** `w:ilvl/@w:val` for list items (0 when absent); undefined otherwise. */
listLevel?: number;
}
/** A body-level table as returned by `DocxDocument.getTables()`. */
export interface DocTable {
/** Zero-based position within the body's `w:tbl` children. */
index: number;
/** Cell text, indexed as rows[rowIndex][cellIndex]. */
rows: string[][];
}
/** A review comment read from `word/comments.xml`. */
export interface DocComment {
/** `w:id` attribute of the comment ("" when missing). */
id: string;
/** `w:author` attribute of the comment ("" when missing). */
author: string;
/** `w:date` attribute of the comment, as written in the file ("" when missing). */
date: string;
/** Text content of the comment's paragraphs. */
text: string;
}
/** An embedded image discovered through the document's relationship part. */
export interface DocImage {
/** Relationship Id that references the image. */
id: string;
/** Last path segment of the relationship target. */
name: string;
/** Lower-cased file extension taken from the target. */
type: string;
/** Size of the stored image in KiB, rounded to one decimal place. */
sizeKB: number;
}
/**
 * Declarative description of one piece of content consumed by `blocksToXml`.
 * Which optional fields are read depends on `type`.
 */
export interface ContentBlock {
/** Discriminator selecting how the block is rendered. */
type: "heading" | "paragraph" | "list" | "table" | "pagebreak";
/** Heading level used to build the `HeadingN` style id; defaults to 1. */
level?: number;
/** Text of a heading or paragraph; treated as "" when omitted. */
text?: string;
/** Paragraphs: bold when true. Headings: bold unless explicitly false. */
bold?: boolean;
/** Paragraphs: italic when true. */
italic?: boolean;
/** Paragraph alignment; "justify" is written as the OOXML value "both". */
align?: "left" | "center" | "right" | "justify";
/** List entries, one output paragraph per item. */
items?: string[];
/** List kind; when falsy each item is prefixed with a bullet character. */
ordered?: boolean;
/** Table header cells (rendered as the shaded first row). */
headers?: string[];
/** Table data rows, each an array of cell strings. */
rows?: string[][];
}
// ─── XML helpers ──────────────────────────────────────────────────────────────
/**
 * Escape the five XML-reserved characters so `s` can be embedded safely in
 * element text or in a quoted attribute value.
 *
 * `&` is replaced first; otherwise the ampersands introduced by the other
 * entities would themselves be escaped a second time.
 *
 * @param s Raw, unescaped text.
 * @returns `s` with `&`, `<`, `>`, `"` and `'` replaced by the predefined
 *          XML entities.
 */
export function escapeXml(s: string): string {
  return s
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}
/**
 * Remove every `<w:sectPr>` element (self-closing or with content) from `xml`.
 * MUST be called on extracted body content before injection into another
 * document, so the target keeps its own section properties.
 *
 * The scan is depth-aware: a `<w:sectPrChange>` (tracked section formatting)
 * holds a nested `<w:sectPr>`, and stopping at the first `</w:sectPr>` would
 * leave dangling close tags behind. Only tags named exactly `w:sectPr` are
 * matched; `w:sectPrChange` on its own is never treated as a section element.
 *
 * Unbalanced markup is left untouched rather than guessed at: a stray
 * `</w:sectPr>` or an unclosed `<w:sectPr>` stays in the output.
 *
 * @param xml WordprocessingML fragment.
 * @returns The fragment without any `<w:sectPr>` elements.
 */
export function stripSectPr(xml: string): string {
  // Group 1: "/" for a close tag. Group 2: "/" for a self-closing tag.
  const tagRe = /<(\/?)w:sectPr(?=[\s\/>])[^>]*?(\/?)>/g;
  let out = "";
  let cursor = 0; // start of the text not yet copied to `out`
  let depth = 0; // nesting level of currently open <w:sectPr> elements
  let openStart = -1; // index of the outermost open tag

  for (const tag of xml.matchAll(tagRe)) {
    const at = tag.index ?? 0;
    const after = at + tag[0].length;

    if (tag[1] === "/") {
      if (depth === 0) continue; // stray close tag: keep as-is
      depth--;
      if (depth === 0) {
        out += xml.slice(cursor, openStart);
        cursor = after;
      }
    } else if (tag[2] === "/") {
      // Self-closing; when nested it disappears together with its ancestor.
      if (depth === 0) {
        out += xml.slice(cursor, at);
        cursor = after;
      }
    } else {
      if (depth === 0) openStart = at;
      depth++;
    }
  }
  return out + xml.slice(cursor);
}
/**
 * Return everything between the first `<w:body>` and the last `</w:body>`.
 *
 * Spanning first-open to last-close keeps the whole body even when the
 * content itself contains nested markup. Only the attribute-less tag
 * `<w:body>` is recognised; when it or a following close tag is missing the
 * result is the empty string.
 */
export function extractBodyContent(xml: string): string {
  const openTag = "<w:body>";
  const closeTag = "</w:body>";

  const openAt = xml.indexOf(openTag);
  if (openAt === -1) return "";

  const innerStart = openAt + openTag.length;
  const closeAt = xml.lastIndexOf(closeTag);
  // The close tag must start at or after the end of the open tag.
  return closeAt >= innerStart ? xml.slice(innerStart, closeAt) : "";
}
/** Half-open character range `[start, end)` of one paragraph element. */
export interface ParaSpan { start: number; end: number; }

/**
 * Locate the top-level `<w:p>` elements in body content.
 *
 * Paragraphs nested inside `<w:tbl>` or `<w:sdt>` are skipped so callers that
 * splice paragraphs by span never cut into table or content-control structure.
 *
 * Tag handling:
 * - Only tags named exactly `w:p`, `w:tbl` and `w:sdt` are considered;
 *   `w:pPr`, `w:tblPr`, `w:sdtContent` and similar are ignored.
 * - A self-closing paragraph (`<w:p/>` or `<w:p w:rsidR="…"/>`) is reported
 *   as its own span instead of being skipped or merged with the paragraph
 *   that follows it.
 * - Paragraphs nested inside a top-level paragraph (for example text-box
 *   content) are depth-counted, so the span ends at the matching `</w:p>`
 *   and not at the first one encountered.
 * - A top-level paragraph that is never closed produces no span.
 *
 * @param bodyContent Inner XML of `<w:body>`.
 * @returns Spans in document order; `bodyContent.slice(start, end)` is the
 *          complete paragraph element.
 */
export function findTopLevelParaSpans(bodyContent: string): ParaSpan[] {
  const spans: ParaSpan[] = [];
  // Group 1: "/" for a close tag. Group 2: element name. Group 3: "/" when self-closing.
  const tagRe = /<(\/?)w:(p|tbl|sdt)(?=[\s\/>])[^>]*?(\/?)>/g;
  let containerDepth = 0; // open <w:tbl>/<w:sdt> elements outside any paragraph
  let paraDepth = 0; // > 0 while inside a top-level paragraph
  let paraStart = -1;

  for (const tag of bodyContent.matchAll(tagRe)) {
    const at = tag.index ?? 0;
    const after = at + tag[0].length;
    const isClose = tag[1] === "/";
    const isSelfClosing = !isClose && tag[3] === "/";
    const isPara = tag[2] === "p";

    if (paraDepth > 0) {
      // Inside a top-level paragraph only paragraph nesting matters.
      if (!isPara || isSelfClosing) continue;
      if (!isClose) {
        paraDepth++;
        continue;
      }
      paraDepth--;
      if (paraDepth === 0) spans.push({ start: paraStart, end: after });
      continue;
    }

    if (!isPara) {
      if (isClose) {
        if (containerDepth > 0) containerDepth--;
      } else if (!isSelfClosing) {
        containerDepth++;
      }
      continue;
    }

    // Paragraph tag outside any top-level paragraph.
    if (containerDepth > 0 || isClose) continue;
    if (isSelfClosing) {
      spans.push({ start: at, end: after });
      continue;
    }
    paraStart = at;
    paraDepth = 1;
  }
  return spans;
}
/**
 * Render one "table" block as a `<w:tbl>` element.
 *
 * The column count is the widest of the header row and all data rows, and
 * shorter rows are padded with empty cells, so every `<w:tr>` has exactly as
 * many `<w:tc>` as the grid has columns. The header row is emitted only when
 * headers were supplied. A table with no cells at all renders as "" because a
 * `<w:tbl>` without rows or a `<w:tr>` without cells is not valid markup.
 */
function tableBlockToXml(block: ContentBlock): string {
  const headers = block.headers ?? [];
  const dataRows = block.rows ?? [];
  const colCount = dataRows.reduce(
    (widest, row) => Math.max(widest, row.length),
    headers.length
  );
  if (colCount === 0) return "";

  // 9360 twips = 6.5in, the text width of a Letter page with 1in margins.
  const colWidth = Math.floor(9360 / colCount);
  const borders = ["top", "left", "bottom", "right", "insideH", "insideV"]
    .map((side) => `<w:${side} w:val="single" w:sz="4" w:space="0" w:color="CCCCCC"/>`)
    .join("");
  const tblProps =
    `<w:tblPr><w:tblW w:w="9360" w:type="dxa"/>` +
    `<w:tblBorders>${borders}</w:tblBorders></w:tblPr>`;
  const tblGrid = `<w:gridCol w:w="${colWidth}"/>`.repeat(colCount);

  const makeRow = (cells: string[], isHeader: boolean): string => {
    const shd = isHeader ? `<w:shd w:val="clear" w:color="auto" w:fill="4472C4"/>` : "";
    const rPr = isHeader ? `<w:rPr><w:b/><w:color w:val="FFFFFF"/></w:rPr>` : "";
    const tcs = Array.from({ length: colCount }, (_, ci) => cells[ci] ?? "")
      .map(
        (text) =>
          `<w:tc><w:tcPr><w:tcW w:w="${colWidth}" w:type="dxa"/>${shd}` +
          `<w:tcMar><w:top w:w="80" w:type="dxa"/><w:bottom w:w="80" w:type="dxa"/>` +
          `<w:left w:w="120" w:type="dxa"/><w:right w:w="120" w:type="dxa"/></w:tcMar>` +
          `</w:tcPr>` +
          `<w:p><w:r>${rPr}<w:t xml:space="preserve">${escapeXml(text)}</w:t></w:r></w:p>` +
          `</w:tc>`
      )
      .join("");
    return `<w:tr>${tcs}</w:tr>`;
  };

  const rowsXml = [
    ...(headers.length > 0 ? [makeRow(headers, true)] : []),
    ...dataRows.map((row) => makeRow(row, false)),
  ].join("");
  return `<w:tbl>${tblProps}<w:tblGrid>${tblGrid}</w:tblGrid>${rowsXml}</w:tbl>`;
}

/**
 * Convert ContentBlock[] → raw OOXML XML.
 * No external document round-trips. Safe to inject into any document's <w:body>.
 *
 * Rendering rules:
 * - heading: `HeadingN` paragraph style, N clamped to the built-in range 1–9
 *   (default 1); bold unless `bold === false`.
 * - paragraph: optional alignment ("justify" is written as "both") and
 *   bold/italic run properties.
 * - list: one `ListParagraph` per item with a literal text prefix: a bullet
 *   character for unordered lists, "1. ", "2. ", … for ordered lists. Literal
 *   prefixes are used because no numbering definitions are generated.
 * - table: see `tableBlockToXml`.
 * - pagebreak: a paragraph containing a page break.
 *
 * All text is XML-escaped. Blocks of an unrecognised type are ignored.
 *
 * @param blocks Content to render, in output order.
 * @returns Concatenated block-level OOXML with no wrapper element.
 */
export function blocksToXml(blocks: ContentBlock[]): string {
  const parts: string[] = [];
  for (const block of blocks) {
    switch (block.type) {
      case "heading": {
        const requested = Math.trunc(block.level ?? 1);
        const lvl = Number.isFinite(requested) ? Math.min(9, Math.max(1, requested)) : 1;
        const rPr = block.bold !== false ? "<w:rPr><w:b/></w:rPr>" : "";
        parts.push(
          `<w:p><w:pPr><w:pStyle w:val="Heading${lvl}"/></w:pPr>` +
            `<w:r>${rPr}<w:t xml:space="preserve">${escapeXml(block.text ?? "")}</w:t></w:r></w:p>`
        );
        break;
      }
      case "paragraph": {
        const alignVal = block.align === "justify" ? "both" : (block.align ?? "");
        const pPr = alignVal ? `<w:pPr><w:jc w:val="${alignVal}"/></w:pPr>` : "";
        const rPrParts: string[] = [];
        if (block.bold) rPrParts.push("<w:b/>");
        if (block.italic) rPrParts.push("<w:i/>");
        const rPr = rPrParts.length ? `<w:rPr>${rPrParts.join("")}</w:rPr>` : "";
        parts.push(
          `<w:p>${pPr}` +
            `<w:r>${rPr}<w:t xml:space="preserve">${escapeXml(block.text ?? "")}</w:t></w:r></w:p>`
        );
        break;
      }
      case "list": {
        (block.items ?? []).forEach((item, n) => {
          const prefix = block.ordered ? `${n + 1}. ` : "\u2022 ";
          parts.push(
            `<w:p><w:pPr><w:pStyle w:val="ListParagraph"/>` +
              `<w:ind w:left="720" w:hanging="360"/></w:pPr>` +
              `<w:r><w:t xml:space="preserve">${escapeXml(prefix + item)}</w:t></w:r></w:p>`
          );
        });
        break;
      }
      case "table": {
        const tableXml = tableBlockToXml(block);
        if (tableXml) parts.push(tableXml);
        break;
      }
      case "pagebreak": {
        parts.push(`<w:p><w:r><w:br w:type="page"/></w:r></w:p>`);
        break;
      }
    }
  }
  return parts.join("");
}
// ─── Internal XML helpers ─────────────────────────────────────────────────────
/**
 * Normalise one parsed `w:t` value to its text.
 *
 * Accepts the shapes handled here: a plain string, a number, or an object
 * whose text is stored under the "#text" key. Anything else yields "".
 */
function tText(t: unknown): string {
  switch (typeof t) {
    case "string":
      return t;
    case "number":
      return String(t);
    case "object": {
      if (t !== null && "#text" in t) {
        return String((t as Record<string, unknown>)["#text"]);
      }
      return "";
    }
    default:
      return "";
  }
}
/**
 * Decide whether a parsed toggle property (`w:b`, `w:i`, `w:u`, …) is on.
 *
 * - Missing property (`undefined`/`null`): off.
 * - Boolean: used as-is.
 * - String (element parsed without attributes, e.g. `<w:b/>` → ""): on
 *   unless the text is "0".
 * - Object (element with attributes): on unless `w:val` is one of the off
 *   values "0", "false", "off" or "none". "off" belongs to the OOXML on/off
 *   value set alongside "false" and "0"; "none" is how `w:u` spells
 *   "no underline". The attribute is compared by its string form so a parser
 *   configured to coerce attribute values (0 / false) gives the same answer.
 * - Anything else: on.
 */
function isPropOn(val: unknown): boolean {
  if (val === undefined || val === null) return false;
  if (typeof val === "boolean") return val;
  if (typeof val === "string") return val !== "0";
  if (typeof val === "object") {
    const raw = (val as Record<string, unknown>)["@_w:val"];
    // No w:val attribute means the bare element, which switches the property on.
    if (raw === undefined || raw === null) return true;
    return !["0", "false", "off", "none"].includes(String(raw));
  }
  return true;
}
/**
 * Collect the text runs that are direct `w:r` children of a parsed paragraph.
 *
 * Runs without a `w:t` child, or whose combined text is empty, are skipped.
 * Formatting flags are `true` when on and `undefined` otherwise; `color` is
 * taken from `w:color/@w:val` and dropped when missing or "auto".
 *
 * @param para A paragraph object as produced by the XML parser.
 * @returns The runs in the order the parser delivered them.
 */
function runsFromPara(para: unknown): DocRun[] {
  const rawRuns = (para as Record<string, unknown>)["w:r"];
  if (!rawRuns) return [];

  const collected: DocRun[] = [];
  const candidates: unknown[] = Array.isArray(rawRuns) ? rawRuns : [rawRuns];

  candidates.forEach((candidate) => {
    const node = candidate as Record<string, unknown>;
    const tNode = node["w:t"];
    if (tNode === undefined) return;

    const pieces: unknown[] = Array.isArray(tNode) ? tNode : [tNode];
    const text = pieces.map(tText).join("");
    if (text === "") return;

    const props = (node["w:rPr"] ?? {}) as Record<string, unknown>;
    const flag = (key: string): true | undefined => (isPropOn(props[key]) ? true : undefined);

    const colorNode = props["w:color"];
    let color: string | undefined;
    if (typeof colorNode === "object") {
      color = (colorNode as Record<string, string>)["@_w:val"];
    }

    collected.push({
      text,
      bold: flag("w:b"),
      italic: flag("w:i"),
      underline: flag("w:u"),
      color: color && color !== "auto" ? color : undefined,
    });
  });

  return collected;
}
// ─── DocxDocument class ───────────────────────────────────────────────────────
/**
 * In-memory handle on a .docx/.docm package: the opened zip archive plus the
 * text of `word/document.xml`.
 *
 * Read helpers parse the current XML on demand. Mutations (`setXml`,
 * `findReplace`) only change the in-memory XML until `save()` writes it back
 * into the archive.
 */
export class DocxDocument {
  private zip: JSZip;
  private docXml: string;

  /**
   * @param zip    Opened package archive.
   * @param docXml Contents of `word/document.xml`.
   */
  constructor(zip: JSZip, docXml: string) {
    this.zip = zip;
    this.docXml = docXml;
  }

  // ─── Private helpers ────────────────────────────────────────────────────────

  /** Wrap a value in an array unless it already is one. */
  private static asArray(value: unknown): unknown[] {
    return Array.isArray(value) ? value : [value];
  }

  /**
   * Build a parser that keeps attributes (prefixed "@_"), never coerces or
   * trims text, and always returns the listed tags as arrays.
   */
  private static createParser(arrayTags: readonly string[]): XMLParser {
    return new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: "@_",
      isArray: (name) => arrayTags.includes(name),
      parseTagValue: false,
      trimValues: false,
    });
  }

  /** Text of one parsed paragraph: its runs joined without a separator. */
  private static paragraphText(para: unknown): string {
    return runsFromPara(para).map((r) => r.text).join("");
  }

  /** Decode the five predefined XML entities and numeric character references. */
  private static decodeXmlText(s: string): string {
    return s.replace(
      /&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g,
      (whole: string, ref: string) => {
        switch (ref) {
          case "amp": return "&";
          case "lt": return "<";
          case "gt": return ">";
          case "quot": return '"';
          case "apos": return "'";
        }
        const code = ref[1] === "x" ? parseInt(ref.slice(2), 16) : parseInt(ref.slice(1), 10);
        // Leave out-of-range references untouched instead of throwing.
        return code <= 0x10ffff ? String.fromCodePoint(code) : whole;
      }
    );
  }

  /** Escape text for use as element content (quotes need no escaping there). */
  private static encodeXmlText(s: string): string {
    return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
  }

  /** Read one attribute (either quote style) from a raw tag string, entity-decoded. */
  private static attrOf(tag: string, name: string): string | undefined {
    const m = tag.match(new RegExp(`\\s${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`));
    if (!m) return undefined;
    return DocxDocument.decodeXmlText(m[1] ?? m[2] ?? "");
  }

  /**
   * Candidate archive paths for a relationship target of `word/document.xml`.
   * The first entry resolves the target against the `word/` folder ("." and
   * ".." segments applied). The second, when different, is the older mapping
   * that dropped one "../" and stayed under `word/`, kept so previously
   * found files are still found.
   */
  private static resolveTargetPaths(target: string): string[] {
    if (target.startsWith("/")) return [target.slice(1)];
    const segments = ["word"];
    for (const seg of target.split("/")) {
      if (seg === "" || seg === ".") continue;
      if (seg === "..") segments.pop();
      else segments.push(seg);
    }
    const resolved = segments.join("/");
    const legacy = `word/${target.replace("../", "")}`;
    return resolved === legacy ? [resolved] : [resolved, legacy];
  }

  /** Copy `<tag>value</tag>` values (entity-decoded) from `xml` into `into`. */
  private static collectTags(
    xml: string,
    pairs: ReadonlyArray<readonly [string, string]>,
    into: Record<string, unknown>
  ): void {
    for (const [key, tag] of pairs) {
      const m = xml.match(new RegExp(`<${tag}[^>]*>([^<]*)</${tag}>`));
      if (m) into[key] = DocxDocument.decodeXmlText(m[1]);
    }
  }

  // ─── Load / save ────────────────────────────────────────────────────────────

  /**
   * Open a package from disk.
   * @throws Error when the archive has no `word/document.xml`; file-system
   *         and zip errors propagate as rejections.
   */
  static async load(filePath: string): Promise<DocxDocument> {
    const buf = await fs.promises.readFile(filePath);
    const zip = await JSZip.loadAsync(buf);
    const docFile = zip.file("word/document.xml");
    if (!docFile) throw new Error("Invalid .docx: missing word/document.xml");
    const docXml = await docFile.async("string");
    return new DocxDocument(zip, docXml);
  }

  /** Write the current XML back into the archive and save it to `filePath`. */
  async save(filePath: string): Promise<void> {
    this.zip.file("word/document.xml", this.docXml);
    const buf = await this.zip.generateAsync({
      type: "nodebuffer",
      compression: "DEFLATE",
      compressionOptions: { level: 6 },
    });
    await fs.promises.writeFile(filePath, buf);
  }

  /** Current contents of `word/document.xml`. */
  getXml(): string { return this.docXml; }

  /** Replace the in-memory `word/document.xml`; not validated. */
  setXml(xml: string): void { this.docXml = xml; }

  /** The underlying archive, for access to other package parts. */
  getZip(): JSZip { return this.zip; }

  // ─── Readers ────────────────────────────────────────────────────────────────

  /**
   * Paragraphs that are direct children of `<w:body>`, in parser order.
   * Paragraphs inside tables are not included.
   */
  getParagraphs(): DocParagraph[] {
    const parser = DocxDocument.createParser(["w:p", "w:r", "w:t", "w:tr", "w:tc", "w:tbl"]);
    const parsed = parser.parse(this.docXml) as Record<string, unknown>;
    const body = (parsed["w:document"] as Record<string, unknown>)?.["w:body"] as Record<string, unknown>;
    if (!body) return [];
    const rawParas = body["w:p"];
    if (!rawParas) return [];

    return DocxDocument.asArray(rawParas).map((para, i) => {
      const pPr = ((para as Record<string, unknown>)["w:pPr"] ?? {}) as Record<string, unknown>;

      const styleRaw = pPr["w:pStyle"];
      const style = typeof styleRaw === "object"
        ? ((styleRaw as Record<string, string>)["@_w:val"] ?? "Normal")
        : "Normal";

      const numPr = pPr["w:numPr"] as Record<string, unknown> | undefined;
      const ilvlRaw = numPr?.["w:ilvl"];
      const parsedLevel = typeof ilvlRaw === "object"
        ? parseInt((ilvlRaw as Record<string, string>)["@_w:val"] ?? "0", 10)
        : 0;
      // A malformed w:ilvl falls back to the top level rather than NaN.
      const listLevel = Number.isNaN(parsedLevel) ? 0 : parsedLevel;

      const runs = runsFromPara(para);
      const text = runs.map((r) => r.text).join("");
      return { index: i, text, style, runs, isListItem: !!numPr, listLevel: numPr ? listLevel : undefined };
    });
  }

  /**
   * Tables that are direct children of `<w:body>`.
   *
   * Cell text keeps the exact run text: runs of a paragraph are joined
   * without a separator, so a word split across runs stays intact. Non-empty
   * paragraphs are joined with a single space and the cell is trimmed.
   */
  getTables(): DocTable[] {
    const parser = DocxDocument.createParser(["w:tbl", "w:tr", "w:tc", "w:p", "w:r", "w:t"]);
    const parsed = parser.parse(this.docXml) as Record<string, unknown>;
    const body = (parsed["w:document"] as Record<string, unknown>)?.["w:body"] as Record<string, unknown>;
    const rawTbls = body?.["w:tbl"];
    if (!rawTbls) return [];

    return DocxDocument.asArray(rawTbls).map((tbl, ti) => {
      const rawRows = (tbl as Record<string, unknown>)["w:tr"];
      if (!rawRows) return { index: ti, rows: [] };
      const rows = DocxDocument.asArray(rawRows).map((row) => {
        const rawCells = (row as Record<string, unknown>)["w:tc"];
        if (!rawCells) return [] as string[];
        return DocxDocument.asArray(rawCells).map((cell) => {
          const rawParas = (cell as Record<string, unknown>)["w:p"];
          if (!rawParas) return "";
          return DocxDocument.asArray(rawParas)
            .map((p) => DocxDocument.paragraphText(p))
            .filter((t) => t !== "")
            .join(" ")
            .trim();
        });
      });
      return { index: ti, rows };
    });
  }

  /**
   * Find/replace inside <w:t> elements only — never modifies XML structure.
   *
   * - Only real `<w:t …>…</w:t>` elements are touched; `<w:tbl>`, `<w:tc>`,
   *   `<w:tab/>` and self-closing `<w:t/>` are not mistaken for text elements.
   * - Matching runs against the decoded text (`&amp;` is seen as `&`), and the
   *   result is re-escaped, so neither the search nor the replacement can
   *   inject or break markup.
   * - Elements without a match are left byte-identical.
   * - A match cannot span two `<w:t>` elements.
   * - The replacement is inserted literally; `$1`-style references are not
   *   expanded.
   *
   * @param find    Literal text, or a regular-expression source when `opts.regex` is set.
   * @param replace Replacement text.
   * @param opts    `regex`: treat `find` as a pattern. `caseSensitive`: default false.
   * @returns Number of replacements made; 0 for an empty `find`.
   * @throws SyntaxError when `opts.regex` is set and `find` is not a valid pattern.
   */
  findReplace(
    find: string,
    replace: string,
    opts: { regex?: boolean; caseSensitive?: boolean } = {}
  ): number {
    // An empty pattern would match between every character.
    if (find === "") return 0;
    const flags = opts.caseSensitive ? "g" : "gi";
    const source = opts.regex ? find : find.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const pattern = new RegExp(source, flags);
    let count = 0;
    this.docXml = this.docXml.replace(
      // Open tag: "<w:t" then either ">" or whitespace + attributes, not ending in "/>".
      /(<w:t(?:\s(?:[^>\/]|\/(?!>))*)?>)([\s\S]*?)(<\/w:t>)/g,
      (whole: string, open: string, text: string, close: string) => {
        let hits = 0;
        const replaced = DocxDocument.decodeXmlText(text).replace(pattern, () => {
          hits++;
          return replace;
        });
        if (hits === 0) return whole;
        count += hits;
        return `${open}${DocxDocument.encodeXmlText(replaced)}${close}`;
      }
    );
    return count;
  }

  /**
   * Selected core and extended properties, entity-decoded.
   * From `docProps/core.xml`: title, creator, description, revision, created,
   * modified, lastModifiedBy, keywords. From `docProps/app.xml`: pages, words,
   * characters, paragraphs, application. Missing parts or tags are omitted.
   */
  async getMetadata(): Promise<Record<string, unknown>> {
    const result: Record<string, unknown> = {};
    const coreFile = this.zip.file("docProps/core.xml");
    if (coreFile) {
      DocxDocument.collectTags(await coreFile.async("string"), [
        ["title", "dc:title"], ["creator", "dc:creator"], ["description", "dc:description"],
        ["revision", "cp:revision"], ["created", "dcterms:created"], ["modified", "dcterms:modified"],
        ["lastModifiedBy", "cp:lastModifiedBy"], ["keywords", "cp:keywords"],
      ], result);
    }
    const appFile = this.zip.file("docProps/app.xml");
    if (appFile) {
      DocxDocument.collectTags(await appFile.async("string"), [
        ["pages", "Pages"], ["words", "Words"], ["characters", "Characters"],
        ["paragraphs", "Paragraphs"], ["application", "Application"],
      ], result);
    }
    return result;
  }

  /**
   * Comments from `word/comments.xml` ([] when the part is absent).
   * Runs within a paragraph are joined without a separator; non-empty
   * paragraphs are joined with "\n".
   */
  async getComments(): Promise<DocComment[]> {
    const file = this.zip.file("word/comments.xml");
    if (!file) return [];
    const xml = await file.async("string");
    const parser = DocxDocument.createParser(["w:comment", "w:p", "w:r", "w:t"]);
    const parsed = parser.parse(xml) as Record<string, unknown>;
    const root = parsed["w:comments"] as Record<string, unknown>;
    const raw = root?.["w:comment"];
    if (!raw) return [];

    return DocxDocument.asArray(raw).map((c) => {
      const comment = c as Record<string, unknown>;
      const rawParas = comment["w:p"];
      const paraArr = rawParas ? DocxDocument.asArray(rawParas) : [];
      const text = paraArr
        .map((p) => DocxDocument.paragraphText(p))
        .filter((t) => t !== "")
        .join("\n");
      return {
        id: String(comment["@_w:id"] ?? ""),
        author: String(comment["@_w:author"] ?? ""),
        date: String(comment["@_w:date"] ?? ""),
        text,
      };
    });
  }

  /**
   * Images referenced from `word/_rels/document.xml.rels`.
   *
   * A relationship counts as an image when its Type contains "image".
   * Attributes are read independently, so their order in the tag does not
   * matter. Targets that cannot be found in the archive (for example
   * externally linked images) are skipped.
   */
  async getImages(): Promise<DocImage[]> {
    const relsFile = this.zip.file("word/_rels/document.xml.rels");
    if (!relsFile) return [];
    const relsXml = await relsFile.async("string");
    const images: DocImage[] = [];

    for (const [tag] of relsXml.matchAll(/<Relationship\b[^>]*>/g)) {
      const id = DocxDocument.attrOf(tag, "Id");
      const relType = DocxDocument.attrOf(tag, "Type");
      const target = DocxDocument.attrOf(tag, "Target");
      if (!id || !target || !relType || !relType.includes("image")) continue;

      let file = null;
      for (const candidate of DocxDocument.resolveTargetPaths(target)) {
        file = this.zip.file(candidate);
        if (file) break;
      }
      if (!file) continue;

      const data = await file.async("uint8array");
      const name = target.split("/").pop() ?? target;
      const dot = name.lastIndexOf(".");
      images.push({
        id,
        name,
        type: dot > -1 && dot < name.length - 1 ? name.slice(dot + 1).toLowerCase() : "unknown",
        sizeKB: Math.round((data.length / 1024) * 10) / 10,
      });
    }
    return images;
  }

  /**
   * Report which VBA modules appear to be present in `word/vbaProject.bin`.
   *
   * This does not extract macro source. The binary is scanned as Latin-1 text
   * for `Attribute VB_Name = "…"` markers and a placeholder message is
   * returned per module name, or a single `_raw_detected` entry when no
   * marker is found.
   *
   * @throws Error when the package has no `word/vbaProject.bin`.
   */
  async readVba(): Promise<Record<string, string>> {
    const vbaFile = this.zip.file("word/vbaProject.bin");
    if (!vbaFile) throw new Error("No VBA project found. Is this a .docm file?");
    const data = await vbaFile.async("uint8array");
    const text = Buffer.from(data).toString("latin1");
    const modules: Record<string, string> = {};
    const moduleNames = [...text.matchAll(/Attribute VB_Name = "([^"]+)"/g)];
    if (moduleNames.length === 0) {
      modules["_raw_detected"] = `VBA binary present (${data.length} bytes). Compile the .docm to extract source.`;
    } else {
      for (const [, name] of moduleNames) {
        modules[name] = `[Module: ${name}] — binary VBA detected. Compile the .docm to extract source.`;
      }
    }
    return modules;
  }
}