Project Files
dist / utils / document.js
"use strict";
// Module-interop helper (appears to be the standard compiler-emitted `__createBinding`).
// Re-exposes property `k` of module object `m` on object `o` under the name `k2`
// (defaulting to `k`). An already-defined `this.__createBinding` is reused if present.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Fall back to a live getter when `m` has no own descriptor for `k`, when the descriptor
// is an accessor on a non-__esModule object, or when it is a writable/configurable data
// property — so later changes to m[k] stay visible through o[k2].
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback when Object.create is unavailable: copy the current value once (no live binding).
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Module-interop helper: stores `v` as the `default` property of `o`.
// Uses an enumerable defineProperty when Object.create exists, otherwise a plain assignment.
// An already-defined `this.__setModuleDefault` is reused if present.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
// Module-interop helper used for the namespace-style import of `fs` in this file.
// Returns `mod` unchanged when it is flagged `__esModule`; otherwise builds a new object
// that re-exposes every own property of `mod` except "default" (via __createBinding) and
// sets `default` to `mod` itself (via __setModuleDefault).
var __importStar = (this && this.__importStar) || (function () {
// Self-replacing function: on first call it picks Object.getOwnPropertyNames (or a
// for-in + hasOwnProperty fallback), overwrites `ownKeys` with it, then delegates.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
// `mod != null` deliberately uses loose equality to skip both null and undefined.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
// Module-interop helper used for the default-style import of `jszip` in this file.
// Returns `mod` unchanged when it is flagged `__esModule`; otherwise wraps it as
// `{ default: mod }` so callers can always read `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.DocxDocument = void 0;
exports.escapeXml = escapeXml;
exports.stripSectPr = stripSectPr;
exports.extractBodyContent = extractBodyContent;
exports.findTopLevelParaSpans = findTopLevelParaSpans;
exports.blocksToXml = blocksToXml;
const jszip_1 = __importDefault(require("jszip"));
const fs = __importStar(require("fs"));
const fast_xml_parser_1 = require("fast-xml-parser");
// ─── XML helpers ──────────────────────────────────────────────────────────────
/**
 * Escape the five XML special characters (& < > " ') so the result can be
 * embedded in element text or inside a quoted attribute value.
 *
 * The replacement is done in a single pass, so the "&" introduced by one
 * entity is never escaped a second time.
 *
 * @param {*} s - Value to escape. Non-strings are converted with String();
 *   null/undefined become the empty string.
 * @returns {string} The escaped text.
 */
function escapeXml(s) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;",
  };
  return String(s ?? "").replace(/[&<>"']/g, (ch) => entities[ch]);
}
/**
 * Remove ALL <w:sectPr> elements from XML — both the self-closing form and
 * the element form, including everything nested inside them.
 * MUST be called on extracted body content before injection into another document.
 *
 * Nesting is tracked explicitly: a section-property change record
 * (<w:sectPrChange>) contains its own <w:sectPr> child, so stopping at the
 * first </w:sectPr> would leave dangling close tags behind.
 *
 * An opening <w:sectPr> that is never closed, and a stray </w:sectPr> with no
 * opener, are left in place untouched.
 *
 * @param {string} xml - OOXML fragment.
 * @returns {string} The fragment with every complete <w:sectPr> element removed.
 */
function stripSectPr(xml) {
  // The lookahead pins the tag name, so <w:sectPrChange> is not taken for <w:sectPr>.
  const tagRe = /<w:sectPr(?=[\s/>])[^>]*>|<\/w:sectPr>/g;
  const kept = [];
  let depth = 0;
  // Start of the text that has not been copied to `kept` yet. While inside an
  // element it points at the outermost opening tag, so an unclosed element is
  // emitted verbatim by the final slice.
  let cursor = 0;
  let m;
  while ((m = tagRe.exec(xml)) !== null) {
    const tag = m[0];
    const tagEnd = tagRe.lastIndex;
    if (tag.startsWith("</")) {
      if (depth > 0) {
        depth--;
        if (depth === 0) {
          cursor = tagEnd; // outermost element fully consumed
        }
      }
      continue;
    }
    const selfClosing = tag.endsWith("/>");
    if (depth === 0) {
      kept.push(xml.slice(cursor, m.index));
      cursor = selfClosing ? tagEnd : m.index;
    }
    if (!selfClosing) {
      depth++;
    }
  }
  kept.push(xml.slice(cursor));
  return kept.join("");
}
/**
 * Return everything between the first `<w:body>` and the last `</w:body>`.
 * Spanning first-open to last-close keeps the whole body, nested elements included.
 * Yields "" when the pair is not present in that order.
 */
function extractBodyContent(xml) {
  const openTag = "<w:body>";
  const closeTag = "</w:body>";
  const openAt = xml.indexOf(openTag);
  if (openAt === -1) {
    return "";
  }
  const innerStart = openAt + openTag.length;
  const closeAt = xml.lastIndexOf(closeTag);
  // The closing tag must come after the opening tag for there to be a body.
  return closeAt >= innerStart ? xml.slice(innerStart, closeAt) : "";
}
/**
 * Locate the top-level <w:p> elements in body content.
 * Paragraphs nested inside <w:tbl> or <w:sdt> are skipped to avoid corrupting
 * table structure.
 *
 * Details:
 *  - A self-closing paragraph (<w:p/> or <w:p .../>) yields a span covering
 *    just that tag; it is never paired with a later paragraph's </w:p>.
 *  - Paragraphs nested inside a top-level paragraph (e.g. text-box content)
 *    are counted, so the span ends at the matching </w:p>, not the first one.
 *  - A top-level paragraph that is never closed produces no span.
 *  - The input is scanned once with a single tag regex (linear time).
 *
 * @param {string} bodyContent - Inner XML of <w:body>.
 * @returns {{start: number, end: number}[]} Half-open [start, end) offsets
 *   into `bodyContent`, in document order.
 */
function findTopLevelParaSpans(bodyContent) {
  const spans = [];
  // Only the tags that affect nesting are matched. The lookahead pins the tag
  // name so <w:pPr>, <w:pict>, <w:tblPr>, <w:sdtContent>, ... are ignored.
  const tagRe = /<(\/?)w:(tbl|sdt|p)(?=[\s/>])[^>]*>/g;
  let containerDepth = 0; // nesting level of <w:tbl>/<w:sdt> outside any paragraph
  let paraDepth = 0; // > 0 while inside a top-level paragraph
  let paraStart = -1;
  let m;
  while ((m = tagRe.exec(bodyContent)) !== null) {
    const [tag, slash, name] = m;
    const isClose = slash === "/";
    const isSelfClosing = !isClose && tag.endsWith("/>");

    if (paraDepth > 0) {
      // Inside a top-level paragraph only <w:p> nesting matters.
      if (name !== "p" || isSelfClosing) {
        continue;
      }
      paraDepth += isClose ? -1 : 1;
      if (paraDepth === 0) {
        spans.push({ start: paraStart, end: tagRe.lastIndex });
      }
      continue;
    }

    if (name !== "p") {
      // Self-closing containers hold nothing and do not change depth.
      if (isSelfClosing) {
        continue;
      }
      if (!isClose) {
        containerDepth++;
      } else if (containerDepth > 0) {
        containerDepth--;
      }
      continue;
    }

    // A <w:p> tag outside any top-level paragraph.
    if (containerDepth > 0 || isClose) {
      continue;
    }
    if (isSelfClosing) {
      spans.push({ start: m.index, end: tagRe.lastIndex });
      continue;
    }
    paraStart = m.index;
    paraDepth = 1;
  }
  return spans;
}
/**
 * Convert ContentBlock[] → raw OOXML XML.
 * No external document round-trips. Safe to inject into any document's <w:body>.
 *
 * Supported block types: "heading", "paragraph", "list", "table", "pagebreak".
 * Blocks of any other type are ignored.
 *
 * Every caller-supplied value that ends up in the output — text, list items,
 * table cells, and also the heading level and paragraph alignment that are
 * written into attributes — goes through escapeXml, so a stray quote or "<"
 * cannot break the markup.
 *
 * Tables: the grid is sized from the widest of the header row and the data
 * rows. Rows without any cells are skipped, and a table with no cells at all
 * is skipped entirely, so an empty <w:tr> or a row-less <w:tbl> is never
 * emitted. (Such markup carries no content; presumably word processors may
 * reject it — TODO confirm.)
 *
 * @param {Array<object>} blocks - Content blocks; fields read per type are
 *   `level`/`bold`/`text` (heading), `align`/`bold`/`italic`/`text`
 *   (paragraph), `items`/`ordered` (list), `headers`/`rows` (table).
 * @returns {string} Concatenated OOXML for all blocks ("" for no blocks).
 */
function blocksToXml(blocks) {
  // Coerce to string before escaping so numeric cells/levels are accepted.
  const esc = (value) => escapeXml(String(value ?? ""));
  const TABLE_WIDTH = 9360; // total table width, dxa units
  const parts = [];
  for (const block of blocks) {
    switch (block.type) {
      case "heading": {
        const lvl = block.level ?? 1;
        // Headings are bold unless explicitly disabled.
        const rPr = block.bold !== false ? "<w:rPr><w:b/></w:rPr>" : "";
        parts.push(
          `<w:p><w:pPr><w:pStyle w:val="Heading${esc(lvl)}"/></w:pPr>` +
            `<w:r>${rPr}<w:t xml:space="preserve">${esc(block.text)}</w:t></w:r></w:p>`
        );
        break;
      }
      case "paragraph": {
        // The caller's "justify" is written as the w:jc value "both".
        const alignVal = block.align === "justify" ? "both" : (block.align ?? "");
        const pPr = alignVal ? `<w:pPr><w:jc w:val="${esc(alignVal)}"/></w:pPr>` : "";
        const rPrParts = [];
        if (block.bold) {
          rPrParts.push("<w:b/>");
        }
        if (block.italic) {
          rPrParts.push("<w:i/>");
        }
        const rPr = rPrParts.length ? `<w:rPr>${rPrParts.join("")}</w:rPr>` : "";
        parts.push(
          `<w:p>${pPr}` +
            `<w:r>${rPr}<w:t xml:space="preserve">${esc(block.text)}</w:t></w:r></w:p>`
        );
        break;
      }
      case "list": {
        // Bullets are literal text, not Word numbering definitions.
        // NOTE(review): ordered lists get no number prefix here, so they
        // render as plain indented paragraphs — confirm whether callers are
        // expected to put the numbers into `items` themselves.
        const prefix = block.ordered ? "" : "\u2022 ";
        for (const item of block.items ?? []) {
          parts.push(
            `<w:p><w:pPr><w:pStyle w:val="ListParagraph"/>` +
              `<w:ind w:left="720" w:hanging="360"/></w:pPr>` +
              `<w:r><w:t xml:space="preserve">${esc(prefix + item)}</w:t></w:r></w:p>`
          );
        }
        break;
      }
      case "table": {
        const headers = block.headers ?? [];
        const rowCells = [headers, ...(block.rows ?? [])].filter((cells) => cells.length > 0);
        if (rowCells.length === 0) {
          break; // nothing to render
        }
        const colCount = Math.max(...rowCells.map((cells) => cells.length));
        const colWidth = Math.floor(TABLE_WIDTH / colCount);
        const borders = ["top", "left", "bottom", "right", "insideH", "insideV"]
          .map((side) => `<w:${side} w:val="single" w:sz="4" w:space="0" w:color="CCCCCC"/>`)
          .join("");
        const tblProps =
          `<w:tblPr><w:tblW w:w="${TABLE_WIDTH}" w:type="dxa"/>` +
          `<w:tblBorders>${borders}</w:tblBorders></w:tblPr>`;
        const tblGrid = `<w:gridCol w:w="${colWidth}"/>`.repeat(colCount);
        const makeRow = (cells, isHeader) => {
          // Header cells: blue fill with bold white text.
          const shd = isHeader ? `<w:shd w:val="clear" w:color="auto" w:fill="4472C4"/>` : "";
          const rPr = isHeader ? `<w:rPr><w:b/><w:color w:val="FFFFFF"/></w:rPr>` : "";
          const tcs = cells
            .map(
              (cell) =>
                `<w:tc><w:tcPr><w:tcW w:w="${colWidth}" w:type="dxa"/>${shd}` +
                `<w:tcMar><w:top w:w="80" w:type="dxa"/><w:bottom w:w="80" w:type="dxa"/>` +
                `<w:left w:w="120" w:type="dxa"/><w:right w:w="120" w:type="dxa"/></w:tcMar>` +
                `</w:tcPr>` +
                `<w:p><w:r>${rPr}<w:t xml:space="preserve">${esc(cell)}</w:t></w:r></w:p>` +
                `</w:tc>`
            )
            .join("");
          return `<w:tr>${tcs}</w:tr>`;
        };
        const allRows = rowCells.map((cells) => makeRow(cells, cells === headers)).join("");
        parts.push(`<w:tbl>${tblProps}<w:tblGrid>${tblGrid}</w:tblGrid>${allRows}</w:tbl>`);
        break;
      }
      case "pagebreak": {
        parts.push(`<w:p><w:r><w:br w:type="page"/></w:r></w:p>`);
        break;
      }
    }
  }
  return parts.join("");
}
// ─── Internal XML helpers ─────────────────────────────────────────────────────
/**
 * Normalise one parsed <w:t> value to a string.
 * Strings pass through, numbers are stringified, and an object carrying a
 * "#text" key contributes String() of that key's value. Anything else → "".
 */
function tText(t) {
  switch (typeof t) {
    case "string":
      return t;
    case "number":
      return String(t);
    case "object":
      // typeof null is "object", so null is excluded explicitly.
      return t !== null && "#text" in t ? String(t["#text"]) : "";
    default:
      return "";
  }
}
/**
 * Decide whether a parsed toggle property (e.g. w:b, w:i, w:u) is switched on.
 *
 * - undefined / null: the element is absent → off.
 * - boolean: used as-is.
 * - string: the element's text value; on unless it is one of the "off" values.
 *   An empty string (element with no value) counts as on.
 * - object: the element has attributes; on unless its `w:val` attribute is one
 *   of the "off" values. A missing `w:val` counts as on.
 * - anything else: on.
 *
 * "Off" values are "0", "false", "off" and "none". "none" covers value lists
 * such as the one used by w:u, where it means no decoration.
 *
 * @param {*} val - Parsed property node.
 * @returns {boolean} True when the property is enabled.
 */
function isPropOn(val) {
  const OFF_VALUES = ["0", "false", "off", "none"];
  if (val === undefined || val === null) {
    return false;
  }
  if (typeof val === "boolean") {
    return val;
  }
  if (typeof val === "string") {
    return !OFF_VALUES.includes(val);
  }
  if (typeof val === "object") {
    return !OFF_VALUES.includes(val["@_w:val"]);
  }
  return true;
}
function runsFromPara(para) {
const p = para;
const runs = [];
const rawRuns = p["w:r"];
if (!rawRuns)
return runs;
const runArr = Array.isArray(rawRuns) ? rawRuns : [rawRuns];
for (const run of runArr) {
const r = run;
const tRaw = r["w:t"];
if (tRaw === undefined)
continue;
const texts = Array.isArray(tRaw) ? tRaw : [tRaw];
const text = texts.map(tText).join("");
if (!text)
continue;
const rPr = (r["w:rPr"] ?? {});
const colorRaw = rPr["w:color"];
const color = typeof colorRaw === "object"
? colorRaw["@_w:val"]
: undefined;
runs.push({
text,
bold: isPropOn(rPr["w:b"]) ? true : undefined,
italic: isPropOn(rPr["w:i"]) ? true : undefined,
underline: isPropOn(rPr["w:u"]) ? true : undefined,
color: color && color !== "auto" ? color : undefined,
});
}
return runs;
}
// ─── DocxDocument class ───────────────────────────────────────────────────────
/**
 * Decode the five predefined XML entities and numeric character references.
 * Unknown entities, and numeric references outside the Unicode range, are
 * left untouched.
 */
function decodeXmlEntities(s) {
  const named = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  return s.replace(/&(?:(amp|lt|gt|quot|apos)|#(\d+)|#x([0-9a-fA-F]+));/g, (whole, name, dec, hex) => {
    if (name) {
      return named[name];
    }
    const codePoint = dec ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
  });
}
/**
 * Encode text for use as element content. Only the characters that must be
 * escaped there are touched (& < >, plus CR so it survives re-parsing);
 * quotes stay literal to keep rewritten text close to its original form.
 */
function encodeXmlText(s) {
  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", "\r": "&#xD;" };
  return s.replace(/[&<>\r]/g, (ch) => entities[ch]);
}
/** Text of one parsed paragraph: its runs concatenated with no separator. */
function paragraphText(para) {
  return runsFromPara(para).map((r) => r.text).join("");
}
/**
 * In-memory handle on a .docx/.docm package: the opened zip archive plus the
 * text of word/document.xml, which is the only part this class rewrites.
 */
class DocxDocument {
  /**
   * @param {object} zip - Opened JSZip archive of the package.
   * @param {string} docXml - Contents of word/document.xml.
   */
  constructor(zip, docXml) {
    this.zip = zip;
    this.docXml = docXml;
  }
  /**
   * Read a package from disk.
   * @param {string} filePath
   * @returns {Promise<DocxDocument>}
   * @throws {Error} When the archive has no word/document.xml.
   */
  static async load(filePath) {
    const buf = fs.readFileSync(filePath);
    const zip = await jszip_1.default.loadAsync(buf);
    const docFile = zip.file("word/document.xml");
    if (!docFile)
      throw new Error("Invalid .docx: missing word/document.xml");
    const docXml = await docFile.async("string");
    return new DocxDocument(zip, docXml);
  }
  /**
   * Write the current document XML back into the archive and save the
   * archive to `filePath` (DEFLATE, level 6).
   * @param {string} filePath
   */
  async save(filePath) {
    this.zip.file("word/document.xml", this.docXml);
    const buf = await this.zip.generateAsync({
      type: "nodebuffer",
      compression: "DEFLATE",
      compressionOptions: { level: 6 },
    });
    fs.writeFileSync(filePath, buf);
  }
  /** @returns {string} Current word/document.xml text. */
  getXml() { return this.docXml; }
  /** Replace the in-memory word/document.xml text (written on save()). */
  setXml(xml) { this.docXml = xml; }
  /** @returns {object} The underlying zip archive. */
  getZip() { return this.zip; }
  /**
   * Parse the paragraphs that are direct children of <w:body>.
   * @returns {Array<{index: number, text: string, style: string, runs: object[],
   *   isListItem: boolean, listLevel: (number|undefined)}>} `style` falls back
   *   to "Normal"; `listLevel` is only set for paragraphs that carry w:numPr.
   */
  getParagraphs() {
    const parser = new fast_xml_parser_1.XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: "@_",
      isArray: (name) => ["w:p", "w:r", "w:t", "w:tr", "w:tc", "w:tbl"].includes(name),
      parseTagValue: false,
      trimValues: false,
    });
    const parsed = parser.parse(this.docXml);
    const body = parsed["w:document"]?.["w:body"];
    if (!body)
      return [];
    const rawParas = body["w:p"];
    if (!rawParas)
      return [];
    const paraArr = Array.isArray(rawParas) ? rawParas : [rawParas];
    return paraArr.map((para, i) => {
      const pPr = (para["w:pPr"] ?? {});
      const styleRaw = pPr["w:pStyle"];
      const style = typeof styleRaw === "object"
        ? (styleRaw["@_w:val"] ?? "Normal")
        : "Normal";
      const numPr = pPr["w:numPr"];
      const ilvlRaw = numPr?.["w:ilvl"];
      const listLevel = typeof ilvlRaw === "object"
        ? Number.parseInt(ilvlRaw["@_w:val"] ?? "0", 10)
        : 0;
      const runs = runsFromPara(para);
      const text = runs.map(r => r.text).join("");
      return { index: i, text, style, runs, isListItem: !!numPr, listLevel: numPr ? listLevel : undefined };
    });
  }
  /**
   * Parse the tables that are direct children of <w:body>.
   *
   * Each cell's text is its paragraphs joined by a single space and trimmed.
   * Runs inside one paragraph are concatenated without a separator, and run
   * text is not trimmed by the parser, so a word split across several runs
   * stays one word and spaces at run boundaries are kept.
   *
   * @returns {Array<{index: number, rows: string[][]}>}
   */
  getTables() {
    const parser = new fast_xml_parser_1.XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: "@_",
      isArray: (name) => ["w:tbl", "w:tr", "w:tc", "w:p", "w:r", "w:t"].includes(name),
      parseTagValue: false,
      trimValues: false,
    });
    const parsed = parser.parse(this.docXml);
    const body = parsed["w:document"]?.["w:body"];
    const rawTbls = body?.["w:tbl"];
    if (!rawTbls)
      return [];
    const tblArr = Array.isArray(rawTbls) ? rawTbls : [rawTbls];
    return tblArr.map((tbl, ti) => {
      const rawRows = tbl["w:tr"];
      if (!rawRows)
        return { index: ti, rows: [] };
      const rowArr = Array.isArray(rawRows) ? rawRows : [rawRows];
      const rows = rowArr.map(row => {
        const rawCells = row["w:tc"];
        if (!rawCells)
          return [];
        const cellArr = Array.isArray(rawCells) ? rawCells : [rawCells];
        return cellArr.map(cell => {
          const rawParas = cell["w:p"];
          if (!rawParas)
            return "";
          const pArr = Array.isArray(rawParas) ? rawParas : [rawParas];
          // Empty paragraphs are dropped so they do not add stray spaces.
          return pArr.map(paragraphText).filter(t => t !== "").join(" ").trim();
        });
      });
      return { index: ti, rows };
    });
  }
  /**
   * Find/replace inside <w:t> elements only — never modifies XML structure.
   *
   * Only real <w:t> text elements are touched; <w:tbl>, <w:tc>, <w:tab/> and
   * other tags whose names merely start with "w:t" are not. Matching runs
   * against the decoded text (so "&" matches an ampersand in the document,
   * and entity names such as "amp" are never matched), and the result is
   * re-encoded, so `replace` may safely contain "&", "<" or ">". Text
   * elements without a match are left byte-identical.
   *
   * @param {string} find - Literal text, or a regex source when `opts.regex`.
   * @param {string} replace - Replacement text, inserted literally.
   * @param {{caseSensitive?: boolean, regex?: boolean}} [opts]
   * @returns {number} Number of replacements made (0 for an empty `find`).
   */
  findReplace(find, replace, opts = {}) {
    // An empty pattern would match between every character.
    if (find === "")
      return 0;
    const flags = opts.caseSensitive ? "g" : "gi";
    const escaped = opts.regex ? find : find.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const pattern = new RegExp(escaped, flags);
    let count = 0;
    // `(?:\s[^>]*)?>` pins the tag name to exactly w:t; `[^<]*` guarantees the
    // captured text holds no markup.
    this.docXml = this.docXml.replace(/(<w:t(?:\s[^>]*)?>)([^<]*)(<\/w:t>)/g, (whole, open, text, close) => {
      let hits = 0;
      const replaced = decodeXmlEntities(text).replace(pattern, () => { hits++; return replace; });
      if (hits === 0)
        return whole;
      count += hits;
      return `${open}${encodeXmlText(replaced)}${close}`;
    });
    return count;
  }
  /**
   * Read selected core and extended properties from docProps/core.xml and
   * docProps/app.xml. Missing parts or properties are simply omitted.
   * Values are returned decoded (e.g. "R&D", not "R&amp;D").
   * @returns {Promise<Object<string, string>>}
   */
  async getMetadata() {
    const result = {};
    const sources = [
      ["docProps/core.xml", [
        ["title", "dc:title"], ["creator", "dc:creator"], ["description", "dc:description"],
        ["revision", "cp:revision"], ["created", "dcterms:created"], ["modified", "dcterms:modified"],
        ["lastModifiedBy", "cp:lastModifiedBy"], ["keywords", "cp:keywords"],
      ]],
      ["docProps/app.xml", [
        ["pages", "Pages"], ["words", "Words"], ["characters", "Characters"],
        ["paragraphs", "Paragraphs"], ["application", "Application"],
      ]],
    ];
    for (const [path, fields] of sources) {
      const part = this.zip.file(path);
      if (!part)
        continue;
      const xml = await part.async("string");
      for (const [key, tag] of fields) {
        const m = xml.match(new RegExp(`<${tag}[^>]*>([^<]*)</${tag}>`));
        if (m)
          result[key] = decodeXmlEntities(m[1]);
      }
    }
    return result;
  }
  /**
   * Read word/comments.xml.
   * Each comment's text is its paragraphs joined by "\n"; runs inside a
   * paragraph are concatenated without a separator and are not trimmed.
   * @returns {Promise<Array<{id: string, author: string, date: string, text: string}>>}
   *   Empty when the package has no comments part.
   */
  async getComments() {
    const file = this.zip.file("word/comments.xml");
    if (!file)
      return [];
    const xml = await file.async("string");
    const parser = new fast_xml_parser_1.XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: "@_",
      isArray: (name) => ["w:comment", "w:p", "w:r", "w:t"].includes(name),
      parseTagValue: false,
      trimValues: false,
    });
    const parsed = parser.parse(xml);
    const root = parsed["w:comments"];
    const raw = root?.["w:comment"];
    if (!raw)
      return [];
    const arr = Array.isArray(raw) ? raw : [raw];
    return arr.map(comment => {
      const rawParas = comment["w:p"];
      const paraArr = rawParas ? (Array.isArray(rawParas) ? rawParas : [rawParas]) : [];
      const text = paraArr.map(paragraphText).join("\n");
      return {
        id: String(comment["@_w:id"] ?? ""),
        author: String(comment["@_w:author"] ?? ""),
        date: String(comment["@_w:date"] ?? ""),
        text,
      };
    });
  }
  /**
   * List the images referenced from word/_rels/document.xml.rels that are
   * actually present in the archive.
   *
   * Relationship attributes are read independently of the order in which they
   * are written. A target starting with "/" is resolved from the package
   * root; any other target is resolved under word/ (a leading "../" is dropped).
   *
   * @returns {Promise<Array<{id: string, name: string, type: string, sizeKB: number}>>}
   *   `type` is the lower-cased file extension; `sizeKB` has one decimal.
   */
  async getImages() {
    const relsFile = this.zip.file("word/_rels/document.xml.rels");
    if (!relsFile)
      return [];
    const relsXml = await relsFile.async("string");
    const readAttr = (tag, name) => {
      const m = tag.match(new RegExp(`\\s${name}="([^"]*)"`));
      return m ? decodeXmlEntities(m[1]) : undefined;
    };
    const images = [];
    // `\s` after the name keeps the <Relationships> root element out.
    for (const [tag] of relsXml.matchAll(/<Relationship\s[^>]*>/g)) {
      const id = readAttr(tag, "Id");
      const relType = readAttr(tag, "Type");
      const target = readAttr(tag, "Target");
      if (!id || !target || !relType || !relType.includes("image"))
        continue;
      const mediaPath = target.startsWith("/")
        ? target.slice(1)
        : `word/${target.replace("../", "")}`;
      const file = this.zip.file(mediaPath);
      if (!file)
        continue; // external or missing target
      const data = await file.async("uint8array");
      const ext = target.split(".").pop()?.toLowerCase() ?? "unknown";
      images.push({
        id,
        name: target.split("/").pop() ?? target,
        type: ext,
        sizeKB: Math.round((data.length / 1024) * 10) / 10,
      });
    }
    return images;
  }
  /**
   * Report the VBA modules found in word/vbaProject.bin.
   * The binary is scanned for `Attribute VB_Name = "..."` markers only; module
   * source is not extracted. Each value is a placeholder message.
   * @returns {Promise<Object<string, string>>} Keyed by module name, or the
   *   single key "_raw_detected" when no name marker is found.
   * @throws {Error} When the package has no word/vbaProject.bin.
   */
  async readVba() {
    const vbaFile = this.zip.file("word/vbaProject.bin");
    if (!vbaFile)
      throw new Error("No VBA project found. Is this a .docm file?");
    const data = await vbaFile.async("uint8array");
    // latin1 maps every byte to one character, so the binary can be searched as text.
    const text = Buffer.from(data).toString("latin1");
    const modules = {};
    const moduleNames = [...text.matchAll(/Attribute VB_Name = "([^"]+)"/g)];
    if (moduleNames.length === 0) {
      modules["_raw_detected"] = `VBA binary present (${data.length} bytes). Compile the .docm to extract source.`;
    }
    else {
      for (const [, name] of moduleNames) {
        modules[name] = `[Module: ${name}] — binary VBA detected. Compile the .docm to extract source.`;
      }
    }
    return modules;
  }
}
exports.DocxDocument = DocxDocument;
//# sourceMappingURL=document.js.map