// Project Files
// src/docxReplace.ts
import * as fs from "node:fs/promises";
import JSZip from "jszip";
/** A single find-and-replace pair. */
export interface Replacement {
  /** Text to search for inside each text run. */
  find: string;
  /** Text substituted for every match of `find`. */
  replace: string;
}
/** Options controlling how find strings are matched. */
export interface ReplaceOptions {
  /** When true, matching is exact; when false, both sides are lower-cased before comparing. */
  caseSensitive: boolean;
}
/** Summary returned by `replaceInDocx`. */
export interface ReplaceResult {
  /** Length of the generated output buffer, in bytes. */
  bytes: number;
  /** One entry per requested replacement, in input order, with the number of occurrences replaced. */
  replacements: Array<{ find: string; replace: string; count: number }>;
  /** Human-readable notes, e.g. find strings that were only found split across runs. */
  warnings: string[];
}
// Files inside the .docx zip where user-authored text typically lives.
// We explicitly skip word/styles.xml, numbering.xml, settings.xml, etc.,
// because those describe the *form* — replacing text there could rename a
// custom style and break Word's style resolution.
// Matches word/document.xml, word/header<N>.xml, word/footer<N>.xml (digits
// optional), word/footnotes.xml, word/endnotes.xml and word/comments.xml.
const TEXT_BEARING_RE =
  /^word\/(document|header\d*|footer\d*|footnotes|endnotes|comments)\.xml$/;
// One <w:t> element. Group 1 is the opening tag (including any attributes),
// group 2 the raw, still entity-encoded content, group 3 the closing tag.
// The tag name must be followed by whitespace or ">", so other elements whose
// names merely start with "w:t" (e.g. <w:tab/>) are not matched.
const WT_RE = /(<w:t(?:\s[^>]*)?>)([\s\S]*?)(<\/w:t>)/g;
// One <w:p> element, from its opening tag to the nearest </w:p> (non-greedy).
// As with WT_RE, names that only start with "w:p" (e.g. <w:pPr>) do not count
// as an opening tag.
const WP_RE = /<w:p(?:\s[^>]*)?>[\s\S]*?<\/w:p>/g;
/**
 * Applies text replacements to a .docx file and writes the result to a new file.
 *
 * Only archive entries whose names match `TEXT_BEARING_RE` are rewritten. Any find
 * string that ends up with zero replacements is then searched for across run
 * boundaries; if it only occurs split over several runs, a warning is recorded
 * instead of a replacement.
 *
 * @param inputPath - Path of the .docx to read.
 * @param outputPath - Path the regenerated archive is written to.
 * @param replacements - Pairs to apply, in order, to each text run.
 * @param opts - Matching options.
 * @returns Output size in bytes, per-replacement counts, and collected warnings.
 */
export async function replaceInDocx(
  inputPath: string,
  outputPath: string,
  replacements: Replacement[],
  opts: ReplaceOptions,
): Promise<ReplaceResult> {
  const raw = await fs.readFile(inputPath);
  const archive = await JSZip.loadAsync(raw);

  // Seed every find string with zero so that unmatched ones are still visited below.
  const totals = new Map<string, number>(
    replacements.map((r): [string, number] => [r.find, 0]),
  );

  const targets = Object.keys(archive.files).filter((entry) => TEXT_BEARING_RE.test(entry));

  for (const entry of targets) {
    const part = archive.file(entry);
    if (!part) continue;

    const original = await part.async("string");
    const outcome = applyToXml(original, replacements, opts);

    outcome.applied.forEach((n, find) => {
      totals.set(find, (totals.get(find) ?? 0) + n);
    });

    // Leave untouched parts alone so they are carried over as-is.
    if (outcome.xml !== original) {
      archive.file(entry, outcome.xml);
    }
  }

  // Cross-run fallback: a find string with no replacements may still be present,
  // just broken up over several runs of one paragraph.
  const warnings: string[] = [];
  for (const [find, total] of totals) {
    if (total > 0) continue;
    const splitHits = await findCrossRunHits(archive, find, opts, targets);
    if (splitHits > 0) {
      warnings.push(
        `"${find}" appears ${splitHits} time(s) split across multiple Word runs — not replaced. ` +
          `This usually happens when Word inserts spell-check or formatting marks mid-string.`,
      );
    }
  }

  const out = await archive.generateAsync({
    type: "nodebuffer",
    compression: "DEFLATE",
    compressionOptions: { level: 9 },
  });
  await fs.writeFile(outputPath, out);

  const summary = replacements.map(({ find, replace }) => ({
    find,
    replace,
    count: totals.get(find) ?? 0,
  }));
  return { bytes: out.length, replacements: summary, warnings };
}
/**
 * Runs every replacement over the text of each `<w:t>` element in one XML part.
 *
 * The element content is entity-decoded before matching and re-encoded afterwards.
 * Elements whose text ends up unchanged are emitted exactly as they were found.
 * Replacements are applied in list order, so a later pair sees the output of
 * earlier pairs.
 *
 * @param xml - Full XML text of one archive part.
 * @param replacements - Pairs to apply to each text run.
 * @param opts - Matching options.
 * @returns The rewritten XML and, per find string, how many occurrences were replaced.
 */
function applyToXml(
  xml: string,
  replacements: Replacement[],
  opts: ReplaceOptions,
): { xml: string; applied: Map<string, number> } {
  const applied = new Map<string, number>();
  for (const r of replacements) applied.set(r.find, 0);

  const edgeWhitespace = /^\s|\s$/;
  const hasXmlSpace = /\sxml:space\s*=/;

  const next = xml.replace(WT_RE, (_match, openTag: string, inner: string, closeTag: string) => {
    const decoded = decodeXml(inner);
    let working = decoded;
    for (const r of replacements) {
      const before = working;
      working = replaceAll(before, r.find, r.replace, opts.caseSensitive);
      if (working !== before) {
        // Count against the text as it was before this pair ran.
        const n = countOccurrences(before, r.find, opts.caseSensitive);
        applied.set(r.find, (applied.get(r.find) ?? 0) + n);
      }
    }
    if (working === decoded) return openTag + inner + closeTag;

    // Leading/trailing whitespace in <w:t> is only significant when the element
    // carries xml:space="preserve"; add it if the new text needs it and the
    // opening tag does not already declare xml:space.
    const tag =
      edgeWhitespace.test(working) && !hasXmlSpace.test(openTag)
        ? openTag.replace(/>$/, ' xml:space="preserve">')
        : openTag;
    return tag + encodeXml(working) + closeTag;
  });
  return { xml: next, applied };
}
/**
 * Counts occurrences of `find` that only exist once the text of all `<w:t>` runs
 * in a paragraph is concatenated.
 *
 * A paragraph in which any single run already contains `find` contributes nothing.
 *
 * @param zip - Archive to read the parts from.
 * @param find - Text to look for.
 * @param opts - Matching options.
 * @param fileNames - Archive entries to scan; missing entries are skipped.
 * @returns Total number of such occurrences over all scanned parts.
 */
async function findCrossRunHits(
  zip: JSZip,
  find: string,
  opts: ReplaceOptions,
  fileNames: string[],
): Promise<number> {
  let hits = 0;
  for (const entry of fileNames) {
    const part = zip.file(entry);
    if (!part) continue;

    const xml = await part.async("string");
    for (const paragraph of xml.match(WP_RE) ?? []) {
      // Decoded text of each run, in document order.
      const runTexts = Array.from(paragraph.matchAll(WT_RE), (m) => decodeXml(m[2]));
      const inSingleRun = runTexts.some((text) => containsText(text, find, opts.caseSensitive));
      if (inSingleRun) continue;
      hits += countOccurrences(runTexts.join(""), find, opts.caseSensitive);
    }
  }
  return hits;
}
/**
 * Reports whether `needle` occurs anywhere in `haystack`.
 *
 * @param caseSensitive - When false, both strings are lower-cased before the check.
 */
function containsText(haystack: string, needle: string, caseSensitive: boolean): boolean {
  const fold = (s: string): string => (caseSensitive ? s : s.toLowerCase());
  return fold(haystack).includes(fold(needle));
}
/**
 * Counts non-overlapping occurrences of `needle` in `haystack`, scanning left to right.
 *
 * @param caseSensitive - When false, both strings are lower-cased before counting.
 * @returns The number of occurrences; 0 for an empty `needle`.
 */
function countOccurrences(haystack: string, needle: string, caseSensitive: boolean): number {
  if (needle === "") return 0;

  const text = caseSensitive ? haystack : haystack.toLowerCase();
  const target = caseSensitive ? needle : needle.toLowerCase();

  let hits = 0;
  // Resume after each match so overlapping occurrences are not counted twice.
  for (let at = text.indexOf(target, 0); at !== -1; at = text.indexOf(target, at + target.length)) {
    hits += 1;
  }
  return hits;
}
/**
 * Replaces every non-overlapping occurrence of `needle` in `haystack` with
 * `replacement`, scanning left to right. `replacement` is inserted literally.
 *
 * In case-insensitive mode a match is a window of `needle.length` code units whose
 * lower-cased form equals the lower-cased needle. The scan runs over the original
 * string rather than a lower-cased copy, because `toLowerCase()` can change a
 * string's length (U+0130 becomes two code units) and indices taken from the copy
 * would then point at the wrong place in the original.
 *
 * @param caseSensitive - When false, matching ignores case as described above.
 * @returns The rewritten string; `haystack` unchanged when `needle` is empty.
 */
function replaceAll(haystack: string, needle: string, replacement: string, caseSensitive: boolean): string {
  if (!needle) return haystack;
  if (caseSensitive) return haystack.split(needle).join(replacement);

  const lowerNeedle = needle.toLowerCase();
  const width = needle.length;
  const out: string[] = [];
  let copiedUpTo = 0; // start of the text not yet copied to `out`
  let i = 0;
  while (i + width <= haystack.length) {
    if (haystack.slice(i, i + width).toLowerCase() === lowerNeedle) {
      out.push(haystack.slice(copiedUpTo, i), replacement);
      i += width;
      copiedUpTo = i;
    } else {
      i += 1;
    }
  }
  out.push(haystack.slice(copiedUpTo));
  return out.join("");
}
/**
 * Decodes the five predefined XML entities and numeric character references
 * (decimal `&#N;` and hexadecimal `&#xH;`) in `s`.
 *
 * Decoding happens in a single pass, so text produced by one reference is never
 * decoded again (`&amp;lt;` becomes `&lt;`, not `<`). References that do not name
 * a valid code point are left as written.
 */
function decodeXml(s: string): string {
  const named: Record<string, string> = { lt: "<", gt: ">", quot: '"', apos: "'", amp: "&" };
  return s.replace(/&(lt|gt|quot|apos|amp|#\d+|#x[0-9a-fA-F]+);/g, (entity: string, body: string) => {
    const literal = named[body];
    if (literal !== undefined) return literal;

    const codePoint = body.startsWith("#x")
      ? parseInt(body.slice(2), 16)
      : parseInt(body.slice(1), 10);
    // fromCodePoint throws on values outside the Unicode range; keep those verbatim.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
  });
}
/**
 * Escapes the characters that may not appear literally in XML character data:
 * `&`, `<` and `>` become `&amp;`, `&lt;` and `&gt;`. Quotes are left as they are.
 */
function encodeXml(s: string): string {
  return s.replace(/[&<>]/g, (ch) => {
    if (ch === "&") return "&amp;";
    return ch === "<" ? "&lt;" : "&gt;";
  });
}