// src/anonymizer.ts
import { detectAll, dedupeSpans, type Span, type SpanType } from "./detectors";
/**
 * Optional caller-supplied literal terms that `anonymize` redacts in addition
 * to whatever `detectAll` finds on its own.
 */
export type AnonymizeArgs = {
/** Terms searched literally in the text; matches are tagged with span type "NOM". */
names?: string[];
/** Terms searched literally in the text; matches are tagged with span type "CUSTOM". */
customTerms?: string[];
/**
 * Further "CUSTOM" terms, searched after `customTerms`.
 * NOTE(review): the name suggests these come from configuration — confirm against the caller.
 */
configCustomTerms?: string[];
};
/** Value returned by `anonymize`. */
export type AnonymizeResult = {
/** The input text with every resolved span replaced by its placeholder (e.g. `[NOM_1]`). */
anonymized: string;
/** Placeholder (e.g. `[NOM_1]`) mapped to the original text it replaced. */
mapping: Record<string, string>;
/** Per span type, how many distinct values were assigned a placeholder. */
counts: Record<string, number>;
};
/**
 * Makes a string safe to embed in a regular expression as a literal.
 *
 * Each of the characters `. * + ? ^ $ { } ( ) | [ ] \` is prefixed with a
 * backslash; every other character is copied through unchanged.
 *
 * @param s - Raw text to escape.
 * @returns The escaped text.
 */
function escapeRegex(s: string): string {
  const metachars = new Set(".*+?^${}()|[]\\");
  let escaped = "";
  for (const ch of s) {
    escaped += metachars.has(ch) ? "\\" + ch : ch;
  }
  return escaped;
}
/**
 * Finds every whole-word occurrence of a literal term in `text`.
 *
 * The needle is trimmed and matched literally (regex metacharacters are
 * escaped); a blank needle yields no spans. A match is rejected when it
 * directly touches a letter, digit, or combining mark on either side, so
 * "Ann" is not found inside "Anna", and "Jose" is not found inside a
 * decomposed "José" ("Jose" followed by U+0301).
 *
 * @param text - Text to search.
 * @param needle - Literal term to look for; surrounding whitespace is ignored.
 * @param type - Span type stamped on every returned span.
 * @returns One span per match, in order of appearance, with `value` set to
 *   the matched text.
 */
function findLiteral(text: string, needle: string, type: SpanType): Span[] {
  const trimmed = needle.trim();
  if (!trimmed) return [];

  const escaped = escapeRegex(trimmed);
  let re: RegExp;
  try {
    // \p{M} is part of the boundary class: a combining mark continues the
    // preceding word, so a match ending right before one would split a
    // grapheme and leave the mark dangling after the placeholder.
    const wordChar = "[\\p{L}\\p{N}\\p{M}]";
    re = new RegExp(`(?<!${wordChar})${escaped}(?!${wordChar})`, "gu");
  } catch {
    // The engine rejected lookbehind or Unicode property escapes: fall back
    // to a plain substring search with no word-boundary check.
    re = new RegExp(escaped, "g");
  }

  const out: Span[] = [];
  for (const m of text.matchAll(re)) {
    const start = m.index;
    if (start === undefined) continue; // never the case for matchAll results
    out.push({ start, end: start + m[0].length, type, value: m[0] });
  }
  return out;
}
/**
 * Replaces sensitive spans in `text` with numbered placeholders such as
 * `[NOM_1]`.
 *
 * Spans come from `detectAll(text)` plus whole-word literal matches of the
 * caller-supplied terms: `args.names` are tagged "NOM"; `args.customTerms`
 * and `args.configCustomTerms` are tagged "CUSTOM". The combined list is
 * passed through `dedupeSpans` before anything is replaced.
 *
 * Placeholders are numbered per type in reading order, and the same
 * (type, value) pair always receives the same placeholder.
 *
 * @param text - Input text to anonymize.
 * @param args - Optional extra literal terms to redact.
 * @returns The rewritten text, a placeholder-to-original map, and the number
 *   of distinct values replaced per span type.
 */
export function anonymize(text: string, args: AnonymizeArgs): AnonymizeResult {
  const spans: Span[] = [...detectAll(text)];
  for (const name of args.names ?? []) {
    spans.push(...findLiteral(text, name, "NOM"));
  }
  const customs = [...(args.customTerms ?? []), ...(args.configCustomTerms ?? [])];
  for (const term of customs) {
    spans.push(...findLiteral(text, term, "CUSTOM"));
  }

  // Sort explicitly so numbering and assembly are in reading order no matter
  // what order dedupeSpans returns its result in.
  const ordered = [...dedupeSpans(spans)].sort((a, b) => a.start - b.start);

  const pseudoFor = new Map<string, string>();
  const counts: Record<string, number> = {};
  const mapping: Record<string, string> = {};

  // Returns the placeholder for (type, value), minting the next number for
  // that type the first time the pair is seen.
  const getPseudo = (type: string, value: string): string => {
    const key = `${type}|${value}`;
    const cached = pseudoFor.get(key);
    if (cached !== undefined) return cached;
    const next = (counts[type] ?? 0) + 1;
    counts[type] = next;
    const minted = `[${type}_${next}]`;
    pseudoFor.set(key, minted);
    return minted;
  };

  // Single left-to-right pass: copy the untouched text between spans and
  // substitute each span with its placeholder. All indices refer to the
  // original `text`, so nothing shifts while the output is assembled.
  const parts: string[] = [];
  let cursor = 0;
  for (const s of ordered) {
    // A span starting inside one that was already replaced cannot be applied
    // without corrupting the output; skip it.
    if (s.start < cursor) continue;
    const placeholder = getPseudo(s.type, s.value);
    mapping[placeholder] = s.value;
    parts.push(text.slice(cursor, s.start), placeholder);
    cursor = s.end;
  }
  parts.push(text.slice(cursor));

  return { anonymized: parts.join(""), mapping, counts };
}