// src/redactor.ts
import type { PiiEntity, PiiType, RedactionResult } from "./types";
// ---------------------------------------------------------------------------
// Placeholder helpers
// ---------------------------------------------------------------------------
/**
 * Wrap an identifier in the triple-bracket placeholder delimiters.
 *
 * @param identifier - Identifier to wrap, e.g. `"PERSON_1"`.
 * @returns The placeholder text, e.g. `"[[[PERSON_1]]]"`.
 */
export function buildPlaceholder(identifier: string): string {
  const opening = "[[[";
  const closing = "]]]";
  return opening.concat(identifier, closing);
}
/**
 * Compose the identifier for the n-th entity of a given PII type by joining
 * the type and the counter with an underscore.
 *
 * @param type - PII category of the entity.
 * @param n - Counter value for that category.
 * @returns The identifier, e.g. `("PERSON", 1)` yields `"PERSON_1"`.
 */
function buildIdentifier(type: PiiType, n: number): string {
  const parts = [type, n];
  return parts.join("_");
}
// ---------------------------------------------------------------------------
// Raw detected entity (internal, before identifier assignment)
// ---------------------------------------------------------------------------
/**
 * A detected PII mention before an identifier has been assigned.
 * Carries only the mention text and its category; no position offsets.
 */
interface RawEntity {
/** Text of the mention; `redact` searches the input text for this string. */
original: string;
/** PII category of the mention; `redact` numbers identifiers separately per type. */
type: PiiType;
}
// ---------------------------------------------------------------------------
// Redactor
// ---------------------------------------------------------------------------
/**
 * Replace every detected PII mention in `text` with a stable placeholder.
 *
 * Rules:
 * - Identifiers are numbered per type, starting at 1, in the order in which
 *   distinct entities first appear in `rawEntities`.
 * - Coreference: raw entities whose `original` values are equal after
 *   lower-casing and trimming share one identifier. Only the first surface
 *   form is recorded in the returned `entities`, but every reported surface
 *   form is substituted in the text.
 * - Raw entities whose `original` is empty or whitespace-only are ignored.
 * - Substitution is a single left-to-right pass that tries longer surface
 *   forms first (e.g. "John Smith" before "John"). Inserted placeholders are
 *   never rescanned, so an entity such as "1" cannot corrupt `[[[PERSON_1]]]`.
 * - Matching against the text is case-sensitive and uses the surface forms
 *   exactly as given.
 *
 * @param text - The original, unredacted text.
 * @param rawEntities - Detected entities, ordered by first appearance.
 * @returns The redacted text together with the de-duplicated entity list.
 */
export function redact(text: string, rawEntities: RawEntity[]): RedactionResult {
  // Per-type counters used to number identifiers.
  const counters = new Map<PiiType, number>();
  // Normalised original text -> identifier (coreference resolution).
  const identifierByKey = new Map<string, string>();
  // Exact surface form -> placeholder to substitute for it.
  const placeholderBySurface = new Map<string, string>();
  // De-duplicated entities in first-appearance order.
  const entities: PiiEntity[] = [];

  for (const raw of rawEntities) {
    const key = raw.original.toLowerCase().trim();
    if (key === "") {
      // An empty pattern would match at every position of the text.
      continue;
    }

    let identifier = identifierByKey.get(key);
    if (identifier === undefined) {
      const n = (counters.get(raw.type) ?? 0) + 1;
      counters.set(raw.type, n);
      identifier = buildIdentifier(raw.type, n);
      identifierByKey.set(key, identifier);
      entities.push({ identifier, type: raw.type, original: raw.original });
    }

    // Remember every distinct spelling so that coreferent variants
    // ("John" / "john") are all replaced, not just the first one seen.
    if (!placeholderBySurface.has(raw.original)) {
      placeholderBySurface.set(raw.original, buildPlaceholder(identifier));
    }
  }

  if (placeholderBySurface.size === 0) {
    return { redacted_text: text, entities };
  }

  // One alternation, longest alternative first, so the regex engine prefers
  // the longest surface form at each position.
  const pattern = Array.from(placeholderBySurface.keys())
    .sort((a, b) => b.length - a.length)
    .map(surface => surface.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
    .join("|");

  // A function replacer avoids `$`-sequence interpretation in the replacement
  // and guarantees that already-inserted placeholders are not matched again.
  const redacted = text.replace(
    new RegExp(pattern, "g"),
    match => placeholderBySurface.get(match) ?? match,
  );

  return { redacted_text: redacted, entities };
}
/**
 * Check that the redacted text still contains the placeholder of every
 * entity in the result.
 *
 * @param result - A redaction result (redacted text plus its entities).
 * @returns `true` when each entity's placeholder occurs at least once in
 *   `result.redacted_text`; `false` as soon as one is missing.
 */
export function validatePlaceholders(result: RedactionResult): boolean {
  const { entities, redacted_text: body } = result;
  let intact = true;
  for (const { identifier } of entities) {
    const expected = buildPlaceholder(identifier);
    if (body.indexOf(expected) < 0) {
      // One missing placeholder is enough to fail validation.
      intact = false;
      break;
    }
  }
  return intact;
}