Project Files
src / anonymizer.test.ts
import { test } from "node:test";
import { strict as assert } from "node:assert";
import { anonymize } from "./anonymizer.ts";
test("anonymize: returns the expected output shape", () => {
  // Smoke test of the result contract: redacted text, a pseudonym -> original
  // mapping, and per-type counts.
  const { anonymized, mapping, counts } = anonymize("Hello alice@example.com", {});

  assert.equal(typeof anonymized, "string");
  assert.equal(typeof mapping, "object");
  assert.equal(typeof counts, "object");

  assert.equal(counts.EMAIL, 1);
  assert.equal(anonymized, "Hello [EMAIL_1]");
  assert.equal(mapping["[EMAIL_1]"], "alice@example.com");
});
test("anonymize: stable pseudonyms within a call", () => {
  // The same email appearing twice must reuse a single pseudonym.
  const input = "Mail alice@example.com puis re-mail alice@example.com.";
  const expected = "Mail [EMAIL_1] puis re-mail [EMAIL_1].";

  const result = anonymize(input, {});

  assert.equal(result.counts.EMAIL, 1);
  assert.equal(result.anonymized, expected);
});
test("anonymize: coreference across phone formats", () => {
  // The 0-prefix, +33 and +33(0) writings of one number are expected to share
  // a single pseudonym once normalized.
  const input = "Appelle au 06 12 34 56 78, +33 6 12 34 56 78 ou +33 (0)6 12 34 56 78.";

  const { counts, mapping, anonymized } = anonymize(input, {});

  assert.equal(counts.TEL, 1, "all three phone forms collapse to one pseudo");
  // The mapping keeps the first written form.
  assert.equal(mapping["[TEL_1]"], "06 12 34 56 78");
  assert.equal(anonymized, "Appelle au [TEL_1], [TEL_1] ou [TEL_1].");
});
test("anonymize: coreference across IBAN case and spacing", () => {
  // Same IBAN written compact/upper-case and grouped/lower-case.
  const compactIban = "FR1420041010050500013M02606";
  const spacedIban = "fr14 2004 1010 0505 0001 3m02 606";

  const result = anonymize(`Vire sur ${compactIban} ou sur ${spacedIban}.`, {});

  assert.equal(result.counts.IBAN, 1);
  // The first-seen writing is the one recorded in the mapping.
  assert.equal(result.mapping["[IBAN_1]"], compactIban);
});
test("anonymize: email coreference is case-insensitive", () => {
  // Two casings of the same address must count as one EMAIL entity.
  const input = "Mail Alice@Example.COM puis alice@example.com.";
  const { counts } = anonymize(input, {});
  assert.equal(counts.EMAIL, 1);
});
test("anonymize: names with word boundary", () => {
  // "Jean" must NOT match inside "Jeanne"; "majeur" must also survive intact.
  const input = "Jean a parlé à Jeanne, c'est majeur. Puis Jean repart.";

  const { counts, anonymized } = anonymize(input, { names: ["Jean"] });

  assert.equal(counts.NOM, 1);
  assert.match(anonymized, /Jeanne/);
  assert.match(anonymized, /majeur/);

  // Count how many times the pseudonym was substituted into the output.
  const pieces = anonymized.split("[NOM_1]");
  assert.equal(pieces.length - 1, 2, "should redact both standalone 'Jean'");
});
test("anonymize: longer name wins over shorter overlap", () => {
  const input = "Jean a appelé Jean Dupont au sujet de Jean.";
  const { counts, mapping } = anonymize(input, { names: ["Jean", "Jean Dupont"] });

  // Two distinct pseudonyms are expected: one for the standalone "Jean", one
  // for the compound "Jean Dupont".
  assert.equal(counts.NOM, 2);

  // The compound name must appear as a mapping value of its own, i.e. it was
  // not split into two separate NOM redactions.
  const compoundEntry = Object.entries(mapping).find((entry) => entry[1] === "Jean Dupont");
  const compoundPseudo = compoundEntry?.[0];
  assert.ok(compoundPseudo, "Jean Dupont mapped");
});
test("anonymize: custom_terms and configCustomTerms both apply", () => {
  // One term comes from each option; both must be redacted as CUSTOM.
  const options = {
    customTerms: ["ACME Corp"],
    configCustomTerms: ["Hydra-7"],
  };

  const result = anonymize("ACME Corp paie sur projet Hydra-7 demain.", options);

  assert.equal(result.counts.CUSTOM, 2);
  assert.match(result.anonymized, /\[CUSTOM_\d\] paie sur projet \[CUSTOM_\d\]/);
});
test("anonymize: detector overrides custom term on identical span", () => {
  // A valid French phone number that is ALSO passed as a custom term: the
  // detector (more specific type) is expected to win, because dedupe is stable
  // by insertion order and detectors run before custom terms.
  const phone = "06 12 34 56 78";

  const { counts } = anonymize("Tel: 06 12 34 56 78", { customTerms: [phone] });

  assert.equal(counts.TEL, 1);
  assert.equal(counts.CUSTOM, undefined);
});
test("anonymize: pseudo numbering is in reading order", () => {
  // Names are supplied alphabetically but appear in the text as Bob, Carol,
  // Alice; numbering must follow the text, not the option order.
  const input = "Bob bob@ex.com Carol carol@ex.com Alice alice@ex.com";
  const { mapping } = anonymize(input, { names: ["Alice", "Bob", "Carol"] });

  // Expected pseudonym -> original pairs, checked in the original assertion
  // order (emails first, then names).
  const expectedPairs: Array<[string, string]> = [
    ["[EMAIL_1]", "bob@ex.com"],
    ["[EMAIL_2]", "carol@ex.com"],
    ["[EMAIL_3]", "alice@ex.com"],
    ["[NOM_1]", "Bob"],
    ["[NOM_2]", "Carol"],
    ["[NOM_3]", "Alice"],
  ];
  for (const [pseudo, original] of expectedPairs) {
    assert.equal(mapping[pseudo], original);
  }
});
test("anonymize: NIR Corsica end-to-end", () => {
  // NIR containing the "2A" department code, alongside a custom term.
  const input = "Sécu: 2 89 04 2A 342 163 90, employeur ACME.";

  const { counts, anonymized } = anonymize(input, { customTerms: ["ACME"] });

  assert.equal(counts.NIR, 1);
  assert.match(anonymized, /Sécu: \[NIR_1\], employeur \[CUSTOM_1\]\./);
});
test("anonymize: empty input produces empty mapping", () => {
  // Text with nothing to redact: the configured name never occurs in it.
  const input = "Texte sans rien de sensible.";

  const { anonymized, mapping, counts } = anonymize(input, { names: ["Inexistant"] });

  assert.equal(anonymized, "Texte sans rien de sensible.");
  assert.deepEqual(mapping, {});
  assert.deepEqual(counts, {});
});
test("anonymize: name matching is case-insensitive (NOM)", () => {
  // Headers and signatures often use all-caps, while the caller still passes
  // the canonical casing of the name.
  const input = "JEAN DUPONT, directeur. Jean Dupont a signé. jean dupont aussi.";

  const { counts, anonymized, mapping } = anonymize(input, { names: ["Jean Dupont"] });

  assert.equal(counts.NOM, 1, "all three variants share one pseudonym");

  // All three occurrences must have been replaced.
  const replaced = anonymized.split("[NOM_1]").length - 1;
  assert.equal(replaced, 3);

  // The mapping keeps the first textual form encountered.
  assert.equal(mapping["[NOM_1]"], "JEAN DUPONT");
});
test("anonymize: custom terms remain case-sensitive (CUSTOM)", () => {
  // CUSTOM matching is deliberately exact: the caller passes the precise form
  // they want removed, so other casings must be left alone.
  const input = "ACME Corp vs acme corp vs Acme Corp";

  const result = anonymize(input, { customTerms: ["ACME Corp"] });

  assert.equal(result.counts.CUSTOM, 1);
  assert.match(result.anonymized, /^\[CUSTOM_1\] vs acme corp vs Acme Corp$/);
});
test("anonymize: case-insensitive names preserve word boundary", () => {
  // Even with case folding, "jean" must not match inside "JEANNE".
  const input = "JEANNE était là . Jean repartit.";

  const { counts, anonymized } = anonymize(input, { names: ["jean"] });

  assert.equal(counts.NOM, 1);
  assert.match(anonymized, /JEANNE était là \. \[NOM_1\] repartit\./);
});
test("anonymize: pre-existing pseudonyms in input are preserved and counters offset", () => {
  // Re-processing a partly redacted document: the literal [NOM_1] already in
  // the text stays untouched, and new redactions start numbering at [NOM_2].
  const input = "Bonjour [NOM_1], Bob a téléphoné.";

  const { counts, anonymized, mapping } = anonymize(input, { names: ["Bob"] });

  assert.equal(counts.NOM, 1, "only Bob is a new redaction");

  // The pre-existing literal survives verbatim next to the new pseudonym.
  assert.match(anonymized, /Bonjour \[NOM_1\], \[NOM_2\] a téléphoné\./);

  assert.equal(mapping["[NOM_2]"], "Bob");
  assert.equal(mapping["[NOM_1]"], undefined, "existing literal not added to mapping");
});
test("anonymize: pre-existing pseudonyms of multiple types each offset independently", () => {
  // The input already contains [EMAIL_3] and [TEL_1] literals, followed by one
  // fresh email and one fresh phone number.
  // Fixture text repaired: "là" had been corrupted by a bad encoding round-trip.
  const text = "[EMAIL_3] et [TEL_1] sont là. Mail: x@y.fr et phone: 0612345678.";
  const r = anonymize(text, {});

  // Only the newly detected values are counted.
  assert.equal(r.counts.EMAIL, 1);
  assert.equal(r.counts.TEL, 1);

  // New EMAIL starts at index 4 (past existing 3), new TEL at 2 (past 1).
  assert.equal(r.mapping["[EMAIL_4]"], "x@y.fr");
  assert.equal(r.mapping["[TEL_2]"], "0612345678");
});
test("anonymize: international flag enables non-FR phones, off by default", () => {
  // One French, one German and one US number in the same text.
  const input = "FR +33 6 12 34 56 78 ; DE +49 30 12345678 ; US +1 555 123 4567";

  // Without the flag, only the +33 number is redacted.
  const withoutFlag = anonymize(input, {});
  assert.equal(withoutFlag.counts.TEL, 1);

  // With the flag, all three distinct numbers are redacted.
  const withFlag = anonymize(input, { international: true });
  assert.equal(withFlag.counts.TEL, 3);
});
test("anonymize: international does not double-count the +33 form", () => {
  // With international on, a +33 number is caught by both regexes; dedup must
  // leave exactly one pseudonym.
  const input = "Number: +33 6 12 34 56 78 only once.";

  const { counts } = anonymize(input, { international: true });

  assert.equal(counts.TEL, 1);
});
test("anonymize: handles accented names", () => {
  // Names containing accented letters must be matched and redacted like any
  // other name.
  const text = "Hélène et François vont à Paris.";
  const r = anonymize(text, { names: ["Hélène", "François"] });

  assert.equal(r.counts.NOM, 2);

  // The expected pattern must spell "à" exactly as the input does; a
  // mis-encoded "à" here could never match the redacted output.
  assert.match(r.anonymized, /\[NOM_1\] et \[NOM_2\] vont à Paris\./);
});
test("anonymize: modelSpans injected — new ADDRESS / DATE / IDDOC types", () => {
  // Simulate output from piiModel.ts without loading the actual model.
  // Fixture text repaired: a mis-encoded "Né" added one character and shifted
  // every hard-coded offset below by one.
  const text = "Né le 12/03/1990 au 4 rue de Paris. Passeport: 12AB34567.";
  const modelSpans = [
    { start: 6, end: 16, type: "DATE" as const, value: "12/03/1990" },
    { start: 20, end: 34, type: "ADDRESS" as const, value: "4 rue de Paris" },
    { start: 47, end: 56, type: "IDDOC" as const, value: "12AB34567" },
  ];

  // Fixture integrity: each span's offsets must select its declared value, so
  // a future drift between text and offsets fails here with a clear message.
  for (const span of modelSpans) {
    assert.equal(
      text.slice(span.start, span.end),
      span.value,
      `fixture offsets for ${span.type} must select its value`,
    );
  }

  const r = anonymize(text, { modelSpans });

  assert.equal(r.counts.DATE, 1);
  assert.equal(r.counts.ADDRESS, 1);
  assert.equal(r.counts.IDDOC, 1);
  assert.match(r.anonymized, /\[DATE_1\]/);
  assert.match(r.anonymized, /\[ADDRESS_1\]/);
  assert.match(r.anonymized, /\[IDDOC_1\]/);
});
test("anonymize: passport numbers are redacted without the ML model", () => {
  // No modelSpans are supplied: the passport number must still be redacted.
  const input = "Passeport n° 19FH84235, dossier ouvert.";

  const { counts, anonymized } = anonymize(input, {});

  assert.equal(counts.IDDOC, 1);
  assert.equal(anonymized, "Passeport n° [IDDOC_1], dossier ouvert.");
});
test("anonymize: labelled addresses are redacted without the ML model", () => {
  // No modelSpans are supplied: an address introduced by an "Adresse :" label
  // must still be redacted, alongside the email that follows.
  const input = "Adresse : 27 rue de la République, 69002 Lyon. Contact : a@b.fr";

  const { counts, anonymized } = anonymize(input, {});

  assert.equal(counts.ADDRESS, 1);
  assert.equal(anonymized, "Adresse : [ADDRESS_1]. Contact : [EMAIL_1]");
});
test("anonymize: regex span wins over overlapping model span (same length)", () => {
  // The regex detects "alice@ex.com" as EMAIL, and the model flags the very
  // same span. Insertion order puts the regex first, so it wins on ties.
  const email = "alice@ex.com";
  const text = "Contact: alice@ex.com";
  const spanStart = text.indexOf(email);

  // The model tries to override the span with a different type; it must lose.
  const conflictingSpan = {
    start: spanStart,
    end: spanStart + email.length,
    type: "NOM" as const,
    value: email,
  };

  const { counts } = anonymize(text, { modelSpans: [conflictingSpan] });

  assert.equal(counts.EMAIL, 1);
  assert.equal(counts.NOM, undefined, "model NOM must not override regex EMAIL");
});
test("anonymize: model ADDRESS coreference is case + whitespace insensitive", () => {
  // The same address written with two different casings, both reported by the
  // (simulated) model as ADDRESS spans.
  const input = "Adresse: 4 Rue De Paris. Aussi: 4 rue de paris.";
  const titleCased = { start: 9, end: 23, type: "ADDRESS" as const, value: "4 Rue De Paris" };
  const lowerCased = { start: 32, end: 46, type: "ADDRESS" as const, value: "4 rue de paris" };

  const result = anonymize(input, { modelSpans: [titleCased, lowerCased] });

  assert.equal(result.counts.ADDRESS, 1, "both writings collapse to one pseudonym");
});
test("anonymize: pre-existing [ADDRESS_N] / [DATE_N] literals offset counters", () => {
  // Checks that pre-existing pseudonym literals of the model-provided types
  // are taken into account when numbering new redactions.
  const text = "Voir [ADDRESS_3] et [DATE_2].";

  // Probe with a contrived ADDRESS span appended at end-of-text: since
  // [ADDRESS_3] already exists, the new redaction must be numbered 4.
  // NOTE(review): only the ADDRESS counter is exercised here; the [DATE_2]
  // literal is present in the input but no new DATE span probes its offset.
  const appended = "12 rue X";
  const r = anonymize(text + appended, {
    modelSpans: [
      {
        start: text.length,
        end: text.length + appended.length,
        type: "ADDRESS" as const,
        value: appended,
      },
    ],
  });

  assert.equal(r.mapping["[ADDRESS_4]"], "12 rue X");
});