import { test } from "node:test";
import { strict as assert } from "node:assert";
import { anonymize } from "./anonymizer.ts";

test("anonymize: returns the expected output shape", () => {
  // Smoke test: a single e-mail goes in, and the result must expose the three
  // fields `anonymized`, `mapping` and `counts` with the expected contents.
  const email = "alice@example.com";
  const { anonymized, mapping, counts } = anonymize(`Hello ${email}`, {});

  // Field types first.
  assert.ok(typeof anonymized === "string");
  assert.ok(typeof mapping === "object");
  assert.ok(typeof counts === "object");

  // Then the actual redaction: one EMAIL, replaced in the text and recorded
  // in the pseudonym -> original mapping.
  assert.equal(counts.EMAIL, 1);
  assert.equal(anonymized, "Hello [EMAIL_1]");
  assert.equal(mapping["[EMAIL_1]"], email);
});

test("anonymize: stable pseudonyms within a call", () => {
  // The same e-mail appears twice; both occurrences must get one pseudonym.
  const input = "Mail alice@example.com puis re-mail alice@example.com.";
  const expected = "Mail [EMAIL_1] puis re-mail [EMAIL_1].";

  const { counts, anonymized } = anonymize(input, {});

  assert.equal(counts.EMAIL, 1);
  assert.equal(anonymized, expected);
});

test("anonymize: coreference across phone formats", () => {
  // One number written three ways: national 0-prefix, +33, and +33 (0).
  // All three are expected to resolve to the same pseudonym.
  const national = "06 12 34 56 78";
  const input = `Appelle au ${national}, +33 6 12 34 56 78 ou +33 (0)6 12 34 56 78.`;

  const out = anonymize(input, {});

  assert.equal(out.counts.TEL, 1, "all three phone forms collapse to one pseudo");
  // The mapping is expected to hold the spelling that appears first in the text.
  assert.equal(out.mapping["[TEL_1]"], national);
  assert.equal(out.anonymized, "Appelle au [TEL_1], [TEL_1] ou [TEL_1].");
});

test("anonymize: coreference across IBAN case and spacing", () => {
  // The same IBAN, once compact/upper-case and once grouped/lower-case.
  const compact = "FR1420041010050500013M02606";
  const grouped = "fr14 2004 1010 0505 0001 3m02 606";

  const { counts, mapping } = anonymize(`Vire sur ${compact} ou sur ${grouped}.`, {});

  assert.equal(counts.IBAN, 1);
  // The first-seen (compact) spelling is the one recorded in the mapping.
  assert.equal(mapping["[IBAN_1]"], compact);
});

test("anonymize: email coreference is case-insensitive", () => {
  // Two spellings of one address that differ only in letter case.
  const input = "Mail Alice@Example.COM puis alice@example.com.";
  const { counts } = anonymize(input, {});
  assert.equal(counts.EMAIL, 1);
});

test("anonymize: names with word boundary", () => {
  // "Jean" must be redacted only as a whole word: "Jeanne" (which merely
  // starts with it) and the unrelated word "majeur" have to survive intact.
  const input = "Jean a parlé à Jeanne, c'est majeur. Puis Jean repart.";

  const { counts, anonymized } = anonymize(input, { names: ["Jean"] });

  assert.equal(counts.NOM, 1);
  assert.match(anonymized, /Jeanne/);
  assert.match(anonymized, /majeur/);
  // Splitting on the pseudonym yields (occurrences + 1) pieces.
  const redacted = anonymized.split("[NOM_1]").length - 1;
  assert.equal(redacted, 2, "should redact both standalone 'Jean'");
});

test("anonymize: longer name wins over shorter overlap", () => {
  // "Jean" is a prefix of "Jean Dupont"; where both match at the same place,
  // the compound name must be redacted as one unit.
  const fullName = "Jean Dupont";
  const input = "Jean a appelé Jean Dupont au sujet de Jean.";

  const { counts, mapping } = anonymize(input, { names: ["Jean", fullName] });

  // Exactly two pseudonyms: one for standalone "Jean", one for the full name.
  assert.equal(counts.NOM, 2);
  // The full name must appear as a mapping value of its own, i.e. it was not
  // broken up into separate NOM entries.
  const pseudoForFullName = Object.keys(mapping).find(
    (pseudo) => mapping[pseudo] === fullName,
  );
  assert.ok(pseudoForFullName, "Jean Dupont mapped");
});

test("anonymize: custom_terms and configCustomTerms both apply", () => {
  // One term is supplied through each option; both must be redacted as CUSTOM.
  const input = "ACME Corp paie sur projet Hydra-7 demain.";
  const options = {
    customTerms: ["ACME Corp"],
    configCustomTerms: ["Hydra-7"],
  };

  const { counts, anonymized } = anonymize(input, options);

  assert.equal(counts.CUSTOM, 2);
  // The pattern accepts any single-digit index for either term.
  assert.match(anonymized, /\[CUSTOM_\d\] paie sur projet \[CUSTOM_\d\]/);
});

test("anonymize: detector overrides custom term on identical span", () => {
  // The same span is both a valid French phone number and a caller-supplied
  // custom term. Per the original author's note, detectors run before custom
  // terms and de-duplication is stable by insertion order, so the more
  // specific TEL type is expected to win and no CUSTOM entry should appear.
  const phone = "06 12 34 56 78";

  const { counts } = anonymize(`Tel: ${phone}`, { customTerms: [phone] });

  assert.equal(counts.TEL, 1);
  assert.equal(counts.CUSTOM, undefined);
});

test("anonymize: pseudo numbering is in reading order", () => {
  // Names are passed alphabetically (Alice, Bob, Carol) but appear in the
  // text as Bob, Carol, Alice. Indices must follow the text, not the options.
  const input = "Bob bob@ex.com Carol carol@ex.com Alice alice@ex.com";
  const { mapping } = anonymize(input, { names: ["Alice", "Bob", "Carol"] });

  // Pseudonym -> original value, listed in the order the assertions run.
  const expectedInReadingOrder = [
    ["[EMAIL_1]", "bob@ex.com"],
    ["[EMAIL_2]", "carol@ex.com"],
    ["[EMAIL_3]", "alice@ex.com"],
    ["[NOM_1]", "Bob"],
    ["[NOM_2]", "Carol"],
    ["[NOM_3]", "Alice"],
  ] as const;

  for (const [pseudo, original] of expectedInReadingOrder) {
    assert.equal(mapping[pseudo], original);
  }
});

test("anonymize: NIR Corsica end-to-end", () => {
  // A NIR whose department field is the alphanumeric "2A", followed by a
  // custom term in the same sentence; both must be redacted.
  const input = "Sécu: 2 89 04 2A 342 163 90, employeur ACME.";

  const { counts, anonymized } = anonymize(input, { customTerms: ["ACME"] });

  assert.equal(counts.NIR, 1);
  assert.match(anonymized, /Sécu: \[NIR_1\], employeur \[CUSTOM_1\]\./);
});

test("anonymize: empty input produces empty mapping", () => {
  // Nothing in the text is expected to match (the supplied name does not
  // occur), so the text must come back verbatim with no mapping and no counts.
  const untouched = "Texte sans rien de sensible.";

  const result = anonymize(untouched, { names: ["Inexistant"] });

  assert.equal(result.anonymized, untouched);
  assert.deepEqual(result.mapping, {});
  assert.deepEqual(result.counts, {});
});

test("anonymize: name matching is case-insensitive (NOM)", () => {
  // The name is supplied in canonical case but the text also contains an
  // all-caps form (typical of headers/signatures) and an all-lowercase form.
  const input = "JEAN DUPONT, directeur. Jean Dupont a signé. jean dupont aussi.";

  const { counts, anonymized, mapping } = anonymize(input, { names: ["Jean Dupont"] });

  assert.equal(counts.NOM, 1, "all three variants share one pseudonym");
  // All three occurrences must have been replaced.
  const replaced = anonymized.split("[NOM_1]").length - 1;
  assert.equal(replaced, 3);
  // The mapping records the first form found in the text, not the option value.
  assert.equal(mapping["[NOM_1]"], "JEAN DUPONT");
});

test("anonymize: custom terms remain case-sensitive (CUSTOM)", () => {
  // Unlike names, a custom term is matched exactly as the caller wrote it:
  // only the first of the three case variants below may be redacted.
  const input = "ACME Corp vs acme corp vs Acme Corp";

  const { counts, anonymized } = anonymize(input, { customTerms: ["ACME Corp"] });

  assert.equal(counts.CUSTOM, 1);
  assert.match(anonymized, /^\[CUSTOM_1\] vs acme corp vs Acme Corp$/);
});

test("anonymize: case-insensitive names preserve word boundary", () => {
  // Case folding must not weaken the whole-word rule: lowercase "jean" may
  // match "Jean" but never the leading part of "JEANNE".
  const input = "JEANNE était là. Jean repartit.";

  const { counts, anonymized } = anonymize(input, { names: ["jean"] });

  assert.equal(counts.NOM, 1);
  assert.match(anonymized, /JEANNE était là\. \[NOM_1\] repartit\./);
});

test("anonymize: pre-existing pseudonyms in input are preserved and counters offset", () => {
  // Simulates re-processing a partly redacted document: the input already
  // contains a literal [NOM_1], so the newly redacted name must become [NOM_2].
  const input = "Bonjour [NOM_1], Bob a téléphoné.";

  const { counts, anonymized, mapping } = anonymize(input, { names: ["Bob"] });

  assert.equal(counts.NOM, 1, "only Bob is a new redaction");
  // The pre-existing literal is left verbatim in the output.
  assert.match(anonymized, /Bonjour \[NOM_1\], \[NOM_2\] a téléphoné\./);
  assert.equal(mapping["[NOM_2]"], "Bob");
  assert.equal(mapping["[NOM_1]"], undefined, "existing literal not added to mapping");
});

test("anonymize: pre-existing pseudonyms of multiple types each offset independently", () => {
  // The input already carries [EMAIL_3] and [TEL_1]; each type keeps its own
  // counter, so the new e-mail is #4 while the new phone number is #2.
  const input = "[EMAIL_3] et [TEL_1] sont là. Mail: x@y.fr et phone: 0612345678.";

  const { counts, mapping } = anonymize(input, {});

  // Counts cover only the newly created pseudonyms.
  assert.equal(counts.EMAIL, 1);
  assert.equal(counts.TEL, 1);
  assert.equal(mapping["[EMAIL_4]"], "x@y.fr");
  assert.equal(mapping["[TEL_2]"], "0612345678");
});

test("anonymize: international flag enables non-FR phones, off by default", () => {
  // One French, one German and one US number in the same text.
  const input = "FR +33 6 12 34 56 78 ; DE +49 30 12345678 ; US +1 555 123 4567";

  // Without the flag only the +33 number is expected to be redacted.
  const withoutFlag = anonymize(input, {});
  assert.equal(withoutFlag.counts.TEL, 1);

  // With `international: true` all three numbers get distinct pseudonyms.
  const withFlag = anonymize(input, { international: true });
  assert.equal(withFlag.counts.TEL, 3);
});

test("anonymize: international does not double-count the +33 form", () => {
  // Per the original author's note, with `international` enabled a +33 number
  // is matched by both phone patterns; de-duplication must leave exactly one
  // pseudonym for it.
  const input = "Number: +33 6 12 34 56 78 only once.";

  const { counts } = anonymize(input, { international: true });

  assert.equal(counts.TEL, 1);
});

test("anonymize: handles accented names", () => {
  // Names containing accented letters and a cedilla must match as whole
  // words and be numbered in the order they appear.
  const input = "Hélène et François vont à Paris.";

  const { counts, anonymized } = anonymize(input, { names: ["Hélène", "François"] });

  assert.equal(counts.NOM, 2);
  assert.match(anonymized, /\[NOM_1\] et \[NOM_2\] vont à Paris\./);
});

test("anonymize: modelSpans injected — new ADDRESS / DATE / IDDOC types", () => {
  // Hand-built spans stand in for the output of piiModel.ts so the test does
  // not need to load the actual model.
  const text = "Né le 12/03/1990 au 4 rue de Paris. Passeport: 12AB34567.";

  // Builds a span for the first occurrence of `value` in `text`. For the
  // three values below this yields offsets 6-16, 20-34 and 47-56.
  const spanFor = <T extends string>(type: T, value: string) => {
    const start = text.indexOf(value);
    return { start, end: start + value.length, type, value };
  };
  const modelSpans = [
    spanFor("DATE", "12/03/1990"),
    spanFor("ADDRESS", "4 rue de Paris"),
    spanFor("IDDOC", "12AB34567"),
  ];

  const { counts, anonymized } = anonymize(text, { modelSpans });

  // Each injected type is counted once...
  assert.equal(counts.DATE, 1);
  assert.equal(counts.ADDRESS, 1);
  assert.equal(counts.IDDOC, 1);
  // ...and its pseudonym shows up in the output text.
  assert.match(anonymized, /\[DATE_1\]/);
  assert.match(anonymized, /\[ADDRESS_1\]/);
  assert.match(anonymized, /\[IDDOC_1\]/);
});

test("anonymize: passport numbers are redacted without the ML model", () => {
  // No `modelSpans` are supplied, so the IDDOC match has to come from the
  // built-in detection alone.
  const input = "Passeport n° 19FH84235, dossier ouvert.";
  const expected = "Passeport n° [IDDOC_1], dossier ouvert.";

  const { counts, anonymized } = anonymize(input, {});

  assert.equal(counts.IDDOC, 1);
  assert.equal(anonymized, expected);
});

test("anonymize: labelled addresses are redacted without the ML model", () => {
  // No `modelSpans` are supplied. The address follows an "Adresse :" label
  // and must be redacted up to (but not including) the sentence-ending dot;
  // the trailing e-mail is redacted as well.
  const input = "Adresse : 27 rue de la République, 69002 Lyon. Contact : a@b.fr";
  const expected = "Adresse : [ADDRESS_1]. Contact : [EMAIL_1]";

  const { counts, anonymized } = anonymize(input, {});

  assert.equal(counts.ADDRESS, 1);
  assert.equal(anonymized, expected);
});

test("anonymize: regex span wins over overlapping model span (same length)", () => {
  // The e-mail below is detected as EMAIL by the built-in detection. A model
  // span covering exactly the same characters claims it is a NOM instead.
  // Per the original author's note, regex spans are inserted first and
  // therefore win ties, so the model's conflicting type must be discarded.
  const text = "Contact: alice@ex.com";
  const email = "alice@ex.com";
  const start = text.indexOf(email);
  const conflictingModelSpan = {
    start,
    end: start + email.length,
    type: "NOM" as const,
    value: email,
  };

  const { counts } = anonymize(text, { modelSpans: [conflictingModelSpan] });

  assert.equal(counts.EMAIL, 1);
  assert.equal(counts.NOM, undefined, "model NOM must not override regex EMAIL");
});

test("anonymize: model ADDRESS coreference is case + whitespace insensitive", () => {
  // Two model-provided ADDRESS spans whose values differ only in letter case.
  const text = "Adresse: 4 Rue De Paris. Aussi: 4 rue de paris.";

  // `indexOf` is case-sensitive, so each spelling resolves to its own
  // position: 9-23 for the capitalised form and 32-46 for the lowercase one.
  const modelSpans = ["4 Rue De Paris", "4 rue de paris"].map((value) => {
    const start = text.indexOf(value);
    return { start, end: start + value.length, type: "ADDRESS" as const, value };
  });

  const { counts } = anonymize(text, { modelSpans });

  assert.equal(counts.ADDRESS, 1, "both writings collapse to one pseudonym");
});

test("anonymize: pre-existing [ADDRESS_N] / [DATE_N] literals offset counters", () => {
  // The input already contains pseudonym-shaped literals for the ADDRESS and
  // DATE types. To probe the counter offset, a new address is appended after
  // them and flagged through a model span; it must be numbered past the
  // existing [ADDRESS_3], i.e. become [ADDRESS_4].
  //
  // NOTE(review): only the ADDRESS counter is exercised here. The [DATE_2]
  // literal is present in the input but no new DATE is redacted, so the DATE
  // offset is not actually asserted by this test.
  const prefix = "Voir [ADDRESS_3] et [DATE_2].";
  const address = "12 rue X";

  const r = anonymize(prefix + address, {
    modelSpans: [
      {
        // The span covers exactly the appended address at the end of the text.
        start: prefix.length,
        end: prefix.length + address.length,
        type: "ADDRESS" as const,
        value: address,
      },
    ],
  });

  assert.equal(r.mapping["[ADDRESS_4]"], address);
});