// Project Files
// eval / run-ai4privacy.ts
// Local-only evaluation against the AI4Privacy pii-masking-400k FR subset.
// Reports precision/recall per detector type.
//
// IMPORTANT — licence: the AI4Privacy dataset is shared under a proprietary
// licence (academic / non-commercial, no redistribution, watermarked). This
// script reads files in ./datasets/ai4privacy-400k/ which is gitignored. It
// emits ONLY aggregate metrics. Do NOT commit individual samples, extracts,
// or per-row breakdowns derived from the data.
//
// Run: npm run eval:ai4privacy
import { createReadStream, existsSync } from "node:fs";
import { createInterface } from "node:readline";
import { detectAll, type Span, type SpanType } from "../src/detectors.ts";
// One gold annotation taken from a sample's `privacy_mask`. `start`/`end` are
// compared directly against detector spans (presumably half-open character
// offsets into `source_text` — confirm against the dataset card). `label` is
// the dataset's own label name; `value` is not read by this script.
type GoldSpan = { start: number; end: number; label: string; value: string };
// One JSONL row, restricted to the fields this script reads.
type Sample = { source_text: string; locale: string; privacy_mask: GoldSpan[] };
// AI4Privacy label → our detector type. Unmapped labels are out-of-scope:
// they are tallied separately and excluded from precision/recall.
const LABEL_MAP: Record<string, SpanType> = {
EMAIL: "EMAIL",
TELEPHONENUM: "TEL",
CREDITCARDNUMBER: "CB",
SOCIALNUM: "NIR", // valid only on locale=="FR" (CH SOCIALNUM is AVS, not NIR)
};
// Detector types that are scored; predictions of any other type are ignored.
// Also fixes the row order of the printed metrics table.
const EVAL_TYPES: SpanType[] = ["EMAIL", "TEL", "CB", "NIR"];
// JSONL inputs, processed in this order. The paths are relative, so the
// script is expected to be started from the repository root.
const FILES = [
"datasets/ai4privacy-400k/data/train/fr.jsonl",
"datasets/ai4privacy-400k/data/validation/fr.jsonl",
];
// Predictions and gold are counted independently so two overlapping
// predictions on the same gold can't inflate the TP count.
// Precision is reported as predMatched / predTotal and recall as
// goldMatched / goldTotal.
type Counts = {
predTotal: number;
predMatched: number; // predictions overlapping any gold of same type
goldTotal: number;
goldMatched: number; // golds overlapped by any prediction of same type
};
// Aggregate results for one locale. Only totals are kept — never individual
// samples — in line with the dataset licence note at the top of the file.
type LocaleStats = {
samples: number; // number of samples evaluated for this locale
counts: Record<SpanType, Counts>; // per-detector-type match counters
outOfScope: Record<string, number>; // gold label -> occurrences not evaluated
};
// Builds a fresh counter table holding one zeroed `Counts` record for every
// evaluated detector type.
function newCounts(): Record<SpanType, Counts> {
  return EVAL_TYPES.reduce(
    (table, type) => {
      table[type] = { predTotal: 0, predMatched: 0, goldTotal: 0, goldMatched: 0 };
      return table;
    },
    {} as Record<SpanType, Counts>,
  );
}
// Creates the empty accumulator for one locale: no samples seen yet, zeroed
// per-type counters and an empty out-of-scope label tally.
function newLocaleStats(): LocaleStats {
  const emptyStats: LocaleStats = {
    samples: 0,
    counts: newCounts(),
    outOfScope: {},
  };
  return emptyStats;
}
// Reports whether two spans share at least one position. Spans are treated
// as half-open: a span that starts exactly where the other one ends does not
// overlap it.
function overlap(a: { start: number; end: number }, b: { start: number; end: number }): boolean {
  const aStartsBeforeBEnds = a.start < b.end;
  if (!aStartsBeforeBEnds) return false;
  return b.start < a.end;
}
// Opt-in switch read once at start-up: true only when the environment
// variable ANONYMIZE_INTL is exactly "1". It is forwarded to the detectors as
// their `international` option and changes which types the CH specificity
// check reports.
const INTERNATIONAL = process.env.ANONYMIZE_INTL === "1";
// Scores the detector output for one sample against its gold annotations and
// adds the result to the running counters.
//
// Matching is by span overlap between a prediction and a gold span of the
// same type. Predictions and golds are counted in two independent passes, so
// several predictions hitting the same gold cannot inflate the match count.
//
// sample     - parsed dataset row (text, locale, gold spans)
// counts     - per-type counters for the sample's locale; mutated in place
// outOfScope - tally of gold labels that are not evaluated; mutated in place
function evaluateSample(sample: Sample, counts: Record<SpanType, Counts>, outOfScope: Record<string, number>) {
  const predicted: Span[] = detectAll(sample.source_text, { international: INTERNATIONAL });

  // Bucket the gold spans by the detector type they map to.
  const goldByType = new Map<SpanType, GoldSpan[]>();
  for (const t of EVAL_TYPES) goldByType.set(t, []);
  for (const g of sample.privacy_mask) {
    const mapped = LABEL_MAP[g.label];
    // NIR is the French identifier. A label that maps to NIR on a non-FR
    // sample (e.g. a Swiss AVS tagged SOCIALNUM) is a different kind of
    // number: treating it as NIR gold would let a false positive on it count
    // as a match, so it is tallied as out-of-scope instead.
    const nirOutsideFrance = mapped === "NIR" && sample.locale !== "FR";
    if (mapped && !nirOutsideFrance) {
      goldByType.get(mapped)!.push(g);
    } else {
      outOfScope[g.label] = (outOfScope[g.label] ?? 0) + 1;
    }
  }

  // Precision side: every in-scope prediction, matched if it overlaps any
  // gold span of its own type.
  for (const p of predicted) {
    if (!EVAL_TYPES.includes(p.type)) continue;
    counts[p.type].predTotal += 1;
    if (goldByType.get(p.type)!.some((g) => overlap(p, g))) counts[p.type].predMatched += 1;
  }

  // Recall side: every gold span, matched if any prediction of the same type
  // overlaps it.
  for (const t of EVAL_TYPES) {
    for (const g of goldByType.get(t)!) {
      counts[t].goldTotal += 1;
      if (predicted.some((p) => p.type === t && overlap(p, g))) counts[t].goldMatched += 1;
    }
  }
}
// Streams one JSONL file line by line and folds every sample whose locale is
// tracked in `byLocale` into that locale's statistics. Samples of any other
// locale are counted in the total but otherwise skipped.
//
// A missing file is reported on stderr and skipped. A malformed line aborts
// the run with an error naming the file and line number.
//
// path     - JSONL file to read
// byLocale - accumulators keyed by locale; the matching entry is mutated
async function processFile(path: string, byLocale: Map<string, LocaleStats>) {
  if (!existsSync(path)) {
    console.error(`Missing file: ${path}`);
    console.error("Run the AI4Privacy download first (see README). Skipping.");
    return;
  }
  let total = 0;
  let kept = 0;
  let lineNo = 0;
  const rl = createInterface({ input: createReadStream(path, { encoding: "utf-8" }), crlfDelay: Infinity });
  for await (const line of rl) {
    lineNo += 1;
    // trim() also strips a leading byte-order mark, and lets whitespace-only
    // lines be skipped instead of failing the JSON parse.
    const text = line.trim();
    if (text === "") continue;
    total += 1;

    let parsed: unknown;
    try {
      parsed = JSON.parse(text);
    } catch {
      // The parser's own message is deliberately dropped: it may quote part
      // of the offending line, and this script must not emit sample content.
      throw new Error(`${path}:${lineNo}: line is not valid JSON`);
    }
    if (typeof parsed !== "object" || parsed === null) {
      throw new Error(`${path}:${lineNo}: expected a JSON object`);
    }
    const row = parsed as Partial<Sample>;

    // Locale filter first, so rows of untracked locales are never validated.
    const stats = byLocale.get(row.locale as string);
    if (stats === undefined) continue;
    if (typeof row.source_text !== "string" || !Array.isArray(row.privacy_mask)) {
      throw new Error(`${path}:${lineNo}: sample is missing source_text or privacy_mask`);
    }

    stats.samples += 1;
    kept += 1;
    evaluateSample(row as Sample, stats.counts, stats.outOfScope);
  }
  console.error(` ${path}: ${kept}/${total} samples evaluated`);
}
// Formats the ratio n/d as a percentage with two decimals, left-padded to at
// least five characters before the "%" sign. A zero denominator yields the
// "n/a" placeholder instead of a number.
function pct(n: number, d: number): string {
  if (d !== 0) {
    const percentage = (n / d) * 100;
    return `${percentage.toFixed(2).padStart(5, " ")}%`;
  }
  return " n/a";
}
// Formats the F1 score (harmonic mean of precision `p` and recall `r`, both
// given as fractions) as a percentage with two decimals. When both inputs are
// zero the score is undefined and the "n/a" placeholder is returned.
function f1(p: number, r: number): string {
  const sum = p + r;
  if (sum !== 0) {
    const harmonicMean = (2 * p * r) / (p + r);
    const asPercent = harmonicMean * 100;
    return `${asPercent.toFixed(2).padStart(5, " ")}%`;
  }
  return " n/a";
}
// Prints one locale's per-type metrics to stdout as a fixed-width table: a
// title line, a header, a separator and one row per evaluated type.
//
// label - human-readable title for the table
// stats - aggregate results for the locale being reported
function printMetricsTable(label: string, stats: LocaleStats) {
  // Column widths mirror the dash runs of the separator line printed below,
  // so header, separator and data rows all share one grid.
  const columns = [
    { title: "type", width: 7 },
    { title: "gold", width: 6 },
    { title: "pred", width: 5 },
    { title: "pred_match", width: 11 },
    { title: "gold_match", width: 11 },
    { title: "precision", width: 10 },
    { title: "recall", width: 8 },
    { title: "F1", width: 7 },
  ];
  // The first column is left-aligned text; all others are right-aligned.
  const formatRow = (cells: string[]): string =>
    columns
      .map(({ width }, i) => {
        const cell = cells[i] ?? "";
        return i === 0 ? cell.padEnd(width) : cell.padStart(width);
      })
      .join(" | ");

  console.log("");
  console.log(`${label} — ${stats.samples} samples`);
  console.log(formatRow(columns.map((column) => column.title)));
  console.log("--------+--------+-------+-------------+-------------+------------+----------+--------");
  for (const t of EVAL_TYPES) {
    const c = stats.counts[t];
    // F1 needs numeric inputs, so an undefined ratio is taken as 0 here; the
    // precision and recall cells themselves still print "n/a" via pct().
    const precision = c.predTotal === 0 ? 0 : c.predMatched / c.predTotal;
    const recall = c.goldTotal === 0 ? 0 : c.goldMatched / c.goldTotal;
    console.log(
      formatRow([
        t,
        String(c.goldTotal),
        String(c.predTotal),
        String(c.predMatched),
        String(c.goldMatched),
        pct(c.predMatched, c.predTotal),
        pct(c.goldMatched, c.goldTotal),
        f1(precision, recall),
      ]),
    );
  }
}
// Prints how often French-specific detectors fired on CH-locale samples, as
// a raw prediction count and as a rate per 1000 samples.
//
// stats - aggregate results for the CH locale
function printSpecificityCH(stats: LocaleStats) {
  // For CH text, French-specific detectors should match nothing (Swiss AVS
  // ≠ NIR FR). With international phones OFF, TEL should also be silent.
  // With international ON, TEL is allowed to fire on +41 numbers, so the
  // metric is reported but is not "specificity" in the strict sense.
  const types: SpanType[] = INTERNATIONAL ? ["NIR"] : ["NIR", "TEL"];
  const label = INTERNATIONAL
    ? "CH specificity check — NIR FR detector should stay silent on Swiss text:"
    : "CH specificity check — French-specific detectors should be silent:";
  console.log("");
  console.log(label);
  // The first header cell is padded to the same width as the row cells.
  console.log(`${"type".padEnd(7)} | predictions | per 1k samples`);
  console.log("--------+-------------+----------------");
  for (const t of types) {
    const pred = stats.counts[t].predTotal;
    // Guard the rate against an empty locale (no CH samples read).
    const per1k = stats.samples === 0 ? 0 : (pred / stats.samples) * 1000;
    console.log(`${t.padEnd(7)} | ${String(pred).padStart(11)} | ${per1k.toFixed(2).padStart(14)}`);
  }
}
// Entry point: streams every dataset file, then prints the aggregate report
// (FR metrics, CH control metrics, CH specificity check and the most frequent
// FR out-of-scope labels). Progress goes to stderr, the report to stdout.
async function main() {
  // Keep direct references to both accumulators so the report code does not
  // need to look them up again in the map.
  const fr = newLocaleStats();
  const ch = newLocaleStats();
  const byLocale = new Map<string, LocaleStats>([
    ["FR", fr],
    ["CH", ch],
  ]);

  console.error("Streaming AI4Privacy FR + CH samples…");
  for (const f of FILES) {
    await processFile(f, byLocale);
  }

  console.log("");
  console.log("AI4Privacy pii-masking-400k — metrics by locale");
  console.log(
    `(span-overlap matching; out-of-scope labels excluded from P/R; ` +
      `international phones: ${INTERNATIONAL ? "ON" : "OFF"})`,
  );
  printMetricsTable("FR locale (France) — primary target", fr);
  printMetricsTable("CH locale (Suisse romande) — control", ch);
  printSpecificityCH(ch);

  console.log("");
  console.log("FR out-of-scope gold labels (entities we don't claim to detect):");
  // Most frequent labels first; only the top 15 are listed, but the total
  // covers every out-of-scope label.
  const sorted = Object.entries(fr.outOfScope).sort((a, b) => b[1] - a[1]);
  for (const [label, n] of sorted.slice(0, 15)) {
    console.log(` ${label.padEnd(20)} ${String(n).padStart(6)}`);
  }
  const totalOoS = sorted.reduce((s, [, n]) => s + n, 0);
  console.log(` ${"(total)".padEnd(20)} ${String(totalOoS).padStart(6)}`);
}
// Run the evaluation; any failure is logged to stderr and ends the process
// with a non-zero exit code.
main().catch((failure: unknown) => {
  console.error(failure);
  process.exit(1);
});