// Project file: scripts/pii-service.ts
// Standalone PII-NER service for the @zexigh/anonymize plugin.
//
// WHY a separate process? The LM Studio app process is signed with strict
// macOS Library Validation — it refuses to load non-Apple native binaries
// like onnxruntime-node and sharp ("different Team IDs"). A standalone
// `node scripts/pii-service.js` runs WITHOUT that restriction and can use
// the native onnxruntime-node binding directly.
//
// USAGE:
// cd anonymize
// npx tsc -p tsconfig.json
// node scripts/pii-service.js
//
// The plugin's `detectWithModel()` issues HTTP POSTs to this service. If
// the service isn't running, the plugin falls back to regex-only redaction.
import * as http from "node:http";
import * as fs from "node:fs";
import * as os from "node:os";
import * as nodePath from "node:path";
import { spawn } from "node:child_process";
import type { AddressInfo } from "node:net";
// ─── Path resolution ──────────────────────────────────────────────────────
/**
 * Locate LM Studio's data root, i.e. the directory that contains
 * `extensions/`.
 *
 * Starting at `startDir`, climbs one parent per step and returns the parent
 * of the first directory whose basename is exactly `extensions`. No
 * filesystem access is performed; this is pure path arithmetic. Choosing a
 * fallback when nothing is found is left to the caller (see resolveLogDir()).
 *
 * @param startDir Directory to start from. Defaults to this script's own
 *   directory.
 * @param maxDepth Maximum number of directories inspected, `startDir`
 *   included. Defaults to 10.
 * @returns The data root, or `null` when the filesystem root or `maxDepth`
 *   is reached first (e.g. the service runs outside the plugin install).
 */
function findLmStudioDataRoot(
  startDir: string = __dirname,
  maxDepth: number = 10,
): string | null {
  let dir = startDir;
  for (let depth = 0; depth < maxDepth; depth++) {
    if (nodePath.basename(dir) === "extensions") return nodePath.dirname(dir);
    const parent = nodePath.dirname(dir);
    // dirname() of the filesystem root is the root itself: nothing left to climb.
    if (parent === dir) return null;
    dir = parent;
  }
  return null;
}
/**
 * Pick the directory that will hold the service's log and PID files.
 *
 * Preference order: the LM Studio data root (when this script lives under an
 * `extensions/` directory), then `~/.lmstudio`, then the OS temp directory,
 * then the home directory. A candidate is accepted once it can be created
 * (recursively, if missing) and is writable. If every candidate fails, the
 * OS temp directory is returned anyway.
 */
function resolveLogDir(): string {
  // True when `dir` exists (creating it if necessary) and is writable.
  const ensureWritable = (dir: string): boolean => {
    try {
      fs.mkdirSync(dir, { recursive: true });
      fs.accessSync(dir, fs.constants.W_OK);
      return true;
    } catch {
      return false;
    }
  };

  const preferred = [
    findLmStudioDataRoot(),
    nodePath.join(os.homedir(), ".lmstudio"),
    os.tmpdir(),
    os.homedir(),
  ].filter((dir): dir is string => Boolean(dir));

  // find() stops at the first usable directory, so later candidates are
  // never created needlessly.
  return preferred.find((dir) => ensureWritable(dir)) ?? os.tmpdir();
}
// Directory for the log file and the PID file, resolved once at start-up.
const RESOLVED_LOG_DIR = resolveLogDir();
// ─── Self-daemonize ───────────────────────────────────────────────────────
// When this script is spawned by the plugin, we play the role of a launcher:
// we re-spawn ourselves with --daemon-child, detached and in a new session,
// then exit. The actual daemon is then orphan-adopted by init/launchd and
// invisible to LM Studio's process tree. This survives any tree-kill that
// LM Studio performs when reloading the plugin.
//
// The daemon-child instance bypasses this block and runs the HTTP server.
//
// Launcher exit code: 0 when the daemon was spawned, 1 when spawning failed.
const IS_DAEMON_CHILD = process.argv.includes("--daemon-child");
if (!IS_DAEMON_CHILD) {
  const serviceLog = nodePath.join(RESOLVED_LOG_DIR, "anonymize-pii-service.log");
  fs.appendFileSync(
    serviceLog,
    `\n--- launcher@${process.pid} re-spawning daemon at ${new Date().toISOString()} ---\n`,
  );
  // Open the log file and hand its FD to the daemon as stdout+stderr, so
  // any console.log inside the daemon ends up in the file.
  const fd = fs.openSync(serviceLog, "a");
  const child = spawn(
    process.execPath,
    // Same script and arguments as the launcher, plus the marker flag.
    [...process.argv.slice(1), "--daemon-child"],
    {
      detached: true,
      stdio: ["ignore", fd, fd],
    },
  );
  if (child.pid === undefined) {
    // spawn() signals a failed launch by leaving `pid` undefined (the reason
    // arrives later as an 'error' event, which we cannot wait for here
    // without falling through into daemon mode). Report failure instead of
    // pretending a daemon exists.
    fs.appendFileSync(
      serviceLog,
      `--- launcher@${process.pid} failed to spawn daemon; launcher exiting ---\n`,
    );
    process.exit(1);
  }
  child.unref();
  // No fs.closeSync(fd): the launcher exits immediately, and the daemon
  // already holds the descriptor as its stdout/stderr.
  fs.appendFileSync(
    serviceLog,
    `--- launcher@${process.pid} → daemon@${child.pid} spawned; launcher exiting ---\n`,
  );
  process.exit(0);
}
// ─── DAEMON MODE — running from here on ───────────────────────────────────
/** Returns `raw` trimmed, or `undefined` when it is unset or blank. */
function nonBlankEnv(raw: string | undefined): string | undefined {
  const trimmed = raw?.trim();
  return trimmed ? trimmed : undefined;
}

/**
 * Parse a TCP port from an environment value.
 *
 * @param raw Raw environment value (may be unset or blank).
 * @param fallback Port used when `raw` is unset, blank, not an integer, or
 *   outside 0–65535.
 */
function parsePort(raw: string | undefined, fallback: number): number {
  const text = nonBlankEnv(raw);
  if (text === undefined) return fallback;
  const port = Number(text);
  return Number.isInteger(port) && port >= 0 && port <= 65535 ? port : fallback;
}

// Bind address. A blank PII_SERVICE_HOST must not reach listen(): an empty
// host makes Node bind to every interface, which would expose the service
// beyond loopback.
const HOST = nonBlankEnv(process.env.PII_SERVICE_HOST) ?? "127.0.0.1";
// Listening port. Blank or malformed PII_SERVICE_PORT falls back to 7878
// instead of turning into port 0 (random port) or NaN.
const PORT = parsePort(process.env.PII_SERVICE_PORT, 7878);
// Hugging Face model identifier loaded by loadModel().
const MODEL_ID = "onnx-community/multilang-pii-ner-ONNX";
// Minimum per-span confidence when the request does not supply a threshold.
const DEFAULT_THRESHOLD = 0.5;
// Splits a tagged label such as "B-CITY" into its tag (B, I or E) and its
// entity name.
const BIO_RE = /^([BIE])-(.+)$/;
// Write a PID file so the plugin (or anyone) can locate or signal the
// running daemon without going through `pgrep`.
const PID_FILE = nodePath.join(RESOLVED_LOG_DIR, "anonymize-pii-service.pid");

/**
 * True when PID_FILE records a process other than this one that still
 * exists. A missing, unreadable or malformed file counts as "no owner".
 */
function pidFileOwnedByLiveDaemon(): boolean {
  let recorded: number;
  try {
    recorded = Number(fs.readFileSync(PID_FILE, "utf8").trim());
  } catch {
    return false;
  }
  if (!Number.isInteger(recorded) || recorded <= 0 || recorded === process.pid) {
    return false;
  }
  try {
    // Signal 0 delivers nothing; it only checks that the process exists.
    process.kill(recorded, 0);
    return true;
  } catch (e) {
    // EPERM: the process exists but belongs to someone else — still alive.
    return e instanceof Error && (e as { code?: string }).code === "EPERM";
  }
}

// Whether THIS process wrote PID_FILE and is therefore responsible for
// removing it. A second daemon that starts while the first one is serving
// cannot bind the port and exits through the 'exit' hook below; without
// this flag it would first overwrite and then delete the PID file of the
// daemon that is still running.
let ownsPidFile = false;
if (pidFileOwnedByLiveDaemon()) {
  console.log(`[pii-service] PID file ${PID_FILE} already belongs to a running daemon — not overwriting it`);
} else {
  try {
    fs.writeFileSync(PID_FILE, String(process.pid));
    ownsPidFile = true;
    console.log(`[pii-service] daemon started, pid=${process.pid}, log dir=${RESOLVED_LOG_DIR}, pid file=${PID_FILE}`);
  } catch (e) {
    console.error(`[pii-service] cannot write PID file ${PID_FILE}: ${e}`);
  }
}

/** Remove PID_FILE, but only if this process created it. Safe to call twice. */
function removeOwnPidFile(): void {
  if (!ownsPidFile) return;
  ownsPidFile = false;
  try {
    fs.unlinkSync(PID_FILE);
  } catch {
    /* already gone — nothing to clean up */
  }
}

// Belt-and-braces: ignore polite kill signals too. SIGKILL remains fatal.
process.on("SIGTERM", () => {
  console.log("[pii-service] received SIGTERM — ignoring (running as daemon)");
});
process.on("SIGHUP", () => {
  console.log("[pii-service] received SIGHUP — ignoring");
});
// SIGINT is the one signal that stops the daemon deliberately.
process.on("SIGINT", () => {
  console.log("[pii-service] received SIGINT — exiting cleanly");
  removeOwnPidFile();
  process.exit(0);
});
// Covers every other exit path (process.exit, uncaught exception).
process.on("exit", () => {
  removeOwnPidFile();
});
/** Category assigned to a detected span. */
type SpanType = "NOM" | "ADDRESS" | "DATE" | "IDDOC";

/** One detected span: `[start, end)` offsets plus the text they cover. */
type Span = {
  /** Offset of the first character of the span. */
  start: number;
  /** Offset one past the last character of the span. */
  end: number;
  type: SpanType;
  /** The covered text, i.e. the slice `[start, end)`. */
  value: string;
};

/** Per-request switches; a category is only reported when its flag is truthy. */
type DetectOptions = {
  detectNames?: boolean;
  detectAddresses?: boolean;
  detectDates?: boolean;
  detectIdDocs?: boolean;
  /** Minimum confidence for a span to be kept. */
  threshold?: number;
};

/** What a model entity label maps to: a span category and its enabling flag. */
type LabelTarget = { type: SpanType; option: keyof DetectOptions };

// Builds a fresh entry per label so no two labels share one object.
const labelTarget = (type: SpanType, option: keyof DetectOptions): LabelTarget => ({
  type,
  option,
});

/**
 * Model entity name (the part after the `B-`/`I-`/`E-` prefix) → span
 * category and the DetectOptions flag that enables it. Entity names missing
 * from this table are ignored by detection.
 */
const LABEL_MAP: Record<string, { type: SpanType; option: keyof DetectOptions }> = {
  // Person names
  GIVENNAME: labelTarget("NOM", "detectNames"),
  SURNAME: labelTarget("NOM", "detectNames"),
  // Postal address parts
  STREET: labelTarget("ADDRESS", "detectAddresses"),
  CITY: labelTarget("ADDRESS", "detectAddresses"),
  BUILDINGNUM: labelTarget("ADDRESS", "detectAddresses"),
  ZIPCODE: labelTarget("ADDRESS", "detectAddresses"),
  // Dates and times
  DATE: labelTarget("DATE", "detectDates"),
  TIME: labelTarget("DATE", "detectDates"),
  // Identity document numbers
  PASSPORTNUM: labelTarget("IDDOC", "detectIdDocs"),
  DRIVERLICENSENUM: labelTarget("IDDOC", "detectIdDocs"),
  IDCARDNUM: labelTarget("IDDOC", "detectIdDocs"),
};
/** What loadModel() resolves to: tokenizer, model and the model's label table. */
type LoadedModel = {
  tokenizer: any;
  model: any;
  /** Class index (as a string key) → label name, taken from the model config. */
  id2label: Record<string, string>;
};

// The in-flight or finished load, shared by all callers; null until the
// first loadModel() call.
let loadPromise: Promise<LoadedModel> | null = null;
// Set to true once the model has finished loading.
let modelReady = false;

// Latest human-readable status, surfaced via GET /health so the plugin can
// forward it into the LM Studio tool-status pill in real time.
let lastStatus = "Starting service…";

/** Record `msg` as the current status and echo it to the service log. */
function setStatus(msg: string): void {
  lastStatus = msg;
  const line = `[pii-service] ${msg}`;
  console.log(line);
}
/**
 * Load the tokenizer and token-classification model once and share the
 * result between all callers.
 *
 * Concurrent callers receive the same in-flight promise. If loading fails,
 * the cached promise is cleared so that the next call starts a fresh
 * attempt instead of replaying the old rejection forever; the failure is
 * also published through setStatus().
 *
 * @returns The tokenizer, the model and the model's `id2label` table.
 * @throws Whatever the transformers import or `from_pretrained` rejects with.
 */
async function loadModel(): Promise<LoadedModel> {
  if (loadPromise) return loadPromise;

  const attempt = (async (): Promise<LoadedModel> => {
    setStatus(`Loading ${MODEL_ID}…`);
    const t0 = Date.now();
    const tx = await import("@huggingface/transformers");

    // Wire transformers.js progress events to our setStatus. transformers.js
    // calls progress_callback many times — we throttle in the formatter.
    const lastPctByFile: Record<string, number> = {};
    // `raw` is whatever the library passes to progress_callback; only the
    // fields read below are relied on.
    const progressCb = (raw: any) => {
      if (raw.status === "progress" && raw.file) {
        // Round down to a multiple of 10 so each file reports at most once
        // per 10% step.
        const pct = typeof raw.progress === "number" ? Math.floor(raw.progress / 10) * 10 : 0;
        if (lastPctByFile[raw.file] === pct) return;
        lastPctByFile[raw.file] = pct;
        const mb = typeof raw.loaded === "number" ? (raw.loaded / 1_048_576).toFixed(0) : "?";
        const total = typeof raw.total === "number" ? (raw.total / 1_048_576).toFixed(0) : "?";
        setStatus(`Downloading ${raw.file}: ${pct}% (${mb}/${total} MB)`);
      } else if (raw.status === "download" && raw.file) {
        setStatus(`Downloading ${raw.file}…`);
      } else if (raw.status === "done" && raw.file) {
        setStatus(`Loaded ${raw.file}`);
      } else if (raw.status === "ready") {
        setStatus("Model ready");
      }
    };

    const tokenizer = await tx.AutoTokenizer.from_pretrained(MODEL_ID, {
      progress_callback: progressCb,
    } as any);
    const model = await tx.AutoModelForTokenClassification.from_pretrained(MODEL_ID, {
      progress_callback: progressCb,
    } as any);
    const id2label = (model.config as { id2label?: Record<string, string> }).id2label ?? {};
    setStatus(`Model ready (${Date.now() - t0} ms, ${Object.keys(id2label).length} labels).`);
    modelReady = true;
    return { tokenizer, model, id2label };
  })();

  loadPromise = attempt;
  // Registered before the promise is handed out, so the cache is cleared
  // before any caller observes the rejection. Callers still receive the
  // original rejection through `attempt`.
  attempt.catch((e: unknown) => {
    if (loadPromise === attempt) loadPromise = null;
    setStatus(`Model load failed: ${e instanceof Error ? e.message : String(e)}`);
  });
  return attempt;
}
/**
 * Run the model on a SINGLE chunk of text (no newlines) and return the raw
 * tagged spans, with offsets relative to that chunk.
 *
 * Character offsets are recovered by decoding ever-longer prefixes of the
 * token ids and measuring the decoded length. That recovery is reliable for
 * newline-free chunks; multi-paragraph input MUST be split BEFORE calling
 * this — see detect().
 *
 * @param chunkText Newline-free text to analyse.
 * @param options Category switches; labels whose flag is falsy are skipped.
 * @param tokenizer Callable tokenizer that also exposes `decode()`.
 * @param model Callable model returning `{ logits }` with dims
 *   `[batch, tokens, labels]`; only the first batch entry is read.
 * @param id2label Class index (string key) → label name.
 * @param threshold A span is kept only if its weakest token scores at least
 *   this much.
 */
async function detectChunk(
  chunkText: string,
  options: DetectOptions,
  tokenizer: any,
  model: any,
  id2label: Record<string, string>,
  threshold: number,
): Promise<Span[]> {
  const encoded = await tokenizer(chunkText, { return_tensors: "pt" });

  // Token ids as plain numbers, whichever accessor the tensor offers.
  const idsTensor: any = encoded.input_ids;
  let rawIds: Array<bigint | number>;
  if (idsTensor.tolist) {
    rawIds = idsTensor.tolist()[0];
  } else {
    rawIds = Array.from(idsTensor.data as ArrayLike<bigint | number>);
  }
  const ids: number[] = rawIds.map((id: bigint | number) => Number(id));

  // Token i covers the characters added to the decoded text by appending it
  // to tokens 0..i-1. Tokens that decode to nothing at the very start get
  // the range [0, 0].
  const offsets: number[][] = [];
  let decodedSoFar = 0;
  ids.forEach((_id, i) => {
    const prefix: string = tokenizer.decode(ids.slice(0, i + 1), {
      skip_special_tokens: true,
    });
    offsets.push([decodedSoFar, prefix.length]);
    decodedSoFar = prefix.length;
  });

  const output = await model(encoded);
  const dims = output.logits.dims as number[];
  const tokenCount = dims[1];
  const labelCount = dims[2];
  const logitValues = output.logits.data as Float32Array;

  const spans: Span[] = [];
  let open: { type: SpanType; start: number; end: number; minScore: number } | null = null;

  // Emit the span under construction (if it is confident enough) and reset.
  const closeOpen = (): void => {
    if (open !== null && open.minScore >= threshold) {
      spans.push({
        start: open.start,
        end: open.end,
        type: open.type,
        value: chunkText.slice(open.start, open.end),
      });
    }
    open = null;
  };

  for (let t = 0; t < tokenCount; t++) {
    const rowStart = t * labelCount;

    // Best class for this token.
    let bestLogit = -Infinity;
    let bestClass = 0;
    for (let c = 0; c < labelCount; c++) {
      const logit = logitValues[rowStart + c];
      if (logit > bestLogit) {
        bestLogit = logit;
        bestClass = c;
      }
    }
    // Softmax probability of the best class: exp(0) / sum(exp(x - max)).
    let partition = 0;
    for (let c = 0; c < labelCount; c++) {
      partition += Math.exp(logitValues[rowStart + c] - bestLogit);
    }
    const confidence = 1 / partition;
    const label = id2label[String(bestClass)] ?? "O";
    const [tokStart, tokEnd] = offsets[t] ?? [0, 0];

    // Anything that is not an enabled, known, tagged entity ends the
    // current span.
    if (tokStart === 0 && tokEnd === 0) {
      closeOpen();
      continue;
    }
    const tagged = label.match(BIO_RE);
    if (!tagged) {
      closeOpen();
      continue;
    }
    const mapped = LABEL_MAP[tagged[2]];
    if (!mapped || !options[mapped.option]) {
      closeOpen();
      continue;
    }

    if (tagged[1] !== "B" && open !== null && open.type === mapped.type) {
      // Continuation of the same category: grow the span and remember its
      // weakest token.
      open.end = tokEnd;
      open.minScore = Math.min(open.minScore, confidence);
    } else {
      closeOpen();
      open = {
        type: mapped.type,
        start: tokStart,
        end: tokEnd,
        minScore: confidence,
      };
    }
  }
  closeOpen();
  return spans;
}
/**
 * Split `text` into the non-blank, newline-free chunks that detectChunk()
 * expects.
 *
 * Each chunk is stripped of leading and trailing whitespace and `start` is
 * the offset, in `text`, of its first visible character. detectChunk()
 * recovers offsets from decoded text, so if the tokenizer drops leading
 * whitespace when decoding, an indented paragraph passed as-is would have
 * every span shifted left by the width of its indentation.
 */
function splitParagraphs(text: string): { start: number; text: string }[] {
  const paragraphs: { start: number; text: string }[] = [];
  const lineRe = /[^\n]+/g; // maximal runs between newlines
  let m: RegExpExecArray | null;
  while ((m = lineRe.exec(text)) !== null) {
    const indent = m[0].search(/\S/);
    if (indent < 0) continue; // whitespace-only run
    paragraphs.push({ start: m.index + indent, text: m[0].trim() });
  }
  return paragraphs;
}

/**
 * Detect PII spans in `text`.
 *
 * Runs the model paragraph by paragraph, rebases the spans onto `text`,
 * then cleans them up: trim whitespace, merge neighbouring spans of the
 * same type, extend partial dates, strip wrapping punctuation and drop
 * spans too short for their type.
 *
 * @param text Full input text; may contain newlines.
 * @param options Category switches and optional confidence threshold. When
 *   no category is enabled the model is not loaded and `[]` is returned.
 * @returns Spans with offsets into `text`, in document order.
 */
async function detect(text: string, options: DetectOptions): Promise<Span[]> {
  const anyEnabled =
    options.detectNames || options.detectAddresses ||
    options.detectDates || options.detectIdDocs;
  if (!anyEnabled) return [];
  const { tokenizer, model, id2label } = await loadModel();
  const threshold = options.threshold ?? DEFAULT_THRESHOLD;

  // Detect on each paragraph separately (the offset recovery in
  // detectChunk() is only reliable without newlines) and rebase span
  // offsets to the original text.
  const rawSpans: Span[] = [];
  for (const para of splitParagraphs(text)) {
    const chunkSpans = await detectChunk(
      para.text, options, tokenizer, model, id2label, threshold,
    );
    for (const s of chunkSpans) {
      const start = s.start + para.start;
      const end = s.end + para.start;
      rawSpans.push({ start, end, type: s.type, value: text.slice(start, end) });
    }
  }

  // ─── Post-processing on the combined span list (now with absolute offsets) ──
  // Trim whitespace at span edges.
  for (const s of rawSpans) {
    while (s.start < s.end && /\s/.test(text[s.start])) s.start++;
    while (s.end > s.start && /\s/.test(text[s.end - 1])) s.end--;
    s.value = text.slice(s.start, s.end);
  }

  // Merge adjacent same-type spans separated only by whitespace.
  const merged: Span[] = [];
  for (const s of rawSpans) {
    if (s.start >= s.end) continue; // trimmed down to nothing
    const last = merged[merged.length - 1];
    if (last && last.type === s.type && /^\s*$/.test(text.slice(last.end, s.start))) {
      last.end = s.end;
      last.value = text.slice(last.start, last.end);
    } else {
      merged.push({ ...s });
    }
  }

  // Extend DATE spans through numeric continuations (e.g. "14" → "14/03/1985").
  const DATE_RIGHT_FULL_RE = /^([\/\-.]\d{1,2}[\/\-.]\d{2,4})/;
  const DATE_RIGHT_YEAR_RE = /^([\/\-.]\d{2,4})/;
  const DATE_LEFT_FULL_RE = /(\d{1,2}[\/\-.]\d{1,2}[\/\-.])$/;
  for (let i = 0; i < merged.length; i++) {
    const s = merged[i];
    if (s.type !== "DATE") continue;
    // Never grow into a neighbouring span, and look at most 12 characters
    // to either side.
    const nextStart = i + 1 < merged.length ? merged[i + 1].start : text.length;
    const prevEnd = i > 0 ? merged[i - 1].end : 0;
    const rightSlice = text.slice(s.end, Math.min(nextStart, s.end + 12));
    let rightExt = rightSlice.match(DATE_RIGHT_FULL_RE);
    if (!rightExt) rightExt = rightSlice.match(DATE_RIGHT_YEAR_RE);
    if (rightExt) {
      s.end += rightExt[1].length;
      s.value = text.slice(s.start, s.end);
    }
    const leftSlice = text.slice(Math.max(prevEnd, s.start - 12), s.start);
    const leftExt = leftSlice.match(DATE_LEFT_FULL_RE);
    if (leftExt) {
      s.start -= leftExt[1].length;
      s.value = text.slice(s.start, s.end);
    }
  }

  // Strip leading/trailing sentence punctuation wrapped in spans.
  const PUNCT_RE = /[.,;:!?'"`()]/;
  for (const s of merged) {
    while (s.end > s.start && PUNCT_RE.test(text[s.end - 1])) s.end--;
    while (s.start < s.end && PUNCT_RE.test(text[s.start])) s.start++;
    s.value = text.slice(s.start, s.end);
  }

  // Final filter: drop spans below the minimum sensible length for their
  // type. Catches single-character noise like the "N" of "NIR" being tagged
  // as IDDOC ahead of the NIR digits the regex captures separately.
  // Real-world minimums: passport ≥7, driver-licence ≥6, ID card ≥4, dates
  // ≥2 chars (year alone), names ≥2 chars (Bo, Li, Mo…).
  const MIN_LEN_BY_TYPE: Record<SpanType, number> = {
    NOM: 2,
    ADDRESS: 2,
    DATE: 2,
    IDDOC: 4,
  };
  return merged.filter((s) => (s.end - s.start) >= MIN_LEN_BY_TYPE[s.type]);
}
/**
 * Read a request body completely and return it as a UTF-8 string.
 *
 * Chunks are collected as raw bytes and decoded once at the end, so a
 * multi-byte character that straddles two chunks is decoded correctly.
 *
 * @param req Incoming request to drain.
 * @param maxBytes Maximum accepted body size, in bytes.
 * @returns The decoded body.
 * @throws Rejects with `Error("body too large")` (and destroys the request)
 *   once more than `maxBytes` bytes have arrived; rejects with the stream's
 *   own error if the request fails.
 */
function readBody(req: http.IncomingMessage, maxBytes: number): Promise<string> {
  return new Promise((resolve, reject) => {
    const chunks: Buffer[] = [];
    let received = 0;
    let overLimit = false;
    req.on("data", (chunk: Buffer | string) => {
      if (overLimit) return; // ignore anything still in flight after the cut-off
      // Chunks are strings only if an encoding was set on the stream.
      const piece = typeof chunk === "string" ? Buffer.from(chunk, "utf8") : chunk;
      received += piece.length;
      if (received > maxBytes) {
        overLimit = true;
        chunks.length = 0; // release what was buffered so far
        req.destroy();
        reject(new Error("body too large"));
        return;
      }
      chunks.push(piece);
    });
    req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
    req.on("error", reject);
  });
}
/**
 * HTTP front end of the service.
 *
 *   GET  /health  → 200 with model/loading state and the latest status line.
 *   POST /detect  → body `{ text: string, options?: DetectOptions }`;
 *                   200 with `{ spans, ms }`, or 400 when the body is not
 *                   valid JSON, not a JSON object, or `text` is not a string.
 *   anything else → 404 with an empty body.
 *
 * Unexpected failures are logged and answered with 500 `{ error }`.
 */
const server = http.createServer(async (req, res) => {
  const sendJson = (status: number, payload: unknown): void => {
    res.writeHead(status, { "Content-Type": "application/json" });
    res.end(JSON.stringify(payload));
  };

  try {
    if (req.method === "GET" && req.url === "/health") {
      sendJson(200, {
        ok: true,
        model_ready: modelReady,
        model_loading: loadPromise !== null && !modelReady,
        model_id: MODEL_ID,
        last_status: lastStatus,
      });
      return;
    }

    if (req.method === "POST" && req.url === "/detect") {
      const body = await readBody(req, 1024 * 1024);

      // Bad input is the client's fault: answer 400, not 500.
      let parsed: unknown;
      try {
        parsed = JSON.parse(body);
      } catch {
        sendJson(400, { error: "request body must be valid JSON" });
        return;
      }
      if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
        sendJson(400, { error: "request body must be a JSON object" });
        return;
      }
      const { text, options } = parsed as { text?: unknown; options?: unknown };
      if (typeof text !== "string") {
        sendJson(400, { error: "text must be a string" });
        return;
      }
      // Anything that is not an object is treated like missing options.
      const opts: DetectOptions =
        typeof options === "object" && options !== null ? (options as DetectOptions) : {};

      setStatus(`Detecting PII on ${text.length} chars…`);
      const t0 = Date.now();
      let spans: Span[];
      try {
        spans = await detect(text, opts);
      } catch (e) {
        // Do not leave the status stuck on "Detecting…" after a failure.
        setStatus(`Detection failed: ${e instanceof Error ? e.message : String(e)} — idle`);
        throw e;
      }
      const ms = Date.now() - t0;
      setStatus(`Detected ${spans.length} span(s) in ${ms} ms — idle`);
      sendJson(200, { spans, ms });
      return;
    }

    res.writeHead(404);
    res.end();
  } catch (e) {
    console.error("[pii-service] error:", e);
    if (res.headersSent) {
      // A response is already under way; writeHead() would throw here and
      // the rejection would escape this async handler. Just finish it.
      if (!res.writableEnded) res.end();
      return;
    }
    sendJson(500, { error: e instanceof Error ? e.message : String(e) });
  }
});
// A server error — most commonly EADDRINUSE when another daemon already
// holds the port — is fatal for this process. Handle it explicitly so the
// log gets one clear line instead of an uncaught-exception stack trace; the
// exit code (1) is the same as for the uncaught exception.
server.on("error", (err: Error & { code?: string }) => {
  if (err.code === "EADDRINUSE") {
    console.error(`[pii-service] ${HOST}:${PORT} is already in use — another daemon is probably running; exiting`);
  } else {
    console.error("[pii-service] server error:", err);
  }
  process.exit(1);
});
// Start accepting requests, then warm the model in the background.
server.listen(PORT, HOST, () => {
  const addr = server.address() as AddressInfo;
  console.log(`[pii-service] Listening on http://${addr.address}:${addr.port}`);
  console.log(`[pii-service] Endpoints: GET /health, POST /detect`);
  console.log(`[pii-service] Pre-warming model…`);
  // Pre-load so first /detect doesn't wait ~20s. A failure here is only
  // logged: the server keeps running and /detect reports the error itself.
  loadModel().catch((e) => console.error("[pii-service] preload failed:", e));
});