"use strict";
// Standalone PII-NER service for the @zexigh/anonymize plugin.
//
// WHY a separate process? The LM Studio app process is signed with strict
// macOS Library Validation — it refuses to load non-Apple native binaries
// like onnxruntime-node and sharp ("different Team IDs"). A standalone
// `node scripts/pii-service.js` runs WITHOUT that restriction and can use
// the native onnxruntime-node binding directly.
//
// USAGE:
//   cd anonymize
//   npx tsc -p tsconfig.json
//   node scripts/pii-service.js
//
// The plugin's `detectWithModel()` issues HTTP POSTs to this service. If
// the service isn't running, the plugin falls back to regex-only redaction.
// Module-interop helper (appears to be the standard helper emitted by the
// TypeScript compiler; reuses a copy already present on `this` if there is one).
// Exposes property `k` of module object `m` on target `o` under the name `k2`
// (`k2` defaults to `k`).
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    // Reuse the source descriptor only when it is an accessor on an
    // `__esModule` object or a non-writable, non-configurable data property;
    // otherwise install an enumerable getter so reads always go back to `m[k]`.
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    // Fallback when `Object.create` is unavailable: plain one-time value copy.
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
// Module-interop helper: stores `v` as the `default` property of `o`.
// Uses an enumerable `defineProperty` when `Object.create` exists, and a plain
// assignment otherwise. Reuses a copy already present on `this` if there is one.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
// Module-interop helper used for every `require(...)` below.
// Given a required module `mod`:
//   - returns it unchanged when it is flagged `__esModule`;
//   - otherwise returns a new object that re-exposes each own property of `mod`
//     (except "default") via __createBinding and whose `default` is `mod` itself.
var __importStar = (this && this.__importStar) || (function () {
    // `ownKeys` replaces itself on first use with `Object.getOwnPropertyNames`
    // or, when that is missing, a `for…in` + `hasOwnProperty` fallback.
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        // `mod != null` also skips `undefined`; the result then only carries `default`.
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
const http = __importStar(require("node:http"));
const fs = __importStar(require("node:fs"));
const os = __importStar(require("node:os"));
const nodePath = __importStar(require("node:path"));
const node_child_process_1 = require("node:child_process");
// ─── Path resolution ──────────────────────────────────────────────────────
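// findLmStudioDataRoot() walks up from __dirname to find LM Studio's data
// root (the parent of `extensions/`) and returns null when it is not found
// (e.g. the service runs outside the plugin install). resolveLogDir() then
// falls back to ~/.lmstudio, the OS temp dir, and finally the home directory.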
/**
 * Locate the directory that contains an `extensions` folder among the
 * ancestors of this script.
 *
 * Starting at `__dirname`, at most 10 directory levels are inspected. The
 * search stops early when the filesystem root is reached.
 *
 * @returns {string|null} Parent directory of the first ancestor named
 *   `extensions`, or `null` when none is found within the limit.
 */
function findLmStudioDataRoot() {
    let current = __dirname;
    for (let hopsLeft = 10; hopsLeft > 0; hopsLeft--) {
        const above = nodePath.dirname(current);
        if (nodePath.basename(current) === "extensions") {
            return above;
        }
        // dirname() of the root is the root itself: nothing left to climb.
        if (above === current) {
            break;
        }
        current = above;
    }
    return null;
}
/**
 * Pick the directory used for the service log and PID file.
 *
 * Candidates are tried in order: the LM Studio data root (when found),
 * `~/.lmstudio`, the OS temp dir, then the home directory. A candidate is
 * accepted when it can be created (recursively) and is writable.
 *
 * @returns {string} The first usable candidate, or `os.tmpdir()` when none is.
 */
function resolveLogDir() {
    // Best-effort probe: any failure simply disqualifies the candidate.
    const isUsable = (candidate) => {
        try {
            fs.mkdirSync(candidate, { recursive: true });
            fs.accessSync(candidate, fs.constants.W_OK);
            return true;
        }
        catch {
            return false;
        }
    };
    const lmRoot = findLmStudioDataRoot();
    const candidates = [
        ...(lmRoot ? [lmRoot] : []),
        nodePath.join(os.homedir(), ".lmstudio"),
        os.tmpdir(),
        os.homedir(),
    ];
    for (const candidate of candidates) {
        if (isUsable(candidate)) {
            return candidate;
        }
    }
    return os.tmpdir();
}
// Directory shared by the launcher and the daemon for the log and PID files.
const RESOLVED_LOG_DIR = resolveLogDir();
// ─── Self-daemonize ───────────────────────────────────────────────────────
// When this script is spawned by the plugin, we play the role of a launcher:
// we re-spawn ourselves with --daemon-child detached + into a new session,
// then exit. The actual daemon is then orphan-adopted by init/launchd and
// invisible to LM Studio's process tree. This survives any tree-kill that
// LM Studio performs when reloading the plugin.
//
// The daemon-child instance bypasses this block and runs the HTTP server.
const IS_DAEMON_CHILD = process.argv.includes("--daemon-child");
if (!IS_DAEMON_CHILD) {
    const logDir = RESOLVED_LOG_DIR;
    const serviceLog = nodePath.join(logDir, "anonymize-pii-service.log");
    // Record the hand-off in the log before spawning.
    fs.appendFileSync(serviceLog, `\n--- launcher@${process.pid} re-spawning daemon at ${new Date().toISOString()} ---\n`);
    // Open the log file and hand its FD to the daemon as stdout+stderr, so
    // any console.log inside the daemon ends up in the file.
    const fd = fs.openSync(serviceLog, "a");
    // Re-run the same script with the same arguments (argv[1..]) plus the
    // marker flag that makes the child skip this launcher block.
    const child = (0, node_child_process_1.spawn)(process.execPath, [...process.argv.slice(1), "--daemon-child"], {
        detached: true,
        stdio: ["ignore", fd, fd],
    });
    // unref() so the launcher's event loop does not wait for the child.
    child.unref();
    // Don't fs.closeSync(fd): the kernel keeps it alive for the daemon.
    fs.appendFileSync(serviceLog, `--- launcher@${process.pid} → daemon@${child.pid} spawned; launcher exiting ---\n`);
    // The launcher never reaches the daemon code below.
    process.exit(0);
}
// ─── DAEMON MODE — running from here on ───────────────────────────────────
// Bind address and port; both can be overridden through the environment.
const HOST = process.env.PII_SERVICE_HOST ?? "127.0.0.1";
// NOTE(review): Number() of a non-numeric PII_SERVICE_PORT yields NaN —
// confirm whether that case needs explicit validation.
const PORT = Number(process.env.PII_SERVICE_PORT ?? 7878);
// Hugging Face model identifier passed to transformers.js in loadModel().
const MODEL_ID = "onnx-community/multilang-pii-ner-ONNX";
// Used by detect() when the request does not supply `options.threshold`.
const DEFAULT_THRESHOLD = 0.5;
// Splits a token label such as "B-GIVENNAME" into its tag (B, I or E) and
// entity name; labels that do not match (e.g. "O") are treated as non-entities.
const BIO_RE = /^([BIE])-(.+)$/;
// Write a PID file so the plugin (or anyone) can locate or signal the
// running daemon without going through `pgrep`.
const PID_FILE = nodePath.join(RESOLVED_LOG_DIR, "anonymize-pii-service.pid");
try {
    fs.writeFileSync(PID_FILE, String(process.pid));
    console.log(`[pii-service] daemon started, pid=${process.pid}, log dir=${RESOLVED_LOG_DIR}, pid file=${PID_FILE}`);
}
catch (e) {
    // Best effort: the daemon keeps running even without a PID file.
    console.error(`[pii-service] cannot write PID file ${PID_FILE}: ${e}`);
}
// Signal policy for the daemon: SIGTERM and SIGHUP are logged and otherwise
// ignored, SIGINT shuts the process down, and the PID file is removed on the
// way out. SIGKILL cannot be intercepted and remains fatal.
const removePidFile = () => {
    try {
        fs.unlinkSync(PID_FILE);
    }
    catch {
        // Best effort: the file may already be gone or may never have been written.
    }
};
const IGNORED_SIGNALS = [
    ["SIGTERM", "[pii-service] received SIGTERM — ignoring (running as daemon)"],
    ["SIGHUP", "[pii-service] received SIGHUP — ignoring"],
];
for (const [signalName, logLine] of IGNORED_SIGNALS) {
    // Installing a listener replaces the default "terminate" behaviour.
    process.on(signalName, () => {
        console.log(logLine);
    });
}
process.on("SIGINT", () => {
    console.log("[pii-service] received SIGINT — exiting cleanly");
    removePidFile();
    process.exit(0);
});
process.on("exit", removePidFile);
// Maps the model's entity names (the part after "B-"/"I-"/"E-") to the span
// type reported to the plugin and to the request option that enables it.
// Entity names missing from this table are ignored by detectChunk().
const LABEL_MAP = (() => {
    const groups = [
        ["NOM", "detectNames", ["GIVENNAME", "SURNAME"]],
        ["ADDRESS", "detectAddresses", ["STREET", "CITY", "BUILDINGNUM", "ZIPCODE"]],
        ["DATE", "detectDates", ["DATE", "TIME"]],
        ["IDDOC", "detectIdDocs", ["PASSPORTNUM", "DRIVERLICENSENUM", "IDCARDNUM"]],
    ];
    const table = {};
    for (const [type, option, entityNames] of groups) {
        for (const entityName of entityNames) {
            // One fresh record per entity so entries never share identity.
            table[entityName] = { type, option };
        }
    }
    return table;
})();
// Model-loading state shared by loadModel() and the /health endpoint.
let loadPromise = null;
let modelReady = false;
// Most recent human-readable status line. GET /health returns it so the
// plugin can show live progress in the LM Studio tool-status pill.
let lastStatus = "Starting service…";
/**
 * Record `msg` as the current status and echo it to the log.
 *
 * @param {string} msg Human-readable status line.
 */
function setStatus(msg) {
    lastStatus = msg;
    console.log(`[pii-service] ${lastStatus}`);
}
/**
 * Load the tokenizer and token-classification model once and share the result.
 *
 * Concurrent and later callers receive the same in-flight or completed load.
 * When a load attempt fails (e.g. the model download is interrupted) the
 * cached promise is cleared, so the next call starts a fresh attempt instead
 * of replaying the same rejection for the lifetime of the daemon.
 *
 * @returns {Promise<{tokenizer: any, model: any, id2label: Object}>}
 *   The loaded tokenizer, model and the model's id→label table.
 * @throws Rejects with the underlying error when importing transformers.js or
 *   loading the tokenizer/model fails.
 */
async function loadModel() {
    if (loadPromise)
        return loadPromise;
    const attempt = (async () => {
        setStatus(`Loading ${MODEL_ID}…`);
        const t0 = Date.now();
        const tx = await Promise.resolve().then(() => __importStar(require("@huggingface/transformers")));
        // Wire transformers.js progress events to our setStatus. transformers.js
        // calls progress_callback many times — we throttle in the formatter by
        // only reporting when a file crosses into a new 10 % bucket.
        const lastPctByFile = {};
        const progressCb = (raw) => {
            if (raw.status === "progress" && raw.file) {
                const pct = typeof raw.progress === "number" ? Math.floor(raw.progress / 10) * 10 : 0;
                if (lastPctByFile[raw.file] === pct)
                    return;
                lastPctByFile[raw.file] = pct;
                const mb = typeof raw.loaded === "number" ? (raw.loaded / 1_048_576).toFixed(0) : "?";
                const total = typeof raw.total === "number" ? (raw.total / 1_048_576).toFixed(0) : "?";
                setStatus(`Downloading ${raw.file}: ${pct}% (${mb}/${total} MB)`);
            }
            else if (raw.status === "download" && raw.file) {
                setStatus(`Downloading ${raw.file}…`);
            }
            else if (raw.status === "done" && raw.file) {
                setStatus(`Loaded ${raw.file}`);
            }
            else if (raw.status === "ready") {
                setStatus("Model ready");
            }
        };
        const tokenizer = await tx.AutoTokenizer.from_pretrained(MODEL_ID, {
            progress_callback: progressCb,
        });
        const model = await tx.AutoModelForTokenClassification.from_pretrained(MODEL_ID, {
            progress_callback: progressCb,
        });
        const id2label = model.config.id2label ?? {};
        setStatus(`Model ready (${Date.now() - t0} ms, ${Object.keys(id2label).length} labels).`);
        modelReady = true;
        return { tokenizer, model, id2label };
    })();
    // Do not keep a rejected promise cached: clear it so a later call retries
    // and /health stops reporting `model_loading: true` after a failure.
    loadPromise = attempt.catch((err) => {
        loadPromise = null;
        modelReady = false;
        setStatus(`Model load failed: ${err instanceof Error ? err.message : String(err)}`);
        throw err;
    });
    return loadPromise;
}
// Run the model on ONE newline-free chunk and return raw entity spans whose
// offsets are relative to that chunk. Character offsets are recovered by
// decoding ever-longer token prefixes, which is only reliable when the chunk
// contains no newlines — detect() is responsible for splitting beforehand.
//
// Parameters:
//   chunkText  text of the chunk
//   options    request options; a label is kept only when its LABEL_MAP
//              `option` flag is truthy here
//   tokenizer  callable tokenizer that also exposes `decode(ids, opts)`
//   model      callable model returning `{ logits: { dims, data } }`
//   id2label   class-index → label table (missing entries count as "O")
//   threshold  minimum per-token score a span must keep on ALL its tokens
// Returns: array of `{ start, end, type, value }`.
async function detectChunk(chunkText, options, tokenizer, model, id2label, threshold) {
    const inputs = await tokenizer(chunkText, { return_tensors: "pt" });
    const idTensor = inputs.input_ids;
    let rawIds;
    if (idTensor.tolist) {
        rawIds = idTensor.tolist()[0];
    }
    else {
        rawIds = Array.from(idTensor.data);
    }
    const tokenIds = rawIds.map((id) => Number(id));
    // Token i covers [length of decode(prefix i-1), length of decode(prefix i)).
    let decodedSoFar = 0;
    const offsets = tokenIds.map((_, idx) => {
        const prefixText = tokenizer.decode(tokenIds.slice(0, idx + 1), {
            skip_special_tokens: true,
        });
        const range = [decodedSoFar, prefixText.length];
        decodedSoFar = prefixText.length;
        return range;
    });
    const { logits } = await model(inputs);
    const seqLen = logits.dims[1];
    const numLabels = logits.dims[2];
    const values = logits.data;
    const tokenLabels = [];
    for (let t = 0; t < seqLen; t++) {
        const row = t * numLabels;
        let bestIdx = 0;
        let bestLogit = -Infinity;
        for (let l = 0; l < numLabels; l++) {
            const logit = values[row + l];
            if (logit > bestLogit) {
                bestLogit = logit;
                bestIdx = l;
            }
        }
        // Softmax probability of the winning class: exp(0) / Σ exp(x - max).
        let partition = 0;
        for (let l = 0; l < numLabels; l++) {
            partition += Math.exp(values[row + l] - bestLogit);
        }
        const [start, end] = offsets[t] ?? [0, 0];
        tokenLabels.push({
            label: id2label[String(bestIdx)] ?? "O",
            score: 1 / partition,
            start,
            end,
        });
    }
    const spans = [];
    let open = null;
    // Close the span being built; keep it only if every token met the threshold.
    const closeOpenSpan = () => {
        if (open && open.minScore >= threshold) {
            spans.push({
                start: open.start,
                end: open.end,
                type: open.type,
                value: chunkText.slice(open.start, open.end),
            });
        }
        open = null;
    };
    for (const tok of tokenLabels) {
        // A [0, 0] range marks a token that decoded to nothing at the start.
        const isEmptyLeading = tok.start === 0 && tok.end === 0;
        const parts = isEmptyLeading ? null : tok.label.match(BIO_RE);
        const mapped = parts ? LABEL_MAP[parts[2]] : undefined;
        if (!mapped || !options[mapped.option]) {
            closeOpenSpan();
            continue;
        }
        const continuesOpenSpan = parts[1] !== "B" && open !== null && open.type === mapped.type;
        if (continuesOpenSpan) {
            open.end = tok.end;
            open.minScore = Math.min(open.minScore, tok.score);
        }
        else {
            closeOpenSpan();
            open = {
                type: mapped.type,
                start: tok.start,
                end: tok.end,
                minScore: tok.score,
            };
        }
    }
    closeOpenSpan();
    return spans;
}
/**
 * Detect PII spans in `text` with the NER model and clean them up.
 *
 * Pipeline: split on newline runs → detectChunk() per paragraph → rebase
 * offsets to `text` → trim whitespace → merge same-type neighbours → extend
 * DATE spans over adjacent numeric date parts → strip edge punctuation →
 * drop spans shorter than the per-type minimum.
 *
 * @param {string} text Full input text.
 * @param {Object} options Flags `detectNames`, `detectAddresses`,
 *   `detectDates`, `detectIdDocs` and an optional `threshold`.
 * @returns {Promise<Array<{start: number, end: number, type: string, value: string}>>}
 *   Spans with offsets into `text`; empty when no detector flag is set.
 */
async function detect(text, options) {
    if (!options.detectNames && !options.detectAddresses &&
        !options.detectDates && !options.detectIdDocs) {
        return [];
    }
    const loaded = await loadModel();
    const minScore = options.threshold ?? DEFAULT_THRESHOLD;
    // The tokenizer's whitespace normalisation collapses `\n\n`, which would
    // skew the prefix-decode offsets, so every paragraph is analysed alone and
    // remembers where it started in the original text.
    const paragraphs = [];
    let cursor = 0;
    for (const lineBreak of text.matchAll(/\n+/g)) {
        const body = text.slice(cursor, lineBreak.index);
        if (body.trim().length > 0) {
            paragraphs.push({ start: cursor, text: body });
        }
        cursor = lineBreak.index + lineBreak[0].length;
    }
    const tail = text.slice(cursor);
    if (cursor < text.length && tail.trim().length > 0) {
        paragraphs.push({ start: cursor, text: tail });
    }
    // Run the model paragraph by paragraph; shift offsets to absolute positions.
    const rawSpans = [];
    for (const { start: base, text: chunk } of paragraphs) {
        const found = await detectChunk(chunk, options, loaded.tokenizer, loaded.model, loaded.id2label, minScore);
        for (const hit of found) {
            const start = hit.start + base;
            const end = hit.end + base;
            rawSpans.push({ start, end, type: hit.type, value: text.slice(start, end) });
        }
    }
    // ─── Post-processing on absolute offsets ─────────────────────────────────
    const refreshValue = (span) => {
        span.value = text.slice(span.start, span.end);
    };
    // Shrink `span` while the character at either edge satisfies `pattern`.
    const shrinkEdges = (span, pattern, endFirst) => {
        const fromStart = () => {
            while (span.start < span.end && pattern.test(text[span.start])) {
                span.start += 1;
            }
        };
        const fromEnd = () => {
            while (span.end > span.start && pattern.test(text[span.end - 1])) {
                span.end -= 1;
            }
        };
        if (endFirst) {
            fromEnd();
            fromStart();
        }
        else {
            fromStart();
            fromEnd();
        }
        refreshValue(span);
    };
    // 1. Trim whitespace at span edges.
    for (const span of rawSpans) {
        shrinkEdges(span, /\s/, false);
    }
    // 2. Merge adjacent same-type spans separated only by whitespace.
    const merged = [];
    for (const span of rawSpans) {
        if (span.start >= span.end) {
            continue;
        }
        const previous = merged[merged.length - 1];
        const joinsPrevious = Boolean(previous) &&
            previous.type === span.type &&
            /^\s*$/.test(text.slice(previous.end, span.start));
        if (joinsPrevious) {
            previous.end = span.end;
            refreshValue(previous);
        }
        else {
            merged.push({ ...span });
        }
    }
    // 3. Extend DATE spans through numeric continuations (e.g. "14" → "14/03/1985"),
    //    looking at most 12 characters either way and never into a neighbour.
    const DATE_RIGHT_FULL_RE = /^([\/\-.]\d{1,2}[\/\-.]\d{2,4})/;
    const DATE_RIGHT_YEAR_RE = /^([\/\-.]\d{2,4})/;
    const DATE_LEFT_FULL_RE = /(\d{1,2}[\/\-.]\d{1,2}[\/\-.])$/;
    merged.forEach((span, idx) => {
        if (span.type !== "DATE") {
            return;
        }
        const rightLimit = idx + 1 < merged.length ? merged[idx + 1].start : text.length;
        const leftLimit = idx > 0 ? merged[idx - 1].end : 0;
        const after = text.slice(span.end, Math.min(rightLimit, span.end + 12));
        const rightHit = after.match(DATE_RIGHT_FULL_RE) || after.match(DATE_RIGHT_YEAR_RE);
        if (rightHit) {
            span.end += rightHit[1].length;
            refreshValue(span);
        }
        const before = text.slice(Math.max(leftLimit, span.start - 12), span.start);
        const leftHit = before.match(DATE_LEFT_FULL_RE);
        if (leftHit) {
            span.start -= leftHit[1].length;
            refreshValue(span);
        }
    });
    // 4. Strip leading/trailing sentence punctuation caught inside spans.
    const PUNCT_RE = /[.,;:!?'"`()]/;
    for (const span of merged) {
        shrinkEdges(span, PUNCT_RE, true);
    }
    // 5. Drop spans shorter than the minimum sensible length for their type.
    //    This removes single-character noise such as the "N" of "NIR" being
    //    tagged as IDDOC ahead of digits that the regex pass captures itself.
    const MIN_LEN_BY_TYPE = {
        NOM: 2,
        ADDRESS: 2,
        DATE: 2,
        IDDOC: 4,
    };
    return merged.filter((span) => span.end - span.start >= MIN_LEN_BY_TYPE[span.type]);
}
/**
 * Collect a request body as a UTF-8 string, enforcing a byte limit.
 *
 * Chunks are accumulated as Buffers and decoded once at the end, so a
 * multi-byte character that straddles two chunks is decoded correctly. The
 * limit is applied to the number of bytes received, not to string length.
 *
 * @param {import("node:http").IncomingMessage} req Readable request stream.
 * @param {number} maxBytes Maximum accepted body size in bytes.
 * @returns {Promise<string>} The decoded body.
 * @throws Rejects with `Error("body too large")` (after destroying the
 *   request) when the limit is exceeded, or with the stream's own error.
 */
function readBody(req, maxBytes) {
    return new Promise((resolve, reject) => {
        const chunks = [];
        let received = 0;
        let aborted = false;
        req.on("data", (chunk) => {
            if (aborted)
                return;
            // Streams with an encoding set deliver strings; normalise to bytes.
            const bytes = typeof chunk === "string" ? Buffer.from(chunk, "utf8") : chunk;
            received += bytes.length;
            if (received > maxBytes) {
                aborted = true;
                req.destroy();
                reject(new Error("body too large"));
                return;
            }
            chunks.push(bytes);
        });
        req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
        req.on("error", reject);
    });
}
// HTTP front end of the daemon.
//
//   GET  /health → 200 JSON with model readiness and the latest status line.
//   POST /detect → body `{ "text": string, "options"?: object }`;
//                  200 JSON `{ spans, ms }` on success,
//                  400 JSON when the body is not valid JSON or `text` is not
//                  a string (client errors are no longer reported as 500).
//   anything else → 404 with an empty body.
//
// Unexpected failures are logged and answered with 500 JSON. If the response
// has already been started when a failure occurs, it is just terminated —
// writing headers twice would throw inside this async handler.
const server = http.createServer(async (req, res) => {
    const sendJson = (status, payload) => {
        res.writeHead(status, { "Content-Type": "application/json" });
        res.end(JSON.stringify(payload));
    };
    try {
        if (req.method === "GET" && req.url === "/health") {
            sendJson(200, {
                ok: true,
                model_ready: modelReady,
                model_loading: loadPromise !== null && !modelReady,
                model_id: MODEL_ID,
                last_status: lastStatus,
            });
            return;
        }
        if (req.method === "POST" && req.url === "/detect") {
            const body = await readBody(req, 1024 * 1024);
            let parsed;
            try {
                parsed = JSON.parse(body);
            }
            catch (parseErr) {
                const reason = parseErr instanceof Error ? parseErr.message : String(parseErr);
                sendJson(400, { error: `invalid JSON body: ${reason}` });
                return;
            }
            // `parsed?.text` also covers a literal JSON `null` body.
            if (typeof parsed?.text !== "string") {
                sendJson(400, { error: "text must be a string" });
                return;
            }
            const opts = parsed.options ?? {};
            setStatus(`Detecting PII on ${parsed.text.length} chars…`);
            const t0 = Date.now();
            const spans = await detect(parsed.text, opts);
            const ms = Date.now() - t0;
            setStatus(`Detected ${spans.length} span(s) in ${ms} ms — idle`);
            sendJson(200, { spans, ms });
            return;
        }
        res.writeHead(404);
        res.end();
    }
    catch (e) {
        console.error("[pii-service] error:", e);
        if (res.headersSent) {
            // Too late for a status code; make sure the response is closed.
            if (!res.writableEnded)
                res.end();
            return;
        }
        sendJson(500, { error: e instanceof Error ? e.message : String(e) });
    }
});
// Start accepting connections on HOST:PORT. Once bound, log the actual
// address and kick off the model load in the background.
// NOTE(review): no 'error' listener is attached to `server` in this file, so a
// bind failure (e.g. the port already being in use) would surface as an
// unhandled 'error' event — confirm whether that is intended.
server.listen(PORT, HOST, () => {
    const addr = server.address();
    console.log(`[pii-service] Listening on http://${addr.address}:${addr.port}`);
    console.log(`[pii-service] Endpoints: GET /health, POST /detect`);
    console.log(`[pii-service] Pre-warming model…`);
    // Pre-load so first /detect doesn't wait ~20s.
    // A failed preload is only logged; the service keeps running.
    loadModel().catch((e) => console.error("[pii-service] preload failed:", e));
});