import * as fs from "node:fs/promises";
import mammoth from "mammoth";
import JSZip from "jszip";

// Mammoth ships convertToMarkdown at runtime but doesn't declare it in its
// .d.ts. We narrow the binding here so the rest of the file is fully typed.
/** One diagnostic entry from a mammoth conversion result's `messages` list. */
interface MammothMessage {
  // The `| string` member widens this to plain `string`; the two literals are
  // kept as documentation of the values this file actually filters on.
  type: "warning" | "error" | string;
  /** Human-readable text; this is what gets surfaced to callers as a warning. */
  message: string;
}
/** Resolved value of the `convertToMarkdown` binding declared in this file. */
interface MammothResult {
  /** The converted document text (markdown, before local post-processing). */
  value: string;
  /** Diagnostics raised during conversion. */
  messages: MammothMessage[];
}
// Call signature we rely on for mammoth's runtime-only markdown converter.
type MarkdownConverter = (
  input: { buffer: Buffer },
  options?: Record<string, unknown>,
) => Promise<MammothResult>;

// The cast goes through `unknown` because the published typings have no
// `convertToMarkdown` member to narrow from.
const { convertToMarkdown } = mammoth as unknown as {
  convertToMarkdown: MarkdownConverter;
};

/**
 * Core document properties read from `docProps/core.xml` inside the .docx
 * archive. Every field is optional: a field is omitted when its element is
 * missing or empty.
 */
export interface DocxMetadata {
  /** Text of the `dc:title` element. */
  title?: string;
  /** Text of the `dc:creator` element. */
  author?: string;
  /** Text of the `dcterms:modified` element, passed through as found. */
  modified?: string;
}

/** Result returned by `readDocx`. */
export interface ReadDocxResult {
  /**
   * Post-processed markdown. When metadata was requested and at least one
   * field was found, a YAML frontmatter block is prepended.
   */
  markdown: string;
  /**
   * Messages of type "warning" or "error" reported by the converter, plus a
   * "metadata extraction failed: …" entry when reading core properties threw.
   */
  warnings: string[];
  /** `markdown.length` of the returned string, i.e. including any frontmatter. */
  source_chars: number;
  /**
   * `null` when metadata was not requested or its extraction failed;
   * otherwise the extracted properties (possibly an empty object).
   */
  metadata: DocxMetadata | null;
}

/** Switches accepted by `readDocx`. */
export interface ReadDocxOptions {
  /**
   * When true, the extra quote-style mappings in `EXTRA_STYLE_MAP` are passed
   * to the converter; when false an empty style map is passed instead.
   */
  preserveStyles: boolean;
  /**
   * When true, core properties are read from the archive and, if any are
   * present, rendered as frontmatter at the top of the markdown.
   */
  includeMetadata: boolean;
}

// Mammoth's default style map already covers headings, bold, italic, lists,
// quotes, links and tables. On top of that we map a few common Word paragraph
// styles to blockquotes: French "Citation", the implicit "Quote", and
// "Intense Quote".
const EXTRA_STYLE_MAP = ["Citation", "Quote", "Intense Quote"].map(
  (styleName) => `p[style-name='${styleName}'] => blockquote:fresh`,
);

/**
 * Reads a .docx file and converts it to markdown.
 *
 * Steps, in order: read the file into memory, run the mammoth markdown
 * conversion, clean the output with `postProcess`, collect converter
 * diagnostics, then optionally read core properties and prepend them as
 * frontmatter.
 *
 * @param absPath Path handed to `fs.readFile`.
 * @param opts Style-map and metadata switches; see `ReadDocxOptions`.
 * @returns The markdown, collected warnings, the markdown's length, and the
 *   extracted metadata (or `null`).
 * @throws Whatever `fs.readFile` or the converter rejects with. Only metadata
 *   extraction failures are caught and downgraded to a warning.
 */
export async function readDocx(
  absPath: string,
  opts: ReadDocxOptions,
): Promise<ReadDocxResult> {
  const buffer = await fs.readFile(absPath);

  const styleMap = opts.preserveStyles ? EXTRA_STYLE_MAP : [];
  // i1 — drop images silently. i3 will extract them to disk.
  const imageApi = mammoth.images as unknown as {
    imgElement: (fn: (i: unknown) => Promise<{ src: string }>) => unknown;
  };
  const convertImage = imageApi.imgElement(() => Promise.resolve({ src: "" }));

  const converted = await convertToMarkdown({ buffer }, { styleMap, convertImage });
  const body = postProcess(converted.value);

  // Only warning/error diagnostics are surfaced; any other type is ignored.
  const warnings: string[] = [];
  for (const entry of converted.messages) {
    if (entry.type === "warning" || entry.type === "error") {
      warnings.push(entry.message);
    }
  }

  // Metadata is best-effort: a failure becomes a warning, not a rejection.
  let metadata: DocxMetadata | null = null;
  if (opts.includeMetadata) {
    try {
      metadata = await readCoreProperties(buffer);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      warnings.push(`metadata extraction failed: ${reason}`);
    }
  }

  const markdown =
    metadata !== null && hasAnyMetadata(metadata)
      ? `${renderFrontmatter(metadata)}\n${body}`
      : body;

  return { markdown, warnings, source_chars: markdown.length, metadata };
}

// Punctuation whose backslash escape is dropped when the surrounding context
// cannot turn the bare character into markdown structure. `*`, `_`, `` ` ``,
// `[`, `]` and `\` are deliberately absent: their escapes always stay.
const UNESCAPABLE_PUNCTUATION = ".-+!#():;,?<>=";

// Block-container prefix of a line: indentation followed by any number of
// blockquote (`>`), bullet (`-`, `+`, `*`) or ordered (`1.`, `1)`) markers,
// each followed by whitespace. Text right after this prefix is at
// "block start", where a bare `#`, `-`, `+`, `>` or `=` can open a block.
const BLOCK_CONTAINER_PREFIX = /^[ \t]*(?:(?:[-+*>]|\d{1,9}[.)])[ \t]+)*/;

/**
 * Decides whether removing the backslash in front of `ch` would change how
 * the line parses, in which case the escape must stay.
 *
 * @param line The raw (still escaped) line.
 * @param lead Length of the line's block-container prefix.
 * @param offset Index of the backslash within `line`.
 * @param ch The escaped character.
 */
function mustKeepEscape(
  line: string,
  lead: number,
  offset: number,
  ch: string,
): boolean {
  const next = line.charAt(offset + 2);
  switch (ch) {
    case "#":
    case "-":
    case "+":
    case ">":
    case "=":
      // At block start these open a heading, list item, thematic break,
      // blockquote or setext underline.
      return offset === lead;
    case ".":
    case ")": {
      // `<digits>.` / `<digits>)` at block start followed by whitespace or
      // end of line is an ordered-list marker.
      const digitCount = offset - lead;
      return (
        digitCount >= 1 &&
        digitCount <= 9 &&
        /^\d+$/.test(line.slice(lead, offset)) &&
        (next === "" || next === " " || next === "\t")
      );
    }
    case "!":
      // A bare `!` directly before a real `[` turns a link into an image.
      return next === "[";
    default:
      return false;
  }
}

/**
 * Rewrites one line: `__` becomes `**` and safe punctuation escapes are
 * dropped. The scan consumes every `\x` pair as a unit, so an escaped
 * backslash (`\\`) or escaped underscore (`\_`) is never half-matched by a
 * later pattern.
 */
function normaliseLineMarkup(line: string): string {
  if (!line.includes("\\") && !line.includes("__")) return line;
  const lead = BLOCK_CONTAINER_PREFIX.exec(line)?.[0].length ?? 0;
  return line.replace(
    /\\(.)|__/g,
    (match: string, ch: string | undefined, offset: number): string => {
      // Second alternative matched: a bare `__` bold marker.
      if (ch === undefined) return "**";
      if (!UNESCAPABLE_PUNCTUATION.includes(ch)) return match;
      return mustKeepEscape(line, lead, offset, ch) ? match : ch;
    },
  );
}

/**
 * Cleans up converter output for downstream consumers.
 *
 * - Drops empty `<a id="..."></a>` bookmark anchors.
 * - Normalises `__x__` bold markers to `**x**` (valid CommonMark either way,
 *   but `__` is easily misread as underline by LLM consumers).
 * - Strips backslash escapes from punctuation where doing so cannot create
 *   markdown structure. Escapes are kept at block start (`\#`, `\-`, `\+`,
 *   `1\.`), before a link (`\![`), and whenever the backslash is itself
 *   escaped.
 * - Strips trailing spaces/tabs from every line, then collapses runs of
 *   three or more newlines to exactly one blank line. The strip runs first
 *   so whitespace-only lines collapse too.
 * - Trims the document and terminates it with a single newline.
 *
 * @param md Markdown as produced by the converter.
 * @returns The cleaned markdown; `"\n"` for empty input.
 */
function postProcess(md: string): string {
  const normalised = md
    // Anchors go first so text that follows one is seen at block start.
    .replace(/<a id="[^"]*"><\/a>/g, "")
    .split("\n")
    .map(normaliseLineMarkup)
    .join("\n");

  return (
    normalised
      // NOTE(review): this also removes a two-space CommonMark hard-break
      // marker if the converter emits one — confirm that is acceptable.
      .replace(/[ \t]+\n/g, "\n")
      .replace(/\n{3,}/g, "\n\n")
      .trim() + "\n"
  );
}

/**
 * Opens the .docx archive held in `buffer` and pulls title, author and
 * last-modified values out of `docProps/core.xml`.
 *
 * @param buffer Raw bytes of the .docx file.
 * @returns The properties found; an empty object when the archive has no
 *   `docProps/core.xml` entry. Individual fields are `undefined` when
 *   `extractTag` finds nothing for them.
 * @throws Whatever JSZip rejects with when loading or reading the archive.
 */
async function readCoreProperties(buffer: Buffer): Promise<DocxMetadata> {
  const archive = await JSZip.loadAsync(buffer);
  const entry = archive.file("docProps/core.xml");
  if (!entry) {
    return {};
  }

  const xml = await entry.async("string");
  const read = (tag: string): string | undefined => extractTag(xml, tag);

  const title = read("dc:title");
  const author = read("dc:creator");
  const modified = read("dcterms:modified");
  return { title, author, modified };
}

// The five entities predefined by XML 1.0.
const XML_NAMED_ENTITIES: Record<string, string> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
};

/**
 * Decodes predefined entities and numeric character references in one pass,
 * so `&amp;lt;` yields the text `&lt;` and is not decoded twice. References
 * outside the Unicode range are left as written.
 */
function decodeXmlEntities(text: string): string {
  return text.replace(
    /&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g,
    (whole: string, ref: string): string => {
      if (ref.charAt(0) !== "#") return XML_NAMED_ENTITIES[ref] ?? whole;
      const code =
        ref.charAt(1) === "x"
          ? parseInt(ref.slice(2), 16)
          : parseInt(ref.slice(1), 10);
      return code <= 0x10ffff ? String.fromCodePoint(code) : whole;
    },
  );
}

/**
 * Tiny tag extractor — core.xml is small and well-formed; a full XML parser
 * is overkill for three optional fields.
 *
 * Returns the trimmed, entity-decoded text of the first `<tag …>…</tag>`
 * element. The opening tag must be exactly `tag`, optionally followed by
 * attributes, so `dc:title` does not match `<dc:titles>`.
 *
 * @param xml XML document text to search.
 * @param tag Qualified element name, e.g. `dc:title`.
 * @returns The element text, or `undefined` when the element is absent,
 *   self-closing, or contains only whitespace.
 */
function extractTag(xml: string, tag: string): string | undefined {
  const escaped = tag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const re = new RegExp(
    `<${escaped}(?:\\s[^>]*)?>([\\s\\S]*?)</${escaped}\\s*>`,
  );
  const raw = re.exec(xml)?.[1]?.trim();
  if (!raw) return undefined;
  return decodeXmlEntities(raw);
}

/**
 * Reports whether at least one metadata field holds a non-empty value.
 * Missing fields and empty strings both count as absent.
 */
function hasAnyMetadata(m: DocxMetadata): boolean {
  const { title, author, modified } = m;
  return [title, author, modified].some((field) => Boolean(field));
}

/**
 * Renders the metadata as a `---`-delimited frontmatter block with no
 * trailing newline. Fields appear in the fixed order title, author,
 * modified; a missing or empty field is left out. Values are passed through
 * `yamlScalar`.
 */
function renderFrontmatter(m: DocxMetadata): string {
  const fields: Array<[string, string | undefined]> = [
    ["title", m.title],
    ["author", m.author],
    ["modified", m.modified],
  ];
  const entries = fields.flatMap(([key, value]) =>
    value ? [`${key}: ${yamlScalar(value)}`] : [],
  );
  return ["---", ...entries, "---"].join("\n");
}

// Plain scalars that a YAML parser would resolve to null, a boolean or a
// number instead of a string (covers common YAML 1.1 and 1.2 spellings).
const YAML_IMPLICIT_TYPE =
  /^(?:~|null|true|false|yes|no|on|off|\+?(?:\d[\d_]*(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?|0x[\da-f]+|0o[0-7]+|\+?\.inf|\.nan)$/i;

// Characters that must not appear raw inside a single-line scalar.
const YAML_CONTROL_CHAR = /[\u0000-\u001f\u007f\u0085\u2028\u2029]/;

/**
 * Formats a string as a single-line YAML scalar for use as a mapping value.
 *
 * The value is returned as-is when it is safe as a plain scalar. It is
 * double-quoted when any of these hold:
 * - it is empty;
 * - it contains one of the indicator characters `: # - ? & * ! | > ' " % @`
 *   or a backtick anywhere;
 * - it starts with a flow indicator (`[`, `]`, `{`, `}`, `,`);
 * - it has leading or trailing whitespace;
 * - it contains a line break or other control character;
 * - it would otherwise be read back as null, a boolean or a number.
 *
 * Inside quotes, `\` and `"` are backslash-escaped and control characters are
 * written as `\n`, `\r`, `\t` or `\uXXXX`.
 */
function yamlScalar(s: string): string {
  const needsQuoting =
    s.length === 0 ||
    /[:#\-?&*!|>'"%@`]|^\s|\s$/.test(s) ||
    /^[[\]{},]/.test(s) ||
    YAML_CONTROL_CHAR.test(s) ||
    YAML_IMPLICIT_TYPE.test(s);
  if (!needsQuoting) return s;

  const escaped = s.replace(
    /[\\"\u0000-\u001f\u007f\u0085\u2028\u2029]/g,
    (ch: string): string => {
      switch (ch) {
        case "\\":
          return "\\\\";
        case '"':
          return '\\"';
        case "\n":
          return "\\n";
        case "\r":
          return "\\r";
        case "\t":
          return "\\t";
        default:
          return `\\u${ch.charCodeAt(0).toString(16).padStart(4, "0")}`;
      }
    },
  );
  return `"${escaped}"`;
}
docx

docx