// Project Files
// src / report / builder.ts
/**
* @file report/builder.ts
* Compiles a structured Markdown research report from swarm results.
* AI synthesis is the primary narrative section; structured extraction
* is retained as a fallback when AI is unavailable.
*/
import { CrawledSource } from "../types";
import {
CompiledReport,
ReportSource,
ContradictionEntry,
} from "../types";
import { DIMENSIONS, detectCoveredDimensions } from "../planning/dimensions";
import { synthesiseReport, detectContradictions } from "../synthesis/ai";
import { StatusFn } from "../types";
import { DepthProfile } from "../constants";
import {
MAX_SENTENCE_CHARS,
IDEAL_SENTENCE_LENGTH,
CONSENSUS_OVERLAP_FRACTION,
MAX_CONSENSUS_PHRASES,
KEY_SENTENCES_PER_SOURCE,
MAX_SOURCES_PER_DIMENSION,
REPORT_SOURCE_PREVIEW_CHARS,
} from "../constants";
/**
 * Lower-case substrings that hint a sentence carries factual content.
 * Sentence scoring in this file awards points for every marker a
 * sentence contains, so entries must stay lower-case.
 */
const INFO_MARKERS: ReadonlyArray<string> = [
  // Definitions
  "is a", "is the", "refers to", "known as", "defined as",
  // Findings and attribution
  "found that", "shows that", "research", "study", "according to",
  "estimated", "reported", "discovered", "demonstrated", "evidence",
  // Contrast and comparison
  "however", "although", "despite", "unlike", "compared to",
  // Change
  "increased", "decreased", "improved", "reduced",
  // Quantities
  "percent", "%", "million", "billion",
  // Year mentions. NOTE(review): hard-coded, so this list needs periodic refreshing.
  "2024", "2025", "2026",
];
/**
 * Picks the most informative sentences from a block of text.
 *
 * The text is split on sentence-ending punctuation, sentences outside the
 * accepted length window are discarded, and each remaining sentence is
 * scored: +2 per informational marker, +3 per matching keyword, minus a
 * penalty that grows with the distance from the ideal sentence length.
 *
 * @param text - Source text to mine.
 * @param keywords - Terms whose presence boosts a sentence (matched case-insensitively).
 * @param count - Maximum number of sentences to return; values <= 0 yield an empty list.
 * @returns The top-scoring sentences, in the order they appear in `text`.
 */
function extractKeySentences(
  text: string,
  keywords: ReadonlyArray<string>,
  count: number,
): ReadonlyArray<string> {
  if (count <= 0) return [];

  const sentences = text
    .replace(/\n+/g, " ")
    .split(/(?<=[.!?])\s+/)
    .map((s) => s.trim())
    .filter((s) => s.length > 40 && s.length < MAX_SENTENCE_CHARS);

  const lowerKw = keywords.map((k) => k.toLowerCase());

  const scored = sentences.map((s, position) => {
    const lower = s.toLowerCase();
    let score = 0;
    score += INFO_MARKERS.filter((m) => lower.includes(m)).length * 2;
    score += lowerKw.filter((kw) => lower.includes(kw)).length * 3;
    score -= Math.abs(s.length - IDEAL_SENTENCE_LENGTH) / 50;
    return { s, score, position };
  });

  return scored
    .sort((a, b) => b.score - a.score)
    .slice(0, count)
    // Restore document order using the position recorded at split time.
    // Searching the raw text for the sentence is unreliable: sentences are
    // taken from a newline-normalised copy, so any sentence that spanned a
    // line break would not be found and would be ordered first.
    .sort((a, b) => a.position - b.position)
    .map((x) => x.s);
}
/**
 * Finds two- and three-word phrases that recur across several sources.
 *
 * Each source contributes a phrase at most once. A phrase is reported when
 * it appears in at least `max(2, ceil(sources * CONSENSUS_OVERLAP_FRACTION))`
 * sources and contains at least one of the topic keywords.
 *
 * @param sources - Sources whose text is compared.
 * @param keywords - Topic keywords; only phrases containing one are kept.
 * @returns Up to `MAX_CONSENSUS_PHRASES` lines such as
 *   `"some phrase" (3 of 5 sources)`, most widely shared first. Empty when
 *   fewer than two sources are supplied.
 */
function detectConsensus(
  sources: ReadonlyArray<ReportSource>,
  keywords: ReadonlyArray<string>,
): ReadonlyArray<string> {
  if (sources.length < 2) return [];

  const lowerKw = keywords.map((k) => k.toLowerCase());
  const ngramCounts = new Map<string, number>();
  const threshold = Math.max(
    2,
    Math.ceil(sources.length * CONSENSUS_OVERLAP_FRACTION),
  );

  for (const src of sources) {
    const words = src.text
      .toLowerCase()
      // Unicode-aware: keep letters and digits from any script so accented
      // and non-Latin words are not chopped into fragments.
      .replace(/[^\p{L}\p{N}\s]/gu, " ")
      .split(/\s+/)
      .filter((w) => w.length > 3)
      .slice(0, 500);

    // Per-source set so a phrase repeated within one source counts once.
    const local = new Set<string>();
    for (let i = 0; i + 1 < words.length; i++) {
      const bigram = `${words[i]} ${words[i + 1]}`;
      local.add(bigram);
      // The trailing bigram has no third word; only extend when one exists.
      if (i + 2 < words.length) local.add(`${bigram} ${words[i + 2]}`);
    }
    for (const ng of local) ngramCounts.set(ng, (ngramCounts.get(ng) ?? 0) + 1);
  }

  return Array.from(ngramCounts.entries())
    .filter(
      ([ng, count]) =>
        count >= threshold && lowerKw.some((kw) => ng.includes(kw)),
    )
    .sort((a, b) => b[1] - a[1])
    .slice(0, MAX_CONSENSUS_PHRASES)
    .map(([ng, count]) => `"${ng}" (${count} of ${sources.length} sources)`);
}
/** Key sentences extracted from a single source for one dimension. */
interface DimGroupEntry {
  /** Citation number of the source within the report. */
  index: number;
  /** Title of the source. */
  title: string;
  /** Sentences selected from the source's text. */
  sentences: ReadonlyArray<string>;
}

/** All extracted findings belonging to one research dimension. */
interface DimGroup {
  /** Human-readable dimension name used as the section heading. */
  label: string;
  /** One entry per contributing source. */
  entries: ReadonlyArray<DimGroupEntry>;
}
/**
 * Buckets sources by research dimension and extracts key sentences for each.
 *
 * A source belongs to a dimension when its lower-cased text contains any of
 * that dimension's keywords. At most `MAX_SOURCES_PER_DIMENSION` sources are
 * kept per dimension, and sources that yield no key sentences are dropped.
 *
 * @param sources - Report sources to classify.
 * @param keywords - Topic keywords, combined with the dimension's own
 *   keywords when scoring sentences.
 * @returns One group per dimension that has at least one entry, in
 *   `DIMENSIONS` order.
 */
function groupByDimension(
  sources: ReadonlyArray<ReportSource>,
  keywords: ReadonlyArray<string>,
): ReadonlyArray<DimGroup> {
  // Lower-case each source once instead of once per dimension keyword.
  const searchable = sources.map((source) => ({
    source,
    lowerText: source.text.toLowerCase(),
  }));

  return DIMENSIONS.map((dim) => {
    const scoringKeywords = [...keywords, ...dim.keywords];
    const entries = searchable
      .filter(({ lowerText }) =>
        dim.keywords.some((kw) => lowerText.includes(kw)),
      )
      .slice(0, MAX_SOURCES_PER_DIMENSION)
      .map(({ source }) => ({
        index: source.index,
        title: source.title,
        sentences: extractKeySentences(
          source.text,
          scoringKeywords,
          KEY_SENTENCES_PER_SOURCE,
        ),
      }))
      .filter((e) => e.sentences.length > 0);
    return { label: dim.label, entries };
  }).filter((g) => g.entries.length > 0);
}
/**
 * Text tag displayed beside a source for each source tier.
 * Lookups for tiers missing from this table are expected to fall back to
 * an empty string at the call site.
 */
const TIER_BADGES: Readonly<Record<string, string>> = {
  // Abbreviated tags
  academic: "[academic]",
  government: "[gov]",
  reference: "[ref]",
  news: "[news]",
  professional: "[pro]",
  // Catch-all tiers
  general: "[general]",
  low: "[low]",
};
/**
 * Text tag displayed for each source origin. Web sources carry no tag;
 * local sources are marked explicitly.
 */
const ORIGIN_BADGES: Readonly<Record<string, string>> = {
  web: "", // intentionally blank
  local: "[local]",
};
/**
 * Depth profile used for AI synthesis and contradiction detection when the
 * caller does not supply one.
 *
 * @param depthRounds - Number of research rounds to record in the profile.
 */
function fallbackDepthProfile(depthRounds: number): DepthProfile {
  return {
    depthRounds,
    pageBudgetPerWorker: 8,
    pageBudgetPerGapWorker: 6,
    defaultContentLimit: 6000,
    searchResultsPerQuery: 10,
    maxQueriesPerWorker: 5,
    maxPagesPerDomain: 4,
    maxLinksToEvaluate: 50,
    maxLinksToFollow: 6,
    maxOutlinksPerPage: 40,
    candidatePoolMultiplier: 3,
    workerConcurrency: 3,
    maxDecompositionWorkers: 8,
    maxGapFillQueries: 6,
    ddgRateLimitMs: 2000,
    minRelevanceScore: 0.13,
    synthesisMaxSources: 30,
    synthesisSourceChars: 600,
    synthesisMaxTokens: 4000,
    contradictionMaxSources: 20,
    stagnationThreshold: 1,
    searchPages: 1,
    searchLanes: 2,
    workerFanOut: 1,
    extraEngines: [],
    linkCrawlDepth: 1,
    queryMutationThreshold: 2,
  };
}

/**
 * Converts one crawled page into a report source.
 *
 * @param c - The crawled page.
 * @param position - Zero-based position of the page; the citation index is
 *   `position + 1`.
 */
function toReportSource(c: CrawledSource, position: number): ReportSource {
  return {
    index: position + 1,
    // Prefer the post-redirect URL when one was recorded.
    url: c.finalUrl || c.url,
    title: c.title || "Untitled",
    description: c.description,
    published: c.published,
    text: c.text,
    wordCount: c.wordCount,
    sourceQuery: c.sourceQuery,
    workerRole: c.workerRole,
    workerLabel: c.workerLabel,
    domainScore: c.domainScore,
    freshnessScore: c.freshnessScore,
    tier: c.tier,
    relevanceScore: c.relevanceScore,
    origin: c.origin,
  };
}

/**
 * Compiles the final research report from swarm-collected sources.
 *
 * @param topic - Research topic; used in the title and, when `topicKws` is
 *   empty, as the source of fallback keywords.
 * @param crawled - Crawled pages, numbered in the order given.
 * @param queriesUsed - Every search query that was executed.
 * @param topicKws - Topic keywords; when empty they are derived from `topic`.
 * @param totalRounds - Number of research rounds, shown in the header and
 *   used for the fallback depth profile.
 * @param usedAI - Whether AI query planning was used (reported in the header).
 * @param enableAI - When true and at least one source exists, AI synthesis
 *   and contradiction detection are attempted.
 * @param status - Status callback forwarded to the AI helpers.
 * @param profile - Optional depth profile for the AI helpers; a built-in
 *   fallback is used when omitted.
 * @returns The Markdown report together with the sources, keywords,
 *   covered/gap dimension ids, optional AI synthesis and contradictions.
 */
export async function buildReport(
  topic: string,
  crawled: ReadonlyArray<CrawledSource>,
  queriesUsed: ReadonlyArray<string>,
  topicKws: ReadonlyArray<string>,
  totalRounds: number,
  usedAI: boolean,
  enableAI: boolean,
  status: StatusFn,
  profile?: DepthProfile,
): Promise<CompiledReport> {
  const now = new Date().toUTCString();
  const keywords: ReadonlyArray<string> =
    topicKws.length > 0 ? topicKws : extractKeywordsFromTopic(topic);

  const sources: ReportSource[] = crawled.map((c, i) => toReportSource(c, i));

  // Split the dimensions into covered and gap sets with a single lookup table.
  const coveredIds = detectCoveredDimensions(sources.map((s) => s.text));
  const coveredSet = new Set(coveredIds);
  const gapDimensions = DIMENSIONS.filter((d) => !coveredSet.has(d.id));
  const gapIds = gapDimensions.map((d) => d.id);
  const gapLabels = gapDimensions.map((d) => d.label);
  const coveredLabels = DIMENSIONS.filter((d) => coveredSet.has(d.id)).map(
    (d) => d.label,
  );

  let aiSynthesis: string | null = null;
  let contradictions: ReadonlyArray<ContradictionEntry> = [];

  if (enableAI && sources.length > 0) {
    const p = profile ?? fallbackDepthProfile(totalRounds);
    // Both AI steps are best-effort: a failure in either is swallowed so the
    // report is still produced from structured extraction alone.
    const [synthResult, contradResult] = await Promise.all([
      synthesiseReport(
        topic,
        sources,
        coveredLabels,
        gapLabels,
        status,
        p,
      ).catch(() => null),
      detectContradictions(topic, sources, status, p).catch(
        () => [] as ContradictionEntry[],
      ),
    ]);
    aiSynthesis = synthResult;
    contradictions = contradResult;
  }

  const header = buildHeader(
    topic,
    sources,
    totalRounds,
    usedAI,
    coveredIds.length,
    now,
    !!aiSynthesis,
  );
  const coverageTable = buildCoverageTable(coveredIds);
  const swarmActivity = buildSwarmActivity(sources, queriesUsed);
  const queryList = buildQueryList(queriesUsed);
  const consensus = buildConsensus(sources, keywords);

  // Sections are joined with blank lines; "---" entries are horizontal rules.
  const sections: string[] = [header, "---"];
  if (aiSynthesis) {
    sections.push(`## Research Analysis\n\n${aiSynthesis}`);
    sections.push("---");
  }
  if (contradictions.length > 0) {
    sections.push(buildContradictions(contradictions));
    sections.push("---");
  }
  sections.push(coverageTable, "---");
  sections.push(swarmActivity, "---");
  sections.push(queryList);
  if (consensus) {
    sections.push("---\n" + consensus);
  }
  const findings = buildFindings(sources, keywords);
  sections.push("---", findings, "---");
  sections.push(buildFullSources(sources), "---");
  sections.push(buildCitationIndex(sources), "---");
  sections.push(
    "*Generated by LM Studio Deep Research Plugin · Swarm Architecture · Verify important claims with primary sources.*",
  );

  return {
    markdown: sections.join("\n\n"),
    sources,
    topicKeywords: keywords,
    coveredDims: coveredIds,
    gapDims: gapIds,
    aiSynthesis: aiSynthesis ?? undefined,
    contradictions,
  };
}
/** Common English function words that carry no topical meaning. */
const STOP_WORDS = new Set([
  "the",
  "a",
  "an",
  "is",
  "in",
  "of",
  "and",
  "or",
  "for",
  "to",
  "how",
  "what",
  "why",
  "when",
  "does",
  "with",
  "from",
  "its",
]);

/**
 * Derives fallback keywords from a free-text topic.
 *
 * The topic is lower-cased, punctuation is turned into word breaks, and
 * words of two characters or fewer and stop words are discarded.
 *
 * @param topic - The research topic as entered by the user.
 * @returns At most eight keywords, in the order they appear in the topic.
 */
function extractKeywordsFromTopic(topic: string): ReadonlyArray<string> {
  return topic
    .toLowerCase()
    // Unicode-aware so accented and non-Latin words survive intact.
    // Punctuation becomes a space rather than being deleted, so hyphenated
    // terms split into their parts ("covid-19" -> "covid") instead of
    // fusing into a token that never occurs in source text.
    .replace(/[^\p{L}\p{N}\s]+/gu, " ")
    .split(/\s+/)
    .filter((w) => w.length > 2 && !STOP_WORDS.has(w))
    .slice(0, 8);
}
/**
 * Renders the report title and the block-quoted run summary beneath it.
 *
 * @param topic - Research topic shown in the title.
 * @param sources - All report sources; used for worker and origin counts.
 * @param rounds - Number of research rounds performed.
 * @param usedAI - Whether AI query planning was used.
 * @param covered - Number of research dimensions covered.
 * @param timestamp - Pre-formatted generation time.
 * @param hasSynthesis - Whether an AI narrative synthesis is included.
 * @returns The header as newline-joined Markdown.
 */
function buildHeader(
  topic: string,
  sources: ReadonlyArray<ReportSource>,
  rounds: number,
  usedAI: boolean,
  covered: number,
  timestamp: string,
  hasSynthesis: boolean,
): string {
  const distinctWorkers = Array.from(
    new Set(sources.map((src) => src.workerLabel)),
  );

  let localTotal = 0;
  for (const src of sources) {
    if (src.origin === "local") localTotal += 1;
  }
  const webTotal = sources.length - localTotal;

  // The web/local breakdown is only shown when local sources exist.
  let totalsLine = `> **Total sources:** ${sources.length}`;
  if (localTotal > 0) {
    totalsLine += ` (${webTotal} web, ${localTotal} local)`;
  }

  const planningMode = usedAI ? " Enabled" : " Fallback (dimension-based)";
  const synthesisMode = hasSynthesis ? " Enabled" : " Structured extraction";

  const lines: string[] = [];
  lines.push(`# Deep Research Report: ${esc(topic)}`);
  lines.push(``);
  lines.push(`> **Generated:** ${timestamp}`);
  lines.push(
    `> **Architecture:** Swarm (${distinctWorkers.length} parallel workers)`,
  );
  lines.push(`> **Workers:** ${distinctWorkers.join(", ")}`);
  lines.push(`> **Research rounds:** ${rounds}`);
  lines.push(totalsLine);
  lines.push(`> **AI query planning:** ${planningMode}`);
  lines.push(`> **AI narrative synthesis:** ${synthesisMode}`);
  lines.push(`> **Dimension coverage:** ${covered}/${DIMENSIONS.length}`);
  return lines.join("\n");
}
/**
 * Renders a Markdown table listing every research dimension and whether
 * the collected sources covered it.
 *
 * @param coveredIds - Ids of the dimensions that were covered.
 * @returns The table, with one row per entry in `DIMENSIONS`.
 */
function buildCoverageTable(coveredIds: ReadonlyArray<string>): string {
  const covered = new Set(coveredIds);
  // The status cell must differ between covered and uncovered dimensions;
  // plain text is used to match the bracketed text badges elsewhere in the
  // report.
  const rows = DIMENSIONS.map(
    (d) => `| ${covered.has(d.id) ? "yes" : "no"} | **${d.label}** |`,
  );
  return [
    `## Research Dimension Coverage`,
    ``,
    `| Covered | Dimension |`,
    `|---|-----------|`,
    ...rows,
  ].join("\n");
}
/**
 * Renders a table of how many sources each worker collected, followed by
 * the total number of queries executed.
 *
 * @param sources - All report sources; grouped by `workerLabel`.
 * @param queries - Every query that was executed (duplicates included in
 *   the total).
 * @returns The section as newline-joined Markdown. Workers appear in the
 *   order their first source appears.
 */
function buildSwarmActivity(
  sources: ReadonlyArray<ReportSource>,
  queries: ReadonlyArray<string>,
): string {
  const byLabel = new Map<string, number>();
  for (const s of sources)
    byLabel.set(s.workerLabel, (byLabel.get(s.workerLabel) ?? 0) + 1);

  const roleRows = Array.from(byLabel.entries())
    .map(
      ([label, count]) =>
        // Singular form for a lone source ("1 source", not "1 sources").
        `| ${label} | ${count} ${count === 1 ? "source" : "sources"} |`,
    )
    .join("\n");

  return [
    `## Swarm Activity`,
    ``,
    `| Worker | Sources Collected |`,
    `|--------|------------------|`,
    roleRows,
    ``,
    `**Total queries executed:** ${queries.length}`,
  ].join("\n");
}
/**
 * Renders the de-duplicated list of executed queries as a numbered list,
 * each query shown as inline code.
 *
 * @param queries - Every query that was executed; duplicates are collapsed,
 *   keeping first-seen order.
 * @returns The section as newline-joined Markdown.
 */
function buildQueryList(queries: ReadonlyArray<string>): string {
  // Markdown code spans do not process backslash escapes, so brackets are
  // left untouched here. Backticks inside the query are handled by widening
  // the delimiter beyond the longest backtick run, and padding with a space
  // when the query itself starts or ends with a backtick.
  const toCodeSpan = (query: string): string => {
    const backtickRuns: ReadonlyArray<string> = query.match(/`+/g) ?? [];
    const longestRun = backtickRuns.reduce(
      (max, run) => Math.max(max, run.length),
      0,
    );
    const fence = "`".repeat(longestRun + 1);
    const pad = query.startsWith("`") || query.endsWith("`") ? " " : "";
    return fence + pad + query + pad + fence;
  };

  const unique = [...new Set(queries)];
  return [
    `## Queries Executed`,
    ``,
    unique.map((q, i) => `${i + 1}. ${toCodeSpan(q)}`).join("\n"),
  ].join("\n");
}
/**
 * Renders the cross-source consensus section.
 *
 * @param sources - Sources to compare.
 * @param keywords - Topic keywords passed through to consensus detection.
 * @returns The section as Markdown, or `null` when no consensus phrases
 *   were found.
 */
function buildConsensus(
  sources: ReadonlyArray<ReportSource>,
  keywords: ReadonlyArray<string>,
): string | null {
  const phrases = detectConsensus(sources, keywords);
  if (phrases.length === 0) {
    return null;
  }

  const bullets: string[] = [];
  for (const phrase of phrases) {
    bullets.push(`- ${phrase}`);
  }

  const heading = `## Cross-Source Consensus`;
  const intro = `These concepts appeared consistently across multiple independent sources:`;
  return `${heading}\n\n${intro}\n\n${bullets.join("\n")}`;
}
/**
 * Renders the cross-source contradictions section, one sub-section per
 * detected disagreement.
 *
 * @param entries - Contradictions to render.
 * @returns The section as Markdown, with blocks separated by single blank
 *   lines.
 */
function buildContradictions(
  entries: ReadonlyArray<ContradictionEntry>,
): string {
  const SEVERITY_ICONS: Record<string, string> = {
    minor: "[minor]",
    moderate: "[moderate]",
    major: "[major]",
  };

  const rows = entries.map((e) => {
    const icon = SEVERITY_ICONS[e.severity] ?? "";
    // Omit the tag entirely for an unrecognised severity so the heading
    // does not contain a doubled space.
    const heading = icon
      ? `### ${icon} ${esc(e.claim)}`
      : `### ${esc(e.claim)}`;
    return [
      heading,
      ``,
      `- **\\[${e.sourceA.index}\\] ${esc(e.sourceA.title)}**: ${esc(e.sourceA.stance)}`,
      `- **\\[${e.sourceB.index}\\] ${esc(e.sourceB.title)}**: ${esc(e.sourceB.stance)}`,
      `- **Severity:** ${e.severity}`,
    ].join("\n");
  });

  // The "\n\n" join already supplies the blank line between blocks, so no
  // empty spacer elements are needed (they would produce runs of four
  // newlines).
  return [
    `## Cross-Source Contradictions`,
    `The following disagreements were detected between sources:`,
    ...rows,
  ].join("\n\n");
}
/**
 * Renders the per-dimension findings section: for each dimension, the key
 * sentences extracted from each contributing source as block quotes.
 *
 * @param sources - All report sources.
 * @param keywords - Topic keywords used when scoring sentences.
 * @returns The section as Markdown. Dimension sub-sections are separated by
 *   horizontal rules; a short notice is shown when nothing was extracted.
 */
function buildFindings(
  sources: ReadonlyArray<ReportSource>,
  keywords: ReadonlyArray<string>,
): string {
  const heading = `## Findings by Research Dimension`;
  const groups = groupByDimension(sources, keywords);

  if (groups.length === 0) {
    return `${heading}\n\n*No dimension-specific findings were extracted from the collected sources.*`;
  }

  const sections = groups.map((g) => {
    const entries = g.entries
      .map((e) => {
        const sentences = e.sentences.map((s) => ` > ${esc(s)}`).join("\n");
        return `**\\[${e.index}\\] ${esc(e.title)}**\n${sentences}`;
      })
      .join("\n\n");
    return `### ${g.label}\n\n${entries}`;
  });

  // Only the dimension sub-sections are separated by rules. Joining the
  // heading with the same separator would place two consecutive rules
  // directly under it.
  return `${heading}\n\n${sections.join("\n\n---\n\n")}`;
}
/**
 * Renders the detailed listing of every source: title, URL, metadata line,
 * description and a collapsible preview of the extracted text.
 *
 * @param sources - All report sources.
 * @returns The section as Markdown, with sources separated by horizontal
 *   rules.
 */
function buildFullSources(sources: ReadonlyArray<ReportSource>): string {
  // Markdown code spans do not process backslash escapes, so the query is
  // emitted verbatim. The delimiter is widened past the longest backtick
  // run, with space padding when the text starts or ends with a backtick.
  const asInlineCode = (raw: string): string => {
    const backtickRuns: ReadonlyArray<string> = raw.match(/`+/g) ?? [];
    const longestRun = backtickRuns.reduce(
      (max, run) => Math.max(max, run.length),
      0,
    );
    const fence = "`".repeat(longestRun + 1);
    const pad = raw.startsWith("`") || raw.endsWith("`") ? " " : "";
    return fence + pad + raw + pad + fence;
  };

  const entries = sources.map((s) => {
    const badge = TIER_BADGES[s.tier] ?? "";
    const originBadge = ORIGIN_BADGES[s.origin] ?? "";
    const preview = s.text
      .slice(0, REPORT_SOURCE_PREVIEW_CHARS)
      .replace(/\n+/g, " ")
      .trim();
    const isTruncated = s.text.length > REPORT_SOURCE_PREVIEW_CHARS;

    const meta = [
      `${badge}${originBadge ? " " + originBadge : ""} **${s.tier}** · Score: ${s.domainScore}/100`,
      `Relevance: ${(s.relevanceScore * 100).toFixed(0)}%`,
      s.published ? ` ${s.published}` : null,
      `~${s.wordCount.toLocaleString()} words`,
      `Worker: ${s.workerLabel}`,
      `Query: ${asInlineCode(s.sourceQuery)}`,
    ]
      .filter(Boolean)
      .join(" · ");

    // Collapse whitespace so a multi-line description stays inside the
    // single-line block quote; skip the quote when there is nothing to show.
    const description = (s.description ?? "").replace(/\s+/g, " ").trim();
    const descriptionLines = description ? [`> ${esc(description)}`, ``] : [];

    return [
      `### \\[${s.index}\\] ${esc(s.title)}`,
      `<${s.url}>`,
      ``,
      meta,
      ``,
      ...descriptionLines,
      `<details>`,
      `<summary>Extracted content (~${s.wordCount.toLocaleString()} words)</summary>`,
      ``,
      esc(preview) + (isTruncated ? "\n\n*…truncated…*" : ""),
      ``,
      `</details>`,
    ].join("\n");
  });

  return [`## Full Source Details`, ``, entries.join("\n\n---\n\n")].join("\n");
}
/**
 * Renders the citation index: one bullet per source with its number, a
 * linked title, the publication date when known, and tier/origin tags.
 *
 * @param sources - All report sources.
 * @returns The section as newline-joined Markdown.
 */
function buildCitationIndex(sources: ReadonlyArray<ReportSource>): string {
  const lines = sources.map((s) => {
    const parts = [
      `**\\[${s.index}\\]** [${esc(s.title)}](${s.url})`,
      s.published ? `*(${s.published})*` : "",
      TIER_BADGES[s.tier] ?? "",
      s.origin === "local" ? "[local]" : "",
    ];
    // Emit each citation as a list item: consecutive plain lines would be
    // merged into one paragraph by a standard Markdown renderer. Empty
    // parts are dropped so no stray or trailing spaces are produced.
    return `- ${parts.filter((part) => part.length > 0).join(" ")}`;
  });
  return [`## Citation Index`, ``, ...lines].join("\n");
}
/**
 * Backslash-escapes square brackets so the text is not parsed as Markdown
 * link syntax.
 *
 * @param text - Raw text to embed in Markdown.
 * @returns `text` with every `[` and `]` preceded by a backslash.
 */
function esc(text: string): string {
  return text.replace(/[\[\]]/g, (bracket) => "\\" + bracket);
}