// Project Files
// src/utils/summarizer.ts
/**
* @module Summarizer
* Utility functions for extractive text summarization and full-text extraction.
*/
/** A sentence together with the data needed to rank and re-order it. */
interface ScoredSentence {
  /** Zero-based position of the sentence in the source text. */
  index: number;
  /** Trimmed sentence text, including its closing punctuation. */
  text: string;
  /** Number of whitespace-separated words in `text`. */
  wordCount: number;
  /** Ranking score: length component plus position bonus. */
  score: number;
}

/**
 * Splits single-line text into trimmed sentences.
 *
 * A sentence ends at a run of `.`, `!` or `?` (optionally followed by closing
 * quotes or brackets) that is itself followed by whitespace or the end of the
 * text. Requiring that boundary keeps decimals such as "3.14" intact and keeps
 * runs such as "?!" or "..." attached to their sentence. Trailing text with no
 * closing punctuation becomes the final sentence.
 */
function splitIntoSentences(text: string): string[] {
  const pieces = text.match(/[\s\S]+?(?:[.!?]+["')\]]*(?=\s|$)|$)/g);
  if (pieces === null) {
    return [];
  }
  return pieces.map((piece) => piece.trim()).filter((piece) => piece.length > 0);
}

/** Counts whitespace-separated words; blank text counts as zero words. */
function countWords(text: string): number {
  const trimmed = text.trim();
  return trimmed.length === 0 ? 0 : trimmed.split(/\s+/).length;
}

/**
 * Produces a basic extractive summary of a block of raw text.
 *
 * The text is split into sentences and each sentence is scored by its length
 * (`5 + 0.2 * words`, capped at 10) plus a bonus of 4 when it is one of the
 * first three or last two sentences. Sentences are taken in descending score
 * order while they fit the word budget and the sentence cap, and the selected
 * sentences are returned in their original document order.
 *
 * If not even the best-scoring sentence fits the budget, that sentence is
 * truncated to the budget instead of returning an empty summary.
 *
 * @param rawText The text to summarize.
 * @param maxLengthWords Upper bound on the number of words in the summary.
 *   Defaults to 300. The summary is additionally capped at
 *   `ceil(maxLengthWords / 30)` sentences.
 * @returns The summary text; the message "No content provided to summarize."
 *   when `rawText` is empty or whitespace only; an empty string when
 *   `maxLengthWords` is NaN or less than 1.
 */
export function summarize(rawText: string, maxLengthWords: number = 300): string {
  if (!rawText || rawText.trim().length === 0) {
    return "No content provided to summarize.";
  }
  // Written as a negated comparison so that NaN is rejected as well.
  if (!(maxLengthWords >= 1)) {
    return "";
  }

  // Collapse line breaks so sentence detection works on a single line.
  const cleanedText = rawText.replace(/[\r\n]+/g, " ").trim();
  const sentences = splitIntoSentences(cleanedText);
  const conclusionStart = Math.max(1, sentences.length - 2);

  const candidates: ScoredSentence[] = sentences.map((text, index) => {
    const wordCount = countWords(text);
    const lengthScore = Math.min(10, 5 + wordCount * 0.2);
    const positionBonus = index < 3 || index >= conclusionStart ? 4 : 0;
    return { index, text, wordCount, score: lengthScore + positionBonus };
  });

  // Array.prototype.sort is stable, so equal scores keep document order.
  const ranked = [...candidates].sort((a, b) => b.score - a.score);

  // Rough estimate of 30 words per sentence bounds the sentence count.
  const maxSentences = Math.ceil(maxLengthWords / 30);
  const chosen: ScoredSentence[] = [];
  let usedWords = 0;

  for (const candidate of ranked) {
    const fits =
      usedWords + candidate.wordCount <= maxLengthWords && chosen.length < maxSentences;
    if (fits) {
      chosen.push(candidate);
      usedWords += candidate.wordCount;
    } else if (chosen.length > 0) {
      // Budget or sentence cap reached after at least one pick: stop here.
      break;
    }
  }

  if (chosen.length === 0) {
    // Every sentence exceeds the budget; fall back to a truncated top sentence
    // so non-empty input never yields an empty summary.
    const top = ranked[0];
    if (top === undefined) {
      return "";
    }
    return top.text.split(/\s+/).slice(0, Math.floor(maxLengthWords)).join(" ");
  }

  // Restore document order so the summary reads naturally.
  return chosen
    .sort((a, b) => a.index - b.index)
    .map((candidate) => candidate.text)
    .join(" ");
}
/**
 * Returns the given text in full, with no summarizing or filtering applied.
 * Intended for callers that need to preserve as much context as possible.
 *
 * The only normalization performed is replacing each run of line breaks
 * (`\r` / `\n`) with a single space and trimming the ends; spaces and tabs
 * inside the text are left untouched.
 *
 * @param rawText The raw, extracted text block.
 * @returns The single-line text, or an empty string when `rawText` is empty
 *   or contains only whitespace.
 */
export function extractFullText(rawText: string): string {
  const hasContent = Boolean(rawText) && rawText.trim().length > 0;
  return hasContent ? rawText.replace(/[\r\n]+/g, ' ').trim() : "";
}