import { getSentenceEndingRegex } from '../languages';

// Default upper bound for a chunk's length, measured with String#length
// (UTF-16 code units). Used when the caller does not pass `maxChunkLength`.
const MAX_CHUNK_LENGTH = 700;

/**
 * Splits text into chunks made of whole sentences, packing as many
 * consecutive sentences into each chunk as the length limit allows.
 *
 * The text is split on the regex returned by `getSentenceEndingRegex(language)`;
 * every piece is trimmed and empty pieces are dropped. Sentences that share a
 * chunk are joined with a single space.
 *
 * The limit is best-effort. A chunk can be longer than `maxChunkLength` when:
 *   - a single sentence is itself longer than the limit (sentences are never cut);
 *   - a fragment starts with digits followed by `.` or `:` (e.g. a list marker
 *     such as "1."), or ends with `:` — such fragments are always appended to
 *     the current chunk without a length check.
 *
 * @param {string|null|undefined} text - Text to split; `null`/`undefined` yields `[]`.
 * @param {*} language - Passed through unchanged to `getSentenceEndingRegex`.
 * @param {number} [maxChunkLength=MAX_CHUNK_LENGTH] - Target maximum chunk length.
 * @returns {string[]} The chunks, in original order; empty if there are no sentences.
 */
export const splitTextIntoChunks = (
    text,
    language,
    maxChunkLength = MAX_CHUNK_LENGTH
) => {
    // Missing text contains no sentences; return early instead of throwing on `.split`.
    if (text === null || text === undefined) return [];

    const sentenceEndingRegex = getSentenceEndingRegex(language);
    const sentences = text
        .split(sentenceEndingRegex)
        .map((s) => s.trim())
        .filter((s) => s);

    // Fragments that are never allowed to start a new chunk on their own.
    const leadingNumberPattern = /^\d+[.:]/;
    const trailingColonPattern = /:$/;

    const chunks = [];
    let currentChunk = '';

    for (const sentence of sentences) {
        // What the current chunk would look like with this sentence appended.
        const candidate = currentChunk
            ? `${currentChunk} ${sentence}`
            : sentence;

        const alwaysAppend =
            leadingNumberPattern.test(sentence) ||
            trailingColonPattern.test(sentence);

        if (alwaysAppend || candidate.length <= maxChunkLength) {
            currentChunk = candidate;
            continue;
        }

        // The sentence does not fit: close the current chunk (if any) and
        // start a new one with this sentence, even if it is over the limit.
        if (currentChunk) chunks.push(currentChunk);
        currentChunk = sentence;
    }

    if (currentChunk) chunks.push(currentChunk);

    return chunks;
};
