<?php
/**
 * Processador de Overlap para Chunks Semânticos
 * Arquivo: overlap-processor.php
 * 
 * Responsável por adicionar overlap inteligente aos chunks preservando a semântica
 */

require_once __DIR__ . '/../config/config.php';

/**
 * Adds context overlap between consecutive semantic chunks.
 *
 * Each chunk receives a prefix taken from the end of the previous chunk and a
 * suffix taken from the beginning of the next chunk. Words are whitespace
 * delimited tokens, so accented (UTF-8) words count as a single word and the
 * reported word statistics match the text that was actually added.
 *
 * The processing log accumulates across calls on the same instance.
 */
class OverlapProcessor {
    
    /** @var string[] Timestamped log entries, in insertion order. */
    private $log = [];
    
    /**
     * Adds prefix/suffix overlap to every chunk.
     *
     * The overlap percentage is chosen from the size of the chunk being
     * processed, but the number of words taken is computed over the
     * neighbouring chunk the text is borrowed from.
     *
     * @param array $chunks Chunks with 'id', 'content', 'topic' and 'keywords' keys
     * @param string $originalText Normalized original text (currently unused, kept for API compatibility)
     * @return array Chunks with overlap applied; input is returned untouched when it has fewer than 2 chunks
     * @throws Exception If a resulting chunk fails validation
     */
    public function processOverlap($chunks, $originalText) {
        try {
            $this->addLog("Iniciando processamento de overlap...");
            $this->addLog("Total de chunks originais: " . count($chunks));
            
            if (count($chunks) < 2) {
                $this->addLog("Menos de 2 chunks - overlap não necessário");
                return $chunks;
            }
            
            // Neighbours are looked up by position, so force a 0-based list
            // even when the caller passes sparse or associative keys.
            $chunks = array_values($chunks);
            $lastIndex = count($chunks) - 1;
            $chunksWithOverlap = [];
            
            foreach ($chunks as $i => $currentChunk) {
                $chunkSize = $this->countWords($currentChunk['content']);
                
                // Overlap percentage depends on the size of the current chunk
                $overlapPercentage = $this->calculateOverlapPercentage($chunkSize);
                
                // Informational only: the real amount is computed per neighbour
                $expectedOverlapWords = round($chunkSize * ($overlapPercentage / 100));
                $this->addLog("Chunk #{$currentChunk['id']}: {$chunkSize} palavras, overlap: {$overlapPercentage}% (~{$expectedOverlapWords} palavras)");
                
                // Tail of the previous chunk (the first chunk has none)
                $prefixOverlap = '';
                if ($i > 0) {
                    $prefixOverlap = $this->createPrefixOverlap($chunks[$i - 1], $overlapPercentage);
                }
                
                // Head of the next chunk (the last chunk has none)
                $suffixOverlap = '';
                if ($i < $lastIndex) {
                    $suffixOverlap = $this->createSuffixOverlap($chunks[$i + 1], $overlapPercentage);
                }
                
                $chunksWithOverlap[] = $this->createChunkWithOverlap($currentChunk, $prefixOverlap, $suffixOverlap);
            }
            
            $this->validateOverlapResults($chunksWithOverlap);
            
            $this->addLog("Processamento de overlap concluído com sucesso");
            $this->addLog("Total de chunks com overlap: " . count($chunksWithOverlap));
            
            return $chunksWithOverlap;
            
        } catch (Exception $e) {
            // Record the failure in the log, then let the caller handle it
            $this->addLog("ERRO no processamento de overlap: " . $e->getMessage());
            throw $e;
        }
    }
    
    /**
     * Chooses the overlap percentage from the chunk size.
     *
     * @param int $chunkSize Chunk size in words
     * @return float 10.0 below 150 words, 15.0 up to 400 words, 20.0 above
     */
    private function calculateOverlapPercentage($chunkSize) {
        if ($chunkSize < 150) {
            return 10.0;
        }
        
        if ($chunkSize <= 400) {
            return 15.0;
        }
        
        return 20.0;
    }
    
    /**
     * Splits text into whitespace-delimited words, ignoring empty tokens.
     *
     * @param string $text Text to split
     * @return string[] Words in order of appearance
     */
    private function splitWords($text) {
        $text = (string) $text;
        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            // The /u pattern fails on invalid UTF-8; retry byte-wise
            $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
        }
        
        return $words === false ? [] : $words;
    }
    
    /**
     * Counts whitespace-delimited words.
     *
     * @param string $text Text to measure
     * @return int Number of words
     */
    private function countWords($text) {
        return count($this->splitWords($text));
    }
    
    /**
     * Computes how many words to borrow from a neighbouring chunk.
     *
     * The percentage-based amount is capped at 25% of the neighbour, raised
     * to at least 5 words (or 5% of the neighbour, whichever is larger) and
     * never exceeds the neighbour's own length.
     *
     * @param int $totalWords Word count of the neighbouring chunk
     * @param float $overlapPercentage Overlap percentage
     * @return int Number of words to take
     */
    private function calculateOverlapWordCount($totalWords, $overlapPercentage) {
        $overlapWords = max(1, (int) round($totalWords * ($overlapPercentage / 100)));
        $minOverlap = max(5, (int) round($totalWords * 0.05));
        $maxOverlap = (int) round($totalWords * 0.25);
        
        // The minimum wins over the maximum for very small chunks
        $overlapWords = max($minOverlap, min($overlapWords, $maxOverlap));
        
        return min($overlapWords, $totalWords);
    }
    
    /**
     * Tells whether a word closes a sentence ('.', '!' or '?' as last character).
     *
     * @param string $word Single word
     * @return bool True when the word ends with sentence punctuation
     */
    private function endsSentence($word) {
        return $word !== '' && in_array(substr($word, -1), ['.', '!', '?'], true);
    }
    
    /**
     * Splits single-space separated text after sentence punctuation.
     *
     * Punctuation stays attached to its sentence, and a period that is not
     * followed by a space (decimals such as "3.5") does not split.
     *
     * @param string $text Text whose words are separated by single spaces
     * @return string[] Sentences in order, or the whole text when splitting fails
     */
    private function splitSentences($text) {
        $sentences = preg_split('/(?<=[.!?]) /', $text, -1, PREG_SPLIT_NO_EMPTY);
        
        return $sentences === false ? [$text] : $sentences;
    }
    
    /**
     * Builds the prefix overlap from the end of the previous chunk.
     *
     * @param array $previousChunk Previous chunk
     * @param float $overlapPercentage Overlap percentage
     * @return string Overlap text, or '' when the previous chunk has no words
     */
    private function createPrefixOverlap($previousChunk, $overlapPercentage) {
        $words = $this->splitWords($previousChunk['content']);
        $totalWords = count($words);
        if ($totalWords === 0) {
            return '';
        }
        
        $overlapWords = $this->calculateOverlapWordCount($totalWords, $overlapPercentage);
        $startPosition = $totalWords - $overlapWords;
        $overlapText = implode(' ', array_slice($words, $startPosition));
        
        // The slice already starts a sentence when it begins the chunk or
        // follows a word that closed the previous sentence.
        $startsAtSentence = $startPosition === 0 || $this->endsSentence($words[$startPosition - 1]);
        if (!$startsAtSentence) {
            $sentences = $this->splitSentences($overlapText);
            // Drop the leading fragment only when complete sentences follow it;
            // otherwise keep the raw slice instead of losing the overlap.
            if (count($sentences) > 1) {
                array_shift($sentences);
                $overlapText = implode(' ', $sentences);
            }
        }
        
        return $overlapText;
    }
    
    /**
     * Builds the suffix overlap from the beginning of the next chunk.
     *
     * @param array $nextChunk Next chunk
     * @param float $overlapPercentage Overlap percentage
     * @return string Overlap text, or '' when the next chunk has no words
     */
    private function createSuffixOverlap($nextChunk, $overlapPercentage) {
        $words = $this->splitWords($nextChunk['content']);
        $totalWords = count($words);
        if ($totalWords === 0) {
            return '';
        }
        
        $overlapWords = $this->calculateOverlapWordCount($totalWords, $overlapPercentage);
        $selected = array_slice($words, 0, $overlapWords);
        $overlapText = implode(' ', $selected);
        
        // Drop the trailing fragment only when it is incomplete and at least
        // one complete sentence precedes it.
        if (!$this->endsSentence($selected[count($selected) - 1])) {
            $sentences = $this->splitSentences($overlapText);
            if (count($sentences) > 1) {
                array_pop($sentences);
                $overlapText = implode(' ', $sentences);
            }
        }
        
        return $overlapText;
    }
    
    /**
     * Builds the output chunk: content with overlap, merged keywords and word statistics.
     *
     * @param array $originalChunk Original chunk
     * @param string $prefixOverlap Overlap taken from the previous chunk
     * @param string $suffixOverlap Overlap taken from the next chunk
     * @return array Chunk with 'id', 'content', 'topic', 'keywords', 'original_words',
     *               'overlap_words', 'total_words', 'has_prefix_overlap', 'has_suffix_overlap'
     */
    private function createChunkWithOverlap($originalChunk, $prefixOverlap, $suffixOverlap) {
        $prefixOverlap = (string) $prefixOverlap;
        $suffixOverlap = (string) $suffixOverlap;
        $originalContent = $originalChunk['content'];
        
        $content = $originalContent;
        if ($prefixOverlap !== '') {
            $content = $prefixOverlap . ' ' . $content;
        }
        if ($suffixOverlap !== '') {
            $content = $content . ' ' . $suffixOverlap;
        }
        
        // Collapse every whitespace run into a single space and trim the ends
        $content = implode(' ', $this->splitWords($content));
        
        // Start from the chunk's own keywords; tolerate a missing key
        $newKeywords = isset($originalChunk['keywords']) ? (array) $originalChunk['keywords'] : [];
        
        // Enrich the keywords with terms found in the borrowed text
        $overlapText = trim($prefixOverlap . ' ' . $suffixOverlap);
        if ($overlapText !== '') {
            $overlapKeywords = $this->extractKeywordsFromOverlap($overlapText, $newKeywords);
            $newKeywords = array_unique(array_merge($newKeywords, $overlapKeywords));
        }
        
        $originalWords = $this->countWords($originalContent);
        $totalWords = $this->countWords($content);
        
        return [
            'id' => $originalChunk['id'],
            'content' => $content,
            'topic' => $originalChunk['topic'] ?? null,
            'keywords' => array_values($newKeywords), // Re-index for clean JSON output
            'original_words' => $originalWords,
            'overlap_words' => $totalWords - $originalWords,
            'total_words' => $totalWords,
            'has_prefix_overlap' => $prefixOverlap !== '',
            'has_suffix_overlap' => $suffixOverlap !== ''
        ];
    }
    
    /**
     * Lower-cases text, using mbstring when it is available.
     *
     * @param string $text Text to convert
     * @return string Lower-cased text
     */
    private function lowercase($text) {
        return function_exists('mb_strtolower') ? mb_strtolower($text, 'UTF-8') : strtolower($text);
    }
    
    /**
     * Returns the length of a string in characters when mbstring is available, in bytes otherwise.
     *
     * @param string $text Text to measure
     * @return int Length
     */
    private function textLength($text) {
        return function_exists('mb_strlen') ? mb_strlen($text, 'UTF-8') : strlen($text);
    }
    
    /**
     * Extracts up to 3 new, distinct keywords from the overlap text.
     *
     * A keyword has at least 3 characters, is not a Portuguese stopword, is
     * not numeric and is not already present (case-insensitively) in the
     * existing keywords.
     *
     * @param string $overlapText Overlap text
     * @param array $existingKeywords Keywords the chunk already has
     * @return array New keywords, lower-cased, in order of appearance
     */
    private function extractKeywordsFromOverlap($overlapText, $existingKeywords) {
        // Common Portuguese stopwords
        $stopwords = ['a', 'o', 'e', 'é', 'de', 'do', 'da', 'em', 'um', 'uma', 'para', 'com', 'não', 'que', 'se', 'na', 'no', 'por', 'mais', 'as', 'os', 'como', 'mas', 'foi', 'ao', 'ele', 'ela', 'ou', 'ser', 'ter', 'eu', 'você', 'isso', 'esse', 'essa'];
        $maxNewKeywords = 3;
        
        $normalized = $this->lowercase((string) $overlapText);
        
        // Unicode-aware tokenization keeps accented words in one piece
        $words = preg_split('/[^\p{L}\p{N}_]+/u', $normalized, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            // Invalid UTF-8: fall back to the byte-wise split
            $words = preg_split('/\W+/', $normalized, -1, PREG_SPLIT_NO_EMPTY);
        }
        if ($words === false) {
            return [];
        }
        
        // Compare against existing keywords regardless of their casing
        $existingLower = [];
        foreach ((array) $existingKeywords as $keyword) {
            if (is_scalar($keyword)) {
                $existingLower[] = $this->lowercase((string) $keyword);
            }
        }
        
        $newKeywords = [];
        foreach ($words as $word) {
            if ($this->textLength($word) < 3 || is_numeric($word)) {
                continue;
            }
            if (in_array($word, $stopwords, true)
                || in_array($word, $existingLower, true)
                || in_array($word, $newKeywords, true)) {
                continue;
            }
            
            $newKeywords[] = $word;
            if (count($newKeywords) >= $maxNewKeywords) {
                break;
            }
        }
        
        return $newKeywords;
    }
    
    /**
     * Validates the processed chunks.
     *
     * Oversized chunks (more than 1000 words) only produce a log warning.
     *
     * @param array $chunksWithOverlap Processed chunks
     * @throws Exception If a chunk is empty or has an invalid word count
     */
    private function validateOverlapResults($chunksWithOverlap) {
        foreach ($chunksWithOverlap as $chunk) {
            if (!isset($chunk['content']) || trim((string) $chunk['content']) === '') {
                throw new Exception("Chunk com ID {$chunk['id']} ficou vazio após overlap");
            }
            
            if (!isset($chunk['total_words']) || $chunk['total_words'] < 1) {
                throw new Exception("Chunk com ID {$chunk['id']} tem contagem de palavras inválida");
            }
            
            if ($chunk['total_words'] > 1000) {
                $this->addLog("AVISO: Chunk #{$chunk['id']} ficou muito grande ({$chunk['total_words']} palavras)");
            }
        }
        
        $this->addLog("Validação de overlap concluída - todos os chunks são válidos");
    }
    
    /**
     * Summarizes the overlap added to a set of chunks.
     *
     * Chunks without statistics keys (such as the untouched input returned by
     * processOverlap() for fewer than 2 chunks) are counted from their content
     * and treated as having no overlap.
     *
     * @param array $chunksWithOverlap Processed chunks
     * @return array Totals, rounded averages and 'overlap_efficiency'
     *               (overlap words as a percentage of all words, 2 decimals)
     */
    public function getOverlapStats($chunksWithOverlap) {
        $totalOverlapWords = 0;
        $totalWordsBefore = 0;
        $totalWordsAfter = 0;
        $chunksWithPrefix = 0;
        $chunksWithSuffix = 0;
        
        foreach ($chunksWithOverlap as $chunk) {
            $originalWords = $chunk['original_words'] ?? $this->countWords($chunk['content'] ?? '');
            
            $totalOverlapWords += $chunk['overlap_words'] ?? 0;
            $totalWordsBefore += $originalWords;
            $totalWordsAfter += $chunk['total_words'] ?? $originalWords;
            
            if (!empty($chunk['has_prefix_overlap'])) {
                $chunksWithPrefix++;
            }
            
            if (!empty($chunk['has_suffix_overlap'])) {
                $chunksWithSuffix++;
            }
        }
        
        $totalChunks = count($chunksWithOverlap);
        $avgWordsBeforeOverlap = $totalChunks > 0 ? round($totalWordsBefore / $totalChunks) : 0;
        $avgWordsAfterOverlap = $totalChunks > 0 ? round($totalWordsAfter / $totalChunks) : 0;
        
        // Divide by the exact word total: the rounded average would skew the
        // ratio and can round down to zero.
        $overlapEfficiency = ($totalOverlapWords > 0 && $totalWordsAfter > 0)
            ? round(($totalOverlapWords / $totalWordsAfter) * 100, 2)
            : 0;
        
        return [
            'total_chunks_processed' => $totalChunks,
            'total_overlap_words_added' => $totalOverlapWords,
            'chunks_with_prefix_overlap' => $chunksWithPrefix,
            'chunks_with_suffix_overlap' => $chunksWithSuffix,
            'avg_words_before_overlap' => $avgWordsBeforeOverlap,
            'avg_words_after_overlap' => $avgWordsAfterOverlap,
            'overlap_efficiency' => $overlapEfficiency
        ];
    }
    
    /**
     * Returns the processing log.
     *
     * @return array Log entries
     */
    public function getLog() {
        return $this->log;
    }
    
    /**
     * Appends a timestamped entry to the log.
     *
     * @param string $message Log message
     */
    private function addLog($message) {
        $timestamp = date('H:i:s');
        $this->log[] = "[$timestamp] $message";
    }
}
?>
