"""
Advanced Natural Language Processing Engine
Text analysis with sentiment, entities, summarization, and more.
"""

import asyncio
import json
import logging
import re
import threading
from collections import Counter, defaultdict
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

# Optional third-party NLP back-ends.
# Each group below is imported inside its own try/except so that a missing
# package never prevents this module from loading.  The outcome is recorded in
# a module-level *_AVAILABLE flag that the rest of the file checks before
# touching the corresponding library.

# spaCy: consulted via SPACY_AVAILABLE before spacy.load() is called.
try:
    import spacy
    from spacy import displacy
    SPACY_AVAILABLE = True
except ImportError:
    SPACY_AVAILABLE = False
    print("spaCy not available. Advanced NLP features will be limited.")

# Hugging Face transformers (plus torch): consulted via TRANSFORMERS_AVAILABLE
# before any pipeline(...) is built.  Both imports must succeed for the flag
# to be True.
try:
    from transformers import pipeline, AutoTokenizer, AutoModel
    import torch
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("Transformers not available. Deep learning NLP features will be limited.")

# TextBlob: only the availability flag is set here; no message is printed when
# the package is missing.
try:
    from textblob import TextBlob
    TEXTBLOB_AVAILABLE = True
except ImportError:
    TEXTBLOB_AVAILABLE = False

# NLTK: NLTK_AVAILABLE gates the use of sent_tokenize for sentence splitting.
# Importing succeeds even if the NLTK data files have not been downloaded, so
# callers still have to cope with runtime errors from the tokenizers.
try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    from nltk.chunk import ne_chunk
    from nltk.tag import pos_tag
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False

# Configure root logging at INFO on import and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class TextAnalysisResult:
    """Aggregated result of analysing one piece of text.

    ``text``, ``sentiment``, ``entities`` and ``keywords`` are required; the
    remaining fields default to ``None`` when the corresponding analysis was
    not performed.
    """
    # The text that was analysed.
    text: str
    # Sentiment summary with keys 'polarity', 'subjectivity', 'label' and
    # 'confidence'.  'label' is a string, the other values are numeric.
    sentiment: Dict[str, Any]
    # One dict per detected entity with keys 'text', 'label', 'start', 'end'
    # and 'confidence'.
    entities: List[Dict[str, Any]]
    # (keyword, score) pairs, highest score first.
    keywords: List[Tuple[str, float]]
    # Summary text, or None when no summary was produced.
    summary: Optional[str] = None
    # Detected language name, or None when not computed.
    language: Optional[str] = None
    # Readability score, or None when not computed.
    readability_score: Optional[float] = None
    # (word, tag) pairs from part-of-speech tagging, or None when not computed.
    pos_tags: Optional[List[Tuple[str, str]]] = None

@dataclass  
class SentimentResult:
    """Outcome of a single sentiment analysis.

    Fields are declared in the order (polarity, subjectivity, label,
    confidence); code in this file also constructs instances positionally in
    that order.
    """
    polarity: float  # -1 (negative) to 1 (positive)
    subjectivity: float  # 0 (objective) to 1 (subjective)
    label: str  # 'positive', 'negative', 'neutral'
    confidence: float  # how sure the analyser is about `label`; larger means more certain

class BasicNLPProcessor:
    """Basic NLP processing without external dependencies."""
    
    def __init__(self):
        self.stop_words = {
            'english': {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 
                       'of', 'with', 'by', 'this', 'that', 'is', 'are', 'was', 'were', 'be', 
                       'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 
                       'could', 'should', 'may', 'might', 'can', 'shall', 'must', 'i', 'you', 
                       'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}
        }
        self.positive_words = {
            'good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic', 'awesome',
            'love', 'like', 'enjoy', 'happy', 'pleased', 'satisfied', 'perfect', 'best'
        }
        self.negative_words = {
            'bad', 'terrible', 'awful', 'horrible', 'worst', 'hate', 'dislike', 'angry',
            'sad', 'disappointed', 'frustrated', 'annoyed', 'poor', 'fail', 'problem'
        }
    
    def tokenize(self, text: str) -> List[str]:
        """Basic tokenization."""
        # Remove punctuation and convert to lowercase
        text = re.sub(r'[^\w\s]', '', text.lower())
        return text.split()
    
    def remove_stop_words(self, tokens: List[str], language: str = 'english') -> List[str]:
        """Remove stop words."""
        stop_words = self.stop_words.get(language, set())
        return [token for token in tokens if token not in stop_words]
    
    def basic_sentiment(self, text: str) -> SentimentResult:
        """Basic rule-based sentiment analysis."""
        tokens = self.tokenize(text)
        
        positive_count = sum(1 for token in tokens if token in self.positive_words)
        negative_count = sum(1 for token in tokens if token in self.negative_words)
        
        total_words = len(tokens)
        if total_words == 0:
            return SentimentResult(0.0, 0.0, 'neutral', 0.5)
        
        polarity = (positive_count - negative_count) / total_words
        
        # Determine label
        if polarity > 0.1:
            label = 'positive'
        elif polarity < -0.1:
            label = 'negative'  
        else:
            label = 'neutral'
        
        confidence = min(abs(polarity) * 2 + 0.5, 1.0)
        subjectivity = min((positive_count + negative_count) / total_words * 2, 1.0)
        
        return SentimentResult(polarity, subjectivity, label, confidence)
    
    def extract_keywords(self, text: str, top_k: int = 10) -> List[Tuple[str, float]]:
        """Extract keywords using TF-IDF-like approach."""
        tokens = self.tokenize(text)
        tokens = self.remove_stop_words(tokens)
        
        # Filter short tokens
        tokens = [token for token in tokens if len(token) > 2]
        
        if not tokens:
            return []
        
        # Calculate frequencies
        word_freq = Counter(tokens)
        max_freq = max(word_freq.values())
        
        # Normalize and score
        keywords = []
        for word, freq in word_freq.items():
            score = freq / max_freq
            keywords.append((word, score))
        
        # Sort by score and return top_k
        keywords.sort(key=lambda x: x[1], reverse=True)
        return keywords[:top_k]

class AdvancedNLPProcessor:
    """NLP processing backed by optional third-party models.

    Models are loaded once at construction time into ``self.models`` under
    the keys 'spacy', 'sentiment', 'summarization', 'ner' and
    'classification'.  A model that cannot be loaded is simply absent from
    the dict, and the methods below degrade to rule-based fallbacks.
    """

    # (label, compiled pattern, fixed confidence) triples for the regex-only
    # NER fallback; they are applied in this order.
    _ENTITY_PATTERNS = (
        ('EMAIL', re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'), 0.95),
        ('PHONE', re.compile(r'\b\d{3}-?\d{3}-?\d{4}\b'), 0.9),
        ('DATE', re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'), 0.85),
    )

    # Key in ``self.models`` -> transformers pipeline task name.
    _PIPELINE_TASKS = (
        ('sentiment', "sentiment-analysis"),
        ('summarization', "summarization"),
        ('ner', "ner"),
        ('classification', "zero-shot-classification"),
    )

    def __init__(self):
        self.models = {}
        self._initialize_models()

    def _initialize_models(self):
        """Load every available model into ``self.models``.

        Failures are logged as warnings and never raised, so construction
        always succeeds.
        """
        if SPACY_AVAILABLE:
            try:
                self.models['spacy'] = spacy.load("en_core_web_sm")
            except OSError:
                logger.warning("spaCy English model not found. Some features may be limited.")

        if TRANSFORMERS_AVAILABLE:
            # Load each pipeline independently so that one failure does not
            # discard the pipelines that would have loaded after it.
            for model_key, task in self._PIPELINE_TASKS:
                try:
                    self.models[model_key] = pipeline(task)
                except Exception as e:
                    logger.warning(
                        f"Error initializing transformers models ({task}): {str(e)}"
                    )

    def advanced_sentiment_analysis(self, text: str) -> SentimentResult:
        """Classify sentiment with the transformers pipeline when loaded.

        A 'POSITIVE' label maps to a positive polarity equal to the model
        score; any other label is treated as negative.  If the model is
        missing or raises, the word-list scorer of ``BasicNLPProcessor`` is
        used instead.
        """
        if 'sentiment' in self.models:
            try:
                result = self.models['sentiment'](text)[0]

                if result['label'] == 'POSITIVE':
                    polarity = result['score']
                    label = 'positive'
                else:
                    polarity = -result['score']
                    label = 'negative'

                return SentimentResult(
                    polarity=polarity,
                    subjectivity=0.8,  # fixed placeholder; the model reports none
                    label=label,
                    confidence=result['score']
                )
            except Exception as e:
                logger.error(f"Error in advanced sentiment analysis: {str(e)}")

        # Fallback to basic sentiment
        return BasicNLPProcessor().basic_sentiment(text)

    def named_entity_recognition(self, text: str) -> List[Dict[str, Any]]:
        """Extract named entities, trying spaCy, then transformers, then regex.

        The first back-end that runs without raising supplies the result,
        even if that result is empty.  Each entity is a dict with keys
        'text', 'label', 'start', 'end' and 'confidence'.
        """
        # Each back-end builds its own list so that a failure part-way
        # through never leaks partial results into the next back-end's output.
        if 'spacy' in self.models:
            try:
                doc = self.models['spacy'](text)
                return [
                    {
                        'text': ent.text,
                        'label': ent.label_,
                        'start': ent.start_char,
                        'end': ent.end_char,
                        'confidence': 0.9  # spaCy doesn't provide confidence scores
                    }
                    for ent in doc.ents
                ]
            except Exception as e:
                logger.error(f"Error in spaCy NER: {str(e)}")

        if 'ner' in self.models:
            try:
                return [
                    {
                        'text': result['word'],
                        'label': result['entity'],
                        'start': result['start'],
                        'end': result['end'],
                        # Coerce to a built-in float so the value is plain data.
                        'confidence': float(result['score'])
                    }
                    for result in self.models['ner'](text)
                ]
            except Exception as e:
                logger.error(f"Error in transformers NER: {str(e)}")

        # Basic pattern-based NER fallback
        return self._basic_ner(text)

    def _basic_ner(self, text: str) -> List[Dict[str, Any]]:
        """Find e-mail addresses, phone numbers and dates with regexes.

        Results are grouped by type (all EMAIL, then PHONE, then DATE), each
        group in order of appearance.  Confidence is a fixed value per type.
        """
        return [
            {
                'text': match.group(),
                'label': label,
                'start': match.start(),
                'end': match.end(),
                'confidence': confidence
            }
            for label, pattern, confidence in self._ENTITY_PATTERNS
            for match in pattern.finditer(text)
        ]

    def text_summarization(self, text: str, max_length: int = 130,
                          min_length: int = 30) -> str:
        """Summarise ``text`` with the transformers pipeline when loaded.

        With the model loaded, text of three sentences or fewer is returned
        unchanged.  If the model is missing or raises, a three-sentence
        extractive summary of the full text is returned instead.

        Args:
            text: Text to summarise.
            max_length: Forwarded to the summarisation pipeline.
            min_length: Forwarded to the summarisation pipeline.
        """
        if 'summarization' in self.models:
            try:
                sentences = self._split_into_sentences(text)
                if len(sentences) <= 3:
                    return text  # Too short to summarize

                # Rough, character-based cap on what is sent to the model.
                # A separate variable keeps the full text for the fallback.
                max_input_length = 1024
                model_input = text[:max_input_length]

                summary = self.models['summarization'](
                    model_input,
                    max_length=max_length,
                    min_length=min_length,
                    do_sample=False
                )[0]['summary_text']

                return summary
            except Exception as e:
                logger.error(f"Error in text summarization: {str(e)}")

        # Fallback to extractive summarization
        return self._extractive_summary(text, num_sentences=3)

    def _split_into_sentences(self, text: str) -> List[str]:
        """Split ``text`` into sentences.

        Uses NLTK's ``sent_tokenize`` when available; otherwise splits on
        runs of '.', '!' and '?' (which discards the terminators) and drops
        empty fragments.
        """
        if NLTK_AVAILABLE:
            try:
                return sent_tokenize(text)
            except Exception as e:
                # e.g. the tokenizer data has not been downloaded; fall
                # through to the regex splitter instead of failing.
                logger.debug(f"NLTK sentence tokenization failed: {str(e)}")

        # Basic sentence splitting
        fragments = re.split(r'[.!?]+', text)
        return [fragment.strip() for fragment in fragments if fragment.strip()]

    def _extractive_summary(self, text: str, num_sentences: int = 3) -> str:
        """Pick the ``num_sentences`` highest-scoring sentences of ``text``.

        A sentence's score is the sum of the document-wide frequencies of its
        words.  Selected sentences are joined with single spaces in their
        original order.  Text with ``num_sentences`` sentences or fewer is
        returned unchanged.
        """
        sentences = self._split_into_sentences(text)

        if len(sentences) <= num_sentences:
            return text

        # Tokenise each sentence once and reuse the result for both passes.
        sentence_words = [re.findall(r'\w+', sentence.lower()) for sentence in sentences]

        word_freq = Counter()
        for words in sentence_words:
            word_freq.update(words)

        scores = [sum(word_freq[word] for word in words) for words in sentence_words]

        # Highest score first; equal scores prefer the later sentence.
        ranked = sorted(
            range(len(sentences)),
            key=lambda index: (scores[index], index),
            reverse=True,
        )
        chosen = sorted(ranked[:num_sentences])  # Maintain original order

        return ' '.join(sentences[index] for index in chosen)

class NLPEngine:
    """
    Comprehensive Natural Language Processing Engine.
    Supports sentiment analysis, entity recognition, summarization, and more.

    Results of ``analyze_text`` are memoised in ``analysis_cache`` (unbounded;
    call ``clear_cache`` to release it) and timing totals are kept in
    ``processing_stats``.
    """

    # (language, indicator words) pairs for the heuristic language detector.
    # The order is the tie-break priority: earlier languages win ties.
    _LANGUAGE_INDICATORS = (
        ('english', frozenset({'the', 'and', 'is', 'to', 'of', 'a', 'in', 'that', 'it', 'with'})),
        ('spanish', frozenset({'el', 'de', 'que', 'y', 'a', 'en', 'un', 'es', 'se', 'no'})),
        ('french', frozenset({'le', 'de', 'et', 'à', 'un', 'il', 'être', 'en', 'avoir'})),
    )

    def __init__(self, use_advanced_models: bool = True):
        """Create the engine.

        Args:
            use_advanced_models: When True an ``AdvancedNLPProcessor`` is
                created (which attempts to load the optional models);
                otherwise only the dependency-free processor is used.
        """
        self.use_advanced_models = use_advanced_models
        self.basic_processor = BasicNLPProcessor()
        self.advanced_processor = AdvancedNLPProcessor() if use_advanced_models else None

        self.analysis_cache = {}
        self.processing_stats = {
            'total_texts_processed': 0,
            'total_processing_time': 0.0,
            'average_processing_time': 0.0
        }
        # batch_analyze runs analyses on several threads at once; this lock
        # keeps the read-modify-write updates of processing_stats consistent.
        self._stats_lock = threading.Lock()

    async def analyze_text(self, text: str, include_summary: bool = True,
                          include_entities: bool = True,
                          include_keywords: bool = True) -> TextAnalysisResult:
        """Run the full analysis on ``text`` and cache the result.

        Sentiment, language, readability and POS tags are always computed.
        Entities, keywords and the summary are controlled by the
        ``include_*`` flags; a summary is only attempted for text longer than
        200 characters.  A cache hit returns the stored result immediately
        and does not update the processing statistics.
        """
        start_time = datetime.now()

        logger.info(f"Analyzing text of length {len(text)}")

        # Key on the actual values rather than on hash(...) of a concatenated
        # string, so two different requests can never share a cache entry.
        cache_key = (text, include_summary, include_entities, include_keywords)
        cached = self.analysis_cache.get(cache_key)
        if cached is not None:
            return cached

        # Sentiment analysis
        if self.advanced_processor:
            sentiment_result = self.advanced_processor.advanced_sentiment_analysis(text)
        else:
            sentiment_result = self.basic_processor.basic_sentiment(text)

        sentiment = {
            'polarity': sentiment_result.polarity,
            'subjectivity': sentiment_result.subjectivity,
            'label': sentiment_result.label,
            'confidence': sentiment_result.confidence
        }

        # Named entity recognition
        entities = []
        if include_entities:
            if self.advanced_processor:
                entities = self.advanced_processor.named_entity_recognition(text)
            else:
                # The basic processor may optionally provide `_basic_ner`;
                # without it no entities are reported.
                basic_ner = getattr(self.basic_processor, '_basic_ner', None)
                entities = basic_ner(text) if callable(basic_ner) else []

        # Keyword extraction
        keywords = []
        if include_keywords:
            keywords = self.basic_processor.extract_keywords(text)

        # Text summarization
        summary = None
        if include_summary and len(text) > 200:
            if self.advanced_processor:
                summary = self.advanced_processor.text_summarization(text)
            else:
                # No summariser without the advanced processor: truncate.
                summary = text[:200] + "..."

        result = TextAnalysisResult(
            text=text,
            sentiment=sentiment,
            entities=entities,
            keywords=keywords,
            summary=summary,
            language=self._detect_language(text),
            readability_score=self._calculate_readability(text),
            pos_tags=self._basic_pos_tagging(text)
        )

        # Cache result
        self.analysis_cache[cache_key] = result

        # Update stats
        processing_time = (datetime.now() - start_time).total_seconds()
        with self._stats_lock:
            stats = self.processing_stats
            stats['total_texts_processed'] += 1
            stats['total_processing_time'] += processing_time
            stats['average_processing_time'] = (
                stats['total_processing_time'] / stats['total_texts_processed']
            )

        logger.info(f"Text analysis complete in {processing_time:.2f}s")

        return result

    def _analyze_in_thread(self, text: str) -> TextAnalysisResult:
        """Run ``analyze_text`` to completion on the calling worker thread."""
        # The coroutine is created here, inside the worker, and driven by a
        # private event loop owned by that thread.
        return asyncio.run(self.analyze_text(text))

    async def batch_analyze(self, texts: List[str], max_workers: int = 4) -> List[TextAnalysisResult]:
        """Analyze multiple texts in parallel.

        Each text is analysed on a thread-pool worker while this coroutine
        awaits the outcomes, so the caller's event loop is not blocked.

        Returns:
            One result per input text, in input order.  A text whose analysis
            raised is represented by a neutral placeholder result with an
            empty ``text``; the error is logged.
        """
        logger.info(f"Starting batch analysis of {len(texts)} texts")

        loop = asyncio.get_running_loop()
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            pending = [
                loop.run_in_executor(executor, self._analyze_in_thread, text)
                for text in texts
            ]
            outcomes = await asyncio.gather(*pending, return_exceptions=True)

        results = []
        for outcome in outcomes:
            if isinstance(outcome, Exception):
                logger.error(f"Error in batch analysis: {str(outcome)}")
                # Create empty result for failed analysis
                results.append(TextAnalysisResult(
                    text="",
                    sentiment={'polarity': 0, 'subjectivity': 0, 'label': 'neutral', 'confidence': 0},
                    entities=[],
                    keywords=[]
                ))
            elif isinstance(outcome, BaseException):
                # Cancellation and similar are not analysis errors; propagate.
                raise outcome
            else:
                results.append(outcome)

        logger.info(f"Batch analysis complete. Processed {len(results)} texts")
        return results

    def _detect_language(self, text: str) -> str:
        """Guess the language of ``text`` from common function words.

        Counts how many words belong to each language's indicator set and
        returns the language with the most hits ('english', 'spanish' or
        'french'; ties go to the earlier one in that order).  Returns
        'unknown' when no indicator word occurs at all.
        """
        # \w+ tokenising strips attached punctuation, so "the," still counts.
        words = re.findall(r'\w+', text.lower())

        best_language = 'unknown'
        best_count = 0
        for language, indicators in self._LANGUAGE_INDICATORS:
            count = sum(1 for word in words if word in indicators)
            # Strictly greater: earlier languages keep the win on a tie.
            if count > best_count:
                best_language, best_count = language, count

        return best_language

    def _calculate_readability(self, text: str) -> float:
        """Calculate basic readability score (Flesch Reading Ease approximation).

        Returns a value clamped to the range 0.0-100.0, or 0.0 when the text
        has no words or no sentences.
        """
        words = text.split()
        word_count = len(words)
        # Count only non-empty fragments: splitting "A. B." also yields a
        # trailing empty string, which is not a sentence.
        sentence_count = sum(
            1 for fragment in re.split(r'[.!?]+', text) if fragment.strip()
        )

        if sentence_count == 0 or word_count == 0:
            return 0.0

        syllables = sum(self._count_syllables(word) for word in words)

        # Flesch Reading Ease formula approximation
        score = 206.835 - 1.015 * (word_count / sentence_count) - 84.6 * (syllables / word_count)
        return max(0.0, min(100.0, score))

    def _count_syllables(self, word: str) -> int:
        """Count syllables in a word (approximation).

        Counts groups of consecutive vowels (a, e, i, o, u, y), subtracts one
        for a trailing silent 'e', and never returns less than 1.
        """
        # Drop punctuation first so that "like." still ends with 'e'.
        word = re.sub(r'\W+', '', word.lower())
        vowels = 'aeiouy'
        syllable_count = 0
        previous_was_vowel = False

        for char in word:
            is_vowel = char in vowels
            if is_vowel and not previous_was_vowel:
                syllable_count += 1
            previous_was_vowel = is_vowel

        # Handle silent 'e'
        if word.endswith('e') and syllable_count > 1:
            syllable_count -= 1

        return max(1, syllable_count)

    def _basic_pos_tagging(self, text: str) -> List[Tuple[str, str]]:
        """Basic part-of-speech tagging.

        Tags every ``\\w+`` word with the first matching rule: closed-class
        word lists (case-insensitive), then suffix rules, then
        capitalisation, defaulting to 'NN'.  Returns (word, tag) pairs in
        text order.
        """
        pos_tags = []

        for word in re.findall(r'\w+', text):
            word_lower = word.lower()

            if word_lower in ['the', 'a', 'an']:
                tag = 'DT'  # Determiner
            elif word_lower in ['is', 'are', 'was', 'were', 'am']:
                tag = 'VB'  # Verb
            elif word_lower in ['and', 'or', 'but']:
                tag = 'CC'  # Coordinating conjunction
            elif word_lower in ['in', 'on', 'at', 'to', 'for', 'of', 'with']:
                tag = 'IN'  # Preposition
            elif word.endswith('ing'):
                tag = 'VBG'  # Gerund
            elif word.endswith('ed'):
                tag = 'VBD'  # Past tense verb
            elif word.endswith('ly'):
                tag = 'RB'  # Adverb
            elif word.endswith('s') and len(word) > 2:
                tag = 'NNS'  # Plural noun
            elif word[0].isupper():
                tag = 'NNP'  # Proper noun
            else:
                tag = 'NN'  # Noun (default)

            pos_tags.append((word, tag))

        return pos_tags

    def compare_texts(self, text1: str, text2: str) -> Dict[str, float]:
        """Compare two texts for similarity.

        Both texts are reduced to their sets of distinct tokens.  Returns
        'jaccard_similarity', 'cosine_similarity' (over binary word-presence
        vectors) and 'word_overlap_ratio' (shared words divided by the larger
        vocabulary).  Each value is 0.0 when its denominator would be zero.
        """
        tokens1 = set(self.basic_processor.tokenize(text1))
        tokens2 = set(self.basic_processor.tokenize(text2))

        shared = len(tokens1 & tokens2)
        union = len(tokens1 | tokens2)
        jaccard_similarity = shared / union if union > 0 else 0.0

        # For binary presence vectors the dot product is the number of shared
        # words and each magnitude is the square root of the set size.
        magnitude_product = (len(tokens1) ** 0.5) * (len(tokens2) ** 0.5)
        cosine_similarity = shared / magnitude_product if magnitude_product > 0 else 0.0

        larger_vocabulary = max(len(tokens1), len(tokens2))

        return {
            'jaccard_similarity': jaccard_similarity,
            'cosine_similarity': cosine_similarity,
            'word_overlap_ratio': shared / larger_vocabulary if larger_vocabulary > 0 else 0.0
        }

    def get_processing_stats(self) -> Dict[str, Any]:
        """Get a snapshot copy of the processing statistics."""
        with self._stats_lock:
            return self.processing_stats.copy()

    def clear_cache(self):
        """Clear analysis cache."""
        self.analysis_cache.clear()
        logger.info("Analysis cache cleared")

# Example usage and testing
async def main():
    """Run a console walkthrough of the NLP engine's main features."""
    print("=== NLP Engine Demo ===")

    # Sample inputs: positive, negative, entity-bearing and neutral/technical.
    texts = [
        "I absolutely love this new product! It's amazing and works perfectly.",
        "This is terrible. I hate it and want my money back. Very disappointed.",
        "The weather today is nice. John Smith called from New York at 555-123-4567.",
        "Artificial intelligence and machine learning are transforming industries worldwide."
    ]

    # Engine under demonstration, with model-backed analysis requested.
    nlp_engine = NLPEngine(use_advanced_models=True)

    # --- 1: analyse a single text and show each part of the result ---
    print("\n1. Testing single text analysis...")
    result = await nlp_engine.analyze_text(texts[0])

    print(f"Text: {result.text}")
    print(f"Sentiment: {result.sentiment}")
    print(f"Entities: {result.entities}")
    print(f"Keywords: {result.keywords[:5]}")
    print(f"Language: {result.language}")
    print(f"Readability: {result.readability_score:.2f}")

    # --- 2: analyse every sample in one batch ---
    print("\n2. Testing batch analysis...")
    batch_results = await nlp_engine.batch_analyze(texts)

    for i, result in enumerate(batch_results):
        print(f"Text {i+1} sentiment: {result.sentiment['label']} "
              f"(confidence: {result.sentiment['confidence']:.2f})")

    # --- 3: pairwise similarity of the samples ---
    print("\n3. Testing text comparison...")
    similarity = nlp_engine.compare_texts(*texts[:2])
    print(f"Similarity between positive and negative texts: {similarity}")

    similarity = nlp_engine.compare_texts(*texts[2:4])
    print(f"Similarity between factual texts: {similarity}")

    # --- 4: timing statistics gathered so far ---
    print("\n4. Processing statistics:")
    for key, value in nlp_engine.get_processing_stats().items():
        print(f"{key}: {value}")

    # --- 5: report which feature tier is active ---
    print(
        "\n5. Advanced features available"
        if TRANSFORMERS_AVAILABLE
        else "\n5. Using basic NLP features only"
    )

    print("\n=== NLP Engine Demo Complete ===")

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main()) 