#!/usr/bin/env python3
"""
Machine Learning Engine for TuskLang Python SDK
===============================================
Advanced machine learning capabilities and automated learning systems

This module provides comprehensive machine learning capabilities for the TuskLang Python SDK,
including model training, prediction, automated learning, and intelligent decision making.
"""

import numpy as np
import pandas as pd
import pickle
import json
import time
import threading
from typing import Any, Dict, List, Optional, Tuple, Union, Callable
from dataclasses import dataclass, asdict
from datetime import datetime, timedelta
from enum import Enum
import logging
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')


class ModelType(Enum):
    """Kinds of learning problem a model can be configured for.

    The string values are what the module-level helpers accept (after
    lower-casing) when converting a plain string into a member.
    """
    CLASSIFICATION = "classification"  # evaluated with accuracy / precision / recall / F1
    REGRESSION = "regression"  # evaluated with MSE / R^2
    CLUSTERING = "clustering"  # no evaluation metrics are computed in this module
    # NOTE(review): ModelFactory.create_model raises ValueError for the two
    # members below -- they are declared but no estimator is wired up for them.
    ANOMALY_DETECTION = "anomaly_detection"
    RECOMMENDATION = "recommendation"


class TrainingStatus(Enum):
    """Lifecycle states recorded on a ``TrainingResult``."""
    PENDING = "pending"  # presumably: queued but not yet picked up -- confirm against the engine
    TRAINING = "training"  # the background task has started working on the model
    COMPLETED = "completed"  # fit and hold-out evaluation finished without raising
    FAILED = "failed"  # an exception was raised; see TrainingResult.error_message

@dataclass
class ModelConfig:
    """Static description of one model: identity, estimator choice and columns.

    Instances are created by ``MachineLearningEngine.create_model`` and by
    ``AutoML.auto_train`` and are consumed by ``ModelFactory.create_model``.
    """
    # Registry key; the engine builds it from the name plus a unix timestamp.
    model_id: str
    # Human-readable label.
    name: str
    # Problem kind; selects the estimator family and the metrics computed.
    model_type: ModelType
    # Estimator name, matched case-insensitively by ModelFactory
    # (e.g. "random_forest", "logistic_regression", "linear_regression", "kmeans").
    algorithm: str
    # Keyword arguments forwarded to the estimator constructor.
    hyperparameters: Dict[str, Any]
    # DataFrame column names used as model inputs.
    features: List[str]
    # DataFrame column name holding the value to predict.
    target: str
    # Creation time; callers in this module pass datetime.now() (naive local time).
    created_at: datetime
    # Free-form version label.
    version: str = "1.0.0"


@dataclass
class TrainingResult:
    """Outcome of one training run, keyed by ``model_id`` in the engine.

    Only the metrics relevant to the model's type are populated; the others
    stay ``None``.
    """
    # Identifier of the model this result belongs to.
    model_id: str
    # Current lifecycle state of the run.
    status: TrainingStatus
    # Classification metrics on the hold-out split (precision/recall/F1 use
    # average='weighted' in the engine's training task).
    accuracy: Optional[float] = None
    precision: Optional[float] = None
    recall: Optional[float] = None
    f1_score: Optional[float] = None
    # Regression metrics on the hold-out split.
    mse: Optional[float] = None
    r2_score: Optional[float] = None
    # Wall-clock seconds spent inside the estimator's fit() call.
    training_time: Optional[float] = None
    # str(exception) when status is FAILED.
    error_message: Optional[str] = None


class MachineLearningEngine:
    """Machine learning engine for TuskLang.

    The engine keeps three pieces of state:

    * ``models`` -- registry mapping ``model_id`` to a dict with the keys
      ``"config"``, ``"model"``, ``"scaler"`` and ``"encoders"``;
    * ``training_queue`` -- FIFO list of pending training tasks;
    * ``training_results`` -- latest ``TrainingResult`` per ``model_id``.

    A daemon thread started in ``__init__`` drains ``training_queue`` in the
    background, so ``train_model`` returns before the model is fitted.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create the registries, helper components and background trainer.

        Args:
            config: Optional engine settings, stored on ``self.config``.
        """
        self.config = config or {}
        self.logger = logging.getLogger('tusklang.ml')

        # Initialize components
        self.models = {}
        self.training_queue = []
        self.training_results = {}
        self.data_cache = {}
        self.feature_encoders = {}

        # Initialize ML components
        self.model_factory = ModelFactory()
        self.data_processor = DataProcessor()
        self.auto_ml = AutoML()
        # AutoML only registers its winning model when it holds a reference to
        # the engine; without this the id returned by auto_train() is unknown
        # to predict()/save_model().
        self.auto_ml.ml_engine = self

        # Start background training
        self.training_active = True
        self.training_thread = threading.Thread(target=self._training_loop, daemon=True)
        self.training_thread.start()

    def _require_model(self, model_id: str) -> Dict[str, Any]:
        """Return the registry entry for ``model_id`` or raise ``ValueError``."""
        try:
            return self.models[model_id]
        except KeyError:
            raise ValueError(f"Model {model_id} not found") from None

    def create_model(self, name: str, model_type: ModelType, algorithm: str,
                    features: List[str], target: str,
                    hyperparameters: Optional[Dict[str, Any]] = None) -> str:
        """Create and register a new, untrained model.

        Args:
            name: Human-readable model name; also the prefix of the id.
            model_type: Problem kind.
            algorithm: Estimator name understood by ``ModelFactory``.
            features: Input column names.
            target: Target column name.
            hyperparameters: Keyword arguments for the estimator constructor.

        Returns:
            The id under which the model is registered.

        Raises:
            ValueError: If ``ModelFactory`` does not support ``model_type``.
        """
        base_id = f"{name}_{int(time.time())}"
        model_id = base_id
        # Two models created with the same name inside one second would share
        # an id and the second would silently replace the first.
        suffix = 1
        while model_id in self.models:
            model_id = f"{base_id}_{suffix}"
            suffix += 1

        config = ModelConfig(
            model_id=model_id,
            name=name,
            model_type=model_type,
            algorithm=algorithm,
            hyperparameters=hyperparameters or {},
            features=features,
            target=target,
            created_at=datetime.now()
        )

        # Create model instance
        model = self.model_factory.create_model(config)
        self.models[model_id] = {
            "config": config,
            "model": model,
            "scaler": None,
            "encoders": {}
        }

        self.logger.info(f"Created model: {model_id}")
        return model_id

    def train_model(self, model_id: str, data: pd.DataFrame,
                   test_size: float = 0.2) -> str:
        """Queue a model for background training.

        Training happens asynchronously; inspect ``training_results[model_id]``
        (or ``get_model_info``) to find out when it has finished.

        Args:
            model_id: Id of a registered model.
            data: Frame containing the feature and target columns.
            test_size: Fraction of rows held out for evaluation.

        Returns:
            ``model_id``, unchanged.

        Raises:
            ValueError: If ``model_id`` is not registered.
        """
        self._require_model(model_id)

        training_task = {
            "model_id": model_id,
            "data": data,
            "test_size": test_size,
            "timestamp": datetime.now()
        }

        # Record the pending state *before* enqueueing so the worker cannot
        # overwrite a newer TRAINING status with PENDING, and so a stale
        # result from an earlier run is not mistaken for this one.
        self.training_results[model_id] = TrainingResult(
            model_id=model_id,
            status=TrainingStatus.PENDING
        )
        self.training_queue.append(training_task)
        self.logger.info(f"Queued model {model_id} for training")

        return model_id

    def predict(self, model_id: str, data: pd.DataFrame) -> np.ndarray:
        """Predict with a registered model.

        Args:
            model_id: Id of a registered model.
            data: Frame containing at least the model's feature columns.

        Returns:
            Whatever the estimator's ``predict`` returns.

        Raises:
            ValueError: If ``model_id`` is not registered.
        """
        model_info = self._require_model(model_id)
        config = model_info["config"]

        processed_data = self.data_processor.preprocess_data(
            data, config.features, model_info["scaler"], model_info["encoders"]
        )
        return model_info["model"].predict(processed_data)

    def predict_proba(self, model_id: str, data: pd.DataFrame) -> np.ndarray:
        """Return class probabilities from a classification model.

        Raises:
            ValueError: If ``model_id`` is not registered, the model is not a
                classifier, or the estimator has no ``predict_proba``.
        """
        model_info = self._require_model(model_id)
        model = model_info["model"]
        config = model_info["config"]

        if config.model_type != ModelType.CLASSIFICATION:
            raise ValueError("Probability predictions only available for classification models")

        processed_data = self.data_processor.preprocess_data(
            data, config.features, model_info["scaler"], model_info["encoders"]
        )

        if not hasattr(model, 'predict_proba'):
            raise ValueError("Model does not support probability predictions")
        return model.predict_proba(processed_data)

    def evaluate_model(self, model_id: str, test_data: pd.DataFrame) -> Dict[str, float]:
        """Compute metrics for a registered model on labelled data.

        Args:
            model_id: Id of a registered model.
            test_data: Frame containing the feature columns and the target.

        Returns:
            ``accuracy``/``precision``/``recall``/``f1_score`` for classifiers,
            ``mse``/``rmse``/``r2_score`` for regressors, and an empty dict
            for any other model type.

        Raises:
            ValueError: If ``model_id`` is not registered.
        """
        model_info = self._require_model(model_id)
        config = model_info["config"]

        X_test = test_data[config.features]
        y_test = test_data[config.target]

        X_test_processed = self.data_processor.preprocess_data(
            X_test, config.features, model_info["scaler"], model_info["encoders"]
        )
        y_pred = model_info["model"].predict(X_test_processed)

        metrics = {}
        if config.model_type == ModelType.CLASSIFICATION:
            metrics["accuracy"] = accuracy_score(y_test, y_pred)
            metrics["precision"] = precision_score(y_test, y_pred, average='weighted')
            metrics["recall"] = recall_score(y_test, y_pred, average='weighted')
            metrics["f1_score"] = f1_score(y_test, y_pred, average='weighted')
        elif config.model_type == ModelType.REGRESSION:
            metrics["mse"] = mean_squared_error(y_test, y_pred)
            metrics["rmse"] = np.sqrt(metrics["mse"])
            metrics["r2_score"] = r2_score(y_test, y_pred)

        return metrics

    def auto_train(self, data: pd.DataFrame, target: str,
                   model_type: ModelType = ModelType.CLASSIFICATION,
                   max_models: int = 5) -> str:
        """Delegate to ``AutoML.auto_train`` and return the best model's id."""
        return self.auto_ml.auto_train(data, target, model_type, max_models)

    def save_model(self, model_id: str, filepath: str) -> bool:
        """Persist a model, its config and its preprocessors with joblib.

        Returns:
            ``True`` on success; ``False`` if the model is unknown or writing
            failed (the failure is logged).
        """
        if model_id not in self.models:
            return False

        try:
            model_info = self.models[model_id]
            config = model_info["config"]
            config_dict = asdict(config)
            # asdict() keeps the enum member and the datetime object as-is;
            # store plain strings, which is the format load_model() parses.
            config_dict["model_type"] = config.model_type.value
            config_dict["created_at"] = config.created_at.isoformat()

            model_data = {
                "config": config_dict,
                "model": model_info["model"],
                "scaler": model_info["scaler"],
                "encoders": model_info["encoders"]
            }

            joblib.dump(model_data, filepath)
            self.logger.info(f"Saved model {model_id} to {filepath}")
            return True

        except Exception as e:
            self.logger.error(f"Failed to save model {model_id}: {e}")
            return False

    def load_model(self, filepath: str) -> str:
        """Load a model written by ``save_model`` and register it.

        Returns:
            The id of the loaded model (taken from the stored config).

        Raises:
            Exception: Any error from reading or reconstructing the file is
                logged and re-raised.
        """
        try:
            # SECURITY: joblib.load unpickles arbitrary objects and can run
            # code embedded in the file -- only load files from trusted sources.
            model_data = joblib.load(filepath)

            config_dict = model_data["config"]

            # Files written before save_model() serialised the timestamp hold
            # a datetime object here; fromisoformat() only accepts strings.
            created_at = config_dict["created_at"]
            if not isinstance(created_at, datetime):
                created_at = datetime.fromisoformat(created_at)

            config = ModelConfig(
                model_id=config_dict["model_id"],
                name=config_dict["name"],
                # Enum lookup accepts both the stored value and a member.
                model_type=ModelType(config_dict["model_type"]),
                algorithm=config_dict["algorithm"],
                hyperparameters=config_dict["hyperparameters"],
                features=config_dict["features"],
                target=config_dict["target"],
                created_at=created_at,
                version=config_dict.get("version", "1.0.0")
            )

            model_id = config.model_id
            self.models[model_id] = {
                "config": config,
                "model": model_data["model"],
                "scaler": model_data["scaler"],
                "encoders": model_data["encoders"]
            }

            self.logger.info(f"Loaded model {model_id} from {filepath}")
            return model_id

        except Exception as e:
            self.logger.error(f"Failed to load model from {filepath}: {e}")
            raise

    def get_model_info(self, model_id: str) -> Optional[Dict[str, Any]]:
        """Return a JSON-friendly summary of a model, or ``None`` if unknown.

        ``training_status`` is the ``TrainingStatus`` value of the latest
        training result, or ``"unknown"`` when no result has been recorded.
        """
        if model_id not in self.models:
            return None

        config = self.models[model_id]["config"]

        # training_results holds TrainingResult dataclasses, not dicts.
        result = self.training_results.get(model_id)
        status = result.status.value if result is not None else "unknown"

        return {
            "model_id": model_id,
            "name": config.name,
            "type": config.model_type.value,
            "algorithm": config.algorithm,
            "features": config.features,
            "target": config.target,
            "created_at": config.created_at.isoformat(),
            "version": config.version,
            "training_status": status
        }

    def list_models(self) -> List[Dict[str, Any]]:
        """Return ``get_model_info`` for every registered model."""
        return [self.get_model_info(model_id) for model_id in list(self.models)]

    def delete_model(self, model_id: str) -> bool:
        """Remove a model and its training result.

        Returns:
            ``True`` if the model existed, ``False`` otherwise.
        """
        if model_id not in self.models:
            return False

        del self.models[model_id]
        self.training_results.pop(model_id, None)
        self.logger.info(f"Deleted model {model_id}")
        return True

    def _training_loop(self):
        """Drain ``training_queue`` until ``training_active`` is cleared.

        Sleeps one second when the queue is empty and five seconds after an
        unexpected error so a persistent failure cannot spin the CPU.
        """
        while self.training_active:
            try:
                if self.training_queue:
                    task = self.training_queue.pop(0)
                    self._train_model_task(task)
                else:
                    time.sleep(1)

            except Exception as e:
                self.logger.error(f"Training loop error: {e}")
                time.sleep(5)

    def _train_model_task(self, task: Dict[str, Any]):
        """Fit one queued model, evaluate it on a hold-out split, record result.

        Any exception is captured into a ``FAILED`` ``TrainingResult`` rather
        than propagated, so one bad task cannot stop the worker thread.
        """
        model_id = task["model_id"]
        data = task["data"]
        test_size = task["test_size"]

        try:
            self.training_results[model_id] = TrainingResult(
                model_id=model_id,
                status=TrainingStatus.TRAINING
            )

            model_info = self.models[model_id]
            config = model_info["config"]

            X = data[config.features]
            y = data[config.target]

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=42
            )

            X_train_processed, scaler, encoders = self.data_processor.fit_preprocess(
                X_train, config.features
            )

            start_time = time.time()
            model_info["model"].fit(X_train_processed, y_train)
            training_time = time.time() - start_time

            # Publish the preprocessors only once fit() has succeeded, so a
            # failed run does not pair new preprocessors with the old model.
            model_info["scaler"] = scaler
            model_info["encoders"] = encoders

            X_test_processed = self.data_processor.preprocess_data(
                X_test, config.features, scaler, encoders
            )
            y_pred = model_info["model"].predict(X_test_processed)

            if config.model_type == ModelType.CLASSIFICATION:
                result = TrainingResult(
                    model_id=model_id,
                    status=TrainingStatus.COMPLETED,
                    accuracy=accuracy_score(y_test, y_pred),
                    precision=precision_score(y_test, y_pred, average='weighted'),
                    recall=recall_score(y_test, y_pred, average='weighted'),
                    f1_score=f1_score(y_test, y_pred, average='weighted'),
                    training_time=training_time
                )
            elif config.model_type == ModelType.REGRESSION:
                result = TrainingResult(
                    model_id=model_id,
                    status=TrainingStatus.COMPLETED,
                    mse=mean_squared_error(y_test, y_pred),
                    r2_score=r2_score(y_test, y_pred),
                    training_time=training_time
                )
            else:
                result = TrainingResult(
                    model_id=model_id,
                    status=TrainingStatus.COMPLETED,
                    training_time=training_time
                )

            self.training_results[model_id] = result
            self.logger.info(f"Model {model_id} training completed")

        except Exception as e:
            self.training_results[model_id] = TrainingResult(
                model_id=model_id,
                status=TrainingStatus.FAILED,
                error_message=str(e)
            )
            self.logger.error(f"Model {model_id} training failed: {e}")


class ModelFactory:
    """Builds scikit-learn estimators from a ``ModelConfig``."""

    def create_model(self, config: ModelConfig):
        """Instantiate the estimator described by ``config``.

        A recognised algorithm name (case-insensitive) is constructed with
        ``config.hyperparameters``. An unrecognised name falls back to the
        model type's default estimator built with *no* hyperparameters.

        Raises:
            ValueError: If the model type has no estimator family.
        """
        requested = config.algorithm.lower()

        # model type -> (named estimators, fallback estimator)
        families = {
            ModelType.CLASSIFICATION: (
                {
                    "random_forest": RandomForestClassifier,
                    "logistic_regression": LogisticRegression,
                },
                RandomForestClassifier,
            ),
            ModelType.REGRESSION: (
                {
                    "random_forest": RandomForestRegressor,
                    "linear_regression": LinearRegression,
                },
                RandomForestRegressor,
            ),
            ModelType.CLUSTERING: (
                {"kmeans": KMeans},
                KMeans,
            ),
        }

        if config.model_type not in families:
            raise ValueError(f"Unsupported model type: {config.model_type}")

        named, fallback = families[config.model_type]
        estimator_cls = named.get(requested)
        if estimator_cls is None:
            return fallback()
        return estimator_cls(**config.hyperparameters)


class DataProcessor:
    """Data preprocessing: imputation, label encoding and standard scaling.

    ``fit_preprocess`` learns the imputation values, label encoders and
    scaler from training data; ``preprocess_data`` re-applies them to new
    data. The imputation values learned during fitting travel inside the
    ``encoders`` dict under ``FILL_VALUES_KEY``, so they are saved and passed
    around together with the encoders.
    """

    # Reserved key in the ``encoders`` dict holding {feature: fill value}.
    FILL_VALUES_KEY = "__fill_values__"
    # Stand-in category for a categorical column with no observed values.
    _MISSING_CATEGORY = "__missing__"

    def __init__(self):
        self.logger = logging.getLogger('tusklang.ml.data_processor')

    @staticmethod
    def _is_categorical(series: pd.Series) -> bool:
        """Return True for object, string and categorical columns."""
        dtype = series.dtype
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    def _fill_value(self, series: pd.Series) -> Any:
        """Compute the imputation value for a column (mode or mean)."""
        if self._is_categorical(series):
            modes = series.mode()
            # mode() is empty when every value is missing.
            return modes.iloc[0] if not modes.empty else self._MISSING_CATEGORY
        mean = series.mean()
        # An all-missing numeric column has a NaN mean, which fills nothing.
        return 0.0 if pd.isna(mean) else mean

    @staticmethod
    def _fill_missing(series: pd.Series, value: Any) -> pd.Series:
        """Replace missing entries of ``series`` with ``value``."""
        if not series.isna().any():
            return series
        # A categorical column rejects fill values outside its categories.
        if isinstance(series.dtype, pd.CategoricalDtype) and value not in series.cat.categories:
            series = series.astype(object)
        return series.fillna(value)

    def fit_preprocess(self, data: pd.DataFrame, features: List[str]) -> Tuple[pd.DataFrame, StandardScaler, Dict]:
        """Fit preprocessors on ``data`` and return the transformed frame.

        Args:
            data: Training frame containing every column in ``features``.
            features: Columns to impute, encode and scale.

        Returns:
            ``(frame, scaler, encoders)``: the scaled features as a new frame,
            the fitted ``StandardScaler``, and a dict mapping each categorical
            feature to its fitted ``LabelEncoder`` plus the imputation values
            under ``FILL_VALUES_KEY``.
        """
        data_clean = data.copy()
        fill_values = {}
        encoders = {}

        for feature in features:
            column = data_clean[feature]
            fill_values[feature] = self._fill_value(column)
            column = self._fill_missing(column, fill_values[feature])

            if self._is_categorical(column):
                encoder = LabelEncoder()
                column = encoder.fit_transform(column)
                encoders[feature] = encoder

            data_clean[feature] = column

        encoders[self.FILL_VALUES_KEY] = fill_values

        scaler = StandardScaler()
        data_scaled = scaler.fit_transform(data_clean[features])

        return pd.DataFrame(data_scaled, columns=features), scaler, encoders

    def preprocess_data(self, data: pd.DataFrame, features: List[str],
                       scaler: StandardScaler, encoders: Dict) -> pd.DataFrame:
        """Apply previously fitted preprocessors to ``data``.

        Missing values are filled with the values learned during fitting when
        ``encoders`` carries them. Otherwise (an unfitted model, or encoders
        produced without stored fill values) the mean/mode of ``data`` itself
        is used.

        Args:
            data: Frame containing every column in ``features``.
            features: Columns to impute, encode and scale.
            scaler: Fitted scaler, or ``None`` to skip scaling.
            encoders: Dict returned by ``fit_preprocess`` (may be empty or
                ``None``).

        Returns:
            The processed feature columns as a frame.
        """
        encoders = encoders or {}
        fill_values = encoders.get(self.FILL_VALUES_KEY, {})

        data_clean = data.copy()
        for feature in features:
            column = data_clean[feature]
            if feature in fill_values:
                fill = fill_values[feature]
            else:
                fill = self._fill_value(column)
            column = self._fill_missing(column, fill)

            encoder = encoders.get(feature)
            if encoder is not None:
                column = encoder.transform(column)

            data_clean[feature] = column

        if scaler is not None:
            data_scaled = scaler.transform(data_clean[features])
            return pd.DataFrame(data_scaled, columns=features)
        return data_clean[features]


class AutoML:
    """Automated model selection over a small fixed set of candidates."""

    def __init__(self):
        self.logger = logging.getLogger('tusklang.ml.automl')
        # Engine that receives the winning model; stays None unless the owner
        # assigns it, in which case the winner is returned but not registered.
        self.ml_engine = None

    def auto_train(self, data: pd.DataFrame, target: str,
                   model_type: ModelType = ModelType.CLASSIFICATION,
                   max_models: int = 5) -> str:
        """Train candidate models and keep the one scoring best on a hold-out.

        Every column other than ``target`` is used as a feature. Candidates
        are scored on a fixed 80/20 split: accuracy for classification, R^2
        for regression, and a constant placeholder for anything else.

        Args:
            data: Frame containing the features and the target column.
            target: Name of the target column.
            model_type: Problem kind; selects the candidate list.
            max_models: Upper bound on candidates tried (the built-in lists
                hold at most three).

        Returns:
            The id of the best model. It is registered in ``ml_engine.models``
            when an engine is attached.

        Raises:
            ValueError: If no candidate could be trained.
        """
        if model_type == ModelType.CLASSIFICATION:
            algorithms = [
                ("random_forest", {"n_estimators": 100, "random_state": 42}),
                ("logistic_regression", {"random_state": 42}),
                ("random_forest", {"n_estimators": 200, "max_depth": 10, "random_state": 42})
            ]
        elif model_type == ModelType.REGRESSION:
            algorithms = [
                ("random_forest", {"n_estimators": 100, "random_state": 42}),
                ("linear_regression", {}),
                ("random_forest", {"n_estimators": 200, "max_depth": 10, "random_state": 42})
            ]
        else:
            algorithms = [("kmeans", {"n_clusters": 3, "random_state": 42})]

        features = [col for col in data.columns if col != target]

        # The split and the preprocessing do not depend on the candidate, so
        # do them once. A failure here means no candidate can be trained.
        try:
            X_train, X_test, y_train, y_test = train_test_split(
                data[features], data[target], test_size=0.2, random_state=42
            )
            processor = DataProcessor()
            X_train_processed, scaler, encoders = processor.fit_preprocess(X_train, features)
            X_test_processed = processor.preprocess_data(X_test, features, scaler, encoders)
        except Exception as e:
            self.logger.error(f"AutoML data preparation failed: {e}")
            raise ValueError("AutoML failed to train any models") from e

        # (score, model_id, config, model) of the best candidate so far.
        # Starting from None rather than -1 matters for regression, where
        # R^2 has no lower bound.
        best = None

        for i, (algorithm, hyperparams) in enumerate(algorithms[:max_models]):
            try:
                model_id = f"auto_model_{i}_{int(time.time())}"

                config = ModelConfig(
                    model_id=model_id,
                    name=f"Auto {algorithm}",
                    model_type=model_type,
                    algorithm=algorithm,
                    hyperparameters=hyperparams,
                    features=features,
                    target=target,
                    created_at=datetime.now()
                )

                model = ModelFactory().create_model(config)
                model.fit(X_train_processed, y_train)
                y_pred = model.predict(X_test_processed)

                if model_type == ModelType.CLASSIFICATION:
                    score = accuracy_score(y_test, y_pred)
                elif model_type == ModelType.REGRESSION:
                    score = r2_score(y_test, y_pred)
                else:
                    score = 0.5  # Placeholder for clustering

            except Exception as e:
                self.logger.error(f"AutoML training failed for {algorithm}: {e}")
                continue

            if best is None or score > best[0]:
                best = (score, model_id, config, model)

        if best is None:
            raise ValueError("AutoML failed to train any models")

        best_score, best_model_id, best_config, best_model = best

        # Register only the final winner, so candidates that were briefly in
        # the lead do not pile up in the engine.
        if self.ml_engine:
            self.ml_engine.models[best_model_id] = {
                "config": best_config,
                "model": best_model,
                "scaler": scaler,
                "encoders": encoders
            }

        self.logger.info(f"AutoML completed. Best model: {best_model_id} (score: {best_score:.4f})")
        return best_model_id


# Global ML engine instance shared by the module-level helper functions below.
# NOTE: constructing it here starts the engine's daemon training thread as a
# side effect of importing this module.
ml_engine = MachineLearningEngine()


def create_ml_model(name: str, model_type: str, algorithm: str,
                   features: List[str], target: str,
                   hyperparameters: Dict[str, Any] = None) -> str:
    """Register a new model on the global engine and return its id.

    ``model_type`` is lower-cased and must then equal one of the ``ModelType``
    values; any other string raises ``ValueError`` from the enum lookup.
    """
    return ml_engine.create_model(
        name,
        ModelType(model_type.lower()),
        algorithm,
        features,
        target,
        hyperparameters,
    )


def train_ml_model(model_id: str, data: pd.DataFrame, test_size: float = 0.2) -> str:
    """Queue ``model_id`` for training on the global engine; return the id."""
    queued_id = ml_engine.train_model(model_id, data, test_size)
    return queued_id


def predict_ml(model_id: str, data: pd.DataFrame) -> np.ndarray:
    """Return predictions for ``data`` from a model on the global engine."""
    predictions = ml_engine.predict(model_id, data)
    return predictions


def predict_proba_ml(model_id: str, data: pd.DataFrame) -> np.ndarray:
    """Return class probabilities for ``data`` from a classifier on the global engine."""
    probabilities = ml_engine.predict_proba(model_id, data)
    return probabilities


def evaluate_ml_model(model_id: str, test_data: pd.DataFrame) -> Dict[str, float]:
    """Return the global engine's metrics for ``model_id`` on labelled ``test_data``."""
    metrics = ml_engine.evaluate_model(model_id, test_data)
    return metrics


def auto_train_ml(data: pd.DataFrame, target: str,
                  model_type: str = "classification", max_models: int = 5) -> str:
    """Run AutoML on the global engine and return the best model's id.

    ``model_type`` is lower-cased and must then equal one of the ``ModelType``
    values; any other string raises ``ValueError`` from the enum lookup.
    """
    return ml_engine.auto_train(
        data,
        target,
        ModelType(model_type.lower()),
        max_models,
    )


def save_ml_model(model_id: str, filepath: str) -> bool:
    """Write a model from the global engine to ``filepath``; return success."""
    saved = ml_engine.save_model(model_id, filepath)
    return saved


def load_ml_model(filepath: str) -> str:
    """Load a model file into the global engine and return the model's id."""
    loaded_id = ml_engine.load_model(filepath)
    return loaded_id


def get_ml_model_info(model_id: str) -> Optional[Dict[str, Any]]:
    """Return the global engine's summary of ``model_id`` (``None`` if unknown)."""
    info = ml_engine.get_model_info(model_id)
    return info


def list_ml_models() -> List[Dict[str, Any]]:
    """Return a summary dict for every model on the global engine."""
    summaries = ml_engine.list_models()
    return summaries


def delete_ml_model(model_id: str) -> bool:
    """Remove ``model_id`` from the global engine; return whether it existed."""
    removed = ml_engine.delete_model(model_id)
    return removed


if __name__ == "__main__":
    def _wait_for_training(model_id: str, timeout: float = 60.0,
                           poll_interval: float = 0.1) -> TrainingResult:
        """Block until the background trainer has finished ``model_id``.

        Training runs on the engine's worker thread, so a fixed sleep either
        wastes time or is too short on a slow machine. Poll the recorded
        status instead.

        Raises:
            RuntimeError: If training finished with status FAILED.
            TimeoutError: If no final status appears within ``timeout`` seconds.
        """
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            result = ml_engine.training_results.get(model_id)
            if result is not None:
                if result.status == TrainingStatus.COMPLETED:
                    return result
                if result.status == TrainingStatus.FAILED:
                    raise RuntimeError(
                        f"Training of {model_id} failed: {result.error_message}"
                    )
            time.sleep(poll_interval)
        raise TimeoutError(f"Training of {model_id} did not finish within {timeout}s")

    print("Machine Learning Engine for TuskLang Python SDK")
    print("=" * 50)

    # Create sample data
    print("\n1. Creating Sample Data:")
    np.random.seed(42)
    n_samples = 1000

    # Classification data: the label depends on the first two features only
    X_class = np.random.randn(n_samples, 4)
    y_class = (X_class[:, 0] + X_class[:, 1] > 0).astype(int)

    class_data = pd.DataFrame(X_class, columns=['feature1', 'feature2', 'feature3', 'feature4'])
    class_data['target'] = y_class

    print(f"  Classification data shape: {class_data.shape}")
    print(f"  Target distribution: {class_data['target'].value_counts().to_dict()}")

    # Regression data: linear signal plus small Gaussian noise
    X_reg = np.random.randn(n_samples, 3)
    y_reg = 2 * X_reg[:, 0] + 1.5 * X_reg[:, 1] - 0.5 * X_reg[:, 2] + np.random.normal(0, 0.1, n_samples)

    reg_data = pd.DataFrame(X_reg, columns=['feature1', 'feature2', 'feature3'])
    reg_data['target'] = y_reg

    print(f"  Regression data shape: {reg_data.shape}")

    # Test classification model
    print("\n2. Testing Classification Model:")
    model_id = create_ml_model("test_classifier", "classification", "random_forest",
                              ['feature1', 'feature2', 'feature3', 'feature4'], 'target')

    train_ml_model(model_id, class_data)
    _wait_for_training(model_id)

    # Make predictions
    test_data = class_data.head(10)
    predictions = predict_ml(model_id, test_data)
    print(f"  Predictions: {predictions}")

    # Test regression model
    print("\n3. Testing Regression Model:")
    reg_model_id = create_ml_model("test_regressor", "regression", "random_forest",
                                  ['feature1', 'feature2', 'feature3'], 'target')

    train_ml_model(reg_model_id, reg_data)
    _wait_for_training(reg_model_id)

    # Make predictions
    reg_test_data = reg_data.head(10)
    reg_predictions = predict_ml(reg_model_id, reg_test_data)
    print(f"  Predictions: {reg_predictions[:5]}...")

    # Test AutoML
    print("\n4. Testing AutoML:")
    auto_model_id = auto_train_ml(class_data, 'target', 'classification', max_models=2)
    print(f"  AutoML model ID: {auto_model_id}")

    # List models
    print("\n5. Model Information:")
    models = list_ml_models()
    for model in models:
        print(f"  - {model['name']}: {model['type']} ({model['algorithm']})")

    print("\nMachine learning engine testing completed!")