"""
Advanced Machine Learning Engine with AutoML Pipeline
Supports multiple algorithms, feature engineering, model selection, and hyperparameter tuning.
"""

import asyncio
import concurrent.futures
import json
import logging
import pickle
import warnings
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier,
    RandomForestClassifier, VotingClassifier
)
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    mean_absolute_error, mean_squared_error, r2_score
)
from sklearn.model_selection import (
    GridSearchCV, RandomizedSearchCV, cross_val_score, train_test_split
)
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import lightgbm as lgb

# NOTE(review): this suppresses every warning category process-wide as an
# import-time side effect, not only the warnings raised by this module.
warnings.filterwarnings('ignore')
# Configure the root logger so the INFO progress messages emitted below are shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the classes and functions in this file.
logger = logging.getLogger(__name__)

@dataclass
class ModelConfig:
    """Configuration for machine learning models.

    Pairs an estimator instance with the hyperparameter candidates that are
    searched when the estimator is tuned.
    """
    # Display name of the model; used in training log messages.
    name: str
    # The estimator object itself (typed as Any, so no interface is enforced here).
    model: Any
    # Hyperparameter name -> list of candidate values to search over.
    param_grid: Dict[str, List]
    # Marks the model as an ensemble; not read by the code visible in this file.
    is_ensemble: bool = False
    # Marks whether the model can produce probabilities; not read by the code
    # visible in this file.
    supports_probability: bool = True

@dataclass
class AutoMLResult:
    """Result from AutoML pipeline.

    Returned by ``MachineLearningEngine.automl_classification`` to describe the
    winning model and how it was evaluated.
    """
    # The fitted estimator that achieved the best mean cross-validation score.
    best_model: Any
    # Mean cross-validation score of ``best_model``.
    best_score: float
    # Hyperparameters selected by the optimizer (empty dict when tuning is off).
    best_params: Dict[str, Any]
    # Per-fold cross-validation scores. NOTE(review): the pipeline in this file
    # stores the array returned by ``cross_val_score`` here, not a plain list.
    cv_scores: List[float]
    # Feature name -> importance; only set when the model exposes
    # ``feature_importances_``.
    feature_importance: Optional[Dict[str, float]] = None
    # Hold-out metrics (accuracy, classification report, confusion matrix).
    evaluation_metrics: Optional[Dict[str, Any]] = None
    # Sum of the training times, in seconds, of every model that trained successfully.
    training_time: Optional[float] = None

class FeatureEngineer:
    """Advanced feature engineering and preprocessing.

    Provides polynomial, pairwise-interaction, rolling-window and categorical
    encoding transforms, plus :meth:`engineer_features`, which chains them.

    Attributes:
        transformers: Fitted ``LabelEncoder`` objects keyed by
            ``"<column>_label_encoder"``.
        feature_names: Column names produced by the most recent
            :meth:`engineer_features` call.
        target_encoder: Initialised to ``None``; no method of this class
            assigns it.
    """

    def __init__(self):
        self.transformers = {}
        self.feature_names = []
        self.target_encoder = None

    @staticmethod
    def _append_columns(df: pd.DataFrame,
                        new_columns: Dict[str, pd.Series]) -> pd.DataFrame:
        """Return a copy of ``df`` with ``new_columns`` added in one concat.

        Adding all columns at once avoids the DataFrame fragmentation caused
        by inserting them one by one. A generated column whose name already
        exists in ``df`` overwrites that column in place.
        """
        result = df.copy()
        if not new_columns:
            return result

        generated = pd.concat(new_columns, axis=1)
        clashes = [name for name in generated.columns if name in result.columns]
        for name in clashes:
            result[name] = generated[name]
        if clashes:
            generated = generated.drop(columns=clashes)
        return pd.concat([result, generated], axis=1)

    @staticmethod
    def _impute_numeric_gaps(df: pd.DataFrame) -> pd.DataFrame:
        """Mean-fill missing values in numeric columns.

        Returns ``df`` itself when no numeric column has missing values,
        otherwise a filled copy. Columns that are entirely missing have no
        mean and are left untouched.
        """
        numeric_columns = df.select_dtypes(include=[np.number]).columns
        gappy = [col for col in numeric_columns if df[col].isna().any()]
        if not gappy:
            return df

        filled = df.copy()
        filled[gappy] = filled[gappy].fillna(filled[gappy].mean())
        return filled

    def create_polynomial_features(self, df: pd.DataFrame, degree: int = 2,
                                   include_bias: bool = False) -> pd.DataFrame:
        """Create polynomial features.

        Args:
            df: Input frame. Numeric columns are expanded; they must not
                contain NaN because ``PolynomialFeatures`` rejects it.
            degree: Maximum polynomial degree.
            include_bias: Whether to add the constant column.

        Returns:
            The polynomial columns followed by the untouched non-numeric
            columns. ``df`` itself is returned when it has no numeric columns.
        """
        numeric_columns = df.select_dtypes(include=[np.number]).columns
        if len(numeric_columns) == 0:
            return df

        # Imported only once there is work to do, so the no-op path does not
        # need scikit-learn.
        from sklearn.preprocessing import PolynomialFeatures

        poly = PolynomialFeatures(degree=degree, include_bias=include_bias)
        poly_features = poly.fit_transform(df[numeric_columns])

        feature_names = poly.get_feature_names_out(numeric_columns)
        poly_df = pd.DataFrame(poly_features, columns=feature_names, index=df.index)

        # Combine with non-numeric columns
        non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns
        if len(non_numeric_columns) == 0:
            return poly_df
        return pd.concat([poly_df, df[non_numeric_columns]], axis=1)

    def create_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create interaction features between numeric columns.

        For every unordered pair of numeric columns ``(a, b)`` this adds
        ``a_x_b`` (product) and ``a_div_b`` (``a / (b + 1e-8)``).

        Returns:
            A new frame with the extra columns, or ``df`` itself when it has
            fewer than two numeric columns.
        """
        numeric_columns = df.select_dtypes(include=[np.number]).columns

        if len(numeric_columns) < 2:
            return df

        new_columns = {}
        for i, col1 in enumerate(numeric_columns):
            for col2 in numeric_columns[i+1:]:
                new_columns[f"{col1}_x_{col2}"] = df[col1] * df[col2]
                # The small epsilon keeps a zero denominator from producing inf.
                new_columns[f"{col1}_div_{col2}"] = df[col1] / (df[col2] + 1e-8)

        return self._append_columns(df, new_columns)

    def create_statistical_features(self, df: pd.DataFrame, window: int = 5) -> pd.DataFrame:
        """Create statistical features like rolling means, std, etc.

        Adds ``<col>_rolling_mean/std/min/max`` for every numeric column,
        computed over ``window`` consecutive rows in the frame's current row
        order. The first ``rolling_std`` value is NaN because a single
        observation has no sample standard deviation.

        Returns:
            A new frame with the extra columns, or ``df`` itself when it has
            no numeric columns.
        """
        numeric_columns = df.select_dtypes(include=[np.number]).columns

        if len(numeric_columns) == 0:
            return df

        new_columns = {}
        for col in numeric_columns:
            rolling = df[col].rolling(window=window, min_periods=1)
            new_columns[f"{col}_rolling_mean"] = rolling.mean()
            new_columns[f"{col}_rolling_std"] = rolling.std()
            new_columns[f"{col}_rolling_min"] = rolling.min()
            new_columns[f"{col}_rolling_max"] = rolling.max()

        return self._append_columns(df, new_columns)

    def encode_categorical_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode categorical features using multiple strategies.

        For each ``object``/``category`` column:

        * more than 10 distinct values: adds ``<col>_target_encoded``. Despite
          the name this is a frequency encoding (each value is replaced by
          its row count); no target variable is involved.
        * 10 or fewer distinct values: adds one-hot columns, including a NaN
          indicator.
        * always: adds ``<col>_label_encoded`` and stores the fitted encoder
          in ``self.transformers``.

        The original categorical columns are dropped.

        Returns:
            The encoded frame, or ``df`` itself when it has no categorical
            columns.
        """
        categorical_columns = df.select_dtypes(include=['object', 'category']).columns

        if len(categorical_columns) == 0:
            return df

        encoded_df = df.copy()

        for col in categorical_columns:
            cardinality = df[col].nunique()

            if cardinality > 10:
                counts = df[col].value_counts().to_dict()
                # Mapping through object dtype keeps the result numeric even
                # when the source column has category dtype.
                encoded_df[f"{col}_target_encoded"] = df[col].astype(object).map(counts)
            else:
                dummies = pd.get_dummies(df[col], prefix=col, dummy_na=True)
                encoded_df = pd.concat([encoded_df, dummies], axis=1)

            # Label encoding (missing values become the string 'nan')
            le = LabelEncoder()
            encoded_df[f"{col}_label_encoded"] = le.fit_transform(df[col].astype(str))
            self.transformers[f"{col}_label_encoder"] = le

        # Drop original categorical columns
        return encoded_df.drop(columns=categorical_columns)

    def engineer_features(self, df: pd.DataFrame,
                         include_polynomial: bool = True,
                         include_interactions: bool = True,
                         include_statistical: bool = True) -> pd.DataFrame:
        """Comprehensive feature engineering pipeline.

        Runs categorical encoding, then optionally polynomial expansion,
        pairwise interactions and rolling statistics, and finally mean-fills
        remaining missing numeric values. The resulting column names are
        stored in ``self.feature_names``.

        NOTE(review): encodings and rolling statistics are computed over every
        row passed in; applying this before a train/test split lets test rows
        influence training features.

        Args:
            df: Raw input features; not modified.
            include_polynomial: Add degree-2 polynomial features.
            include_interactions: Add pairwise product/ratio features.
            include_statistical: Add rolling statistics (only when ``df`` has
                more than 10 rows).

        Returns:
            The engineered feature frame.
        """
        logger.info("Starting feature engineering...")

        # Work on a copy so the caller's frame is never mutated.
        engineered_df = self.encode_categorical_features(df.copy())

        if include_polynomial:
            # PolynomialFeatures raises on NaN, so numeric gaps must be filled
            # before the expansion rather than at the end of the pipeline.
            engineered_df = self._impute_numeric_gaps(engineered_df)
            engineered_df = self.create_polynomial_features(engineered_df, degree=2)

        if include_interactions:
            engineered_df = self.create_interaction_features(engineered_df)

        if include_statistical and len(df) > 10:
            engineered_df = self.create_statistical_features(engineered_df, window=min(5, len(df)//2))

        # Handle missing values; numeric_only avoids a TypeError when a
        # non-numeric column (e.g. string dtype) survived encoding.
        engineered_df = engineered_df.fillna(engineered_df.mean(numeric_only=True))

        self.feature_names = list(engineered_df.columns)

        logger.info(f"Feature engineering complete. Original features: {len(df.columns)}, "
                   f"Engineered features: {len(engineered_df.columns)}")

        return engineered_df

class HyperparameterOptimizer:
    """Advanced hyperparameter optimization with multiple search strategies.

    Supports Bayesian search (Optuna, when installed), randomized search and
    exhaustive grid search. Every :meth:`optimize` call is recorded in
    ``optimization_history``.
    """

    def __init__(self, n_jobs: int = -1, verbose: bool = True):
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.optimization_history = []

    @staticmethod
    def _infer_search_space(values) -> Tuple[str, Any, Any]:
        """Classify a candidate list as an int range, float range or categorical.

        Returns:
            ``('int', low, high)`` when every candidate is an integer,
            ``('float', low, high)`` when every candidate is a real number, or
            ``('categorical', candidates, None)`` otherwise. Booleans are
            always categorical, and bounds come from min/max so the list does
            not need to be sorted.

        Raises:
            ValueError: If ``values`` is empty.
        """
        candidates = list(values)
        if not candidates:
            raise ValueError("Hyperparameter candidate list must not be empty")

        def is_integer(value) -> bool:
            # bool is a subclass of int, so it has to be excluded explicitly.
            return (isinstance(value, (int, np.integer))
                    and not isinstance(value, (bool, np.bool_)))

        def is_real(value) -> bool:
            return (isinstance(value, (int, float, np.integer, np.floating))
                    and not isinstance(value, (bool, np.bool_)))

        if all(is_integer(v) for v in candidates):
            return 'int', int(min(candidates)), int(max(candidates))
        if all(is_real(v) for v in candidates):
            return 'float', float(min(candidates)), float(max(candidates))
        return 'categorical', candidates, None

    def bayesian_optimization(self, model, param_space: Dict, X_train: np.ndarray,
                             y_train: np.ndarray, n_trials: int = 100) -> Dict:
        """Bayesian optimization using Optuna.

        Numeric candidate lists are treated as continuous ranges between their
        minimum and maximum; anything else is sampled categorically. Falls
        back to :meth:`random_search` when Optuna is not installed.

        Args:
            model: Estimator to tune; it is cloned for every trial and never
                mutated.
            param_space: Hyperparameter name -> list of candidate values.
            X_train: Training features.
            y_train: Training targets.
            n_trials: Number of Optuna trials (or random-search iterations).

        Returns:
            The best parameter dictionary found.
        """
        # Only the import is guarded: an ImportError raised while fitting a
        # model must not be mistaken for "Optuna is missing".
        try:
            import optuna
        except ImportError:
            logger.warning("Optuna not available, falling back to RandomizedSearchCV")
            return self.random_search(model, param_space, X_train, y_train, n_iter=n_trials)

        from sklearn.base import clone

        def objective(trial):
            params = {}
            for param, values in param_space.items():
                kind, low, high = self._infer_search_space(values)
                if kind == 'int':
                    params[param] = trial.suggest_int(param, low, high)
                elif kind == 'float':
                    params[param] = trial.suggest_float(param, low, high)
                else:
                    params[param] = trial.suggest_categorical(param, low)

            # Evaluate a fresh clone so the caller's estimator keeps its
            # original parameters.
            candidate = clone(model).set_params(**params)
            scores = cross_val_score(candidate, X_train, y_train, cv=5, n_jobs=self.n_jobs)
            return scores.mean()

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=n_trials)

        return study.best_params

    def random_search(self, model, param_grid: Dict, X_train: np.ndarray,
                      y_train: np.ndarray, n_iter: int = 100) -> Dict:
        """Randomized search for hyperparameter optimization.

        Runs ``RandomizedSearchCV`` with 5-fold cross-validation and returns
        its ``best_params_``.
        """
        search = RandomizedSearchCV(
            model, param_grid, n_iter=n_iter, cv=5,
            n_jobs=self.n_jobs, verbose=1 if self.verbose else 0
        )
        search.fit(X_train, y_train)
        return search.best_params_

    def grid_search(self, model, param_grid: Dict, X_train: np.ndarray,
                    y_train: np.ndarray) -> Dict:
        """Grid search for hyperparameter optimization.

        Runs ``GridSearchCV`` with 5-fold cross-validation and returns its
        ``best_params_``.
        """
        search = GridSearchCV(
            model, param_grid, cv=5,
            n_jobs=self.n_jobs, verbose=1 if self.verbose else 0
        )
        search.fit(X_train, y_train)
        return search.best_params_

    def optimize(self, model, param_grid: Dict, X_train: np.ndarray,
                 y_train: np.ndarray, method: str = 'bayesian') -> Dict:
        """Optimize hyperparameters using specified method.

        Args:
            model: Estimator to tune.
            param_grid: Hyperparameter name -> list of candidate values.
            X_train: Training features.
            y_train: Training targets.
            method: One of ``'bayesian'``, ``'random'`` or ``'grid'``.

        Returns:
            The best parameter dictionary found.

        Raises:
            ValueError: If ``method`` is not a known strategy.
        """
        strategies = {
            'bayesian': self.bayesian_optimization,
            'random': self.random_search,
            'grid': self.grid_search,
        }
        # Reject a bad method up front, before any timing or history is recorded.
        if method not in strategies:
            raise ValueError(f"Unknown optimization method: {method}")

        start_time = datetime.now()
        best_params = strategies[method](model, param_grid, X_train, y_train)
        optimization_time = (datetime.now() - start_time).total_seconds()

        self.optimization_history.append({
            'model': model.__class__.__name__,
            'method': method,
            'best_params': best_params,
            'optimization_time': optimization_time,
            'timestamp': datetime.now().isoformat()
        })

        return best_params

class MachineLearningEngine:
    """
    Comprehensive Machine Learning Engine with AutoML capabilities.

    Holds a registry of classifier configurations (``classification_models``)
    and provides preprocessing, per-model training with optional
    hyperparameter tuning, automatic model selection, persistence and
    benchmarking. The registered estimators act as templates: training and
    benchmarking fit clones, so previously returned models are never refitted.
    """

    def __init__(self, n_jobs: int = -1, random_state: int = 42):
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.feature_engineer = FeatureEngineer()
        self.hyperparameter_optimizer = HyperparameterOptimizer(n_jobs=n_jobs)
        self.models_cache = {}
        self.training_history = []

        # Define model configurations
        self.classification_models = {
            'random_forest': ModelConfig(
                name='RandomForest',
                model=RandomForestClassifier(random_state=random_state, n_jobs=n_jobs),
                param_grid={
                    'n_estimators': [50, 100, 200],
                    'max_depth': [None, 10, 20, 30],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4]
                }
            ),
            'xgboost': ModelConfig(
                name='XGBoost',
                model=xgb.XGBClassifier(random_state=random_state, n_jobs=n_jobs),
                param_grid={
                    'n_estimators': [50, 100, 200],
                    'max_depth': [3, 6, 9],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'subsample': [0.8, 0.9, 1.0]
                }
            ),
            'lightgbm': ModelConfig(
                name='LightGBM',
                model=lgb.LGBMClassifier(random_state=random_state, n_jobs=n_jobs),
                param_grid={
                    'n_estimators': [50, 100, 200],
                    'max_depth': [3, 6, 9],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'num_leaves': [31, 50, 100]
                }
            ),
            'gradient_boosting': ModelConfig(
                name='GradientBoosting',
                model=GradientBoostingClassifier(random_state=random_state),
                param_grid={
                    'n_estimators': [50, 100, 200],
                    'max_depth': [3, 6, 9],
                    'learning_rate': [0.01, 0.1, 0.2]
                }
            ),
            'svm': ModelConfig(
                name='SVM',
                model=SVC(random_state=random_state, probability=True),
                param_grid={
                    'C': [0.1, 1, 10, 100],
                    'gamma': ['scale', 'auto', 0.001, 0.01],
                    'kernel': ['rbf', 'linear', 'poly']
                }
            ),
            'logistic': ModelConfig(
                name='LogisticRegression',
                model=LogisticRegression(random_state=random_state, n_jobs=n_jobs),
                param_grid={
                    'C': [0.1, 1, 10, 100],
                    'solver': ['liblinear', 'lbfgs'],
                    'penalty': ['l2']
                }
            ),
            'neural_network': ModelConfig(
                name='NeuralNetwork',
                model=MLPClassifier(random_state=random_state, max_iter=1000),
                param_grid={
                    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
                    'alpha': [0.0001, 0.001, 0.01],
                    'learning_rate': ['constant', 'adaptive']
                }
            )
        }

    @staticmethod
    def _json_default(value: Any) -> Any:
        """Convert values ``json`` cannot encode natively for model metadata.

        NumPy scalars and arrays become plain Python values, and containers
        become lists. Anything else (for example a nested estimator inside
        ``get_params()``) is stored as its ``repr``, because the metadata
        file is informational only.
        """
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, (set, frozenset, tuple)):
            return list(value)
        return repr(value)

    async def preprocess_data(self, X: pd.DataFrame, y: Optional[pd.Series] = None,
                            target_column: Optional[str] = None,
                            test_size: float = 0.2,
                            perform_feature_engineering: bool = True) -> Tuple:
        """Preprocess data with feature engineering and train/test split.

        Args:
            X: Feature frame, optionally still containing the target column.
            y: Target values; ignored when ``target_column`` is found in ``X``.
            target_column: Name of a column in ``X`` to use as the target.
            test_size: Fraction of rows held out for testing.
            perform_feature_engineering: Run the feature-engineering pipeline
                before splitting.

        Returns:
            ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)`` where
            the feature matrices are standardized arrays and ``scaler`` is
            the ``StandardScaler`` fitted on the training split.

        Raises:
            ValueError: If no target is available.
        """
        logger.info("Starting data preprocessing...")

        # Handle target column extraction
        if target_column and target_column in X.columns:
            y = X[target_column]
            X = X.drop(columns=[target_column])

        if y is None:
            raise ValueError("Target variable y must be provided")

        # NOTE(review): feature engineering runs on the full data set before
        # the split, so engineered statistics include the test rows.
        if perform_feature_engineering:
            X = self.feature_engineer.engineer_features(X)

        # Stratified split keeps the class proportions in both partitions.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=self.random_state, stratify=y
        )

        # Fit the scaler on the training data only, then apply it to both splits.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        logger.info(f"Data preprocessing complete. Training samples: {len(X_train)}, "
                   f"Test samples: {len(X_test)}, Features: {X_train.shape[1]}")

        return X_train_scaled, X_test_scaled, y_train, y_test, scaler

    def _fit_model(self, model_config: ModelConfig, X_train: np.ndarray,
                   y_train: np.ndarray, optimize_hyperparameters: bool = True) -> Tuple:
        """Tune, cross-validate and fit a clone of ``model_config.model``.

        This is the blocking worker behind :meth:`train_single_model`; it is
        meant to run in a worker thread.
        """
        from sklearn.base import clone

        logger.info(f"Training {model_config.name}...")

        start_time = datetime.now()

        # Work on a clone: the registry entry stays an unfitted template, so
        # later runs cannot refit a model that was already handed to a caller.
        model = clone(model_config.model)

        if optimize_hyperparameters:
            best_params = self.hyperparameter_optimizer.optimize(
                model, model_config.param_grid, X_train, y_train
            )
            model.set_params(**best_params)
        else:
            best_params = {}

        # Cross-validation score
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, n_jobs=self.n_jobs)

        # Final fit on the full training split
        model.fit(X_train, y_train)

        training_time = (datetime.now() - start_time).total_seconds()

        return model, cv_scores, best_params, training_time

    async def train_single_model(self, model_config: ModelConfig, X_train: np.ndarray,
                               y_train: np.ndarray, optimize_hyperparameters: bool = True) -> Tuple:
        """Train a single model with optional hyperparameter optimization.

        The blocking work runs in the event loop's default executor so the
        loop stays responsive while the model trains.

        Returns:
            ``(fitted_model, cv_scores, best_params, training_time_seconds)``.
            ``fitted_model`` is a fitted clone of ``model_config.model``.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self._fit_model, model_config, X_train, y_train, optimize_hyperparameters
        )

    async def automl_classification(self, X: pd.DataFrame, y: pd.Series,
                                  target_column: Optional[str] = None,
                                  models_to_try: Optional[List[str]] = None,
                                  optimize_hyperparameters: bool = True,
                                  feature_engineering: bool = True) -> AutoMLResult:
        """
        AutoML pipeline for classification tasks.
        Automatically tries multiple models and selects the best one.

        Args:
            X: Feature frame.
            y: Target values.
            target_column: Optional name of the target column inside ``X``.
            models_to_try: Keys of ``classification_models`` to train; all
                registered models when ``None``. Unknown keys are skipped
                with a warning.
            optimize_hyperparameters: Tune each model before fitting.
            feature_engineering: Run feature engineering during preprocessing.

        Returns:
            An ``AutoMLResult`` for the model with the highest mean CV score.

        Raises:
            ValueError: If ``models_to_try`` contains no registered model.
            RuntimeError: If every selected model failed to train.
        """
        logger.info("Starting AutoML classification pipeline...")

        # NOTE(review): the fitted scaler is not part of AutoMLResult, so new
        # data must be scaled by the caller before using the returned model.
        X_train, X_test, y_train, y_test, _scaler = await self.preprocess_data(
            X, y, target_column, perform_feature_engineering=feature_engineering
        )

        # Select models to try
        if models_to_try is None:
            models_to_try = list(self.classification_models)

        selected = []
        for model_name in dict.fromkeys(models_to_try):  # de-duplicate, keep order
            if model_name in self.classification_models:
                selected.append(model_name)
            else:
                logger.warning(f"Unknown model '{model_name}' skipped")

        if not selected:
            raise ValueError(
                "models_to_try does not contain any registered model; "
                f"available models: {sorted(self.classification_models)}"
            )

        # Train models in parallel worker threads while awaiting them, so the
        # event loop is not blocked for the duration of training.
        loop = asyncio.get_running_loop()
        with ThreadPoolExecutor(max_workers=min(len(selected), 4)) as executor:
            futures = [
                loop.run_in_executor(
                    executor, self._fit_model,
                    self.classification_models[model_name],
                    X_train, y_train, optimize_hyperparameters
                )
                for model_name in selected
            ]
            outcomes = await asyncio.gather(*futures, return_exceptions=True)

        # Collect results; one failing model must not abort the others.
        model_results = {}
        for model_name, outcome in zip(selected, outcomes):
            if isinstance(outcome, BaseException):
                logger.error(f"Error training {model_name}: {str(outcome)}")
                continue

            model, cv_scores, best_params, training_time = outcome
            model_results[model_name] = {
                'model': model,
                'cv_scores': cv_scores,
                'cv_mean': cv_scores.mean(),
                'cv_std': cv_scores.std(),
                'best_params': best_params,
                'training_time': training_time
            }
            logger.info(f"{model_name}: CV Score = {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

        # Select best model
        if not model_results:
            raise RuntimeError("No models were successfully trained")

        best_model_name = max(model_results, key=lambda k: model_results[k]['cv_mean'])
        best_result = model_results[best_model_name]

        # Evaluate on test set
        best_model = best_result['model']
        y_pred = best_model.predict(X_test)

        # Calculate evaluation metrics
        evaluation_metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'classification_report': classification_report(y_test, y_pred, output_dict=True),
            'confusion_matrix': confusion_matrix(y_test, y_pred).tolist()
        }

        # Feature importance (positional names: the scaled matrix has no column labels)
        feature_importance = None
        if hasattr(best_model, 'feature_importances_'):
            feature_names = [f"feature_{i}" for i in range(X_train.shape[1])]
            feature_importance = dict(zip(feature_names, best_model.feature_importances_))

        # Save training history
        self.training_history.append({
            'timestamp': datetime.now().isoformat(),
            'task_type': 'classification',
            'best_model': best_model_name,
            'best_score': best_result['cv_mean'],
            'models_tried': list(model_results.keys()),
            'evaluation_metrics': evaluation_metrics
        })

        logger.info(f"AutoML complete. Best model: {best_model_name} "
                   f"(CV Score: {best_result['cv_mean']:.4f}, "
                   f"Test Accuracy: {evaluation_metrics['accuracy']:.4f})")

        return AutoMLResult(
            best_model=best_model,
            best_score=best_result['cv_mean'],
            best_params=best_result['best_params'],
            cv_scores=best_result['cv_scores'],
            feature_importance=feature_importance,
            evaluation_metrics=evaluation_metrics,
            training_time=sum(r['training_time'] for r in model_results.values())
        )

    def create_ensemble_model(self, models: List[Tuple[str, Any]], voting: str = 'soft') -> VotingClassifier:
        """Create an ensemble model from multiple trained models.

        Args:
            models: ``(name, estimator)`` pairs passed to ``VotingClassifier``.
            voting: Voting strategy forwarded to ``VotingClassifier``.

        Returns:
            The ``VotingClassifier``; this method does not fit it.
        """
        return VotingClassifier(estimators=models, voting=voting, n_jobs=self.n_jobs)

    def save_model(self, model: Any, filepath: Union[str, Path],
                   include_metadata: bool = True) -> None:
        """Save trained model to disk with metadata.

        The model is pickled to ``filepath``. With ``include_metadata`` a JSON
        sidecar (same name, ``.json`` suffix) records the model type, its
        parameters and the latest training-history entry.

        Args:
            model: Object to pickle.
            filepath: Destination of the pickle file; missing parent
                directories are created.
            include_metadata: Also write the JSON sidecar.
        """
        filepath = Path(filepath)
        filepath.parent.mkdir(parents=True, exist_ok=True)

        # Save model
        with open(filepath, 'wb') as f:
            pickle.dump(model, f)

        # Save metadata
        if include_metadata:
            metadata = {
                'model_type': model.__class__.__name__,
                'saved_at': datetime.now().isoformat(),
                'model_params': model.get_params() if hasattr(model, 'get_params') else {},
                'training_history': self.training_history[-1] if self.training_history else None
            }

            metadata_path = filepath.with_suffix('.json')
            if metadata_path == filepath:
                # Never let the sidecar overwrite the pickle that was just written.
                metadata_path = filepath.with_name(filepath.name + '.meta.json')

            with open(metadata_path, 'w', encoding='utf-8') as f:
                # The default hook handles numpy scalars and nested estimators
                # that json would otherwise reject with a TypeError.
                json.dump(metadata, f, indent=2, default=self._json_default)

        logger.info(f"Model saved to {filepath}")

    def load_model(self, filepath: Union[str, Path]) -> Any:
        """Load trained model from disk.

        SECURITY: unpickling executes arbitrary code embedded in the file, so
        only load model files that come from a trusted source.
        """
        filepath = Path(filepath)

        with open(filepath, 'rb') as f:
            model = pickle.load(f)

        logger.info(f"Model loaded from {filepath}")
        return model

    def get_training_history(self) -> List[Dict]:
        """Get complete training history.

        Returns the internal list itself, not a copy.
        """
        return self.training_history

    def benchmark_models(self, X: pd.DataFrame, y: pd.Series,
                        n_iterations: int = 5) -> Dict[str, Dict]:
        """Benchmark all available models with multiple runs.

        Each iteration draws a fresh 80/20 split (seeded with
        ``random_state + i``) and fits a clone of every registered model with
        its configured parameters; no tuning, scaling or feature engineering
        is applied.

        Args:
            X: Feature frame.
            y: Target values.
            n_iterations: Number of splits per model; must be at least 1.

        Returns:
            Model key -> dict with ``mean_score``, ``std_score``,
            ``mean_time``, ``std_time`` and the raw ``scores`` / ``times``.

        Raises:
            ValueError: If ``n_iterations`` is smaller than 1.
        """
        from sklearn.base import clone

        if n_iterations < 1:
            raise ValueError("n_iterations must be at least 1")

        logger.info(f"Benchmarking models with {n_iterations} iterations...")

        benchmark_results = {}

        for model_name, model_config in self.classification_models.items():
            scores = []
            times = []

            for i in range(n_iterations):
                start_time = datetime.now()

                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.2, random_state=self.random_state + i
                )

                # Fit a clone so benchmarking never overwrites a trained model.
                model = clone(model_config.model)
                model.fit(X_train, y_train)
                score = model.score(X_test, y_test)

                scores.append(score)
                times.append((datetime.now() - start_time).total_seconds())

            benchmark_results[model_name] = {
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
                'mean_time': np.mean(times),
                'std_time': np.std(times),
                'scores': scores,
                'times': times
            }

        return benchmark_results

# Example usage and testing
async def main():
    """Example usage of the Machine Learning Engine.

    Runs five demo steps: AutoML on synthetic data, feature engineering on
    the iris data set, a randomized hyperparameter search, a model
    save/load round trip, and model benchmarking. Model files are written to
    a temporary directory that is removed afterwards, so the demo leaves
    nothing behind in the working directory.
    """
    import tempfile

    # Create sample data
    from sklearn.datasets import make_classification, load_iris

    print("=== Machine Learning Engine Demo ===")

    # Initialize engine
    ml_engine = MachineLearningEngine()

    # Test 1: AutoML with synthetic data
    print("\n1. Testing AutoML with synthetic classification data...")
    X_synthetic, y_synthetic = make_classification(
        n_samples=1000, n_features=10, n_informative=5,
        n_redundant=2, n_classes=3, random_state=42
    )
    X_synthetic_df = pd.DataFrame(X_synthetic, columns=[f"feature_{i}" for i in range(10)])
    y_synthetic_series = pd.Series(y_synthetic)

    automl_result = await ml_engine.automl_classification(
        X_synthetic_df, y_synthetic_series,
        models_to_try=['random_forest', 'xgboost', 'lightgbm'],
        optimize_hyperparameters=True
    )

    print(f"Best model: {automl_result.best_model.__class__.__name__}")
    print(f"Best CV score: {automl_result.best_score:.4f}")
    print(f"Test accuracy: {automl_result.evaluation_metrics['accuracy']:.4f}")

    # Test 2: Feature engineering
    print("\n2. Testing feature engineering...")
    iris = load_iris()
    X_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
    X_iris['species'] = iris.target
    # Seeded generator keeps the demo reproducible from run to run.
    rng = np.random.default_rng(42)
    X_iris['categorical_feature'] = rng.choice(['A', 'B', 'C'], len(X_iris))

    feature_engineer = FeatureEngineer()
    X_engineered = feature_engineer.engineer_features(X_iris.drop('species', axis=1))

    print(f"Original features: {len(X_iris.columns) - 1}")
    print(f"Engineered features: {len(X_engineered.columns)}")

    # Test 3: Hyperparameter optimization
    print("\n3. Testing hyperparameter optimization...")
    optimizer = HyperparameterOptimizer()

    # RandomForestClassifier is already imported at module level.
    rf = RandomForestClassifier(random_state=42)
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20]
    }

    best_params = optimizer.optimize(rf, param_grid, X_synthetic, y_synthetic, method='random')
    print(f"Best parameters: {best_params}")

    # Test 4: Model persistence
    print("\n4. Testing model saving and loading...")
    with tempfile.TemporaryDirectory() as tmp_dir:
        model_path = Path(tmp_dir) / 'best_model.pkl'
        ml_engine.save_model(automl_result.best_model, model_path)
        loaded_model = ml_engine.load_model(model_path)
    print(f"Model saved and loaded successfully: {loaded_model.__class__.__name__}")

    # Test 5: Benchmarking
    print("\n5. Testing model benchmarking...")
    benchmark_results = ml_engine.benchmark_models(
        X_synthetic_df, y_synthetic_series, n_iterations=3
    )

    print("Benchmark Results:")
    for model_name, results in benchmark_results.items():
        print(f"{model_name}: {results['mean_score']:.4f} ± {results['std_score']:.4f}")

    print("\n=== Machine Learning Engine Demo Complete ===")

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())