#!/usr/bin/env python3
"""
Advanced Monitoring and Observability Framework for TuskLang Python SDK
Goal 7.3 Implementation - Monitoring and Observability System

Features:
- Comprehensive metrics collection and aggregation
- Structured logging with correlation IDs
- Distributed tracing capabilities
- Real-time alerting and notifications
- Performance monitoring and analysis
- Health checks and service discovery
"""

import time
import threading
import logging
import json
import uuid
import asyncio
import functools
from typing import Any, Dict, List, Optional, Callable, Union
from dataclasses import dataclass, field, asdict
from collections import defaultdict, deque
from contextlib import contextmanager
import weakref
import traceback
from datetime import datetime, timedelta

logger = logging.getLogger(__name__)

@dataclass
class Metric:
    """One recorded measurement of a named metric.

    Fields, in constructor order:
        name: Identifier the sample was recorded under.
        value: Numeric reading.
        timestamp: Time of the reading as a float; ``MetricsCollector``
            fills this with ``time.time()``.
        tags: String key/value labels used for filtering. Each instance
            gets its own empty dict when none is supplied.
        metadata: Arbitrary extra data attached to the sample. Each
            instance gets its own empty dict when none is supplied.
    """

    # Required fields
    name: str
    value: float
    timestamp: float

    # Optional fields (per-instance containers via default_factory)
    tags: Dict[str, str] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)

@dataclass
class LogEntry:
    """One structured log record.

    Fields, in constructor order:
        level: Severity name as supplied by the caller (stored verbatim).
        message: Log message text.
        timestamp: Time the entry was created, as a float.
        correlation_id: Identifier tying the entry to a logical request.
        service: Name of the service that produced the entry.
        component: Sub-component within the service.
        tags: String key/value labels; a fresh empty dict by default.
        context: Arbitrary structured context; a fresh empty dict by default.
        stack_trace: Optional traceback text; empty string by default.
    """

    # Required fields
    level: str
    message: str
    timestamp: float
    correlation_id: str
    service: str
    component: str

    # Optional fields
    tags: Dict[str, str] = field(default_factory=dict)
    context: Dict[str, Any] = field(default_factory=dict)
    stack_trace: str = ""

@dataclass
class TraceSpan:
    """One span (timed unit of work) inside a trace.

    Fields, in constructor order:
        trace_id: Identifier of the trace the span belongs to.
        span_id: Identifier of this span.
        parent_span_id: Identifier of the parent span, or ``None``.
        name: Name of the span.
        start_time: Start time as a float.
        end_time: End time as a float; ``None`` until the span is ended.
        tags: String key/value labels; a fresh empty dict by default.
        events: Event dicts attached to the span; a fresh empty list
            by default.
        status: Span status string; ``"active"`` by default.
    """

    # Required fields
    trace_id: str
    span_id: str
    parent_span_id: Optional[str]
    name: str
    start_time: float

    # Optional fields
    end_time: Optional[float] = None
    tags: Dict[str, str] = field(default_factory=dict)
    events: List[Dict[str, Any]] = field(default_factory=list)
    status: str = "active"

@dataclass
class Alert:
    """Definition and runtime state of one threshold alert.

    Fields, in constructor order:
        name: Alert name. ``AlertManager.check_alerts`` also uses it as the
            metric name whose aggregation is compared with ``threshold``.
        condition: Comparison selector; ``AlertManager.check_alerts``
            recognises ``"gt"``, ``"lt"`` and ``"eq"``.
        threshold: Value the metric is compared against.
        severity: Severity label passed through to alert handlers.
        message: Message passed through to alert handlers.
        enabled: Disabled alerts are skipped during checks.
        last_triggered: Timestamp of the most recent trigger, or ``None``
            if the alert has not fired.
        cooldown_period: Minimum number of seconds between two triggers.
    """

    # Required fields
    name: str
    condition: str
    threshold: float
    severity: str
    message: str

    # Optional fields
    enabled: bool = True
    last_triggered: Optional[float] = None
    cooldown_period: float = 300.0  # seconds, i.e. 5 minutes

class MetricsCollector:
    """Thread-safe, in-memory metric store with periodic aggregation.

    Samples are kept per metric name in a bounded deque (at most 10000
    samples per name). A daemon thread periodically recomputes summary
    statistics over the last hour of samples and discards samples older
    than ``retention_hours``.
    """

    def __init__(self, retention_hours: int = 24, aggregation_interval: float = 60.0):
        """Create the store and start the background aggregation thread.

        Args:
            retention_hours: Samples older than this many hours are dropped
                on each aggregation pass. ``None`` or a non-positive value
                disables time-based pruning (the per-name 10000-sample cap
                still applies).
            aggregation_interval: Seconds between aggregation passes.
        """
        self.retention_hours = retention_hours
        self.aggregation_interval = aggregation_interval
        self.metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=10000))
        self.aggregations: Dict[str, Dict[str, float]] = defaultdict(dict)
        self.lock = threading.RLock()
        # Set by stop(); also used as an interruptible sleep by the worker.
        self._stop_event = threading.Event()

        # Start background aggregation
        self._start_aggregation_thread()

    def record_metric(self, name: str, value: float, tags: Optional[Dict[str, str]] = None,
                     metadata: Optional[Dict[str, Any]] = None) -> None:
        """Append one sample, timestamped with ``time.time()``, to ``name``."""
        metric = Metric(
            name=name,
            value=value,
            timestamp=time.time(),
            tags=tags or {},
            metadata=metadata or {}
        )

        with self.lock:
            self.metrics[name].append(metric)

    def get_metric(self, name: str, tags: Optional[Dict[str, str]] = None,
                   time_range: Optional[float] = None) -> List["Metric"]:
        """Return the stored samples for ``name``, oldest first.

        Args:
            name: Metric name to look up.
            tags: When given, only samples whose tags contain every one of
                these key/value pairs are returned.
            time_range: When given, only samples recorded within the last
                ``time_range`` seconds are returned. ``0`` is a valid
                (maximally narrow) window, not "no filter".

        Returns:
            A new list of matching samples; empty when the name is unknown.
        """
        with self.lock:
            # Membership test first so lookups never create empty deques
            # in the defaultdict.
            if name not in self.metrics:
                return []
            metrics = list(self.metrics[name])

        # Filtering works on the snapshot, so the lock is not needed here.
        if tags:
            metrics = [
                m for m in metrics
                if all(m.tags.get(k) == v for k, v in tags.items())
            ]

        if time_range is not None:
            cutoff_time = time.time() - time_range
            metrics = [m for m in metrics if m.timestamp >= cutoff_time]

        return metrics

    def get_aggregation(self, name: str, aggregation_type: str = "avg") -> Optional[float]:
        """Return a precomputed aggregate for ``name``.

        Aggregates are only refreshed by the background pass, so a metric
        recorded since the last pass may not be reflected yet.

        Args:
            name: Metric name.
            aggregation_type: One of ``count``, ``sum``, ``avg``, ``min``,
                ``max`` or ``last``.

        Returns:
            The aggregate value, or ``None`` if none has been computed.
        """
        with self.lock:
            # .get() does not trigger the defaultdict factory.
            return self.aggregations.get(name, {}).get(aggregation_type)

    def stop(self) -> None:
        """Ask the background aggregation thread to exit.

        The thread finishes its current pass (if any) and then returns.
        Stored samples and aggregates remain readable.
        """
        self._stop_event.set()

    def _start_aggregation_thread(self):
        """Start the daemon thread that periodically aggregates metrics."""
        def aggregate_metrics():
            while not self._stop_event.is_set():
                try:
                    self._calculate_aggregations()
                except Exception as e:
                    logger.error(f"Metric aggregation error: {e}")
                # Event.wait sleeps for the interval but wakes immediately
                # when stop() is called.
                self._stop_event.wait(self.aggregation_interval)

        self._aggregation_thread = threading.Thread(target=aggregate_metrics, daemon=True)
        self._aggregation_thread.start()

    def _calculate_aggregations(self):
        """Prune expired samples and recompute last-hour aggregates.

        NOTE(review): when a metric has no samples in the last hour its
        previous aggregate is left in place (unchanged from the earlier
        behaviour), so ``get_aggregation`` can return a stale value.
        """
        now = time.time()
        window_cutoff = now - 3600  # aggregates cover the last hour
        retention_cutoff = None
        if self.retention_hours is not None and self.retention_hours > 0:
            retention_cutoff = now - self.retention_hours * 3600

        with self.lock:
            for name, samples in self.metrics.items():
                if retention_cutoff is not None:
                    # Samples are appended as they are recorded, so expired
                    # ones sit at the left end of the deque.
                    while samples and samples[0].timestamp < retention_cutoff:
                        samples.popleft()

                values = [m.value for m in samples if m.timestamp >= window_cutoff]
                if not values:
                    continue

                total = sum(values)
                self.aggregations[name] = {
                    'count': len(values),
                    'sum': total,
                    'avg': total / len(values),
                    'min': min(values),
                    'max': max(values),
                    'last': values[-1]
                }

class StructuredLogger:
    """Structured, in-memory logger with per-thread correlation IDs.

    Entries are kept in a bounded deque (the most recent 10000) and are
    also forwarded to the module-level standard ``logger``.
    """

    def __init__(self, service_name: str = "tsk-sdk"):
        """Create a logger that stamps every entry with ``service_name``."""
        self.service_name = service_name
        self.logs: deque = deque(maxlen=10000)
        # threading.local gives each thread its own correlation_id attribute.
        self.correlation_context = threading.local()
        self.lock = threading.RLock()

    def set_correlation_id(self, correlation_id: Optional[str] = None) -> str:
        """Set the calling thread's correlation ID and return it.

        A random UUID4 string is generated when ``correlation_id`` is None.
        """
        if correlation_id is None:
            correlation_id = str(uuid.uuid4())
        self.correlation_context.correlation_id = correlation_id
        return correlation_id

    def get_correlation_id(self) -> str:
        """Return the calling thread's correlation ID, creating one if unset."""
        try:
            return self.correlation_context.correlation_id
        except AttributeError:
            return self.set_correlation_id()

    @staticmethod
    def _resolve_level(level: str) -> int:
        """Map a level name (any case) to a ``logging`` level number.

        Falls back to ``logging.INFO`` when the name is unknown or names a
        ``logging`` attribute that is not an integer level (for example
        ``"basic_format"``), which would otherwise make ``Logger.log`` fail.
        """
        resolved = getattr(logging, level.upper(), None)
        if isinstance(resolved, int) and not isinstance(resolved, bool):
            return resolved
        return logging.INFO

    def log(self, level: str, message: str, component: str = "general",
            tags: Optional[Dict[str, str]] = None,
            context: Optional[Dict[str, Any]] = None) -> None:
        """Store a structured entry and mirror it to the standard logger.

        Args:
            level: Level name; stored verbatim on the entry and resolved
                case-insensitively for the standard logger (unknown names
                are emitted at INFO).
            message: Log message.
            component: Component label for the entry.
            tags: Optional string labels.
            context: Optional structured context.
        """
        log_entry = LogEntry(
            level=level,
            message=message,
            timestamp=time.time(),
            correlation_id=self.get_correlation_id(),
            service=self.service_name,
            component=component,
            tags=tags or {},
            context=context or {}
        )

        with self.lock:
            self.logs.append(log_entry)

        # Also log to standard logger
        logger.log(self._resolve_level(level),
                   f"[{log_entry.correlation_id}] {message}")

    def get_logs(self, level: Optional[str] = None, component: Optional[str] = None,
                 correlation_id: Optional[str] = None,
                 time_range: Optional[float] = None) -> List["LogEntry"]:
        """Return stored entries matching every supplied filter, oldest first.

        Args:
            level: Exact (case-sensitive) match on the stored level string.
            component: Exact match on the component.
            correlation_id: Exact match on the correlation ID.
            time_range: Only entries from the last ``time_range`` seconds.
                ``0`` is a valid window, not "no filter".
        """
        with self.lock:
            logs = list(self.logs)

        # Filters run on the snapshot, outside the lock.
        if level:
            logs = [log for log in logs if log.level == level]
        if component:
            logs = [log for log in logs if log.component == component]
        if correlation_id:
            logs = [log for log in logs if log.correlation_id == correlation_id]
        if time_range is not None:
            cutoff_time = time.time() - time_range
            logs = [log for log in logs if log.timestamp >= cutoff_time]

        return logs

class TraceCollector:
    """In-memory collector of trace spans.

    Spans are grouped by trace ID. Spans that have been started but not
    ended are additionally indexed by span ID in ``active_spans``.
    """

    def __init__(self, max_traces: Optional[int] = 10000):
        """Create an empty collector.

        Args:
            max_traces: Maximum number of distinct traces to retain. When
                the limit is exceeded the traces that were created first
                are discarded. ``None`` keeps every trace (unbounded).
        """
        self.traces: Dict[str, List[TraceSpan]] = defaultdict(list)
        self.active_spans: Dict[str, TraceSpan] = {}
        self.max_traces = max_traces
        self.lock = threading.RLock()

    def start_span(self, name: str, trace_id: Optional[str] = None,
                   parent_span_id: Optional[str] = None,
                   tags: Optional[Dict[str, str]] = None) -> str:
        """Start a new span and return its ID.

        Args:
            name: Span name.
            trace_id: Trace to attach the span to. When omitted, the trace
                of the still-active parent span is used; if there is no
                such parent, a new trace ID is generated.
            parent_span_id: ID of the parent span, if any.
            tags: Initial tags; copied so later updates to the span do not
                modify the caller's dict.

        Returns:
            The generated span ID (a UUID4 string).
        """
        span_id = str(uuid.uuid4())

        with self.lock:
            if trace_id is None:
                # Callers only ever receive span IDs, so a child span can
                # join its parent's trace only by looking the parent up.
                parent = self.active_spans.get(parent_span_id) if parent_span_id is not None else None
                trace_id = parent.trace_id if parent is not None else str(uuid.uuid4())

            span = TraceSpan(
                trace_id=trace_id,
                span_id=span_id,
                parent_span_id=parent_span_id,
                name=name,
                start_time=time.time(),
                tags=dict(tags) if tags else {}
            )
            self.traces[trace_id].append(span)
            self.active_spans[span_id] = span
            self._evict_old_traces()

        return span_id

    def end_span(self, span_id: str, status: str = "success",
                 tags: Optional[Dict[str, str]] = None) -> None:
        """End an active span; unknown or already-ended IDs are ignored.

        Args:
            span_id: ID returned by ``start_span``.
            status: Final status stored on the span.
            tags: Extra tags merged into the span's tags.
        """
        with self.lock:
            span = self.active_spans.pop(span_id, None)
            if span is None:
                return
            span.end_time = time.time()
            span.status = status
            if tags:
                span.tags.update(tags)

    def add_span_event(self, span_id: str, event_name: str,
                      attributes: Optional[Dict[str, Any]] = None) -> None:
        """Append a timestamped event to an active span.

        Events for unknown or already-ended spans are ignored.
        """
        with self.lock:
            span = self.active_spans.get(span_id)
            if span is None:
                return
            span.events.append({
                'name': event_name,
                'timestamp': time.time(),
                'attributes': attributes or {}
            })

    def get_trace(self, trace_id: str) -> List["TraceSpan"]:
        """Return a new list of the spans recorded for ``trace_id``.

        Returns an empty list for unknown (or evicted) traces.
        """
        with self.lock:
            # .get() avoids creating an empty entry in the defaultdict.
            return list(self.traces.get(trace_id, []))

    def _evict_old_traces(self) -> None:
        """Discard the earliest-created traces beyond ``max_traces``."""
        if self.max_traces is None:
            return
        with self.lock:
            # Dicts preserve insertion order, so the first key is the trace
            # that was created first.
            while self.traces and len(self.traces) > self.max_traces:
                del self.traces[next(iter(self.traces))]

class AlertManager:
    """Registry of threshold alerts and the handlers notified when they fire."""

    # Supported Alert.condition values and the comparison each performs
    # (metric value first, threshold second).
    _COMPARATORS = {
        "gt": lambda value, threshold: value > threshold,
        "lt": lambda value, threshold: value < threshold,
        "eq": lambda value, threshold: value == threshold,
    }

    def __init__(self):
        self.alerts: Dict[str, Alert] = {}
        self.alert_handlers: List[Callable] = []
        self.lock = threading.RLock()

    def add_alert(self, alert: "Alert") -> None:
        """Register ``alert`` under its name, replacing any alert of that name."""
        with self.lock:
            self.alerts[alert.name] = alert

    def register_alert_handler(self, handler: Callable) -> None:
        """Register a callable invoked with the alert-info dict of each trigger."""
        with self.lock:
            self.alert_handlers.append(handler)

    def check_alerts(self, metrics_collector: "MetricsCollector") -> List[Dict[str, Any]]:
        """Evaluate every enabled alert against the collector's metrics.

        For each alert, the ``"last"`` aggregation of the metric named like
        the alert is compared with the alert's threshold. Alerts that are
        disabled, still in their cooldown period, have no aggregation
        available, or use an unrecognised condition are skipped.

        Handlers are called after all alerts have been evaluated and
        without holding the manager's lock, so a handler may safely add
        alerts or register handlers. A handler that raises is logged and
        does not prevent the remaining handlers from running.

        Args:
            metrics_collector: Object providing
                ``get_aggregation(name, aggregation_type)``.

        Returns:
            One dict per triggered alert with the keys ``name``,
            ``message``, ``severity``, ``current_value``, ``threshold``
            and ``timestamp``.
        """
        triggered_alerts = []

        with self.lock:
            # Snapshots keep the evaluation and the notification below safe
            # from concurrent or re-entrant registration.
            handlers = list(self.alert_handlers)

            for alert_name, alert in list(self.alerts.items()):
                if not alert.enabled:
                    continue

                now = time.time()
                if (alert.last_triggered is not None and
                        now - alert.last_triggered < alert.cooldown_period):
                    continue

                current_value = metrics_collector.get_aggregation(alert_name, "last")
                if current_value is None:
                    continue

                comparator = self._COMPARATORS.get(alert.condition)
                if comparator is None or not comparator(current_value, alert.threshold):
                    continue

                # Updated under the lock so concurrent checks cannot both
                # fire the same alert within one cooldown window.
                alert.last_triggered = now
                triggered_alerts.append({
                    'name': alert.name,
                    'message': alert.message,
                    'severity': alert.severity,
                    'current_value': current_value,
                    'threshold': alert.threshold,
                    'timestamp': now
                })

        # Notify handlers outside the lock.
        for alert_info in triggered_alerts:
            for handler in handlers:
                try:
                    handler(alert_info)
                except Exception as e:
                    logger.error(f"Alert handler failed: {e}")

        return triggered_alerts

class HealthChecker:
    """Registry of named health-check callables and their latest results."""

    def __init__(self):
        self.health_checks: Dict[str, Callable] = {}
        self.health_status: Dict[str, Dict[str, Any]] = {}
        self.lock = threading.RLock()

    def register_health_check(self, name: str, check_func: Callable) -> None:
        """Register ``check_func`` under ``name``, replacing any existing check.

        ``check_func`` is called with no arguments; a truthy return value
        means healthy, a falsy value or a raised exception means unhealthy.
        """
        with self.lock:
            self.health_checks[name] = check_func

    def run_health_checks(self) -> Dict[str, Dict[str, Any]]:
        """Run every registered check and record its status.

        The checks are run on a snapshot of the registry and without
        holding the lock, so a slow check does not block readers and a
        check may itself register further checks (those run on the next
        call).

        Returns:
            A dict mapping each check name to a status dict with the keys
            ``status`` (``'healthy'`` or ``'unhealthy'``), ``timestamp``,
            and either ``result`` (the check's return value) or ``error``
            (the exception message).
        """
        with self.lock:
            checks = list(self.health_checks.items())

        results = {}
        for name, check_func in checks:
            try:
                result = check_func()
            except Exception as e:
                status = {
                    'status': 'unhealthy',
                    'timestamp': time.time(),
                    'error': str(e)
                }
            else:
                status = {
                    'status': 'healthy' if result else 'unhealthy',
                    'timestamp': time.time(),
                    'result': result
                }

            results[name] = status
            # Published per check so readers see results as they arrive.
            with self.lock:
                self.health_status[name] = status

        return results

    def get_health_status(self, name: str = None) -> Dict[str, Any]:
        """Return the most recently recorded status.

        Args:
            name: When given, return that check's status dict (empty dict
                if it has never been run). When omitted, return a shallow
                copy of the status of every check.
        """
        with self.lock:
            if name:
                return self.health_status.get(name, {})
            return dict(self.health_status)

class MonitoringFramework:
    """Facade bundling metrics, structured logging, tracing, alerts and health checks."""

    def __init__(self, service_name: str = "tsk-sdk"):
        """Create all sub-systems and register the default health checks.

        Args:
            service_name: Name reported in logs and in the status report.
        """
        self.service_name = service_name
        self.metrics = MetricsCollector()
        self.logger = StructuredLogger(service_name)
        self.tracer = TraceCollector()
        self.alerts = AlertManager()
        self.health = HealthChecker()

        # Register default health checks
        self._register_default_health_checks()

    def _register_default_health_checks(self):
        """Register the built-in memory, disk and uptime checks."""
        self.health.register_health_check("memory_usage", self._check_memory_usage)
        self.health.register_health_check("disk_space", self._check_disk_space)
        self.health.register_health_check("service_uptime", self._check_service_uptime)

    def _check_memory_usage(self) -> bool:
        """Return True when this process uses less than 80% of memory.

        Best effort: if ``psutil`` is missing or the query fails, the check
        reports healthy rather than raising.
        """
        try:
            import psutil
            process = psutil.Process()
            memory_percent = process.memory_percent()
            return memory_percent < 80  # Healthy if < 80%
        except Exception:
            # Narrower than a bare ``except`` so KeyboardInterrupt and
            # SystemExit are not swallowed.
            return True

    def _check_disk_space(self) -> bool:
        """Return True when the disk holding ``/`` is less than 90% full.

        Best effort: if ``psutil`` is missing or the query fails, the check
        reports healthy rather than raising.
        """
        try:
            import psutil
            disk_usage = psutil.disk_usage('/')
            return disk_usage.percent < 90  # Healthy if < 90%
        except Exception:
            return True

    def _check_service_uptime(self) -> bool:
        """Placeholder uptime check; always reports healthy."""
        return True  # Always healthy for now

    @contextmanager
    def trace_operation(self, operation_name: str, trace_id: str = None,
                       parent_span_id: str = None, tags: Dict[str, str] = None):
        """Context manager that wraps the enclosed block in a trace span.

        The span is ended with status ``"success"`` when the block
        completes, or with status ``"error"`` (and an ``error`` tag holding
        the exception text) when it raises. The exception is re-raised.

        Yields:
            The ID of the started span.
        """
        span_id = self.tracer.start_span(operation_name, trace_id, parent_span_id, tags)
        try:
            yield span_id
        except BaseException as e:
            # BaseException, not Exception: cancellation and interrupts
            # must also end the span, otherwise it would remain in the
            # tracer's active spans forever.
            self.tracer.end_span(span_id, "error", {"error": str(e)})
            raise
        else:
            self.tracer.end_span(span_id, "success")

    def record_metric(self, name: str, value: float, tags: Dict[str, str] = None,
                      metadata: Dict[str, Any] = None):
        """Record a metric sample via the metrics collector."""
        self.metrics.record_metric(name, value, tags, metadata)

    def log(self, level: str, message: str, component: str = "general",
            tags: Dict[str, str] = None, context: Dict[str, Any] = None):
        """Write a structured log entry via the structured logger."""
        self.logger.log(level, message, component, tags, context)

    def set_correlation_id(self, correlation_id: str = None) -> str:
        """Set the calling thread's correlation ID and return it."""
        return self.logger.set_correlation_id(correlation_id)

    def get_correlation_id(self) -> str:
        """Return the calling thread's correlation ID."""
        return self.logger.get_correlation_id()

    def add_alert(self, name: str, condition: str, threshold: float,
                  severity: str, message: str, cooldown_period: float = 300.0):
        """Build an ``Alert`` from the arguments and register it."""
        alert = Alert(
            name=name,
            condition=condition,
            threshold=threshold,
            severity=severity,
            message=message,
            cooldown_period=cooldown_period
        )
        self.alerts.add_alert(alert)

    def register_alert_handler(self, handler: Callable):
        """Register a callable to be notified when an alert triggers."""
        self.alerts.register_alert_handler(handler)

    def check_alerts(self) -> List[Dict[str, Any]]:
        """Evaluate all alerts against this framework's metrics."""
        return self.alerts.check_alerts(self.metrics)

    def run_health_checks(self) -> Dict[str, Dict[str, Any]]:
        """Run all registered health checks and return their statuses."""
        return self.health.run_health_checks()

    def get_status_report(self) -> Dict[str, Any]:
        """Return a snapshot of counts and health across all sub-systems.

        Each sub-system is read under its own lock (one at a time) so that
        concurrent writers cannot change a dict while it is being iterated.
        """
        with self.metrics.lock:
            total_metrics = sum(len(samples) for samples in self.metrics.metrics.values())
        with self.tracer.lock:
            active_spans = len(self.tracer.active_spans)
        with self.logger.lock:
            total_logs = len(self.logger.logs)
        with self.alerts.lock:
            total_alerts = len(self.alerts.alerts)
            enabled_alerts = sum(1 for alert in self.alerts.alerts.values() if alert.enabled)

        return {
            'service': self.service_name,
            'timestamp': time.time(),
            'metrics': {
                'total_metrics': total_metrics,
                # Key kept for compatibility; the value counts active spans.
                'active_traces': active_spans,
                'total_logs': total_logs
            },
            'health': self.health.get_health_status(),
            'alerts': {
                'total_alerts': total_alerts,
                'enabled_alerts': enabled_alerts
            }
        }

# Global monitoring framework instance used by the monitor_operation and
# monitor_async_operation decorators below.
# NOTE: it is constructed when this module is imported; MonitoringFramework()
# builds a MetricsCollector, whose constructor starts a daemon aggregation
# thread.
monitoring_framework = MonitoringFramework()

def monitor_operation(operation_name: Optional[str] = None, record_metrics: bool = True,
                     trace: bool = True, log: bool = True):
    """Decorator that instruments a synchronous function.

    Depending on the flags, each call of the decorated function is wrapped
    in a trace span, timed, and logged through the global
    ``monitoring_framework``.

    Args:
        operation_name: Name used for the span, metrics and log messages.
            Defaults to the decorated function's ``__name__``.
        record_metrics: Record ``<name>_duration`` (seconds) plus either
            ``<name>_success`` or ``<name>_error`` for each call.
        trace: Run the call inside ``monitoring_framework.trace_operation``.
        log: Emit an ``info`` entry on success or an ``error`` entry on
            failure.

    Returns:
        A decorator. The wrapped function returns whatever the original
        returns and re-raises whatever it raises.
    """
    def decorator(func):
        def _report_success(op_name, started):
            # Metrics first, then the log line.
            if record_metrics:
                duration = time.perf_counter() - started
                monitoring_framework.record_metric(f"{op_name}_duration", duration)
                monitoring_framework.record_metric(f"{op_name}_success", 1)
            if log:
                monitoring_framework.log("info", f"Operation {op_name} completed successfully")

        def _report_failure(op_name, started, error):
            if record_metrics:
                duration = time.perf_counter() - started
                monitoring_framework.record_metric(f"{op_name}_duration", duration)
                monitoring_framework.record_metric(f"{op_name}_error", 1)
            if log:
                monitoring_framework.log("error", f"Operation {op_name} failed: {error}")

        def _invoke(op_name, args, kwargs):
            # perf_counter is monotonic, so durations are not distorted by
            # system clock adjustments.
            started = time.perf_counter()
            try:
                result = func(*args, **kwargs)
            except Exception as e:
                _report_failure(op_name, started, e)
                raise
            # Outside the try block: a failure while reporting success must
            # not be recorded as a failure of the operation itself.
            _report_success(op_name, started)
            return result

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            op_name = operation_name or getattr(func, "__name__", type(func).__name__)
            if not trace:
                return _invoke(op_name, args, kwargs)
            with monitoring_framework.trace_operation(op_name):
                return _invoke(op_name, args, kwargs)
        return wrapper
    return decorator

def monitor_async_operation(operation_name: Optional[str] = None, record_metrics: bool = True,
                           trace: bool = True, log: bool = True):
    """Decorator that instruments a coroutine function.

    Depending on the flags, each awaited call of the decorated coroutine
    function is wrapped in a trace span, timed, and logged through the
    global ``monitoring_framework``.

    Args:
        operation_name: Name used for the span, metrics and log messages.
            Defaults to the decorated function's ``__name__``.
        record_metrics: Record ``<name>_duration`` (seconds) plus either
            ``<name>_success`` or ``<name>_error`` for each call.
        trace: Run the call inside ``monitoring_framework.trace_operation``.
        log: Emit an ``info`` entry on success or an ``error`` entry on
            failure.

    Returns:
        A decorator. The wrapped coroutine function returns whatever the
        original returns and re-raises whatever it raises.
    """
    def decorator(func):
        def _report_success(op_name, started):
            # Metrics first, then the log line.
            if record_metrics:
                duration = time.perf_counter() - started
                monitoring_framework.record_metric(f"{op_name}_duration", duration)
                monitoring_framework.record_metric(f"{op_name}_success", 1)
            if log:
                monitoring_framework.log("info", f"Async operation {op_name} completed successfully")

        def _report_failure(op_name, started, error):
            if record_metrics:
                duration = time.perf_counter() - started
                monitoring_framework.record_metric(f"{op_name}_duration", duration)
                monitoring_framework.record_metric(f"{op_name}_error", 1)
            if log:
                monitoring_framework.log("error", f"Async operation {op_name} failed: {error}")

        async def _invoke(op_name, args, kwargs):
            # perf_counter is monotonic, so durations are not distorted by
            # system clock adjustments.
            started = time.perf_counter()
            try:
                result = await func(*args, **kwargs)
            except Exception as e:
                _report_failure(op_name, started, e)
                raise
            # Outside the try block: a failure while reporting success must
            # not be recorded as a failure of the operation itself.
            _report_success(op_name, started)
            return result

        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            op_name = operation_name or getattr(func, "__name__", type(func).__name__)
            if not trace:
                return await _invoke(op_name, args, kwargs)
            with monitoring_framework.trace_operation(op_name):
                return await _invoke(op_name, args, kwargs)
        return wrapper
    return decorator