"""
Simple DuckDB Mounting for Hypersets

Mounts remote parquet files as virtual tables and lets DuckDB
handle all optimization, range requests, and file selection.
"""

import time
import logging
from typing import List, Optional, Any, Dict
from contextlib import contextmanager
import random

import duckdb
import pandas as pd
from .dataset_info import DatasetInfo, get_dataset_info

logger = logging.getLogger(__name__)


def _handle_duckdb_429_retries(func, max_retries: int = 10, base_delay: float = 1.0):
    """
    Wrapper for DuckDB operations to handle 429 rate limits with exponential backoff.

    This is the REAL 429 handling that was missing - it wraps DuckDB query execution
    and retries when DuckDB's HTTP client hits rate limits.
    """
    last_exception = None

    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            error_str = str(e)

            # Check if this is a 429 or rate limit error from DuckDB's HTTP client
            is_rate_limit = any(
                phrase in error_str.lower()
                for phrase in ["429", "rate limit", "slow down", "too many requests"]
            )

            if not is_rate_limit:
                # Not a rate limit error, re-raise immediately
                raise

            last_exception = e

            if attempt == max_retries - 1:
                # Last attempt, give up
                logger.error(
                    f"Max retries ({max_retries}) exceeded for DuckDB HTTP 429 errors"
                )
                raise last_exception

            # Calculate exponential backoff with jitter
            delay = min(base_delay * (2**attempt), 60.0)  # Cap at 60 seconds
            jitter = random.uniform(0.1, 0.3) * delay  # Add 10-30% jitter
            actual_delay = delay + jitter

            logger.warning(
                f"DuckDB hit 429 rate limit - retrying in {actual_delay:.1f}s (attempt {attempt + 1}/{max_retries})"
            )
            logger.warning(f"Error was: {error_str}")

            # Actually wait the delay
            time.sleep(actual_delay)

    # Should never reach here due to the attempt check above
    raise last_exception


class DuckDBMount:
    """
    In-memory DuckDB connection that mounts parquet files as views.

    Each mounted dataset becomes a view over ``read_parquet(...)``; DuckDB is
    left to handle optimization. Statements that may touch the remote files
    are executed through ``_handle_duckdb_429_retries`` so that rate-limit
    errors are retried with backoff.

    Can be used as a context manager: entering connects, leaving closes.
    """

    def __init__(self, memory_limit: str = "1GB", threads: int = 4):
        """
        Store the settings; no connection is opened until ``connect()``.

        Args:
            memory_limit: Value applied with ``SET memory_limit``.
            threads: Value applied with ``SET threads``.
        """
        self.conn = None
        self.memory_limit = memory_limit
        self.threads = threads
        # table_name -> metadata dict recorded by mount_dataset()
        self.mounted_datasets = {}

    def __enter__(self):
        self.connect()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    @staticmethod
    def _sql_string(value) -> str:
        """Return ``value`` as a single-quoted SQL literal with ' doubled."""
        return "'" + str(value).replace("'", "''") + "'"

    @classmethod
    def _parquet_files_expr(cls, parquet_urls) -> str:
        """
        Build the argument passed to ``read_parquet``.

        A single URL becomes one quoted string; several URLs become a
        bracketed, comma-separated list of quoted strings.
        """
        quoted = [cls._sql_string(url) for url in parquet_urls]
        if len(quoted) == 1:
            return quoted[0]
        return "[" + ", ".join(quoted) + "]"

    def connect(self):
        """
        Create and configure the DuckDB connection (no-op if already open).

        The connection is only stored on ``self.conn`` once every setup
        statement succeeded. If setup fails, the new connection is closed and
        the error propagates, so a later ``connect()`` starts from scratch
        instead of reusing a half-configured connection.
        """
        if self.conn is not None:
            return

        logger.debug("Creating DuckDB connection")
        conn = duckdb.connect(":memory:")

        try:
            # Configure DuckDB for remote parquet files
            conn.execute("INSTALL httpfs")
            conn.execute("LOAD httpfs")

            # Resource settings
            conn.execute(f"SET memory_limit={self._sql_string(self.memory_limit)}")
            conn.execute(f"SET threads={self.threads}")

            # HTTP settings for better reliability (but DuckDB doesn't have built-in 429 handling)
            conn.execute("SET http_timeout=300000")  # 5 minutes - longer for retries
            conn.execute("SET http_retries=3")  # DuckDB's internal retries (not for 429s)
            conn.execute("SET http_retry_wait_ms=5000")  # 5 seconds between retries

            # Disable progress bar for cleaner output
            conn.execute("SET enable_progress_bar=false")
        except Exception:
            conn.close()
            raise

        self.conn = conn

    def close(self):
        """
        Close the connection and forget all mounts (no-op if not connected).

        The instance state is reset even when closing the connection raises.
        """
        if self.conn is None:
            return
        try:
            self.conn.close()
        finally:
            self.conn = None
            self.mounted_datasets.clear()

    def mount_dataset(
        self,
        dataset_name: str,
        table_name: str = "dataset",
        config: Optional[str] = None,
        split: Optional[str] = None,
        token: Optional[str] = None,
    ):
        """
        Mount a HuggingFace dataset as a virtual table in DuckDB.

        Connects first if necessary, resolves the parquet URLs through
        ``get_dataset_info`` and creates (or replaces) a view over them.

        Args:
            dataset_name: HF dataset name (e.g. "wikimedia/wikipedia")
            table_name: Name for the virtual table (default: "dataset").
                It is inserted into the SQL unquoted, so it must be a valid
                identifier from a trusted source.
            config: Optional config filter (e.g. "20231101.en")
            split: Optional split filter (e.g. "train")
            token: Optional HF token

        Raises:
            ValueError: If no parquet files match the dataset/config/split.
        """
        if self.conn is None:
            self.connect()

        # Get dataset info and parquet URLs
        dataset_info = get_dataset_info(dataset_name, token=token)
        parquet_urls = dataset_info.get_parquet_urls(config=config, split=split)

        if not parquet_urls:
            raise ValueError(
                f"No parquet files found for {dataset_name} config={config} split={split}"
            )

        # Built once, outside the retried closure; quotes in URLs are escaped.
        files_expr = self._parquet_files_expr(parquet_urls)

        def _mount_with_retries():
            # CREATE OR REPLACE keeps a previously mounted view intact when
            # this statement fails, so mounted_datasets never describes a
            # view that was already dropped.
            # This is where 429s can happen during the initial schema discovery
            self.conn.execute(
                f"CREATE OR REPLACE VIEW {table_name} AS "
                f"SELECT * FROM read_parquet({files_expr})"
            )

        # Execute mount with 429 retry handling
        _handle_duckdb_429_retries(_mount_with_retries)

        # Store mount info
        self.mounted_datasets[table_name] = {
            "dataset_name": dataset_name,
            "config": config,
            "split": split,
            "file_count": len(parquet_urls),
            "dataset_info": dataset_info,
        }

        logger.info(
            f"Mounted {dataset_name} as '{table_name}' ({len(parquet_urls)} files)"
        )

    def query(self, sql: str) -> pd.DataFrame:
        """
        Execute a SQL query with 429 retry handling.

        Args:
            sql: SQL text, executed as given.

        Returns:
            The query result as a pandas DataFrame.

        Raises:
            RuntimeError: If no connection is open.
        """
        if self.conn is None:
            raise RuntimeError("Not connected to DuckDB")

        def _execute_with_retries():
            logger.debug(f"Executing SQL: {sql}")
            return self.conn.execute(sql).df()

        # Up to 15 attempts, starting with a 2 second backoff
        return _handle_duckdb_429_retries(
            _execute_with_retries, max_retries=15, base_delay=2.0
        )

    def count(self, table_name: str = "dataset") -> int:
        """Return the ``COUNT(*)`` of ``table_name``, run through ``query()``."""
        result = self.query(f"SELECT COUNT(*) as count FROM {table_name}")
        return int(result["count"].iloc[0])

    def schema(self, table_name: str = "dataset") -> pd.DataFrame:
        """Return the result of ``DESCRIBE table_name``, run through ``query()``."""
        return self.query(f"DESCRIBE {table_name}")

    def sample(self, n: int, table_name: str = "dataset") -> pd.DataFrame:
        """Return ``SELECT * FROM table_name USING SAMPLE n``, run through ``query()``."""
        return self.query(f"SELECT * FROM {table_name} USING SAMPLE {n}")

    def get_mount_info(self, table_name: str = "dataset") -> Dict[str, Any]:
        """
        Return the metadata recorded when ``table_name`` was mounted.

        Raises:
            ValueError: If ``table_name`` has not been mounted.
        """
        if table_name not in self.mounted_datasets:
            raise ValueError(f"Table '{table_name}' is not mounted")
        return self.mounted_datasets[table_name]


@contextmanager
def mount_dataset(
    dataset_name: str,
    config: Optional[str] = None,
    split: Optional[str] = None,
    table_name: str = "dataset",
    token: Optional[str] = None,
    **kwargs,
):
    """
    Yield a connected DuckDBMount with the dataset mounted; close it on exit.

    Args:
        dataset_name: HF dataset name, passed to DuckDBMount.mount_dataset
        config: Optional config filter
        split: Optional split filter
        table_name: Name of the view to create (default: "dataset")
        token: Optional HF token, passed to DuckDBMount.mount_dataset
        **kwargs: Forwarded to the DuckDBMount constructor

    Usage:
        with mount_dataset("wikimedia/wikipedia", config="20231101.en") as mount:
            df = mount.query("SELECT title FROM dataset LIMIT 10")
    """
    # token is a named parameter of this function, so it never ends up in
    # **kwargs; everything left in kwargs goes to the constructor.
    handle = DuckDBMount(**kwargs)
    mount_args = {
        "dataset_name": dataset_name,
        "config": config,
        "split": split,
        "table_name": table_name,
        "token": token,
    }
    try:
        handle.connect()
        handle.mount_dataset(**mount_args)
        yield handle
    finally:
        # Runs whether connecting, mounting or the caller's body failed.
        handle.close()


def quick_query(
    dataset_name: str,
    sql: str,
    config: Optional[str] = None,
    split: Optional[str] = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Mount a dataset, run one SQL query against it, and clean up afterwards.

    Args:
        dataset_name: HF dataset name
        sql: SQL query to execute
        config: Optional config filter
        split: Optional split filter
        **kwargs: Forwarded unchanged to the mount_dataset context manager

    Returns:
        pandas DataFrame with results
    """
    mount_cm = mount_dataset(dataset_name, config=config, split=split, **kwargs)
    with mount_cm as mounted:
        frame = mounted.query(sql)
    return frame
