"""
Optional Download Tracking for Hypersets

Provides optional tracking of data usage and estimated savings
without complex proxy systems.
"""

import logging
import re
import time
from dataclasses import dataclass, field
from typing import Optional, Dict, Any

from .dataset_info import DatasetInfo, get_dataset_info

logger = logging.getLogger(__name__)


@dataclass
class DataSavings:
    """Summary of how much data was avoided by querying through Hypersets.

    A plain value object: the fields are stored as given and no validation
    or derivation happens here.
    """

    # Size of the complete dataset, in GB.
    total_dataset_size_gb: float
    # Estimated amount of data actually fetched, in GB.
    estimated_downloaded_gb: float
    # Amount of data not downloaded, in GB.
    savings_gb: float
    # Savings expressed as a percentage of the dataset.
    savings_percentage: float

    def __str__(self) -> str:
        """Return a one-line, human-readable savings message."""
        template = "Hypersets saved you {:.1f} GB ({:.1f}% of dataset)!"
        return template.format(self.savings_gb, self.savings_percentage)


@dataclass
class DownloadStats:
    """Statistics about a single query's data usage.

    Holds the raw measurements of a completed query together with the
    dataset metadata needed to derive the savings via ``data_savings``.
    """

    # Duration of the query, in seconds.
    query_time_seconds: float
    # Number of rows in the query result.
    rows_returned: int
    # Number of columns in the query result.
    columns_returned: int
    # Estimated amount of data processed for the query, in MB.
    estimated_data_processed_mb: float
    # Dataset metadata; only ``estimated_total_size_gb`` is read here.
    dataset_info: DatasetInfo

    @property
    def data_savings(self) -> DataSavings:
        """Calculate estimated data savings.

        The processed-data estimate is converted from MB to GB (1024 MB per
        GB) and subtracted from the dataset's estimated total size.

        Returns:
            A ``DataSavings`` whose ``savings_gb`` is clamped at ``0.0`` when
            the estimate exceeds the dataset size, and whose
            ``savings_percentage`` is ``0.0`` when the total size is not
            positive. Both values are always floats.
        """
        total_size_gb = self.dataset_info.estimated_total_size_gb
        downloaded_gb = self.estimated_data_processed_mb / 1024

        # Float literals keep the float-typed fields floats even on the
        # clamped / zero-size paths (max(0, x) would hand back the int 0).
        savings_gb = max(0.0, total_size_gb - downloaded_gb)
        if total_size_gb > 0:
            savings_percentage = savings_gb / total_size_gb * 100
        else:
            # Avoid dividing by zero for empty or unknown-size datasets.
            savings_percentage = 0.0

        return DataSavings(
            total_dataset_size_gb=total_size_gb,
            estimated_downloaded_gb=downloaded_gb,
            savings_gb=savings_gb,
            savings_percentage=savings_percentage,
        )


# Whole-word SQL keyword patterns, applied to the lower-cased query. Plain
# substring checks misclassify identifiers such as "country" (count),
# "summary" (sum) or "time_limit" (limit).
_LIMIT_RE = re.compile(r"\blimit\b")
_SAMPLE_RE = re.compile(r"\b(?:using\s+sample|tablesample)\b")
_AGGREGATE_RE = re.compile(r"\b(?:count|sum|avg|min|max)\b")

# Heuristic constants used by the estimate.
_AVG_PARQUET_FILE_MB = 50  # rough average size of one parquet file
_FRACTION_OF_FILE_READ = 0.3  # assume 30% of each touched file is read
_BYTES_PER_CELL = 100  # rough size of one result cell
_RESULT_OVERHEAD_FACTOR = 2  # result size * 2 for processing overhead


def _clamp_file_count(wanted: int, available: int) -> int:
    """Clamp a guessed file count to [1, available]; 0 if nothing is available."""
    return min(available, max(1, wanted))


def estimate_data_usage(
    dataset_info: DatasetInfo,
    query: str,
    result_rows: int,
    result_columns: int,
    config: Optional[str] = None,
    split: Optional[str] = None,
) -> float:
    """
    Estimate how much data was processed for a query.

    This is a rough heuristic based on:
    - Number of parquet files available for the config/split
    - Query type (LIMIT, sampling, aggregation, or general)
    - Result size as a proxy for data processed

    Keywords are matched as whole words on the lower-cased query text, so
    identifiers that merely contain a keyword do not change the
    classification. The query is not parsed: keywords inside string literals
    or comments are still counted.

    Args:
        dataset_info: Dataset metadata; only ``get_parquet_urls`` is called.
        query: SQL query that was executed.
        result_rows: Number of rows returned.
        result_columns: Number of columns returned.
        config: Config forwarded to ``get_parquet_urls`` (if any).
        split: Split forwarded to ``get_parquet_urls`` (if any).

    Returns:
        Estimated MB processed: the larger of the file-based and the
        result-based estimate.
    """
    available_files = len(dataset_info.get_parquet_urls(config=config, split=split))
    query_lower = query.lower()

    has_limit = _LIMIT_RE.search(query_lower) is not None
    has_sample = _SAMPLE_RE.search(query_lower) is not None

    if has_limit and not has_sample:
        # LIMIT query - likely only the first few files are read.
        guessed_files = 3
    elif has_sample:
        # Sampling query - small samples may touch few files, larger ones
        # likely need more for randomness.
        guessed_files = 5 if result_rows < 1000 else min(available_files // 2, 10)
    elif _AGGREGATE_RE.search(query_lower):
        # Aggregation query - might need a full scan.
        guessed_files = available_files
    elif result_rows < 100:
        guessed_files = 2
    elif result_rows < 10000:
        guessed_files = 5
    else:
        guessed_files = min(available_files // 3, 15)

    # The floor divisions above yield 0 for datasets with only 1-2 files;
    # at least one file is read whenever any file exists.
    estimated_files_accessed = _clamp_file_count(guessed_files, available_files)

    file_based_mb = (
        estimated_files_accessed * _AVG_PARQUET_FILE_MB * _FRACTION_OF_FILE_READ
    )

    # If a lot of data came back, at least that much was processed.
    result_data_estimate_mb = (result_rows * result_columns * _BYTES_PER_CELL) / (
        1024 * 1024
    )
    result_based_mb = result_data_estimate_mb * _RESULT_OVERHEAD_FACTOR

    # Take the higher of the two estimates.
    return max(file_based_mb, result_based_mb)


def create_download_stats(
    dataset_name: str,
    query: str,
    query_time: float,
    result_rows: int,
    result_columns: int,
    config: Optional[str] = None,
    split: Optional[str] = None,
    token: Optional[str] = None,
) -> DownloadStats:
    """
    Build the ``DownloadStats`` record for a query that has already run.

    The dataset metadata is looked up by name (with the optional token), the
    processed-data estimate is computed from it, and both are bundled with
    the caller-supplied measurements. ``config`` and ``split`` are used only
    for the estimate, not for the metadata lookup.

    Args:
        dataset_name: HF dataset name
        query: SQL query that was executed
        query_time: Time taken for query
        result_rows: Number of rows returned
        result_columns: Number of columns returned
        config: Config used (if any)
        split: Split used (if any)
        token: HF token (if any)

    Returns:
        DownloadStats with estimated data usage and savings
    """
    info = get_dataset_info(dataset_name, token=token)

    return DownloadStats(
        query_time_seconds=query_time,
        rows_returned=result_rows,
        columns_returned=result_columns,
        estimated_data_processed_mb=estimate_data_usage(
            dataset_info=info,
            query=query,
            result_rows=result_rows,
            result_columns=result_columns,
            config=config,
            split=split,
        ),
        dataset_info=info,
    )
