"""
Main user-facing API for hypersets with streaming and async support.

This module provides the primary interface that mimics Hugging Face's load_dataset
while leveraging DuckDB for efficient querying with true random sampling and streaming.
"""

import asyncio
from typing import Any, Dict, List, Optional, Union, Literal, Iterator, AsyncIterator
import warnings

import pandas as pd
from datasets import Dataset, Features

from .metadata import DatasetMetadata, SplitInfo
from .query import DuckDBQueryEngine, AsyncDuckDBQueryEngine


def get_dataset_info(
    dataset_name: str, config: Optional[str] = None, token: Optional[str] = None
) -> DatasetMetadata:
    """
    Look up the metadata of a dataset without downloading its data.

    This is a thin wrapper that delegates to
    ``DatasetMetadata.from_dataset_name``.

    Args:
        dataset_name: Name of the dataset (e.g., "squad", "imdb")
        config: Configuration name. Currently not used by this function:
            it is not forwarded to the metadata lookup, so the returned
            object is whatever ``from_dataset_name`` produces for the
            dataset as a whole.
        token: Optional Hugging Face token for private datasets

    Returns:
        The DatasetMetadata object built by ``DatasetMetadata.from_dataset_name``

    Example:
        >>> info = get_dataset_info("wikimedia/wikipedia")
        >>> print(f"Available configs: {info.get_config_names()[:5]}")
        >>> print(f"English config: 20231101.en" if "20231101.en" in info.get_config_names() else "Not found")
    """
    # Only the dataset name and the token take part in the lookup.
    dataset_metadata = DatasetMetadata.from_dataset_name(dataset_name, token=token)
    return dataset_metadata


def load_parquet_sql(
    dataset_name: str,
    config: Optional[str] = None,
    split: str = "train",
    query: Optional[str] = None,
    columns: Optional[List[str]] = None,
    where: Optional[str] = None,
    limit: Optional[int] = None,
    sample_size: Optional[int] = None,
    sample_method: str = "reservoir",
    return_type: Literal["df", "dataset"] = "df",
    token: Optional[str] = None,
    memory_limit: str = "1GB",
    threads: Optional[int] = None,
    no_cache: bool = True,
) -> Union[pd.DataFrame, Dataset]:
    """
    Load and query a Hugging Face dataset using DuckDB with TRUE random sampling.

    This function provides an efficient alternative to HF's load_dataset by:
    1. Parsing README.md and file structure for metadata discovery
    2. Using DuckDB to query remote Parquet files directly with true random sampling
    3. Operating in memory-only mode with no disk caching

    Args:
        dataset_name: Name of the dataset (e.g., "wikimedia/wikipedia", "imdb")
        config: Configuration name. If None, the metadata's default config is used.
        split: Split name (default: "train")
        query: Custom SQL query. If provided, other query parameters are ignored.
               Use '{parquet_files}' as placeholder for the dataset files.
        columns: List of columns to select. If None, selects all.
        where: WHERE clause for filtering rows
        limit: Maximum number of rows to return (applied after sampling)
        sample_size: Number of rows to randomly sample. Uses TRUE random sampling.
        sample_method: Sampling method ("reservoir", "bernoulli", "system")
        return_type: Return format - "df" for pandas DataFrame, "dataset" for HF Dataset
        token: Optional Hugging Face token for private datasets
        memory_limit: Memory limit for DuckDB (e.g., "1GB", "500MB")
        threads: Number of threads for DuckDB. If None, uses default.
        no_cache: Accepted for backward compatibility. It currently has no
            effect: the same DuckDBQueryEngine is used for either value.

    Returns:
        pandas DataFrame or Hugging Face Dataset based on return_type

    Raises:
        ValueError: If return_type is not "df" or "dataset" (checked before any
            metadata lookup or query is performed), if the config or split is
            unknown, or if no Parquet files are found for the config/split.

    Examples:
        >>> # Random sampling from English Wikipedia
        >>> df = load_parquet_sql(
        ...     "wikimedia/wikipedia",
        ...     config="20231101.en",
        ...     sample_size=100
        ... )

        >>> # Targeted filtering with sampling
        >>> df = load_parquet_sql(
        ...     "imdb",
        ...     columns=["text", "label"],
        ...     where="label = 1",
        ...     sample_size=1000,
        ...     sample_method="reservoir"
        ... )

        >>> # Custom SQL query
        >>> df = load_parquet_sql(
        ...     "imdb",
        ...     query="SELECT label, COUNT(*) as count FROM {parquet_files} GROUP BY label"
        ... )
    """
    # Fail fast: reject a bad return_type before doing the metadata lookup and
    # running the query, instead of discovering it after the work is done.
    if return_type not in ("df", "dataset"):
        raise ValueError("return_type must be 'df' or 'dataset'")

    # Get dataset metadata using improved discovery
    metadata = DatasetMetadata.from_dataset_name(dataset_name, token=token)

    # Validate config and split
    config_name = config or metadata.default_config
    if not config_name or config_name not in metadata.configs:
        available_configs = list(metadata.configs.keys())
        raise ValueError(
            f"Config '{config_name}' not found. Available configs: {available_configs[:10]}..."
        )

    if split not in metadata.configs[config_name].splits:
        available_splits = list(metadata.configs[config_name].splits.keys())
        raise ValueError(
            f"Split '{split}' not found in config '{config_name}'. "
            f"Available splits: {available_splits}"
        )

    # Get parquet files for the specified config and split
    parquet_files = metadata.get_parquet_files(config_name, split)
    if not parquet_files:
        raise ValueError(
            f"No Parquet files found for dataset '{dataset_name}', "
            f"config '{config_name}', split '{split}'"
        )

    # Execute the query with DuckDB. The engine choice does not depend on
    # `no_cache`; there is only one engine class to pick from.
    with DuckDBQueryEngine(memory_limit=memory_limit, threads=threads) as engine:
        df = engine.query(
            parquet_files=parquet_files,
            query=query,
            columns=columns,
            where=where,
            limit=limit,
            sample_size=sample_size,
            sample_method=sample_method,
        )

    # return_type was validated above, so only the two known values remain.
    if return_type == "dataset":
        return _dataframe_to_dataset(df, metadata, config_name, split)
    return df


def stream_dataset(
    dataset_name: str,
    config: Optional[str] = None,
    split: str = "train",
    columns: Optional[List[str]] = None,
    where: Optional[str] = None,
    batch_size: int = 1000,
    token: Optional[str] = None,
    memory_limit: str = "1GB",
) -> Iterator[pd.DataFrame]:
    """
    Yield a dataset split batch by batch instead of loading it all at once.

    Because this is a generator function, nothing (including the config and
    split validation) runs until the first batch is requested.

    Args:
        dataset_name: Name of the dataset
        config: Configuration name. If None, the metadata's default config is used.
        split: Split name
        columns: List of columns to select
        where: WHERE clause for filtering
        batch_size: Number of rows per batch
        token: Optional Hugging Face token
        memory_limit: Memory limit for DuckDB

    Yields:
        pandas DataFrame batches

    Raises:
        ValueError: If the config or split is unknown, or if no Parquet files
            are found for the config/split.

    Example:
        >>> for batch in stream_dataset("imdb", batch_size=500):
        ...     print(f"Processing batch of {len(batch)} rows")
        ...     # Process batch...
    """
    meta = DatasetMetadata.from_dataset_name(dataset_name, token=token)

    # Resolve the config: explicit argument first, metadata default otherwise.
    config_name = config or meta.default_config
    config_is_known = bool(config_name) and config_name in meta.configs
    if not config_is_known:
        known_configs = list(meta.configs.keys())
        raise ValueError(
            f"Config '{config_name}' not found. Available configs: {known_configs[:10]}..."
        )

    # The split must exist inside the resolved config.
    if split not in meta.configs[config_name].splits:
        known_splits = list(meta.configs[config_name].splits.keys())
        raise ValueError(
            f"Split '{split}' not found in config '{config_name}'. Available splits: {known_splits}"
        )

    files = meta.get_parquet_files(config_name, split)
    if not files:
        raise ValueError(
            f"No Parquet files found for dataset '{dataset_name}', config '{config_name}', split '{split}'"
        )

    # Hand the batches produced by the engine straight through to the caller;
    # the engine stays open for as long as the stream is being consumed.
    stream_kwargs = {
        "parquet_files": files,
        "batch_size": batch_size,
        "columns": columns,
        "where": where,
    }
    with DuckDBQueryEngine(memory_limit=memory_limit) as engine:
        yield from engine.stream_query(**stream_kwargs)


async def astream_dataset(
    dataset_name: str,
    config: Optional[str] = None,
    split: str = "train",
    columns: Optional[List[str]] = None,
    where: Optional[str] = None,
    batch_size: int = 1000,
    token: Optional[str] = None,
    memory_limit: str = "1GB",
) -> AsyncIterator[pd.DataFrame]:
    """
    Asynchronously yield a dataset split batch by batch.

    Because this is an async generator, nothing (including the config and
    split validation) runs until the first batch is requested.

    Args:
        dataset_name: Name of the dataset
        config: Configuration name. If None, the metadata's default config is used.
        split: Split name
        columns: List of columns to select
        where: WHERE clause for filtering
        batch_size: Number of rows per batch
        token: Optional Hugging Face token
        memory_limit: Memory limit for DuckDB

    Yields:
        pandas DataFrame batches

    Raises:
        ValueError: If the config or split is unknown, or if no Parquet files
            are found for the config/split.

    Example:
        >>> async for batch in astream_dataset("imdb", batch_size=500):
        ...     print(f"Processing batch of {len(batch)} rows")
        ...     await process_batch_async(batch)
    """
    # NOTE(review): this metadata lookup is a plain synchronous call (it is not
    # awaited), so it runs inline in the coroutine before streaming starts.
    meta = DatasetMetadata.from_dataset_name(dataset_name, token=token)

    # Resolve the config: explicit argument first, metadata default otherwise.
    config_name = config or meta.default_config
    config_is_known = bool(config_name) and config_name in meta.configs
    if not config_is_known:
        known_configs = list(meta.configs.keys())
        raise ValueError(
            f"Config '{config_name}' not found. Available configs: {known_configs[:10]}..."
        )

    # The split must exist inside the resolved config.
    if split not in meta.configs[config_name].splits:
        known_splits = list(meta.configs[config_name].splits.keys())
        raise ValueError(
            f"Split '{split}' not found in config '{config_name}'. Available splits: {known_splits}"
        )

    files = meta.get_parquet_files(config_name, split)
    if not files:
        raise ValueError(
            f"No Parquet files found for dataset '{dataset_name}', config '{config_name}', split '{split}'"
        )

    # Relay every batch from the async engine to the caller; the engine stays
    # open for as long as the stream is being consumed.
    stream_kwargs = {
        "parquet_files": files,
        "batch_size": batch_size,
        "columns": columns,
        "where": where,
    }
    async with AsyncDuckDBQueryEngine(memory_limit=memory_limit) as engine:
        async for frame in engine.astream_query(**stream_kwargs):
            yield frame


def sample_dataset(
    dataset_name: str,
    n: int,
    config: Optional[str] = None,
    split: str = "train",
    method: str = "reservoir",
    columns: Optional[List[str]] = None,
    where: Optional[str] = None,
    return_type: Literal["df", "dataset"] = "df",
    token: Optional[str] = None,
) -> Union[pd.DataFrame, Dataset]:
    """
    Draw a random sample of rows from a dataset without downloading all of it.

    Convenience wrapper around ``load_parquet_sql``: ``n`` is passed as
    ``sample_size`` and ``method`` as ``sample_method``. Every option this
    function does not expose (``limit``, ``memory_limit``, ``threads``, ...)
    keeps the default defined by ``load_parquet_sql``.

    Args:
        dataset_name: Name of the dataset
        n: Number of rows to sample
        config: Configuration name
        split: Split name
        method: Sampling method ("reservoir", "bernoulli", "system")
        columns: List of columns to select
        where: WHERE clause for filtering before sampling
        return_type: Return format - "df" or "dataset"
        token: Optional Hugging Face token

    Returns:
        Randomly sampled data as DataFrame or Dataset

    Example:
        >>> # True random sample from Wikipedia
        >>> sample = sample_dataset(
        ...     "wikimedia/wikipedia",
        ...     n=100,
        ...     config="20231101.en",
        ...     method="reservoir"
        ... )
    """
    # Translate this function's parameter names into load_parquet_sql's.
    forwarded = {
        "dataset_name": dataset_name,
        "config": config,
        "split": split,
        "columns": columns,
        "where": where,
        "sample_size": n,
        "sample_method": method,
        "return_type": return_type,
        "token": token,
    }
    return load_parquet_sql(**forwarded)


def query_dataset(
    dataset_name: str,
    query: str,
    config: Optional[str] = None,
    split: str = "train",
    return_type: Literal["df", "dataset"] = "df",
    token: Optional[str] = None,
) -> Union[pd.DataFrame, Dataset]:
    """
    Run a custom SQL query against a dataset split.

    Convenience wrapper around ``load_parquet_sql`` that forwards only the
    query, the dataset/config/split selection, the return type and the token;
    every other option keeps the default defined by ``load_parquet_sql``.

    Args:
        dataset_name: Name of the dataset
        query: SQL query to execute. Use '{parquet_files}' as placeholder.
        config: Configuration name
        split: Split name
        return_type: Return format - "df" or "dataset"
        token: Optional Hugging Face token

    Returns:
        Query results as DataFrame or Dataset

    Example:
        >>> # Get label distribution
        >>> result = query_dataset(
        ...     "imdb",
        ...     "SELECT label, COUNT(*) as count FROM {parquet_files} GROUP BY label"
        ... )
    """
    forwarded = {
        "dataset_name": dataset_name,
        "config": config,
        "split": split,
        "query": query,
        "return_type": return_type,
        "token": token,
    }
    return load_parquet_sql(**forwarded)


def get_dataset_stats(
    dataset_name: str,
    config: Optional[str] = None,
    split: str = "train",
    columns: Optional[List[str]] = None,
    token: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Collect basic statistics about a dataset split by querying it directly.

    Unlike the loading functions in this module, problems are reported through
    the return value rather than by raising: an unknown config or a missing
    set of Parquet files yields a dict with a single "error" key.

    Args:
        dataset_name: Name of the dataset
        config: Configuration name. If None, the metadata's default config is used.
        split: Split name
        columns: List of columns to analyze. Currently not used by this
            function; the statistics always cover the whole schema.
        token: Optional Hugging Face token

    Returns:
        Dictionary with the keys "dataset_name", "config", "split",
        "num_rows", "num_columns", "schema" and "parquet_files" (the number
        of files), plus "split_info" when the metadata has split information.
        On failure, ``{"error": <message>}``.
    """
    meta = DatasetMetadata.from_dataset_name(dataset_name, token=token)
    config_name = config or meta.default_config

    # Guard clauses: report lookup problems as error dicts.
    if not (config_name and config_name in meta.configs):
        return {"error": "Config not found"}

    files = meta.get_parquet_files(config_name, split)
    if not files:
        return {"error": "No parquet files found"}

    with DuckDBQueryEngine() as engine:
        # Figures obtained by asking the engine about the Parquet files.
        schema = engine.get_schema(files)
        total_rows = engine.count_rows(files)

        stats = dict(
            dataset_name=dataset_name,
            config=config_name,
            split=split,
            num_rows=total_rows,
            num_columns=len(schema),
            schema=schema,
            parquet_files=len(files),
        )

        # Figures recorded in the metadata, when present.
        recorded = meta.get_split_info(config_name, split)
        if recorded:
            stats["split_info"] = dict(
                num_examples=recorded.num_examples,
                num_bytes=recorded.num_bytes,
                num_parquet_files=len(recorded.parquet_files),
            )

        return stats


def _dataframe_to_dataset(
    df: pd.DataFrame, metadata: DatasetMetadata, config_name: str, split: str
) -> Dataset:
    """
    Convert pandas DataFrame to Hugging Face Dataset.

    The conversion is best-effort: if it fails for any reason, a warning is
    emitted and the original DataFrame is returned instead of a Dataset.

    Args:
        df: pandas DataFrame
        metadata: Dataset metadata; its ``configs`` mapping and
            ``dataset_name`` are read
        config_name: Configuration name
        split: Split name

    Returns:
        Hugging Face Dataset object, or ``df`` unchanged when the conversion
        fails (after issuing a warning)
    """
    try:
        # Try to get features from metadata if available
        config_info = metadata.configs.get(config_name)
        features = None
        if config_info and config_info.features:
            try:
                features = Features.from_dict(config_info.features)
            except Exception:
                # Deliberate best-effort: if the declared features cannot be
                # converted, let Dataset infer them from the DataFrame.
                features = None

        # Create Dataset from DataFrame
        try:
            dataset = Dataset.from_pandas(df, features=features)
        except Exception:
            if features is None:
                raise
            # The declared features describe the full split, but `df` may be a
            # column projection or the result of a custom query. Retry with
            # inferred features rather than giving up on the Dataset.
            dataset = Dataset.from_pandas(df)

        # Add dataset info
        dataset.info.dataset_name = metadata.dataset_name
        dataset.info.config_name = config_name

        # `info.splits` can be None on a freshly built Dataset; calling
        # `.get` on it would raise and force the DataFrame fallback below.
        existing_splits = dataset.info.splits or {}
        split_entry = existing_splits.get(split, existing_splits.get("train"))
        if split_entry is not None:
            # Only narrow the splits when there is a real entry to keep;
            # never store a `{split: None}` placeholder.
            dataset.info.splits = {split: split_entry}

        return dataset

    except Exception as e:
        warnings.warn(
            f"Could not create Dataset object: {e}. Returning DataFrame instead."
        )
        return df
