"""
Metadata discovery for Hugging Face datasets.

This module reads dataset metadata by parsing the repository's README.md
frontmatter and by analyzing the layout of its parquet files. The Hub API is
used only to check that the repository exists and to list its files.
"""

import json
import re
import yaml
from typing import Dict, List, Optional, Any, Union
from dataclasses import dataclass, field
from pathlib import Path

import requests
from huggingface_hub import HfApi, hf_hub_url
from huggingface_hub.utils import RepositoryNotFoundError


@dataclass
class SplitInfo:
    """Information about a dataset split.

    A split can be filled in from two places in this module: the README
    frontmatter (example and byte counts) and the repository file listing
    (parquet file URLs). Either source may be missing, so every field except
    ``name`` has an empty default.
    """

    # Split name, e.g. "train".
    name: str
    # Value of the ``num_examples`` key in the README YAML, when present.
    num_examples: Optional[int] = None
    # Value of the ``num_bytes`` key in the README YAML, when present.
    num_bytes: Optional[int] = None
    # URLs (built with ``hf_hub_url``) of the parquet files assigned to this split.
    parquet_files: List[str] = field(default_factory=list)


@dataclass
class ConfigInfo:
    """Information about a dataset configuration.

    A configuration groups the splits that belong together under one name.
    """

    # Configuration name; "default" is used when no name can be determined.
    name: str
    # Value of the ``description`` key of this config's README YAML entry.
    description: Optional[str] = None
    # Value of the ``features`` key of this config's README YAML entry,
    # stored exactly as parsed (no validation is performed in this module).
    features: Optional[Dict[str, Any]] = None
    # Splits of this configuration, keyed by split name.
    splits: Dict[str, SplitInfo] = field(default_factory=dict)


@dataclass
class DatasetMetadata:
    """Complete metadata for a Hugging Face dataset.

    Instances are normally created with ``from_dataset_name``, which fills the
    fields from the README.md frontmatter and from the parquet files found in
    the repository.
    """

    # Repository id on the Hub, e.g. "squad".
    dataset_name: str
    # README frontmatter ``description`` (falling back to the description
    # inside ``dataset_info``).
    description: Optional[str] = None
    # README frontmatter ``citation`` value.
    citation: Optional[str] = None
    # README frontmatter ``homepage`` value.
    homepage: Optional[str] = None
    # README frontmatter ``license`` value.
    license: Optional[str] = None
    # Known configurations, keyed by configuration name.
    configs: Dict[str, ConfigInfo] = field(default_factory=dict)
    # README frontmatter ``tags`` value.
    tags: List[str] = field(default_factory=list)
    # Configuration used by the accessor methods when the caller passes none.
    default_config: Optional[str] = None

    @classmethod
    def from_dataset_name(
        cls, dataset_name: str, token: Optional[str] = None
    ) -> "DatasetMetadata":
        """
        Create DatasetMetadata by analyzing README.md and file structure.

        Args:
            dataset_name: Name of the dataset (e.g., "squad", "imdb")
            token: Optional Hugging Face token for private datasets

        Returns:
            DatasetMetadata object with all available information
        """
        api = HfApi(token=token)

        try:
            # Verify dataset exists
            api.dataset_info(dataset_name)
        except RepositoryNotFoundError:
            raise ValueError(f"Dataset '{dataset_name}' not found on Hugging Face Hub")

        metadata = cls(dataset_name=dataset_name)

        # Get metadata from README.md
        metadata._parse_readme(api, token)

        # Discover configs and splits from file structure
        metadata._discover_from_files(api, token)

        return metadata

    def _parse_readme(self, api: HfApi, token: Optional[str] = None) -> None:
        """Parse README.md for dataset metadata."""
        try:
            # Get README.md content
            readme_url = hf_hub_url(
                repo_id=self.dataset_name, filename="README.md", repo_type="dataset"
            )

            response = requests.get(readme_url)
            if response.status_code == 200:
                readme_content = response.text

                # Extract YAML frontmatter
                yaml_match = re.match(r"^---\n(.*?)\n---", readme_content, re.DOTALL)
                if yaml_match:
                    try:
                        yaml_content = yaml.safe_load(yaml_match.group(1))

                        # Extract basic info
                        self.description = yaml_content.get(
                            "description",
                            yaml_content.get("dataset_info", {}).get("description"),
                        )
                        self.citation = yaml_content.get("citation")
                        self.homepage = yaml_content.get("homepage")
                        self.license = yaml_content.get("license")
                        self.tags = yaml_content.get("tags", [])

                        # Extract config information
                        dataset_info = yaml_content.get("dataset_info")
                        if isinstance(dataset_info, dict):
                            # Single config
                            config_name = dataset_info.get("config_name", "default")
                            self._parse_config_info(config_name, dataset_info)
                        elif isinstance(dataset_info, list):
                            # Multiple configs
                            for config_data in dataset_info:
                                config_name = config_data.get("config_name", "default")
                                self._parse_config_info(config_name, config_data)

                    except yaml.YAMLError:
                        pass  # Continue without YAML metadata

        except Exception:
            pass  # Continue without README metadata

    def _parse_config_info(self, config_name: str, config_data: dict) -> None:
        """Parse configuration information from YAML."""
        config_info = ConfigInfo(
            name=config_name,
            description=config_data.get("description"),
            features=config_data.get("features"),
        )

        # Parse splits
        splits_data = config_data.get("splits", {})
        for split_name, split_data in splits_data.items():
            split_info = SplitInfo(
                name=split_name,
                num_examples=split_data.get("num_examples"),
                num_bytes=split_data.get("num_bytes"),
            )
            config_info.splits[split_name] = split_info

        self.configs[config_name] = config_info

        # Set default config
        if not self.default_config:
            self.default_config = config_name

    def _discover_from_files(self, api: HfApi, token: Optional[str] = None) -> None:
        """Discover configurations and splits from parquet file structure."""
        try:
            # List all files in the repository
            repo_files = api.list_repo_files(
                self.dataset_name, repo_type="dataset", token=token
            )

            # Filter for parquet files
            parquet_files = [f for f in repo_files if f.endswith(".parquet")]

            # Clear existing configs if we found parquet files
            if parquet_files:
                discovered_configs = {}

                # Analyze file structure to discover configs
                for parquet_file in parquet_files:
                    config_name, split_name = self._parse_parquet_path(parquet_file)

                    # Create config if not exists
                    if config_name not in discovered_configs:
                        discovered_configs[config_name] = ConfigInfo(name=config_name)

                    # Create split if not exists
                    if split_name not in discovered_configs[config_name].splits:
                        discovered_configs[config_name].splits[split_name] = SplitInfo(
                            name=split_name
                        )

                    # Add parquet file URL
                    file_url = hf_hub_url(
                        repo_id=self.dataset_name,
                        filename=parquet_file,
                        repo_type="dataset",
                    )
                    discovered_configs[config_name].splits[
                        split_name
                    ].parquet_files.append(file_url)

                # Merge discovered configs with existing ones
                for config_name, config_info in discovered_configs.items():
                    if config_name in self.configs:
                        # Merge splits
                        self.configs[config_name].splits.update(config_info.splits)
                    else:
                        # Add new config
                        self.configs[config_name] = config_info

                # Set intelligent default config
                self._set_default_config()

        except Exception as e:
            print(f"Warning: Could not discover from files: {e}")

    def _set_default_config(self) -> None:
        """Set intelligent default configuration."""
        if not self.configs:
            return

        # Priority order for default config
        priorities = [
            # English configs
            "20231101.en",
            "en",
            "english",
            # Common defaults
            "default",
            "main",
            "train",
            # Latest dates (for Wikipedia-style datasets)
        ]

        # Check priorities first
        for priority in priorities:
            if priority in self.configs:
                self.default_config = priority
                return

        # Look for English-like configs
        en_configs = [c for c in self.configs.keys() if "en" in c.lower()]
        if en_configs:
            # Sort by name to get most recent
            self.default_config = sorted(en_configs)[-1]
            return

        # Fall back to first config
        self.default_config = next(iter(self.configs.keys()))

    def _parse_parquet_path(self, file_path: str) -> tuple[str, str]:
        """
        Parse parquet file path to extract config and split names.

        Args:
            file_path: Path to parquet file

        Returns:
            Tuple of (config_name, split_name)
        """
        parts = file_path.split("/")

        # Pattern 1: config/split/file.parquet or config/file.parquet
        if len(parts) >= 2:
            config_candidate = parts[0]

            # Check if it looks like a config (date-based, language, etc.)
            if re.match(r"\d{8}\.\w+", config_candidate):  # 20231101.en format
                split_name = "train"  # Default split
                if len(parts) >= 3:
                    # Check if second part might be a split
                    filename = parts[1]
                    if any(
                        split in filename
                        for split in ["train", "test", "val", "validation"]
                    ):
                        split_name = filename.split("-")[0]
                return config_candidate, split_name

        # Pattern 2: Split from filename (train-00000-of-00001.parquet)
        filename = parts[-1]
        split_match = re.match(r"([^-]+)-\d+-of-\d+\.parquet", filename)
        if split_match:
            split_name = split_match.group(1)
            config_name = parts[0] if len(parts) > 1 else "default"
            return config_name, split_name

        # Pattern 3: Data directory
        if parts[0] == "data":
            return "default", "train"

        # Default fallback
        return "default", "train"

    def get_config_names(self) -> List[str]:
        """Get list of available configuration names."""
        return list(self.configs.keys())

    def get_split_names(self, config: Optional[str] = None) -> List[str]:
        """
        Get list of available split names for a configuration.

        Args:
            config: Configuration name. If None, uses default config.

        Returns:
            List of split names
        """
        config_name = config or self.default_config
        if config_name and config_name in self.configs:
            return list(self.configs[config_name].splits.keys())
        return []

    def get_parquet_files(
        self, config: Optional[str] = None, split: str = "train"
    ) -> List[str]:
        """
        Get list of parquet file URLs for a specific config and split.

        Args:
            config: Configuration name. If None, uses default config.
            split: Split name (default: "train")

        Returns:
            List of parquet file URLs
        """
        config_name = config or self.default_config
        if (
            config_name
            and config_name in self.configs
            and split in self.configs[config_name].splits
        ):
            return self.configs[config_name].splits[split].parquet_files
        return []

    def get_split_info(
        self, config: Optional[str] = None, split: str = "train"
    ) -> Optional[SplitInfo]:
        """
        Get detailed information about a specific split.

        Args:
            config: Configuration name. If None, uses default config.
            split: Split name (default: "train")

        Returns:
            SplitInfo object or None if not found
        """
        config_name = config or self.default_config
        if (
            config_name
            and config_name in self.configs
            and split in self.configs[config_name].splits
        ):
            return self.configs[config_name].splits[split]
        return None
