"""
Dataset updater module.
Contains the DatasetUpdater class that orchestrates all data collection,
processing, and feature engineering steps.
"""

import os
from typing import Optional, List, Dict, Tuple, Any
import pandas as pd
import logging

# Import custom modules
from scrapers.fbref_scraper import FBRefScraper

# from scrapers.transfermarkt_scraper import TransfermarktScraper
# from scrapers.fotmob_scraper import FotmobScraper
# from data_processors.team_manager import TeamManager
# from data_processors.ranking_calculator import RankingCalculator
# from data_processors.match_organizer import MatchOrganizer
from utils.date import (
    get_previous_days,
    validate_execution_day,
    season_from_date,
    sort_by_match_datetime,
)
from utils.format import _format_paths

# Module-wide logging: configure the root logger once at import time and
# expose a logger named after this module.
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)


class DatasetUpdater:
    """Main class to orchestrate the dataset update process."""

    def __init__(
        self,
        circular_max_values: Dict[str, int],
        features_config: Dict[str, Any],
        competitions_config: Dict[str, Dict],
        paths: Dict[str, str],
    ):
        """
        Initialize DatasetUpdater with configuration from Settings.

        Args:
            circular_max_values: Max values for circular features encoding
            features_config: Configuration for feature engineering
            competitions_config: Configuration for competitions, keyed by
                competition name
            paths: Mapping of CSV path entries. Must contain the keys
                'matches', 'players', 'keepers', 'players_features',
                'keepers_features', 'ranking', 'trophies', 'dataset_global',
                'dataset_country' and 'dataset_competition'. The same mapping
                is later passed to ``_format_paths`` for each competition.

        Raises:
            KeyError: If any required key is missing from ``paths``
        """
        required_keys = (
            "matches",
            "players",
            "keepers",
            "players_features",
            "keepers_features",
            "ranking",
            "trophies",
            "dataset_global",
            "dataset_country",
            "dataset_competition",
        )
        # Report every missing key at once instead of failing on the first one.
        missing = [key for key in required_keys if key not in paths]
        if missing:
            raise KeyError(f"Missing required path keys: {missing}")

        self.circular_max_values = circular_max_values
        self.features_config = features_config
        self.competitions_config = competitions_config
        self.paths = paths

        self.matches_csv_path = paths["matches"]
        self.players_csv_path = paths["players"]
        self.keepers_csv_path = paths["keepers"]
        self.players_features_csv_path = paths["players_features"]
        self.keepers_features_csv_path = paths["keepers_features"]
        self.ranking_csv_path = paths["ranking"]
        self.trophies_csv_path = paths["trophies"]
        self.dataset_global_csv_path = paths["dataset_global"]
        self.dataset_country_csv_path = paths["dataset_country"]
        self.dataset_competition_csv_path = paths["dataset_competition"]

    def update_dataset(
        self, date: Optional[str] = None, competition: Optional[str] = None
    ) -> pd.DataFrame:
        """
        Main method to update the dataset with newly scraped matches.

        Process:
        0. Validate the execution day with ``validate_execution_day`` and derive
           the list of days to process (``get_previous_days``) and the season
           (``season_from_date``). The original design expects runs on Tuesdays
           and Fridays only; the actual rule lives in ``validate_execution_day``.
        1. Resolve the competitions to process (all configured, or just one).
        2. For each competition, format its CSV paths and call
           ``_process_competition``. A failure in one competition is logged and
           does not stop the others.
        3. Prepend the new matches of every competition to that competition's
           own matches CSV (competitions that share a matches path are merged
           into a single write).

        Args:
            date: Date to process (format: YYYY-MM-DD). If None, the default of
                ``validate_execution_day`` applies.
            competition: Specific competition to update (None for all)

        Returns:
            DataFrame with all newly collected matches, sorted by match
            datetime. Empty DataFrame when nothing was collected.

        Raises:
            ValueError: If ``validate_execution_day`` rejects the date, or if
                ``competition`` is not present in the configuration
        """
        logger.info("=" * 80)
        logger.info("STARTING DATASET UPDATE PROCESS")
        logger.info("=" * 80)

        # Step 0: Validate execution day
        try:
            execution_date = validate_execution_day(date)
        except ValueError as e:
            error_msg = str(e)
            logger.error(error_msg)
            print(f"\n❌ ERROR: {error_msg}")
            raise

        # Determine date range based on validated day
        days = get_previous_days(execution_date)

        last_season = season_from_date(execution_date)

        # Step 1: Determine which competitions to process
        competitions_list = self._get_competitions(competition, last_season)
        logger.info(f"🏆 Processing {len(competitions_list)} competition(s)")

        # Log competition details
        for comp_info in competitions_list:
            logger.info(f"   📍 {comp_info['name'].upper()} ({comp_info['country']})")

        # New matches in collection order (used for the returned frame) ...
        collected_frames: List[pd.DataFrame] = []
        # ... and grouped by the matches CSV they must be written to, so that
        # each competition is saved to its own file rather than to whichever
        # path the loop happened to visit last.
        frames_by_path: Dict[str, List[pd.DataFrame]] = {}

        # Step 2: Process each competition
        for comp_info in competitions_list:
            comp_name = comp_info["name"]
            comp_country = comp_info["country"]

            fotmob_url = comp_info["information_urls"]["fotmob"]

            formatted_paths = _format_paths(
                self.paths,
                last_season=last_season,
                country=comp_country,
                competition=comp_name,
            )

            matches_csv_path = formatted_paths["matches"]
            players_csv_path = formatted_paths["players"]
            keepers_csv_path = formatted_paths["keepers"]
            players_features_csv_path = formatted_paths["players_features"]
            keepers_features_csv_path = formatted_paths["keepers_features"]
            ranking_csv_path = formatted_paths["ranking"]
            trophies_csv_path = formatted_paths["trophies"]
            dataset_global_csv_path = formatted_paths["dataset_global"]
            dataset_country_csv_path = formatted_paths["dataset_country"]
            dataset_competition_csv_path = formatted_paths["dataset_competition"]

            # Ensure the parent folder of the matches CSV exists
            matches_parent = os.path.dirname(matches_csv_path)
            if matches_parent:
                os.makedirs(matches_parent, exist_ok=True)

            logger.info("=" * 80)
            logger.info(f"📊 Processing competition: {comp_name} ({comp_country})")
            logger.info("=" * 80)

            try:
                comp_matches = self._process_competition(
                    comp_name,
                    comp_country,
                    matches_csv_path,
                    players_csv_path,
                    keepers_csv_path,
                    players_features_csv_path,
                    keepers_features_csv_path,
                    ranking_csv_path,
                    trophies_csv_path,
                    dataset_global_csv_path,
                    dataset_country_csv_path,
                    dataset_competition_csv_path,
                    days,
                    last_season,
                    fotmob_url,
                )

                if len(comp_matches) > 0:
                    comp_matches = sort_by_match_datetime(comp_matches)
                    collected_frames.append(comp_matches)
                    frames_by_path.setdefault(matches_csv_path, []).append(
                        comp_matches
                    )
                    logger.info(
                        f"✅ {comp_name}: {len(comp_matches)} matches collected"
                    )
                else:
                    logger.info(
                        f"ℹ️ {comp_name}: 0 matches collected (no recent matches to process)"
                    )

            except Exception as e:
                # Best effort: one failing competition must not abort the run.
                logger.error(
                    f"❌ Error processing competition {comp_name}: {str(e)}",
                    exc_info=True,
                )
                continue

        # This check must stay outside the loop: it looks at the matches
        # collected across ALL competitions.
        if not collected_frames:
            logger.warning("⚠️  No matches found to process across all competitions")
            logger.info("=" * 80)
            logger.info("✅ DATASET UPDATE COMPLETED (NO NEW DATA)")
            logger.info("=" * 80)
            return pd.DataFrame()

        # Single concat instead of growing a DataFrame inside the loop
        matches = pd.concat(collected_frames, ignore_index=True)

        logger.info(f"\n{'='*60}")
        logger.info(
            f"📈 Total matches collected across all competitions: {len(matches)}"
        )
        logger.info(f"{'='*60}")

        # Step 3: Save collected matches for each competition
        logger.info("🔄 Saving matches to respective competition CSVs...")

        matches = sort_by_match_datetime(matches)

        for csv_path, frames in frames_by_path.items():
            new_rows = sort_by_match_datetime(pd.concat(frames, ignore_index=True))
            existing_rows = (
                pd.read_csv(csv_path) if os.path.exists(csv_path) else pd.DataFrame()
            )
            # New matches go first, followed by what was already on disk
            merged = pd.concat([new_rows, existing_rows], ignore_index=True)
            merged.to_csv(csv_path, index=False, encoding="utf-8")
            logger.info(f"   - {len(new_rows)} new matches saved to {csv_path}")

        logger.info("=" * 80)
        logger.info("✅ DATASET UPDATE COMPLETED SUCCESSFULLY")
        logger.info("=" * 80)

        return matches

    def _get_competitions(
        self, competition: Optional[str], last_season: Tuple[str, int, int]
    ) -> List[Dict[str, Any]]:
        """
        Build the list of competitions to process, with their scraping URLs.

        Args:
            competition: Name of a single configured competition, or a falsy
                value (None / empty string) to select every configured one
            last_season: Season value substituted into the ``{last_season}``
                placeholder of every URL template

        Returns:
            One dictionary per selected competition:
            [
                {
                    'name': <competition key>,
                    'country': <config 'country', or 'unknown' if absent>,
                    'information_urls': {<url key>: <formatted url>, ...}
                },
                ...
            ]
            The 'fotmob' URL keeps a literal ``{page}`` placeholder so it can
            be formatted again later.

        Raises:
            ValueError: If specified competition doesn't exist in config
        """
        if not competition:
            selected_names = list(self.competitions_config.keys())
        else:
            print(f"🔍 Checking competition: {competition}")
            if competition not in self.competitions_config:
                available = list(self.competitions_config.keys())
                raise ValueError(
                    f"Competition '{competition}' not found in configuration. "
                    f"Available competitions: {available}"
                )
            selected_names = [competition]

        def _render(url_key: str, template: str) -> str:
            # Every template receives the season; the FotMob one also gets its
            # page placeholder re-inserted verbatim for later formatting.
            substitutions = {"last_season": last_season}
            if url_key == "fotmob":
                substitutions["page"] = "{page}"
            return template.format(**substitutions)

        result = []
        for name in selected_names:
            config = self.competitions_config.get(name, {})
            url_templates = config.get("information_scraping_urls", {})
            result.append(
                {
                    "name": name,
                    "country": config.get("country", "unknown"),
                    "information_urls": {
                        key: _render(key, template)
                        for key, template in url_templates.items()
                    },
                }
            )

        return result

    def _process_competition(
        self,
        comp_name: str,
        comp_country: str,
        matches_csv_path: str,
        players_csv_path: str,
        keepers_csv_path: str,
        players_features_csv_path: str,
        keepers_features_csv_path: str,
        ranking_csv_path: str,
        trophies_csv_path: str,
        dataset_global_csv_path: str,
        dataset_country_csv_path: str,
        dataset_competition_csv_path: str,
        days: List[str],
        last_season: Tuple[str, int, int],
        fotmob_url: str,
    ) -> pd.DataFrame:
        """
        Scrape one competition from FBRef, persist its player/keeper rows and
        return its matches.

        Args:
            comp_name: Competition key; must exist in ``competitions_config``
            comp_country: Country of the competition. The value stored in the
                competition config (default 'unknown') takes precedence over
                this argument.
            matches_csv_path: Matches CSV path (only logged here)
            players_csv_path: CSV that scraped player rows are appended to
            keepers_csv_path: CSV that scraped keeper rows are appended to
            players_features_csv_path: Only logged here
            keepers_features_csv_path: Only logged here
            ranking_csv_path: Only logged here
            trophies_csv_path: Only logged here
            dataset_global_csv_path: Only logged here
            dataset_country_csv_path: Only logged here
            dataset_competition_csv_path: Only logged here
            days: List of dates to process, forwarded to the scraper
            last_season: Season value forwarded to the scraper
            fotmob_url: Currently unused by this method

        Returns:
            The matches DataFrame produced by the scraper, with the added
            columns 'competition_type' and 'competition_country'

        Raises:
            KeyError: If ``comp_name`` is not in ``competitions_config``
            TypeError: If the scraper returns player/keeper data in an
                unsupported container
        """
        # Get competition configuration
        comp_config = self.competitions_config[comp_name]
        # The configured country wins over the argument (original behaviour).
        comp_country = comp_config.get("country", "unknown")

        # Determine competition type from the competition name
        comp_type = self._infer_competition_type(comp_name)

        logger.info(f"🌍 Country: {comp_country}")
        logger.info(f"   Type: {comp_type}")
        logger.info("🔗 Paths:")
        logger.info(f"   - Matches: {matches_csv_path}")
        logger.info(f"   - Players: {players_csv_path}")
        logger.info(f"   - Keepers: {keepers_csv_path}")
        logger.info(f"   - Players features: {players_features_csv_path}")
        logger.info(f"   - Keepers features: {keepers_features_csv_path}")
        logger.info(f"   - Ranking: {ranking_csv_path}")
        logger.info(f"   - Trophies: {trophies_csv_path}")
        logger.info(f"   - Global dataset: {dataset_global_csv_path}")
        logger.info(f"   - Country dataset: {dataset_country_csv_path}")
        logger.info(f"   - Competition dataset: {dataset_competition_csv_path}")

        # Scrape match data from FBRef
        logger.info("   🔍 Scraping FBRef data...")

        scraper = FBRefScraper(
            last_season, self.features_config, comp_name, comp_config, comp_type, days
        )
        df_matches, players_data, keepers_data = scraper.run_before()

        # Add competition metadata columns
        df_matches["competition_type"] = comp_type
        df_matches["competition_country"] = comp_country

        # Players and keepers go through the exact same normalise-then-append
        # pipeline; only the label used in error messages and the target differ.
        self._append_frame_to_csv(
            self._coerce_to_dataframe(players_data, "players_data"),
            players_csv_path,
        )
        self._append_frame_to_csv(
            self._coerce_to_dataframe(keepers_data, "keepers_data"),
            keepers_csv_path,
        )

        return df_matches

    @staticmethod
    def _coerce_to_dataframe(raw: Any, label: str) -> pd.DataFrame:
        """Normalise scraper output (DataFrame or list of rows) to a DataFrame.

        Accepts a DataFrame, or a list whose elements are all dicts, all
        DataFrames, or all objects exposing ``to_dict()``. ``label`` is only
        used to build the TypeError message.
        """
        if isinstance(raw, pd.DataFrame):
            return raw
        if not isinstance(raw, list):
            raise TypeError(f"Unsupported {label} type: {type(raw)}")
        if not raw:
            return pd.DataFrame()
        if all(isinstance(item, dict) for item in raw):
            return pd.DataFrame(raw)
        if all(isinstance(item, pd.DataFrame) for item in raw):
            return pd.concat(raw, ignore_index=True)
        if all(hasattr(item, "to_dict") for item in raw):
            return pd.DataFrame([item.to_dict() for item in raw])
        raise TypeError(f"{label} list has unsupported element types")

    @staticmethod
    def _append_frame_to_csv(frame: Optional[pd.DataFrame], csv_path: str) -> None:
        """Append ``frame`` to ``csv_path``, creating folder and file as needed.

        The parent folder is always created; nothing is written when the frame
        is None or empty.
        """
        os.makedirs(os.path.dirname(os.path.abspath(csv_path)), exist_ok=True)

        if frame is None or frame.empty:
            return

        # A zero-byte file has no header yet, so treat it like a missing file;
        # otherwise the appended rows would end up in a headerless CSV.
        has_content = os.path.isfile(csv_path) and os.path.getsize(csv_path) > 0
        frame.to_csv(
            csv_path,
            mode="a" if has_content else "w",
            header=not has_content,
            index=False,
            encoding="utf-8",
        )

    def _infer_competition_type(self, competition: str) -> str:
        """
        Infer competition type from competition name.

        Matching is case-insensitive and substring based. The most specific
        markers are tested first: 'uefa' and 'fifa' both contain 'fa' (a cup
        keyword) and UEFA competitions often contain 'league', so testing the
        generic keywords first would make the 'european' and 'international'
        results unreachable.

        Args:
            competition: Competition identifier

        Returns:
            Competition type: 'league', 'cup', 'supercup', 'european', or
            'international'. Names matching no keyword default to 'league'.
        """
        comp_lower = competition.lower()

        def mentions(*keywords: str) -> bool:
            return any(keyword in comp_lower for keyword in keywords)

        # Governing-body markers first: they identify the type unambiguously.
        if "uefa" in comp_lower:
            return "european"
        if "fifa" in comp_lower:
            return "international"
        # Supercups before cups: e.g. 'supercoppa' also contains 'coppa'.
        if mentions("supercup", "supercopa", "supercoppa", "shield"):
            return "supercup"
        if mentions("league", "liga", "serie", "bundesliga", "ligue"):
            return "league"
        if mentions("fa", "carabao", "rey", "pokal", "coppa"):
            return "cup"
        return "league"

    def _enrich_match_data(
        self,
        match: dict,
        competition: str,
        comp_type: str,
        comp_country: str,
        injury_data: dict,
        rankings: dict,
        trophy_data: dict,
    ) -> dict:
        """
        Attach competition, availability, ranking and trophy data to a match.

        The ``match`` dictionary is modified in place and also returned. Teams
        are looked up by the match's 'home_team' / 'away_team' values (empty
        string when absent); a team missing from a lookup table yields an
        empty list (injuries, suspensions) or an empty dict (ranking, trophies).

        Args:
            match: Base match dictionary (mutated)
            competition: Competition identifier
            comp_type: Competition type
            comp_country: Competition country
            injury_data: Per-team dict with 'injuries' and 'suspensions' entries
            rankings: Per-team ranking information
            trophy_data: Per-team trophy information

        Returns:
            The same ``match`` dictionary, enriched
        """
        sides = (
            ("home", match.get("home_team", "")),
            ("away", match.get("away_team", "")),
        )

        # Competition metadata
        match["competition"] = competition
        match["competition_type"] = comp_type
        match["competition_country"] = comp_country

        # Team data will be added here once team_manager is available.

        # Injuries and suspensions, home side first
        for side, team in sides:
            for kind in ("injuries", "suspensions"):
                match[f"{side}_{kind}"] = injury_data.get(team, {}).get(kind, [])

        # Rankings for both sides, then trophies for both sides
        for suffix, table in (("ranking", rankings), ("trophies", trophy_data)):
            for side, team in sides:
                match[f"{side}_{suffix}"] = table.get(team, {})

        return match

    def _check_and_add_new_teams(self, matches: List[dict]) -> List[str]:
        """
        Check for new teams in matches and add them to the team database.

        Args:
            matches: List of match dictionaries

        Returns:
            List of newly added team names
        """
        # TODO: Implement when team_manager is ready
        return []

        # When ready:
        # all_teams = set()
        # for match in matches:
        #     all_teams.add(match.get('home_team'))
        #     all_teams.add(match.get('away_team'))
        #
        # new_teams = []
        # for team in all_teams:
        #     if team and not self.team_manager.team_exists(team):
        #         logger.info(f"      🆕 New team found: {team}. Scraping Transfermarkt data...")
        #         team_data = self.transfermarkt_scraper.scrape_team(team)
        #         self.team_manager.add_team(team, team_data)
        #         new_teams.append(team)
        #
        # return new_teams

    def _prepare_target_matches(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Extract home_team, away_team, and date from a DataFrame to prepare
        target matches for FotMob scraper.

        Args:
            df: DataFrame with columns ['home_team_name', 'away_team_name', 'date_of_match']

        Returns:
            DataFrame with columns ['date', 'home_team', 'away_team'] ready for scraper
        """
        # Select relevant columns
        target_matches = df[
            ["date_of_match", "home_team_name", "away_team_name"]
        ].copy()

        # Rename columns to match scraper expected format
        target_matches = target_matches.rename(
            columns={
                "date_of_match": "date",
                "home_team_name": "home_team",
                "away_team_name": "away_team",
            }
        )

        # Remove duplicates if any
        target_matches = target_matches.drop_duplicates()

        # Reset index
        target_matches = target_matches.reset_index(drop=True)

        return target_matches

    def _save_to_csv(self, df: pd.DataFrame) -> None:
        """
        Save the updated dataset to CSV, appending to existing data.

        The target file is ``self.dataset_csv`` when that attribute has been
        set on the instance; otherwise it falls back to
        ``self.dataset_global_csv_path``, because ``__init__`` does not define
        ``dataset_csv`` and the method would otherwise fail with
        AttributeError.
        NOTE(review): confirm that the global dataset is the intended target.

        When the file already holds data, rows are merged with ``df`` and
        de-duplicated on ('match_id', 'date'), keeping the newest row. When
        the file is missing or completely empty, ``df`` is written as-is.

        Args:
            df: DataFrame to save
        """
        target_path = getattr(self, "dataset_csv", None) or self.dataset_global_csv_path

        try:
            # Try to load existing data
            existing_df = pd.read_csv(target_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No usable file yet (missing or zero-byte): start a new one
            df.to_csv(target_path, index=False)
            logger.info(f"💾 New dataset created at {target_path}")
            logger.info(f"   • Total matches: {len(df)}")
            return

        initial_count = len(existing_df)

        # Combine and remove duplicates; new rows come last so keep="last"
        # lets them replace older versions of the same match.
        combined_df = pd.concat([existing_df, df], ignore_index=True).drop_duplicates(
            subset=["match_id", "date"], keep="last"
        )

        # Save combined data
        combined_df.to_csv(target_path, index=False)

        new_matches = len(combined_df) - initial_count
        logger.info(f"💾 Updated dataset saved to {target_path}")
        logger.info(f"   • Previous matches: {initial_count}")
        logger.info(f"   • New matches added: {new_matches}")
        logger.info(f"   • Total matches: {len(combined_df)}")
