"""
players_state_builder.py

Build a players_state_df from a player-by-match CSV.
Auto-detects OUTFIELD vs KEEPERS schemas and computes IS variants:
- Global
- Home / Away
- Per competition
- Per competition × (Home/Away)

Usage:
    python players_state_builder.py \
        --input_csv data/players_matches.csv \
        --output_csv data/players_state_df.csv
"""

import os
import numpy as np
import pandas as pd
from typing import List, Dict
from datetime import datetime, timedelta

# ---------------------------
# Outfield config (5 buckets)
# ---------------------------
# Raw per-match outfield columns. `per90_outfield` creates any that are
# missing (as 0.0) and derives a "<name>_per90" column for each of them.
OUTFIELD_BASE_METRICS: List[str] = [
    # Finishing / creation
    "PlayersShots",
    "PlayersShotsOnTarget",
    "PlayersExpectedAssistance",
    "PlayersKeyPasses",
    "PlayersAwayPenaltyAreaTouches",
    # Passing progression
    "PlayersDistanceProgression",
    "PlayersThroughPasses",
    "PlayersCrosses",
    "PlayersLiveBallPasses",
    "PlayersDeadBallPasses",
    # Carrying progression
    "PlayersBallCarries",
    "PlayersDistanceCarried",
    "PlayersForwardCarries",
    # Dribbling
    "PlayersDribblesCompleted",
    "PlayersAttemptedDribbles",
    # Defensive actions
    "PlayersTackles",
    "PlayersInterceptions",
    "PlayersTackles+Interceptions",
    "PlayersBallsBlocked",
    "PlayersShotsBlocked",
    "PlayersClearances",
    # Duels
    "PlayersAerialsWon",
    "PlayersAerialsLost",
]

# 5 buckets: CD (centre-backs), FB (full-backs), MF (midfielders),
# WG (wingers), ST (forwards).
# For each bucket, the base metrics whose "<name>_per90" z-scores are averaged
# into the IS value by `compute_IS_outfield_subset`.
CORE_METRICS_BY_ROLE: Dict[str, List[str]] = {
    "CD": [
        "PlayersTackles+Interceptions",
        "PlayersClearances",
        "PlayersAerialsWon",
        "PlayersShotsBlocked",
        "PlayersBallsBlocked",
    ],
    "FB": [
        "PlayersDistanceProgression",
        "PlayersCrosses",
        "PlayersTackles+Interceptions",
        "PlayersDribblesCompleted",
        "PlayersAwayPenaltyAreaTouches",
        "PlayersAttemptedDribbles",
        "PlayersBallsBlocked",
        "PlayersShotsBlocked",
        "PlayersClearances",
    ],
    "MF": [
        "PlayersDistanceProgression",
        "PlayersLiveBallPasses",
        "PlayersKeyPasses",
        "PlayersTackles+Interceptions",
        "PlayersBallCarries",
        "PlayersDistanceCarried",
        "PlayersThroughPasses",
        "PlayersExpectedAssistance",
        "PlayersDeadBallPasses",
        "PlayersBallsBlocked",
        "PlayersShotsBlocked",
        "PlayersClearances",
    ],
    "WG": [
        "PlayersExpectedAssistance",
        "PlayersKeyPasses",
        "PlayersCrosses",
        "PlayersDribblesCompleted",
        "PlayersAwayPenaltyAreaTouches",
        "PlayersForwardCarries",
        "PlayersShots",
        "PlayersShotsOnTarget",
        "PlayersDistanceProgression",
        "PlayersThroughPasses",
        "PlayersBallCarries",
        "PlayersDistanceCarried",
        "PlayersAttemptedDribbles",
    ],
    "ST": [
        "PlayersShots",
        "PlayersShotsOnTarget",
        "PlayersExpectedAssistance",
        "PlayersKeyPasses",
        "PlayersAwayPenaltyAreaTouches",
        "PlayersThroughPasses",
        "PlayersDribblesCompleted",
        "PlayersAttemptedDribbles",
        "PlayersAerialsWon",
    ],
}

# ---------------------------
# Keepers config
# ---------------------------
# Raw per-match goalkeeper columns. `per90_keepers` creates any that are
# missing (as 0.0) before deriving per90 values and rates.
KEEPER_BASE_METRICS: List[str] = [
    "KeepersShotsOnTargetAgainst",
    "KeepersGoalsAgainst",
    "KeepersSaved",
    "KeepersxG",
    "KeepersPasses",
    "KeepersAttemptedPasses",
    "KeepersPassesDistance",
    "KeepersPassesLaunched",
    "KeepersAttemptedPassesLaunched",
    "KeepersAttemptedKicks",
    "KeepersKicksDistance",
    "KeepersCrosses",
    "KeepersCrossesStopped",
    "KeepersActionsOutsideArea",
    "KeepersDistanceActionsArea",
    # Percentage columns may be present in the input; they are not needed for
    # per90, but were meant to be used as rates when they exist.
    # NOTE(review): the code in this file recomputes rates from the raw counts
    # and never reads these "%" columns - confirm whether they are still needed.
    "Keepers%Saved",
    "Keepers%CompletedPasses",
    "Keepers%CompletedPassesLaunched",
    "Keepers%Kicks",
    "Keepers%CrossesStopped",
]

# Core features for the goalkeeper IS (per90 values plus robust rates when
# they are available):
# psxg_minus_ga_per90, save_pct, claims_per90/claims_pct, sweeper (actions/dist), distribution (passes/launch/dist)
# These are the column names produced by `per90_keepers` and z-scored by
# `compute_IS_keepers_subset`.
KEEPER_CORE_NAMES: List[str] = [
    "gk_psxg_minus_ga_per90",
    "gk_save_pct",
    "gk_claims_per90",
    "gk_claims_pct",
    "gk_actions_outside_area_per90",
    "gk_distance_actions_area_per90",
    "gk_passes_per90",
    "gk_launches_per90",
    "gk_passes_distance_per90",
    "gk_kicks_distance_per90",
]


# ---------------------------
# Helpers
# ---------------------------
def normalize_role(position: str) -> str:
    """
    Map a raw position label to one of 5 role buckets:
      CD (centre-backs), FB (full-backs), MF (midfielders),
      WG (wingers), ST (forwards).

    The label is stripped and upper-cased before matching. Missing values
    (``None`` or NaN, which is what pandas passes for an empty CSV cell) and
    any unrecognised label fall back to ``"MF"``. Numeric shirt-number style
    codes are accepted as numbers too (``9`` or ``9.0`` behaves like ``"9"``).

    Args:
        position: Raw position label; non-string values are tolerated.

    Returns:
        One of ``"CD"``, ``"FB"``, ``"MF"``, ``"WG"``, ``"ST"``.
    """
    if position is None:
        label = ""
    elif isinstance(position, str):
        label = position
    elif isinstance(position, float):
        if position != position:
            # NaN is truthy, so `position or ""` would not catch it.
            label = ""
        elif position.is_integer():
            # A numeric column read as float: 9.0 must match the "9" code.
            label = str(int(position))
        else:
            label = str(position)
    else:
        label = str(position)
    label = label.strip().upper()

    buckets = (
        # Centre-backs
        ("CD", {"CB", "RCB", "LCB", "CBR", "CBL", "SW"}),
        # Full-backs / wing-backs
        ("FB", {"RB", "LB", "RWB", "LWB", "FB", "RFB", "LFB", "WB"}),
        # Midfielders
        ("MF", {"DM", "CDM", "CM", "RCM", "LCM", "MC", "AM", "CAM", "6", "8", "10"}),
        # Wingers
        ("WG", {"LW", "RW", "WF", "W", "LWF", "RWF", "LM", "RM"}),
        # Forwards
        ("ST", {"FW", "CF", "ST", "9", "SS"}),
    )
    for bucket, labels in buckets:
        if label in labels:
            return bucket
    # Fallback for unknown or missing labels
    return "MF"


def ensure_columns(df: pd.DataFrame, cols: List[str]) -> None:
    """Add every column of ``cols`` that ``df`` lacks, modifying ``df`` in place.

    Identifier/label columns are created as empty strings; every other
    missing column is created as ``0.0``. Existing columns are left untouched.
    """
    text_columns = {
        "Players",
        "Position",
        "team",
        "competition_type",
        "Keepers",
        "competition_name",
    }
    for name in cols:
        if name in df.columns:
            continue
        df[name] = "" if name in text_columns else 0.0


def zscore_group(series: pd.Series, group: pd.Series) -> pd.Series:
    """Z-score ``series`` within each group given by ``group``.

    Uses the population standard deviation (``ddof=0``). When a group's
    deviation is not above 1e-9 the values are only centred (divided by 1.0),
    so constant groups yield zeros instead of a division by zero.
    """

    def _standardize(values: pd.Series) -> pd.Series:
        spread = values.std(ddof=0)
        if not spread > 1e-9:
            spread = 1.0
        return (values - values.mean()) / spread

    paired = pd.DataFrame({"x": series, "g": group})
    return paired.groupby("g")["x"].transform(_standardize)


def pick_team_name_column(df: pd.DataFrame) -> str | None:
    candidates = [
        "team_name",
        "name",
        "club",
        "squad",
        "Equipo",
        "equipo",
        "team_long_name",
        "team_short_name",
        "TeamName",
        "Club",
        "Squad",
        "team",  # fallback (home/away)
    ]
    for c in candidates:
        if c in df.columns:
            return c
    return None


# ---------------------------
# Outfield pipeline
# ---------------------------
def per90_outfield(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with per-90 versions of the outfield metrics.

    Missing minutes count as 0 and the divisor is floored at 1 minute, so the
    scaling never divides by zero. Every metric in ``OUTFIELD_BASE_METRICS``
    is created as 0.0 when absent and gets a ``<metric>_per90`` companion.
    Two success rates (dribbles, aerial duels) are added as well; they are 0.0
    wherever the denominator is zero or missing.
    """
    out = df.copy()
    out["PlayersMinutes"] = out["PlayersMinutes"].fillna(0.0)
    scale = 90.0 / out["PlayersMinutes"].clip(lower=1.0)

    for metric in OUTFIELD_BASE_METRICS:
        if metric not in out.columns:
            out[metric] = 0.0
        out[f"{metric}_per90"] = out[metric].fillna(0.0) * scale

    # Derived rates (not part of the IS core, kept for convenience)
    dribble_cols = ("PlayersDribblesCompleted", "PlayersAttemptedDribbles")
    if all(col in out.columns for col in dribble_cols):
        completed = out["PlayersDribblesCompleted"].fillna(0.0)
        attempted = out["PlayersAttemptedDribbles"].replace(0, np.nan)
        out["dribble_success"] = (completed / attempted).fillna(0.0)

    aerial_cols = ("PlayersAerialsWon", "PlayersAerialsLost")
    if all(col in out.columns for col in aerial_cols):
        won = out["PlayersAerialsWon"].fillna(0.0)
        lost = out["PlayersAerialsLost"].fillna(0.0)
        contested = (won + lost).replace(0, np.nan)
        out["aerial_win_rate"] = (won / contested).fillna(0.0)

    return out


def compute_IS_outfield_subset(df_sub: pd.DataFrame) -> pd.DataFrame:
    """Compute one IS value per (Players, role_bucket) within ``df_sub``.

    All ``*_per90`` columns are averaged per player and role. Within each
    role, the role's core metrics (``CORE_METRICS_BY_ROLE``) are z-scored
    across that role's players and the z-scores are averaged into ``IS``.
    Roles without any available core metric get ``IS = 0.0``.

    Returns:
        DataFrame with columns ``Players``, ``role_bucket``, ``IS``; empty
        (with those columns) when there are no per90 columns or no rows.
    """
    id_cols = ["Players", "role_bucket"]
    result_cols = id_cols + ["IS"]

    rate_cols = [name for name in df_sub.columns if name.endswith("_per90")]
    if not rate_cols:
        return pd.DataFrame(columns=result_cols)

    # One row per player & role bucket: mean of the per-match per90 values
    per_player = df_sub.groupby(id_cols, as_index=False)[rate_cols].mean()

    pieces = []
    for role in per_player["role_bucket"].unique():
        role_rows = per_player[per_player["role_bucket"] == role].copy()
        core_cols = [
            f"{metric}_per90"
            for metric in CORE_METRICS_BY_ROLE.get(role, [])
            if f"{metric}_per90" in per_player.columns
        ]
        if core_cols:
            z_cols = []
            for col in core_cols:
                z_name = f"z_{col}"
                role_rows[z_name] = zscore_group(
                    role_rows[col], role_rows["role_bucket"]
                )
                z_cols.append(z_name)
            role_rows["IS"] = role_rows[z_cols].mean(axis=1).fillna(0.0)
            pieces.append(role_rows[result_cols])
        else:
            zeroed = role_rows[id_cols].copy()
            zeroed["IS"] = 0.0
            pieces.append(zeroed)

    if not pieces:
        return pd.DataFrame(columns=result_cols)
    return pd.concat(pieces, ignore_index=True)


def build_outfield_state(df: pd.DataFrame) -> pd.DataFrame:
    """Build the outfield state table: one row per (Players, role_bucket).

    Output columns, in order:
      - ``IS_global``, ``IS_home``, ``IS_away``
      - for every distinct ``competition_type`` value ``c`` (sorted):
        ``IS_c``, ``IS_c_home``, ``IS_c_away``
      - ``n_matches_global``, ``n_matches_home``, ``n_matches_away``
        (row counts, stored as floats, 0 when the subset is absent)
      - ``team_name``: value of the detected team-name column on the player's
        last row, after sorting by ``match_date`` or ``date`` when present.

    IS columns stay NaN for players absent from the corresponding subset.
    Home/away rows are identified by the ``team`` column (case-insensitive
    ``"home"`` / ``"away"``).

    The input frame is not modified; missing required columns are added to an
    internal copy only.

    Args:
        df: Player-by-match rows using the outfield schema.

    Returns:
        State DataFrame sorted by ``role_bucket`` then ``Players``.
    """
    keys = ["Players", "role_bucket"]
    base_needed = [
        "Players",
        "Position",
        "PlayersMinutes",
        "team",
        "competition_type",
        "competition_name",
    ]

    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()
    ensure_columns(df, base_needed + OUTFIELD_BASE_METRICS)
    df["role_bucket"] = df["Position"].map(normalize_role)
    dfp = per90_outfield(df)

    # Home/away masks are reused by every subset below; compute them once.
    venue = dfp["team"].str.lower()
    home_mask = venue == "home"
    away_mask = venue == "away"

    def _is_as(frame: pd.DataFrame, label: str) -> pd.DataFrame:
        # IS for one subset, with the score column renamed to `label`.
        return compute_IS_outfield_subset(frame).rename(columns={"IS": label})

    state = (
        _is_as(dfp, "IS_global")
        .merge(_is_as(dfp[home_mask], "IS_home"), on=keys, how="left")
        .merge(_is_as(dfp[away_mask], "IS_away"), on=keys, how="left")
    )

    # Per competition and per competition x venue
    competitions = sorted(dfp["competition_type"].dropna().unique().tolist())
    for comp in competitions:
        comp_mask = dfp["competition_type"] == comp
        subsets = (
            (f"IS_{comp}", comp_mask),
            (f"IS_{comp}_home", comp_mask & home_mask),
            (f"IS_{comp}_away", comp_mask & away_mask),
        )
        for label, mask in subsets:
            state = state.merge(_is_as(dfp[mask], label), on=keys, how="left")

    # Row counts per subset
    count_sources = (
        ("n_matches_global", dfp),
        ("n_matches_home", dfp[home_mask]),
        ("n_matches_away", dfp[away_mask]),
    )
    for label, frame in count_sources:
        counts = frame.groupby(keys).size().reset_index(name=label)
        state = state.merge(counts, on=keys, how="left")

    # Most recent team_name per player
    team_name_col = pick_team_name_column(df)
    if "match_date" in df.columns:
        df_sorted = df.sort_values("match_date").reset_index(drop=True)
    elif "date" in df.columns:
        df_sorted = df.sort_values("date").reset_index(drop=True)
    else:
        df_sorted = df.reset_index(drop=True)

    if team_name_col:
        last_team = (
            df_sorted.dropna(subset=["Players"])
            .drop_duplicates(subset=["Players"], keep="last")[
                ["Players", team_name_col]
            ]
            .rename(columns={team_name_col: "team_name"})
        )
        state = state.merge(last_team, on="Players", how="left")

    # Counts: NaN -> 0. IS columns deliberately keep NaN for absent subsets.
    for label, _ in count_sources:
        if label in state.columns:
            state[label] = state[label].fillna(0).astype(float)

    return state.sort_values(["role_bucket", "Players"]).reset_index(drop=True)


# ---------------------------
# Keepers pipeline
# ---------------------------
def per90_keepers(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with goalkeeper per-90 values and rates.

    Missing minutes count as 0 and the divisor is floored at 1 minute. Every
    column in ``KEEPER_BASE_METRICS`` is created as 0.0 when absent. Count and
    distance metrics get a ``<metric>_per90`` companion; save and claim rates
    are recomputed from raw counts (0.0 when the denominator is zero or
    missing). The ``gk_*`` columns are the feature names used for the keeper
    IS. ``role_bucket`` is set to ``"GK"`` and ``Keepers`` is renamed to
    ``Players`` unless a ``Players`` column already exists.
    """
    out = df.copy()
    out["KeepersMinutes"] = out["KeepersMinutes"].fillna(0.0)
    scale = 90.0 / out["KeepersMinutes"].clip(lower=1.0)

    # Guarantee that every base column exists
    for metric in KEEPER_BASE_METRICS:
        if metric not in out.columns:
            out[metric] = 0.0

    # Count / distance fields that are scaled to per90
    scaled_metrics = (
        "KeepersShotsOnTargetAgainst",
        "KeepersGoalsAgainst",
        "KeepersSaved",
        "KeepersxG",
        "KeepersPasses",
        "KeepersAttemptedPasses",
        "KeepersPassesDistance",
        "KeepersPassesLaunched",
        "KeepersAttemptedPassesLaunched",
        "KeepersAttemptedKicks",
        "KeepersKicksDistance",
        "KeepersCrosses",
        "KeepersCrossesStopped",
        "KeepersActionsOutsideArea",
        "KeepersDistanceActionsArea",
    )
    for metric in scaled_metrics:
        out[f"{metric}_per90"] = out[metric].fillna(0.0) * scale

    # Robust rates: a zero denominator yields 0.0 rather than inf/NaN
    saves = out["KeepersSaved"].fillna(0.0)
    shots_faced = out["KeepersShotsOnTargetAgainst"].replace(0, np.nan)
    out["gk_save_pct"] = (saves / shots_faced).fillna(0.0)

    stopped = out["KeepersCrossesStopped"].fillna(0.0)
    crosses_faced = out["KeepersCrosses"].replace(0, np.nan)
    out["gk_claims_pct"] = (stopped / crosses_faced).fillna(0.0)

    # psxg_minus_ga per90 (valid if KeepersxG is post-shot xG conceded)
    out["gk_psxg_minus_ga_per90"] = (
        out["KeepersxG_per90"] - out["KeepersGoalsAgainst_per90"]
    )

    # Convenience aliases for the per90 features
    aliases = {
        "gk_claims_per90": "KeepersCrossesStopped_per90",
        "gk_actions_outside_area_per90": "KeepersActionsOutsideArea_per90",
        "gk_distance_actions_area_per90": "KeepersDistanceActionsArea_per90",
        "gk_passes_per90": "KeepersPasses_per90",
        "gk_launches_per90": "KeepersPassesLaunched_per90",
        "gk_passes_distance_per90": "KeepersPassesDistance_per90",
        "gk_kicks_distance_per90": "KeepersKicksDistance_per90",
    }
    for alias, source in aliases.items():
        out[alias] = out[source]

    # Keepers always share a single role bucket
    out["role_bucket"] = "GK"

    # Standardize the keeper id column name
    if "Keepers" in out.columns and "Players" not in out.columns:
        out = out.rename(columns={"Keepers": "Players"})
    return out


def compute_IS_keepers_subset(df_sub: pd.DataFrame) -> pd.DataFrame:
    """Compute one IS value per goalkeeper within ``df_sub``.

    The ``KEEPER_CORE_NAMES`` features present in ``df_sub`` are averaged per
    (Players, role_bucket), z-scored across keepers and averaged into ``IS``.
    An empty subset yields an empty frame; a subset without any core feature
    yields ``IS = 0.0`` for every keeper.

    Returns:
        DataFrame with columns ``Players``, ``role_bucket``, ``IS``.
    """
    id_cols = ["Players", "role_bucket"]
    result_cols = id_cols + ["IS"]

    if df_sub.empty:
        return pd.DataFrame(columns=result_cols)

    features = [name for name in KEEPER_CORE_NAMES if name in df_sub.columns]
    if not features:
        # No core metrics available -> neutral score for everyone
        flat = df_sub.groupby(id_cols, as_index=False).size()[id_cols]
        flat["IS"] = 0.0
        return flat

    # One row per keeper: mean of the per-match features
    per_keeper = df_sub.groupby(id_cols, as_index=False)[features].mean()

    # Z-score each feature across keepers (single 'GK' role)
    z_cols = []
    for feature in features:
        z_name = f"z_{feature}"
        per_keeper[z_name] = zscore_group(
            per_keeper[feature], per_keeper["role_bucket"]
        )
        z_cols.append(z_name)

    per_keeper["IS"] = per_keeper[z_cols].mean(axis=1).fillna(0.0)
    return per_keeper[result_cols]


def build_keepers_state(df: pd.DataFrame) -> pd.DataFrame:
    """Build the goalkeeper state table: one row per (Players, role_bucket).

    Output columns, in order:
      - ``IS_global``, ``IS_home``, ``IS_away``
      - for every distinct ``competition_type`` value ``c`` (sorted):
        ``IS_c``, ``IS_c_home``, ``IS_c_away``
      - ``n_matches_global``, ``n_matches_home``, ``n_matches_away``
        (row counts, stored as floats, 0 when the subset is absent)
      - ``team_name``: value of the detected team-name column on the keeper's
        last row, after sorting by ``match_date`` or ``date`` when present.

    IS columns stay NaN for keepers absent from the corresponding subset.
    Home/away rows are identified by the ``team`` column (case-insensitive
    ``"home"`` / ``"away"``).

    The input frame is not modified; missing required columns are added to an
    internal copy only.

    Args:
        df: Keeper-by-match rows using the keepers schema (``Keepers`` column).

    Returns:
        State DataFrame sorted by ``role_bucket`` then ``Players``.
    """
    keys = ["Players", "role_bucket"]
    base_needed = ["team", "competition_type", "Keepers", "KeepersMinutes"]

    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()
    ensure_columns(df, base_needed + KEEPER_BASE_METRICS)

    dfp = per90_keepers(df)

    # Home/away masks are reused by every subset below; compute them once.
    venue = dfp["team"].str.lower()
    home_mask = venue == "home"
    away_mask = venue == "away"

    def _is_as(frame: pd.DataFrame, label: str) -> pd.DataFrame:
        # IS for one subset, with the score column renamed to `label`.
        return compute_IS_keepers_subset(frame).rename(columns={"IS": label})

    state = (
        _is_as(dfp, "IS_global")
        .merge(_is_as(dfp[home_mask], "IS_home"), on=keys, how="left")
        .merge(_is_as(dfp[away_mask], "IS_away"), on=keys, how="left")
    )

    # Per competition and per competition x venue
    competitions = sorted(dfp["competition_type"].dropna().unique().tolist())
    for comp in competitions:
        comp_mask = dfp["competition_type"] == comp
        subsets = (
            (f"IS_{comp}", comp_mask),
            (f"IS_{comp}_home", comp_mask & home_mask),
            (f"IS_{comp}_away", comp_mask & away_mask),
        )
        for label, mask in subsets:
            state = state.merge(_is_as(dfp[mask], label), on=keys, how="left")

    # Row counts per subset
    count_sources = (
        ("n_matches_global", dfp),
        ("n_matches_home", dfp[home_mask]),
        ("n_matches_away", dfp[away_mask]),
    )
    for label, frame in count_sources:
        counts = frame.groupby(keys).size().reset_index(name=label)
        state = state.merge(counts, on=keys, how="left")

    # Most recent team_name per keeper (looked up by the raw "Keepers" column)
    team_name_col = pick_team_name_column(df)
    if "match_date" in df.columns:
        df_sorted = df.sort_values("match_date").reset_index(drop=True)
    elif "date" in df.columns:
        df_sorted = df.sort_values("date").reset_index(drop=True)
    else:
        df_sorted = df.reset_index(drop=True)

    if team_name_col:
        last_team = (
            df_sorted.dropna(subset=["Keepers"])
            .drop_duplicates(subset=["Keepers"], keep="last")[
                ["Keepers", team_name_col]
            ]
            .rename(columns={"Keepers": "Players", team_name_col: "team_name"})
        )
        state = state.merge(last_team, on="Players", how="left")

    # Counts: NaN -> 0. IS columns deliberately keep NaN for absent subsets.
    for label, _ in count_sources:
        if label in state.columns:
            state[label] = state[label].fillna(0).astype(float)

    return state.sort_values(["role_bucket", "Players"]).reset_index(drop=True)


# ---------------------------
# Auto-detect & CLI
# ---------------------------
def update_csv_with_new_data(
    csv_path: str,
    new_df: pd.DataFrame,
    key_column: str = "Players",
    max_age_days: int = 60,
):
    """
    Merge new rows into the CSV at ``csv_path`` and prune stale rows.

    Existing rows whose ``key_column`` value also appears in ``new_df`` are
    replaced by the new rows; all other existing rows are kept. Every new row
    is stamped with today's date in ``last_updated`` (``YYYY-MM-DD``), and
    rows whose ``last_updated`` is older than ``max_age_days`` are dropped
    before the file is written. If the file does not exist it is created from
    ``new_df``.

    Side effect: ``new_df`` gains (or has overwritten) a string
    ``last_updated`` column. It is otherwise left untouched, so the column
    has the same string type whether or not the CSV already existed.

    Args:
        csv_path: Path to the CSV file
        new_df: New DataFrame with updated data
        key_column: Column name to match rows (default: "Players")
        max_age_days: Retention window in days (default: 60, about 2 months)
    """
    # Take "now" once so the stamp and the cutoff are consistent
    now = datetime.now()
    today = now.strftime("%Y-%m-%d")

    # Stamp the new data (intentionally in place, see docstring)
    new_df["last_updated"] = today

    if os.path.exists(csv_path):
        existing_df = pd.read_csv(csv_path)

        # Files written before the stamp existed are treated as fresh
        if "last_updated" not in existing_df.columns:
            existing_df["last_updated"] = today

        # Keep only the existing rows that the new data does not replace
        new_keys = set(new_df[key_column].unique())
        rows_to_keep = existing_df[~existing_df[key_column].isin(new_keys)]

        updated_df = pd.concat([rows_to_keep, new_df], ignore_index=True)
    else:
        updated_df = new_df

    # Drop rows older than the retention window. `assign` returns a new frame,
    # so the datetime conversion never leaks into the caller's `new_df`.
    cutoff = pd.Timestamp((now - timedelta(days=max_age_days)).date())
    updated_df = updated_df.assign(
        last_updated=pd.to_datetime(updated_df["last_updated"])
    )
    updated_df = updated_df[updated_df["last_updated"] >= cutoff].copy()

    # Convert back to string format for CSV
    updated_df["last_updated"] = updated_df["last_updated"].dt.strftime("%Y-%m-%d")

    # Save
    updated_df.to_csv(csv_path, index=False)


def build_players_state_df(csv_path: str, final_csv_path: str) -> pd.DataFrame:
    """
    Auto-detect schema:
      - If 'Keepers' in columns -> keepers pipeline
      - Else if 'Players' in columns -> outfield pipeline
    """
    # Check if the file exists
    if not os.path.exists(csv_path):
        print(f"Error: File {csv_path} does not exist.")
        return None

    # Try to read the CSV file
    try:
        df = pd.read_csv(csv_path, low_memory=False)
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    # Check if the DataFrame is empty
    if df.empty:
        print(f"Warning: File {csv_path} is empty.")
        return None

    # Process keepers data
    if "Keepers" in df.columns:
        keepers_df = build_keepers_state(df)

        update_csv_with_new_data(final_csv_path, keepers_df, key_column="Players")
        return keepers_df

    # Process outfield players data
    elif "Players" in df.columns:
        players_df = build_outfield_state(df)

        update_csv_with_new_data(final_csv_path, players_df, key_column="Players")
        return players_df

    # Raise error if neither column is found
    else:
        raise ValueError(
            "Input CSV must have either 'Players' (outfield) or 'Keepers' (keepers) column."
        )
