import os
import warnings
import pandas as pd
from pybaseball import statcast


# TODO: note that pybaseball returns ALL games (not just regular season)
# TODO: for book, should we filter to regular for any data directly provided?
# TODO: download always gets ALL data, but loading can filter to only regular season
# TODO: can pyarrow dtypes be used here? would that need to be changed in pybaseball?
def download_statcast_seasons(start_year, end_year=None, force_download=False):
    """Download Statcast data one calendar year at a time and cache it as parquet.

    Each year is requested from ``statcast`` for January 1 through December 31,
    converted to pyarrow-backed dtypes, and written to
    ``statcast/sc-{year}.parquet`` (a path relative to the current working
    directory). The ``statcast`` directory is created if it does not exist.

    Args:
        start_year: First year to download.
        end_year: Last year to download, inclusive. When ``None``, only
            ``start_year`` is downloaded.
        force_download: When true, download and overwrite a year's file even
            if it already exists; otherwise existing files are skipped.

    Raises:
        ValueError: If either year is not an integer (booleans are rejected),
            or if ``start_year`` is greater than ``end_year``.
    """
    # define end_year if not provided by user, only download start year
    if end_year is None:
        end_year = start_year

    # check that years are provided as integers; bool is a subclass of int,
    # so reject it explicitly rather than treating True/False as year 1/0
    for value in (start_year, end_year):
        if isinstance(value, bool) or not isinstance(value, int):
            raise ValueError("'start_year' and 'end_year' must be integers.")

    # check that years are properly ordered
    if start_year > end_year:
        raise ValueError("'start_year' must be less than or equal to 'end_year'.")

    # make sure the output directory exists before any download starts, so a
    # missing directory cannot cause a failure after the data has been fetched
    os.makedirs("statcast", exist_ok=True)

    for year in range(start_year, end_year + 1):
        parquet_path = f"statcast/sc-{year}.parquet"
        if os.path.exists(parquet_path) and not force_download:
            print(f"Data for {year} already exists, skipping.")
            continue

        # FutureWarnings raised while fetching/converting are suppressed
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", FutureWarning)
            print(f"Downloading data for {year}.")
            df = statcast(start_dt=f"{year}-01-01", end_dt=f"{year}-12-31")
            df = df.convert_dtypes(dtype_backend="pyarrow")
            df.to_parquet(parquet_path, index=False)


def load_statcast_season(year, force_download=False, regular_season=False):
    """Load one year of cached Statcast data, downloading it when needed.

    The data is read from ``statcast/sc-{year}.parquet``. If that file does not
    exist, or if ``force_download`` is true, ``download_statcast_seasons`` is
    called for the single year before reading.

    Args:
        year: Year to load.
        force_download: When true, re-download the year even if a cached file
            already exists.
        regular_season: When true, pass the loaded frame through
            ``filter_regular_season`` before returning it.

    Returns:
        The DataFrame read from the parquet file, optionally filtered.
    """
    parquet_path = f"statcast/sc-{year}.parquet"
    missing = not os.path.exists(parquet_path)
    if missing:
        print(f"Data for {year} does not exist, downloading.")
    # honor force_download even when a cached file is present; previously the
    # flag was only forwarded when the file was missing, so it had no effect
    if missing or force_download:
        download_statcast_seasons(year, year, force_download=force_download)
    df = pd.read_parquet(parquet_path)
    if regular_season:
        df = filter_regular_season(df)
    return df


def filter_regular_season(df):
    """Return a copy of ``df`` restricted to rows whose ``game_type`` is ``"R"``.

    Args:
        df: DataFrame with a ``game_type`` column.

    Returns:
        A new DataFrame (a copy, not a view) holding only the matching rows.

    Raises:
        ValueError: If ``df`` has no ``game_type`` column.
    """
    if "game_type" in df.columns:
        is_regular = df["game_type"] == "R"
        regular_only = df[is_regular]
        return regular_only.copy()
    raise ValueError("DataFrame must contain a 'game_type' column.")


# TODO: remove columns that are generally never used
# TODO: also rearrange according to groupings found in book appendix about statcast? "process?"
def remove_columns(df):
    """Placeholder that is not implemented yet.

    Currently ignores ``df`` and returns ``None``.
    """
    return None
