diff --git a/.github/workflows/data_refresh.yml b/.github/workflows/data_refresh.yml index 74a72b5..4367a3e 100644 --- a/.github/workflows/data_refresh.yml +++ b/.github/workflows/data_refresh.yml @@ -1,6 +1,6 @@ name: Data Pipeline -on: [push] +# on: [push] jobs: data-pipeline: diff --git a/pipeline/process/process_defense.py b/pipeline/process/process_defense.py index 891ba2f..ccbd203 100644 --- a/pipeline/process/process_defense.py +++ b/pipeline/process/process_defense.py @@ -131,29 +131,33 @@ def rank_defense(df: pd.DataFrame, stats_columns: List[str]) -> pd.DataFrame: @pf.register_dataframe_method -def create_future_defense_rankings(df: pd.DataFrame, current_week: str) -> pd.DataFrame: - """Creates a dataframe containing the future defense rankings for each team based - their ranking from the prior week. Simply carries over the prior week's ranking. +def append_future_week_defense_rankings( + df: pd.DataFrame, season_year: int +) -> pd.DataFrame: + """Shifts the previous week's defense rankings to the current week. + Args: - df (pd.DataFrame): The dataframe containing the defense rankings. - current_week (str): The current week of the season. Note that - this week should not have any games played yet. + df (pd.DataFrame): defense rankings for each team. + season_year (int): The current season year. Returns: - pd.DataFrame: The future defense rankings for each team along with the - historical defense rankings. + pd.DataFrame: The defense rankings for each team based on cumulative stats + leading up to the current week. 
""" - most_recent_actual_week = max(df["week"]) - if most_recent_actual_week < int(current_week): - most_recent_week_df = df.query(f"week == {most_recent_actual_week}") - # replace week with current_week - most_recent_week_df["week"] = int(current_week) - # concat with defense_stats_df - df = pd.concat([df, most_recent_week_df]) - return df - else: - return df + max_week = max(df["week"]) + is_add_future_week = (season_year > 2020 and max_week <= 17) or ( + season_year <= 2020 and max_week <= 16 + ) + if is_add_future_week: + future_week_place_holder_df = pd.DataFrame( + zip([max_week + 1] * len(df["opp"].unique()), df["opp"].unique()), + columns=["week", "opp"], + ) + df = pd.concat([df, future_week_place_holder_df]) + for def_rank_column in [x for x in df.columns if x.endswith("_rank")]: + df = df.assign(**{def_rank_column: df.groupby("opp")[def_rank_column].shift(1)}) + return df if __name__ == "__main__": @@ -179,9 +183,8 @@ def create_future_defense_rankings(df: pd.DataFrame, current_week: str) -> pd.Da ) cumulative_stats.insert(0, "week", week) defense_stats_df = pd.concat([defense_stats_df, cumulative_stats]) - current_week = fetch_current_week(calendar_df) - defense_stats_df = defense_stats_df.create_future_defense_rankings( - current_week=current_week + defense_stats_df = defense_stats_df.append_future_week_defense_rankings( + season_year=args.season_year ) defense_stats_df["season_year"] = args.season_year defense_stats_df.write_ff_csv(root_dir, args.season_year, dir_type, data_type) diff --git a/pipeline/utils.py b/pipeline/utils.py index 9c7a82d..e1ae147 100644 --- a/pipeline/utils.py +++ b/pipeline/utils.py @@ -330,6 +330,7 @@ def read_ff_csv(dir_path: PosixPath) -> pd.DataFrame: return df +# TO DO: create logic to handle if function is called outside of the season def fetch_current_week(calendar_df: pd.DataFrame) -> str: """Fetches the current week from the calendar dataframe.""" # first determine which day of the week it is diff --git 
a/src/fantasyfootball/datasets/season/2015/defense.gz b/src/fantasyfootball/datasets/season/2015/defense.gz index 78ef22e..686fc49 100644 Binary files a/src/fantasyfootball/datasets/season/2015/defense.gz and b/src/fantasyfootball/datasets/season/2015/defense.gz differ diff --git a/src/fantasyfootball/datasets/season/2016/defense.gz b/src/fantasyfootball/datasets/season/2016/defense.gz index 536ba59..92fa2e4 100644 Binary files a/src/fantasyfootball/datasets/season/2016/defense.gz and b/src/fantasyfootball/datasets/season/2016/defense.gz differ diff --git a/src/fantasyfootball/datasets/season/2017/defense.gz b/src/fantasyfootball/datasets/season/2017/defense.gz index 76283cf..43eaede 100644 Binary files a/src/fantasyfootball/datasets/season/2017/defense.gz and b/src/fantasyfootball/datasets/season/2017/defense.gz differ diff --git a/src/fantasyfootball/datasets/season/2018/defense.gz b/src/fantasyfootball/datasets/season/2018/defense.gz index 2b5b9a2..aef5670 100644 Binary files a/src/fantasyfootball/datasets/season/2018/defense.gz and b/src/fantasyfootball/datasets/season/2018/defense.gz differ diff --git a/src/fantasyfootball/datasets/season/2019/defense.gz b/src/fantasyfootball/datasets/season/2019/defense.gz index 6278674..2217e53 100644 Binary files a/src/fantasyfootball/datasets/season/2019/defense.gz and b/src/fantasyfootball/datasets/season/2019/defense.gz differ diff --git a/src/fantasyfootball/datasets/season/2020/defense.gz b/src/fantasyfootball/datasets/season/2020/defense.gz index 940e7d9..dc5cd11 100644 Binary files a/src/fantasyfootball/datasets/season/2020/defense.gz and b/src/fantasyfootball/datasets/season/2020/defense.gz differ diff --git a/src/fantasyfootball/datasets/season/2021/defense.gz b/src/fantasyfootball/datasets/season/2021/defense.gz index 149864f..4cbf799 100644 Binary files a/src/fantasyfootball/datasets/season/2021/defense.gz and b/src/fantasyfootball/datasets/season/2021/defense.gz differ diff --git 
a/src/fantasyfootball/datasets/season/2022/defense.gz b/src/fantasyfootball/datasets/season/2022/defense.gz index 2a2387b..89b1581 100644 Binary files a/src/fantasyfootball/datasets/season/2022/defense.gz and b/src/fantasyfootball/datasets/season/2022/defense.gz differ diff --git a/src/fantasyfootball/features.py b/src/fantasyfootball/features.py index 7913090..b9ce31c 100644 --- a/src/fantasyfootball/features.py +++ b/src/fantasyfootball/features.py @@ -77,9 +77,17 @@ def transform(self, X, y=None): for col in self.window_columns: for window in self.n_week_window: col_name = f"{col}_ma_{window}" + # sort by player_group_columns and week + X = X.sort_values(self.player_group_columns + ["week"]) + # shift the column by 1 to get the previous value + X = X.assign( + **{col_name: X.groupby(self.player_group_columns)[col].shift(1)} + ) X = X.assign( **{ - col_name: X.groupby(self.player_group_columns)[col].transform( + col_name: X.groupby(self.player_group_columns)[ + col_name + ].transform( lambda x: x.rolling( window, min_periods=1, center=False ).mean() @@ -467,6 +475,18 @@ def create_future_week(self) -> FantasyFeatures: future_week_df = pd.merge( future_week_df, stats_df, how="left", on=["name", "team"] ) + # load in defensive stats data and add in for future week + defense_df = pd.read_csv(ff_data_dir / "defense.gz", compression="gzip") + # drop the defensive ranking fields in future week df, because they are NaN + future_week_df = future_week_df.drop( + columns=[col for col in future_week_df.columns if "_def_rank" in col] + ) + # filter to future week + defense_df = defense_df[defense_df["week"] == max_week + 1] + # add the defensive rankings back into the future frame + future_week_df = pd.merge( + future_week_df, defense_df, how="left", on=["opp", "week", "season_year"] + ) future_week_df["is_future_week"] = 1 self.df = ( pd.concat([self.df, future_week_df], axis=0) diff --git a/tests/test_features.py b/tests/test_features.py index 8b4fa52..f0d897e 100644 ---
a/tests/test_features.py +++ b/tests/test_features.py @@ -1,4 +1,5 @@ import pandas as pd +import numpy as np import pytest from fantasyfootball.config import data_sources, root_dir from fantasyfootball.features import ( @@ -425,7 +426,18 @@ def test_add_moving_average_feature(df): window_columns = "passing_yds" n_week_window = 2 expected_column_name = "passing_yds_ma_2" - expected_values = [0.0, 148.5, 254.0, 293.0, 0.0, 445.0, 222.5, 195.0, 234.5, 229.0] + expected_values = expected_values = [ + 0, + 0, + 148.5, + 254.0, + 0, + 0, + 445.0, + 0, + 195.0, + 234.5, + ] features = FantasyFeatures(df, y="actual_pts", position="QB") features.add_moving_avg_feature( n_week_window=n_week_window, window_columns=window_columns @@ -435,8 +447,8 @@ def test_add_moving_average_feature(df): # check column name correctly formatted assert expected_column_name in result.columns - # check column values correct - assert result[expected_column_name].values.tolist() == expected_values + # check column values correct, also fillna(0) to account for NaNs + assert result[expected_column_name].fillna(0).values.tolist() == expected_values def test_add_target_encoded_feature(df):