Merge pull request #39 from thecodeforest/moving-average-fix

Moving average fix
thecodeforest · Oct 7, 2022 · c7fba52 · c7fba52
2 parents d81d838 + ad74bd7
commit c7fba52
Show file tree

Hide file tree

Showing 13 changed files with 62 additions and 26 deletions.
diff --git a/.github/workflows/data_refresh.yml b/.github/workflows/data_refresh.yml
@@ -1,6 +1,6 @@
 name: Data Pipeline
 
-on: [push]
+# on: [push]
 
 jobs: 
  data-pipeline: 

diff --git a/pipeline/process/process_defense.py b/pipeline/process/process_defense.py
@@ -131,29 +131,33 @@ def rank_defense(df: pd.DataFrame, stats_columns: List[str]) -> pd.DataFrame:
 
 
 @pf.register_dataframe_method
-def create_future_defense_rankings(df: pd.DataFrame, current_week: str) -> pd.DataFrame:
- """Creates a dataframe containing the future defense rankings for each team based
- their ranking from the prior week. Simply carries over the prior week's ranking.
+def append_future_week_defense_rankings(
+ df: pd.DataFrame, season_year: int
+) -> pd.DataFrame:
+ """Shifts the previous week's defense rankings to the current week.
+
 
  Args:
- df (pd.DataFrame): The dataframe containing the defense rankings.
- current_week (str): The current week of the season. Note that
- this week should not have any games played yet.
+ df (pd.DataFrame): defense rankings for each team.
+ season_year (int): The current season year.
 
  Returns:
- pd.DataFrame: The future defense rankings for each team along with the
-  historical defense rankings.
+ pd.DataFrame: The defense rankings for each team based on cumulative stats
+ leading up to the current week.
  """
- most_recent_actual_week = max(df["week"])
- if most_recent_actual_week < int(current_week):
- most_recent_week_df = df.query(f"week == {most_recent_actual_week}")
- # replace week with current_week
- most_recent_week_df["week"] = int(current_week)
- # concat with defense_stats_df
- df = pd.concat([df, most_recent_week_df])
- return df
- else:
- return df
+ max_week = max(df["week"])
+ is_add_future_week = (season_year > 2020 and max_week <= 17) or (
+ season_year <= 2020 and max_week <= 16
+ )
+ if is_add_future_week:
+ future_week_place_holder_df = pd.DataFrame(
+ zip([max_week + 1] * len(df["opp"].unique()), df["opp"].unique()),
+ columns=["week", "opp"],
+ )
+ df = pd.concat([df, future_week_place_holder_df])
+ for def_rank_column in [x for x in df.columns if x.endswith("_rank")]:
+ df = df.assign(**{def_rank_column: df.groupby("opp")[def_rank_column].shift(1)})
+ return df
 
 
 if __name__ == "__main__":
@@ -179,9 +183,8 @@ def create_future_defense_rankings(df: pd.DataFrame, current_week: str) -> pd.Da
  )
  cumulative_stats.insert(0, "week", week)
  defense_stats_df = pd.concat([defense_stats_df, cumulative_stats])
- current_week = fetch_current_week(calendar_df)
- defense_stats_df = defense_stats_df.create_future_defense_rankings(
- current_week=current_week
+ defense_stats_df = defense_stats_df.append_future_week_defense_rankings(
+ season_year=args.season_year
  )
  defense_stats_df["season_year"] = args.season_year
  defense_stats_df.write_ff_csv(root_dir, args.season_year, dir_type, data_type)
diff --git a/pipeline/utils.py b/pipeline/utils.py
@@ -330,6 +330,7 @@ def read_ff_csv(dir_path: PosixPath) -> pd.DataFrame:
  return df
 
 
+# TODO: add logic to handle the case where the function is called outside of the season
 def fetch_current_week(calendar_df: pd.DataFrame) -> str:
  """Fetches the current week from the calendar dataframe."""
  # first determine which day of the week it is

diff --git a/src/fantasyfootball/datasets/season/2015/defense.gz b/src/fantasyfootball/datasets/season/2015/defense.gz
diff --git a/src/fantasyfootball/datasets/season/2016/defense.gz b/src/fantasyfootball/datasets/season/2016/defense.gz
diff --git a/src/fantasyfootball/datasets/season/2017/defense.gz b/src/fantasyfootball/datasets/season/2017/defense.gz
diff --git a/src/fantasyfootball/datasets/season/2018/defense.gz b/src/fantasyfootball/datasets/season/2018/defense.gz
diff --git a/src/fantasyfootball/datasets/season/2019/defense.gz b/src/fantasyfootball/datasets/season/2019/defense.gz
diff --git a/src/fantasyfootball/datasets/season/2020/defense.gz b/src/fantasyfootball/datasets/season/2020/defense.gz
diff --git a/src/fantasyfootball/datasets/season/2021/defense.gz b/src/fantasyfootball/datasets/season/2021/defense.gz
diff --git a/src/fantasyfootball/datasets/season/2022/defense.gz b/src/fantasyfootball/datasets/season/2022/defense.gz
diff --git a/src/fantasyfootball/features.py b/src/fantasyfootball/features.py
@@ -77,9 +77,17 @@ def transform(self, X, y=None):
  for col in self.window_columns:
  for window in self.n_week_window:
  col_name = f"{col}_ma_{window}"
+ # sort by player_group_columns and week
+ X = X.sort_values(self.player_group_columns + ["week"])
+ # shift the column by 1 to get the previous value
+ X = X.assign(
+ **{col_name: X.groupby(self.player_group_columns)[col].shift(1)}
+ )
  X = X.assign(
  **{
- col_name: X.groupby(self.player_group_columns)[col].transform(
+ col_name: X.groupby(self.player_group_columns)[
+ col_name
+ ].transform(
  lambda x: x.rolling(
  window, min_periods=1, center=False
  ).mean()
@@ -467,6 +475,18 @@ def create_future_week(self) -> FantasyFeatures:
  future_week_df = pd.merge(
  future_week_df, stats_df, how="left", on=["name", "team"]
  )
+ # load in defensive stats data and add in for future week
+ defense_df = pd.read_csv(ff_data_dir / "defense.gz", compression="gzip")
+ # drop the defensive ranking fields in future week df, because they are NaN
+ future_week_df = future_week_df.drop(
+ columns=[col for col in future_week_df.columns if "_def_rank" in col]
+ )
+ # filter to future week
+ defense_df = defense_df[defense_df["week"] == max_week + 1]
+ # add the defensive rankings back into the future frame
+ future_week_df = pd.merge(
+ future_week_df, defense_df, how="left", on=["opp", "week", "season_year"]
+ )
  future_week_df["is_future_week"] = 1
  self.df = (
  pd.concat([self.df, future_week_df], axis=0)

diff --git a/tests/test_features.py b/tests/test_features.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import numpy as np
 import pytest
 from fantasyfootball.config import data_sources, root_dir
 from fantasyfootball.features import (
@@ -425,7 +426,18 @@ def test_add_moving_average_feature(df):
  window_columns = "passing_yds"
  n_week_window = 2
  expected_column_name = "passing_yds_ma_2"
- expected_values = [0.0, 148.5, 254.0, 293.0, 0.0, 445.0, 222.5, 195.0, 234.5, 229.0]
+ expected_values = expected_values = [
+ 0,
+ 0,
+ 148.5,
+ 254.0,
+ 0,
+ 0,
+ 445.0,
+ 0,
+ 195.0,
+ 234.5,
+ ]
  features = FantasyFeatures(df, y="actual_pts", position="QB")
  features.add_moving_avg_feature(
  n_week_window=n_week_window, window_columns=window_columns
@@ -435,8 +447,8 @@ def test_add_moving_average_feature(df):
  # check column name correctly formatted
  assert expected_column_name in result.columns
 
- # check column values correct
- assert result[expected_column_name].values.tolist() == expected_values
+ # check column values are correct; fillna(0) is applied to account for NaNs
+ assert result[expected_column_name].fillna(0).values.tolist() == expected_values
 
 
 def test_add_target_encoded_feature(df):