Skip to content

Commit

Permalink
Merge pull request #39 from thecodeforest/moving-average-fix
Browse files Browse the repository at this point in the history
Moving average fix
  • Loading branch information
thecodeforest authored Oct 7, 2022
2 parents d81d838 + ad74bd7 commit c7fba52
Show file tree
Hide file tree
Showing 13 changed files with 62 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/data_refresh.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: Data Pipeline

on: [push]
# on: [push]

jobs:
data-pipeline:
Expand Down
45 changes: 24 additions & 21 deletions pipeline/process/process_defense.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,29 +131,33 @@ def rank_defense(df: pd.DataFrame, stats_columns: List[str]) -> pd.DataFrame:


@pf.register_dataframe_method
def create_future_defense_rankings(df: pd.DataFrame, current_week: str) -> pd.DataFrame:
"""Creates a dataframe containing the future defense rankings for each team based
on their ranking from the prior week. Simply carries over the prior week's ranking.
def append_future_week_defense_rankings(
df: pd.DataFrame, season_year: int
) -> pd.DataFrame:
"""Shifts the previous week's defense rankings to the current week.
Args:
df (pd.DataFrame): The dataframe containing the defense rankings.
current_week (str): The current week of the season. Note that
this week should not have any games played yet.
df (pd.DataFrame): defense rankings for each team.
season_year (int): The current season year.
Returns:
pd.DataFrame: The future defense rankings for each team along with the
historical defense rankings.
pd.DataFrame: The defense rankings for each team based on cumulative stats
leading up to the current week.
"""
most_recent_actual_week = max(df["week"])
if most_recent_actual_week < int(current_week):
most_recent_week_df = df.query(f"week == {most_recent_actual_week}")
# replace week with current_week
most_recent_week_df["week"] = int(current_week)
# concat with defense_stats_df
df = pd.concat([df, most_recent_week_df])
return df
else:
return df
max_week = max(df["week"])
is_add_future_week = (season_year > 2020 and max_week <= 17) or (
season_year <= 2020 and max_week <= 16
)
if is_add_future_week:
future_week_place_holder_df = pd.DataFrame(
zip([max_week + 1] * len(df["opp"].unique()), df["opp"].unique()),
columns=["week", "opp"],
)
df = pd.concat([df, future_week_place_holder_df])
for def_rank_column in [x for x in df.columns if x.endswith("_rank")]:
df = df.assign(**{def_rank_column: df.groupby("opp")[def_rank_column].shift(1)})
return df


if __name__ == "__main__":
Expand All @@ -179,9 +183,8 @@ def create_future_defense_rankings(df: pd.DataFrame, current_week: str) -> pd.Da
)
cumulative_stats.insert(0, "week", week)
defense_stats_df = pd.concat([defense_stats_df, cumulative_stats])
current_week = fetch_current_week(calendar_df)
defense_stats_df = defense_stats_df.create_future_defense_rankings(
current_week=current_week
defense_stats_df = defense_stats_df.append_future_week_defense_rankings(
season_year=args.season_year
)
defense_stats_df["season_year"] = args.season_year
defense_stats_df.write_ff_csv(root_dir, args.season_year, dir_type, data_type)
1 change: 1 addition & 0 deletions pipeline/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ def read_ff_csv(dir_path: PosixPath) -> pd.DataFrame:
return df


# TODO: add logic to handle this function being called outside of the season
def fetch_current_week(calendar_df: pd.DataFrame) -> str:
"""Fetches the current week from the calendar dataframe."""
# first determine which day of the week it is
Expand Down
Binary file modified src/fantasyfootball/datasets/season/2015/defense.gz
Binary file not shown.
Binary file modified src/fantasyfootball/datasets/season/2016/defense.gz
Binary file not shown.
Binary file modified src/fantasyfootball/datasets/season/2017/defense.gz
Binary file not shown.
Binary file modified src/fantasyfootball/datasets/season/2018/defense.gz
Binary file not shown.
Binary file modified src/fantasyfootball/datasets/season/2019/defense.gz
Binary file not shown.
Binary file modified src/fantasyfootball/datasets/season/2020/defense.gz
Binary file not shown.
Binary file modified src/fantasyfootball/datasets/season/2021/defense.gz
Binary file not shown.
Binary file modified src/fantasyfootball/datasets/season/2022/defense.gz
Binary file not shown.
22 changes: 21 additions & 1 deletion src/fantasyfootball/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,17 @@ def transform(self, X, y=None):
for col in self.window_columns:
for window in self.n_week_window:
col_name = f"{col}_ma_{window}"
# sort by player_group_columns and week
X = X.sort_values(self.player_group_columns + ["week"])
# shift the column by 1 to get the previous value
X = X.assign(
**{col_name: X.groupby(self.player_group_columns)[col].shift(1)}
)
X = X.assign(
**{
col_name: X.groupby(self.player_group_columns)[col].transform(
col_name: X.groupby(self.player_group_columns)[
col_name
].transform(
lambda x: x.rolling(
window, min_periods=1, center=False
).mean()
Expand Down Expand Up @@ -467,6 +475,18 @@ def create_future_week(self) -> FantasyFeatures:
future_week_df = pd.merge(
future_week_df, stats_df, how="left", on=["name", "team"]
)
# load in defensive stats data and add in for future week
defense_df = pd.read_csv(ff_data_dir / "defense.gz", compression="gzip")
# drop the defensive ranking fields in the future week df, because they are NaN
future_week_df = future_week_df.drop(
columns=[col for col in future_week_df.columns if "_def_rank" in col]
)
# filter to future week
defense_df = defense_df[defense_df["week"] == max_week + 1]
# add the defensive rankings back into the future frame
future_week_df = pd.merge(
future_week_df, defense_df, how="left", on=["opp", "week", "season_year"]
)
future_week_df["is_future_week"] = 1
self.df = (
pd.concat([self.df, future_week_df], axis=0)
Expand Down
18 changes: 15 additions & 3 deletions tests/test_features.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pandas as pd
import numpy as np
import pytest
from fantasyfootball.config import data_sources, root_dir
from fantasyfootball.features import (
Expand Down Expand Up @@ -425,7 +426,18 @@ def test_add_moving_average_feature(df):
window_columns = "passing_yds"
n_week_window = 2
expected_column_name = "passing_yds_ma_2"
expected_values = [0.0, 148.5, 254.0, 293.0, 0.0, 445.0, 222.5, 195.0, 234.5, 229.0]
expected_values = expected_values = [
0,
0,
148.5,
254.0,
0,
0,
445.0,
0,
195.0,
234.5,
]
features = FantasyFeatures(df, y="actual_pts", position="QB")
features.add_moving_avg_feature(
n_week_window=n_week_window, window_columns=window_columns
Expand All @@ -435,8 +447,8 @@ def test_add_moving_average_feature(df):
# check column name correctly formatted
assert expected_column_name in result.columns

# check column values correct
assert result[expected_column_name].values.tolist() == expected_values
# check that the column values are correct; fillna(0) accounts for NaNs
assert result[expected_column_name].fillna(0).values.tolist() == expected_values


def test_add_target_encoded_feature(df):
Expand Down

0 comments on commit c7fba52

Please sign in to comment.