From a094f50fae2d7ebc59609b0b3f6fa5688532cebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Wed, 6 Mar 2024 13:15:27 -0600 Subject: [PATCH] add trend and pipeline to feature_engineering (#70) --- action_files/lint | 2 +- nbs/feature_engineering.ipynb | 1010 +++++++++++++++++++++++--- pyproject.toml | 2 +- settings.ini | 2 +- utilsforecast/__init__.py | 2 +- utilsforecast/_modidx.py | 16 +- utilsforecast/feature_engineering.py | 231 ++++-- 7 files changed, 1091 insertions(+), 174 deletions(-) diff --git a/action_files/lint b/action_files/lint index 88776fa..4179493 100755 --- a/action_files/lint +++ b/action_files/lint @@ -1,3 +1,3 @@ #!/usr/bin/env bash -ruff utilsforecast || exit -1 +ruff check utilsforecast || exit -1 mypy utilsforecast || exit -1 diff --git a/nbs/feature_engineering.ipynb b/nbs/feature_engineering.ipynb index 68f7a9b..7f02b5f 100644 --- a/nbs/feature_engineering.ipynb +++ b/nbs/feature_engineering.ipynb @@ -39,7 +39,8 @@ "outputs": [], "source": [ "#| export\n", - "from typing import Tuple\n", + "from functools import partial\n", + "from typing import Callable, List, Optional, Tuple\n", "\n", "import numpy as np\n", "\n", @@ -48,6 +49,103 @@ "from utilsforecast.validation import validate_format, validate_freq" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e70c32b-cf3c-4988-8fde-a54b8252b6e5", + "metadata": {}, + "outputs": [], + "source": [ + "#| exporti\n", + "_Features = Tuple[List[str], np.ndarray, np.ndarray]\n", + "\n", + "def _add_features(\n", + " df: DataFrame,\n", + " freq: str,\n", + " h: int,\n", + " id_col: str,\n", + " time_col: str,\n", + " f: Callable[[np.ndarray, int], _Features],\n", + ") -> Tuple[DataFrame, DataFrame]:\n", + " # validations\n", + " if not isinstance(h, int) or h < 0:\n", + " raise ValueError('`h` must be a non-negative integer')\n", + " validate_format(df, id_col, time_col, None)\n", + " validate_freq(df[time_col], freq)\n", + "\n", + " # decompose series\n", + " id_counts = ufp.counts_by_id(df, id_col)\n", + " uids = id_counts[id_col]\n", + " sizes = id_counts['counts'].to_numpy()\n", + "\n", + " # compute values\n", + " cols, vals, future_vals = f(sizes=sizes, h=h) # type: ignore\n", + "\n", + " # assign back to df\n", + " sort_idxs = ufp.maybe_compute_sort_indices(df, id_col, time_col)\n", + " times = df[time_col]\n", + " if sort_idxs is not None:\n", + " restore_idxs = np.empty_like(sort_idxs)\n", + " restore_idxs[sort_idxs] = np.arange(sort_idxs.size)\n", + " vals = vals[restore_idxs]\n", + " times = ufp.take_rows(times, sort_idxs)\n", + " last_times = ufp.take_rows(times, sizes.cumsum() - 1)\n", + " df = ufp.copy_if_pandas(df, deep=False)\n", + " transformed = ufp.assign_columns(df, cols, vals)\n", + "\n", + " if h == 0:\n", + " return transformed, type(df)({})\n", + "\n", + " # future vals\n", + " future_df = ufp.make_future_dataframe(\n", + " uids=uids,\n", + " last_times=last_times,\n", + " freq=freq,\n", + " h=h,\n", + " id_col=id_col,\n", + " time_col=time_col,\n", + " )\n", + " future_df = ufp.assign_columns(future_df, cols, future_vals)\n", + " return transformed, future_df\n", + "\n", + "def _assign_slices(\n", + " sizes: np.ndarray,\n", + " feats: np.ndarray,\n", + " h: int,\n", + ") -> Tuple[np.ndarray, np.ndarray]:\n", + " max_samples, n_feats = feats.shape\n", + " vals = np.empty((sizes.sum(), n_feats), dtype=np.float32)\n", + " future_vals = np.empty((h * sizes.size, n_feats))\n", + " start = 0\n", + " for i, size in enumerate(sizes):\n", + " vals[start : start + size, :] 
= feats[max_samples - size - h : max_samples - h]\n", + " future_vals[i * h: (i + 1) * h] = feats[max_samples - h :]\n", + " start += size\n", + " return vals, future_vals\n", + "\n", + "def _fourier(\n", + " sizes: np.ndarray,\n", + " h: int,\n", + " season_length: int,\n", + " k: int,\n", + ") -> _Features:\n", + " # taken from: https://github.com/tblume1992/TSUtilities/blob/main/TSUtilities/TSFeatures/fourier_seasonality.py\n", + " x = 2 * np.pi * np.arange(1, k + 1) / season_length\n", + " x = x.astype(np.float32)\n", + " t = np.arange(1, sizes.max() + 1 + h, dtype=np.float32)\n", + " x = x * t[:, None]\n", + " terms = np.hstack([np.sin(x), np.cos(x)])\n", + " cols = [f'{op}{i+1}_{season_length}' for op in ('sin', 'cos') for i in range(k)]\n", + " vals, future_vals = _assign_slices(sizes=sizes, feats=terms, h=h)\n", + " return cols, vals, future_vals\n", + "\n", + "def _trend(sizes: np.ndarray, h: int) -> _Features:\n", + " t = np.arange(1, sizes.max() + 1 + h, dtype=np.float32).reshape(-1, 1)\n", + " cols = ['trend']\n", + " vals, future_vals = _assign_slices(sizes=sizes, feats=t, h=h)\n", + " return cols, vals, future_vals" + ] + }, { "cell_type": "code", "execution_count": null, @@ -91,59 +189,15 @@ " future_df : pandas or polars DataFrame\n", " DataFrame with future values\n", " \"\"\"\n", - " if not isinstance(h, int) or h < 0:\n", - " raise ValueError('`h` must be a non-negative integer')\n", - " validate_format(df, id_col, time_col, None)\n", - " validate_freq(df[time_col], freq)\n", - "\n", - " # compute terms based on the max size\n", - " id_counts = ufp.counts_by_id(df, id_col)\n", - " uids = id_counts[id_col]\n", - " sizes = id_counts['counts'].to_numpy()\n", - " max_size = sizes.max()\n", - " # taken from: https://github.com/tblume1992/TSUtilities/blob/main/TSUtilities/TSFeatures/fourier_seasonality.py\n", - " x = 2 * np.pi * np.arange(1, k + 1) / season_length\n", - " x = x.astype(np.float32)\n", - " t = np.arange(1, max_size + 1 + h, dtype=np.float32)\n", - " x = x * t[:, None]\n", - " terms = np.hstack([np.sin(x), np.cos(x)])\n", - " cols = [f'{op}{i+1}_{season_length}' for op in ('sin', 'cos') for i in range(k)]\n", - "\n", - " # slice for each serie\n", - " vals = np.empty((sizes.sum(), 2 * k), dtype=np.float32)\n", - " future_vals = np.empty((h * sizes.size, 2 * k), dtype=np.float32)\n", - " start = 0\n", - " for i, size in enumerate(sizes):\n", - " vals[start : start + size, :] = terms[:size]\n", - " future_vals[i * h: (i + 1) * h] = terms[size : size + h]\n", - " start += size\n", - "\n", - " # assign back to df\n", - " sort_idxs = ufp.maybe_compute_sort_indices(df, id_col, time_col)\n", - " times = df[time_col]\n", - " if sort_idxs is not None:\n", - " restore_idxs = np.empty_like(sort_idxs)\n", - " restore_idxs[sort_idxs] = np.arange(sort_idxs.size)\n", - " vals = vals[restore_idxs]\n", - " times = ufp.take_rows(times, sort_idxs)\n", - " last_times = ufp.take_rows(times, sizes.cumsum() - 1)\n", - " df = ufp.copy_if_pandas(df, deep=False)\n", - " transformed = ufp.assign_columns(df, cols, vals)\n", - "\n", - " if h == 0:\n", - " return transformed, type(df)({})\n", - "\n", - " # future vals\n", - " future_df = ufp.make_future_dataframe(\n", - " uids=uids,\n", - " last_times=last_times,\n", + " f = partial(_fourier, season_length=season_length, k=k)\n", + " return _add_features(\n", + " df=df,\n", " freq=freq,\n", " h=h,\n", " id_col=id_col,\n", " time_col=time_col,\n", - " )\n", - " future_df = ufp.assign_columns(future_df, cols, future_vals)\n", - " return 
transformed, future_df" + " f=f,\n", + " )" ] }, { @@ -198,52 +252,52 @@ " \n", " 0\n", " 0\n", - " 2000-01-01\n", + " 2000-10-05\n", " 0.428973\n", - " 0.781832\n", - " 0.974928\n", - " 0.623490\n", - " -0.222521\n", + " -0.974927\n", + " 0.433894\n", + " -0.222526\n", + " -0.900964\n", " \n", " \n", " 1\n", " 0\n", - " 2000-01-02\n", + " 2000-10-06\n", " 1.423626\n", - " 0.974928\n", - " -0.433884\n", - " -0.222521\n", - " -0.900969\n", + " -0.781835\n", + " -0.974926\n", + " 0.623486\n", + " -0.222531\n", " \n", " \n", " 2\n", " 0\n", - " 2000-01-03\n", + " 2000-10-07\n", " 2.311782\n", - " 0.433884\n", - " -0.781831\n", - " -0.900969\n", - " 0.623490\n", + " -0.000005\n", + " -0.000009\n", + " 1.000000\n", + " 1.000000\n", " \n", " \n", " 3\n", " 0\n", - " 2000-01-04\n", + " 2000-10-08\n", " 3.192191\n", - " -0.433884\n", - " 0.781832\n", - " -0.900969\n", - " 0.623490\n", + " 0.781829\n", + " 0.974930\n", + " 0.623493\n", + " -0.222512\n", " \n", " \n", " 4\n", " 0\n", - " 2000-01-05\n", + " 2000-10-09\n", " 4.148767\n", - " -0.974928\n", - " 0.433884\n", - " -0.222521\n", - " -0.900969\n", + " 0.974929\n", + " -0.433877\n", + " -0.222517\n", + " -0.900972\n", " \n", " \n", " ...\n", @@ -258,7 +312,7 @@ " \n", " 1096\n", " 4\n", - " 2001-01-03\n", + " 2001-05-10\n", " 4.058910\n", " -0.974927\n", " 0.433888\n", @@ -268,7 +322,7 @@ " \n", " 1097\n", " 4\n", - " 2001-01-04\n", + " 2001-05-11\n", " 5.178157\n", " -0.781823\n", " -0.974934\n", @@ -278,7 +332,7 @@ " \n", " 1098\n", " 4\n", - " 2001-01-05\n", + " 2001-05-12\n", " 6.133142\n", " -0.000002\n", " -0.000003\n", @@ -288,7 +342,7 @@ " \n", " 1099\n", " 4\n", - " 2001-01-06\n", + " 2001-05-13\n", " 0.403709\n", " 0.781840\n", " 0.974922\n", @@ -298,7 +352,7 @@ " \n", " 1100\n", " 4\n", - " 2001-01-07\n", + " 2001-05-14\n", " 1.081779\n", " 0.974928\n", " -0.433882\n", @@ -312,17 +366,17 @@ ], "text/plain": [ " unique_id ds y sin1_7 sin2_7 cos1_7 cos2_7\n", - "0 0 2000-01-01 0.428973 0.781832 0.974928 0.623490 -0.222521\n", - "1 0 2000-01-02 1.423626 0.974928 -0.433884 -0.222521 -0.900969\n", - "2 0 2000-01-03 2.311782 0.433884 -0.781831 -0.900969 0.623490\n", - "3 0 2000-01-04 3.192191 -0.433884 0.781832 -0.900969 0.623490\n", - "4 0 2000-01-05 4.148767 -0.974928 0.433884 -0.222521 -0.900969\n", + "0 0 2000-10-05 0.428973 -0.974927 0.433894 -0.222526 -0.900964\n", + "1 0 2000-10-06 1.423626 -0.781835 -0.974926 0.623486 -0.222531\n", + "2 0 2000-10-07 2.311782 -0.000005 -0.000009 1.000000 1.000000\n", + "3 0 2000-10-08 3.192191 0.781829 0.974930 0.623493 -0.222512\n", + "4 0 2000-10-09 4.148767 0.974929 -0.433877 -0.222517 -0.900972\n", "... ... ... ... ... ... ... 
...\n", - "1096 4 2001-01-03 4.058910 -0.974927 0.433888 -0.222523 -0.900967\n", - "1097 4 2001-01-04 5.178157 -0.781823 -0.974934 0.623500 -0.222495\n", - "1098 4 2001-01-05 6.133142 -0.000002 -0.000003 1.000000 1.000000\n", - "1099 4 2001-01-06 0.403709 0.781840 0.974922 0.623479 -0.222548\n", - "1100 4 2001-01-07 1.081779 0.974928 -0.433882 -0.222520 -0.900970\n", + "1096 4 2001-05-10 4.058910 -0.974927 0.433888 -0.222523 -0.900967\n", + "1097 4 2001-05-11 5.178157 -0.781823 -0.974934 0.623500 -0.222495\n", + "1098 4 2001-05-12 6.133142 -0.000002 -0.000003 1.000000 1.000000\n", + "1099 4 2001-05-13 0.403709 0.781840 0.974922 0.623479 -0.222548\n", + "1100 4 2001-05-14 1.081779 0.974928 -0.433882 -0.222520 -0.900970\n", "\n", "[1101 rows x 7 columns]" ] @@ -333,7 +387,7 @@ } ], "source": [ - "series = generate_series(5)\n", + "series = generate_series(5, equal_ends=True)\n", "transformed_df, future_df = fourier(series, freq='D', season_length=7, k=2, h=1)\n", "transformed_df" ] @@ -377,45 +431,45 @@ " \n", " 0\n", " 0\n", - " 2000-08-10\n", - " -7.818282e-01\n", - " -9.749302e-01\n", - " 0.623494\n", - " -0.222511\n", + " 2001-05-15\n", + " 0.433871\n", + " -0.781813\n", + " -0.900975\n", + " 0.623513\n", " \n", " \n", " 1\n", " 1\n", - " 2000-04-07\n", - " -2.320534e-06\n", - " -4.641067e-06\n", - " 1.000000\n", - " 1.000000\n", + " 2001-05-15\n", + " 0.433871\n", + " -0.781813\n", + " -0.900975\n", + " 0.623513\n", " \n", " \n", " 2\n", " 2\n", - " 2000-06-16\n", - " 3.815962e-07\n", - " 7.631924e-07\n", - " 1.000000\n", - " 1.000000\n", + " 2001-05-15\n", + " 0.433871\n", + " -0.781813\n", + " -0.900975\n", + " 0.623513\n", " \n", " \n", " 3\n", " 3\n", - " 2000-08-30\n", - " -9.749273e-01\n", - " 4.338886e-01\n", - " -0.222524\n", - " -0.900967\n", + " 2001-05-15\n", + " 0.433871\n", + " -0.781813\n", + " -0.900975\n", + " 0.623513\n", " \n", " \n", " 4\n", " 4\n", - " 2001-01-08\n", - " 4.338705e-01\n", - " -7.818132e-01\n", + " 2001-05-15\n", + " 0.433871\n", + " -0.781813\n", " -0.900975\n", " 0.623513\n", " \n", @@ -424,12 +478,12 @@ "" ], "text/plain": [ - " unique_id ds sin1_7 sin2_7 cos1_7 cos2_7\n", - "0 0 2000-08-10 -7.818282e-01 -9.749302e-01 0.623494 -0.222511\n", - "1 1 2000-04-07 -2.320534e-06 -4.641067e-06 1.000000 1.000000\n", - "2 2 2000-06-16 3.815962e-07 7.631924e-07 1.000000 1.000000\n", - "3 3 2000-08-30 -9.749273e-01 4.338886e-01 -0.222524 -0.900967\n", - "4 4 2001-01-08 4.338705e-01 -7.818132e-01 -0.900975 0.623513" + " unique_id ds sin1_7 sin2_7 cos1_7 cos2_7\n", + "0 0 2001-05-15 0.433871 -0.781813 -0.900975 0.623513\n", + "1 1 2001-05-15 0.433871 -0.781813 -0.900975 0.623513\n", + "2 2 2001-05-15 0.433871 -0.781813 -0.900975 0.623513\n", + "3 3 2001-05-15 0.433871 -0.781813 -0.900975 0.623513\n", + "4 4 2001-05-15 0.433871 -0.781813 -0.900975 0.623513" ] }, "execution_count": null, @@ -479,7 +533,7 @@ "source": [ "#| hide\n", "#| polars\n", - "series_pl = generate_series(5, engine='polars')\n", + "series_pl = generate_series(5, equal_ends=True, engine='polars')\n", "transformed_pl, future_pl = fourier(series_pl, freq='1d', season_length=7, k=2, h=1)\n", "transformed_pl2, future_pl2 = fourier(series_pl.sample(fraction=1.0), freq='1d', season_length=7, k=2, h=1)\n", "pl.testing.assert_frame_equal(transformed_pl, transformed_pl2)\n", @@ -493,6 +547,720 @@ " future_pl.drop(columns=['unique_id']).to_pandas()\n", ")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ef99e58-df8c-4121-990e-873e66de7bcb", + "metadata": {}, + "outputs": [], + 
"source": [ + "#| export\n", + "def trend(\n", + " df: DataFrame,\n", + " freq: str,\n", + " h: int = 0,\n", + " id_col: str = 'unique_id',\n", + " time_col: str = 'ds',\n", + ") -> Tuple[DataFrame, DataFrame]:\n", + " \"\"\"Add a trend column with consecutive integers for training and forecasting\n", + "\n", + " Parameters\n", + " ----------\n", + " df : pandas or polars DataFrame\n", + " Dataframe with ids, times and values for the exogenous regressors.\n", + " freq : str or int\n", + " Frequency of the data. Must be a valid pandas or polars offset alias, or an integer.\n", + " h : int (default=0)\n", + " Forecast horizon. \n", + " id_col : str (default='unique_id')\n", + " Column that identifies each serie.\n", + " time_col : str (default='ds')\n", + " Column that identifies each timestep, its values can be timestamps or integers.\n", + "\n", + " Returns\n", + " -------\n", + " transformed_df : pandas or polars DataFrame\n", + " Original DataFrame with the computed features\n", + " future_df : pandas or polars DataFrame\n", + " DataFrame with future values\n", + " \"\"\"\n", + " return _add_features(\n", + " df=df,\n", + " freq=freq,\n", + " h=h,\n", + " id_col=id_col,\n", + " time_col=time_col,\n", + " f=_trend,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0146556d-7084-4274-9735-cd79da4198e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_iddsytrend
002000-10-050.428973152.0
102000-10-061.423626153.0
202000-10-072.311782154.0
302000-10-083.192191155.0
402000-10-094.148767156.0
...............
109642001-05-104.058910369.0
109742001-05-115.178157370.0
109842001-05-126.133142371.0
109942001-05-130.403709372.0
110042001-05-141.081779373.0
\n", + "

1101 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " unique_id ds y trend\n", + "0 0 2000-10-05 0.428973 152.0\n", + "1 0 2000-10-06 1.423626 153.0\n", + "2 0 2000-10-07 2.311782 154.0\n", + "3 0 2000-10-08 3.192191 155.0\n", + "4 0 2000-10-09 4.148767 156.0\n", + "... ... ... ... ...\n", + "1096 4 2001-05-10 4.058910 369.0\n", + "1097 4 2001-05-11 5.178157 370.0\n", + "1098 4 2001-05-12 6.133142 371.0\n", + "1099 4 2001-05-13 0.403709 372.0\n", + "1100 4 2001-05-14 1.081779 373.0\n", + "\n", + "[1101 rows x 4 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "series = generate_series(5, equal_ends=True)\n", + "transformed_df, future_df = trend(series, freq='D', h=1)\n", + "transformed_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5901f01-404b-4bc4-91b0-27703f72693e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_iddstrend
002001-05-15374.0
112001-05-15374.0
222001-05-15374.0
332001-05-15374.0
442001-05-15374.0
\n", + "
" + ], + "text/plain": [ + " unique_id ds trend\n", + "0 0 2001-05-15 374.0\n", + "1 1 2001-05-15 374.0\n", + "2 2 2001-05-15 374.0\n", + "3 3 2001-05-15 374.0\n", + "4 4 2001-05-15 374.0" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "future_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3307d99-62fd-46b0-856b-ad4cf297b32c", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "def pipeline(\n", + " df: DataFrame,\n", + " features: List[Callable],\n", + " freq: str,\n", + " h: int = 0,\n", + " id_col: str = 'unique_id',\n", + " time_col: str = 'ds',\n", + ") -> Tuple[DataFrame, DataFrame]:\n", + " \"\"\"Compute several features for training and forecasting\n", + "\n", + " Parameters\n", + " ----------\n", + " df : pandas or polars DataFrame\n", + " Dataframe with ids, times and values for the exogenous regressors.\n", + " features : list of callable\n", + " List of features to compute. Must take only df, freq, h, id_col and time_col (other arguments must be fixed).\n", + " freq : str or int\n", + " Frequency of the data. Must be a valid pandas or polars offset alias, or an integer.\n", + " h : int (default=0)\n", + " Forecast horizon. \n", + " id_col : str (default='unique_id')\n", + " Column that identifies each serie.\n", + " time_col : str (default='ds')\n", + " Column that identifies each timestep, its values can be timestamps or integers.\n", + "\n", + " Returns\n", + " -------\n", + " transformed_df : pandas or polars DataFrame\n", + " Original DataFrame with the computed features\n", + " future_df : pandas or polars DataFrame\n", + " DataFrame with future values\n", + " \"\"\"\n", + " transformed: Optional[DataFrame] = None\n", + " future: Optional[DataFrame] = None\n", + " for f in features:\n", + " f_transformed, f_future = f(df=df, freq=freq, h=h, id_col=id_col, time_col=time_col)\n", + " if transformed is None:\n", + " transformed = f_transformed\n", + " future = f_future\n", + " else:\n", + " feat_cols = [c for c in f_future.columns if c not in (id_col, time_col)]\n", + " transformed = ufp.horizontal_concat([transformed, f_transformed[feat_cols]])\n", + " future = ufp.horizontal_concat([future, f_future[feat_cols]])\n", + " return transformed, future" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a7c6074-096b-4f9c-9f34-79ed65f06e0a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_iddsytrendsin1_7cos1_7sin1_28cos1_28
002000-10-050.428973152.0-0.974927-0.2225260.433885-9.009683e-01
102000-10-061.423626153.0-0.7818350.6234860.222522-9.749276e-01
202000-10-072.311782154.0-0.0000051.0000000.000001-1.000000e+00
302000-10-083.192191155.00.7818290.623493-0.222520-9.749281e-01
402000-10-094.148767156.00.974929-0.222517-0.433883-9.009693e-01
...........................
109642001-05-104.058910369.0-0.974927-0.2225230.9009694.338843e-01
109742001-05-115.178157370.0-0.7818230.6235000.9749292.225177e-01
109842001-05-126.133142371.0-0.0000021.0000001.0000004.251100e-07
109942001-05-130.403709372.00.7818400.6234790.974927-2.225243e-01
110042001-05-141.081779373.00.974928-0.2225200.900969-4.338835e-01
\n", + "

1101 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " unique_id ds y trend sin1_7 cos1_7 sin1_28 \\\n", + "0 0 2000-10-05 0.428973 152.0 -0.974927 -0.222526 0.433885 \n", + "1 0 2000-10-06 1.423626 153.0 -0.781835 0.623486 0.222522 \n", + "2 0 2000-10-07 2.311782 154.0 -0.000005 1.000000 0.000001 \n", + "3 0 2000-10-08 3.192191 155.0 0.781829 0.623493 -0.222520 \n", + "4 0 2000-10-09 4.148767 156.0 0.974929 -0.222517 -0.433883 \n", + "... ... ... ... ... ... ... ... \n", + "1096 4 2001-05-10 4.058910 369.0 -0.974927 -0.222523 0.900969 \n", + "1097 4 2001-05-11 5.178157 370.0 -0.781823 0.623500 0.974929 \n", + "1098 4 2001-05-12 6.133142 371.0 -0.000002 1.000000 1.000000 \n", + "1099 4 2001-05-13 0.403709 372.0 0.781840 0.623479 0.974927 \n", + "1100 4 2001-05-14 1.081779 373.0 0.974928 -0.222520 0.900969 \n", + "\n", + " cos1_28 \n", + "0 -9.009683e-01 \n", + "1 -9.749276e-01 \n", + "2 -1.000000e+00 \n", + "3 -9.749281e-01 \n", + "4 -9.009693e-01 \n", + "... ... \n", + "1096 4.338843e-01 \n", + "1097 2.225177e-01 \n", + "1098 4.251100e-07 \n", + "1099 -2.225243e-01 \n", + "1100 -4.338835e-01 \n", + "\n", + "[1101 rows x 8 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features = [\n", + " trend,\n", + " partial(fourier, season_length=7, k=1),\n", + " partial(fourier, season_length=28, k=1),\n", + "]\n", + "transformed_df, future_df = pipeline(\n", + " series,\n", + " features=features,\n", + " freq='D',\n", + " h=1,\n", + ")\n", + "transformed_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c454175-20d9-4031-8c23-91a9e0f7c416", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_iddstrendsin1_7cos1_7sin1_28cos1_28
002001-05-15374.00.433871-0.9009750.781829-0.623493
112001-05-15374.00.433871-0.9009750.781829-0.623493
222001-05-15374.00.433871-0.9009750.781829-0.623493
332001-05-15374.00.433871-0.9009750.781829-0.623493
442001-05-15374.00.433871-0.9009750.781829-0.623493
\n", + "
" + ], + "text/plain": [ + " unique_id ds trend sin1_7 cos1_7 sin1_28 cos1_28\n", + "0 0 2001-05-15 374.0 0.433871 -0.900975 0.781829 -0.623493\n", + "1 1 2001-05-15 374.0 0.433871 -0.900975 0.781829 -0.623493\n", + "2 2 2001-05-15 374.0 0.433871 -0.900975 0.781829 -0.623493\n", + "3 3 2001-05-15 374.0 0.433871 -0.900975 0.781829 -0.623493\n", + "4 4 2001-05-15 374.0 0.433871 -0.900975 0.781829 -0.623493" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "future_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f92c960-9ef6-49b7-83d3-cc6a3366c346", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "from functools import reduce" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83f7d54d-7c73-40f1-a192-8e5816e1ce29", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "def reduce_join(dfs, on):\n", + " return reduce(\n", + " lambda left, right: left.merge(right, on=on, how='left'),\n", + " dfs,\n", + " )\n", + "\n", + "individual_results = [f(series, freq='D', h=1) for f in features]\n", + "expected_transformed = reduce_join(\n", + " [r[0] for r in individual_results], on=['unique_id', 'ds', 'y']\n", + ")\n", + "expected_future = reduce_join(\n", + " [r[1] for r in individual_results], on=['unique_id', 'ds']\n", + ")\n", + "pd.testing.assert_frame_equal(transformed_df, expected_transformed)\n", + "pd.testing.assert_frame_equal(future_df, expected_future)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3d65447-93e8-4692-ad34-2a21852fb504", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "#| polars\n", + "transformed_pl, future_pl = pipeline(\n", + " series_pl,\n", + " features=features,\n", + " freq='1d',\n", + " h=1,\n", + ")\n", + "pd.testing.assert_frame_equal(\n", + " transformed_pl.drop('unique_id').to_pandas(),\n", + " transformed_df.drop(columns='unique_id'),\n", + ")\n", + "pd.testing.assert_frame_equal(\n", + " future_pl.drop('unique_id').to_pandas(),\n", + " future_df.drop(columns='unique_id'),\n", + ")" + ] } ], "metadata": { diff --git a/pyproject.toml b/pyproject.toml index 6eba2bb..f06ef84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,5 +4,5 @@ ignore_missing_imports = true module = 'utilsforecast.compat' ignore_errors = true -[tool.ruff] +[tool.ruff.lint] select = ["F", "ARG"] diff --git a/settings.ini b/settings.ini index eabd221..d0d8c8c 100644 --- a/settings.ini +++ b/settings.ini @@ -1,7 +1,7 @@ [DEFAULT] repo = utilsforecast lib_name = utilsforecast -version = 0.1.0 +version = 0.1.1 min_python = 3.8 license = apache2 black_formatting = True diff --git a/utilsforecast/__init__.py b/utilsforecast/__init__.py index 3dc1f76..485f44a 100644 --- a/utilsforecast/__init__.py +++ b/utilsforecast/__init__.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "0.1.1" diff --git a/utilsforecast/_modidx.py b/utilsforecast/_modidx.py index a202c04..d1572ed 100644 --- a/utilsforecast/_modidx.py +++ b/utilsforecast/_modidx.py @@ -14,8 +14,20 @@ 'utilsforecast.evaluation._quantiles_from_levels': ( 'evaluation.html#_quantiles_from_levels', 'utilsforecast/evaluation.py'), 'utilsforecast.evaluation.evaluate': ('evaluation.html#evaluate', 'utilsforecast/evaluation.py')}, - 'utilsforecast.feature_engineering': { 'utilsforecast.feature_engineering.fourier': ( 'feature_engineering.html#fourier', - 'utilsforecast/feature_engineering.py')}, + 'utilsforecast.feature_engineering': { 
'utilsforecast.feature_engineering._add_features': ( 'feature_engineering.html#_add_features', + 'utilsforecast/feature_engineering.py'), + 'utilsforecast.feature_engineering._assign_slices': ( 'feature_engineering.html#_assign_slices', + 'utilsforecast/feature_engineering.py'), + 'utilsforecast.feature_engineering._fourier': ( 'feature_engineering.html#_fourier', + 'utilsforecast/feature_engineering.py'), + 'utilsforecast.feature_engineering._trend': ( 'feature_engineering.html#_trend', + 'utilsforecast/feature_engineering.py'), + 'utilsforecast.feature_engineering.fourier': ( 'feature_engineering.html#fourier', + 'utilsforecast/feature_engineering.py'), + 'utilsforecast.feature_engineering.pipeline': ( 'feature_engineering.html#pipeline', + 'utilsforecast/feature_engineering.py'), + 'utilsforecast.feature_engineering.trend': ( 'feature_engineering.html#trend', + 'utilsforecast/feature_engineering.py')}, 'utilsforecast.grouped_array': { 'utilsforecast.grouped_array.GroupedArray': ( 'grouped_array.html#groupedarray', 'utilsforecast/grouped_array.py'), 'utilsforecast.grouped_array.GroupedArray.__getitem__': ( 'grouped_array.html#groupedarray.__getitem__', diff --git a/utilsforecast/feature_engineering.py b/utilsforecast/feature_engineering.py index 1c60b0d..8d2facd 100644 --- a/utilsforecast/feature_engineering.py +++ b/utilsforecast/feature_engineering.py @@ -1,10 +1,11 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/feature_engineering.ipynb. # %% auto 0 -__all__ = ['fourier'] +__all__ = ['fourier', 'trend', 'pipeline'] # %% ../nbs/feature_engineering.ipynb 3 -from typing import Tuple +from functools import partial +from typing import Callable, List, Optional, Tuple import numpy as np @@ -13,6 +14,99 @@ from .validation import validate_format, validate_freq # %% ../nbs/feature_engineering.ipynb 4 +_Features = Tuple[List[str], np.ndarray, np.ndarray] + + +def _add_features( + df: DataFrame, + freq: str, + h: int, + id_col: str, + time_col: str, + f: Callable[[np.ndarray, int], _Features], +) -> Tuple[DataFrame, DataFrame]: + # validations + if not isinstance(h, int) or h < 0: + raise ValueError("`h` must be a non-negative integer") + validate_format(df, id_col, time_col, None) + validate_freq(df[time_col], freq) + + # decompose series + id_counts = ufp.counts_by_id(df, id_col) + uids = id_counts[id_col] + sizes = id_counts["counts"].to_numpy() + + # compute values + cols, vals, future_vals = f(sizes=sizes, h=h) # type: ignore + + # assign back to df + sort_idxs = ufp.maybe_compute_sort_indices(df, id_col, time_col) + times = df[time_col] + if sort_idxs is not None: + restore_idxs = np.empty_like(sort_idxs) + restore_idxs[sort_idxs] = np.arange(sort_idxs.size) + vals = vals[restore_idxs] + times = ufp.take_rows(times, sort_idxs) + last_times = ufp.take_rows(times, sizes.cumsum() - 1) + df = ufp.copy_if_pandas(df, deep=False) + transformed = ufp.assign_columns(df, cols, vals) + + if h == 0: + return transformed, type(df)({}) + + # future vals + future_df = ufp.make_future_dataframe( + uids=uids, + last_times=last_times, + freq=freq, + h=h, + id_col=id_col, + time_col=time_col, + ) + future_df = ufp.assign_columns(future_df, cols, future_vals) + return transformed, future_df + + +def _assign_slices( + sizes: np.ndarray, + feats: np.ndarray, + h: int, +) -> Tuple[np.ndarray, np.ndarray]: + max_samples, n_feats = feats.shape + vals = np.empty((sizes.sum(), n_feats), dtype=np.float32) + future_vals = np.empty((h * sizes.size, n_feats)) + start = 0 + for i, size in enumerate(sizes): + 
vals[start : start + size, :] = feats[max_samples - size - h : max_samples - h] + future_vals[i * h : (i + 1) * h] = feats[max_samples - h :] + start += size + return vals, future_vals + + +def _fourier( + sizes: np.ndarray, + h: int, + season_length: int, + k: int, +) -> _Features: + # taken from: https://github.com/tblume1992/TSUtilities/blob/main/TSUtilities/TSFeatures/fourier_seasonality.py + x = 2 * np.pi * np.arange(1, k + 1) / season_length + x = x.astype(np.float32) + t = np.arange(1, sizes.max() + 1 + h, dtype=np.float32) + x = x * t[:, None] + terms = np.hstack([np.sin(x), np.cos(x)]) + cols = [f"{op}{i+1}_{season_length}" for op in ("sin", "cos") for i in range(k)] + vals, future_vals = _assign_slices(sizes=sizes, feats=terms, h=h) + return cols, vals, future_vals + + +def _trend(sizes: np.ndarray, h: int) -> _Features: + t = np.arange(1, sizes.max() + 1 + h, dtype=np.float32).reshape(-1, 1) + cols = ["trend"] + vals, future_vals = _assign_slices(sizes=sizes, feats=t, h=h) + return cols, vals, future_vals + +# %% ../nbs/feature_engineering.ipynb 5 def fourier( df: DataFrame, freq: str, @@ -48,56 +142,99 @@ def fourier( future_df : pandas or polars DataFrame DataFrame with future values """ - if not isinstance(h, int) or h < 0: - raise ValueError("`h` must be a non-negative integer") - validate_format(df, id_col, time_col, None) - validate_freq(df[time_col], freq) - - # compute terms based on the max size - id_counts = ufp.counts_by_id(df, id_col) - uids = id_counts[id_col] - sizes = id_counts["counts"].to_numpy() - max_size = sizes.max() - # taken from: https://github.com/tblume1992/TSUtilities/blob/main/TSUtilities/TSFeatures/fourier_seasonality.py - x = 2 * np.pi * np.arange(1, k + 1) / season_length - x = x.astype(np.float32) - t = np.arange(1, max_size + 1 + h, dtype=np.float32) - x = x * t[:, None] - terms = np.hstack([np.sin(x), np.cos(x)]) - cols = [f"{op}{i+1}_{season_length}" for op in ("sin", "cos") for i in range(k)] - - # slice for each serie - vals = np.empty((sizes.sum(), 2 * k), dtype=np.float32) - future_vals = np.empty((h * sizes.size, 2 * k), dtype=np.float32) - start = 0 - for i, size in enumerate(sizes): - vals[start : start + size, :] = terms[:size] - future_vals[i * h : (i + 1) * h] = terms[size : size + h] - start += size + f = partial(_fourier, season_length=season_length, k=k) + return _add_features( + df=df, + freq=freq, + h=h, + id_col=id_col, + time_col=time_col, + f=f, + ) - # assign back to df - sort_idxs = ufp.maybe_compute_sort_indices(df, id_col, time_col) - times = df[time_col] - if sort_idxs is not None: - restore_idxs = np.empty_like(sort_idxs) - restore_idxs[sort_idxs] = np.arange(sort_idxs.size) - vals = vals[restore_idxs] - times = ufp.take_rows(times, sort_idxs) - last_times = ufp.take_rows(times, sizes.cumsum() - 1) - df = ufp.copy_if_pandas(df, deep=False) - transformed = ufp.assign_columns(df, cols, vals) +# %% ../nbs/feature_engineering.ipynb 12 +def trend( + df: DataFrame, + freq: str, + h: int = 0, + id_col: str = "unique_id", + time_col: str = "ds", +) -> Tuple[DataFrame, DataFrame]: + """Add a trend column with consecutive integers for training and forecasting - if h == 0: - return transformed, type(df)({}) + Parameters + ---------- + df : pandas or polars DataFrame + Dataframe with ids, times and values for the exogenous regressors. + freq : str or int + Frequency of the data. Must be a valid pandas or polars offset alias, or an integer. + h : int (default=0) + Forecast horizon. 
+    id_col : str (default='unique_id')
+        Column that identifies each series.
+    time_col : str (default='ds')
+        Column that identifies each timestep; its values can be timestamps or integers.
 
-    # future vals
-    future_df = ufp.make_future_dataframe(
-        uids=uids,
-        last_times=last_times,
+    Returns
+    -------
+    transformed_df : pandas or polars DataFrame
+        Original DataFrame with the computed features
+    future_df : pandas or polars DataFrame
+        DataFrame with future values
+    """
+    return _add_features(
+        df=df,
         freq=freq,
         h=h,
         id_col=id_col,
         time_col=time_col,
+        f=_trend,
     )
-    future_df = ufp.assign_columns(future_df, cols, future_vals)
-    return transformed, future_df
+
+
+# %% ../nbs/feature_engineering.ipynb 15
+def pipeline(
+    df: DataFrame,
+    features: List[Callable],
+    freq: str,
+    h: int = 0,
+    id_col: str = "unique_id",
+    time_col: str = "ds",
+) -> Tuple[DataFrame, DataFrame]:
+    """Compute several features for training and forecasting
+
+    Parameters
+    ----------
+    df : pandas or polars DataFrame
+        Dataframe with ids, times and values for the exogenous regressors.
+    features : list of callable
+        List of features to compute. Each must take only df, freq, h, id_col and time_col; fix any other arguments beforehand, e.g. with functools.partial.
+    freq : str or int
+        Frequency of the data. Must be a valid pandas or polars offset alias, or an integer.
+    h : int (default=0)
+        Forecast horizon.
+    id_col : str (default='unique_id')
+        Column that identifies each series.
+    time_col : str (default='ds')
+        Column that identifies each timestep; its values can be timestamps or integers.
+
+    Returns
+    -------
+    transformed_df : pandas or polars DataFrame
+        Original DataFrame with the computed features
+    future_df : pandas or polars DataFrame
+        DataFrame with future values
+    """
+    transformed: Optional[DataFrame] = None
+    future: Optional[DataFrame] = None
+    for f in features:
+        f_transformed, f_future = f(
+            df=df, freq=freq, h=h, id_col=id_col, time_col=time_col
+        )
+        if transformed is None:
+            transformed = f_transformed
+            future = f_future
+        else:
+            feat_cols = [c for c in f_future.columns if c not in (id_col, time_col)]
+            transformed = ufp.horizontal_concat([transformed, f_transformed[feat_cols]])
+            future = ufp.horizontal_concat([future, f_future[feat_cols]])
+    return transformed, future
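 
A minimal usage sketch of how the functions added by this patch compose, mirroring the notebook cells above. The `utilsforecast.data` import path for `generate_series` is an assumption here (the notebook's setup cell is outside this diff); everything else follows the patch.

    from functools import partial

    from utilsforecast.data import generate_series  # assumed import path for the notebook helper
    from utilsforecast.feature_engineering import fourier, pipeline, trend

    # five daily series with a common end date, as in the notebook cells above
    series = generate_series(5, freq='D', equal_ends=True)

    # feature callables may only take (df, freq, h, id_col, time_col);
    # extra arguments such as season_length and k are fixed with functools.partial
    features = [
        trend,
        partial(fourier, season_length=7, k=1),
        partial(fourier, season_length=28, k=1),
    ]

    # transformed_df: original rows plus trend, sin1_7, cos1_7, sin1_28, cos1_28
    # future_df: h rows per series with the same feature columns for forecasting
    transformed_df, future_df = pipeline(series, features=features, freq='D', h=7)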