def generate_date_range(start_time, end_time, clip_start, clip_end, shift, frequency, time_step, time_unit):
    """Build the resampling grid between two timestamps.

    Both bounds are rounded, the end bound is adjusted so that
    ``pd.date_range`` yields the expected last entry, and the clip and shift
    offsets are applied last.

    Args:
        start_time: First timestamp of the series (must support ``.round``).
        end_time: Last timestamp of the series (must support ``.round``).
        clip_start: Number of ``time_unit`` added to the start bound.
        clip_end: Number of ``time_unit`` subtracted from the end bound.
        shift: Number of ``time_unit`` added to both bounds.
        frequency: Value forwarded as ``freq`` to ``pd.date_range``.
        time_step: Number of ``time_unit`` between two consecutive dates.
        time_unit: Name of the unit, as understood by ``get_date_offset``.

    Returns:
        The result of ``pd.date_range(start=..., end=..., freq=frequency)``.
    """
    # Offsets are resolved up front, before any rounding takes place.
    start_clip_offset = get_date_offset(time_unit, clip_start)
    end_clip_offset = get_date_offset(time_unit, clip_end)
    shift_offset = get_date_offset(time_unit, shift)

    supports_unit_rounding = time_unit in ROUND_COMPATIBLE_TIME_UNIT

    # Units that cannot be rounded directly (business day, week, month, year)
    # are rounded to the day instead.
    if supports_unit_rounding:
        rounding_rule = FREQUENCY_STRINGS.get(time_unit)
    else:
        rounding_rule = "D"
    range_start = start_time.round(rounding_rule)
    range_end = end_time.round(rounding_rule)

    # For those units pd.date_range leaves out the end bound unless it sits
    # exactly on a period end, so the end bound is moved to land between the
    # last expected step and the one after it.
    if not supports_unit_rounding:
        if time_unit == "business_days":
            # BDay(0) keeps a business day untouched and otherwise moves to
            # the next business day, so the range starts on a business day.
            range_start = range_start + BDay(0)
            # Day(1) followed by -BDay(1) keeps a business day untouched and
            # otherwise moves back to the previous business day.
            range_end = (range_end + Day(1)) - BDay(1)
        else:
            # One full step minus one day: far enough to keep the last
            # period, but not enough to pull in the following one when the
            # end bound already sits exactly on a period end.
            range_end = range_end + get_date_offset(time_unit, time_step) - Day(1)

    range_start = range_start + start_clip_offset + shift_offset
    range_end = range_end - end_clip_offset + shift_offset

    return pd.date_range(start=range_start, end=range_end, freq=frequency)
a/tests/python/unit/dku_timeseries/resampling/test_resampler_helpers.py b/tests/python/unit/dku_timeseries/resampling/test_resampler_helpers.py index f714df9..08dd3e5 100644 --- a/tests/python/unit/dku_timeseries/resampling/test_resampler_helpers.py +++ b/tests/python/unit/dku_timeseries/resampling/test_resampler_helpers.py @@ -91,7 +91,7 @@ def test_generate_date_range_week(self, config): end_time = pd.Timestamp('2021-01-24 00:00:00') date_range = generate_date_range(start_time, end_time, 0, 0, 0, frequency, time_step, time_unit) - np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2020-12-27', '2021-01-10', '2021-01-24', '2021-02-07'])) + np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2020-12-27', '2021-01-10', '2021-01-24'])) date_range = generate_date_range(start_time, end_time, 1, 0, 1, frequency, time_step, time_unit) np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-10', '2021-01-24', '2021-02-07'])) @@ -145,19 +145,19 @@ def test_generate_date_range_b_days(self, config): time_step = params.time_step date_range = generate_date_range(start_time, end_time, 0, 0, 0, frequency, time_step, time_unit) - np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08', '2021-01-11'])) + np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08'])) clip_start = 1 clip_end = 1 shift = 0 date_range = generate_date_range(start_time, end_time, clip_start, clip_end, shift, frequency, time_step, time_unit) - np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08', '2021-01-11'])) + np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-05', '2021-01-06', '2021-01-07'])) clip_start = 2 clip_end = 2 shift = 0 date_range = generate_date_range(start_time, end_time, clip_start, clip_end, shift, frequency, time_step, 
time_unit) - np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08'])) + np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-06'])) def test_generate_date_range_days(self, config): config["time_unit"] = "days" diff --git a/tests/python/unit/dku_timeseries/resampling/test_resampling_frequencies.py b/tests/python/unit/dku_timeseries/resampling/test_resampling_frequencies.py index 41878d4..3f3c6e6 100644 --- a/tests/python/unit/dku_timeseries/resampling/test_resampling_frequencies.py +++ b/tests/python/unit/dku_timeseries/resampling/test_resampling_frequencies.py @@ -44,7 +44,6 @@ def test_month(self, config, columns): df = get_df("Y", columns) output_df = resampler.transform(df, columns.date) - assert np.mean(output_df[columns.data]) == 316.32550000000003 expected_dates = pd.DatetimeIndex(['1959-12-31T00:00:00.000000000', '1960-02-29T00:00:00.000000000', '1960-04-30T00:00:00.000000000', '1960-06-30T00:00:00.000000000', '1960-08-31T00:00:00.000000000', '1960-10-31T00:00:00.000000000', @@ -54,7 +53,7 @@ def test_month(self, config, columns): '1961-12-31T00:00:00.000000000', '1962-02-28T00:00:00.000000000', '1962-04-30T00:00:00.000000000', '1962-06-30T00:00:00.000000000', '1962-08-31T00:00:00.000000000', '1962-10-31T00:00:00.000000000', - '1962-12-31T00:00:00.000000000', '1963-02-28T00:00:00.000000000']) + '1962-12-31T00:00:00.000000000']) np.testing.assert_array_equal(output_df[columns.date].values, expected_dates) def test_weeks_sunday_end(self, config, columns):