Skip to content

Commit

Permalink
Merge pull request #52 from dataiku/fix/resampling-last-timestamp-not…
Browse files Browse the repository at this point in the history
…-round-unit-sc-113991

 fix resampling extra date
  • Loading branch information
StanislasGuinel authored Jan 9, 2023
2 parents bdb662c + e1820aa commit 16b6167
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 16 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

## Version 2.0.2 - Bugfix release - 2023-01
- 🪲 Fix the bug that was adding an extra date at the end after resampling when the last input timestamp was exactly at the end of a period (week, month, half-year, year)

## Version 2.0.1 - Bugfix release - 2021-06
- :bug: Keep the empty values rather than filtering them with the extrapolation method "Don't extrapolate (impute nulls)"
- :scissors: Add the extrapolation method "Don't extrapolate (no imputation)" to filter missing values
Expand Down
4 changes: 2 additions & 2 deletions plugin.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"id": "timeseries-preparation",
"version": "2.0.1",
"version": "2.0.2",
"meta": {
"supportLevel": "SUPPORTED",
"label": "Time Series Preparation",
Expand All @@ -15,4 +15,4 @@
"Time Series"
]
}
}
}
35 changes: 27 additions & 8 deletions python-lib/dku_timeseries/timeseries_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pandas as pd
from pandas.tseries.frequencies import to_offset
from pandas.tseries.offsets import BDay
from pandas.tseries.offsets import Day

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -77,17 +78,35 @@ def get_date_offset(time_unit, offset_value):


def generate_date_range(start_time, end_time, clip_start, clip_end, shift, frequency, time_step, time_unit):
rounding_freq_string = FREQUENCY_STRINGS.get(time_unit)
clip_start_value = get_date_offset(time_unit, clip_start)
clip_end_value = get_date_offset(time_unit, clip_end)
shift_value = get_date_offset(time_unit, shift)
if time_unit in ROUND_COMPATIBLE_TIME_UNIT:
start_index = start_time.round(rounding_freq_string) + clip_start_value + shift_value
end_index = end_time.round(rounding_freq_string) - clip_end_value + shift_value
else: # for week, month, year we round up to closest day
start_index = start_time.round("D") + clip_start_value + shift_value
# for some reason date_range omits the last entry when dealing with months, years
end_index = end_time.round("D") - clip_end_value + get_date_offset(time_unit, time_step) + shift_value

# for business day, week, month, year we round to the nearest day
rounding_freq_string = FREQUENCY_STRINGS.get(time_unit) if time_unit in ROUND_COMPATIBLE_TIME_UNIT else "D"
start_index = start_time.round(rounding_freq_string)
end_index = end_time.round(rounding_freq_string)

if time_unit not in ROUND_COMPATIBLE_TIME_UNIT:
# pd.date_range omits the end index when frequency is business day, week, month or year,
# unless the end index is exactly at the end of the period.
# so we need to offset the end index to make sure it falls between the last time step and the following one
if time_unit == "business_days":
# if start index is not a business day, then we want to start the range on the next Monday
# adding BDay(0) does nothing if the timestamp is already a business day and rolls it forward to the next business day otherwise
start_index = start_index + BDay(0)

# if end index is not a business day, then we want to end the range on the previous Friday
# adding Day(1) then subtracting BDay(1) does nothing if the timestamp is already a business day and rolls it back to the previous business day otherwise
end_index = (end_index + Day(1)) - BDay(1)
else:
# we add one time step minus one day to the end index so that the following timestamp is not included
# when the end index is exactly at the end of the period
end_index = end_index + get_date_offset(time_unit, time_step) - Day(1)

start_index = start_index + clip_start_value + shift_value
end_index = end_index - clip_end_value + shift_value

return pd.date_range(start=start_index, end=end_index, freq=frequency)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def test_generate_date_range_week(self, config):

end_time = pd.Timestamp('2021-01-24 00:00:00')
date_range = generate_date_range(start_time, end_time, 0, 0, 0, frequency, time_step, time_unit)
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2020-12-27', '2021-01-10', '2021-01-24', '2021-02-07']))
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2020-12-27', '2021-01-10', '2021-01-24']))

date_range = generate_date_range(start_time, end_time, 1, 0, 1, frequency, time_step, time_unit)
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-10', '2021-01-24', '2021-02-07']))
Expand Down Expand Up @@ -145,19 +145,19 @@ def test_generate_date_range_b_days(self, config):
time_step = params.time_step

date_range = generate_date_range(start_time, end_time, 0, 0, 0, frequency, time_step, time_unit)
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08', '2021-01-11']))
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08']))

clip_start = 1
clip_end = 1
shift = 0
date_range = generate_date_range(start_time, end_time, clip_start, clip_end, shift, frequency, time_step, time_unit)
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08', '2021-01-11']))
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-05', '2021-01-06', '2021-01-07']))

clip_start = 2
clip_end = 2
shift = 0
date_range = generate_date_range(start_time, end_time, clip_start, clip_end, shift, frequency, time_step, time_unit)
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08']))
np.testing.assert_array_equal(date_range, pd.DatetimeIndex(['2021-01-06']))

def test_generate_date_range_days(self, config):
config["time_unit"] = "days"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ def test_month(self, config, columns):
df = get_df("Y", columns)
output_df = resampler.transform(df, columns.date)

assert np.mean(output_df[columns.data]) == 316.32550000000003
expected_dates = pd.DatetimeIndex(['1959-12-31T00:00:00.000000000', '1960-02-29T00:00:00.000000000',
'1960-04-30T00:00:00.000000000', '1960-06-30T00:00:00.000000000',
'1960-08-31T00:00:00.000000000', '1960-10-31T00:00:00.000000000',
Expand All @@ -54,7 +53,7 @@ def test_month(self, config, columns):
'1961-12-31T00:00:00.000000000', '1962-02-28T00:00:00.000000000',
'1962-04-30T00:00:00.000000000', '1962-06-30T00:00:00.000000000',
'1962-08-31T00:00:00.000000000', '1962-10-31T00:00:00.000000000',
'1962-12-31T00:00:00.000000000', '1963-02-28T00:00:00.000000000'])
'1962-12-31T00:00:00.000000000'])
np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)

def test_weeks_sunday_end(self, config, columns):
Expand Down

0 comments on commit 16b6167

Please sign in to comment.