From e72c4686b3b4c330cdf835574174617dc662ea21 Mon Sep 17 00:00:00 2001 From: jgmill Date: Sun, 1 Jul 2018 14:39:19 +0200 Subject: [PATCH] fix PL price - Omit polish price data reported in EUR (keeping PLN prices) - Turn off interpolation for price data - Fix CE(S)T timestamp column in Excel --- checksums.txt | 10 +++++----- processing.ipynb | 10 +++++++--- timeseries_scripts/imputation.py | 9 ++++++--- timeseries_scripts/read.py | 4 +++- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/checksums.txt b/checksums.txt index 4326d6f..198c9f4 100644 --- a/checksums.txt +++ b/checksums.txt @@ -1,11 +1,11 @@ -time_series.sqlite,4f5f7d2432cb4c48d29b062ce322b4517c4e82aa6841eb275070017a5cd46158 -time_series.xlsx,3f03c3387da92cef1948154c79d347f52b9acb4726bf4b6b9f164ebf9ee8682a +time_series.sqlite,82418ddf346af9525101edf78224d9fbb2c9f25fc2d07801e8b707c1ab915b90 +time_series.xlsx,b8d95fc579ae3cf60d00b5f5cffac7e98b3232b79891af7cee199306e0409b5f time_series_15min_multiindex.csv,898cff6ff1276097e45a44d4beeb53066297a476af326d25748b9209b56f4ebe time_series_15min_singleindex.csv,7bc97b7053f7895aea2a9b3901cfa149eeea4232e4461b6aef870c90be38f151 time_series_15min_stacked.csv,3b2510aa6cdb3897313003bc743b2be6b8cbf8cdd5b647d242a0cc6feeb3f93d time_series_30min_multiindex.csv,1d652f39f755825aa340bbde09f783346bdc53221b5e67bed58474ba97fed412 time_series_30min_singleindex.csv,1d99520014a6ac8428056fd319d88c4489447cec5d6604f42358a9ea76469d79 time_series_30min_stacked.csv,21acd2c7c75da1657ea7821bad3a5b9f6c886642df6a1c27cc7fc72d69fbfb19 -time_series_60min_multiindex.csv,f7467de31d1c0f710bba51f4647c1f644eb357f6a414f7e1b6d30b049d1257ef -time_series_60min_singleindex.csv,20dd12e4073eac0dd8150c34b8d6a0f38d77ec12abad6e59fa8c261837db0fd5 -time_series_60min_stacked.csv,87ceb85ae256d68d8bd93b6b44aeebcce4e31c084e34966d348476db320c43bd +time_series_60min_multiindex.csv,5bfe62ffa0098629260637e8af795dabdbf8f2a9261350068e36642c39b3e3a4 
+time_series_60min_singleindex.csv,6f2fb8b24614f45377518fb6528dc1925aff456ba5705b79b1d4a4fb698867e7 +time_series_60min_stacked.csv,0619a1fc2b9ee104f2609a9ea36361d8e5841d2d64e401ec46bf585362766168 diff --git a/processing.ipynb b/processing.ipynb index ddff383..d229e83 100644 --- a/processing.ipynb +++ b/processing.ipynb @@ -1722,7 +1722,7 @@ } }, "source": [ - "Writing the full tables to Excel takes extremely long. As a workaround, only the first 5 rows are exported. The rest of the data can than be inserted manually from the `_multindex.csv` files." + "Writing the full tables to Excel takes extremely long. As a workaround, only the timestamp-columns are exported. The rest of the data can then be inserted manually from the `_multiindex.csv` files." ] }, { @@ -1739,8 +1739,12 @@ "os.chdir(out_path)\n", "writer = pd.ExcelWriter('time_series1.xlsx')\n", "for res_key, df in data_sets_multiindex.items():\n", - " df.iloc[:, 0].to_excel(writer, res_key.split('_')[0], float_format='%.2f',\n", - " merge_cells=True)\n", + " # Need to convert CE(S)T-timestamps to tz-naive, otherwise Excel converts\n", + " # them back to UTC\n", + " excel_timestamps = df.loc[:,(info_cols['cet'], '', '', '', '', '')]\n", + " excel_timestamps = excel_timestamps.dt.tz_localize(None)\n", + " excel_timestamps.to_excel(writer, res_key.split('_')[0],\n", + " float_format='%.2f', merge_cells=True)\n", " # merge_cells=False doesn't work properly with multiindex\n", "writer.save()" ] diff --git a/timeseries_scripts/imputation.py b/timeseries_scripts/imputation.py index 4809c24..71bef57 100644 --- a/timeseries_scripts/imputation.py +++ b/timeseries_scripts/imputation.py @@ -187,21 +187,24 @@ def choose_fill_method( ''' for i, nan_block in nan_blocks.iterrows(): j = 0 + if col_name[1] == 'price': + # Do not interpolate prices + patched_col = col # Interpolate missing value spans up to 2 hours - if nan_block['span'] <= timedelta(hours=2): + elif nan_block['span'] <= timedelta(hours=2): patched_col, 
marker_col = my_interpolate( i, j, nan_block, col, col_name, marker_col, nan_blocks, one_period, message) # Guess missing value spans longer than one hour based on other tsos # (Only for German wind and solar generation data) - elif col_name[1][:2] == 'DE' and col_name[2] == 'generation': + elif col_name[0][:2] == 'DE' and col_name[2] == 'generation_actual': # NOT IMPLEMENTED # patched_col = impute(nan_block, col, col_name, nan_blocks, df, one_period) # instead: - pass + patched_col = col else: j += 1 patched_col = col diff --git a/timeseries_scripts/read.py b/timeseries_scripts/read.py index 6288a44..191efb7 100644 --- a/timeseries_scripts/read.py +++ b/timeseries_scripts/read.py @@ -93,8 +93,10 @@ def read_entso_e_transparency( df_raw.replace({'ProductionType_Name': renewables}, inplace=True) if variable_name == 'Day Ahead Prices': + # Omit Polish price data reported in EUR (keeping PLN prices) + # (Before 2017-03-02, the data is very messy) no_polish_euro = ~((df_raw['AreaName'] == 'PSE SA BZ') & - (df_raw['Currency_IsoCode'] == 'EUR')) + (df_raw.index < pd.to_datetime('2017-03-02 00:00:00'))) df_raw = df_raw.loc[no_polish_euro] # keep only entries for selected geographic entities as specified in