From e72c4686b3b4c330cdf835574174617dc662ea21 Mon Sep 17 00:00:00 2001
From: jgmill <muehlenpfordt@neon-energie.de>
Date: Sun, 1 Jul 2018 14:39:19 +0200
Subject: [PATCH] fix PL price

- Omit Polish price data reported in EUR (keeping PLN prices)
- Turn off interpolation for price data
- Fix CE(S)T timestamp column in Excel
---
 checksums.txt                    | 10 +++++-----
 processing.ipynb                 | 10 +++++++---
 timeseries_scripts/imputation.py |  9 ++++++---
 timeseries_scripts/read.py       |  4 +++-
 4 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/checksums.txt b/checksums.txt
index 4326d6f..198c9f4 100644
--- a/checksums.txt
+++ b/checksums.txt
@@ -1,11 +1,11 @@
-time_series.sqlite,4f5f7d2432cb4c48d29b062ce322b4517c4e82aa6841eb275070017a5cd46158
-time_series.xlsx,3f03c3387da92cef1948154c79d347f52b9acb4726bf4b6b9f164ebf9ee8682a
+time_series.sqlite,82418ddf346af9525101edf78224d9fbb2c9f25fc2d07801e8b707c1ab915b90
+time_series.xlsx,b8d95fc579ae3cf60d00b5f5cffac7e98b3232b79891af7cee199306e0409b5f
 time_series_15min_multiindex.csv,898cff6ff1276097e45a44d4beeb53066297a476af326d25748b9209b56f4ebe
 time_series_15min_singleindex.csv,7bc97b7053f7895aea2a9b3901cfa149eeea4232e4461b6aef870c90be38f151
 time_series_15min_stacked.csv,3b2510aa6cdb3897313003bc743b2be6b8cbf8cdd5b647d242a0cc6feeb3f93d
 time_series_30min_multiindex.csv,1d652f39f755825aa340bbde09f783346bdc53221b5e67bed58474ba97fed412
 time_series_30min_singleindex.csv,1d99520014a6ac8428056fd319d88c4489447cec5d6604f42358a9ea76469d79
 time_series_30min_stacked.csv,21acd2c7c75da1657ea7821bad3a5b9f6c886642df6a1c27cc7fc72d69fbfb19
-time_series_60min_multiindex.csv,f7467de31d1c0f710bba51f4647c1f644eb357f6a414f7e1b6d30b049d1257ef
-time_series_60min_singleindex.csv,20dd12e4073eac0dd8150c34b8d6a0f38d77ec12abad6e59fa8c261837db0fd5
-time_series_60min_stacked.csv,87ceb85ae256d68d8bd93b6b44aeebcce4e31c084e34966d348476db320c43bd
+time_series_60min_multiindex.csv,5bfe62ffa0098629260637e8af795dabdbf8f2a9261350068e36642c39b3e3a4
+time_series_60min_singleindex.csv,6f2fb8b24614f45377518fb6528dc1925aff456ba5705b79b1d4a4fb698867e7
+time_series_60min_stacked.csv,0619a1fc2b9ee104f2609a9ea36361d8e5841d2d64e401ec46bf585362766168
diff --git a/processing.ipynb b/processing.ipynb
index ddff383..d229e83 100644
--- a/processing.ipynb
+++ b/processing.ipynb
@@ -1722,7 +1722,7 @@
     }
    },
    "source": [
-    "Writing the full tables to Excel takes extremely long. As a workaround, only the first 5 rows are exported. The rest of the data can than be inserted manually from the `_multindex.csv` files."
+    "Writing the full tables to Excel takes an extremely long time. As a workaround, only the timestamp columns are exported. The rest of the data can then be inserted manually from the `_multiindex.csv` files."
    ]
   },
   {
@@ -1739,8 +1739,12 @@
     "os.chdir(out_path)\n",
     "writer = pd.ExcelWriter('time_series1.xlsx')\n",
     "for res_key, df in data_sets_multiindex.items():\n",
-    "    df.iloc[:, 0].to_excel(writer, res_key.split('_')[0], float_format='%.2f',\n",
-    "                           merge_cells=True)\n",
+    "    # Need to convert CE(S)T-timestamps to tz-naive, otherwise Excel converts\n",
+    "    # them back to UTC\n",
+    "    excel_timestamps = df.loc[:,(info_cols['cet'], '', '', '', '', '')]\n",
+    "    excel_timestamps = excel_timestamps.dt.tz_localize(None)\n",
+    "    excel_timestamps.to_excel(writer, res_key.split('_')[0],\n",
+    "                              float_format='%.2f', merge_cells=True)\n",
     "    # merge_cells=False doesn't work properly with multiindex\n",
     "writer.save()"
    ]
diff --git a/timeseries_scripts/imputation.py b/timeseries_scripts/imputation.py
index 4809c24..71bef57 100644
--- a/timeseries_scripts/imputation.py
+++ b/timeseries_scripts/imputation.py
@@ -187,21 +187,24 @@ def choose_fill_method(
     '''
     for i, nan_block in nan_blocks.iterrows():
         j = 0
+        if col_name[1] == 'price':
+            # Do not interpolate prices
+            patched_col = col
 
         # Interpolate missing value spans up to 2 hours
-        if nan_block['span'] <= timedelta(hours=2):
+        elif nan_block['span'] <= timedelta(hours=2):
             patched_col, marker_col = my_interpolate(
                 i, j, nan_block, col, col_name, marker_col, nan_blocks,
                 one_period, message)
 
         # Guess missing value spans longer than one hour based on other tsos
         # (Only for German wind and solar generation data)
-        elif col_name[1][:2] == 'DE' and col_name[2] == 'generation':
+        elif col_name[0][:2] == 'DE' and col_name[2] == 'generation_actual':
 
             # NOT IMPLEMENTED
             # patched_col = impute(nan_block, col, col_name, nan_blocks, df, one_period)
             # instead:
-            pass
+            patched_col = col
         else:
             j += 1
             patched_col = col
diff --git a/timeseries_scripts/read.py b/timeseries_scripts/read.py
index 6288a44..191efb7 100644
--- a/timeseries_scripts/read.py
+++ b/timeseries_scripts/read.py
@@ -93,8 +93,10 @@ def read_entso_e_transparency(
         df_raw.replace({'ProductionType_Name': renewables}, inplace=True)
 
     if variable_name == 'Day Ahead Prices':
+        # Omit Polish price data reported in EUR (keeping PLN prices)
+        # (PL data before 2017-03-02 is very messy, so it is dropped entirely)
         no_polish_euro = ~((df_raw['AreaName'] == 'PSE SA BZ') &
-                           (df_raw['Currency_IsoCode'] == 'EUR'))
+                           (df_raw.index < pd.to_datetime('2017-03-02 00:00:00')))
         df_raw = df_raw.loc[no_polish_euro]
 
     # keep only entries for selected geographic entities as specified in