From d6864c40185275aaed2d20d9f1dacd3fa89b683b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonathan=20M=C3=BChlenpfordt?=
Date: Fri, 28 Oct 2016 04:12:57 +0200
Subject: [PATCH] implemented resampling of marker column

---
 processing.ipynb | 245 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 181 insertions(+), 64 deletions(-)

diff --git a/processing.ipynb b/processing.ipynb
index fae2d48..b0252f8 100644
--- a/processing.ipynb
+++ b/processing.ipynb
@@ -123,7 +123,6 @@
 "import pandas as pd\n",
 "import numpy as np\n",
 "import logging\n",
- "import pycountry\n",
 "import json\n",
 "import sqlite3\n",
 "import yaml\n",
@@ -241,10 +240,10 @@
 "source": [
 "subset = yaml.load('''\n",
 "insert_source_here:\n",
- "- insert_dataset1_here\n",
+ "- insert_dataset1_from_that_source_here\n",
 "- insert_dataset2_here\n",
- "...\n",
- "- ...\n",
+ "more_sources...\n",
+ "- more_data_sets\n",
 "''') # Or\n",
 "subset = None"
 ]
@@ -410,7 +409,7 @@
 }
 },
 "source": [
- "This section: Read each downloaded file into a pandas-DataFrame and merge data from different sources if it has the same time resolution. Takes about 30 minutes to run."
+ "This section: Read each downloaded file into a pandas-DataFrame and merge data from different sources if it has the same time resolution. Takes ~15 minutes to run."
 ]
 },
 {
@@ -421,7 +420,7 @@
 }
 },
 "source": [
- "Set the title of the rows at the top of the data used to store metadata internally."
+ "Set the titles of the rows at the top of the data that are used to store metadata internally"
 ]
 },
 {
@@ -442,7 +441,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Create a dictionary of empty DataFrames to be populated by the rea function."
+ "Create a dictionary of empty DataFrames to be populated by the read function"
 ]
 },
 {
@@ -502,7 +501,7 @@
 }
 },
 "source": [
- "Display the first five columns of the dataframes to get a first impression of the data."
+ "Display some rows of the dataframes to get a first impression of the data."
 ]
 },
 {
@@ -510,7 +509,7 @@
 "execution_count": null,
 "metadata": {
 "collapsed": false,
- "scrolled": true
+ "scrolled": false
 },
 "outputs": [],
 "source": [
@@ -547,13 +546,12 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
- "collapsed": false
+ "collapsed": true
 },
 "outputs": [],
 "source": [
- "with pd.HDFStore('patched.h5') as store:\n",
- "    store['q_raw'] = data_sets['15min']\n",
- "    store['h_raw'] = data_sets['60min']"
+ "data_sets['15min'].to_pickle('raw_15.pickle')\n",
+ "data_sets['60min'].to_pickle('raw_60.pickle')"
 ]
 },
 {
@@ -567,13 +565,12 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
- "collapsed": false
+ "collapsed": true
 },
 "outputs": [],
 "source": [
- "with pd.HDFStore('store.h5') as store:\n",
- "    data_sets['15min'] = store['q']\n",
- "    data_sets['60min'] = store['h']"
+ "#data_sets['15min'] = pd.read_pickle('raw_15.pickle')\n",
+ "data_sets['60min'] = pd.read_pickle('raw_60.pickle')"
 ]
 },
 {
@@ -617,9 +614,11 @@
 }
 },
 "source": [
- "Patch missing data. At this stage, only small gaps (up to 2 hours) are filled by linear interpolation. \n",
+ "Patch missing data. At this stage, only small gaps (up to 2 hours) are filled by linear interpolation. This catches most of the missing data due to daylight saving time transitions, while leaving bigger gaps untouched.\n",
+ "\n",
+ "The exact locations of missing data are stored in the `nan_table` DataFrames.\n",
 "\n",
- "The locations of missing data are stored in the nan_table DataFrame."
+ "Where data has been interpolated, it is marked in a new column `comment`. For example, the comment `solar_DE-transnetbw_generation;` means that in the original data, there is a gap in the solar generation timeseries from TransnetBW in the time period where the marker appears."
 ]
 },
 {
@@ -630,7 +629,7 @@
 }
 },
 "source": [
- "Patch the datasets and display the location of missing Data in the original data."
+ "Patch the datasets and display the location of missing data in the original data. Takes ~30 minutes to run."
 ]
 },
 {
@@ -645,7 +644,7 @@
 },
 "outputs": [],
 "source": [
- "%time data_sets['15min'], nan_table15 = find_nan(data_sets['15min'], headers, patch=True)\n",
+ "#%time data_sets['15min'], nan_table15 = find_nan(data_sets['15min'], headers, patch=True)\n",
 "%time data_sets['60min'], nan_table60 = find_nan(data_sets['60min'], headers, patch=True)"
 ]
 },
@@ -664,9 +663,20 @@
 },
 "outputs": [],
 "source": [
- "with pd.HDFStore('patched.h5') as store:\n",
- "    store['q_patched'] = data_sets['15min']\n",
- "    store['h_patched'] = data_sets['60min']"
+ "data_sets['15min'].to_pickle('patched_15.pickle')\n",
+ "data_sets['60min'].to_pickle('patched_60.pickle')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "data_sets['15min'] = pd.read_pickle('patched_15.pickle')\n",
+ "data_sets['60min'] = pd.read_pickle('patched_60.pickle')"
 ]
 },
 {
@@ -743,22 +753,24 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
- "collapsed": false
+ "collapsed": false,
+ "scrolled": true
 },
 "outputs": [],
 "source": [
- "data_sets['15min'][data_sets['15min']['comment'].notnull()].head()"
+ "data_sets['15min'][data_sets['15min']['comment'].notnull()].tail()"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
- "collapsed": false
+ "collapsed": false,
+ "scrolled": true
 },
 "outputs": [],
 "source": [
- "data_sets['60min'][data_sets['60min']['comment'].notnull()].head()"
+ "data_sets['60min'][data_sets['60min']['comment'].notnull()].tail()"
 ]
 },
 {
@@ -789,7 +801,8 @@
 "source": [
 "For 50 Hertz, it is already in the data.\n",
 "For TenneT, it calculated by substracting offshore from total generation.\n",
- "For Amprion and TransnetBW, onshore wind generation is just total wind generation"
+ "For Amprion and TransnetBW, onshore wind generation is just total wind generation.\n",
+ "Takes <1 second to run."
 ]
 },
 {
@@ -847,7 +860,7 @@
 }
 },
 "source": [
- "The wind and solar in-feed data for the 4 German balancing areas is summed up and stored in in new columns, which are then used to calculate profiles, that is, the share of wind/solar capacity producing at a given time. The column headers are created in the fashion introduced in the read script."
+ "The wind and solar in-feed data for the 4 German balancing areas is summed up and stored in new columns, which are then used to calculate profiles, that is, the share of wind/solar capacity producing at a given time. The column headers are created in the fashion introduced in the read script. Takes 5 seconds to run."
 ]
 },
 {
@@ -917,7 +930,73 @@
 "source": [
 "Some data comes in 15-minute intervals (i.e. German renewable generation), other in 60-minutes (i.e. load data from ENTSO-E and Prices). We resample the 15-minute data to hourly resolution and append it to the 60-minutes dataset.\n",
 "\n",
- "The `.resample('H').mean()` methods calculates the means from the values for 4 quarter hours [:00, :15, :30, :45] of an hour values, inserts that for :00 and drops the other 3 entries."
+ "The marker column is resampled separately in such a way that all information on where data has been interpolated is preserved.\n",
+ "\n",
+ "The `.resample('H').mean()` method calculates the mean of the values for the 4 quarter-hours [:00, :15, :30, :45] of each hour, inserts it at :00 and drops the other 3 entries. Takes 1 minute to run."
 ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "def resample_markers(group):\n",
+ "    '''Resample marker column from 15 to 60 min\n",
+ "    \n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    group: pd.Series\n",
+ "    Series of 4 succeeding quarter-hourly values from the marker column\n",
+ "    that have to be combined into one.\n",
+ "    \n",
+ "    Returns\n",
+ "    ----------\n",
+ "    aggregated_marker : str or np.nan\n",
+ "    If there were any markers in group: the unique values from the marker\n",
+ "    column group joined together in one string, np.nan otherwise\n",
+ "    \n",
+ "    '''\n",
+ "    \n",
+ "    if group.notnull().values.any():\n",
+ "        unpacked = [mark for line in group if type(line) is str for mark in line.split(';')[:-1]]\n",
+ "        aggregated_marker = '; '.join(set(unpacked)) + '; '\n",
+ "    else:\n",
+ "        aggregated_marker = np.nan\n",
+ "    return aggregated_marker"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "marker_col_15 = data_sets['15min']['comment']\n",
+ "marker_col_15 = marker_col_15.groupby(\n",
+ "    pd.Grouper(freq='60Min', closed='left', label='left')).agg(resample_markers)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "marker_col_15 = marker_col_15.reindex(data_sets['60min'].index)\n",
+ "data_sets['60min']['comment'] = (\n",
+ "    data_sets['60min']['comment']\n",
+ "    .str.cat(others=marker_col_15, sep='', na_rep='')\n",
+ "    .replace(to_replace='', value=np.nan))"
+ ]
+ },
 {
@@ -939,6 +1018,18 @@
 "    data_sets['60min'] = resampled"
 ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "data_sets['60min']['2016-09-27 21:45:00':].shape"
+ ]
+ },
 {
 "cell_type": "markdown",
 "metadata": {
@@ -1028,7 +1119,34 @@
 }
 },
 "source": [
- "This section: Save as [Data Package](http://data.okfn.org/doc/tabular-data-package) (data in CSV, metadata in JSON file). All files are saved in the directory of this notebook. Alternative file formats (SQL, XLSX) are also exported. Takes about 1 hour to run."
+ "This section: Save as [Data Package](http://data.okfn.org/doc/tabular-data-package) (data in CSV, metadata in JSON file). All files are saved in the directory of this notebook. Alternative file formats (SQL, XLSX) are also exported. Takes about 1 hour to run.\n",
+ "\n",
+ "But first, create a final savepoint"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "data_sets['15min'].to_pickle('final_15.pickle')\n",
+ "data_sets['60min'].to_pickle('final_60.pickle')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "data_sets = {}\n",
+ "data_sets['15min'] = pd.read_pickle('final_15.pickle')\n",
+ "data_sets['60min'] = pd.read_pickle('final_60.pickle')"
 ]
 },
 {
@@ -1051,9 +1169,14 @@
 },
 "source": [
 "Data are provided in three different \"shapes\": \n",
- "* Singleindex (easy to read for humans, compatible with datapackage standard, small file size) \n",
- "* Multiindex (easy to read into GAMS, not compatible with datapackage standard, small file size) \n",
- "* Stacked (compatible with data package standard, large file size, many rows, too many for Excel) "
+ "- SingleIndex (easy to read for humans, compatible with datapackage standard, small file size)\n",
+ "  - Fileformat: CSV, SQLite\n",
+ "- MultiIndex (easy to read into GAMS, not compatible with datapackage standard, small file size)\n",
+ "  - Fileformat: CSV, Excel\n",
+ "- Stacked (compatible with data package standard, large file size, many rows, too many for Excel) \n",
+ "  - Fileformat: CSV\n",
+ "\n",
+ "The different shapes need to be created internally before they can be saved to files. Takes about 1 minute to run."
 ]
 },
 {
@@ -1064,14 +1187,14 @@
 "nbpresent": {
 "id": "769225c6-31f5-4db8-8d91-32a3f983489c"
 },
- "scrolled": false
+ "scrolled": true
 },
 "outputs": [],
 "source": [
 "%%time\n",
 "data_sets_singleindex = {}\n",
- "#data_sets_multiindex = {}\n",
- "#data_sets_stacked = {}\n",
+ "data_sets_multiindex = {}\n",
+ "data_sets_stacked = {}\n",
 "for res_key, df in data_sets.items():\n",
 "    if df.empty:\n",
 "        continue\n",
@@ -1079,35 +1202,26 @@
 "    for col_name, col in df.iteritems():\n",
 "        if not (col_name[0] in ['ce(s)t-timestamp', 'comment', 'marker'] or\n",
 "                col_name[2] == 'profile'):\n",
- "            df[col_name] = col.map(lambda x: '%.0f' % x)\n",
+ "            df[col_name] = col.round(0)\n",
 "    \n",
 "    df_singleindex = df.copy()\n",
 "\n",
 "    # use first 3 levels of multiindex to create singleindex\n",
 "    df_singleindex.columns = [\n",
- "        '_'.join(col[0:3]) for col in df.columns.values]\n",
+ "        col[0] if col[0] in ['ce(s)t-timestamp', 'comment']\n",
+ "        else '_'.join(col[0:3]) for col in df.columns.values]\n",
 "\n",
- "    data_sets_singleindex[res_key] = df_singleindex\n",
+ "    data_sets_singleindex[res_key + '_singleindex'] = df_singleindex\n",
 "\n",
 "    data_sets_multiindex[res_key + '_multiindex'] = df\n",
 "\n",
 "    stacked = df.copy()\n",
+ "    stacked.drop('ce(s)t-timestamp', axis=1, inplace=True)\n",
 "    stacked.columns = stacked.columns.droplevel(['source', 'web'])\n",
 "    stacked = stacked.transpose().stack(dropna=True).to_frame(name='data')\n",
 "    data_sets_stacked[res_key + '_stacked'] = stacked"
 ]
 },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "data_sets['15min']['2015-07-01 12:00':].head()#.map(lambda x: '%.0f' % x)#.round(0)"
- ]
- },
 {
 "cell_type": "markdown",
 "metadata": {
@@ -1143,11 +1257,12 @@
 "source": [
 "%%time \n",
 "for res_key, df in data_sets_singleindex.items():\n",
- "    f = 'time_series' + res_key\n",
+ "    f = 'time_series_' + res_key\n",
 "    df = df.copy()\n",
 "    df.index = df.index.strftime('%Y-%m-%dT%H:%M:%SZ')\n",
- "    df.to_sql(f, sqlite3.connect(f + '.sqlite'),\n",
- "              if_exists='replace', index_label='timestamp')"
+ "    df['ce(s)t-timestamp'] = df['ce(s)t-timestamp'].dt.strftime('%Y-%m-%dT%H:%M:%S%z')\n",
+ "    df.to_sql(f, sqlite3.connect('time_series.sqlite'),\n",
+ "              if_exists='replace', index_label='utc-timestamp')"
 ]
 },
 {
@@ -1169,7 +1284,7 @@
 }
 },
 "source": [
- "This takes days to complete. Better create the Excel files manually!"
+ "Writing the full tables to Excel takes a very long time. As a workaround, only the first five rows are exported. The rest of the data is inserted manually from the CSV."
 ]
 },
 {
@@ -1186,7 +1301,7 @@
 "%%time\n",
 "writer = pd.ExcelWriter('time_series.xlsx')\n",
 "for res_key, df in data_sets_multiindex.items():\n",
- "    df.to_excel(writer, res_key, float_format='%.2f', merge_cells=False)\n",
+ "    df.head().to_excel(writer, res_key, float_format='%.2f', merge_cells=True)\n",
 "writer.save()"
 ]
 },
@@ -1225,14 +1340,16 @@
 "source": [
 "%%time\n",
 "# itertoools.chain() allows iterating over multiple dicts at once\n",
- "for res_key, df in itertools.chain(\n",
- "        data_sets_singleindex.items(),\n",
- "        data_sets_multiindex.items(),\n",
- "        # data_sets_stacked.items()\n",
- "):\n",
- "    f = 'time_series' + res_key\n",
+ "for res_stacking_key, df in itertools.chain(\n",
+ "        #data_sets_singleindex.items(),\n",
+ "        #data_sets_multiindex.items(),):\n",
+ "        data_sets_stacked.items()):\n",
+ "    # convert the format of the ce(s)t-timestamp to ISO-8601\n",
+ "    if not res_stacking_key in ['15min_stacked', '60min_stacked']:\n",
+ "        df.iloc[:,0] = df.iloc[:,0].dt.strftime('%Y-%m-%dT%H:%M:%S%z')\n",
+ "    f = 'time_series_' + res_stacking_key\n",
 "    df.to_csv(f + '.csv', float_format='%.2f',\n",
- "              date_format='%Y-%m-%dT%H:%M:%SZ')"
+ "              date_format='%Y-%m-%dT%H:%M:%S%z')"
 ]
 }
],
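
A minimal sketch of what the `.resample('H').mean()` call described in the patch does with the four quarter-hourly values of an hour, using plain pandas and illustrative toy values:

import pandas as pd

# One hour of toy quarter-hourly values at :00, :15, :30 and :45
quarter_hourly = pd.Series(
    [10.0, 20.0, 30.0, 40.0],
    index=pd.date_range('2016-01-01 00:00', periods=4, freq='15min'))

# The mean of the 4 quarter-hours (25.0) is placed at :00;
# the :15, :30 and :45 entries are dropped.
hourly = quarter_hourly.resample('H').mean()
print(hourly)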
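The marker resampling can be exercised in isolation as well. A small, self-contained sketch that applies the patch's `resample_markers` function to toy data; the marker strings and column content are illustrative, not taken from the real dataset:

import numpy as np
import pandas as pd

def resample_markers(group):
    # Collect the unique markers of the 4 quarter-hours into one string, NaN if there are none
    if group.notnull().values.any():
        unpacked = [mark for line in group if type(line) is str
                    for mark in line.split(';')[:-1]]
        return '; '.join(set(unpacked)) + '; '
    return np.nan

# Toy 15-min comment column: the first hour has two interpolated series, the second hour none
idx = pd.date_range('2016-01-01 00:00', periods=8, freq='15min')
comment_15 = pd.Series(
    [np.nan,
     'solar_DE-transnetbw_generation;',
     'solar_DE-transnetbw_generation;wind_DE-tennet_generation;',
     np.nan] + [np.nan] * 4,
    index=idx)

comment_60 = comment_15.groupby(
    pd.Grouper(freq='60Min', closed='left', label='left')).agg(resample_markers)
print(comment_60)
# 00:00 -> both unique markers joined into one string (set order may vary); 01:00 -> NaN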
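The relationship between the output shapes can likewise be sketched on a toy frame. This assumes, as the patch's code suggests, a five-level column MultiIndex whose last two levels are named `source` and `web`; the other level names and the URLs below are only illustrative:

import pandas as pd

cols = pd.MultiIndex.from_tuples(
    [('solar', 'DE-transnetbw', 'generation', 'TransnetBW', 'http://example.com/transnetbw'),
     ('wind', 'DE-tennet', 'generation', 'TenneT', 'http://example.com/tennet')],
    names=['variable', 'region', 'attribute', 'source', 'web'])
df = pd.DataFrame([[1.0, 3.0], [2.0, 4.0]], columns=cols,
                  index=pd.date_range('2016-01-01', periods=2, freq='60min'))

# SingleIndex shape: join the first 3 column levels into one flat, underscore-separated name
df_singleindex = df.copy()
df_singleindex.columns = ['_'.join(col[0:3]) for col in df.columns.values]
print(df_singleindex.columns.tolist())
# ['solar_DE-transnetbw_generation', 'wind_DE-tennet_generation']

# Stacked shape: drop the source/web levels, then stack the remaining levels into rows
stacked = df.copy()
stacked.columns = stacked.columns.droplevel(['source', 'web'])
stacked = stacked.transpose().stack(dropna=True).to_frame(name='data')
print(stacked.head())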