From d6864c40185275aaed2d20d9f1dacd3fa89b683b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonathan=20M=C3=BChlenpfordt?=
Date: Fri, 28 Oct 2016 04:12:57 +0200
Subject: [PATCH] implemented resampling of marker column

---
 processing.ipynb | 245 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 181 insertions(+), 64 deletions(-)

diff --git a/processing.ipynb b/processing.ipynb
index fae2d48..b0252f8 100644
--- a/processing.ipynb
+++ b/processing.ipynb
@@ -123,7 +123,6 @@
 "import pandas as pd\n",
 "import numpy as np\n",
 "import logging\n",
- "import pycountry\n",
 "import json\n",
 "import sqlite3\n",
 "import yaml\n",
@@ -241,10 +240,10 @@
 "source": [
 "subset = yaml.load('''\n",
 "insert_source_here:\n",
- "- insert_dataset1_here\n",
+ "- insert_dataset1_from_that_source_here\n",
 "- insert_dataset2_here\n",
- "...\n",
- "- ...\n",
+ "more_sources...\n",
+ "- more_data_sets\n",
 "''') # Or\n",
 "subset = None"
 ]
@@ -410,7 +409,7 @@
 }
 },
 "source": [
- "This section: Read each downloaded file into a pandas-DataFrame and merge data from different sources if it has the same time resolution. Takes about 30 minutes to run."
+ "This section: Read each downloaded file into a pandas-DataFrame and merge data from different sources if it has the same time resolution. Takes ~15 minutes to run."
 ]
 },
 {
@@ -421,7 +420,7 @@
 }
 },
 "source": [
- "Set the title of the rows at the top of the data used to store metadata internally."
+ "Set the titles of the rows at the top of the data that are used to store metadata internally"
 ]
 },
 {
@@ -442,7 +441,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Create a dictionary of empty DataFrames to be populated by the rea function."
+ "Create a dictionary of empty DataFrames to be populated by the read function"
 ]
 },
 {
@@ -502,7 +501,7 @@
 }
 },
 "source": [
- "Display the first five columns of the dataframes to get a first impression of the data."
+ "Display some rows of the dataframes to get a first impression of the data."
 ]
 },
 {
@@ -510,7 +509,7 @@
 "execution_count": null,
 "metadata": {
 "collapsed": false,
- "scrolled": true
+ "scrolled": false
 },
 "outputs": [],
 "source": [
@@ -547,13 +546,12 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
- "collapsed": false
+ "collapsed": true
 },
 "outputs": [],
 "source": [
- "with pd.HDFStore('patched.h5') as store:\n",
- "    store['q_raw'] = data_sets['15min']\n",
- "    store['h_raw'] = data_sets['60min']"
+ "data_sets['15min'].to_pickle('raw_15.pickle')\n",
+ "data_sets['60min'].to_pickle('raw_60.pickle')"
 ]
 },
 {
@@ -567,13 +565,12 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
- "collapsed": false
+ "collapsed": true
 },
 "outputs": [],
 "source": [
- "with pd.HDFStore('store.h5') as store:\n",
- "    data_sets['15min'] = store['q']\n",
- "    data_sets['60min'] = store['h']"
+ "#data_sets['15min'] = pd.read_pickle('raw_15.pickle')\n",
+ "data_sets['60min'] = pd.read_pickle('raw_60.pickle')"
 ]
 },
 {
@@ -617,9 +614,11 @@
 }
 },
 "source": [
- "Patch missing data. At this stage, only small gaps (up to 2 hours) are filled by linear interpolation. \n",
+ "Patch missing data. At this stage, only small gaps (up to 2 hours) are filled by linear interpolation. This catches most of the missing data due to daylight saving time transitions, while leaving bigger gaps untouched.\n",
+ "\n",
+ "The exact locations of missing data are stored in the `nan_table` DataFrames.\n",
 "\n",
- "The locations of missing data are stored in the nan_table DataFrame."
+ "Where data has been interpolated, it is marked in a new column `comment`. For example, the comment `solar_DE-transnetbw_generation;` means that in the original data, there is a gap in the solar generation timeseries from TransnetBW in the time period where the marker appears."
 ]
 },
 {
@@ -630,7 +629,7 @@
 }
 },
 "source": [
- "Patch the datasets and display the location of missing Data in the original data."
+ "Patch the datasets and display the location of missing data in the original data. Takes ~30 minutes to run."
 ]
 },
 {
@@ -645,7 +644,7 @@
 },
 "outputs": [],
 "source": [
- "%time data_sets['15min'], nan_table15 = find_nan(data_sets['15min'], headers, patch=True)\n",
+ "#%time data_sets['15min'], nan_table15 = find_nan(data_sets['15min'], headers, patch=True)\n",
 "%time data_sets['60min'], nan_table60 = find_nan(data_sets['60min'], headers, patch=True)"
 ]
 },
@@ -664,9 +663,20 @@
 },
 "outputs": [],
 "source": [
- "with pd.HDFStore('patched.h5') as store:\n",
- "    store['q_patched'] = data_sets['15min']\n",
- "    store['h_patched'] = data_sets['60min']"
+ "data_sets['15min'].to_pickle('patched_15.pickle')\n",
+ "data_sets['60min'].to_pickle('patched_60.pickle')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "data_sets['15min'] = pd.read_pickle('patched_15.pickle')\n",
+ "data_sets['60min'] = pd.read_pickle('patched_60.pickle')"
 ]
 },
 {
@@ -743,22 +753,24 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
- "collapsed": false
+ "collapsed": false,
+ "scrolled": true
 },
 "outputs": [],
 "source": [
- "data_sets['15min'][data_sets['15min']['comment'].notnull()].head()"
+ "data_sets['15min'][data_sets['15min']['comment'].notnull()].tail()"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
- "collapsed": false
+ "collapsed": false,
+ "scrolled": true
 },
 "outputs": [],
 "source": [
- "data_sets['60min'][data_sets['60min']['comment'].notnull()].head()"
+ "data_sets['60min'][data_sets['60min']['comment'].notnull()].tail()"
 ]
 },
 {
@@ -789,7 +801,8 @@
 "source": [
 "For 50 Hertz, it is already in the data.\n",
 "For TenneT, it calculated by substracting offshore from total generation.\n",
- "For Amprion and TransnetBW, onshore wind generation is just total wind generation"
+ "For Amprion and TransnetBW, onshore wind generation is just total wind generation.\n",
+ "Takes <1 second to run."
 ]
 },
 {
@@ -847,7 +860,7 @@
 }
 },
 "source": [
- "The wind and solar in-feed data for the 4 German balancing areas is summed up and stored in in new columns, which are then used to calculate profiles, that is, the share of wind/solar capacity producing at a given time. The column headers are created in the fashion introduced in the read script."
+ "The wind and solar in-feed data for the 4 German balancing areas is summed up and stored in new columns, which are then used to calculate profiles, that is, the share of wind/solar capacity producing at a given time. The column headers are created in the fashion introduced in the read script. Takes 5 seconds to run."
 ]
 },
 {
@@ -917,7 +930,73 @@
 "source": [
 "Some data comes in 15-minute intervals (i.e. German renewable generation), other in 60-minutes (i.e. load data from ENTSO-E and Prices). We resample the 15-minute data to hourly resolution and append it to the 60-minutes dataset.\n",
 "\n",
- "The `.resample('H').mean()` methods calculates the means from the values for 4 quarter hours [:00, :15, :30, :45] of an hour values, inserts that for :00 and drops the other 3 entries."
+ "The marker column is resampled separately in such a way that all information on where data has been interpolated is preserved.\n",
+ "\n",
+ "The `.resample('H').mean()` method calculates the mean of the values for the 4 quarter-hours [:00, :15, :30, :45] of each hour, inserts it at :00 and drops the other 3 entries. Takes 1 minute to run."
 ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "def resample_markers(group):\n",
+ "    '''Resample marker column from 15 to 60 min\n",
+ "    \n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    group: pd.Series\n",
+ "    Series of 4 succeeding quarter-hourly values from the marker column\n",
+ "    that have to be combined into one.\n",
+ "    \n",
+ "    Returns\n",
+ "    ----------\n",
+ "    aggregated_marker : str or np.nan\n",
+ "    If there were any markers in group: the unique values from the marker\n",
+ "    column group joined together in one string, np.nan otherwise\n",
+ "    \n",
+ "    '''\n",
+ "    \n",
+ "    if group.notnull().values.any():\n",
+ "        unpacked = [mark for line in group if type(line) is str for mark in line.split(';')[:-1]]\n",
+ "        aggregated_marker = '; '.join(set(unpacked)) + '; '\n",
+ "    else:\n",
+ "        aggregated_marker = np.nan\n",
+ "    return aggregated_marker"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "marker_col_15 = data_sets['15min']['comment']\n",
+ "marker_col_15 = marker_col_15.groupby(\n",
+ "    pd.Grouper(freq='60Min', closed='left', label='left')).agg(resample_markers)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "marker_col_15 = marker_col_15.reindex(data_sets['60min'].index)\n",
+ "data_sets['60min']['comment'] = (\n",
+ "    data_sets['60min']['comment']\n",
+ "    .str.cat(others=marker_col_15, sep='', na_rep='')\n",
+ "    .replace(to_replace='', value=np.nan))"
+ ]
+ },
 {
@@ -939,6 +1018,18 @@
 "    data_sets['60min'] = resampled"
 ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "data_sets['60min']['2016-09-27 21:45:00':].shape"
+ ]
+ },
 {
 "cell_type": "markdown",
 "metadata": {
@@ -1028,7 +1119,34 @@
 }
 },
 "source": [
- "This section: Save as [Data Package](http://data.okfn.org/doc/tabular-data-package) (data in CSV, metadata in JSON file). All files are saved in the directory of this notebook. Alternative file formats (SQL, XLSX) are also exported. Takes about 1 hour to run."
+ "This section: Save as [Data Package](http://data.okfn.org/doc/tabular-data-package) (data in CSV, metadata in JSON file). All files are saved in the directory of this notebook. Alternative file formats (SQL, XLSX) are also exported. Takes about 1 hour to run.\n",
+ "\n",
+ "But first, create a final savepoint"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "data_sets['15min'].to_pickle('final_15.pickle')\n",
+ "data_sets['60min'].to_pickle('final_60.pickle')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "data_sets = {}\n",
+ "data_sets['15min'] = pd.read_pickle('final_15.pickle')\n",
+ "data_sets['60min'] = pd.read_pickle('final_60.pickle')"
 ]
 },
 {
@@ -1051,9 +1169,14 @@
 },
 "source": [
 "Data are provided in three different \"shapes\": \n",
- "* Singleindex (easy to read for humans, compatible with datapackage standard, small file size) \n",
- "* Multiindex (easy to read into GAMS, not compatible with datapackage standard, small file size) \n",
- "* Stacked (compatible with data package standard, large file size, many rows, too many for Excel) "
+ "- SingleIndex (easy to read for humans, compatible with datapackage standard, small file size)\n",
+ "  - Fileformat: CSV, SQLite\n",
+ "- MultiIndex (easy to read into GAMS, not compatible with datapackage standard, small file size)\n",
+ "  - Fileformat: CSV, Excel\n",
+ "- Stacked (compatible with data package standard, large file size, many rows, too many for Excel) \n",
+ "  - Fileformat: CSV\n",
+ "\n",
+ "The different shapes need to be created internally before they can be saved to files. Takes about 1 minute to run."
 ]
 },
 {
@@ -1064,14 +1187,14 @@
 "nbpresent": {
 "id": "769225c6-31f5-4db8-8d91-32a3f983489c"
 },
- "scrolled": false
+ "scrolled": true
 },
 "outputs": [],
 "source": [
 "%%time\n",
 "data_sets_singleindex = {}\n",
- "#data_sets_multiindex = {}\n",
- "#data_sets_stacked = {}\n",
+ "data_sets_multiindex = {}\n",
+ "data_sets_stacked = {}\n",
 "for res_key, df in data_sets.items():\n",
 "    if df.empty:\n",
 "        continue\n",
@@ -1079,35 +1202,26 @@
 "    for col_name, col in df.iteritems():\n",
 "        if not (col_name[0] in ['ce(s)t-timestamp', 'comment', 'marker'] or\n",
 "                col_name[2] == 'profile'):\n",
- "            df[col_name] = col.map(lambda x: '%.0f' % x)\n",
+ "            df[col_name] = col.round(0)\n",
 "    \n",
 "    df_singleindex = df.copy()\n",
 "\n",
 "    # use first 3 levels of multiindex to create singleindex\n",
 "    df_singleindex.columns = [\n",
- "        '_'.join(col[0:3]) for col in df.columns.values]\n",
+ "        col[0] if col[0] in ['ce(s)t-timestamp', 'comment']\n",
+ "        else '_'.join(col[0:3]) for col in df.columns.values]\n",
 "\n",
- "    data_sets_singleindex[res_key] = df_singleindex\n",
+ "    data_sets_singleindex[res_key + '_singleindex'] = df_singleindex\n",
 "\n",
 "    data_sets_multiindex[res_key + '_multiindex'] = df\n",
 "\n",
 "    stacked = df.copy()\n",
+ "    stacked.drop('ce(s)t-timestamp', axis=1, inplace=True)\n",
 "    stacked.columns = stacked.columns.droplevel(['source', 'web'])\n",
 "    stacked = stacked.transpose().stack(dropna=True).to_frame(name='data')\n",
 "    data_sets_stacked[res_key + '_stacked'] = stacked"
 ]
 },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "data_sets['15min']['2015-07-01 12:00':].head()#.map(lambda x: '%.0f' % x)#.round(0)"
- ]
- },
 {
 "cell_type": "markdown",
 "metadata": {
@@ -1143,11 +1257,12 @@
 "source": [
 "%%time \n",
 "for res_key, df in data_sets_singleindex.items():\n",
- "    f = 'time_series' + res_key\n",
+ "    f = 'time_series_' + res_key\n",
 "    df = df.copy()\n",
 "    df.index = df.index.strftime('%Y-%m-%dT%H:%M:%SZ')\n",
- "    df.to_sql(f, sqlite3.connect(f + '.sqlite'),\n",
- "              if_exists='replace', index_label='timestamp')"
+ "    df['ce(s)t-timestamp'] = df['ce(s)t-timestamp'].dt.strftime('%Y-%m-%dT%H:%M:%S%z')\n",
+ "    df.to_sql(f, sqlite3.connect('time_series.sqlite'),\n",
+ "              if_exists='replace', index_label='utc-timestamp')"
 ]
 },
 {
@@ -1169,7 +1284,7 @@
 }
 },
 "source": [
- "This takes days to complete. Better create the Excel files manually!"
+ "Writing the full tables to Excel takes a very long time. As a workaround, only the first five rows are exported. The rest of the data is inserted manually from the CSV."
 ]
 },
 {
@@ -1186,7 +1301,7 @@
 "%%time\n",
 "writer = pd.ExcelWriter('time_series.xlsx')\n",
 "for res_key, df in data_sets_multiindex.items():\n",
- "    df.to_excel(writer, res_key, float_format='%.2f', merge_cells=False)\n",
+ "    df.head().to_excel(writer, res_key, float_format='%.2f', merge_cells=True)\n",
 "writer.save()"
 ]
 },
@@ -1225,14 +1340,16 @@
 "source": [
 "%%time\n",
 "# itertoools.chain() allows iterating over multiple dicts at once\n",
- "for res_key, df in itertools.chain(\n",
- "        data_sets_singleindex.items(),\n",
- "        data_sets_multiindex.items(),\n",
- "        # data_sets_stacked.items()\n",
- "):\n",
- "    f = 'time_series' + res_key\n",
+ "for res_stacking_key, df in itertools.chain(\n",
+ "        #data_sets_singleindex.items(),\n",
+ "        #data_sets_multiindex.items(),):\n",
+ "        data_sets_stacked.items()):\n",
+ "    # convert the format of the ce(s)t-timestamp to ISO-8601\n",
+ "    if not res_stacking_key in ['15min_stacked', '60min_stacked']:\n",
+ "        df.iloc[:,0] = df.iloc[:,0].dt.strftime('%Y-%m-%dT%H:%M:%S%z')\n",
+ "    f = 'time_series_' + res_stacking_key\n",
 "    df.to_csv(f + '.csv', float_format='%.2f',\n",
- "              date_format='%Y-%m-%dT%H:%M:%SZ')"
+ "              date_format='%Y-%m-%dT%H:%M:%S%z')"
 ]
 }
],
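
A minimal sketch of what the `.resample('H').mean()` call described in the patch does with the four quarter-hourly values of an hour, using plain pandas and illustrative toy values:

import pandas as pd

# One hour of toy quarter-hourly values at :00, :15, :30 and :45
quarter_hourly = pd.Series(
    [10.0, 20.0, 30.0, 40.0],
    index=pd.date_range('2016-01-01 00:00', periods=4, freq='15min'))

# The mean of the 4 quarter-hours (25.0) is placed at :00;
# the :15, :30 and :45 entries are dropped.
hourly = quarter_hourly.resample('H').mean()
print(hourly)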
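The marker resampling can be exercised in isolation as well. A small, self-contained sketch that applies the patch's `resample_markers` function to toy data; the marker strings and column content are illustrative, not taken from the real dataset:

import numpy as np
import pandas as pd

def resample_markers(group):
    # Collect the unique markers of the 4 quarter-hours into one string, NaN if there are none
    if group.notnull().values.any():
        unpacked = [mark for line in group if type(line) is str
                    for mark in line.split(';')[:-1]]
        return '; '.join(set(unpacked)) + '; '
    return np.nan

# Toy 15-min comment column: the first hour has two interpolated series, the second hour none
idx = pd.date_range('2016-01-01 00:00', periods=8, freq='15min')
comment_15 = pd.Series(
    [np.nan,
     'solar_DE-transnetbw_generation;',
     'solar_DE-transnetbw_generation;wind_DE-tennet_generation;',
     np.nan] + [np.nan] * 4,
    index=idx)

comment_60 = comment_15.groupby(
    pd.Grouper(freq='60Min', closed='left', label='left')).agg(resample_markers)
print(comment_60)
# 00:00 -> both unique markers joined into one string (set order may vary); 01:00 -> NaN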
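The relationship between the output shapes can likewise be sketched on a toy frame. This assumes, as the patch's code suggests, a five-level column MultiIndex whose last two levels are named `source` and `web`; the other level names and the URLs below are only illustrative:

import pandas as pd

cols = pd.MultiIndex.from_tuples(
    [('solar', 'DE-transnetbw', 'generation', 'TransnetBW', 'http://example.com/transnetbw'),
     ('wind', 'DE-tennet', 'generation', 'TenneT', 'http://example.com/tennet')],
    names=['variable', 'region', 'attribute', 'source', 'web'])
df = pd.DataFrame([[1.0, 3.0], [2.0, 4.0]], columns=cols,
                  index=pd.date_range('2016-01-01', periods=2, freq='60min'))

# SingleIndex shape: join the first 3 column levels into one flat, underscore-separated name
df_singleindex = df.copy()
df_singleindex.columns = ['_'.join(col[0:3]) for col in df.columns.values]
print(df_singleindex.columns.tolist())
# ['solar_DE-transnetbw_generation', 'wind_DE-tennet_generation']

# Stacked shape: drop the source/web levels, then stack the remaining levels into rows
stacked = df.copy()
stacked.columns = stacked.columns.droplevel(['source', 'web'])
stacked = stacked.transpose().stack(dropna=True).to_frame(name='data')
print(stacked.head())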