From e1d671dbddf222dbaeb80cf7c4fa8d56b25fa365 Mon Sep 17 00:00:00 2001 From: jgmill Date: Tue, 6 Oct 2020 19:56:21 +0200 Subject: [PATCH] Yearly Update 2020 This time only using data from ENTSO-E Transparency. --- checksums.txt | 21 +- input/areas.csv | 12 +- input/sources.yml | 64 ++-- processing.ipynb | 622 +++++++++++--------------------- requirements.yml | 4 +- timeseries_scripts/make_json.py | 8 +- timeseries_scripts/read.py | 43 ++- 7 files changed, 282 insertions(+), 492 deletions(-) diff --git a/checksums.txt b/checksums.txt index a7c5f9c..a4b42fd 100644 --- a/checksums.txt +++ b/checksums.txt @@ -1,10 +1,11 @@ -time_series.sqlite,4cbe3f00c9ab945df92b475cb89e84edd1c03e4ab2de9e13ce10d7428d5ff7fe -time_series_15min_multiindex.csv,1406e4a4551d3e3763338b670ef81448a7b00343eda36d7bcb3bd875ba773cde -time_series_15min_singleindex.csv,9a0d54da4b2898690f6b2917f6feb1b9ba630d9d4e53fd514e29344f8235c107 -time_series_15min_stacked.csv,b21faab1f280a3d65e88697de33fd94384fad47b61a57099736c4119ff932c0b -time_series_30min_multiindex.csv,3b734a43b338e524e0eb0c1f89cf190cddec3102883b8b7c657c71749b6c8ed9 -time_series_30min_singleindex.csv,a9395820af21c9f50582ff031aaf2eadd317749afdd86439d0c627cf982f57f3 -time_series_30min_stacked.csv,d8e31a0fae2962710469f30584ccd4a504a83dca6896d0f6571eafd729db3e8e -time_series_60min_multiindex.csv,38087091a891bea7a65b51b92413549773eebd8f693f066d0ab54de16d20150c -time_series_60min_singleindex.csv,659fe789af2672aabe989aebc8c5c21052a1a96e4da70b0fc941910a1cd4de9d -time_series_60min_stacked.csv,90e2e450e60d9451b551cf9c4fa254a0818642336f9fe3b568dbedb496a714f6 +time_series.sqlite,37ef381572b669937712f85e8c5b40f4ef6148bf9a288424c503e2db2e72f51c +time_series.xlsx,df774691bad0a4ab5846ec1208a9e9f90c904ea26a28cb67b6c805214c42321d +time_series_15min_multiindex.csv,72574089a06d3531d4264dce59ddd70f3731060438d077723cccad3f423350a2 +time_series_15min_singleindex.csv,194f3ee7d110d04389751602132899172cdd8b7ca73c07d6e9009ddac066b2a1 +time_series_15min_stacked.csv,b040107c2ee5efb163d4301e8e14e0c0b3f4cf550dfdc19e2046b6df15fbeda7 +time_series_30min_multiindex.csv,c52f367e6c902aa54b8365069ddc86cc181273f3fceb56e017798abb0e0323f9 +time_series_30min_singleindex.csv,af27bb28291fa9a1c54ccf8fc831d2b3300cdf2e7e8efd1090874277a4bf900c +time_series_30min_stacked.csv,32e5a66cd1f4a12031db7ad047cf290e5823acb7617cabad5157d09178ddfb2e +time_series_60min_multiindex.csv,f674f1871620fade450ff68b93bd0dfafb5078ae4158c18a951c778f1d38854f +time_series_60min_singleindex.csv,6a7f2bc571314cbf9c321cc03437691cd4be95c3a6f075e60ff99e8035c704c8 +time_series_60min_stacked.csv,28a6cda5e20080db4c258747c02ad666ccb09241250bf244c8d2548f14f41e6d diff --git a/input/areas.csv b/input/areas.csv index 913f481..0c32b3d 100644 --- a/input/areas.csv +++ b/input/areas.csv @@ -16,16 +16,16 @@ DE_AT_LU,,,DE-AT-LU,bidding zone,False,True,False,False,True,10Y1001A1001A63L,DE DK,DK,DK,Denmark,country,True,True,False,False,True,10Y1001A1001A65H,Denmark,Denmark,,,,,True,True,False,True,False,True,False,False DK_1,,DK1,DK1 BZ,bidding zone,False,True,False,False,True,10YDK-1--------W,DK1 BZ / MBA,,DK1,,,DK1,True,True,True,True,True,False,True,False DK_2,,DK2,DK2 BZ,bidding zone,False,True,False,False,True,10YDK-2--------M,DK2 BZ / MBA,,DK2,,,DK2,True,True,True,True,True,False,True,False -DK_energinet,,DK,Energinet CA,control area,False,True,False,False,True,10Y1001A1001A796,"Denmark, Energinet CA",,,,Energinet,,True,True,False,True,False,False,False,False +DK_energinet,,DK,,control area,False,True,False,False,True,10Y1001A1001A796,"Denmark, 
Energinet CA",,,,Energinet,,True,True,False,True,False,False,False,False EE,EE,EE,Estonia,country,True,True,False,False,True,10Y1001A1001A39I,"Estonia, Elering BZ / CA / MBA",Estonia,Elering,,Elering,Elering,True,True,False,False,False,True,False,False ES,ES,ES,Spain,country,True,True,False,False,True,10YES-REE------0,"Spain, REE BZ / CA / MBA",Spain,REE,,REE,REE,True,True,False,False,False,True,False,False FI,FI,FI,Finland,country,True,True,False,False,True,10YFI-1--------U,"Finland, Fingrid BZ / CA / MBA",Finland,Fingrid,,Fingrid,Fingrid,True,True,False,False,False,True,False,False FR,FR,FR,France,country,True,True,False,False,True,10YFR-RTE------C,"France, RTE BZ / CA / MBA",France,RTE,,RTE,RTE,True,True,False,True,False,True,False,False -GB_UKM,GB,GB,United Kingdom,country,True,True,False,True,False,GB,,United Kingdom,,,,,True,True,False,False,False,True,False,True -GB_GBN,,GB,National Grid BZ,country,False,True,False,True,False,10YGB----------A,National Grid BZ / CA/ MBA,Great Britain,National Grid,,National Grid,National Grid,True,True,True,False,False,False,False,True -GB_NIR,,NIE,SONI CA,country,False,True,False,True,False,10Y1001A1001A016,"Northern Ireland, SONI CA",Northern Ireland,,,SONI,,True,True,False,False,False,False,False,True -GB_EAW,,,,country,False,True,False,True,False,,,England and Wales,,,,,False,False,False,False,False,False,True,False -GB_SCT,,,,country,False,True,False,True,False,,,Scotland,,,,,False,False,False,False,False,False,False,False +GB_UKM,GB,GB,United Kingdom,country,False,False,False,True,False,GB,,United Kingdom,,,,,True,True,False,False,False,True,False,True +GB_GBN,,GB,National Grid BZ,country,False,False,False,True,False,10YGB----------A,National Grid BZ / CA/ MBA,Great Britain,National Grid,,National Grid,National Grid,True,True,True,False,False,False,False,True +GB_NIR,,NIE,SONI CA,country,False,False,False,True,False,10Y1001A1001A016,"Northern Ireland, SONI CA",Northern Ireland,,,SONI,,True,True,False,False,False,False,False,True +GB_EAW,,,,country,False,False,False,True,False,,,England and Wales,,,,,False,False,False,False,False,False,True,False +GB_SCT,,,,country,False,False,False,True,False,,,Scotland,,,,,False,False,False,False,False,False,False,False GR,GR,GR,Greece,country,True,True,False,False,True,10YGR-HTSO-----Y,"Greece, IPTO BZ / CA/ MBA",Greece,IPTO,,IPTO,IPTO,True,True,False,False,False,True,False,False HR,HR,HR,Croatia,country,True,True,False,False,True,10YHR-HEP------M,"Croatia, HOPS BZ / CA / MBA",Croatia,HOPS,,HOPS,HOPS,True,False,False,False,False,True,False,False HU,HU,HU,Hungary,country,True,True,True,False,False,10YHU-MAVIR----U,"Hungary, MAVIR CA / BZ / MBA",Hungary,MAVIR,,MAVIR,MAVIR,True,True,False,False,False,True,False,False diff --git a/input/sources.yml b/input/sources.yml index 7e141d9..7ffab03 100644 --- a/input/sources.yml +++ b/input/sources.yml @@ -124,12 +124,12 @@ ENTSO-E Transparency FTP: OPSD: capacity: - url_template: https://data.open-power-system-data.org/renewable_power_plants/2019-04-05/renewable_capacity_timeseries.csv + url_template: https://data.open-power-system-data.org/renewable_power_plants/2020-08-25/renewable_capacity_timeseries.csv filename: renewable_capacity_timeseries.csv url_params_template: frequency: single file start: 1901-01-01 - end: 2019-01-21 + end: 2019-12-31 filetype: csv resolution: - days @@ -142,6 +142,13 @@ OPSD: unit: MW variable: '{variable}' web: https://data.open-power-system-data.org/renewable_power_plants + CZ: + attribute: capacity + region: CZ + source: own calculation 
based on Energetický regulační úřad + unit: MW + variable: '{variable}' + web: https://data.open-power-system-data.org/renewable_power_plants DE: attribute: capacity region: DE @@ -177,34 +184,12 @@ OPSD: unit: MW variable: '{variable}' web: https://data.open-power-system-data.org/renewable_power_plants - - Offshore: + SE: attribute: capacity - region: DE - source: own calculation based on BNetzA and netztransparenz.de - unit: MW - variable: wind_offshore - web: https://data.open-power-system-data.org/renewable_power_plants - Onshore: - attribute: capacity - region: DE - source: own calculation based on BNetzA and netztransparenz.de - unit: MW - variable: wind_onshore - web: https://data.open-power-system-data.org/renewable_power_plants - Wind: - attribute: capacity - region: DE - source: own calculation based on BNetzA and netztransparenz.de - unit: MW - variable: wind - web: https://data.open-power-system-data.org/renewable_power_plants - Solar: - attribute: capacity - region: DE - source: own calculation based on BNetzA and netztransparenz.de + region: SE + source: own calculation based on Vindbrukskollen unit: MW - variable: solar + variable: '{variable}' web: https://data.open-power-system-data.org/renewable_power_plants ENTSO-E Data Portal: @@ -604,7 +589,7 @@ Svenska Kraftnaet: filetype: xls resolution: - 60min - web: https://www.svk.se/aktorsportalen/elmarknad/statistik/ + web: https://www.svk.se/aktorsportalen/elmarknad/kraftsystemdata/elstatistik/ filename: n_fot{u_start:%Y}.xls colmap: load: @@ -613,14 +598,14 @@ Svenska Kraftnaet: source: Svenska Kraftnaet unit: MW variable: load - web: https://www.svk.se/aktorsportalen/elmarknad/statistik/ + web: https://www.svk.se/aktorsportalen/elmarknad/kraftsystemdata/elstatistik/ solar: attribute: generation_actual region: SE source: Svenska Kraftnaet unit: MW variable: solar - web: https://www.svk.se/aktorsportalen/elmarknad/statistik/ + web: https://www.svk.se/aktorsportalen/elmarknad/kraftsystemdata/elstatistik/ wind: attribute: generation_actual region: SE @@ -637,7 +622,7 @@ Svenska Kraftnaet: filetype: xls resolution: - 60min - web: https://www.svk.se/aktorsportalen/elmarknad/statistik/ + web: https://www.svk.se/aktorsportalen/elmarknad/kraftsystemdata/elstatistik/ filename: n_fot201001-06.xls wind_solar_3: url_template: https://www.svk.se/siteassets/aktorsportalen/elmarknad/statistik/forbrukning-tillforsel-per-timme/arkiverade/n_fot2010-07-12.xls @@ -648,7 +633,7 @@ Svenska Kraftnaet: filetype: xls resolution: - 60min - web: https://www.svk.se/aktorsportalen/elmarknad/statistik/ + web: https://www.svk.se/aktorsportalen/elmarknad/kraftsystemdata/elstatistik/ filename: n_fot2010-07-12.xls wind_solar_4: url_template: https://www.svk.se/siteassets/aktorsportalen/elmarknad/statistik/forbrukning-tillforsel-per-timme/arkiverade/n-fot2011-01-12.xls @@ -688,18 +673,18 @@ Svenska Kraftnaet: url_params_template: frequency: yearly start: 2017-01-01 - end: 2018-12-31 + end: 2019-12-31 filetype: xls resolution: - 60min web: https://www.svk.se/aktorsportalen/elmarknad/statistik filename: n_fot{u_start:%Y}-01-12.xls wind_solar_8: - url_template: https://www.svk.se/siteassets/aktorsportalen/elmarknad/statistik/forbrukning-tillforsel-per-timme/forbrukning-och-tillforsel-per-timme-i-normaltid-2019.xls + url_template: https://www.svk.se/siteassets/aktorsportalen/elmarknad/statistik/forbrukning-tillforsel-per-timme/n_fot2020-01-08.xls url_params_template: frequency: yearly - start: 2019-01-01 - end: 2019-12-31 + start: 2020-01-01 + end: 2020-12-31 
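      # A hedged sketch of how the downloader presumably expands entries like the
      # one above (the real logic lives in timeseries_scripts): for a yearly
      # source, the filename/url templates are str.format()-ed once per period
      # start, so `n_fot{u_start:%Y}-01-12.xls` yields one file per year:
      #
      #     from datetime import date
      #     for year in range(2017, 2020):
      #         u_start = date(year, 1, 1)
      #         print('n_fot{u_start:%Y}-01-12.xls'.format(u_start=u_start))
      #     # -> n_fot2017-01-12.xls ... n_fot2019-01-12.xls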
filetype: xls resolution: - 60min @@ -752,6 +737,7 @@ Elia: isEliaConnected: '' frequency: quarterly start: 2013-01-01 + end: recent filetype: xls resolution: - 15min @@ -990,6 +976,7 @@ PSE: web: https://www.pse.pl/web/pse-eng/data/polish-power-system-operation/generation-in-wind-farms APG: + #https://transparency.apg.at/transparency-api/api/v1/Download/AGPT/German/M15/2020-09-01T000000/2020-10-01T000000/1a95fdf3-b194-4877-8381-c412159c8a68/AGPT_2020-08-31T22_00_00Z_2020-09-30T22_00_00Z_15M_de_2020-10-01T20_02_59Z.csv? generation_by_source: url_template: https://www.apg.at/transparency/Download.aspx url_params_template: @@ -1189,6 +1176,9 @@ Elexon: - url: https://downloads.elexonportal.co.uk/file/download/24412 start: 2019-01-01 end: 2019-12-31 + - url: https://downloads.elexonportal.co.uk/file/download/26347 + start: 2020-01-01 + end: 2020-12-31 start: 2005-01-01 end: 2018-12-31 frequency: file list diff --git a/processing.ipynb b/processing.ipynb index d9a9322..db41d94 100644 --- a/processing.ipynb +++ b/processing.ipynb @@ -16,21 +16,15 @@ }, { "cell_type": "markdown", - "metadata": { - "toc": "true" - }, + "metadata": {}, "source": [ "

    "Table of Contents\n",
" + "
" ] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "bba7a58e-f3b2-4d3b-9617-d734c369084f" - } - }, + "metadata": {}, "source": [ "# Introductory Notes" ] @@ -44,11 +38,7 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "ba6b62da-6cee-476b-a563-c945f3fd0f79" - } - }, + "metadata": {}, "source": [ "# Settings" ] @@ -73,22 +63,16 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "init_cell": true - }, + "metadata": {}, "outputs": [], "source": [ - "version = '2019-06-05'\n", - "changes = '''Correct Error in German wind generation data'''" + "version = '2020-10-06'\n", + "changes = '''Yearly update'''" ] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "2b838df4-f987-4ae4-a132-9c898e3ffab1" - } - }, + "metadata": {}, "source": [ "## Import Python libraries" ] @@ -96,13 +80,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "init_cell": true, - "nbpresent": { - "id": "c0035fc6-ff1d-44d8-a3fd-b4c08f53be71" - }, - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "# Python modules\n", @@ -144,9 +122,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "init_cell": true - }, + "metadata": {}, "outputs": [], "source": [ "# Allow pretty-display of multiple variables\n", @@ -169,9 +145,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "init_cell": true - }, + "metadata": {}, "outputs": [], "source": [ "# make sure the working directory is this file's directory\n", @@ -196,6 +170,7 @@ "out_path = os.path.join(save_path, version) \n", "temp_path = os.path.join(save_path, 'temp')\n", "parsed_path = os.path.join(save_path, 'parsed')\n", + "chromedriver_path = os.path.join(home_path, 'chromedriver', 'chromedriver')\n", "for path in [data_path, out_path, temp_path, parsed_path]:\n", " os.makedirs(path, exist_ok=True)\n", "\n", @@ -221,9 +196,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "init_cell": true - }, + "metadata": {}, "outputs": [], "source": [ "# Deciding whether to use the provided database of Terna links\n", @@ -245,9 +218,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "init_cell": true - }, + "metadata": {}, "outputs": [], "source": [ "# Configure the display of logs in the notebook and attach it to the root logger\n", @@ -260,13 +231,14 @@ "script_logger.setLevel(logging.DEBUG)\n", "formatter = logging.Formatter(fmt='%(asctime)s %(name)s %(levelname)s %(message)s',\n", " datefmt='%Y-%m-%d %H:%M:%S',)\n", + "# Set up a logger for logs from the notebook\n", + "logger = logging.getLogger('notebook')\n", + "\n", + "# Set up a logfile and attach it to both loggers\n", "logfile = logging.handlers.TimedRotatingFileHandler(os.path.join(temp_path, 'logfile.log'), when='midnight')\n", "logfile.setFormatter(formatter)\n", "logfile.setLevel(logging.DEBUG) #threshold for log messages in logfile\n", "script_logger.addHandler(logfile)\n", - "\n", - "# Set up a logger for logs from the notebook\n", - "logger = logging.getLogger('notebook')\n", "logger.addHandler(logfile)" ] }, @@ -288,11 +260,7 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "8ed60fdb-284d-43db-b802-8e5c405b8e84" - } - }, + "metadata": {}, "source": [ "## Select timerange" ] @@ -320,17 +288,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "init_cell": true, - "nbpresent": { - "id": "f3008d47-ec89-40d0-85ee-776d110f3bb4" - }, - "scrolled": false - }, + "metadata": {}, "outputs": [], 
"source": [ - "start_from_user = date(2005, 1, 1)\n", - "end_from_user = date(2019, 4, 30)" + "start_from_user = date(2015, 1, 1)\n", + "end_from_user = date(2020, 9, 30)" ] }, { @@ -356,9 +318,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "init_cell": true - }, + "metadata": {}, "outputs": [], "source": [ "archive_version = None # i.e. '2016-07-14'" @@ -381,13 +341,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "with open(sources_yaml_path, 'r', encoding='UTF-8') as f:\n", - " sources = yaml.load(f.read())" + " sources = yaml.full_load (f.read())" ] }, { @@ -419,20 +377,19 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "init_cell": true, - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ - "subset = yaml.load('''\n", - "50Hertz:\n", - "- wind generation_actual with-offshore\n", + "subset = yaml.full_load('''\n", + "ENTSO-E Transparency FTP:\n", + "- Actual Generation per Production Type\n", + "- Actual Total Load\n", + "- Day-ahead Total Load Forecast\n", + "- Day-ahead Prices\n", + "OPSD:\n", + "- capacity\n", "''')\n", - "subset = None # to include all sources\n", - "\n", - "exclude = yaml.load('''\n", - "''')" + "exclude=None" ] }, { @@ -445,14 +402,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "init_cell": true, - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "with open(sources_yaml_path, 'r', encoding='UTF-8') as f:\n", - " sources = yaml.load(f.read())\n", + " sources = yaml.full_load(f.read())\n", "if subset: # eliminate sources and datasets not in subset\n", " sources = {source_name: \n", " {k: v for k, v in sources[source_name].items()\n", @@ -471,9 +425,7 @@ }, { "cell_type": "markdown", - "metadata": { - "toc-hr-collapsed": false - }, + "metadata": {}, "source": [ "# Download" ] @@ -502,7 +454,7 @@ "metadata": {}, "outputs": [], "source": [ - "auth = yaml.load('''\n", + "auth = yaml.full_load('''\n", "ENTSO-E Transparency FTP:\n", " username: your email\n", " password: your password\n", @@ -514,11 +466,7 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "e08368cb-021f-453a-b201-a8d48bc8e4c4" - } - }, + "metadata": {}, "source": [ "## Automatic download (for most sources)" ] @@ -526,14 +474,10 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "nbpresent": { - "id": "0c1eb987-6d5f-4e3d-9248-df80b9f37a49" - } - }, + "metadata": {}, "outputs": [], "source": [ - "download(sources, data_path, input_path, auth,\n", + "download(sources, data_path, input_path, chromedriver_path, auth,\n", " archive_version=None,\n", " start_from_user=start_from_user,\n", " end_from_user=end_from_user,\n", @@ -542,11 +486,7 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "bae7810a-d4af-4021-a9ab-c4b772a2bd53" - } - }, + "metadata": {}, "source": [ "## Manual download" ] @@ -560,11 +500,7 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "a69f8271-e1fc-4e64-814c-4210fb80c006" - } - }, + "metadata": {}, "source": [ "Go to http://osp.energinet.dk/_layouts/Markedsdata/framework/integrations/markedsdatatemplate.aspx.\n", "\n", @@ -647,22 +583,14 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "d0b353e7-179d-4556-bdd2-270192c830fb" - } - }, + "metadata": {}, "source": [ "# Read" ] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "d223207e-2b0e-49ec-8bf6-af2969ee5b28" - } 
- }, + "metadata": {}, "source": [ "This section: Read each downloaded file into a pandas-DataFrame and merge data from different sources if it has the same time resolution. Takes ~15 minutes to run." ] @@ -676,11 +604,7 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "1300cbb2-efde-4844-b08c-fed092023e38" - } - }, + "metadata": {}, "source": [ "Set the title of the rows at the top of the data used to store metadata internally. The order of this list determines the order of the levels in the resulting output." ] @@ -688,13 +612,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "init_cell": true, - "nbpresent": { - "id": "4dc92cc3-c01d-4c83-9252-80958edbe0f9" - }, - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "headers = ['region', 'variable', 'attribute', 'source', 'web', 'unit']" @@ -710,10 +628,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "init_cell": true, - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "areas = pd.read_csv(areas_csv_path)" @@ -729,9 +644,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "areas.loc[areas['area ID'].notnull(), :'EIC'].fillna('')" @@ -755,9 +668,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "areas = pd.read_csv(areas_csv_path)\n", @@ -777,9 +688,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "# Create a dictionary of empty DataFrames to be populated with data\n", @@ -788,7 +697,15 @@ " '60min': pd.DataFrame()}\n", "entso_e = {'15min': pd.DataFrame(),\n", " '30min': pd.DataFrame(),\n", - " '60min': pd.DataFrame()}\n", + " '60min': pd.DataFrame()}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "for filename in os.listdir(parsed_path):\n", " res_key, source_name, dataset_name, = filename.split('_')[:3]\n", " if subset and not source_name in subset.keys():\n", @@ -796,10 +713,10 @@ " logger.info('include %s', filename)\n", " df_portion = pd.read_pickle(os.path.join(parsed_path, filename))\n", "\n", - " if source_name == 'ENTSO-E Transparency FTP':\n", - " dfs = entso_e\n", - " else:\n", - " dfs = data_sets\n", + " #if source_name == 'ENTSO-E Transparency FTP':\n", + " # dfs = entso_e\n", + " #else:\n", + " dfs = data_sets\n", "\n", " if dfs[res_key].empty:\n", " dfs[res_key] = df_portion\n", @@ -812,24 +729,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "for res_key, df in data_sets.items():\n", " logger.info(res_key + ': %s', df.shape)\n", - "for res_key, df in entso_e.items():\n", - " logger.info('ENTSO-E ' + res_key + ': %s', df.shape)" + "#for res_key, df in entso_e.items():\n", + "# logger.info('ENTSO-E ' + res_key + ': %s', df.shape)" ] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "be24bc5f-e281-43bf-853a-e7130d20d2e8" - } - }, + "metadata": {}, "source": [ "Display some rows of the dataframes to get a first impression of the data." 
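The read loop above merges file portions with `DataFrame.combine_first`, which fills gaps in the accumulated frame from the new portion over the union of both indices. A toy illustration:

import numpy as np
import pandas as pd

a = pd.DataFrame({'x': [1.0, np.nan]}, index=[0, 1])
b = pd.DataFrame({'x': [np.nan, 2.0], 'y': [3.0, 4.0]}, index=[1, 2])
a.combine_first(b)
#      x    y
# 0  1.0  NaN
# 1  NaN  3.0
# 2  2.0  4.0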
] @@ -840,7 +751,7 @@ "metadata": {}, "outputs": [], "source": [ - "data_sets['60min'].head()" + "data_sets['60min']" ] }, { @@ -852,11 +763,7 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "019c8ae9-8934-4074-adfd-278acd212152" - } - }, + "metadata": {}, "source": [ "Save the DataFrames created by the read function to disk. This way you have the raw data to fall back to if something goes wrong in the ramainder of this notebook without having to repeat the previos steps." ] @@ -864,9 +771,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "os.chdir(temp_path)\n", @@ -888,9 +793,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "os.chdir(temp_path)\n", @@ -906,33 +809,21 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "026c6ad8-0422-4887-9361-8e45ae33e0c6" - } - }, + "metadata": {}, "source": [ "# Processing" ] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "c446157a-7872-491b-a9c4-0336b87568a6" - } - }, + "metadata": {}, "source": [ "This section: missing data handling, aggregation of sub-national to national data, aggregate 15'-data to 60'-resolution. Takes 30 minutes to run." ] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "0c13e5eb-d8e3-425d-982e-2a25c364cdc5" - } - }, + "metadata": {}, "source": [ "## Missing data handling" ] @@ -946,11 +837,7 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "81d4d772-d3c7-4e07-a0a7-49afb3bdf8f6" - } - }, + "metadata": {}, "source": [ "Patch missing data. At this stage, only small gaps (up to 2 hours) are filled by linear interpolation. This catched most of the missing data due to daylight savings time transitions, while leaving bigger gaps untouched\n", "\n", @@ -959,11 +846,7 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "533cd53f-eb8f-4699-bbf7-72ad1cf90325" - } - }, + "metadata": {}, "source": [ "Patch the datasets and display the location of missing Data in the original data. Takes ~5 minutes to run." ] @@ -971,12 +854,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "nbpresent": { - "id": "33602355-28cb-4d5c-b97e-fe59b9b48883" - }, - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "nan_tables = {}\n", @@ -989,12 +867,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "nbpresent": { - "id": "33602355-28cb-4d5c-b97e-fe59b9b48883" - }, - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "for res_key, df in entso_e.items():\n", @@ -1004,22 +877,14 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "345c31ec-fa0b-4859-a930-8b7c20f1e1bf" - } - }, + "metadata": {}, "source": [ "Execute this to see an example of where the data has been patched." 
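The patching above fills only short gaps by linear interpolation. As a hedged pandas illustration of that rule (the project's own helper in timeseries_scripts implements the real logic; note that `limit` caps how many values are filled rather than skipping longer gaps entirely):

import numpy as np
import pandas as pd

idx = pd.date_range('2020-03-01', periods=12, freq='15min')
s = pd.Series([1.0] * 4 + [np.nan] * 3 + [2.0] * 5, index=idx)
# fill gaps of up to 2 hours (8 steps at 15-minute resolution)
patched = s.interpolate(method='time', limit=8, limit_area='inside')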
] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "4c5c4893-5098-47df-8e68-51b13f5fd484" - } - }, + "metadata": {}, "source": [ "Display the table of regions of missing values" ] @@ -1027,12 +892,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "nbpresent": { - "id": "b5207ca6-4a72-40b6-8afd-3123f66ae323" - }, - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "nan_tables['60min']" @@ -1048,9 +908,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "os.chdir(temp_path)\n", @@ -1075,9 +933,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "os.chdir(temp_path)\n", @@ -1092,9 +948,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "os.chdir(temp_path)\n", @@ -1110,62 +964,95 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "abd61a95-68e4-4bf2-9f1f-ef010b4cd288" - }, - "toc-hr-collapsed": false - }, + "metadata": {}, "source": [ - "## Country specific calculations" + "Some of the following operations require the Dataframes to be lexsorted in the columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for res_key, df in data_sets.items():\n", + " df.sort_index(axis='columns', inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Some of the following operations require the Dataframes to be lexsorted in the columns" + "## Aggregate wind offshore + onshore" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "for res_key, df in data_sets.items():\n", - " df.sort_index(axis='columns', inplace=True)" + " for geo in df.columns.get_level_values(0).unique():\n", + " # we could also include 'generation_forecast'\n", + " for attribute in ['generation_actual']:\n", + " df_wind = df.loc[:, (geo, ['wind_onshore', 'wind_offshore'], attribute)]\n", + " if ('wind_onshore' in df_wind.columns.get_level_values('variable') and\n", + " 'wind_offshore' in df_wind.columns.get_level_values('variable')):\n", + " logger.info(f'aggregate onhore + offshore for {res_key} {geo}')\n", + " \n", + " # skipna=False, otherwise NAs will become zeros after summation\n", + " sum_col = df_wind.sum(axis='columns', skipna=False).to_frame()\n", + "\n", + " # Create a new MultiIndex\n", + " new_col_header = {\n", + " 'region': geo,\n", + " 'variable': 'wind',\n", + " 'attribute': 'generation_actual',\n", + " 'source': 'own calculation based on ENTSO-E Transparency',\n", + " 'web': '',\n", + " 'unit': 'MW'\n", + " }\n", + " new_col_header = tuple(new_col_header[level] for level in headers)\n", + " df[new_col_header] = sum_col\n", + " #df[new_col_header].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dfi = data_sets['15min'].copy()\n", + "dfi.columns = [' '.join(col[:3]).strip() for col in dfi.columns.values]\n", + "dfi.info(verbose=True, null_counts=True)" ] }, { "cell_type": "markdown", - "metadata": { - "toc-hr-collapsed": false - }, + "metadata": {}, + "source": [ + "## Country specific calculations - not used in this release" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [ "### Germany" ] }, { "cell_type": 
"markdown", - "metadata": { - "nbpresent": { - "id": "cbcd1b7c-6c9c-4d1a-ab70-28276b421805" - } - }, + "metadata": {}, "source": [ "#### Aggregate German data from individual TSOs" ] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "9ab0116d-402b-444b-acf2-925388a9886b" - } - }, + "metadata": {}, "source": [ "The wind and solar in-feed data for the 4 German control areas is summed up and stored in a new column. The column headers are created in the fashion introduced in the read script. Takes 5 seconds to run." ] @@ -1173,12 +1060,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "nbpresent": { - "id": "26a83b1c-682d-43ec-bb70-12a096089508" - }, - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "df = data_sets['15min']\n", @@ -1309,11 +1191,12 @@ "outputs": [], "source": [ "for res_key, df in data_sets.items():\n", - " if res_key == '60min':\n", - " continue\n", + " #if res_key == '60min':\n", + " # continue\n", " for col_name, col in df.loc[:,(slice(None), slice(None), 'capacity')].iteritems():\n", - " # Calculate the profile column\n", - " kwargs = {'key': (col_name[0], col_name[1], 'generation_actual'),\n", + " # Get the generation data for the selected capacity column\n", + " kwargs = {\n", + " 'key': (col_name[0], col_name[1], 'generation_actual'),\n", " 'level': ['region', 'variable', 'attribute'],\n", " 'axis': 'columns', 'drop_level': False}\n", " generation_col = df.xs(**kwargs)\n", @@ -1325,6 +1208,7 @@ " continue\n", " if generation_col.size == 0:\n", " continue\n", + " # Calculate the profile column\n", " profile_col = generation_col.divide(col, axis='index').round(4)\n", "\n", " # Create a new MultiIndex\n", @@ -1363,9 +1247,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "for res_key, df in data_sets.items():\n", @@ -1382,9 +1264,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "os.chdir(temp_path)\n", @@ -1412,22 +1292,14 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "4e311f14-5124-47fd-9f94-0a866e9a6f71" - } - }, + "metadata": {}, "source": [ "## Resample higher frequencies to 60'" ] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "882bf163-9e93-44a4-9c51-a3ce0c32b4f4" - } - }, + "metadata": {}, "source": [ "Some data comes in 15 or 30-minute intervals (i.e. German or British renewable generation), other in 60-minutes (i.e. load data from ENTSO-E and Prices). 
We resample the 15 and 30-minute data to hourly resolution and append it to the 60-minutes dataset.\n", "\n", @@ -1437,12 +1309,10 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ - "for ds in [data_sets, entso_e]:\n", + "for ds in [data_sets]:#, entso_e]:\n", " for res_key, df in ds.items():\n", " if res_key == '60min':\n", " continue\n", @@ -1493,12 +1363,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "nbpresent": { - "id": "c19e3eb9-feca-4e94-9035-580aa07189ba" - }, - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "data_cols = data_sets['60min'].columns.droplevel(['source', 'web', 'unit'])\n", @@ -1541,11 +1406,7 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "dea911fb-326f-46a6-a009-57af147d4be4" - } - }, + "metadata": {}, "source": [ "## Insert a column with Central European (Summer-)time" ] @@ -1560,10 +1421,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "init_cell": true, - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "info_cols = {'utc': 'utc_timestamp',\n", @@ -1573,15 +1431,10 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "nbpresent": { - "id": "b1550779-53cc-498d-980b-7aa253974c91" - }, - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ - "for ds in [data_sets, entso_e]:\n", + "for ds in [data_sets]: #, entso_e]:\n", " for res_key, df in ds.items():\n", " if df.empty:\n", " continue\n", @@ -1592,11 +1445,7 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "4e625ea7-c448-45fe-85b2-026157ad24c0" - } - }, + "metadata": {}, "source": [ "# Create a final savepoint" ] @@ -1604,9 +1453,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "data_sets['15min'].to_pickle('final_15.pickle')\n", @@ -1620,9 +1467,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "os.chdir(temp_path)\n", @@ -1668,22 +1513,14 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "fa919796-a7f6-4556-aeed-181ddc6028ac" - } - }, + "metadata": {}, "source": [ "# Write data to disk" ] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "1b3c9199-ce14-4487-939d-656f60c14df3" - } - }, + "metadata": {}, "source": [ "This section: Save as [Data Package](http://data.okfn.org/doc/tabular-data-package) (data in CSV, metadata in JSON file). All files are saved in the directory of this notebook. Alternative file formats (SQL, XLSX) are also exported. Takes about 1 hour to run." 
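For orientation, the downsampling above amounts to an hourly mean per column (MW is power, so mean rather than sum), and the CE(S)T column added later is derived from the UTC index. A compact sketch, with the marker handling and column bookkeeping omitted:

import numpy as np
import pandas as pd

idx = pd.date_range('2020-01-01', periods=8, freq='15min')  # tz-naive UTC
df15 = pd.DataFrame({'load': np.arange(8.0)}, index=idx)

df60 = df15.resample('60min').mean()  # 15-minute MW values -> hourly mean
cet = df60.index.tz_localize('UTC').tz_convert('Europe/Brussels')  # CE(S)T companion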
] @@ -1706,14 +1543,14 @@ " # In order to make sure that the respective time period is covered in both\n", " # UTC and CE(S)T, we set the start in CE(S)T, but the end in UTC\n", " if start_from_user:\n", - " start_from_user = (\n", - " pytz.timezone('Europe/Brussels')\n", + " start_from_user = (pytz.timezone('Europe/Brussels')\n", " .localize(datetime.combine(start_from_user, time()))\n", - " .astimezone(pytz.timezone('UTC')))\n", + " .astimezone(pytz.timezone('UTC'))\n", + " .replace(tzinfo=None))\n", " if end_from_user:\n", - " end_from_user = (\n", - " pytz.timezone('UTC')\n", + " end_from_user = (pytz.timezone('UTC')\n", " .localize(datetime.combine(end_from_user, time()))\n", + " .replace(tzinfo=None)\n", " # Appropriate offset to inlude the end of period\n", " + timedelta(days=1, minutes=-int(res_key[:2])))\n", " # Then cut off the data_set\n", @@ -1722,22 +1559,14 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "47c1efa2-d93f-4d13-81d7-8f64dadeff3f" - } - }, + "metadata": {}, "source": [ "## Different shapes" ] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "a753ac43-a0f4-44bc-a89d-1ccaaf48289a" - } - }, + "metadata": {}, "source": [ "Data are provided in three different \"shapes\": \n", "- SingleIndex (easy to read for humans, compatible with datapackage standard, small file size)\n", @@ -1753,12 +1582,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "nbpresent": { - "id": "769225c6-31f5-4db8-8d91-32a3f983489c" - }, - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "combined_singleindex = {}\n", @@ -1802,22 +1626,14 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "84f1822e-3aa6-42c4-a424-5dc5ab6fa56f" - } - }, + "metadata": {}, "source": [ "## Write to SQLite-database" ] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "3bb78fa9-5309-46b6-b945-68dcb654a567" - } - }, + "metadata": {}, "source": [ "This file format is required for the filtering function on the OPSD website. This takes ~3 minutes to complete." ] @@ -1825,12 +1641,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "nbpresent": { - "id": "fd35212c-ec5c-4fcf-9897-4608742d1bf8" - }, - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "os.chdir(out_path)\n", @@ -1846,22 +1657,14 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "143b39aa-dd2e-4923-be56-bb6c4706837d" - } - }, + "metadata": {}, "source": [ "## Write to Excel" ] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "ef7763f5-9bbc-40b8-8cee-829131b40336" - } - }, + "metadata": {}, "source": [ "Writing the full tables to Excel takes extremely long. As a workaround, only the timestamp-columns are exported. The rest of the data can than be inserted manually from the `_multindex.csv` files." 
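The three shapes are plain reshapes of one MultiIndex frame. A self-contained sketch of the idea (not the exact code above):

import pandas as pd

cols = pd.MultiIndex.from_tuples(
    [('DE', 'wind', 'generation_actual', 'ENTSO-E', '', 'MW')],
    names=['region', 'variable', 'attribute', 'source', 'web', 'unit'])
df = pd.DataFrame([[1.0], [2.0]], columns=cols,
                  index=pd.date_range('2020-01-01', periods=2, freq='60min'))

# SingleIndex: one flat header per series, e.g. 'DE_wind_generation_actual'
single = df.copy()
single.columns = ['_'.join(col[:3]) for col in df.columns]

# Stacked: long format with the header levels as ordinary columns
stacked = (df.droplevel(['source', 'web', 'unit'], axis='columns')
             .stack(['region', 'variable', 'attribute'])
             .rename('value')
             .reset_index())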
] @@ -1869,47 +1672,34 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "nbpresent": { - "id": "9e84c62a-7bd6-4319-89dd-409574dda234" - }, - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "os.chdir(out_path)\n", "writer = pd.ExcelWriter('time_series.xlsx')\n", + "writer.save()\n", "for res_key, df in data_sets.items():\n", " # Need to convert CE(S)T-timestamps to tz-naive, otherwise Excel converts\n", " # them back to UTC\n", " df.loc[:,(info_cols['cet'], '', '', '', '', '')].dt.tz_localize(None).to_excel(writer, res_key)\n", " filename = 'tsos_' + res_key + '.csv'\n", " df.to_csv(filename, float_format='%.4f', date_format='%Y-%m-%dT%H:%M:%SZ')\n", - "for res_key, df in entso_e.items():\n", - " df.loc[:,(info_cols['cet'], '', '', '', '', '')].dt.tz_localize(None).to_excel(writer, res_key+ ' ENTSO-E')\n", - " filename = 'entso_e_' + res_key + '.csv'\n", - " df.to_csv(filename, float_format='%.4f', date_format='%Y-%m-%dT%H:%M:%SZ')\n", - "writer.save()" + "#for res_key, df in entso_e.items():\n", + "# df.loc[:,(info_cols['cet'], '', '', '', '', '')].dt.tz_localize(None).to_excel(writer, res_key+ ' ENTSO-E')\n", + "# filename = 'entso_e_' + res_key + '.csv'\n", + "# df.to_csv(filename, float_format='%.4f', date_format='%Y-%m-%dT%H:%M:%SZ')" ] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "454ee5f5-e8f1-4088-94e9-e846f48ee75b" - } - }, + "metadata": {}, "source": [ "## Write to CSV" ] }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "279e3306-6dea-454d-bec8-0c826509ecd1" - } - }, + "metadata": {}, "source": [ "This takes about 10 minutes to complete." ] @@ -1917,12 +1707,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "nbpresent": { - "id": "89449c49-608d-488d-8bc8-077c64bc26c7" - }, - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "os.chdir(out_path)\n", @@ -1951,11 +1736,7 @@ }, { "cell_type": "markdown", - "metadata": { - "nbpresent": { - "id": "38b6a454-31c4-4112-971f-da8271131d54" - } - }, + "metadata": {}, "source": [ "This section: create the metadata, both general and column-specific. All metadata we be stored as a JSON file. Takes 10s to run." 
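The `tz_localize(None)` in the Excel export above strips the timezone while keeping the CE(S)T wall-clock time, which is what prevents the round-trip back to UTC. A self-contained illustration around a DST transition:

import pandas as pd

ts = pd.Series(pd.date_range('2020-03-29 00:00', periods=4, freq='60min',
                             tz='Europe/Brussels'))
naive = ts.dt.tz_localize(None)  # drop tz info, keep the local wall time
# naive[2] is 2020-03-29 03:00: the spring DST jump stays visible,
# but Excel now stores the wall-clock value instead of converting to UTC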
] @@ -1963,9 +1744,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "os.chdir(out_path)\n", @@ -2007,15 +1786,22 @@ "# leaving a copy in the version directory for reference\n", "copyfile('checksums.txt', os.path.join(home_path, 'checksums.txt'))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "anaconda-cloud": {}, "celltoolbar": "Initialization Cell", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:opsd_time_series_2020]", "language": "python", - "name": "python3" + "name": "conda-env-opsd_time_series_2020-py" }, "language_info": { "codemirror_mode": { @@ -2027,7 +1813,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.8.5" }, "nbpresent": { "slides": { @@ -2086,11 +1872,11 @@ "title_sidebar": "Contents", "toc_cell": true, "toc_position": { - "height": "660px", + "height": "899px", "left": "59px", "right": "2296px", "top": "110px", - "width": "266px" + "width": "442px" }, "toc_section_display": true, "toc_window_display": true @@ -2137,5 +1923,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/requirements.yml b/requirements.yml index 1ca5fa1..977ee35 100644 --- a/requirements.yml +++ b/requirements.yml @@ -4,7 +4,7 @@ channels: - conda-forge dependencies: - - python=3.6 + - python=3.8 - pandas - numpy - xlrd # pandas: excel i/o @@ -16,4 +16,4 @@ dependencies: - requests - jupyter_contrib_nbextensions # nice add-ons for jupyter - selenium # required to scrape URLs from Terna website - - paramiko + - paramiko # for sftp access to ENTSO-E Transparency diff --git a/timeseries_scripts/make_json.py b/timeseries_scripts/make_json.py index 8750363..e55d73c 100644 --- a/timeseries_scripts/make_json.py +++ b/timeseries_scripts/make_json.py @@ -229,7 +229,7 @@ def make_json(data_sets, info_cols, version, changes, headers, areas, if not primary_concept == 'country': geo = geo + ' (' + primary_concept + ')' - descriptions = yaml.load( + descriptions = yaml.full_load( descriptions_template.format(**h, geo=geo)) try: h['description'] = descriptions[h['attribute']] @@ -252,7 +252,7 @@ def make_json(data_sets, info_cols, version, changes, headers, areas, # dicts, this requires first converting it to a tuple, then converting it back to a dict. 
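    # The deduplication idiom in miniature (dicts are unhashable, so they
    # cannot go into a set directly) -- illustration only:
    #   pairs = [{'name': 'A'}, {'name': 'A'}, {'name': 'B'}]
    #   unique = [dict(t) for t in set(tuple(d.items()) for d in pairs)]
    #   # -> [{'name': 'A'}, {'name': 'B'}]  (order is not preserved)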
# entry is a dict of structure {'name': source_name} source_list = [dict(tupleized) for tupleized in set( - tuple(entry.items()) for entry in yaml.load(source_list) + tuple(entry.items()) for entry in yaml.full_load(source_list) if not entry['name'].startswith('own calculation'))] source_list + [{'name': 'German Bundesnetzagentur (BNetzA) and German TSOs via netztransparenz.de'}, {'name': 'United Kingdom Department for Business, Energy and Industrial Strategy (BEIS)'}, @@ -260,13 +260,13 @@ def make_json(data_sets, info_cols, version, changes, headers, areas, {'name': 'Swiss Bundesamt für Energie (BFE)'}] # Parse the YAML-Strings and stitch the building blocks together - metadata = yaml.load(metadata_head.format( + metadata = yaml.full_load(metadata_head.format( version=version, changes=changes, start=start_from_user, end=end_from_user, bytes=os.path.getsize('time_series.xlsx'), hash=get_sha_hash('time_series.xlsx'))) metadata['sources'] = source_list - metadata['resources'] = yaml.load(resource_list) + metadata['resources'] = yaml.full_load(resource_list) # write the metadata to disk datapackage_json = json.dumps(metadata, indent=4, separators=(',', ': ')) diff --git a/timeseries_scripts/read.py b/timeseries_scripts/read.py index bdc0145..f464382 100644 --- a/timeseries_scripts/read.py +++ b/timeseries_scripts/read.py @@ -118,11 +118,22 @@ def read_entso_e_transparency( for res in ['15', '30', '60']: df = (df_raw.loc[df_raw['resolution'] == 'PT' + res + 'M', :] .copy().sort_index(axis='columns')) - df.drop(columns=['resolution'], inplace=True) + + df = df.drop(columns=['resolution']) + + # DST-handling + # Hours 2-3 of the DST-day in March are both labelled 3:00, with no possibility + # to distinguish them. We have to delete both + dst_transitions_spring = [d.replace(hour=3, minute=m) + for d in pytz.timezone('Europe/Paris')._utc_transition_times + if 2000 <= d.year <= datetime.today().year and d.month == 3 + for m in [0, 15, 30, 45]] + + df = df.loc[~df.index.isin(dst_transitions_spring)] # juggle the index and columns df.set_index(stacked, append=True, inplace=True) - # at this point, only the values we are intereseted in are are left as + # at this point, only the values we are intereseted in are left as # columns df.columns.rename(unstacked, inplace=True) df = df.unstack(stacked) @@ -615,13 +626,13 @@ def read_opsd(filepath, param_dict, headers): last = pd.to_datetime([df.index[-1]]) + timedelta(days=1, minutes=59) until_last = df.index.append(last).rename('timestamp') df = df.reindex(index=until_last, method='ffill') - df = df.loc[df.index.year >= 2000] + df = df.loc[(2005 <= df.index.year) & (df.index.year <= 2019)] dfs = {} for timezone, res, ddf in [ ('CET', '15min', df.loc[:, ['DE']]), ('WET', '30min', df.loc[:, ['GB-UKM', 'GB-GBN', 'GB-NIR']]), - ('CET', '60min', df.loc[:, ['CH', 'DK']])]: + ('CET', '60min', df.loc[:, ['CH', 'DK', 'SE']])]: # DST-handling ddf.index = ddf.index.tz_localize(timezone).tz_convert(None) @@ -1070,7 +1081,7 @@ def read_dataset( files_existing = sum([len(files) for r, d, files in os.walk(dataset_dir)]) files_success = 0 - # Check there are files for dataset_name + # Check if there are files for dataset_name if files_existing == 0: logger.warning('no files found') return @@ -1087,12 +1098,12 @@ def read_dataset( # Skip this file if period covered excluded by user if start_from_user: # start lies after file end => filecontent is too old - if start_from_user > yaml.load(container.split('_')[1]): + if start_from_user > 
yaml.full_load(container.split('_')[1]): continue # go to next container if end_from_user: # end lies before file start => filecontent is too recent - if end_from_user < yaml.load(container.split('_')[0]) - timedelta(days=1): + if end_from_user < yaml.full_load(container.split('_')[0]) - timedelta(days=1): continue # go to next container # Check if there is only one file per folder @@ -1208,7 +1219,7 @@ def read_dataset( def trim_df( - df, + df0, res_key, source_name, dataset_name, @@ -1234,14 +1245,14 @@ def trim_df( ''' # sort the index - df.sort_index(axis='index', inplace=True) + df0.sort_index(axis='index', inplace=True) # Reindex with a new index that is sure to be continous in order to later # expose gaps in the data. - no_gaps = pd.date_range(start=df.index[0], - end=df.index[-1], - freq=res_key) - df = df.reindex(index=no_gaps) - missing_rows = df.shape[0] - df.shape[0] + #no_gaps = pd.date_range(start=df0.index[0], + # end=df0.index[-1], + # freq=res_key) + df = df0.asfreq(res_key) + missing_rows = df.shape[0] - df0.shape[0] if not missing_rows == 0: logger.info(' {:20.20} | {:20.20} | {} missing rows' .format(source_name, dataset_name, missing_rows)) @@ -1253,11 +1264,13 @@ def trim_df( start_from_user = ( pytz.timezone('CET') .localize(datetime.combine(start_from_user, time())) - .astimezone(pytz.timezone('UTC'))) + .astimezone(pytz.timezone('UTC')) + .replace(tzinfo=None)) if end_from_user: end_from_user = ( pytz.timezone('UTC') .localize(datetime.combine(end_from_user, time())) + .replace(tzinfo=None) # Appropriate offset to include the end of period (23:45 for the # same day) + timedelta(days=1, minutes=-int(res_key[:2])))
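For reference, the `asfreq` call that replaces the commented-out `pd.date_range` reindex in `trim_df` does the same job: it expands the index to a regular grid so that gaps appear as all-NaN rows, which `missing_rows` then counts. A toy example:

import pandas as pd

idx = pd.to_datetime(['2020-01-01 00:00', '2020-01-01 00:15',
                      '2020-01-01 01:00'])  # 00:30 and 00:45 are missing
df0 = pd.DataFrame({'load': [1.0, 2.0, 3.0]}, index=idx)

df = df0.asfreq('15min')                    # regular 15-minute grid, first to last stamp
missing_rows = df.shape[0] - df0.shape[0]   # -> 2 all-NaN rows were inserted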