From ec3c56ffc96e6c31f72a238295f0d1ab86408f6d Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Fri, 10 Feb 2023 17:23:27 -0800
Subject: [PATCH 01/27] update warning threshold

---
 src/validation.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/validation.py b/src/validation.py
index 15d05827..f157c292 100644
--- a/src/validation.py
+++ b/src/validation.py
@@ -57,16 +57,24 @@ def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated):
             "fuel_consumed_for_electricity_mmbtu",
         ]
     ].sum()
-    # calculate the difference between the values
-    plant_total_diff = plant_total_gf - plant_total_alloc
-    # flag values where the absolute difference is greater than 10 mwh or mmbtu
+    # calculate the percentage difference between the values
+    plant_total_diff = (plant_total_alloc - plant_total_gf) / plant_total_gf
+    # flag rows where the absolute percentage difference is greater than our threshold
+    threshold_percent = 0.05
     mismatched_allocation = plant_total_diff[
-        (abs(plant_total_diff["fuel_consumed_mmbtu"]) > 10)
-        | (abs(plant_total_diff["net_generation_mwh"]) > 10)
+        (abs(plant_total_diff["fuel_consumed_mmbtu"]) > threshold_percent)
+        | (abs(plant_total_diff["net_generation_mwh"]) > threshold_percent)
     ]
     if len(mismatched_allocation) > 0:
-        print("WARNING: Allocated EIA-923 doesn't match input data for plants:")
+        print(
+            "WARNING: Allocated EIA-923 data doesn't match input data for the following plants:"
+        )
+        print("Percentage Difference:")
         print(mismatched_allocation)
+        print("EIA-923 Input Totals:")
+        print(plant_total_gf.loc[mismatched_allocation.index, :])
+        print("Allocated Totals:")
+        print(plant_total_alloc.loc[mismatched_allocation.index, :])
 
 
 def test_for_negative_values(df, small: bool = False):

From 1484db8ffff610d2c1943096d049eed09017c786 Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Fri, 10 Feb 2023 17:34:53 -0800
Subject: [PATCH 02/27] update message

---
 src/validation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/validation.py b/src/validation.py
index f157c292..e88512f2 100644
--- a/src/validation.py
+++ b/src/validation.py
@@ -71,7 +71,7 @@ def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated):
         )
         print("Percentage Difference:")
         print(mismatched_allocation)
-        print("Input Totals:")
+        print("EIA-923 Input Totals:")
         print(plant_total_gf.loc[mismatched_allocation.index, :])
         print("Allocated Totals:")
         print(plant_total_alloc.loc[mismatched_allocation.index, :])

From d1643b13bdd79b08d6a8473fa23b4a1dc0472c8c Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Sat, 11 Feb 2023 08:38:22 -0800
Subject: [PATCH 03/27] change threshold to 0.001

---
 src/data_cleaning.py |  4 +++-
 src/validation.py    | 16 +++++++++++++---
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/data_cleaning.py b/src/data_cleaning.py
index 6ce26ca7..e6a67eb7 100644
--- a/src/data_cleaning.py
+++ b/src/data_cleaning.py
@@ -401,7 +401,9 @@ def clean_eia923(
     )
 
     # test to make sure allocated totals match input totals
-    validation.check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated)
+    validation.check_allocated_gf_matches_input_gf(
+        pudl_out, gen_fuel_allocated, threshold_percent=0.01
+    )
 
     # manually update energy source code when OTH
     gen_fuel_allocated = update_energy_source_codes(gen_fuel_allocated)
diff --git a/src/validation.py b/src/validation.py
index e88512f2..4fa35e5e 100644
--- a/src/validation.py
+++ b/src/validation.py
@@ -40,8 +40,19 @@ def validate_year(year):
         raise UserWarning(year_warning)
 
 
-def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated):
-    """Checks that the allocated generation and fuel from EIA-923 matches the input totals."""
+def check_allocated_gf_matches_input_gf(
+    pudl_out, gen_fuel_allocated, threshold_percent=0.001
+):
+    """
+    Checks that the allocated generation and fuel from EIA-923 matches the input totals.
+
+    Because there might be small rounding errors in the allocation that make the
+    allocated total slightly off from the input data, we allow the user to specify a
+    threshold percentage above which mismatched data is flagged. The default value is
+    0.1%, so that if either the allocated total fuel consumption or allocated total net
+    generation is more than +/-0.1% different from the total input generation or fuel,
+    the record is flagged.
+    """
     gf = pudl_out.gf_eia923()
     plant_total_gf = gf.groupby("plant_id_eia")[
         [
@@ -60,7 +71,6 @@ def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated):
     # calculate the percentage difference between the values
     plant_total_diff = (plant_total_alloc - plant_total_gf) / plant_total_gf
     # flag rows where the absolute percentage difference is greater than our threshold
-    threshold_percent = 0.05
     mismatched_allocation = plant_total_diff[
         (abs(plant_total_diff["fuel_consumed_mmbtu"]) > threshold_percent)
         | (abs(plant_total_diff["net_generation_mwh"]) > threshold_percent)
     ]

From 84ae470132e9412c2fd331cdcc9f599a843a510d Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Sat, 11 Feb 2023 08:39:25 -0800
Subject: [PATCH 04/27] fix threshold in data_cleaning

---
 src/data_cleaning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/data_cleaning.py b/src/data_cleaning.py
index e6a67eb7..558c4b7d 100644
--- a/src/data_cleaning.py
+++ b/src/data_cleaning.py
@@ -402,7 +402,7 @@ def clean_eia923(
 
     # test to make sure allocated totals match input totals
     validation.check_allocated_gf_matches_input_gf(
-        pudl_out, gen_fuel_allocated, threshold_percent=0.01
+        pudl_out, gen_fuel_allocated, threshold_percent=0.001
     )
 
     # manually update energy source code when OTH

From 7fd11a45d24a72c3bb195cfedd9ff97834984668 Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Sat, 11 Feb 2023 13:46:19 -0800
Subject: [PATCH 05/27] fix issue with indexes

---
 notebooks/validation/validate_vs_egrid.ipynb | 136 +++-
 .../GH279_missing_cems_data.ipynb            | 731 ++++++++++++++++++
 src/data_cleaning.py                         |  14 +
 src/emissions.py                             |   7 +-
 src/validation.py                            |  82 ++
 5 files changed, 929 insertions(+), 41 deletions(-)
 create mode 100644 notebooks/work_in_progress/GH279_missing_cems_data.ipynb

diff --git a/notebooks/validation/validate_vs_egrid.ipynb b/notebooks/validation/validate_vs_egrid.ipynb
index 689c0901..70ec8518 100644
--- a/notebooks/validation/validate_vs_egrid.ipynb
+++ b/notebooks/validation/validate_vs_egrid.ipynb
@@ -245,6 +245,36 @@
     "]\n"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Evaluate Plant-level discrepancies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# evaluate all plants\n",
+    "comparison_count, compared = validation.compare_plant_level_results_to_egrid(\n",
+    "    annual_plant_results, egrid_plant, PLANTS_MISSING_FROM_EGRID\n",
+    ")\n",
+    "comparison_count"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "compared[(compared[\"ba_code\"] == \"SOCO\") & (compared[\"co2_mass_lb_status\"] != \"!exact\")]"
+   ]
+  },
   {
    "cell_type": "markdown",
"metadata": {}, @@ -404,23 +434,32 @@ "metadata": {}, "outputs": [], "source": [ - "year = 2020\n", + "year = 2021\n", "path_prefix = year\n", "\n", + "DATA_COLUMNS = [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# Load the eGRID plant table\n", "egrid_plant = validation.load_egrid_plant_file(year)\n", "\n", "egrid_ba = validation.load_egrid_ba_file(year)\n", "\n", "# aggregate the plant data up to the BA level\n", - "data_columns = [\n", - " \"net_generation_mwh\",\n", - " \"fuel_consumed_mmbtu\",\n", - " \"fuel_consumed_for_electricity_mmbtu\",\n", - " \"co2_mass_lb\",\n", - " \"co2_mass_lb_for_electricity_adjusted\",\n", - "]\n", - "egrid_plant_ba_agg = egrid_plant.groupby([\"ba_code\"]).sum()[data_columns].reset_index()\n" + "egrid_plant_ba_agg = egrid_plant.groupby([\"ba_code\"]).sum()[DATA_COLUMNS].reset_index()\n", + "\n", + "egrid_plant_ba_agg[\"generated_co2_rate_lb_per_mwh\"] = egrid_plant_ba_agg[\"co2_mass_lb\"] / egrid_plant_ba_agg[\"net_generation_mwh\"]" ] }, { @@ -430,14 +469,6 @@ "outputs": [], "source": [ "# load our annual ba data\n", - "DATA_COLUMNS = [\n", - " \"net_generation_mwh\",\n", - " \"fuel_consumed_mmbtu\",\n", - " \"fuel_consumed_for_electricity_mmbtu\",\n", - " \"co2_mass_lb\",\n", - " \"co2_mass_lb_adjusted\",\n", - "]\n", - "\n", "calculated_ba = []\n", "\n", "for filename in os.listdir(\n", @@ -455,7 +486,9 @@ " ba_data = ba_data[[\"ba_code\"] + DATA_COLUMNS]\n", " calculated_ba.append(ba_data)\n", "\n", - "calculated_ba = pd.concat(calculated_ba, axis=0)\n" + "calculated_ba = pd.concat(calculated_ba, axis=0)\n", + "\n", + "calculated_ba[\"generated_co2_rate_lb_per_mwh\"] = calculated_ba[\"co2_mass_lb\"] / calculated_ba[\"net_generation_mwh\"]\n" ] }, { @@ -473,15 +506,6 @@ ").round(2)\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "percent_diff_from_egrid.sort_values(by=\"net_generation_mwh\")\n" - ] - }, { "cell_type": "code", "execution_count": null, @@ -497,21 +521,59 @@ " .sort_values(by=\"co2_mass_lb\")\n", " .round(3)\n", ")\n", + "ba_metric = ba_metric - 1\n", "\n", "total = pd.DataFrame(\n", - " calculated_ba[data_columns]\n", + " calculated_ba[DATA_COLUMNS + [\"generated_co2_rate_lb_per_mwh\"]]\n", " .sum()\n", - " .div(egrid_plant_ba_agg[data_columns].sum())\n", + " .div(egrid_plant_ba_agg[DATA_COLUMNS + [\"generated_co2_rate_lb_per_mwh\"]].sum())\n", " .rename(\"Total\")\n", ").T\n", + "total = total - 1\n", "\n", "# calculate the difference in the number of plants in each region\n", - "# plant_count = (plant_annual_total.groupby('ba_code', dropna=False).count()['plant_id_egrid'] - egrid_plant.groupby('ba_code', dropna=False).count()['plant_id_egrid']).rename('num_plants')\n", - "# ba_metric = ba_metric.merge(plant_count, how='left', left_index=True, right_index=True).drop(columns=['plant_id_egrid']).sort_index()\n", + "plant_count = (\n", + " annual_plant_results[\n", + " ~(\n", + " annual_plant_results[\n", + " [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " ]\n", + " ].sum(axis=1)\n", + " == 0\n", + " )\n", + " ]\n", + " .groupby(\"ba_code\", dropna=False)[\"plant_id_eia\"]\n", + " .nunique()\n", + " - egrid_plant[\n", + " ~(\n", + " egrid_plant[\n", + " [\n", + " 
\"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " ]\n", + " ].sum(axis=1)\n", + " == 0\n", + " )\n", + " ]\n", + " .groupby(\"ba_code\", dropna=False)[\"plant_id_eia\"]\n", + " .nunique()\n", + ").rename(\"num_plants\")\n", + "\n", + "ba_metric = ba_metric.merge(\n", + " plant_count, how=\"left\", left_index=True, right_index=True\n", + ").sort_index()\n", + "\n", + "ba_metric = ba_metric.sort_values(by=[\"generated_co2_rate_lb_per_mwh\"], ascending=True)\n", "\n", "ba_metric = pd.concat([ba_metric, total], axis=0).round(2)\n", "\n", - "ba_metric = ba_metric[data_columns]\n", + "ba_metric = ba_metric[DATA_COLUMNS + [\"generated_co2_rate_lb_per_mwh\", \"num_plants\"]]\n", "\n", "columns_to_check = [\n", " \"net_generation_mwh\",\n", @@ -521,19 +583,15 @@ "]\n", "\n", "with pd.option_context(\"display.max_rows\", None, \"display.max_columns\", None):\n", - " display(ba_metric[~(ba_metric[columns_to_check] == 1).all(axis=1)])\n" + " display(ba_metric[~(ba_metric[columns_to_check] == 0).all(axis=1)])\n" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Explore specific plants\n", - "\n", - "### Notes\n", - "\n", - "BA Totals\n", - " - TEPC and SRP are off because the Gila River Generator is shared between SRP and TEPC, and eGRID reports all generation from this project belonging to TEPC\n" + "## Explore specific plants\n" ] }, { diff --git a/notebooks/work_in_progress/GH279_missing_cems_data.ipynb b/notebooks/work_in_progress/GH279_missing_cems_data.ipynb new file mode 100644 index 00000000..904e85ac --- /dev/null +++ b/notebooks/work_in_progress/GH279_missing_cems_data.ipynb @@ -0,0 +1,731 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import packages\n", + "import pandas as pd\n", + "\n", + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# # Tell python where to look for modules.\n", + "import sys\n", + "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "from column_checks import get_dtypes\n", + "from filepaths import *\n", + "import load_data\n", + "from data_cleaning import *\n", + "import validation\n", + "import emissions\n", + "\n", + "year = 2021" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What does the cleaned CEMS data look like" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load data from csv\n", + "year = 2021\n", + "path_prefix = f\"{year}/\"\n", + "\n", + "cems = pd.read_csv(outputs_folder(f\"{path_prefix}/cems_cleaned_{year}.csv\"), dtype=get_dtypes())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems[cems[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems[cems[\"plant_id_eia\"] == 3].sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eia923_allocated = pd.read_csv(outputs_folder(f\"{path_prefix}/eia923_allocated_{year}.csv\"), dtype=get_dtypes())\n", + "eia923_allocated[eia923_allocated[\"plant_id_eia\"] == 
3].groupby([\"plant_id_eia\",\"subplant_id\",\"report_date\"]).sum(numeric_only=True).head(20)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test where data is being dropped" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " eia923_allocated,\n", + " primary_fuel_table,\n", + " subplant_emission_factors,\n", + ") = clean_eia923(year, False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# does the raw cems match this?\n", + "cems_raw = load_data.load_cems_data(year)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "barry.sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# remove non-grid connected plants\n", + "cems_raw = remove_plants(\n", + " cems_raw,\n", + " non_grid_connected=True,\n", + " remove_states=[\"PR\"],\n", + " steam_only_plants=False,\n", + " distribution_connected_plants=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# manually remove steam-only units\n", + "cems_raw = manually_remove_steam_units(cems_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add a report date\n", + "cems_raw = load_data.add_report_date(cems_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# remove data for any unit-months where there are incomplete data reported\n", + "# this is generally when there is a single observation reported for an entire month\n", + "cems_raw = remove_incomplete_unit_months(cems_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add subplant id\n", + "subplant_crosswalk = (\n", + " pd.read_csv(\n", + " outputs_folder(f\"{year}/subplant_crosswalk_{year}.csv\"),\n", + " dtype=get_dtypes(),\n", + " )[[\"plant_id_eia\", \"emissions_unit_id_epa\", \"subplant_id\"]]\n", + 
" .drop_duplicates()\n", + " .dropna(subset=\"emissions_unit_id_epa\")\n", + ")\n", + "cems_raw = cems_raw.merge(\n", + " subplant_crosswalk,\n", + " how=\"left\",\n", + " on=[\"plant_id_eia\", \"emissions_unit_id_epa\"],\n", + " validate=\"m:1\",\n", + ")\n", + "validation.test_for_missing_subplant_id(cems_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add a fuel type to each observation\n", + "cems_raw = assign_fuel_type_to_cems(cems_raw, year, primary_fuel_table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# fill in missing hourly emissions data using the fuel type and heat input\n", + "validation.test_for_missing_energy_source_code(cems_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# save a copy of the cems data at this point to test later\n", + "cems_test = cems_raw.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw = emissions.fill_cems_missing_co2(cems_test, year, subplant_emission_factors)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw = emissions.calculate_ghg_emissions_from_fuel_consumption(\n", + " df=cems_raw, year=year, include_co2=False, include_ch4=True, include_n2o=True\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw = remove_cems_with_zero_monthly_data(cems_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Investigate emissions filling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_test[cems_test[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "barry = cems_test.copy() #[(cems_test[\"plant_id_eia\"] == 3)]\n", + "barry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add a new categorical option to the mass measurement code\n", + "barry[\"co2_mass_measurement_code\"] = barry[\n", + " \"co2_mass_measurement_code\"\n", + "].cat.add_categories(\"Imputed\")\n", + "\n", + "# replace all \"missing\" CO2 values with zero\n", + "barry[\"co2_mass_lb\"] = barry[\"co2_mass_lb\"].fillna(0)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# replace 0 reported CO2 values with missing values, if there was reported heat input\n", + "barry.loc[\n", + " (barry[\"co2_mass_lb\"] == 0) & (barry[\"fuel_consumed_mmbtu\"] > 0),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# replace 0 reported CO2 values with missing values, if there was reported heat input\n", + "barry.loc[\n", + " (barry[\"co2_mass_lb\"] == 0) & (barry[\"fuel_consumed_mmbtu\"] > 0),\n", + " \"co2_mass_lb\",\n", + "] = np.NaN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a new df with all observations with missing co2 data\n", + "missing_co2 = barry[barry[\"co2_mass_lb\"].isnull()]\n", + "missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "unit_months_missing_co2 = missing_co2[\n", + " [\"plant_id_eia\", \"emissions_unit_id_epa\", \"report_date\"]\n", + " ].drop_duplicates()\n", + "unit_months_missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get non-missing data from cems for these unit months\n", + "unit_months_missing_co2 = unit_months_missing_co2.merge(\n", + " barry[\n", + " [\n", + " \"plant_id_eia\",\n", + " \"emissions_unit_id_epa\",\n", + " \"report_date\",\n", + " \"co2_mass_lb\",\n", + " \"fuel_consumed_mmbtu\",\n", + " ]\n", + " ],\n", + " how=\"left\",\n", + " on=[\"plant_id_eia\", \"emissions_unit_id_epa\", \"report_date\"],\n", + " validate=\"1:m\",\n", + ")\n", + "unit_months_missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "unit_months_missing_co2 = unit_months_missing_co2[\n", + " (unit_months_missing_co2[\"co2_mass_lb\"] > 0)\n", + " & (unit_months_missing_co2[\"fuel_consumed_mmbtu\"] > 0)\n", + " ]\n", + "unit_months_missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# calculate total fuel consumption and emissions by month\n", + "unit_month_efs = (\n", + " unit_months_missing_co2.groupby(\n", + " [\"plant_id_eia\", \"emissions_unit_id_epa\", \"report_date\"], dropna=False\n", + " )\n", + " .sum()\n", + " .reset_index()\n", + ")\n", + "unit_month_efs[\"co2_lb_per_mmbtu\"] = (\n", + " unit_month_efs[\"co2_mass_lb\"] / unit_month_efs[\"fuel_consumed_mmbtu\"]\n", + ")\n", + "unit_month_efs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# merge these EFs into the missing cems data\n", + "missing_co2 = missing_co2.merge(\n", + " unit_month_efs[\n", + " 
[\"plant_id_eia\", \"report_date\", \"emissions_unit_id_epa\", \"co2_lb_per_mmbtu\"]\n", + " ],\n", + " how=\"left\",\n", + " on=[\"plant_id_eia\", \"report_date\", \"emissions_unit_id_epa\"],\n", + " validate=\"m:1\",\n", + ").set_index(missing_co2.index)\n", + "missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# only keep observations where there is a non-missing ef\n", + "missing_co2 = missing_co2[~missing_co2[\"co2_lb_per_mmbtu\"].isna()]\n", + "\n", + "# calculate missing co2 data\n", + "missing_co2[\"co2_mass_lb\"] = (\n", + " missing_co2[\"fuel_consumed_mmbtu\"] * missing_co2[\"co2_lb_per_mmbtu\"]\n", + ")\n", + "missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# update in CEMS table\n", + "barry.update(missing_co2[[\"co2_mass_lb\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# update the co2 mass measurement code\n", + "barry.loc[missing_co2.index, \"co2_mass_measurement_code\"] = \"Imputed\"\n", + "\n", + "# identify all observations that are still missing co2 data\n", + "missing_co2 = barry[barry[\"co2_mass_lb\"].isnull()]\n", + "missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# merge the weighted ef into the missing data\n", + "missing_co2 = missing_co2.merge(\n", + " subplant_emission_factors[\n", + " [\"plant_id_eia\", \"report_date\", \"subplant_id\", \"co2_lb_per_mmbtu\"]\n", + " ],\n", + " how=\"left\",\n", + " on=[\"plant_id_eia\", \"report_date\", \"subplant_id\"],\n", + " validate=\"m:1\",\n", + ").set_index(missing_co2.index)\n", + "missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# only keep observations where there is a non-missing ef\n", + "missing_co2 = missing_co2[~missing_co2[\"co2_lb_per_mmbtu\"].isna()]\n", + "\n", + "# calculate missing co2 data\n", + "missing_co2[\"co2_mass_lb\"] = (\n", + " missing_co2[\"fuel_consumed_mmbtu\"] * missing_co2[\"co2_lb_per_mmbtu\"]\n", + ")\n", + "missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# update in barry table\n", + "barry.update(missing_co2[[\"co2_mass_lb\"]])\n", + "\n", + "# update the co2 mass measurement code\n", + "barry.loc[missing_co2.index, \"co2_mass_measurement_code\"] = \"Imputed\"\n", + "\n", + "# identify all observations that are still missing co2 data\n", + "missing_co2 = barry[barry[\"co2_mass_lb\"].isnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for rows that have a successful fuel code match, move to a temporary dataframe to hold the data\n", + "co2_to_fill = missing_co2.copy()[~missing_co2[\"energy_source_code\"].isna()]\n", + "fill_index = co2_to_fill.index\n", + "co2_to_fill" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# calculate emissions based on fuel type\n", + "co2_to_fill = emissions.calculate_ghg_emissions_from_fuel_consumption(\n", + " df=co2_to_fill,\n", + " year=year,\n", + " include_co2=True,\n", + " include_ch4=False,\n", + " include_n2o=False,\n", + 
").set_index(fill_index)\n", + "\n", + "co2_to_fill" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# fill this data into the original cems data\n", + "barry.update(co2_to_fill[[\"co2_mass_lb\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "barry[[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\", \"co2_mass_lb\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_fill = cems_test.loc[cems_test[\"co2_mass_lb\"] > 0,[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\", \"co2_mass_lb\"]]\n", + "test_fill = test_fill.merge(barry[[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\", \"co2_mass_lb\"]], how=\"left\", on=[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\"], validate=\"1:1\", suffixes=(\"_original\",\"_postfill\"))\n", + "test_fill[\"diff\"] = test_fill[\"co2_mass_lb_postfill\"] - test_fill[\"co2_mass_lb_original\"]\n", + "test_fill[test_fill[\"diff\"] != 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "open_grid_emissions", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "25e36f192ecdbe5da57d9bea009812e7b11ef0e0053366a845a2802aae1b29d2" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/data_cleaning.py b/src/data_cleaning.py index 6ce26ca7..dd88e073 100644 --- a/src/data_cleaning.py +++ b/src/data_cleaning.py @@ -1298,6 +1298,20 @@ def remove_cems_with_zero_monthly_data(cems): print( f" Removing {len(cems[cems['missing_data_flag'] == 'remove'])} observations from cems for unit-months where no data reported" ) + check_that_data_is_zero = cems[ + cems["missing_data_flag"] == "remove", + [ + "gross_generation_mwh", + "steam_load_1000_lb", + "fuel_consumed_mmbtu", + "co2_mass_lb", + "nox_mass_lb", + "so2_mass_lb", + ], + ].sum(numeric_only=True) + if check_that_data_is_zero.sum() > 0: + print("WARNING: Some data being removed has non-zero data associated with it:") + print(check_that_data_is_zero) cems = cems[cems["missing_data_flag"] != "remove"] # drop the missing data flag column cems = cems.drop(columns="missing_data_flag") diff --git a/src/emissions.py b/src/emissions.py index 0987b70d..01245150 100644 --- a/src/emissions.py +++ b/src/emissions.py @@ -1753,7 +1753,7 @@ def fill_cems_missing_co2(cems, year, subplant_emission_factors): how="left", on=["plant_id_eia", "report_date", "emissions_unit_id_epa"], validate="m:1", - ) + ).set_index(missing_co2.index) # only keep observations where there is a non-missing ef missing_co2 = missing_co2[~missing_co2["co2_lb_per_mmbtu"].isna()] @@ -1783,7 +1783,7 @@ def fill_cems_missing_co2(cems, year, subplant_emission_factors): how="left", on=["plant_id_eia", "report_date", "subplant_id"], validate="m:1", - ) + ).set_index(missing_co2.index) # only keep observations where there is a non-missing ef missing_co2 = 
missing_co2[~missing_co2["co2_lb_per_mmbtu"].isna()] @@ -1831,4 +1831,7 @@ def fill_cems_missing_co2(cems, year, subplant_emission_factors): "There are still misssing CO2 values remaining after filling missing CO2 values in CEMS" ) + # check that no non-missing co2 values were modified during filling + validation.check_non_missing_cems_co2_values_unchanged(cems, year) + return cems diff --git a/src/validation.py b/src/validation.py index 15d05827..5564e225 100644 --- a/src/validation.py +++ b/src/validation.py @@ -265,6 +265,38 @@ def test_for_missing_energy_source_code(df): return missing_esc_test +def check_non_missing_cems_co2_values_unchanged(cems, year): + """Checks that no non-missing CO2 values were modified during the process of filling.""" + print(" Checking that original CO2 data in CEMS was not modified by filling missing values...", end="") + # re-load the raw cems data + cems_original = load_data.load_cems_data(year) + # only keep non-zero and non-missing co2 values, since these should have not been modified + cems_original = cems_original.loc[ + cems_original["co2_mass_lb"] > 0, + ["plant_id_eia", "emissions_unit_id_epa", "datetime_utc", "co2_mass_lb"], + ] + test_fill = cems_original.merge( + cems[["plant_id_eia", "emissions_unit_id_epa", "datetime_utc", "co2_mass_lb"]], + how="left", + on=["plant_id_eia", "emissions_unit_id_epa", "datetime_utc"], + validate="1:1", + suffixes=("_original", "_postfill"), + ) + test_fill["diff"] = ( + test_fill["co2_mass_lb_postfill"] - test_fill["co2_mass_lb_original"] + ) + if len(test_fill[test_fill["diff"] != 0]) > 0: + print(" ") + print( + f"WARNING: There are {len(test_fill[test_fill["diff"] != 0])} non-missing CO2 CEMS records that were modified by `fill_cems_missing_co2` in error" + ) + else: + print("OK") + + del cems_original + + + def test_for_missing_subplant_id(df): """Checks if any records are missing a `subplant_id`.""" print(" Checking that all data has an associated `subplant_id`... 
", end="") @@ -1831,6 +1863,56 @@ def compare_plant_level_results_to_egrid( [comparison_count, pd.DataFrame(comparison_count.sum().rename("Total")).T], axis=0, ) + + compared = compared_merged.merge( + compared[ + [ + "plant_name_eia", + "ba_code", + "state", + "net_generation_mwh_status", + "fuel_consumed_mmbtu_status", + "fuel_consumed_for_electricity_mmbtu_status", + "co2_mass_lb_for_electricity_adjusted_status", + "co2_mass_lb_status", + "so2_mass_lb_status", + "nox_mass_lb_status", + ] + ], + how="left", + left_index=True, + right_index=True, + ) + + compared = compared[ + [ + "plant_name_eia", + "ba_code", + "state", + "net_generation_mwh_status", + "net_generation_mwh_calc", + "net_generation_mwh_egrid", + "fuel_consumed_mmbtu_status", + "fuel_consumed_mmbtu_calc", + "fuel_consumed_mmbtu_egrid", + "fuel_consumed_for_electricity_mmbtu_status", + "fuel_consumed_for_electricity_mmbtu_calc", + "fuel_consumed_for_electricity_mmbtu_egrid", + "co2_mass_lb_status", + "co2_mass_lb_calc", + "co2_mass_lb_egrid", + "nox_mass_lb_status", + "nox_mass_lb_calc", + "nox_mass_lb_egrid", + "so2_mass_lb_status", + "so2_mass_lb_calc", + "so2_mass_lb_egrid", + "co2_mass_lb_for_electricity_adjusted_status", + "co2_mass_lb_for_electricity_adjusted_calc", + "co2_mass_lb_for_electricity_adjusted_egrid", + ] + ] + return comparison_count, compared From 6127925965e6236f9dfb27e6ad5fc20f8470ef6b Mon Sep 17 00:00:00 2001 From: grgmiller Date: Sat, 11 Feb 2023 13:55:02 -0800 Subject: [PATCH 06/27] move validation function --- src/data_cleaning.py | 15 +-------------- src/validation.py | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/data_cleaning.py b/src/data_cleaning.py index dd88e073..30294715 100644 --- a/src/data_cleaning.py +++ b/src/data_cleaning.py @@ -1298,20 +1298,7 @@ def remove_cems_with_zero_monthly_data(cems): print( f" Removing {len(cems[cems['missing_data_flag'] == 'remove'])} observations from cems for unit-months where no data reported" ) - check_that_data_is_zero = cems[ - cems["missing_data_flag"] == "remove", - [ - "gross_generation_mwh", - "steam_load_1000_lb", - "fuel_consumed_mmbtu", - "co2_mass_lb", - "nox_mass_lb", - "so2_mass_lb", - ], - ].sum(numeric_only=True) - if check_that_data_is_zero.sum() > 0: - print("WARNING: Some data being removed has non-zero data associated with it:") - print(check_that_data_is_zero) + validation.check_removed_data_is_empty(cems) cems = cems[cems["missing_data_flag"] != "remove"] # drop the missing data flag column cems = cems.drop(columns="missing_data_flag") diff --git a/src/validation.py b/src/validation.py index 5564e225..64f87203 100644 --- a/src/validation.py +++ b/src/validation.py @@ -295,7 +295,22 @@ def check_non_missing_cems_co2_values_unchanged(cems, year): del cems_original - +def check_removed_data_is_empty(cems): + """Checks that the rows removed by `data_cleaning.remove_cems_with_zero_monthly_data()` don't actually contain non-zero data""" + check_that_data_is_zero = cems[ + cems["missing_data_flag"] == "remove", + [ + "gross_generation_mwh", + "steam_load_1000_lb", + "fuel_consumed_mmbtu", + "co2_mass_lb", + "nox_mass_lb", + "so2_mass_lb", + ], + ].sum(numeric_only=True) + if check_that_data_is_zero.sum() > 0: + print("WARNING: Some data being removed has non-zero data associated with it:") + print(check_that_data_is_zero) def test_for_missing_subplant_id(df): """Checks if any records are missing a `subplant_id`.""" From c9604d6acdb299f58a22cdfedc5c08848480b99e Mon Sep 17 00:00:00 2001 From: 
grgmiller Date: Sat, 11 Feb 2023 14:00:15 -0800 Subject: [PATCH 07/27] update format --- src/validation.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/validation.py b/src/validation.py index 64f87203..090de123 100644 --- a/src/validation.py +++ b/src/validation.py @@ -267,7 +267,10 @@ def test_for_missing_energy_source_code(df): def check_non_missing_cems_co2_values_unchanged(cems, year): """Checks that no non-missing CO2 values were modified during the process of filling.""" - print(" Checking that original CO2 data in CEMS was not modified by filling missing values...", end="") + print( + " Checking that original CO2 data in CEMS was not modified by filling missing values...", + end="", + ) # re-load the raw cems data cems_original = load_data.load_cems_data(year) # only keep non-zero and non-missing co2 values, since these should have not been modified @@ -285,16 +288,18 @@ def check_non_missing_cems_co2_values_unchanged(cems, year): test_fill["diff"] = ( test_fill["co2_mass_lb_postfill"] - test_fill["co2_mass_lb_original"] ) - if len(test_fill[test_fill["diff"] != 0]) > 0: + num_nonzero_rows = len(test_fill[test_fill["diff"] != 0]) + if num_nonzero_rows > 0: print(" ") print( - f"WARNING: There are {len(test_fill[test_fill["diff"] != 0])} non-missing CO2 CEMS records that were modified by `fill_cems_missing_co2` in error" + f"WARNING: There are {num_nonzero_rows} non-missing CO2 CEMS records that were modified by `fill_cems_missing_co2` in error" ) else: print("OK") del cems_original + def check_removed_data_is_empty(cems): """Checks that the rows removed by `data_cleaning.remove_cems_with_zero_monthly_data()` don't actually contain non-zero data""" check_that_data_is_zero = cems[ @@ -312,6 +317,7 @@ def check_removed_data_is_empty(cems): print("WARNING: Some data being removed has non-zero data associated with it:") print(check_that_data_is_zero) + def test_for_missing_subplant_id(df): """Checks if any records are missing a `subplant_id`.""" print(" Checking that all data has an associated `subplant_id`... 
", end="") From 9ff5541e617c84779adbcdaefffcc1bb2b48a3b0 Mon Sep 17 00:00:00 2001 From: grgmiller Date: Sat, 11 Feb 2023 15:55:43 -0800 Subject: [PATCH 08/27] add diff notebook --- .../validation/diff_output_versions.ipynb | 208 ++++++++++++++++++ .../GH279_missing_cems_data.ipynb | 78 ++++++- src/emissions.py | 5 +- src/validation.py | 8 +- 4 files changed, 290 insertions(+), 9 deletions(-) create mode 100644 notebooks/validation/diff_output_versions.ipynb diff --git a/notebooks/validation/diff_output_versions.ipynb b/notebooks/validation/diff_output_versions.ipynb new file mode 100644 index 00000000..19f6a649 --- /dev/null +++ b/notebooks/validation/diff_output_versions.ipynb @@ -0,0 +1,208 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import packages\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import plotly.express as px\n", + "import zipfile\n", + "\n", + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# # Tell python where to look for modules.\n", + "import sys\n", + "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "import download_data\n", + "import load_data\n", + "from column_checks import get_dtypes\n", + "from filepaths import *\n", + "import impute_hourly_profiles\n", + "import data_cleaning\n", + "import output_data\n", + "import emissions\n", + "import validation\n", + "import gross_to_net_generation\n", + "import eia930\n", + "\n", + "year = 2021\n", + "path_prefix = f\"{year}/\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## About this notebook\n", + "This notebook can be used to identify differences between one version of OGE data and another. \n", + "This is useful if you want to identify how much a code update affects the output results.\n", + "\n", + "This notebook compares files in the `outputs` and `results` directory against archived data in the `zenodo` or `s3_upload` directories. \n", + "This assumes that the previous, stable version of the data outputs are archived on your computer." 
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compare plant data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load archived data\n",
+    "data_type = \"plant_data\"\n",
+    "resolution = \"annual\"\n",
+    "\n",
+    "# unzip archived data\n",
+    "if not os.path.exists(data_folder(\"diff\")):\n",
+    "    os.mkdir(data_folder(\"diff\"))\n",
+    "with zipfile.ZipFile(data_folder(f\"s3_upload/{year}_{data_type}_{resolution}_us_units.zip\"), \"r\") as zip_to_unzip:\n",
+    "    zip_to_unzip.extractall(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units\"))\n",
+    "\n",
+    "# load archived data\n",
+    "prev_data = pd.read_csv(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units/plant_data.csv\"), dtype=get_dtypes()).round(0)\n",
+    "\n",
+    "# load new data\n",
+    "new_data = pd.read_csv(results_folder(f\"{year}/{data_type}/{resolution}/us_units/plant_data.csv\"), dtype=get_dtypes()).round(0)\n",
+    "\n",
+    "# load plant attributes\n",
+    "plant_attributes = pd.read_csv(outputs_folder(f\"{year}/plant_static_attributes_{year}.csv\"), dtype=get_dtypes())\n",
+    "\n",
+    "prev_data = prev_data.merge(plant_attributes[[\"plant_id_eia\",\"ba_code\",\"fuel_category\"]], how=\"left\", on=\"plant_id_eia\")\n",
+    "new_data = new_data.merge(plant_attributes[[\"plant_id_eia\",\"ba_code\",\"fuel_category\"]], how=\"left\", on=\"plant_id_eia\")\n",
+    "\n",
+    "key_cols = [\"plant_id_eia\",\"ba_code\",\"fuel_category\"]\n",
+    "comparison = prev_data.set_index(key_cols).compare(new_data.set_index(key_cols), result_names=(\"previous\",\"new\"))\n",
+    "\n",
+    "# get difference\n",
+    "diff = comparison.groupby(level=0, axis=1).diff().rename(columns={\"new\":\"pct_diff\"}).drop(columns=[\"previous\"], level=1)\n",
+    "comparison = pd.concat([comparison, diff], axis=1).sort_index(axis=1, level=0, ascending=True, sort_remaining=False)\n",
+    "comparison.iloc[:, comparison.columns.get_level_values(1)=='pct_diff'] = (comparison.iloc[:, comparison.columns.get_level_values(1)=='pct_diff'].values / comparison.iloc[:, comparison.columns.get_level_values(1)=='previous'].values).round(2)\n",
+    "\n",
+    "comparison\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Compare BA data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load archived data\n",
+    "data_type = \"power_sector_data\"\n",
+    "resolution = \"annual\"\n",
+    "\n",
+    "# unzip archived data\n",
+    "if not os.path.exists(data_folder(\"diff\")):\n",
+    "    os.mkdir(data_folder(\"diff\"))\n",
+    "with zipfile.ZipFile(data_folder(f\"s3_upload/{year}_{data_type}_{resolution}_us_units.zip\"), \"r\") as zip_to_unzip:\n",
+    "    zip_to_unzip.extractall(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units\"))\n",
+    "\n",
+    "# load archived data\n",
+    "prev_data = []\n",
+    "for ba in os.listdir(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units\")):\n",
+    "    df = pd.read_csv(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units/{ba}\"), dtype=get_dtypes())\n",
+    "    df[\"ba_code\"] = ba.split(\".\")[0]\n",
+    "    prev_data.append(df)\n",
+    "\n",
+    "prev_data = pd.concat(prev_data, axis=0).reset_index(drop=True)\n",
+    "\n",
+    "# load data\n",
+    "new_data = []\n",
+    "for ba in os.listdir(results_folder(f\"{year}/{data_type}/{resolution}/us_units\")):\n",
+    "    df = pd.read_csv(results_folder(f\"{year}/{data_type}/{resolution}/us_units/{ba}\"), dtype=get_dtypes())\n",
+    "    df[\"ba_code\"] = ba.split(\".\")[0]\n",
+    "    new_data.append(df)\n",
+    "\n",
+    "new_data = pd.concat(new_data, axis=0).reset_index(drop=True)\n",
+    "\n",
+    "key_cols = [\"ba_code\", \"fuel_category\"]\n",
+    "comparison = prev_data.set_index(key_cols).compare(new_data.set_index(key_cols), result_names=(\"previous\",\"new\"))\n",
+    "comparison\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compare intermediate outputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load archived data\n",
+    "file = \"cems_cleaned\",\n",
+    "key_cols = [\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\"]\n",
+    "\n",
+    "# unzip archived data\n",
+    "if not os.path.exists(data_folder(f\"diff/outputs_{year}\")):\n",
+    "    os.mkdir(data_folder(f\"diff/outputs_{year}\"))\n",
+    "    with zipfile.ZipFile(data_folder(f\"zenodo/outputs_{year}.zip\"), \"r\") as zip_to_unzip:\n",
+    "        zip_to_unzip.extractall(data_folder(f\"diff/outputs_{year}\"))\n",
+    "\n",
+    "# load archived data\n",
+    "prev_data = pd.read_csv(data_folder(f\"diff/outputs_{year}/{file}_{year}.csv\"), dtype=get_dtypes())\n",
+    "\n",
+    "# load new data\n",
+    "new_data = pd.read_csv(outputs_folder(f\"{year}/{file}_{year}.csv\"), dtype=get_dtypes())\n",
+    "\n",
+    "comparison = prev_data.set_index(key_cols).compare(new_data.set_index(key_cols), result_names=(\"previous\",\"new\"))\n",
+    "comparison\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "open_grid_emissions",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "25e36f192ecdbe5da57d9bea009812e7b11ef0e0053366a845a2802aae1b29d2"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/work_in_progress/GH279_missing_cems_data.ipynb b/notebooks/work_in_progress/GH279_missing_cems_data.ipynb
index 904e85ac..c90d0608 100644
--- a/notebooks/work_in_progress/GH279_missing_cems_data.ipynb
+++ b/notebooks/work_in_progress/GH279_missing_cems_data.ipynb
@@ -686,19 +686,93 @@
     "test_fill[test_fill[\"diff\"] != 0]"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Why are non-missing values being removed?"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "(\n",
+    "    eia923_allocated,\n",
+    "    primary_fuel_table,\n",
+    "    subplant_emission_factors,\n",
+    ") = clean_eia923(year, False)\n",
+    "\n",
+    "# load the CEMS data\n",
+    "cems = load_data.load_cems_data(year)\n",
+    "\n",
+    "\n",
+    "# remove non-grid connected plants\n",
+    "cems = remove_plants(\n",
+    "    cems,\n",
+    "    non_grid_connected=True,\n",
+    "    remove_states=[\"PR\"],\n",
+    "    steam_only_plants=False,\n",
+    "    distribution_connected_plants=False,\n",
+    ")\n",
+    "\n",
+    "# manually remove steam-only units\n",
+    "cems = manually_remove_steam_units(cems)\n",
+    "\n",
+    "# add a report date\n",
+    "cems = load_data.add_report_date(cems)\n",
+    "\n",
+    "# remove data for any unit-months where there are incomplete data reported\n",
+    "# this is generally when there is a single observation reported for an entire month\n",
+    "cems = remove_incomplete_unit_months(cems)\n",
+    "\n",
+    "# TODO: identify and remove any hourly values that appear to be outliers\n",
+    "# See: https://github.com/singularity-energy/open-grid-emissions/issues/50\n",
+    "\n",
+    "# add subplant id\n",
+    "subplant_crosswalk = (\n",
+    "    pd.read_csv(\n",
+    "        outputs_folder(f\"{year}/subplant_crosswalk_{year}.csv\"),\n",
+    "        dtype=get_dtypes(),\n",
+    "    )[[\"plant_id_eia\", \"emissions_unit_id_epa\", \"subplant_id\"]]\n",
+    "    .drop_duplicates()\n",
+    "    .dropna(subset=\"emissions_unit_id_epa\")\n",
+    ")\n",
+    "cems = cems.merge(\n",
+    "    subplant_crosswalk,\n",
+    "    how=\"left\",\n",
+    "    on=[\"plant_id_eia\", \"emissions_unit_id_epa\"],\n",
+    "    validate=\"m:1\",\n",
+    ")\n",
+    "validation.test_for_missing_subplant_id(cems)\n",
+    "\n",
+    "# add a fuel type to each observation\n",
+    "cems = assign_fuel_type_to_cems(cems, year, primary_fuel_table)\n",
+    "\n",
+    "# fill in missing hourly emissions data using the fuel type and heat input\n",
+    "validation.test_for_missing_energy_source_code(cems)"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "cems_test = cems.copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cems_test = emissions.fill_cems_missing_co2(cems_test, year, subplant_emission_factors)"
+   ]
   }
  ],
  "metadata": {
diff --git a/src/emissions.py b/src/emissions.py
index 01245150..966b87d3 100644
--- a/src/emissions.py
+++ b/src/emissions.py
@@ -1688,6 +1688,8 @@ def fill_cems_missing_co2(cems, year, subplant_emission_factors):
     3. For any remaining missing values, calculate emissions based on the subplant
         primary fuel and fuel consumption
     """
+    # make a copy of the cems data so that we can validate the outputs
+    cems_original = cems.copy()
     # add a new categorical option to the mass measurement code
     cems["co2_mass_measurement_code"] = cems[
         "co2_mass_measurement_code"
@@ -1832,6 +1834,7 @@ def fill_cems_missing_co2(cems, year, subplant_emission_factors):
     )
 
     # check that no non-missing co2 values were modified during filling
-    validation.check_non_missing_cems_co2_values_unchanged(cems, year)
+    validation.check_non_missing_cems_co2_values_unchanged(cems_original, cems)
+    del cems_original
 
     return cems
diff --git a/src/validation.py b/src/validation.py
index 090de123..0f2fcd86 100644
--- a/src/validation.py
+++ b/src/validation.py
@@ -265,14 +265,12 @@ def test_for_missing_energy_source_code(df):
     return missing_esc_test
 
 
-def check_non_missing_cems_co2_values_unchanged(cems, year):
+def check_non_missing_cems_co2_values_unchanged(cems_original, cems):
     """Checks that no non-missing CO2 values were modified during the process of filling."""
     print(
         "    Checking that original CO2 data in CEMS was not modified by filling missing values...",
         end="",
     )
-    # re-load the raw cems data
-    cems_original = load_data.load_cems_data(year)
     # only keep non-zero and non-missing co2 values, since these should have not been modified
     cems_original = cems_original.loc[
         cems_original["co2_mass_lb"] > 0,
@@ -297,12 +295,10 @@ def check_non_missing_cems_co2_values_unchanged(cems, year):
     else:
         print("OK")
 
-    del cems_original
-
 
 def check_removed_data_is_empty(cems):
     """Checks that the rows removed by `data_cleaning.remove_cems_with_zero_monthly_data()` don't actually contain non-zero data"""
-    check_that_data_is_zero = cems[
+    check_that_data_is_zero = cems.loc[
         cems["missing_data_flag"] == "remove",
         [
             "gross_generation_mwh",

From 4e0f919fe641bbf49b30a496ab31436b47720ae8 Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Sat, 11 Feb 2023 17:08:43 -0800
Subject: [PATCH 09/27] clean up notebook

---
 .../validation/diff_output_versions.ipynb | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/notebooks/validation/diff_output_versions.ipynb b/notebooks/validation/diff_output_versions.ipynb
index 19f6a649..b6c8d200 100644
--- a/notebooks/validation/diff_output_versions.ipynb
+++ b/notebooks/validation/diff_output_versions.ipynb
@@ -8,9 +8,7 @@
    "source": [
     "# import packages\n",
     "import pandas as pd\n",
-    "import numpy as np\n",
     "import os\n",
-    "import plotly.express as px\n",
     "import zipfile\n",
     "\n",
     "%reload_ext autoreload\n",
@@ -20,17 +18,10 @@
     "import sys\n",
     "sys.path.append('../../../open-grid-emissions/src/')\n",
     "\n",
-    "import download_data\n",
     "import load_data\n",
     "from column_checks import get_dtypes\n",
     "from filepaths import *\n",
-    "import impute_hourly_profiles\n",
-    "import data_cleaning\n",
-    "import output_data\n",
-    "import emissions\n",
-    "import validation\n",
-    "import gross_to_net_generation\n",
-    "import eia930\n",
+    "\n",
     "\n",
     "year = 2021\n",
     "path_prefix = f\"{year}/\""
@@ -96,6 +87,15 @@
     "comparison\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "comparison[comparison.loc[:,(\"co2_mass_lb_for_electricity\",\"pct_diff\")] > 0.001]#.groupby(\"ba_code\").sum().sum()"
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",

From 390318693e7020245baf08858384291b2d0796 Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Wed, 15 Feb 2023 12:47:32 -0800
Subject: [PATCH 10/27] use isclose

---
 src/validation.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/validation.py b/src/validation.py
index 4fa35e5e..7ef45054 100644
--- a/src/validation.py
+++ b/src/validation.py
@@ -40,18 +40,12 @@ def validate_year(year):
         raise UserWarning(year_warning)
 
 
-def check_allocated_gf_matches_input_gf(
-    pudl_out, gen_fuel_allocated, threshold_percent=0.001
-):
+def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated):
     """
     Checks that the allocated generation and fuel from EIA-923 matches the input totals.
 
-    Because there might be small rounding errors in the allocation that make the
-    allocated total slightly off from the input data, we allow the user to specify a
-    threshold percentage above which mismatched data is flagged. The default value is
-    0.1%, so that if either the allocated total fuel consumption or allocated total net
-    generation is more than +/-0.1% different from the total input generation or fuel,
-    the record is flagged.
+    We use np.isclose() to identify any values that are off by more than 1e-9% different
+    from the total input generation or fuel.
     """
     gf = pudl_out.gf_eia923()
     plant_total_gf = gf.groupby("plant_id_eia")[
@@ -69,11 +63,13 @@ def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated):
         ]
     ].sum()
     # calculate the percentage difference between the values
-    plant_total_diff = (plant_total_alloc - plant_total_gf) / plant_total_gf
+    plant_total_diff = ((plant_total_alloc - plant_total_gf) / plant_total_gf).dropna(
+        how="any", axis=0
+    )
     # flag rows where the absolute percentage difference is greater than our threshold
     mismatched_allocation = plant_total_diff[
-        (abs(plant_total_diff["fuel_consumed_mmbtu"]) > threshold_percent)
-        | (abs(plant_total_diff["net_generation_mwh"]) > threshold_percent)
+        (~np.isclose(plant_total_diff["fuel_consumed_mmbtu"], 0))
+        | (~np.isclose(plant_total_diff["net_generation_mwh"], 0))
     ]
     if len(mismatched_allocation) > 0:
         print(

From 50977c50145826da5cad87f4b4fa1be23725bc54 Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Wed, 15 Feb 2023 12:48:53 -0800
Subject: [PATCH 11/27] update validation parameters

---
 src/data_cleaning.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/data_cleaning.py b/src/data_cleaning.py
index 558c4b7d..6ce26ca7 100644
--- a/src/data_cleaning.py
+++ b/src/data_cleaning.py
@@ -401,9 +401,7 @@ def clean_eia923(
     )
 
     # test to make sure allocated totals match input totals
-    validation.check_allocated_gf_matches_input_gf(
-        pudl_out, gen_fuel_allocated, threshold_percent=0.001
-    )
+    validation.check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated)
 
     # manually update energy source code when OTH
     gen_fuel_allocated = update_energy_source_codes(gen_fuel_allocated)

From c9472bc17c72a64eb3fa2a3118b3121dd96e4746 Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Wed, 15 Feb 2023 12:56:20 -0800
Subject: [PATCH 12/27] remove WIP notebook

---
 .../GH279_missing_cems_data.ipynb | 805 ------------------
 1 file changed, 805 deletions(-)
 delete mode 100644 notebooks/work_in_progress/GH279_missing_cems_data.ipynb

diff --git a/notebooks/work_in_progress/GH279_missing_cems_data.ipynb b/notebooks/work_in_progress/GH279_missing_cems_data.ipynb
deleted file mode 100644
index c90d0608..00000000
--- a/notebooks/work_in_progress/GH279_missing_cems_data.ipynb
+++ /dev/null
@@ -1,805 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# import packages\n",
-    "import pandas as pd\n",
-    "\n",
-    "%reload_ext autoreload\n",
-    "%autoreload 2\n",
-    "\n",
-    "# # Tell python where to look for modules.\n",
-    "import sys\n",
-    "sys.path.append('../../../open-grid-emissions/src/')\n",
-    "\n",
-    "from column_checks import get_dtypes\n",
-    "from filepaths import *\n",
-    "import load_data\n",
-    "from data_cleaning import *\n",
-    "import validation\n",
-    "import emissions\n",
-    "\n",
-    "year = 2021"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## What does the cleaned CEMS data look like"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# load data from csv\n",
-    "year = 2021\n",
-    "path_prefix = f\"{year}/\"\n",
-    "\n",
-    "cems = pd.read_csv(outputs_folder(f\"{path_prefix}/cems_cleaned_{year}.csv\"), dtype=get_dtypes())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cems[cems[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cems[cems[\"plant_id_eia\"] == 3].sum(numeric_only=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "eia923_allocated = pd.read_csv(outputs_folder(f\"{path_prefix}/eia923_allocated_{year}.csv\"), dtype=get_dtypes())\n",
-    "eia923_allocated[eia923_allocated[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"subplant_id\",\"report_date\"]).sum(numeric_only=True).head(20)"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Test where data is being dropped"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "(\n",
-    "    eia923_allocated,\n",
-    "    primary_fuel_table,\n",
-    "    subplant_emission_factors,\n",
-    ") = clean_eia923(year, False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# does the raw cems match this?\n",
-    "cems_raw = load_data.load_cems_data(year)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "barry.sum(numeric_only=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# remove non-grid connected plants\n",
-    "cems_raw = remove_plants(\n",
-    "    cems_raw,\n",
-    "    non_grid_connected=True,\n",
-    "    remove_states=[\"PR\"],\n",
-    "    steam_only_plants=False,\n",
-    "    distribution_connected_plants=False,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# manually remove steam-only units\n",
-    "cems_raw = manually_remove_steam_units(cems_raw)"
-   ]
-  },
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add a report date\n", - "cems_raw = load_data.add_report_date(cems_raw)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# remove data for any unit-months where there are incomplete data reported\n", - "# this is generally when there is a single observation reported for an entire month\n", - "cems_raw = remove_incomplete_unit_months(cems_raw)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add subplant id\n", - "subplant_crosswalk = (\n", - " pd.read_csv(\n", - " outputs_folder(f\"{year}/subplant_crosswalk_{year}.csv\"),\n", - " dtype=get_dtypes(),\n", - " )[[\"plant_id_eia\", \"emissions_unit_id_epa\", \"subplant_id\"]]\n", - " .drop_duplicates()\n", - " .dropna(subset=\"emissions_unit_id_epa\")\n", - ")\n", - "cems_raw = cems_raw.merge(\n", - " subplant_crosswalk,\n", - " how=\"left\",\n", - " on=[\"plant_id_eia\", \"emissions_unit_id_epa\"],\n", - " validate=\"m:1\",\n", - ")\n", - "validation.test_for_missing_subplant_id(cems_raw)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add a fuel type to each observation\n", - "cems_raw = assign_fuel_type_to_cems(cems_raw, year, primary_fuel_table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# fill in missing hourly emissions data using the fuel type and heat input\n", - "validation.test_for_missing_energy_source_code(cems_raw)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# save a copy of the cems data at this point to test later\n", - "cems_test = cems_raw.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw = emissions.fill_cems_missing_co2(cems_test, year, subplant_emission_factors)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw = emissions.calculate_ghg_emissions_from_fuel_consumption(\n", - " df=cems_raw, year=year, include_co2=False, include_ch4=True, include_n2o=True\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw = remove_cems_with_zero_monthly_data(cems_raw)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Investigate emissions filling" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_test[cems_test[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "barry = cems_test.copy() #[(cems_test[\"plant_id_eia\"] == 3)]\n", - "barry" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add a new categorical option to the mass measurement code\n", - "barry[\"co2_mass_measurement_code\"] = barry[\n", - " \"co2_mass_measurement_code\"\n", - "].cat.add_categories(\"Imputed\")\n", - "\n", - "# replace all \"missing\" CO2 values with zero\n", - "barry[\"co2_mass_lb\"] = barry[\"co2_mass_lb\"].fillna(0)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# replace 0 reported CO2 values with missing values, if there was reported heat input\n", - "barry.loc[\n", - " (barry[\"co2_mass_lb\"] == 0) & (barry[\"fuel_consumed_mmbtu\"] > 0),\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# replace 0 reported CO2 values with missing values, if there was reported heat input\n", - "barry.loc[\n", - " (barry[\"co2_mass_lb\"] == 0) & (barry[\"fuel_consumed_mmbtu\"] > 0),\n", - " \"co2_mass_lb\",\n", - "] = np.NaN" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create a new df with all observations with missing co2 data\n", - "missing_co2 = barry[barry[\"co2_mass_lb\"].isnull()]\n", - "missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unit_months_missing_co2 = missing_co2[\n", - " [\"plant_id_eia\", \"emissions_unit_id_epa\", \"report_date\"]\n", - " ].drop_duplicates()\n", - "unit_months_missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - 
"source": [ - "# get non-missing data from cems for these unit months\n", - "unit_months_missing_co2 = unit_months_missing_co2.merge(\n", - " barry[\n", - " [\n", - " \"plant_id_eia\",\n", - " \"emissions_unit_id_epa\",\n", - " \"report_date\",\n", - " \"co2_mass_lb\",\n", - " \"fuel_consumed_mmbtu\",\n", - " ]\n", - " ],\n", - " how=\"left\",\n", - " on=[\"plant_id_eia\", \"emissions_unit_id_epa\", \"report_date\"],\n", - " validate=\"1:m\",\n", - ")\n", - "unit_months_missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unit_months_missing_co2 = unit_months_missing_co2[\n", - " (unit_months_missing_co2[\"co2_mass_lb\"] > 0)\n", - " & (unit_months_missing_co2[\"fuel_consumed_mmbtu\"] > 0)\n", - " ]\n", - "unit_months_missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# calculate total fuel consumption and emissions by month\n", - "unit_month_efs = (\n", - " unit_months_missing_co2.groupby(\n", - " [\"plant_id_eia\", \"emissions_unit_id_epa\", \"report_date\"], dropna=False\n", - " )\n", - " .sum()\n", - " .reset_index()\n", - ")\n", - "unit_month_efs[\"co2_lb_per_mmbtu\"] = (\n", - " unit_month_efs[\"co2_mass_lb\"] / unit_month_efs[\"fuel_consumed_mmbtu\"]\n", - ")\n", - "unit_month_efs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# merge these EFs into the missing cems data\n", - "missing_co2 = missing_co2.merge(\n", - " unit_month_efs[\n", - " [\"plant_id_eia\", \"report_date\", \"emissions_unit_id_epa\", \"co2_lb_per_mmbtu\"]\n", - " ],\n", - " how=\"left\",\n", - " on=[\"plant_id_eia\", \"report_date\", \"emissions_unit_id_epa\"],\n", - " validate=\"m:1\",\n", - ").set_index(missing_co2.index)\n", - "missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# only keep observations where there is a non-missing ef\n", - "missing_co2 = missing_co2[~missing_co2[\"co2_lb_per_mmbtu\"].isna()]\n", - "\n", - "# calculate missing co2 data\n", - "missing_co2[\"co2_mass_lb\"] = (\n", - " missing_co2[\"fuel_consumed_mmbtu\"] * missing_co2[\"co2_lb_per_mmbtu\"]\n", - ")\n", - "missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# update in CEMS table\n", - "barry.update(missing_co2[[\"co2_mass_lb\"]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# update the co2 mass measurement code\n", - "barry.loc[missing_co2.index, \"co2_mass_measurement_code\"] = \"Imputed\"\n", - "\n", - "# identify all observations that are still missing co2 data\n", - "missing_co2 = barry[barry[\"co2_mass_lb\"].isnull()]\n", - "missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# merge the weighted ef into the missing data\n", - "missing_co2 = missing_co2.merge(\n", - " subplant_emission_factors[\n", - " [\"plant_id_eia\", \"report_date\", \"subplant_id\", \"co2_lb_per_mmbtu\"]\n", - " ],\n", - " how=\"left\",\n", - " on=[\"plant_id_eia\", \"report_date\", \"subplant_id\"],\n", - " validate=\"m:1\",\n", - ").set_index(missing_co2.index)\n", - "missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# only keep observations where there is a 
non-missing ef\n", - "missing_co2 = missing_co2[~missing_co2[\"co2_lb_per_mmbtu\"].isna()]\n", - "\n", - "# calculate missing co2 data\n", - "missing_co2[\"co2_mass_lb\"] = (\n", - " missing_co2[\"fuel_consumed_mmbtu\"] * missing_co2[\"co2_lb_per_mmbtu\"]\n", - ")\n", - "missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# update in barry table\n", - "barry.update(missing_co2[[\"co2_mass_lb\"]])\n", - "\n", - "# update the co2 mass measurement code\n", - "barry.loc[missing_co2.index, \"co2_mass_measurement_code\"] = \"Imputed\"\n", - "\n", - "# identify all observations that are still missing co2 data\n", - "missing_co2 = barry[barry[\"co2_mass_lb\"].isnull()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# for rows that have a successful fuel code match, move to a temporary dataframe to hold the data\n", - "co2_to_fill = missing_co2.copy()[~missing_co2[\"energy_source_code\"].isna()]\n", - "fill_index = co2_to_fill.index\n", - "co2_to_fill" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# calculate emissions based on fuel type\n", - "co2_to_fill = emissions.calculate_ghg_emissions_from_fuel_consumption(\n", - " df=co2_to_fill,\n", - " year=year,\n", - " include_co2=True,\n", - " include_ch4=False,\n", - " include_n2o=False,\n", - ").set_index(fill_index)\n", - "\n", - "co2_to_fill" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# fill this data into the original cems data\n", - "barry.update(co2_to_fill[[\"co2_mass_lb\"]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "barry[[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\", \"co2_mass_lb\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_fill = cems_test.loc[cems_test[\"co2_mass_lb\"] > 0,[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\", \"co2_mass_lb\"]]\n", - "test_fill = test_fill.merge(barry[[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\", \"co2_mass_lb\"]], how=\"left\", on=[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\"], validate=\"1:1\", suffixes=(\"_original\",\"_postfill\"))\n", - "test_fill[\"diff\"] = test_fill[\"co2_mass_lb_postfill\"] - test_fill[\"co2_mass_lb_original\"]\n", - "test_fill[test_fill[\"diff\"] != 0]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Why are non-missing values being removed?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "(\n", - " eia923_allocated,\n", - " primary_fuel_table,\n", - " subplant_emission_factors,\n", - ") = clean_eia923(year, False)\n", - "\n", - "# load the CEMS data\n", - "cems = load_data.load_cems_data(year)\n", - "\n", - "\n", - "# remove non-grid connected plants\n", - "cems = remove_plants(\n", - " cems,\n", - " non_grid_connected=True,\n", - " remove_states=[\"PR\"],\n", - " steam_only_plants=False,\n", - " distribution_connected_plants=False,\n", - ")\n", - "\n", - "# manually remove steam-only units\n", - "cems = manually_remove_steam_units(cems)\n", - "\n", - "# add a report date\n", - "cems = load_data.add_report_date(cems)\n", - "\n", - "# remove data for any unit-months where there are incomplete data reported\n", - "# this is generally when there is a single observation reported for an entire month\n", - "cems = remove_incomplete_unit_months(cems)\n", - "\n", - "# TODO: identify and remove any hourly values that appear to be outliers\n", - "# See: https://github.com/singularity-energy/open-grid-emissions/issues/50\n", - "\n", - "# add subplant id\n", - "subplant_crosswalk = (\n", - " pd.read_csv(\n", - " outputs_folder(f\"{year}/subplant_crosswalk_{year}.csv\"),\n", - " dtype=get_dtypes(),\n", - " )[[\"plant_id_eia\", \"emissions_unit_id_epa\", \"subplant_id\"]]\n", - " .drop_duplicates()\n", - " .dropna(subset=\"emissions_unit_id_epa\")\n", - ")\n", - "cems = cems.merge(\n", - " subplant_crosswalk,\n", - " how=\"left\",\n", - " on=[\"plant_id_eia\", \"emissions_unit_id_epa\"],\n", - " validate=\"m:1\",\n", - ")\n", - "validation.test_for_missing_subplant_id(cems)\n", - "\n", - "# add a fuel type to each observation\n", - "cems = assign_fuel_type_to_cems(cems, year, primary_fuel_table)\n", - "\n", - "# fill in missing hourly emissions data using the fuel type and heat input\n", - "validation.test_for_missing_energy_source_code(cems)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_test = cems.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_test = emissions.fill_cems_missing_co2(cems_test, year, subplant_emission_factors)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "open_grid_emissions", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "25e36f192ecdbe5da57d9bea009812e7b11ef0e0053366a845a2802aae1b29d2" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From ac8175c665f737ed86c233e6793eb4f1aa635295 Mon Sep 17 00:00:00 2001 From: Milo Knowles Date: Thu, 23 Feb 2023 15:35:16 -0500 Subject: [PATCH 13/27] Make PUDL logging show up, and set up logging in OGE (#285) * Fix logging and show example * WIP * Use logger everywhere * Remove leading spaces * Print dataframes properly * Address comments * Remove empty logging --------- Co-authored-by: Greg Miller <45949268+grgmiller@users.noreply.github.com> --- .gitignore | 3 +- environment.yml | 1 + src/__init__.py | 5 + src/column_checks.py | 13 +- src/consumed.py | 27 ++-- src/data_cleaning.py | 36 +++--- src/data_pipeline.py | 63 
+++++----- src/download_data.py | 17 +-- src/eia930.py | 15 ++- src/emissions.py | 74 ++++++----- src/filepaths.py | 3 +- src/gross_to_net_generation.py | 15 ++- src/impute_hourly_profiles.py | 24 ++-- src/load_data.py | 23 ++-- src/logging_util.py | 49 ++++++++ src/output_data.py | 20 +-- src/validation.py | 217 +++++++++++++++------------------ src/visualization.py | 5 +- test/test_logging.py | 33 +++++ 19 files changed, 364 insertions(+), 279 deletions(-) create mode 100644 src/logging_util.py create mode 100644 test/test_logging.py diff --git a/.gitignore b/.gitignore index 2f35d2c3..093d108f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,13 +3,14 @@ data/* example/.ipynb_checkpoints/ test/__pycache__/ +test/*.txt src/__pycache__/ CHANGELOG.md notebooks/visualization/outputs/* -# Python +# Python notebooks/.ipynb_checkpoints notebooks/*/.ipynb_checkpoints .hypothesis/ diff --git a/environment.yml b/environment.yml index 505de70c..6b6ca3f8 100644 --- a/environment.yml +++ b/environment.yml @@ -27,6 +27,7 @@ dependencies: - sqlalchemy - sqlite # used for pudl - statsmodels + - coloredlogs # used for prettier logging - pip: # --editable ../pudl #NOTE: this is for development use diff --git a/src/__init__.py b/src/__init__.py index e69de29b..efd6d259 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -0,0 +1,5 @@ +# Set up the OGE logging configuration once. +import logging +from .logging_util import configure_root_logger +from .filepaths import outputs_folder +configure_root_logger(outputs_folder("logfile.txt"), logging.INFO) diff --git a/src/column_checks.py b/src/column_checks.py index 545c7bdf..5fb8fd03 100644 --- a/src/column_checks.py +++ b/src/column_checks.py @@ -17,6 +17,9 @@ After any change, re-run data_pipeline to regenerate all files and re-run these checks. """ +from logging_util import get_logger +logger = get_logger(__name__) + COLUMNS = { "eia923_allocated": { @@ -348,8 +351,8 @@ def check_columns(df, file_name): # Check for extra columns. Warning not exception extras = cols - expected_cols if len(extras) > 0: - print( - f"WARNING: columns {extras} in {file_name} are not guaranteed by column_checks.py" + logger.warning( + f"columns {extras} in {file_name} are not guaranteed by column_checks.py" ) # Raise exception for missing columns @@ -464,8 +467,8 @@ def apply_dtypes(df): if (col not in dtypes) and (col not in datetime_columns) ] if len(cols_missing_dtypes) > 0: - print( - "WARNING: The following columns do not have dtypes assigned in `column_checks.get_dtypes()`" + logger.warning( + "The following columns do not have dtypes assigned in `column_checks.get_dtypes()`" ) - print(cols_missing_dtypes) + logger.warning(cols_missing_dtypes) return df.astype({col: dtypes[col] for col in df.columns if col in dtypes}) diff --git a/src/consumed.py b/src/consumed.py index 7f164b89..65f0f037 100644 --- a/src/consumed.py +++ b/src/consumed.py @@ -6,6 +6,7 @@ from gridemissions.load import BaData from gridemissions.eia_api import KEYS, SRC from filepaths import outputs_folder, manual_folder, results_folder +from logging_util import get_logger from output_data import ( GENERATED_EMISSION_RATE_COLS, @@ -14,11 +15,13 @@ TIME_RESOLUTIONS, ) -""" For these BAs, there are significant and systematic differences -between our net_generation_mwh and EIA-930 net generation and interchange, -so we cannot combine our net generation and 930 interchange to get net_consumed. -Instead, we use 930 demand as net_consumed. Note: there may be issues with the 930 -demand! 
But it is better than combining inconsistent generation and interchange, +logger = get_logger(__name__) + +""" For these BAs, there are significant and systematic differences +between our net_generation_mwh and EIA-930 net generation and interchange, +so we cannot combine our net generation and 930 interchange to get net_consumed. +Instead, we use 930 demand as net_consumed. Note: there may be issues with the 930 +demand! But it is better than combining inconsistent generation and interchange, which results in unreasonable profiles with many negative hours. """ # Identify the BAs for which we need to use demand data for the consumed calculation @@ -118,8 +121,8 @@ def get_average_emission_factors(prefix: str, year: int): for fuel in SRC: column = get_rate_column(pol, adjustment, generated=True) if FUEL_TYPE_MAP[fuel] not in genavg.index: - print( - f"WARNING: fuel {FUEL_TYPE_MAP[fuel]} not found in file annual_generation_averages_by_fuel_{year}.csv, using average" + logger.warning( + f"fuel {FUEL_TYPE_MAP[fuel]} not found in file annual_generation_averages_by_fuel_{year}.csv, using average" ) efs[pol][adjustment][fuel] = genavg.loc["total", column] else: @@ -288,7 +291,7 @@ def output_results(self): if (ba in self.import_regions) or (ba in self.generation_regions): continue if ba in BA_930_INCONSISTENCY[self.year]: - print(f"Using D instead of (G-TI) for consumed calc in {ba}") + logger.warning(f"Using D instead of (G-TI) for consumed calc in {ba}") self.results[ba]["net_consumed_mwh"] = self.eia930.df[ KEYS["E"]["D"] % ba ][self.generation.index] @@ -325,8 +328,8 @@ def output_results(self): time_cols = ["datetime_utc", "datetime_local"] missing_hours = time_dat[time_dat.isna().any(axis=1)] if len(missing_hours) > 0: - print( - f"WARNING: {len(missing_hours)} hours are missing in {ba} consumed data" + logger.warning( + f"{len(missing_hours)} hours are missing in {ba} consumed data" ) elif time_resolution == "monthly": time_dat["month"] = time_dat.datetime_local.dt.month @@ -513,6 +516,6 @@ def run(self): for (i, r) in enumerate(self.regions): self.results[r].loc[date, col] = consumed_emissions[i] if total_failed > 0: - print( - f"Warning: {total_failed} hours failed to solve for consumed {pol} {adj} emissions." + logger.warning( + f"{total_failed} hours failed to solve for consumed {pol} {adj} emissions." ) diff --git a/src/data_cleaning.py b/src/data_cleaning.py index 30294715..f11330ac 100644 --- a/src/data_cleaning.py +++ b/src/data_cleaning.py @@ -13,7 +13,9 @@ from emissions import CLEAN_FUELS from column_checks import get_dtypes, apply_dtypes from filepaths import manual_folder, outputs_folder, downloads_folder +from logging_util import get_logger +logger = get_logger(__name__) DATA_COLUMNS = [ "net_generation_mwh", @@ -52,11 +54,11 @@ def identify_subplants(year, number_of_years=5): end_year = year # load 5 years of monthly data from CEMS - print(" loading CEMS ids") + logger.info(" loading CEMS ids") cems_ids = load_data.load_cems_ids(start_year, end_year) # add subplant ids to the data - print(" identifying unique subplants") + logger.info(" identifying unique subplants") generate_subplant_ids(start_year, end_year, cems_ids) @@ -543,14 +545,12 @@ def update_energy_source_codes(df): (df["energy_source_code"] == "OTH") & (df["fuel_consumed_mmbtu"] > 0) ] if len(plants_with_other_fuel) > 0: - print( - "WARNING: After cleaning energy source codes, some fuel consumption is still associated with an 'OTH' fuel type." 
+        logger.warning(f"""
+            After cleaning energy source codes, some fuel consumption is still associated with an 'OTH' fuel type.
+            This will lead to incorrect emissions calculations.
+            Check the following plants: {list(plants_with_other_fuel.plant_id_eia.unique())}
+            Assign a fuel type in `data_cleaning.update_energy_source_codes`"""
         )
-        print("This will lead to incorrect emissions calculations.")
-        print(
-            f"Check the following plants: {list(plants_with_other_fuel.plant_id_eia.unique())}"
-        )
-        print("Assign a fuel type in `data_cleaning.update_energy_source_codes`")
     return df
@@ -735,7 +735,7 @@ def calculate_aggregated_primary_fuel(
         plants_with_no_primary_fuel = agg_primary_fuel[
             agg_primary_fuel[f"{level}_primary_fuel"].isna()
         ]
-        print(
+        logger.warning(
             f"Check the following plants: {list(plants_with_no_primary_fuel.plant_id_eia.unique())}"
         )
         raise UserWarning(
@@ -882,7 +882,7 @@ def remove_plants(
             plant_states["state"].isin(remove_states)
         ].plant_id_eia.unique()
     )
-    print(
+    logger.info(
         f" Removing {len(plants_in_states_to_remove)} plants located in the following states: {remove_states}"
     )
     df = df[~df["plant_id_eia"].isin(plants_in_states_to_remove)]
@@ -918,7 +918,7 @@ def remove_non_grid_connected_plants(df):
             "plant_id_eia"
         ].unique()
     )
-    print(f" Removing {num_plants} plants that are not grid-connected")
+    logger.info(f" Removing {num_plants} plants that are not grid-connected")

     df = df[~df["plant_id_eia"].isin(ngc_plants)]
@@ -1005,7 +1005,7 @@ def clean_cems(year: int, small: bool, primary_fuel_table, subplant_emission_fac

 def smallerize_test_data(df, random_seed=None):
-    print(" Randomly selecting 5% of plants for faster test run.")
+    logger.info(" Randomly selecting 5% of plants for faster test run.")
     # Select 5% of plants
     selected_plants = df.plant_id_eia.unique()
     if random_seed is not None:
@@ -1030,7 +1030,7 @@ def manually_remove_steam_units(df):
         dtype=get_dtypes(),
     )[["plant_id_eia", "emissions_unit_id_epa"]]

-    print(
+    logger.info(
         f" Removing {len(units_to_remove)} units that only produce steam and do not report to EIA"
     )
@@ -1062,7 +1062,7 @@ def remove_incomplete_unit_months(cems):
         unit_hours_in_month["datetime_utc"] < 600
     ].drop(columns="datetime_utc")

-    print(
+    logger.info(
         f" Removing {len(unit_months_to_remove)} unit-months with incomplete hourly data"
     )
@@ -1295,7 +1295,7 @@ def remove_cems_with_zero_monthly_data(cems):
         validate="m:1",
     )
     # remove any observations with the missing data flag
-    print(
+    logger.info(
         f" Removing {len(cems[cems['missing_data_flag'] == 'remove'])} observations from cems for unit-months where no data reported"
     )
     validation.check_removed_data_is_empty(cems)
@@ -1960,8 +1960,8 @@ def assign_ba_code_to_plant(df, year):
     df = df.merge(plant_ba, how="left", on="plant_id_eia", validate="m:1")

     if len(df[df["ba_code"].isna()]) > 0:
-        print(" WARNING: the following plants are missing ba_code:")
-        print(df[df["ba_code"].isna()])
+        logger.warning(" the following plants are missing ba_code:")
+        logger.warning("\n" + df[df["ba_code"].isna()].to_string())

     # replace missing ba codes with NA
     df["ba_code"] = df["ba_code"].fillna("NA")

diff --git a/src/data_pipeline.py b/src/data_pipeline.py
index bd092eaa..d30cefa1 100644
--- a/src/data_pipeline.py
+++ b/src/data_pipeline.py
@@ -6,16 +6,11 @@
 Optional arguments are --year (default 2021), --shape_individual_plants (default True)
 Optional arguments for development are --small, --flat, and --skip_outputs
 """
-
-
-# import packages
 import argparse
 import os
 import shutil

 # import local modules
-# import
local modules -# # # Tell python where to look for modules. import download_data import data_cleaning import emissions @@ -26,11 +21,17 @@ import output_data import consumed from filepaths import downloads_folder, outputs_folder, results_folder +from logging_util import get_logger, configure_root_logger -def get_args(): - """ - Specify arguments here. +# Log the print statements to a file for debugging. +configure_root_logger(logfile=outputs_folder("data_pipeline.log")) +logger = get_logger("data_pipeline") + + +def get_args() -> argparse.Namespace: + """Specify arguments here. + Returns dictionary of {arg_name: arg_value} """ parser = argparse.ArgumentParser() @@ -63,8 +64,10 @@ def get_args(): def main(): + """Runs the OGE data pipeline.""" args = get_args() year = args.year + logger.info(f'Running data pipeline for year {year}') validation.validate_year(year) @@ -99,7 +102,7 @@ def main(): # 1. Download data #################################################################################### - print("1. Downloading data") + logger.info("1. Downloading data") # PUDL download_data.download_pudl_data( zenodo_url="https://zenodo.org/record/7472137/files/pudl-v2022.11.30.tgz" @@ -131,12 +134,12 @@ def main(): # 2. Identify subplants #################################################################################### - print("2. Identifying subplant IDs") + logger.info("2. Identifying subplant IDs") data_cleaning.identify_subplants(year) # 3. Clean EIA-923 Generation and Fuel Data at the Monthly Level #################################################################################### - print("3. Cleaning EIA-923 data") + logger.info("3. Cleaning EIA-923 data") ( eia923_allocated, primary_fuel_table, @@ -152,7 +155,7 @@ def main(): # 4. Clean Hourly Data from CEMS #################################################################################### - print("4. Cleaning CEMS data") + logger.info("4. Cleaning CEMS data") cems = data_cleaning.clean_cems( year, args.small, primary_fuel_table, subplant_emission_factors ) @@ -178,14 +181,14 @@ def main(): # 5. Assign static characteristics to CEMS and EIA data to aid in aggregation #################################################################################### - print("5. Loading plant static attributes") + logger.info("5. Loading plant static attributes") plant_attributes = data_cleaning.create_plant_attributes_table( cems, eia923_allocated, year, primary_fuel_table ) # 6. Crosswalk CEMS and EIA data #################################################################################### - print("6. Identifying source for hourly data") + logger.info("6. Identifying source for hourly data") eia923_allocated = data_cleaning.identify_hourly_data_source( eia923_allocated, cems, year ) @@ -207,13 +210,13 @@ def main(): # 7. Aggregating CEMS data to subplant #################################################################################### - print("7. Aggregating CEMS data from unit to subplant") + logger.info("7. Aggregating CEMS data from unit to subplant") # aggregate cems data to subplant level cems = data_cleaning.aggregate_cems_to_subplant(cems) # 8. Calculate hourly data for partial_cems plants #################################################################################### - print("8. Shaping partial CEMS data") + logger.info("8. Shaping partial CEMS data") # shape partial CEMS plant data partial_cems_plant = impute_hourly_profiles.shape_partial_cems_plants( cems, eia923_allocated @@ -251,7 +254,7 @@ def main(): # 9. 
Convert CEMS Hourly Gross Generation to Hourly Net Generation #################################################################################### - print("9. Converting CEMS gross generation to net generation") + logger.info("9. Converting CEMS gross generation to net generation") cems, gtn_conversions = gross_to_net_generation.convert_gross_to_net_generation( cems, eia923_allocated, plant_attributes, year ) @@ -273,7 +276,7 @@ def main(): # 10. Adjust CEMS emission data for CHP #################################################################################### - print("10. Adjusting CEMS emissions for CHP") + logger.info("10. Adjusting CEMS emissions for CHP") cems = data_cleaning.adjust_cems_for_chp(cems, eia923_allocated) cems = emissions.calculate_co2e_mass( cems, year, gwp_horizon=100, ar5_climate_carbon_feedback=True @@ -290,7 +293,7 @@ def main(): # 11. Export monthly and annual plant-level results #################################################################################### - print("11. Exporting monthly and annual plant-level results") + logger.info("11. Exporting monthly and annual plant-level results") # create a separate dataframe containing only the EIA data that is missing from cems monthly_eia_data_to_shape = eia923_allocated[ (eia923_allocated["hourly_data_source"] == "eia") @@ -327,14 +330,14 @@ def main(): # 12. Clean and Reconcile EIA-930 data #################################################################################### - print("12. Cleaning EIA-930 data") + logger.info("12. Cleaning EIA-930 data") # Scrapes and cleans data in data/downloads, outputs cleaned file at EBA_elec.csv if args.flat: - print(" Not running 930 cleaning because we'll be using a flat profile.") + logger.info(" Not running 930 cleaning because we'll be using a flat profile.") elif not (os.path.exists(outputs_folder(f"{path_prefix}/eia930/eia930_elec.csv"))): eia930.clean_930(year, small=args.small, path_prefix=path_prefix) else: - print( + logger.info( f" Not re-running 930 cleaning. If you'd like to re-run, please delete data/outputs/{path_prefix}/eia930/" ) @@ -351,7 +354,7 @@ def main(): # 13. Calculate hourly profiles for monthly EIA data #################################################################################### - print("13. Estimating hourly profiles for EIA data") + logger.info("13. Estimating hourly profiles for EIA data") hourly_profiles = impute_hourly_profiles.calculate_hourly_profiles( cems, partial_cems_subplant, @@ -384,7 +387,7 @@ def main(): # 14. Export hourly plant-level data #################################################################################### - print("14. Exporting Hourly Plant-level data for each BA") + logger.info("14. Exporting Hourly Plant-level data for each BA") if args.shape_individual_plants and not args.small: impute_hourly_profiles.combine_and_export_hourly_plant_data( cems, @@ -398,16 +401,16 @@ def main(): region_to_group="ba_code", ) else: - print( + logger.info( " Not shaping and exporting individual plant data since `shape_individual_plants` is False." ) - print( + logger.info( " Plants that only report to EIA will be aggregated to the fleet level before shaping." ) # 15. Shape fleet-level data #################################################################################### - print("15. Assigning hourly profiles to monthly EIA-923 data") + logger.info("15. 
Assigning hourly profiles to monthly EIA-923 data") hourly_profiles = impute_hourly_profiles.convert_profile_to_percent( hourly_profiles, group_keys=["ba_code", "fuel_category", "profile_method"], @@ -465,7 +468,7 @@ def main(): # 16. Combine plant-level data from all sources #################################################################################### - print("16. Combining plant-level hourly data") + logger.info("16. Combining plant-level hourly data") # write metadata outputs output_data.write_plant_metadata( plant_attributes, @@ -511,7 +514,7 @@ def main(): # 17. Aggregate CEMS data to BA-fuel and write power sector results #################################################################################### - print("17. Creating and exporting BA-level power sector results") + logger.info("17. Creating and exporting BA-level power sector results") ba_fuel_data = data_cleaning.aggregate_plant_data_to_ba_fuel( combined_plant_data, plant_attributes ) @@ -525,7 +528,7 @@ def main(): # 18. Calculate consumption-based emissions and write carbon accounting results #################################################################################### - print("18. Calculating and exporting consumption-based results") + logger.info("18. Calculating and exporting consumption-based results") hourly_consumed_calc = consumed.HourlyConsumed( clean_930_file, path_prefix, diff --git a/src/download_data.py b/src/download_data.py index d50eaf60..151564fb 100644 --- a/src/download_data.py +++ b/src/download_data.py @@ -7,6 +7,9 @@ import zipfile from filepaths import downloads_folder, data_folder +from logging_util import get_logger + +logger = get_logger(__name__) def download_helper( @@ -38,11 +41,11 @@ def download_helper( # If the file already exists, do not re-download it. final_destination = output_path if output_path is not None else download_path if os.path.exists(final_destination): - print(f" {final_destination.split('/')[-1]} already downloaded, skipping.") + logger.info(f" {final_destination.split('/')[-1]} already downloaded, skipping.") return False # Otherwise, download to the file in chunks. - print(f" Downloading {final_destination.split('/')[-1]}") + logger.info(f" Downloading {final_destination.split('/')[-1]}") r = requests.get(input_url, stream=True) with open(download_path, "wb") as fd: for chunk in r.iter_content(chunk_size=chunk_size): @@ -94,10 +97,10 @@ def download_pudl_data(zenodo_url: str): with open(pudl_version_file, "r") as f: existing_version = f.readlines()[0].replace("\n", "") if pudl_version == existing_version: - print(" PUDL version already downloaded") + logger.info(" PUDL version already downloaded") return else: - print(" Downloading new version of pudl") + logger.info(" Downloading new version of pudl") shutil.rmtree(downloads_folder("pudl")) download_pudl(zenodo_url, pudl_version) @@ -117,10 +120,10 @@ def download_pudl(zenodo_url, pudl_version): ) fd.write(chunk) downloaded += block_size - print(" Downloading PUDL. Progress: 100.0%") + logger.info(" Downloading PUDL. Progress: 100.0%") # extract the tgz file - print(" Extracting PUDL data...") + logger.info(" Extracting PUDL data...") with tarfile.open(downloads_folder("pudl.tgz")) as tar: tar.extractall(data_folder()) @@ -268,7 +271,7 @@ def download_raw_eia860(year): Downloads raw EIA-860 data (zip files), and unzips them to the downloads folder. 
""" if year < 2005: - raise NotImplementedError(f"WARNING: We haven't tested EIA-860 for '{year}'.") + raise NotImplementedError(f"We haven't tested EIA-860 for '{year}'.") os.makedirs(downloads_folder("eia860"), exist_ok=True) url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}.zip" archive_url = ( diff --git a/src/eia930.py b/src/eia930.py index d2a11f54..36aa7c1e 100644 --- a/src/eia930.py +++ b/src/eia930.py @@ -7,12 +7,15 @@ import load_data from column_checks import get_dtypes from filepaths import top_folder, downloads_folder, outputs_folder, manual_folder +from logging_util import get_logger # Tell gridemissions where to find config before we load gridemissions os.environ["GRIDEMISSIONS_CONFIG_FILE_PATH"] = top_folder("config/gridemissions.json") from gridemissions.workflows import make_dataset +logger = get_logger(__name__) + def convert_balance_file_to_gridemissions_format(year: int, small: bool = False): """Converts downloaded EIA-930 Balance files to gridemissions format.""" @@ -142,14 +145,14 @@ def clean_930(year: int, small: bool = False, path_prefix: str = ""): df = df.loc[start:end] # Don't worry about processing everything # Adjust - print(" Adjusting EIA-930 time stamps") + logger.info(" Adjusting EIA-930 time stamps") df = manual_930_adjust(df) df.to_csv( join(data_folder, "eia930_raw.csv") ) # Will be read by gridemissions workflow # Run cleaning - print(" Running physics-based data cleaning") + logger.info(" Running physics-based data cleaning") make_dataset( start, end, @@ -171,17 +174,17 @@ def reformat_chalendar(raw): """ # where we have variable (NG = net generation) and fuel type target_cols = [c for c in raw.columns if len(c.split(".")) == 5] - print("Filtering") + logger.info("Filtering") cleaned = ( raw.loc[:, target_cols] .melt(ignore_index=False, value_name="generation", var_name="variable") .reset_index() ) - print("Expanding cols") + logger.info("Expanding cols") cleaned[["dtype", "BA", "other BA", "var", "fuel", "interval"]] = cleaned[ "variable" ].str.split(r"[.-]", expand=True, regex=True) - print("Dropping and renaming") + logger.info("Dropping and renaming") cleaned = cleaned.drop(columns=["dtype", "var", "interval", "other BA"]) cleaned = cleaned.rename(columns={"index": "datetime_utc"}) return cleaned @@ -286,7 +289,7 @@ def remove_imputed_ones(eia930_data): filter = eia930_data["net_generation_mwh_930"].abs() < 1.5 # replace all 1.0 values with zero - print(f" replacing {sum(filter)} imputed 1 values with 0") + logger.info(f" replacing {sum(filter)} imputed 1 values with 0") eia930_data.loc[filter, "net_generation_mwh_930"] = 0 return eia930_data diff --git a/src/emissions.py b/src/emissions.py index 966b87d3..3460ad27 100644 --- a/src/emissions.py +++ b/src/emissions.py @@ -1,11 +1,11 @@ import pandas as pd import numpy as np - import load_data import validation from column_checks import get_dtypes from filepaths import manual_folder +from logging_util import get_logger from pudl.analysis.allocate_net_gen import ( distribute_annually_reported_data_to_months_if_annual, @@ -13,6 +13,8 @@ CLEAN_FUELS = ["SUN", "MWH", "WND", "WAT", "WH", "PUR", "NUC"] +logger = get_logger(__name__) + def calculate_ghg_emissions_from_fuel_consumption( df, year, include_co2=True, include_ch4=True, include_n2o=True @@ -477,9 +479,9 @@ def calculate_nox_from_fuel_consumption( & ~gen_fuel_allocated["energy_source_code"].isin(CLEAN_FUELS) ] if len(missing_ef) > 0: - print("WARNING: NOx emission factors are missing for the following records") - 
print("Missing factors for FC prime movers are currently expected") - print( + logger.warning("NOx emission factors are missing for the following records") + logger.warning("Missing factors for FC prime movers are currently expected") + logger.warning("\n" + missing_ef[ [ "report_date", @@ -488,7 +490,7 @@ def calculate_nox_from_fuel_consumption( "prime_mover_code", "generator_id", ] - ].drop_duplicates() + ].drop_duplicates().to_string() ) gen_fuel_allocated["nox_mass_lb"] = ( gen_fuel_allocated["fuel_consumed_mmbtu"] @@ -654,13 +656,11 @@ def calculate_generator_nox_ef_per_unit_from_boiler_type( ) ) if len(missing_nox_efs) > 0: - print(" ") - print( - "WARNING: NOx emission factors are missing for the following boiler types. A prime mover-fuel level factor will be used if available." + logger.warning( + "NOx emission factors are missing for the following boiler types. A prime mover-fuel level factor will be used if available." ) - print("Missing factors for FC prime movers are currently expected") - print(missing_nox_efs) - print(" ") + logger.warning("Missing factors for FC prime movers are currently expected") + logger.warning("\n" + missing_nox_efs.to_string()) gen_nox_factors = fill_missing_factors_based_on_pm_fuel( nox_emission_factors, gen_nox_factors ) @@ -687,13 +687,13 @@ def calculate_generator_nox_ef_per_unit_from_boiler_type( ) ) if len(missing_nox_efs) > 0: - print(" ") - print( - "WARNING: After filling with PM-fuel factors, NOx emission factors are still missing for the following boiler types. An emission factor of zero will be used for these boilers." + logger.warning(""" + After filling with PM-fuel factors, NOx emission factors are still missing for the following boiler types. + An emission factor of zero will be used for these boilers. 
+ Missing factors for FC prime movers are currently expected.""" ) - print("Missing factors for FC prime movers are currently expected") - print(missing_nox_efs) - print(" ") + logger.warning("\n" + missing_nox_efs.to_string()) + gen_nox_factors["emission_factor"] = gen_nox_factors["emission_factor"].fillna(0) # average the emission factors for all boilers associated with each generator @@ -848,8 +848,8 @@ def convert_ef_to_lb_per_mmbtu(gen_emission_factors, pudl_out, pollutant): & (gen_emission_factors["emission_factor_denominator"] != "mmbtu") ] if len(missing_fuel_content) > 0: - print( - f"WARNING: The heat content for the following fuels is missing and NOx emissions will not be calculated for these fuel:{list(missing_fuel_content.energy_source_code.unique())}" + logger.warning( + f"The heat content for the following fuels is missing and NOx emissions will not be calculated for these fuel:{list(missing_fuel_content.energy_source_code.unique())}" ) # convert emission factors from lb per unit to lb per mmbtu if the factor is not already in units of lb/mmbtu @@ -1212,9 +1212,9 @@ def calculate_so2_from_fuel_consumption(gen_fuel_allocated, pudl_out, year): & ~gen_fuel_allocated["energy_source_code"].isin(CLEAN_FUELS) ] if len(missing_ef) > 0: - print("WARNING: SO2 emission factors are missing for the above records") - print("Missing factors for FC prime movers are currently expected") - print( + logger.warning("SO2 emission factors are missing for the above records") + logger.warning("Missing factors for FC prime movers are currently expected") + logger.warning("\n" + missing_ef[ [ "report_date", @@ -1223,7 +1223,7 @@ def calculate_so2_from_fuel_consumption(gen_fuel_allocated, pudl_out, year): "prime_mover_code", "generator_id", ] - ].drop_duplicates() + ].drop_duplicates().to_string() ) gen_fuel_allocated["so2_mass_lb"] = ( gen_fuel_allocated["fuel_consumed_mmbtu"] @@ -1375,13 +1375,11 @@ def calculate_generator_so2_ef_per_unit_from_boiler_type( ) ) if len(missing_so2_efs) > 0: - print(" ") - print( - "WARNING: SO2 emission factors are missing for the following boiler types. A prime mover-fuel level factor will be used if available." + logger.warning( + "SO2 emission factors are missing for the following boiler types. A prime mover-fuel level factor will be used if available." ) - print("Missing factors for FC prime movers are currently expected") - print(missing_so2_efs) - print(" ") + logger.warning("Missing factors for FC prime movers are currently expected") + logger.warning("\n" + missing_so2_efs.to_string()) gen_so2_factors = fill_missing_factors_based_on_pm_fuel( so2_emission_factors, gen_so2_factors ) @@ -1406,13 +1404,11 @@ def calculate_generator_so2_ef_per_unit_from_boiler_type( ) ) if len(missing_so2_efs) > 0: - print(" ") - print( - "WARNING: SO2 emission factors are missing for the following boiler types. An emission factor of zero will be used for these boilers." + logger.warning( + "SO2 emission factors are missing for the following boiler types. An emission factor of zero will be used for these boilers." 
) - print("Missing factors for FC prime movers are currently expected") - print(missing_so2_efs) - print(" ") + logger.warning("Missing factors for FC prime movers are currently expected") + logger.warning("\n" + missing_so2_efs.to_string()) gen_so2_factors["emission_factor"] = gen_so2_factors["emission_factor"].fillna(0) gen_so2_factors["multiply_by_sulfur_content"] = gen_so2_factors[ "multiply_by_sulfur_content" @@ -1564,8 +1560,8 @@ def adjust_so2_efs_for_fuel_sulfur_content(uncontrolled_so2_factors, pudl_out): & (uncontrolled_so2_factors["multiply_by_sulfur_content"] == 1) ] if len(missing_sulfur_content) > 0: - print("WARNING: Sulfur content data is missing in EIA-923 for the above units.") - print( + logger.warning("Sulfur content data is missing in EIA-923 for the above units.") + logger.warning("\n" + missing_sulfur_content[ [ "plant_id_eia", @@ -1573,7 +1569,7 @@ def adjust_so2_efs_for_fuel_sulfur_content(uncontrolled_so2_factors, pudl_out): "prime_mover_code", "energy_source_code", ] - ].drop_duplicates() + ].drop_duplicates().to_string() ) uncontrolled_so2_factors.loc[ uncontrolled_so2_factors["sulfur_content_pct"].isna() @@ -1637,7 +1633,7 @@ def load_so2_control_efficiencies(year): ] if len(bad_efficiencies) > 0: raise UserWarning( - "WARNING: certain loaded SO2 removal efficiencies are either negative or > 100%" + "certain loaded SO2 removal efficiencies are either negative or > 100%" ) return so2_efficiency diff --git a/src/filepaths.py b/src/filepaths.py index 066203ae..ec143b07 100644 --- a/src/filepaths.py +++ b/src/filepaths.py @@ -1,7 +1,6 @@ +"""Convenience functions for paths.""" import os -# Convenience functions for paths. - def top_folder(rel=""): """ diff --git a/src/gross_to_net_generation.py b/src/gross_to_net_generation.py index 4cb41781..0d6ed96c 100644 --- a/src/gross_to_net_generation.py +++ b/src/gross_to_net_generation.py @@ -15,6 +15,9 @@ import validation from column_checks import get_dtypes from filepaths import outputs_folder +from logging_util import get_logger + +logger = get_logger(__name__) def convert_gross_to_net_generation(cems, eia923_allocated, plant_attributes, year): @@ -89,10 +92,10 @@ def convert_gross_to_net_generation(cems, eia923_allocated, plant_attributes, ye & (cems["default_gtn_ratio"].isna()) ] if len(missing_defaults) > 0: - print( - "WARNING: The following subplants are missing default GTN ratios. Using a default value of 0.97" + logger.warning( + "The following subplants are missing default GTN ratios. 
Using a default value of 0.97" ) - print(missing_defaults[["plant_id_eia", "subplant_id"]].drop_duplicates()) + logger.warning("\n" + missing_defaults[["plant_id_eia", "subplant_id"]].drop_duplicates().to_string()) # if there is a missing default gtn ratio, fill with 0.97 cems["default_gtn_ratio"] = cems["default_gtn_ratio"].fillna(0.97) cems["net_generation_mwh"] = cems["net_generation_mwh"].fillna( @@ -721,12 +724,12 @@ def calculate_multiyear_gtn_factors(year, number_of_years): ) # add subplant ids to the data - print("Creating subplant IDs") + logger.info("Creating subplant IDs") cems_monthly, gen_fuel_allocated = data_cleaning.generate_subplant_ids( start_year, end_year, cems_monthly, gen_fuel_allocated ) - print("Calculating Gross to Net regressions and ratios") + logger.info("Calculating Gross to Net regressions and ratios") # perform regression at subplant level gross_to_net_regression( gross_gen_data=cems_monthly, @@ -772,7 +775,7 @@ def load_monthly_gross_and_net_generation(start_year, end_year): ) # allocate net generation and heat input to each generator-fuel grouping - print(" Allocating EIA-923 generation data") + logger.info(" Allocating EIA-923 generation data") gen_fuel_allocated = allocate_gen_fuel.allocate_gen_fuel_by_generator_energy_source( pudl_out, drop_interim_cols=True ) diff --git a/src/impute_hourly_profiles.py b/src/impute_hourly_profiles.py index 5d2cb9b7..2d9bb64c 100644 --- a/src/impute_hourly_profiles.py +++ b/src/impute_hourly_profiles.py @@ -7,6 +7,10 @@ from filepaths import manual_folder import validation import output_data +from logging_util import get_logger + +logger = get_logger(__name__) + # specify the ba numbers with leading zeros FUEL_NUMBERS = { @@ -112,7 +116,7 @@ def calculate_hourly_profiles( hourly_profiles["profile"] = hourly_profiles["flat_profile"] hourly_profiles["profile_method"] = "flat_profile" - print( + logger.info( "Summary of methods used to estimate missing hourly profiles (count of ba-months):" ) summary_table = ( @@ -144,7 +148,7 @@ def calculate_hourly_profiles( :, profile_methods, ] - print(summary_table) + logger.info("\n" + summary_table.to_string()) return hourly_profiles @@ -290,10 +294,10 @@ def aggregate_for_residual( (cems["fuel_category_eia930"].isna()) & (cems["net_generation_mwh"] != 0) ] if len(missing_fuel_category) > 0: - print( - "WARNING: The following cems subplants are missing fuel categories and will lead to incorrect residual calculations:" + logger.warning( + "The following cems subplants are missing fuel categories and will lead to incorrect residual calculations:" ) - print(missing_fuel_category[["plant_id_eia", "subplant_id"]].drop_duplicates()) + logger.warning("\n" + missing_fuel_category[["plant_id_eia", "subplant_id"]].drop_duplicates().to_string()) raise UserWarning( "The missing fuel categories must be fixed before proceeding." 
) @@ -706,7 +710,7 @@ def average_diba_wind_solar_profiles( ] if len(df_temporary) == 0 and not validation_run: # if this error is raised, we might have to implement an approach that uses average values for the wider region - print(f" There is no {fuel} data in the DIBAs for {ba}: {ba_dibas}") + logger.warning(f" There is no {fuel} data in the DIBAs for {ba}: {ba_dibas}") df_temporary = average_national_wind_solar_profiles( residual_profiles, ba, fuel, report_date ) @@ -1318,8 +1322,8 @@ def shape_partial_cems_plants(cems, eia923_allocated): | shaped_partial_plants["fuel_profile"].isna() ] if len(missing_profiles) > 0: - print( - "WARNING: Certain partial CEMS plants are missing hourly profile data. This will result in inaccurate results" + logger.warning( + "Certain partial CEMS plants are missing hourly profile data. This will result in inaccurate results" ) # check that all profiles add to 1 for each month incorrect_profiles = ( @@ -1334,8 +1338,8 @@ def shape_partial_cems_plants(cems, eia923_allocated): | (~np.isclose(incorrect_profiles["fuel_profile"], 1)) ] if len(incorrect_profiles) > 0: - print( - "WARNING: Certain partial CEMS profiles do not add to 100%. This will result in inaccurate results" + logger.warning( + "Certain partial CEMS profiles do not add to 100%. This will result in inaccurate results" ) # shape the profiles diff --git a/src/load_data.py b/src/load_data.py index c5b19a88..798c480b 100644 --- a/src/load_data.py +++ b/src/load_data.py @@ -9,6 +9,9 @@ from column_checks import get_dtypes from filepaths import downloads_folder, manual_folder, outputs_folder +from logging_util import get_logger + +logger = get_logger(__name__) def correct_epa_eia_plant_id_mapping(df): @@ -153,7 +156,7 @@ def load_cems_gross_generation(start_year, end_year): cems_all = [] for year in range(start_year, end_year + 1): - print(f" loading {year} CEMS data") + logger.info(f" loading {year} CEMS data") # specify the path to the CEMS data cems_path = downloads_folder( "pudl/pudl_data/parquet/epacems/hourly_emissions_epacems/" @@ -774,10 +777,10 @@ def load_emissions_controls_eia923(year: int): parse_dates=["report_date", "pm_test_date", "so2_test_date"], ) else: - print( - "WARNING: Emissions control data prior to 2014 has not been integrated into the data pipeline." + logger.warning( + "Emissions control data prior to 2014 has not been integrated into the data pipeline." ) - print( + logger.warning( "This may overestimate SO2 and NOx emissions calculated from EIA-923 data." ) emissions_controls_eia923 = pd.DataFrame( @@ -826,10 +829,10 @@ def load_boiler_control_id_association_eia860(year, pollutant): ) # return a blank dataframe if the data is not available else: - print( - "WARNING: Environmental association data prior to 2013 have not been integrated into the data pipeline." + logger.warning( + "Environmental association data prior to 2013 have not been integrated into the data pipeline." ) - print("This may result in less accurate pollutant emissions calculations.") + logger.warning("This may result in less accurate pollutant emissions calculations.") boiler_control_id_association_eia860 = pd.DataFrame( columns=boiler_association_eia860_names ) @@ -875,10 +878,10 @@ def load_boiler_design_parameters_eia860(year): ) # return a blank dataframe if the data is not available else: - print( - "WARNING: Boiler Design data prior to 2013 have not been integrated into the data pipeline." + logger.warning( + "Boiler Design data prior to 2013 have not been integrated into the data pipeline." 
)
-        print("This may result in less accurate NOx and SO2 emissions calculations.")
+        logger.warning("This may result in less accurate NOx and SO2 emissions calculations.")
         boiler_design_parameters_eia860 = pd.DataFrame(
             columns=list(boiler_design_parameters_eia860_names.values())
         )
diff --git a/src/logging_util.py b/src/logging_util.py
new file mode 100644
index 00000000..79bbaf12
--- /dev/null
+++ b/src/logging_util.py
@@ -0,0 +1,54 @@
+"""Configure logging for the OGE codebase."""
+import logging
+import coloredlogs
+
+
+def get_logger(name: str) -> logging.Logger:
+    """Helper function to prepend `oge` to the logger name and return a logger.
+
+    As a result, all returned loggers are children of the top-level `oge` logger.
+    """
+    return logging.getLogger(f"oge.{name}")
+
+
+def configure_root_logger(logfile: str | None = None, level: str = "INFO"):
+    """Configure the OGE logger to print to the console, and optionally to a file.
+
+    This function is safe to call multiple times, since it will check if logging
+    handlers have already been installed and skip them if so.
+
+    Logging is printed with the same format as PUDL:
+    ```
+    2023-02-21 16:10:44 [INFO] oge.test:21 This is an example
+    ```
+    """
+    root_logger = logging.getLogger()
+
+    # Unfortunately, the `gridemissions` package adds a handler to the root logger
+    # which means that the output of other loggers propagates up and is printed
+    # twice. Remove the root handlers to avoid this.
+    for handler in root_logger.handlers[:]:  # iterate over a copy, since we mutate the list
+        root_logger.removeHandler(handler)
+
+    oge_logger = logging.getLogger("oge")
+    log_format = "%(asctime)s [%(levelname)4s] %(name)s:%(lineno)s %(message)s"
+
+    # Direct the output of the OGE logger to the terminal (and color it). Make
+    # sure this hasn't been done already to avoid adding duplicate handlers.
+    if len(oge_logger.handlers) == 0:
+        coloredlogs.install(fmt=log_format, level=level, logger=oge_logger)
+        oge_logger.addHandler(logging.NullHandler())
+
+    # Send everything to the log file by adding a file handler to the root logger.
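+    # Note: the console handler is attached to the `oge` logger (so only OGE output
+    # is colored), while the file handler below goes on the root logger so that
+    # records from PUDL and other third-party loggers are captured in the file too.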
+    if logfile is not None:
+        file_logger = logging.FileHandler(logfile, mode='w')
+        file_logger.setFormatter(logging.Formatter(log_format))
+
+        # a freshly constructed handler is never `in` root_logger.handlers, so look
+        # for an existing FileHandler to keep repeated calls from adding duplicates
+        if not any(isinstance(h, logging.FileHandler) for h in root_logger.handlers):
+            root_logger.addHandler(file_logger)
+
diff --git a/src/output_data.py b/src/output_data.py
index 2319ad5e..23eb9824 100644
--- a/src/output_data.py
+++ b/src/output_data.py
@@ -7,6 +7,10 @@ import column_checks
 import validation
 from filepaths import outputs_folder, results_folder, data_folder
+from logging_util import get_logger
+
+logger = get_logger(__name__)
+
 
 GENERATED_EMISSION_RATE_COLS = [
     "generated_co2_rate_lb_per_mwh_for_electricity",
@@ -71,7 +75,7 @@ def zip_results_for_s3(year):
                         # skip the metric hourly plant data since we do not create those outputs
                         pass
                     else:
-                        print(f"zipping {year}_{data_type}_{aggregation}_{unit} for s3")
+                        logger.info(f"zipping {year}_{data_type}_{aggregation}_{unit} for s3")
                         folder = (
                             f"{results_folder()}/{year}/{data_type}/{aggregation}/{unit}"
                         )
@@ -101,7 +105,7 @@ def zip_data_for_zenodo(year):
     """
     os.makedirs(data_folder("zenodo"), exist_ok=True)
     for directory in ["outputs", "results"]:
-        print(f"zipping {directory}_{year} for zenodo")
+        logger.info(f"zipping {directory}_{year} for zenodo")
         shutil.make_archive(
             data_folder(f"zenodo/{directory}_{year}"),
             "zip",
@@ -113,7 +117,7 @@ def zip_data_for_zenodo(year):
 def output_intermediate_data(df, file_name, path_prefix, year, skip_outputs):
     column_checks.check_columns(df, file_name)
     if not skip_outputs:
-        print(f"   Exporting {file_name} to data/outputs")
+        logger.info(f"   Exporting {file_name} to data/outputs")
         df.to_csv(outputs_folder(f"{path_prefix}{file_name}_{year}.csv"), index=False)
 
 
@@ -122,7 +126,9 @@ def output_to_results(
 ):
     # Always check columns that should not be negative.
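+    # a "small" test run writes to data/results/small/, so it is detected from the
+    # output path prefix and used to soften the negative-value checks below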
small = "small" in path_prefix - print(f" Exporting {file_name} to data/results/{path_prefix}{subfolder}") + logger.info(f" Exporting {file_name} to data/results/{path_prefix}{subfolder}") if include_metric: metric = convert_results(df) @@ -149,7 +153,7 @@ def output_to_results( def output_data_quality_metrics(df, file_name, path_prefix, skip_outputs): if not skip_outputs: - print( + logger.info( f" Exporting {file_name} to data/results/{path_prefix}data_quality_metrics" ) @@ -412,7 +416,7 @@ def round_table(table): decimals[c] = abs(math.floor(math.log10(val))) + 2 # Always 3 sigfigs (for median) except ValueError: - print(val) + logger.error(val) raise Exception return table.round(decimals) @@ -455,8 +459,8 @@ def write_power_sector_results(ba_fuel_data, path_prefix, skip_outputs): if not skip_outputs: for ba in list(ba_fuel_data.ba_code.unique()): if type(ba) is not str: - print( - f"WARNING: not aggregating {sum(ba_fuel_data.ba_code.isna())} plants with numeric BA {ba}" + logger.warning( + f"not aggregating {sum(ba_fuel_data.ba_code.isna())} plants with numeric BA {ba}" ) continue diff --git a/src/validation.py b/src/validation.py index cb4574d7..83af09be 100644 --- a/src/validation.py +++ b/src/validation.py @@ -6,6 +6,9 @@ from emissions import CLEAN_FUELS from column_checks import get_dtypes from filepaths import downloads_folder, manual_folder +from logging_util import get_logger + +logger = get_logger(__name__) # DATA PIPELINE VALIDATION FUNCTIONS @@ -21,17 +24,17 @@ def validate_year(year): if year < earliest_validated_year: year_warning = f""" ################################################################################ - WARNING: The data pipeline has only been validated to work for years {earliest_validated_year}-{latest_validated_year}. + The data pipeline has only been validated to work for years {earliest_validated_year}-{latest_validated_year}. Running the pipeline for {year} may cause it to fail or may lead to poor-quality or anomalous results. To check on the progress of validating additional years of data, see: https://github.com/singularity-energy/open-grid-emissions/issues/117 ################################################################################ """ - print(year_warning) + logger.warning(year_warning) elif year > latest_validated_year: year_warning = f""" ################################################################################ - WARNING: The most recent available year of input data is currently {latest_validated_year}. + The most recent available year of input data is currently {latest_validated_year}. Input data for {year} should be available from the EIA in Fall {year+1} and we will work to validate that the pipeline works with {year} data as soon as possible after the data is released. 
@@ -72,20 +75,19 @@ def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated): | (~np.isclose(plant_total_diff["net_generation_mwh"], 0)) ] if len(mismatched_allocation) > 0: - print( - "WARNING: Allocated EIA-923 data doesn't match input data for the following plants:" - ) - print("Percentage Difference:") - print(mismatched_allocation) - print("EIA-923 Input Totals:") - print(plant_total_gf.loc[mismatched_allocation.index, :]) - print("Allocated Totals:") - print(plant_total_alloc.loc[mismatched_allocation.index, :]) + logger.warning("Allocated EIA-923 doesn't match input data for plants:") + logger.warning("Percentage Difference:") + logger.warning("\n" + mismatched_allocation.to_string()) + logger.warning("EIA-923 Input Totals:") + logger.warning("\n" + plant_total_gf.loc[mismatched_allocation.index, :].to_string()) + logger.warning("Allocated Totals:") + logger.warning("\n" + plant_total_alloc.loc[mismatched_allocation.index, :].to_string()) + def test_for_negative_values(df, small: bool = False): """Checks that there are no unexpected negative values in the data.""" - print(" Checking that fuel and emissions values are positive... ", end="") + logger.info("Checking that fuel and emissions values are positive... ") columns_that_should_be_positive = [ "fuel_consumed_mmbtu", "fuel_consumed_for_electricity_mmbtu", @@ -145,29 +147,26 @@ def test_for_negative_values(df, small: bool = False): for column in columns_to_test: negative_test = df[df[column] < 0] if not negative_test.empty: - print(" ") - print( - f"WARNING: There are {len(negative_test)} records where {column} is negative." + logger.warning( + f"There are {len(negative_test)} records where {column} is negative." ) negative_warnings += 1 if negative_warnings > 0: if small: - print( + logger.warning( " Found negative values during small run, these may be fixed with full data" ) else: - print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") - print("WARNING: The above negative values are errors and must be fixed") - print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + logger.warning("The above negative values are errors and must be fixed!") # raise UserWarning("The above negative values are errors and must be fixed") else: - print("OK") + logger.info("OK") return negative_test def test_for_missing_values(df, small: bool = False): """Checks that there are no unexpected missing values in the output data.""" - print(" Checking that no values are missing... ", end="") + logger.info("Checking that no values are missing... ") columns_that_should_be_complete = [ "plant_id_eia", "fuel_category", @@ -221,60 +220,53 @@ def test_for_missing_values(df, small: bool = False): for column in columns_to_test: missing_test = df[df[column].isna()] if not missing_test.empty: - print(" ") - print( - f"WARNING: There are {len(missing_test)} records where {column} is missing." + logger.warning( + f"There are {len(missing_test)} records where {column} is missing." 
) missing_warnings += 1 if missing_warnings > 0: if small: - print( + logger.warning( " Found missing values during small run, these may be fixed with full data" ) else: - print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") - print("WARNING: The above missing values are errors and must be fixed") - print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + logger.warning("The above missing values are errors and must be fixed") + logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") else: - print("OK") + logger.info("OK") return missing_test def test_chp_allocation(df): """Checks that the CHP allocation didn't create any anomalous values.""" - print( - " Checking that total fuel consumed >= fuel consumed for electricity... ", - end="", - ) + logger.info("Checking that total fuel consumed >= fuel consumed for electricity... ") chp_allocation_test = df[ df["fuel_consumed_for_electricity_mmbtu"] > df["fuel_consumed_mmbtu"] ] if not chp_allocation_test.empty: raise UserWarning( - f"WARNING: There are {len(chp_allocation_test)} records where fuel consumed for electricity is greater than total fuel consumption. Check `chp_allocation_test` for complete list" + f"There are {len(chp_allocation_test)} records where fuel consumed for electricity is greater than total fuel consumption. Check `chp_allocation_test` for complete list" ) else: - print("OK") + logger.info("OK") return chp_allocation_test def test_for_missing_energy_source_code(df): """Checks that there are no missing energy source codes associated with non-zero fuel consumption.""" - print( - " Checking that there are no missing energy source codes associated with non-zero fuel consumption... ", - end="", - ) + logger.info( + "Checking that there are no missing energy source codes associated with non-zero fuel consumption... ") missing_esc_test = df[ (df["energy_source_code"].isna()) & (df["fuel_consumed_mmbtu"] > 0) ] if not missing_esc_test.empty: - print(" ") - print( - f"WARNING: There are {len(missing_esc_test)} records where there is a missing energy source code associated with non-zero fuel consumption. Check `missing_esc_test` for complete list" + logger.warning( + f"There are {len(missing_esc_test)} records where there is a missing energy source code associated with non-zero fuel consumption. Check `missing_esc_test` for complete list" ) else: - print("OK") + logger.info("OK") return missing_esc_test @@ -330,24 +322,20 @@ def check_removed_data_is_empty(cems): def test_for_missing_subplant_id(df): """Checks if any records are missing a `subplant_id`.""" - print(" Checking that all data has an associated `subplant_id`... ", end="") + logger.info("Checking that all data has an associated `subplant_id`... ") missing_subplant_test = df[df["subplant_id"].isna()] if not missing_subplant_test.empty: - print(" ") - print( - f"WARNING: There are {len(missing_subplant_test)} records for {len(missing_subplant_test[['plant_id_eia']].drop_duplicates())} plants without a subplant ID. See `missing_subplant_test` for details" + logger.warning( + f"There are {len(missing_subplant_test)} records for {len(missing_subplant_test[['plant_id_eia']].drop_duplicates())} plants without a subplant ID. 
See `missing_subplant_test` for details" ) else: - print("OK") + logger.info("OK") return missing_subplant_test def validate_gross_to_net_conversion(cems, eia923_allocated): """checks whether the calculated net generation matches the reported net generation from EIA-923 at the annual plant level.""" - print( - " Checking that calculated net generation matches reported net generation in EIA-923... ", - end="", - ) + logger.info("Checking that calculated net generation matches reported net generation in EIA-923... ") # merge together monthly subplant totals from EIA and calculated from CEMS eia_netgen = ( eia923_allocated.groupby( @@ -389,22 +377,18 @@ def validate_gross_to_net_conversion(cems, eia923_allocated): cems_net_not_equal_to_eia = validated_ng[validated_ng["pct_error"] != 0] if len(cems_net_not_equal_to_eia) > 0: - print(" ") - print( - f"WARNING: There are {len(cems_net_not_equal_to_eia)} plants where calculated annual net generation does not match EIA annual net generation." + logger.warning( + f"There are {len(cems_net_not_equal_to_eia)} plants where calculated annual net generation does not match EIA annual net generation." ) - print(cems_net_not_equal_to_eia) + logger.warning("\n" + cems_net_not_equal_to_eia.to_string()) else: - print("OK") + logger.info("OK") def test_emissions_adjustments(df): """For each emission, tests that mass_lb >= mass_lb_for_electricity >= mass_lb_for_electricity_adjusted.""" - print( - " Checking that adjusted emission values are less than total emissions... ", - end="", - ) + logger.info("Checking that adjusted emission values are less than total emissions... ") pollutants = ["co2", "ch4", "n2o", "co2e", "nox", "so2"] @@ -416,8 +400,8 @@ def test_emissions_adjustments(df): (df[f"{pollutant}_mass_lb"] < df[f"{pollutant}_mass_lb_for_electricity"]) ] if len(bad_adjustment) > 0: - print( - f"WARNING: There are {len(bad_adjustment)} records where {pollutant}_mass_lb_for_electricity > {pollutant}_mass_lb" + logger.warning( + f"There are {len(bad_adjustment)} records where {pollutant}_mass_lb_for_electricity > {pollutant}_mass_lb" ) bad_adjustment += 1 @@ -426,8 +410,8 @@ def test_emissions_adjustments(df): (df[f"{pollutant}_mass_lb"] < df[f"{pollutant}_mass_lb_adjusted"]) ] if len(bad_adjustment) > 0: - print( - f"WARNING: There are {len(bad_adjustment)} records where {pollutant}_mass_lb_adjusted > {pollutant}_mass_lb" + logger.warning( + f"There are {len(bad_adjustment)} records where {pollutant}_mass_lb_adjusted > {pollutant}_mass_lb" ) bad_adjustment += 1 @@ -439,9 +423,8 @@ def test_emissions_adjustments(df): ) ] if len(bad_adjustment) > 0: - print(" ") - print( - f"WARNING: There are {len(bad_adjustment)} records where {pollutant}_mass_lb_for_electricity_adjusted > {pollutant}_mass_lb_for_electricity" + logger.warning( + f"There are {len(bad_adjustment)} records where {pollutant}_mass_lb_for_electricity_adjusted > {pollutant}_mass_lb_for_electricity" ) bad_adjustment += 1 @@ -449,7 +432,7 @@ def test_emissions_adjustments(df): if bad_adjustments > 0: raise UserWarning("The above issues with emissions adjustments must be fixed.") else: - print("OK") + logger.info("OK") def ensure_non_overlapping_data_from_all_sources( @@ -457,7 +440,7 @@ def ensure_non_overlapping_data_from_all_sources( ): """Ensures that there is no duplicated subplant-months from each of the four sources of cleaned data.""" - print(" Checking that all data to be combined is unique... ", end="") + logger.info("Checking that all data to be combined is unique... 
") if "hourly_data_source" in eia_data.columns: eia_only_data = eia_data.loc[ @@ -520,69 +503,62 @@ def ensure_non_overlapping_data_from_all_sources( (data_overlap["in_eia"] == 1) & (data_overlap["in_cems"] == 1) ] if len(eia_cems_overlap) > 0: - print(" ") - print( - f"WARNING: There are {len(eia_cems_overlap)} subplant-months that exist in both shaped EIA data and CEMS" + logger.warning( + f"There are {len(eia_cems_overlap)} subplant-months that exist in both shaped EIA data and CEMS" ) eia_pcs_overlap = data_overlap[ (data_overlap["in_eia"] == 1) & (data_overlap["in_partial_cems_subplant"] == 1) ] if len(eia_pcs_overlap) > 0: - print(" ") - print( - f"WARNING: There are {len(eia_pcs_overlap)} subplant-months that exist in both shaped EIA data and partial CEMS data" + logger.warning( + f"There are {len(eia_pcs_overlap)} subplant-months that exist in both shaped EIA data and partial CEMS data" ) cems_pcs_overlap = data_overlap[ (data_overlap["in_cems"] == 1) & (data_overlap["in_partial_cems_subplant"] == 1) ] if len(cems_pcs_overlap) > 0: - print(" ") - print( - f"WARNING: There are {len(cems_pcs_overlap)} subplant-months that exist in both CEMS data and partial CEMS data" + logger.warning( + f"There are {len(cems_pcs_overlap)} subplant-months that exist in both CEMS data and partial CEMS data" ) eia_pcp_overlap = data_overlap[ (data_overlap["in_eia"] == 1) & (data_overlap["in_partial_cems_plant"] == 1) ] if len(eia_pcp_overlap) > 0: - print(" ") - print( - f"WARNING: There are {len(eia_pcp_overlap)} subplant-months that exist in both shaped EIA data and partial CEMS data" + logger.warning( + f"There are {len(eia_pcp_overlap)} subplant-months that exist in both shaped EIA data and partial CEMS data" ) cems_pcp_overlap = data_overlap[ (data_overlap["in_cems"] == 1) & (data_overlap["in_partial_cems_plant"] == 1) ] if len(cems_pcp_overlap) > 0: - print(" ") - print( - f"WARNING: There are {len(cems_pcp_overlap)} subplant-months that exist in both CEMS data and partial CEMS data" + logger.warning( + f"There are {len(cems_pcp_overlap)} subplant-months that exist in both CEMS data and partial CEMS data" ) pcs_pcp_overlap = data_overlap[ (data_overlap["in_partial_cems_subplant"] == 1) & (data_overlap["in_partial_cems_plant"] == 1) ] if len(pcs_pcp_overlap) > 0: - print(" ") - print( - f"WARNING: There are {len(pcs_pcp_overlap)} subplant-months that exist in both CEMS data and partial CEMS data" + logger.warning( + f"There are {len(pcs_pcp_overlap)} subplant-months that exist in both CEMS data and partial CEMS data" ) all_overlap = data_overlap[data_overlap["number_of_locations"] == 4] if len(all_overlap) > 0: - print(" ") - print( - f"WARNING: There are {len(all_overlap)} subplant-months that exist in shaped EIA data, CEMS data, and partial CEMS data." + logger.warning( + f"There are {len(all_overlap)} subplant-months that exist in shaped EIA data, CEMS data, and partial CEMS data." ) raise UserWarning("The above overlaps must be fixed before proceeding.") else: - print("OK") + logger.info("OK") def validate_shaped_totals(shaped_eia_data, monthly_eia_data_to_shape, group_keys): """Checks that any shaped monthly data still adds up to the monthly total after shaping.""" - print(" Checking that shaped hourly data matches monthly totals... ", end="") + logger.info("Checking that shaped hourly data matches monthly totals... 
") monthly_group_keys = group_keys + ["report_date"] @@ -598,18 +574,17 @@ def validate_shaped_totals(shaped_eia_data, monthly_eia_data_to_shape, group_key compare = (shaped_data_agg - eia_data_agg).round(0) if compare.sum().sum() > 0: - print(" ") - print( + logger.warning("\n" + compare[ (compare["net_generation_mwh"] != 0) | (compare["fuel_consumed_mmbtu"] != 0) - ] + ].to_string() ) raise UserWarning( "The data shaping process is changing the monthly total values compared to reported EIA values. This process should only shape the data, not alter it." ) else: - print("OK") + logger.info("OK") def validate_unique_datetimes(df, df_name, keys): @@ -626,7 +601,7 @@ def validate_unique_datetimes(df, df_name, keys): df.duplicated(subset=(keys + [datetime_column]), keep=False) ] if len(duplicate_dt) > 0: - print(duplicate_dt) + logger.warning("\n" + duplicate_dt.to_string()) raise UserWarning( f"The dataframe {df_name} contains duplicate {datetime_column} values within each group of {keys}. See above output" ) @@ -840,7 +815,7 @@ def identify_percent_of_data_by_input_source( source_of_input_data = [] for name, df in data_sources.items(): if len(df) == 0: # Empty df. May occur when running `small` - print(f"WARNING: data source {name} has zero entries") + logger.warning(f"data source {name} has zero entries") continue if name == "eia": subplant_data = df.groupby( @@ -1380,8 +1355,8 @@ def check_for_anomalous_co2_factors( on="plant_id_eia", validate="m:1", ) - print("Potentially anomalous co2 factors detected for the following plants:") - print( + logger.warning("Potentially anomalous co2 factors detected for the following plants:") + logger.warning("\n" + factor_anomaly[ [ "plant_id_eia", @@ -1391,7 +1366,7 @@ def check_for_anomalous_co2_factors( f"{pollutant}_mass_lb_for_electricity", factor, ] - ].sort_values(by=factor) + ].sort_values(by=factor).to_string() ) @@ -1408,8 +1383,8 @@ def test_for_missing_fuel(df, generation_column): ) ] if not missing_fuel_test.empty: - print( - f"WARNING: There are {len(missing_fuel_test)} records where {generation_column} is positive but no fuel consumption is reported. Check `missing_fuel_test` for complete list" + logger.warning( + f"There are {len(missing_fuel_test)} records where {generation_column} is positive but no fuel consumption is reported. Check `missing_fuel_test` for complete list" ) return missing_fuel_test @@ -1418,8 +1393,8 @@ def test_for_missing_fuel(df, generation_column): def test_for_missing_co2(df): missing_co2_test = df[df["co2_mass_lb"].isna() & ~df["fuel_consumed_mmbtu"].isna()] if not missing_co2_test.empty: - print( - f"WARNING: There are {len(missing_co2_test)} records where co2 data is missing. Check `missing_co2_test` for complete list" + logger.warning( + f"There are {len(missing_co2_test)} records where co2 data is missing. Check `missing_co2_test` for complete list" ) return missing_co2_test @@ -1427,8 +1402,8 @@ def test_for_missing_co2(df): def test_for_missing_data(df, columns_to_test): missing_data_test = df[df[columns_to_test].isnull().all(axis=1)] if not missing_data_test.empty: - print( - f"WARNING: There are {len(missing_data_test)} records for which no data was reported. Check `missing_data_test` for complete list" + logger.warning( + f"There are {len(missing_data_test)} records for which no data was reported. 
Check `missing_data_test` for complete list" ) return missing_data_test @@ -1453,15 +1428,15 @@ def test_for_missing_incorrect_prime_movers(df, year): != incorrect_pm_test["prime_mover_code_eia860"] ] if not incorrect_pm_test.empty: - print( - f"WARNING: There are {len(incorrect_pm_test)} records for which the allocated prime mover does not match the reported prime mover. Check `incorrect_pm_test` for complete list" + logger.warning( + f"There are {len(incorrect_pm_test)} records for which the allocated prime mover does not match the reported prime mover. Check `incorrect_pm_test` for complete list" ) # check for missing PM code missing_pm_test = df[df["prime_mover_code"].isna()] if not missing_pm_test.empty: - print( - f"WARNING: There are {len(missing_pm_test)} records for which no prime mover was assigned. Check `missing_pm_test` for complete list" + logger.warning( + f"There are {len(missing_pm_test)} records for which no prime mover was assigned. Check `missing_pm_test` for complete list" ) return incorrect_pm_test, missing_pm_test @@ -1469,7 +1444,7 @@ def test_for_missing_incorrect_prime_movers(df, year): def test_for_outlier_heat_rates(df): # check heat rates - print("Heat Rate Test") + logger.warning("Heat Rate Test") # remove non-fossil fuel types thermal_generators = df[ ~df["energy_source_code"].isin(["SUN", "MWH", "WND", "WAT", "WH", "PUR"]) @@ -1508,10 +1483,10 @@ def test_for_outlier_heat_rates(df): ) ] if not heat_rate_test.empty: - print( - f" WARNING: {len(heat_rate_test)} of {len(generators_with_pm)} records for {fuel_type} generators with {pm} prime mover have heat rate of zero or > {outlier_threshold.round(2)} mmbtu/MWh" + logger.warning( + f"{len(heat_rate_test)} of {len(generators_with_pm)} records for {fuel_type} generators with {pm} prime mover have heat rate of zero or > {outlier_threshold.round(2)} mmbtu/MWh" ) - print( + logger.warning( f' median = {heat_rate_stats["50%"].round(2)}, max = {heat_rate_stats["max"].round(2)}, min = {heat_rate_stats["min"].round(2)}' ) heat_rate_test_all.append(heat_rate_test) @@ -1541,8 +1516,8 @@ def test_for_zero_data(df, columns_to_test): & (df[columns_to_test].sum(axis=1) == 0) ] if not zero_data_test.empty: - print( - f"WARNING: There are {len(zero_data_test)} records where all operating data are zero. Check `zero_data_test` for complete list" + logger.warning( + f"There are {len(zero_data_test)} records where all operating data are zero. Check `zero_data_test` for complete list" ) return zero_data_test @@ -1550,8 +1525,8 @@ def test_for_zero_data(df, columns_to_test): def test_gtn_results(df): gtn_test = df[df["net_generation_mwh"] > df["gross_generation_mwh"]] if not gtn_test.empty: - print( - f"WARNING: There are {round(len(gtn_test)/len(df)*100, 1)}% of records where net generation > gross generation. See `gtn_test` for details" + logger.warning( + f"There are {round(len(gtn_test)/len(df)*100, 1)}% of records where net generation > gross generation. 
See `gtn_test` for details" ) return gtn_test diff --git a/src/visualization.py b/src/visualization.py index 3121adf6..18c626bf 100644 --- a/src/visualization.py +++ b/src/visualization.py @@ -1,7 +1,4 @@ -""" -Helper functions for visualization - -""" +"""Helper functions for visualization.""" import pandas as pd import plotly.express as px diff --git a/test/test_logging.py b/test/test_logging.py new file mode 100644 index 00000000..f905175f --- /dev/null +++ b/test/test_logging.py @@ -0,0 +1,33 @@ +import sys +import logging + +import pandas as pd + +sys.path.append('../src') +sys.path.append('..') + +import src.eia930 as eia930 +from src.filepaths import top_folder + +from src.logging_util import get_logger, configure_root_logger + +pudl_logger = logging.getLogger(name="catalystcoop.pudl") + +configure_root_logger(logfile=top_folder('test/test_logfile.txt'), level=logging.INFO) +# If you call this again, nothing bad should happen. Logging statements should +# still only show up once. +configure_root_logger(logfile=top_folder('test/test_logfile.txt'), level=logging.INFO) +logger = get_logger('test') + + +def main(): + """These statements should each be printed once in a nice format.""" + logger.info('This is the OGE logger') + pudl_logger.info('This is the PUDL logger') + + df = pd.DataFrame({"a": [1,2,3], "b": [4,5,6]}) + logger.info("\n" + df.to_string()) + + +if __name__ == '__main__': + main() From 6c9d636eebf2e81aa905cfb58a9db73bf0dbc78c Mon Sep 17 00:00:00 2001 From: grgmiller Date: Thu, 23 Feb 2023 19:19:25 -0800 Subject: [PATCH 14/27] convert print to logger --- src/consumed.py | 8 ++++---- src/download_data.py | 9 +-------- src/validation.py | 14 ++++++-------- 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/src/consumed.py b/src/consumed.py index 65f0f037..38b25855 100644 --- a/src/consumed.py +++ b/src/consumed.py @@ -183,9 +183,9 @@ def consumption_emissions(F, P, ID): for j in perturbed: if X[j] != 0.0: - print(b[j]) - print(np.abs(A[j, :]).sum()) - print(np.abs(A[:, j]).sum()) + logger.warning("\n" + b[j].to_string()) + logger.warning("\n" + np.abs(A[j, :]).sum()) + logger.warning("\n" + np.abs(A[:, j]).sum()) raise ValueError("X[%d] is %.2f instead of 0" % (j, X[j])) return X, len(perturbed) @@ -486,7 +486,7 @@ def run(self): for adj in ADJUSTMENTS: total_failed = 0 col = get_rate_column(pol, adjustment=adj, generated=False) - print(f"{pol}, {adj}", end="...") + logger.info(f"Solving consumed {pol} {adj} emissions...") # Calculate emissions for date in self.generation.index: if self.small and ( diff --git a/src/download_data.py b/src/download_data.py index 151564fb..07491f51 100644 --- a/src/download_data.py +++ b/src/download_data.py @@ -109,18 +109,11 @@ def download_pudl_data(zenodo_url: str): def download_pudl(zenodo_url, pudl_version): r = requests.get(zenodo_url, params={"download": "1"}, stream=True) # specify parameters for progress bar - total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 * 1024 * 10 # 10 MB - downloaded = 0 + logger.info(" Downloading PUDL data...") with open(downloads_folder("pudl.tgz"), "wb") as fd: for chunk in r.iter_content(chunk_size=block_size): - print( - f" Downloading PUDL. Progress: {(round(downloaded/total_size_in_bytes*100,2))}% \r", - end="", - ) fd.write(chunk) - downloaded += block_size - logger.info(" Downloading PUDL. 
Progress: 100.0%")
 
     # extract the tgz file
     logger.info("    Extracting PUDL data...")
diff --git a/src/validation.py b/src/validation.py
index 83af09be..c0f4d94f 100644
--- a/src/validation.py
+++ b/src/validation.py
@@ -273,9 +273,8 @@ def test_for_missing_energy_source_code(df):
 
 def check_non_missing_cems_co2_values_unchanged(cems_original, cems):
     """Checks that no non-missing CO2 values were modified during the process of filling."""
-    print(
+    logger.info(
         " Checking that original CO2 data in CEMS was not modified by filling missing values...",
-        end="",
     )
     # only keep non-zero and non-missing co2 values, since these should have not been modified
    cems_original = cems_original.loc[
@@ -294,12 +293,11 @@ def check_non_missing_cems_co2_values_unchanged(cems_original, cems):
     )
     num_nonzero_rows = len(test_fill[test_fill["diff"] != 0])
     if num_nonzero_rows > 0:
-        print(" ")
-        print(
-            f"WARNING: There are {num_nonzero_rows} non-missing CO2 CEMS records that were modified by `fill_cems_missing_co2` in error"
+        logger.warning(
+            f"There are {num_nonzero_rows} non-missing CO2 CEMS records that were modified by `fill_cems_missing_co2` in error"
         )
     else:
-        print("OK")
+        logger.info("OK")
 
 
 def check_removed_data_is_empty(cems):
@@ -316,8 +314,8 @@ def check_removed_data_is_empty(cems):
         ],
     ].sum(numeric_only=True)
     if check_that_data_is_zero.sum() > 0:
-        print("WARNING: Some data being removed has non-zero data associated with it:")
-        print(check_that_data_is_zero)
+        logger.warning("Some data being removed has non-zero data associated with it:")
+        logger.warning("\n" + check_that_data_is_zero.to_string())
 
From f9a5a8feb9553d22fba6ce42de05896ee4d4720d Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Thu, 23 Feb 2023 19:23:24 -0800
Subject: [PATCH 15/27] remove exclamation

---
 src/consumed.py   | 6 +++---
 src/validation.py | 4 +---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/consumed.py b/src/consumed.py
index 38b25855..5ff7cce8 100644
--- a/src/consumed.py
+++ b/src/consumed.py
@@ -184,8 +184,8 @@ def consumption_emissions(F, P, ID):
         for j in perturbed:
             if X[j] != 0.0:
-                logger.warning("\n" + b[j].to_string())
-                logger.warning("\n" + np.abs(A[j, :]).sum())
-                logger.warning("\n" + np.abs(A[:, j]).sum())
+                logger.warning("\n" + str(b[j]))
+                logger.warning("\n" + str(np.abs(A[j, :]).sum()))
+                logger.warning("\n" + str(np.abs(A[:, j]).sum()))
                 raise ValueError("X[%d] is %.2f instead of 0" % (j, X[j]))
 
     return X, len(perturbed)
diff --git a/src/validation.py b/src/validation.py
index c0f4d94f..a50535a0 100644
--- a/src/validation.py
+++ b/src/validation.py
@@ -227,12 +227,10 @@ def test_for_missing_values(df, small: bool = False):
     if missing_warnings > 0:
         if small:
             logger.warning(
-                " Found missing values during small run, these may be fixed with full data"
+                "Found missing values during small run, these may be fixed with full data"
             )
         else:
-            logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
             logger.warning("The above missing values are errors and must be fixed")
-            logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
 
From 80923b689cc65e81308837e646ed56c60257c683 Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Thu, 23 Feb 2023 20:22:04 -0800
Subject: [PATCH 16/27] expand validation coverage

---
 src/data_cleaning.py           |   9 +-
 src/gross_to_net_generation.py |   8 +-
 src/impute_hourly_profiles.py  |  26 +++--
 src/load_data.py               |  11 ++-
 src/output_data.py             |   7 +-
 src/validation.py              | 174 
+++++++++------------------------ 6 files changed, 95 insertions(+), 140 deletions(-) diff --git a/src/data_cleaning.py b/src/data_cleaning.py index f11330ac..263f15ea 100644 --- a/src/data_cleaning.py +++ b/src/data_cleaning.py @@ -545,7 +545,8 @@ def update_energy_source_codes(df): (df["energy_source_code"] == "OTH") & (df["fuel_consumed_mmbtu"] > 0) ] if len(plants_with_other_fuel) > 0: - logger.warning(f""" + logger.warning( + f""" After cleaning energy source codes, some fuel consumption is still associated with an 'OTH' fuel type. This will lead to incorrect emissions calculations. Check the following plants: {list(plants_with_other_fuel.plant_id_eia.unique())} @@ -574,6 +575,7 @@ def create_primary_fuel_table(gen_fuel_allocated, pudl_out, add_subplant_id, yea on=["plant_id_eia", "generator_id"], validate="m:1", ) + validation.test_for_missing_subplant_id(gen_fuel_allocated) # get a table of primary energy source codes gen_primary_fuel = gen_fuel_allocated[ @@ -763,6 +765,7 @@ def calculate_capacity_based_primary_fuel(pudl_out, agg_keys, year): on=["plant_id_eia", "generator_id"], validate="m:1", ) + validation.test_for_missing_subplant_id(gen_capacity) gen_capacity = ( gen_capacity.groupby(agg_keys + ["energy_source_code_1"], dropna=False)[ @@ -811,6 +814,7 @@ def calculate_subplant_efs(gen_fuel_allocated, year): on=["plant_id_eia", "generator_id"], validate="m:1", ) + validation.test_for_missing_subplant_id(subplant_efs) # calculate the total emissions and fuel consumption by subplant-month subplant_efs = subplant_efs.groupby( @@ -998,6 +1002,9 @@ def clean_cems(year: int, small: bool, primary_fuel_table, subplant_emission_fac cems = remove_cems_with_zero_monthly_data(cems) validation.test_for_negative_values(cems) + validation.validate_unique_datetimes( + cems, "cems", ["plant_id_eia", "emissions_unit_id_epa"] + ) cems = apply_dtypes(cems) diff --git a/src/gross_to_net_generation.py b/src/gross_to_net_generation.py index 0d6ed96c..b574ec20 100644 --- a/src/gross_to_net_generation.py +++ b/src/gross_to_net_generation.py @@ -95,7 +95,12 @@ def convert_gross_to_net_generation(cems, eia923_allocated, plant_attributes, ye logger.warning( "The following subplants are missing default GTN ratios. 
Using a default value of 0.97" ) - logger.warning("\n" + missing_defaults[["plant_id_eia", "subplant_id"]].drop_duplicates().to_string()) + logger.warning( + "\n" + + missing_defaults[["plant_id_eia", "subplant_id"]] + .drop_duplicates() + .to_string() + ) # if there is a missing default gtn ratio, fill with 0.97 cems["default_gtn_ratio"] = cems["default_gtn_ratio"].fillna(0.97) cems["net_generation_mwh"] = cems["net_generation_mwh"].fillna( @@ -425,6 +430,7 @@ def calculate_subplant_nameplate_capacity(year): on=["plant_id_eia", "generator_id"], validate="1:1", ) + validation.test_for_missing_subplant_id(gen_capacity) subplant_capacity = ( gen_capacity.groupby(["plant_id_eia", "subplant_id"])["capacity_mw"] .sum() diff --git a/src/impute_hourly_profiles.py b/src/impute_hourly_profiles.py index 2d9bb64c..6c064a65 100644 --- a/src/impute_hourly_profiles.py +++ b/src/impute_hourly_profiles.py @@ -282,6 +282,9 @@ def aggregate_for_residual( # add the partial cems data cems = pd.concat([cems, partial_cems_subplant, partial_cems_plant], axis=0) + validation.validate_unique_datetimes( + cems, "cems_for_residual", ["plant_id_eia", "subplant_id"] + ) # merge in plant attributes cems = cems.merge(plant_attributes, how="left", on="plant_id_eia", validate="m:1") @@ -297,7 +300,12 @@ def aggregate_for_residual( logger.warning( "The following cems subplants are missing fuel categories and will lead to incorrect residual calculations:" ) - logger.warning("\n" + missing_fuel_category[["plant_id_eia", "subplant_id"]].drop_duplicates().to_string()) + logger.warning( + "\n" + + missing_fuel_category[["plant_id_eia", "subplant_id"]] + .drop_duplicates() + .to_string() + ) raise UserWarning( "The missing fuel categories must be fixed before proceeding." ) @@ -657,6 +665,9 @@ def impute_missing_hourly_profiles( hourly_profiles["datetime_utc"] = pd.to_datetime( hourly_profiles["datetime_utc"], utc=True ) + validation.validate_unique_datetimes( + hourly_profiles, "hourly_profiles", ["ba_code", "fuel_category"] + ) return hourly_profiles @@ -1035,12 +1046,6 @@ def combine_and_export_hourly_plant_data( df_name="shaped_eia_data", keys=["plant_id_eia"], ) - # validate that the shaping did not alter data at the monthly level - validation.validate_shaped_totals( - shaped_eia_region_data, - eia_region, - group_keys=[region_to_group, "fuel_category"], - ) # concat all of the data together combined_plant_data = pd.concat( @@ -1230,6 +1235,13 @@ def shape_monthly_eia_data_as_hourly(monthly_eia_data_to_shape, hourly_profiles) [col for col in column_order if col in shaped_monthly_data.columns] ] + # validate that the shaping did not alter data at the monthly level + validation.validate_shaped_totals( + shaped_monthly_data, + monthly_eia_data_to_shape, + group_keys=["ba_code", "fuel_category"], + ) + return shaped_monthly_data diff --git a/src/load_data.py b/src/load_data.py index 798c480b..81c84e3d 100644 --- a/src/load_data.py +++ b/src/load_data.py @@ -9,6 +9,7 @@ from column_checks import get_dtypes from filepaths import downloads_folder, manual_folder, outputs_folder +from validation import validate_unique_datetimes from logging_util import get_logger logger = get_logger(__name__) @@ -112,6 +113,8 @@ def load_cems_data(year): } ) + validate_unique_datetimes(cems, "cems", ["plant_id_eia", "emissions_unit_id_epa"]) + return cems @@ -832,7 +835,9 @@ def load_boiler_control_id_association_eia860(year, pollutant): logger.warning( "Environmental association data prior to 2013 have not been integrated into the data pipeline." 
) - logger.warning("This may result in less accurate pollutant emissions calculations.") + logger.warning( + "This may result in less accurate pollutant emissions calculations." + ) boiler_control_id_association_eia860 = pd.DataFrame( columns=boiler_association_eia860_names ) @@ -881,7 +886,9 @@ def load_boiler_design_parameters_eia860(year): logger.warning( "Boiler Design data prior to 2013 have not been integrated into the data pipeline." ) - logger.warning("This may result in less accurate NOx and SO2 emissions calculations.") + logger.warning( + "This may result in less accurate NOx and SO2 emissions calculations." + ) boiler_design_parameters_eia860 = pd.DataFrame( columns=list(boiler_design_parameters_eia860_names.values()) ) diff --git a/src/output_data.py b/src/output_data.py index 23eb9824..6626cc96 100644 --- a/src/output_data.py +++ b/src/output_data.py @@ -75,7 +75,9 @@ def zip_results_for_s3(year): # skip the metric hourly plant data since we do not create those outputs pass else: - logger.info(f"zipping {year}_{data_type}_{aggregation}_{unit} for s3") + logger.info( + f"zipping {year}_{data_type}_{aggregation}_{unit} for s3" + ) folder = ( f"{results_folder()}/{year}/{data_type}/{aggregation}/{unit}" ) @@ -176,6 +178,9 @@ def output_plant_data(df, path_prefix, resolution, skip_outputs, plant_attribute if not skip_outputs: if resolution == "hourly": # output hourly data + validation.validate_unique_datetimes( + df, "individual_plant_data", ["plant_id_eia"] + ) # Separately save real and aggregate plants output_to_results( df[df.plant_id_eia > 900000], diff --git a/src/validation.py b/src/validation.py index 83af09be..5ab57e8f 100644 --- a/src/validation.py +++ b/src/validation.py @@ -79,78 +79,35 @@ def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated): logger.warning("Percentage Difference:") logger.warning("\n" + mismatched_allocation.to_string()) logger.warning("EIA-923 Input Totals:") - logger.warning("\n" + plant_total_gf.loc[mismatched_allocation.index, :].to_string()) + logger.warning( + "\n" + plant_total_gf.loc[mismatched_allocation.index, :].to_string() + ) logger.warning("Allocated Totals:") - logger.warning("\n" + plant_total_alloc.loc[mismatched_allocation.index, :].to_string()) - + logger.warning( + "\n" + plant_total_alloc.loc[mismatched_allocation.index, :].to_string() + ) def test_for_negative_values(df, small: bool = False): """Checks that there are no unexpected negative values in the data.""" logger.info("Checking that fuel and emissions values are positive... 
") - columns_that_should_be_positive = [ - "fuel_consumed_mmbtu", - "fuel_consumed_for_electricity_mmbtu", - "co2_mass_lb", - "ch4_mass_lb", - "n2o_mass_lb", - "co2e_mass_lb", - "nox_mass_lb", - "so2_mass_lb", - "co2_mass_lb_for_electricity", - "ch4_mass_lb_for_electricity", - "n2o_mass_lb_for_electricity", - "co2e_mass_lb_for_electricity", - "nox_mass_lb_for_electricity", - "so2_mass_lb_for_electricity", - "co2_mass_lb_adjusted", - "ch4_mass_lb_adjusted", - "n2o_mass_lb_adjusted", - "co2e_mass_lb_adjusted", - "nox_mass_lb_adjusted", - "so2_mass_lb_adjusted", - "co2_mass_lb_for_electricity_adjusted", - "ch4_mass_lb_for_electricity_adjusted", - "n2o_mass_lb_for_electricity_adjusted", - "co2e_mass_lb_for_electricity_adjusted", - "nox_mass_lb_for_electricity_adjusted", - "so2_mass_lb_for_electricity_adjusted", - "generated_co2_rate_lb_per_mwh_for_electricity", - "generated_ch4_rate_lb_per_mwh_for_electricity", - "generated_n2o_rate_lb_per_mwh_for_electricity", - "generated_co2e_rate_lb_per_mwh_for_electricity", - "generated_nox_rate_lb_per_mwh_for_electricity", - "generated_so2_rate_lb_per_mwh_for_electricity", - "generated_co2_rate_lb_per_mwh_for_electricity_adjusted", - "generated_ch4_rate_lb_per_mwh_for_electricity_adjusted", - "generated_n2o_rate_lb_per_mwh_for_electricity_adjusted", - "generated_co2e_rate_lb_per_mwh_for_electricity_adjusted", - "generated_nox_rate_lb_per_mwh_for_electricity_adjusted", - "generated_so2_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_co2_rate_lb_per_mwh_for_electricity", - "consumed_ch4_rate_lb_per_mwh_for_electricity", - "consumed_n2o_rate_lb_per_mwh_for_electricity", - "consumed_co2e_rate_lb_per_mwh_for_electricity", - "consumed_nox_rate_lb_per_mwh_for_electricity", - "consumed_so2_rate_lb_per_mwh_for_electricity", - "consumed_co2_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_ch4_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_n2o_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_co2e_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_nox_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_so2_rate_lb_per_mwh_for_electricity_adjusted", - ] - columns_to_test = [ - col for col in columns_that_should_be_positive if col in df.columns - ] + columns_that_can_be_negative = ["net_generation_mwh"] negative_warnings = 0 - for column in columns_to_test: - negative_test = df[df[column] < 0] - if not negative_test.empty: - logger.warning( - f"There are {len(negative_test)} records where {column} is negative." - ) - negative_warnings += 1 + for column in df.columns: + # if the column is allowed to be negative, skip the test + if column in columns_that_can_be_negative: + pass + else: + # if the column is not numeric, skip the test + if pd.api.types.is_numeric_dtype(df[column].dtype): + negative_test = df[df[column] < 0] + if not negative_test.empty: + logger.warning( + f"There are {len(negative_test)} records where {column} is negative." 
+ ) + negative_warnings += 1 + else: + pass if negative_warnings > 0: if small: logger.warning( @@ -158,7 +115,6 @@ def test_for_negative_values(df, small: bool = False): ) else: logger.warning("The above negative values are errors and must be fixed!") - # raise UserWarning("The above negative values are errors and must be fixed") else: logger.info("OK") return negative_test @@ -167,57 +123,8 @@ def test_for_negative_values(df, small: bool = False): def test_for_missing_values(df, small: bool = False): """Checks that there are no unexpected missing values in the output data.""" logger.info("Checking that no values are missing... ") - columns_that_should_be_complete = [ - "plant_id_eia", - "fuel_category", - "datetime_local", - "datetime_utc", - "month", - "net_generation_mwh", - "fuel_consumed_mmbtu", - "fuel_consumed_for_electricity_mmbtu", - "co2_mass_lb", - "ch4_mass_lb", - "n2o_mass_lb", - "co2e_mass_lb", - "nox_mass_lb", - "so2_mass_lb", - "co2_mass_lb_for_electricity", - "ch4_mass_lb_for_electricity", - "n2o_mass_lb_for_electricity", - "co2e_mass_lb_for_electricity", - "nox_mass_lb_for_electricity", - "so2_mass_lb_for_electricity", - "co2_mass_lb_adjusted", - "ch4_mass_lb_adjusted", - "n2o_mass_lb_adjusted", - "co2e_mass_lb_adjusted", - "nox_mass_lb_adjusted", - "so2_mass_lb_adjusted", - "co2_mass_lb_for_electricity_adjusted", - "ch4_mass_lb_for_electricity_adjusted", - "n2o_mass_lb_for_electricity_adjusted", - "co2e_mass_lb_for_electricity_adjusted", - "nox_mass_lb_for_electricity_adjusted", - "so2_mass_lb_for_electricity_adjusted", - "consumed_co2_rate_lb_per_mwh_for_electricity", - "consumed_ch4_rate_lb_per_mwh_for_electricity", - "consumed_n2o_rate_lb_per_mwh_for_electricity", - "consumed_co2e_rate_lb_per_mwh_for_electricity", - "consumed_nox_rate_lb_per_mwh_for_electricity", - "consumed_so2_rate_lb_per_mwh_for_electricity", - "consumed_co2_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_ch4_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_n2o_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_co2e_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_nox_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_so2_rate_lb_per_mwh_for_electricity_adjusted", - ] - columns_to_test = [ - col for col in columns_that_should_be_complete if col in df.columns - ] missing_warnings = 0 - for column in columns_to_test: + for column in df.columns: missing_test = df[df[column].isna()] if not missing_test.empty: logger.warning( @@ -230,9 +137,7 @@ def test_for_missing_values(df, small: bool = False): " Found missing values during small run, these may be fixed with full data" ) else: - logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") logger.warning("The above missing values are errors and must be fixed") - logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") else: logger.info("OK") return missing_test @@ -240,7 +145,9 @@ def test_for_missing_values(df, small: bool = False): def test_chp_allocation(df): """Checks that the CHP allocation didn't create any anomalous values.""" - logger.info("Checking that total fuel consumed >= fuel consumed for electricity... ") + logger.info( + "Checking that total fuel consumed >= fuel consumed for electricity... 
" + ) chp_allocation_test = df[ df["fuel_consumed_for_electricity_mmbtu"] > df["fuel_consumed_mmbtu"] ] @@ -257,7 +164,8 @@ def test_chp_allocation(df): def test_for_missing_energy_source_code(df): """Checks that there are no missing energy source codes associated with non-zero fuel consumption.""" logger.info( - "Checking that there are no missing energy source codes associated with non-zero fuel consumption... ") + "Checking that there are no missing energy source codes associated with non-zero fuel consumption... " + ) missing_esc_test = df[ (df["energy_source_code"].isna()) & (df["fuel_consumed_mmbtu"] > 0) ] @@ -335,7 +243,9 @@ def test_for_missing_subplant_id(df): def validate_gross_to_net_conversion(cems, eia923_allocated): """checks whether the calculated net generation matches the reported net generation from EIA-923 at the annual plant level.""" - logger.info("Checking that calculated net generation matches reported net generation in EIA-923... ") + logger.info( + "Checking that calculated net generation matches reported net generation in EIA-923... " + ) # merge together monthly subplant totals from EIA and calculated from CEMS eia_netgen = ( eia923_allocated.groupby( @@ -388,7 +298,9 @@ def validate_gross_to_net_conversion(cems, eia923_allocated): def test_emissions_adjustments(df): """For each emission, tests that mass_lb >= mass_lb_for_electricity >= mass_lb_for_electricity_adjusted.""" - logger.info("Checking that adjusted emission values are less than total emissions... ") + logger.info( + "Checking that adjusted emission values are less than total emissions... " + ) pollutants = ["co2", "ch4", "n2o", "co2e", "nox", "so2"] @@ -574,8 +486,9 @@ def validate_shaped_totals(shaped_eia_data, monthly_eia_data_to_shape, group_key compare = (shaped_data_agg - eia_data_agg).round(0) if compare.sum().sum() > 0: - logger.warning("\n" + - compare[ + logger.warning( + "\n" + + compare[ (compare["net_generation_mwh"] != 0) | (compare["fuel_consumed_mmbtu"] != 0) ].to_string() @@ -1355,9 +1268,12 @@ def check_for_anomalous_co2_factors( on="plant_id_eia", validate="m:1", ) - logger.warning("Potentially anomalous co2 factors detected for the following plants:") - logger.warning("\n" + - factor_anomaly[ + logger.warning( + "Potentially anomalous co2 factors detected for the following plants:" + ) + logger.warning( + "\n" + + factor_anomaly[ [ "plant_id_eia", "plant_primary_fuel", @@ -1366,7 +1282,9 @@ def check_for_anomalous_co2_factors( f"{pollutant}_mass_lb_for_electricity", factor, ] - ].sort_values(by=factor).to_string() + ] + .sort_values(by=factor) + .to_string() ) From 690538c2fdbca9445a9ee80111e8227e0d1b8a66 Mon Sep 17 00:00:00 2001 From: Milo Knowles Date: Fri, 24 Feb 2023 10:11:43 -0500 Subject: [PATCH 17/27] Improve flag handling and log args --- src/data_pipeline.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/data_pipeline.py b/src/data_pipeline.py index d30cefa1..c4c66574 100644 --- a/src/data_pipeline.py +++ b/src/data_pipeline.py @@ -39,33 +39,43 @@ def get_args() -> argparse.Namespace: parser.add_argument( "--shape_individual_plants", help="Assign an hourly profile to each individual plant with EIA-only data, instead of aggregating to the fleet level before shaping.", - type=bool, default=True, + action=argparse.BooleanOptionalAction ) parser.add_argument( "--small", help="Run on subset of data for quicker testing, outputs to outputs/small and results to results/small.", - type=bool, default=False, + 
action=argparse.BooleanOptionalAction
     )
     parser.add_argument(
         "--flat",
         help="Use flat hourly profiles?",
+        default=False,
+        action=argparse.BooleanOptionalAction
     )
     parser.add_argument(
         "--skip_outputs",
         help="Skip outputting data to csv files for quicker testing.",
-        type=bool,
         default=False,
+        action=argparse.BooleanOptionalAction
     )
 
     args = parser.parse_args()
+
     return args
 
 
+def print_args(args: argparse.Namespace):
+    """Print out the command line arguments."""
+    s = "\n".join([f" * {argname} = {argvalue}" for argname, argvalue in vars(args).items()])
+    logger.info(f"\n\nRunning with the following options:\n{s}\n")
+
+
 def main():
     """Runs the OGE data pipeline."""
     args = get_args()
+    print_args(args)
 
     year = args.year
     logger.info(f'Running data pipeline for year {year}')

From 1a091c41255bd88ac9dcf7b0b449865d4c468f1e Mon Sep 17 00:00:00 2001
From: Milo Knowles
Date: Fri, 24 Feb 2023 10:39:04 -0500
Subject: [PATCH 18/27] WIP

---
 src/data_pipeline.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/data_pipeline.py b/src/data_pipeline.py
index c4c66574..5418ea75 100644
--- a/src/data_pipeline.py
+++ b/src/data_pipeline.py
@@ -68,14 +68,15 @@ def get_args() -> argparse.Namespace:
 
 def print_args(args: argparse.Namespace):
     """Print out the command line arguments."""
-    s = "\n".join([f" * {argname} = {argvalue}" for argname, argvalue in vars(args).items()])
-    logger.info(f"\n\nRunning with the following options:\n{s}\n")
+    argstring = "\n".join([f" * {k} = {v}" for k, v in vars(args).items()])
+    logger.info(f"\n\nRunning with the following options:\n{argstring}\n")
 
 
 def main():
     """Runs the OGE data pipeline."""
     args = get_args()
     print_args(args)
+
     year = args.year
     logger.info(f'Running data pipeline for year {year}')

From 121a1f1de6dcfb6d3e7d035233801cf32de8f29b Mon Sep 17 00:00:00 2001
From: Milo Knowles
Date: Fri, 24 Feb 2023 11:21:55 -0500
Subject: [PATCH 19/27] Make sure the folder where logs go exists

---
 src/filepaths.py    | 10 ++++++++++
 src/logging_util.py |  5 +++++
 2 files changed, 15 insertions(+)

diff --git a/src/filepaths.py b/src/filepaths.py
index ec143b07..bc8d9c60 100644
--- a/src/filepaths.py
+++ b/src/filepaths.py
@@ -32,3 +32,13 @@
 
 def outputs_folder(rel=""):
     return os.path.join(data_folder("outputs"), rel)
+
+
+def containing_folder(filepath: str) -> str:
+    """Returns the folder containing `filepath`."""
+    return os.path.dirname(os.path.realpath(filepath))
+
+
+def make_containing_folder(filepath: str):
+    """Make sure that the folder where `filepath` goes exists."""
+    os.makedirs(containing_folder(filepath), exist_ok=True)
diff --git a/src/logging_util.py b/src/logging_util.py
index 79bbaf12..9f1fba28 100644
--- a/src/logging_util.py
+++ b/src/logging_util.py
@@ -2,6 +2,8 @@
 import logging
 import coloredlogs
 
+from filepaths import make_containing_folder
+
 
 def get_logger(name: str) -> logging.Logger:
@@ -41,9 +43,12 @@ def configure_root_logger(logfile: str | None = None, level: str = "INFO"):
 
     # Send everything to the log file by adding a file handler to the root logger.
     # Note: the console handler is attached to the `oge` logger (so only OGE output
     # is colored), while the file handler below goes on the root logger so that
     # records from PUDL and other third-party loggers are captured in the file too.
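+    # the folder that will hold the log file may not exist yet on a fresh checkout,
+    # so it is created before the FileHandler tries to open the file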
if logfile is not None: + make_containing_folder(logfile) file_logger = logging.FileHandler(logfile, mode='w') file_logger.setFormatter(logging.Formatter(log_format)) From 83817233009cca981b6d4cd25d2de1c5de4dc7f1 Mon Sep 17 00:00:00 2001 From: grgmiller Date: Fri, 24 Feb 2023 10:03:24 -0800 Subject: [PATCH 20/27] add pudl download tracker back --- src/download_data.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/download_data.py b/src/download_data.py index 07491f51..cb84a588 100644 --- a/src/download_data.py +++ b/src/download_data.py @@ -41,7 +41,9 @@ def download_helper( # If the file already exists, do not re-download it. final_destination = output_path if output_path is not None else download_path if os.path.exists(final_destination): - logger.info(f" {final_destination.split('/')[-1]} already downloaded, skipping.") + logger.info( + f" {final_destination.split('/')[-1]} already downloaded, skipping." + ) return False # Otherwise, download to the file in chunks. @@ -109,11 +111,19 @@ def download_pudl_data(zenodo_url: str): def download_pudl(zenodo_url, pudl_version): r = requests.get(zenodo_url, params={"download": "1"}, stream=True) # specify parameters for progress bar + total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 * 1024 * 10 # 10 MB + downloaded = 0 logger.info(" Downloading PUDL data...") with open(downloads_folder("pudl.tgz"), "wb") as fd: for chunk in r.iter_content(chunk_size=block_size): + print( + f" Progress: {(round(downloaded/total_size_in_bytes*100,2))}% \r", + end="", + ) fd.write(chunk) + downloaded += block_size + print(" Progress: 100.0%") # extract the tgz file logger.info(" Extracting PUDL data...") From f678d0b235d9e8d8e60ec3881db6196c96d7ff84 Mon Sep 17 00:00:00 2001 From: grgmiller Date: Sat, 25 Feb 2023 15:47:29 -0800 Subject: [PATCH 21/27] change log location --- src/data_cleaning.py | 20 ++++----- src/data_pipeline.py | 23 ++++++---- src/download_data.py | 16 +++---- src/eia930.py | 6 +-- src/gross_to_net_generation.py | 2 +- src/impute_hourly_profiles.py | 2 +- src/load_data.py | 2 +- src/logging_util.py | 79 +++++++++++++++++----------------- src/output_data.py | 6 +-- src/validation.py | 2 +- 10 files changed, 81 insertions(+), 77 deletions(-) diff --git a/src/data_cleaning.py b/src/data_cleaning.py index f11330ac..1dd13449 100644 --- a/src/data_cleaning.py +++ b/src/data_cleaning.py @@ -54,11 +54,11 @@ def identify_subplants(year, number_of_years=5): end_year = year # load 5 years of monthly data from CEMS - logger.info(" loading CEMS ids") + logger.info("loading CEMS ids") cems_ids = load_data.load_cems_ids(start_year, end_year) # add subplant ids to the data - logger.info(" identifying unique subplants") + logger.info("identifying unique subplants") generate_subplant_ids(start_year, end_year, cems_ids) @@ -883,7 +883,7 @@ def remove_plants( ].plant_id_eia.unique() ) logger.info( - f" Removing {len(plants_in_states_to_remove)} plants located in the following states: {remove_states}" + f"Removing {len(plants_in_states_to_remove)} plants located in the following states: {remove_states}" ) df = df[~df["plant_id_eia"].isin(plants_in_states_to_remove)] if steam_only_plants: @@ -918,7 +918,7 @@ def remove_non_grid_connected_plants(df): "plant_id_eia" ].unique() ) - logger.info(f" Removing {num_plants} plants that are not grid-connected") + logger.info(f"Removing {num_plants} plants that are not grid-connected") df = df[~df["plant_id_eia"].isin(ngc_plants)] @@ -1005,7 +1005,7 @@ def 
clean_cems(year: int, small: bool, primary_fuel_table, subplant_emission_fac def smallerize_test_data(df, random_seed=None): - logger.info(" Randomly selecting 5% of plants for faster test run.") + logger.info("Randomly selecting 5% of plants for faster test run.") # Select 5% of plants selected_plants = df.plant_id_eia.unique() if random_seed is not None: @@ -1031,7 +1031,7 @@ def manually_remove_steam_units(df): )[["plant_id_eia", "emissions_unit_id_epa"]] logger.info( - f" Removing {len(units_to_remove)} units that only produce steam and do not report to EIA" + f"Removing {len(units_to_remove)} units that only produce steam and do not report to EIA" ) df = df.merge( @@ -1063,7 +1063,7 @@ def remove_incomplete_unit_months(cems): ].drop(columns="datetime_utc") logger.info( - f" Removing {len(unit_months_to_remove)} unit-months with incomplete hourly data" + f"Removing {len(unit_months_to_remove)} unit-months with incomplete hourly data" ) cems = cems.merge( @@ -1296,7 +1296,7 @@ def remove_cems_with_zero_monthly_data(cems): ) # remove any observations with the missing data flag logger.info( - f" Removing {len(cems[cems['missing_data_flag'] == 'remove'])} observations from cems for unit-months where no data reported" + f"Removing {len(cems[cems['missing_data_flag'] == 'remove'])} observations from cems for unit-months where no data reported" ) validation.check_removed_data_is_empty(cems) cems = cems[cems["missing_data_flag"] != "remove"] @@ -1670,7 +1670,7 @@ def identify_partial_cems_plants(all_data): # likely resulting from mixed fuel types. # If subplant_id assignment is working, there shouldn't be any raise Exception( - f" ERROR: {len(mixed_method_subplants)} subplant-months have multiple hourly methods assigned." + f"ERROR: {len(mixed_method_subplants)} subplant-months have multiple hourly methods assigned." 
) # remove the intermediate indicator column @@ -1960,7 +1960,7 @@ def assign_ba_code_to_plant(df, year): df = df.merge(plant_ba, how="left", on="plant_id_eia", validate="m:1") if len(df[df["ba_code"].isna()]) > 0: - logger.warning(" the following plants are missing ba_code:") + logger.warning("the following plants are missing ba_code:") logger.warning("\n" + df[df["ba_code"].isna()].tostring()) # replace missing ba codes with NA diff --git a/src/data_pipeline.py b/src/data_pipeline.py index 5418ea75..ab87e491 100644 --- a/src/data_pipeline.py +++ b/src/data_pipeline.py @@ -40,25 +40,25 @@ def get_args() -> argparse.Namespace: "--shape_individual_plants", help="Assign an hourly profile to each individual plant with EIA-only data, instead of aggregating to the fleet level before shaping.", default=True, - action=argparse.BooleanOptionalAction + action=argparse.BooleanOptionalAction, ) parser.add_argument( "--small", help="Run on subset of data for quicker testing, outputs to outputs/small and results to results/small.", default=False, - action=argparse.BooleanOptionalAction + action=argparse.BooleanOptionalAction, ) parser.add_argument( "--flat", help="Use flat hourly profiles?", default=False, - action=argparse.BooleanOptionalAction + action=argparse.BooleanOptionalAction, ) parser.add_argument( "--skip_outputs", help="Skip outputting data to csv files for quicker testing.", default=False, - action=argparse.BooleanOptionalAction + action=argparse.BooleanOptionalAction, ) args = parser.parse_args() @@ -78,7 +78,7 @@ def main(): print_args(args) year = args.year - logger.info(f'Running data pipeline for year {year}') + logger.info(f"Running data pipeline for year {year}") validation.validate_year(year) @@ -344,12 +344,12 @@ def main(): logger.info("12. Cleaning EIA-930 data") # Scrapes and cleans data in data/downloads, outputs cleaned file at EBA_elec.csv if args.flat: - logger.info(" Not running 930 cleaning because we'll be using a flat profile.") + logger.info("Not running 930 cleaning because we'll be using a flat profile.") elif not (os.path.exists(outputs_folder(f"{path_prefix}/eia930/eia930_elec.csv"))): eia930.clean_930(year, small=args.small, path_prefix=path_prefix) else: logger.info( - f" Not re-running 930 cleaning. If you'd like to re-run, please delete data/outputs/{path_prefix}/eia930/" + f"Not re-running 930 cleaning. If you'd like to re-run, please delete data/outputs/{path_prefix}/eia930/" ) # If running small, we didn't clean the whole year, so need to use the Chalender file to build residual profiles. @@ -413,10 +413,10 @@ def main(): ) else: logger.info( - " Not shaping and exporting individual plant data since `shape_individual_plants` is False." + "Not shaping and exporting individual plant data since `shape_individual_plants` is False." ) logger.info( - " Plants that only report to EIA will be aggregated to the fleet level before shaping." + "Plants that only report to EIA will be aggregated to the fleet level before shaping." ) # 15. 
Shape fleet-level data @@ -550,6 +550,11 @@ def main(): hourly_consumed_calc.run() hourly_consumed_calc.output_results() + # move the log file into the specific year output folder + shutil.move( + outputs_folder("data_pipeline.log"), outputs_folder(f"{year}/data_pipeline.log") + ) + if __name__ == "__main__": main() diff --git a/src/download_data.py b/src/download_data.py index cb84a588..10763e06 100644 --- a/src/download_data.py +++ b/src/download_data.py @@ -42,12 +42,12 @@ def download_helper( final_destination = output_path if output_path is not None else download_path if os.path.exists(final_destination): logger.info( - f" {final_destination.split('/')[-1]} already downloaded, skipping." + f"{final_destination.split('/')[-1]} already downloaded, skipping." ) return False # Otherwise, download to the file in chunks. - logger.info(f" Downloading {final_destination.split('/')[-1]}") + logger.info(f"Downloading {final_destination.split('/')[-1]}") r = requests.get(input_url, stream=True) with open(download_path, "wb") as fd: for chunk in r.iter_content(chunk_size=chunk_size): @@ -99,10 +99,10 @@ def download_pudl_data(zenodo_url: str): with open(pudl_version_file, "r") as f: existing_version = f.readlines()[0].replace("\n", "") if pudl_version == existing_version: - logger.info(" PUDL version already downloaded") + logger.info("PUDL version already downloaded") return else: - logger.info(" Downloading new version of pudl") + logger.info("Downloading new version of pudl") shutil.rmtree(downloads_folder("pudl")) download_pudl(zenodo_url, pudl_version) @@ -114,19 +114,19 @@ def download_pudl(zenodo_url, pudl_version): total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 * 1024 * 10 # 10 MB downloaded = 0 - logger.info(" Downloading PUDL data...") + logger.info("Downloading PUDL data...") with open(downloads_folder("pudl.tgz"), "wb") as fd: for chunk in r.iter_content(chunk_size=block_size): print( - f" Progress: {(round(downloaded/total_size_in_bytes*100,2))}% \r", + f"Progress: {(round(downloaded/total_size_in_bytes*100,2))}% \r", end="", ) fd.write(chunk) downloaded += block_size - print(" Progress: 100.0%") + print("Progress: 100.0%") # extract the tgz file - logger.info(" Extracting PUDL data...") + logger.info("Extracting PUDL data...") with tarfile.open(downloads_folder("pudl.tgz")) as tar: tar.extractall(data_folder()) diff --git a/src/eia930.py b/src/eia930.py index 36aa7c1e..42ae2d6e 100644 --- a/src/eia930.py +++ b/src/eia930.py @@ -145,14 +145,14 @@ def clean_930(year: int, small: bool = False, path_prefix: str = ""): df = df.loc[start:end] # Don't worry about processing everything # Adjust - logger.info(" Adjusting EIA-930 time stamps") + logger.info("Adjusting EIA-930 time stamps") df = manual_930_adjust(df) df.to_csv( join(data_folder, "eia930_raw.csv") ) # Will be read by gridemissions workflow # Run cleaning - logger.info(" Running physics-based data cleaning") + logger.info("Running physics-based data cleaning") make_dataset( start, end, @@ -289,7 +289,7 @@ def remove_imputed_ones(eia930_data): filter = eia930_data["net_generation_mwh_930"].abs() < 1.5 # replace all 1.0 values with zero - logger.info(f" replacing {sum(filter)} imputed 1 values with 0") + logger.info(f"Replacing {sum(filter)} imputed 1 values with 0") eia930_data.loc[filter, "net_generation_mwh_930"] = 0 return eia930_data diff --git a/src/gross_to_net_generation.py b/src/gross_to_net_generation.py index 0d6ed96c..3c80d6e0 100644 --- a/src/gross_to_net_generation.py +++ 
b/src/gross_to_net_generation.py @@ -775,7 +775,7 @@ def load_monthly_gross_and_net_generation(start_year, end_year): ) # allocate net generation and heat input to each generator-fuel grouping - logger.info(" Allocating EIA-923 generation data") + logger.info("Allocating EIA-923 generation data") gen_fuel_allocated = allocate_gen_fuel.allocate_gen_fuel_by_generator_energy_source( pudl_out, drop_interim_cols=True ) diff --git a/src/impute_hourly_profiles.py b/src/impute_hourly_profiles.py index 2d9bb64c..fa642eb8 100644 --- a/src/impute_hourly_profiles.py +++ b/src/impute_hourly_profiles.py @@ -710,7 +710,7 @@ def average_diba_wind_solar_profiles( ] if len(df_temporary) == 0 and not validation_run: # if this error is raised, we might have to implement an approach that uses average values for the wider region - logger.warning(f" There is no {fuel} data in the DIBAs for {ba}: {ba_dibas}") + logger.warning(f"There is no {fuel} data in the DIBAs for {ba}: {ba_dibas}") df_temporary = average_national_wind_solar_profiles( residual_profiles, ba, fuel, report_date ) diff --git a/src/load_data.py b/src/load_data.py index 798c480b..98164b6f 100644 --- a/src/load_data.py +++ b/src/load_data.py @@ -156,7 +156,7 @@ def load_cems_gross_generation(start_year, end_year): cems_all = [] for year in range(start_year, end_year + 1): - logger.info(f" loading {year} CEMS data") + logger.info(f"loading {year} CEMS data") # specify the path to the CEMS data cems_path = downloads_folder( "pudl/pudl_data/parquet/epacems/hourly_emissions_epacems/" diff --git a/src/logging_util.py b/src/logging_util.py index 9f1fba28..3ad47428 100644 --- a/src/logging_util.py +++ b/src/logging_util.py @@ -6,47 +6,46 @@ def get_logger(name: str) -> logging.Logger: - """Helper function to append `oge` to the logger name and return a logger. + """Helper function to append `oge` to the logger name and return a logger. - As a result, all returned loggers a children of the top-level `oge` logger. - """ - return logging.getLogger(f"oge.{name}") + As a result, all returned loggers a children of the top-level `oge` logger. + """ + return logging.getLogger(f"oge.{name}") def configure_root_logger(logfile: str | None = None, level: str = "INFO"): - """Configure the OGE logger to print to the console, and optionally to a file. - - This function is safe to call multiple times, since it will check if logging - handlers have already been installed and skip them if so. - - Logging is printed with the same format as PUDL: - ``` - 2023-02-21 16:10:44 [INFO] oge.test:21 This is an example - ``` - """ - root_logger = logging.getLogger() - - # Unfortunately, the `gridemissions` package adds a handler to the root logger - # which means that the output of other loggers propagates up and is printed - # twice. Remove the root handlers to avoid this. - for handler in root_logger.handlers: - root_logger.removeHandler(handler) - - oge_logger = logging.getLogger("oge") - log_format = "%(asctime)s [%(levelname)4s] %(name)s:%(lineno)s %(message)s" - - # Direct the output of the OGE logger to the terminal (and color it). Make - # sure this hasn't been done already to avoid adding duplicate handlers. - if len(oge_logger.handlers) == 0: - coloredlogs.install(fmt=log_format, level=level, logger=oge_logger) - oge_logger.addHandler(logging.NullHandler()) - - # Send everything to the log file by adding a file handler to the root logger. 
- if logfile is not None: - make_containing_folder(logfile) - file_logger = logging.FileHandler(logfile, mode='w') - file_logger.setFormatter(logging.Formatter(log_format)) - - if file_logger not in root_logger.handlers: - root_logger.addHandler(file_logger) - + """Configure the OGE logger to print to the console, and optionally to a file. + + This function is safe to call multiple times, since it will check if logging + handlers have already been installed and skip them if so. + + Logging is printed with the same format as PUDL: + ``` + 2023-02-21 16:10:44 [INFO] oge.test:21 This is an example + ``` + """ + root_logger = logging.getLogger() + + # Unfortunately, the `gridemissions` package adds a handler to the root logger + # which means that the output of other loggers propagates up and is printed + # twice. Remove the root handlers to avoid this. + for handler in root_logger.handlers: + root_logger.removeHandler(handler) + + oge_logger = logging.getLogger("oge") + log_format = "%(asctime)s [%(levelname)4s] %(name)s:%(lineno)s %(message)s" + + # Direct the output of the OGE logger to the terminal (and color it). Make + # sure this hasn't been done already to avoid adding duplicate handlers. + if len(oge_logger.handlers) == 0: + coloredlogs.install(fmt=log_format, level=level, logger=oge_logger) + oge_logger.addHandler(logging.NullHandler()) + + # Send everything to the log file by adding a file handler to the root logger. + if logfile is not None: + make_containing_folder(logfile) + file_logger = logging.FileHandler(logfile, mode="w") + file_logger.setFormatter(logging.Formatter(log_format)) + + if file_logger not in root_logger.handlers: + root_logger.addHandler(file_logger) diff --git a/src/output_data.py b/src/output_data.py index 23eb9824..8769abed 100644 --- a/src/output_data.py +++ b/src/output_data.py @@ -117,7 +117,7 @@ def zip_data_for_zenodo(year): def output_intermediate_data(df, file_name, path_prefix, year, skip_outputs): column_checks.check_columns(df, file_name) if not skip_outputs: - logger.info(f" Exporting {file_name} to data/outputs") + logger.info(f"Exporting {file_name} to data/outputs") df.to_csv(outputs_folder(f"{path_prefix}{file_name}_{year}.csv"), index=False) @@ -126,7 +126,7 @@ def output_to_results( ): # Always check columns that should not be negative. 
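     # ("small" test runs are detected from the output path: data_pipeline.py
     # sets path_prefix to start with "small/" when --small is passed.)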
small = "small" in path_prefix - logger.info(f" Exporting {file_name} to data/results/{path_prefix}{subfolder}") + logger.info(f"Exporting {file_name} to data/results/{path_prefix}{subfolder}") if include_metric: metric = convert_results(df) @@ -154,7 +154,7 @@ def output_to_results( def output_data_quality_metrics(df, file_name, path_prefix, skip_outputs): if not skip_outputs: logger.info( - f" Exporting {file_name} to data/results/{path_prefix}data_quality_metrics" + f"Exporting {file_name} to data/results/{path_prefix}data_quality_metrics" ) # TODO: Add column checks diff --git a/src/validation.py b/src/validation.py index a50535a0..1f176020 100644 --- a/src/validation.py +++ b/src/validation.py @@ -272,7 +272,7 @@ def test_for_missing_energy_source_code(df): def check_non_missing_cems_co2_values_unchanged(cems_original, cems): """Checks that no non-missing CO2 values were modified during the process of filling.""" logger.info( - " Checking that original CO2 data in CEMS was not modified by filling missing values...", + "Checking that original CO2 data in CEMS was not modified by filling missing values...", ) # only keep non-zero and non-missing co2 values, since these should have not been modified cems_original = cems_original.loc[ From 4af25b1eeb341626c4139551cda1d7ec7833e25f Mon Sep 17 00:00:00 2001 From: grgmiller Date: Tue, 28 Feb 2023 11:20:58 -0800 Subject: [PATCH 22/27] update logger configuration location --- src/data_pipeline.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/data_pipeline.py b/src/data_pipeline.py index ab87e491..c687a385 100644 --- a/src/data_pipeline.py +++ b/src/data_pipeline.py @@ -24,11 +24,6 @@ from logging_util import get_logger, configure_root_logger -# Log the print statements to a file for debugging. -configure_root_logger(logfile=outputs_folder("data_pipeline.log")) -logger = get_logger("data_pipeline") - - def get_args() -> argparse.Namespace: """Specify arguments here. @@ -66,7 +61,7 @@ def get_args() -> argparse.Namespace: return args -def print_args(args: argparse.Namespace): +def print_args(args: argparse.Namespace, logger): """Print out the command line arguments.""" argstring = "\n".join([f" * {k} = {v}" for k, v in vars(args).items()]) logger.info(f"\n\nRunning with the following options:\n{argstring}\n") @@ -75,9 +70,14 @@ def print_args(args: argparse.Namespace): def main(): """Runs the OGE data pipeline.""" args = get_args() - print_args(args) - year = args.year + + # Log the print statements to a file for debugging. 
+ configure_root_logger(logfile=outputs_folder(f"{year}/data_pipeline.log")) + logger = get_logger("data_pipeline") + + print_args(args, logger) + logger.info(f"Running data pipeline for year {year}") validation.validate_year(year) @@ -550,11 +550,6 @@ def main(): hourly_consumed_calc.run() hourly_consumed_calc.output_results() - # move the log file into the specific year output folder - shutil.move( - outputs_folder("data_pipeline.log"), outputs_folder(f"{year}/data_pipeline.log") - ) - if __name__ == "__main__": main() From 7610ac80daeabac0a63f50aa1f2c1124562ac6c2 Mon Sep 17 00:00:00 2001 From: grgmiller Date: Tue, 28 Feb 2023 13:06:03 -0800 Subject: [PATCH 23/27] change order of fillna --- src/output_data.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/output_data.py b/src/output_data.py index 1222dcd3..7d9bc86e 100644 --- a/src/output_data.py +++ b/src/output_data.py @@ -275,12 +275,9 @@ def write_generated_averages(ba_fuel_data, year, path_prefix, skip_outputs): avg_fuel_type_production[f"{emission}_mass_lb{emission_type}"] / avg_fuel_type_production["net_generation_mwh"] ) - .fillna(0) .replace(np.inf, np.NaN) .replace(-np.inf, np.NaN) - .replace( - np.NaN, 0 - ) # TODO: temporary placeholder while solar is broken. Eventually there should be no NaNs. + .fillna(0) # TODO: temporary placeholder while solar is broken. Eventually there should be no NaNs. ) output_intermediate_data( avg_fuel_type_production, @@ -515,9 +512,9 @@ def add_generated_emission_rate_columns(df): df[f"{emission}_mass_lb{emission_type}"] / df["net_generation_mwh"] ) - .fillna(0) .replace(np.inf, np.NaN) .replace(-np.inf, np.NaN) + .fillna(0) ) # Set negative rates to zero, following eGRID methodology df.loc[df[col_name] < 0, col_name] = 0 From 60d6374b39fc01e20685a45626d5e5d3aaa0a308 Mon Sep 17 00:00:00 2001 From: grgmiller Date: Tue, 28 Feb 2023 13:25:36 -0800 Subject: [PATCH 24/27] fill zeros only when in denominator --- src/output_data.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/output_data.py b/src/output_data.py index 7d9bc86e..a3d891af 100644 --- a/src/output_data.py +++ b/src/output_data.py @@ -277,7 +277,7 @@ def write_generated_averages(ba_fuel_data, year, path_prefix, skip_outputs): ) .replace(np.inf, np.NaN) .replace(-np.inf, np.NaN) - .fillna(0) # TODO: temporary placeholder while solar is broken. Eventually there should be no NaNs. + .fillna(0) ) output_intermediate_data( avg_fuel_type_production, @@ -514,8 +514,15 @@ def add_generated_emission_rate_columns(df): ) .replace(np.inf, np.NaN) .replace(-np.inf, np.NaN) - .fillna(0) ) + # where the rate is missing because of a divide by zero (i.e. + # net_generation_mwh is zero), replace the emission rate with + # zero. 
We want to keep all other NAs so that they get flagged + # by our validation checks since this indicates an unexpected + # issue + df.loc[df["net_generation_mwh"] == 0, col_name] = df.loc[ + df["net_generation_mwh"] == 0, col_name + ].fillna(0) # Set negative rates to zero, following eGRID methodology df.loc[df[col_name] < 0, col_name] = 0 return df From a588334144f00bb4a20bd161a92725d9053f372f Mon Sep 17 00:00:00 2001 From: grgmiller Date: Wed, 1 Mar 2023 08:48:43 -0800 Subject: [PATCH 25/27] change logfile location --- src/data_pipeline.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/data_pipeline.py b/src/data_pipeline.py index c687a385..e9735407 100644 --- a/src/data_pipeline.py +++ b/src/data_pipeline.py @@ -71,15 +71,6 @@ def main(): """Runs the OGE data pipeline.""" args = get_args() year = args.year - - # Log the print statements to a file for debugging. - configure_root_logger(logfile=outputs_folder(f"{year}/data_pipeline.log")) - logger = get_logger("data_pipeline") - - print_args(args, logger) - - logger.info(f"Running data pipeline for year {year}") - validation.validate_year(year) # 0. Set up directory structure @@ -111,6 +102,17 @@ def main(): exist_ok=True, ) + # configure the logger + # Log the print statements to a file for debugging. + configure_root_logger( + logfile=results_folder(f"{year}/data_quality_metrics/data_pipeline.log") + ) + logger = get_logger("data_pipeline") + + print_args(args, logger) + + logger.info(f"Running data pipeline for year {year}") + # 1. Download data #################################################################################### logger.info("1. Downloading data") From 396a937efaceab64755ef89c1df98f09f37942e4 Mon Sep 17 00:00:00 2001 From: grgmiller Date: Wed, 1 Mar 2023 08:53:22 -0800 Subject: [PATCH 26/27] move year validation after logger config --- src/data_pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/data_pipeline.py b/src/data_pipeline.py index e9735407..88678e2f 100644 --- a/src/data_pipeline.py +++ b/src/data_pipeline.py @@ -71,7 +71,6 @@ def main(): """Runs the OGE data pipeline.""" args = get_args() year = args.year - validation.validate_year(year) # 0. Set up directory structure path_prefix = "" if not args.small else "small/" @@ -108,9 +107,9 @@ def main(): logfile=results_folder(f"{year}/data_quality_metrics/data_pipeline.log") ) logger = get_logger("data_pipeline") - print_args(args, logger) + validation.validate_year(year) logger.info(f"Running data pipeline for year {year}") # 1. Download data From 4bd4c54a1887e9a9cdd6465ec700164c028f1c76 Mon Sep 17 00:00:00 2001 From: grgmiller Date: Wed, 1 Mar 2023 10:58:23 -0800 Subject: [PATCH 27/27] change directory creation order --- src/data_pipeline.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/data_pipeline.py b/src/data_pipeline.py index 88678e2f..dc78b996 100644 --- a/src/data_pipeline.py +++ b/src/data_pipeline.py @@ -72,6 +72,17 @@ def main(): args = get_args() year = args.year + # configure the logger + # Log the print statements to a file for debugging. + configure_root_logger( + logfile=results_folder(f"{year}/data_quality_metrics/data_pipeline.log") + ) + logger = get_logger("data_pipeline") + print_args(args, logger) + + logger.info(f"Running data pipeline for year {year}") + validation.validate_year(year) + # 0. 
Set up directory structure path_prefix = "" if not args.small else "small/" path_prefix += "flat/" if args.flat else "" @@ -101,17 +112,6 @@ def main(): exist_ok=True, ) - # configure the logger - # Log the print statements to a file for debugging. - configure_root_logger( - logfile=results_folder(f"{year}/data_quality_metrics/data_pipeline.log") - ) - logger = get_logger("data_pipeline") - print_args(args, logger) - - validation.validate_year(year) - logger.info(f"Running data pipeline for year {year}") - # 1. Download data #################################################################################### logger.info("1. Downloading data")
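
Note on the emission-rate handling in patches 23 and 24: the intended order of
operations is easier to see outside the diff context. Below is a minimal,
self-contained sketch; the toy data and `col_name` value are illustrative,
while the column names and the replace/fill/clip sequence mirror the diffs:

```python
import numpy as np
import pandas as pd

# Toy stand-in for the plant-level output frame.
df = pd.DataFrame(
    {
        "net_generation_mwh": [100.0, 0.0, 0.0, 50.0],
        "co2_mass_lb": [2000.0, 0.0, np.nan, np.nan],
    }
)

col_name = "generated_co2_rate_lb_per_mwh"

# 1. Compute the rate; +/-inf (x/0) becomes NaN, and the blanket fillna(0)
#    no longer runs before the inf replacement (patch 23).
df[col_name] = (
    (df["co2_mass_lb"] / df["net_generation_mwh"])
    .replace(np.inf, np.nan)
    .replace(-np.inf, np.nan)
)

# 2. Fill zero only where the denominator is zero (patch 24); a NaN rate with
#    nonzero generation is kept so downstream validation can flag it.
mask = df["net_generation_mwh"] == 0
df.loc[mask, col_name] = df.loc[mask, col_name].fillna(0)

# 3. Clip negative rates to zero, following eGRID methodology.
df.loc[df[col_name] < 0, col_name] = 0
```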