From ec3c56ffc96e6c31f72a238295f0d1ab86408f6d Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Fri, 10 Feb 2023 17:23:27 -0800
Subject: [PATCH 01/27] update warning threshold

---
 src/validation.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/validation.py b/src/validation.py
index 15d05827..f157c292 100644
--- a/src/validation.py
+++ b/src/validation.py
@@ -57,16 +57,24 @@ def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated):
             "fuel_consumed_for_electricity_mmbtu",
         ]
     ].sum()
-    # calculate the difference between the values
-    plant_total_diff = plant_total_gf - plant_total_alloc
-    # flag values where the absolute difference is greater than 10 mwh or mmbtu
+    # calculate the percentage difference between the values
+    plant_total_diff = (plant_total_alloc - plant_total_gf) / plant_total_gf
+    # flag rows where the absolute percentage difference is greater than our threshold
+    threshold_percent = 0.05
     mismatched_allocation = plant_total_diff[
-        (abs(plant_total_diff["fuel_consumed_mmbtu"]) > 10)
-        | (abs(plant_total_diff["net_generation_mwh"]) > 10)
+        (abs(plant_total_diff["fuel_consumed_mmbtu"]) > threshold_percent)
+        | (abs(plant_total_diff["net_generation_mwh"]) > threshold_percent)
     ]
     if len(mismatched_allocation) > 0:
-        print("WARNING: Allocated EIA-923 doesn't match input data for plants:")
+        print(
+            "WARNING: Allocated EIA-923 data doesn't match input data for the following plants:"
+        )
+        print("Percentage Difference:")
         print(mismatched_allocation)
+        print("EIA-923 Input Totals:")
+        print(plant_total_gf.loc[mismatched_allocation.index, :])
+        print("Allocated Totals:")
+        print(plant_total_alloc.loc[mismatched_allocation.index, :])
 
 
 def test_for_negative_values(df, small: bool = False):

From 1484db8ffff610d2c1943096d049eed09017c786 Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Fri, 10 Feb 2023 17:34:53 -0800
Subject: [PATCH 02/27] update message

---
 src/validation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/validation.py b/src/validation.py
index f157c292..e88512f2 100644
--- a/src/validation.py
+++ b/src/validation.py
@@ -71,7 +71,7 @@ def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated):
         )
         print("Percentage Difference:")
         print(mismatched_allocation)
-        print("Input Totals:")
+        print("EIA-923 Input Totals:")
         print(plant_total_gf.loc[mismatched_allocation.index, :])
         print("Allocated Totals:")
         print(plant_total_alloc.loc[mismatched_allocation.index, :])

From d1643b13bdd79b08d6a8473fa23b4a1dc0472c8c Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Sat, 11 Feb 2023 08:38:22 -0800
Subject: [PATCH 03/27] change threshold to 0.001

---
 src/data_cleaning.py |  4 +++-
 src/validation.py    | 16 +++++++++++++---
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/data_cleaning.py b/src/data_cleaning.py
index 6ce26ca7..e6a67eb7 100644
--- a/src/data_cleaning.py
+++ b/src/data_cleaning.py
@@ -401,7 +401,9 @@ def clean_eia923(
     )
 
     # test to make sure allocated totals match input totals
-    validation.check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated)
+    validation.check_allocated_gf_matches_input_gf(
+        pudl_out, gen_fuel_allocated, threshold_percent=0.01
+    )
 
     # manually update energy source code when OTH
     gen_fuel_allocated = update_energy_source_codes(gen_fuel_allocated)
diff --git a/src/validation.py b/src/validation.py
index e88512f2..4fa35e5e 100644
--- a/src/validation.py
+++ b/src/validation.py
@@ -40,8 +40,19 @@ def validate_year(year):
         raise UserWarning(year_warning)
 
 
-def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated):
-    """Checks that the allocated generation and fuel from EIA-923 matches the input totals."""
+def check_allocated_gf_matches_input_gf(
+    pudl_out, gen_fuel_allocated, threshold_percent=0.001
+):
+    """
+    Checks that the allocated generation and fuel from EIA-923 matches the input totals.
+
+    Because there might be small rounding errors in the allocation that make the
+    allocated total slightly off from the input data, we allow the user to specify a
+    threshold percentage above which mismatched data is flagged. The default value is
+    0.1%, so that if either the allocated total fuel consumption or allocated total net
+    generation is more than +/-0.1% different from the total input generation or fuel,
+    the record is flagged.
+    """
     gf = pudl_out.gf_eia923()
     plant_total_gf = gf.groupby("plant_id_eia")[
         [
@@ -60,7 +71,6 @@ def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated):
     # calculate the percentage difference between the values
     plant_total_diff = (plant_total_alloc - plant_total_gf) / plant_total_gf
     # flag rows where the absolute percentage difference is greater than our threshold
-    threshold_percent = 0.05
     mismatched_allocation = plant_total_diff[
         (abs(plant_total_diff["fuel_consumed_mmbtu"]) > threshold_percent)
         | (abs(plant_total_diff["net_generation_mwh"]) > threshold_percent)
     ]

From 84ae470132e9412c2fd331cdcc9f599a843a510d Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Sat, 11 Feb 2023 08:39:25 -0800
Subject: [PATCH 04/27] fix threshold in data_cleaning

---
 src/data_cleaning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/data_cleaning.py b/src/data_cleaning.py
index e6a67eb7..558c4b7d 100644
--- a/src/data_cleaning.py
+++ b/src/data_cleaning.py
@@ -402,7 +402,7 @@ def clean_eia923(
 
     # test to make sure allocated totals match input totals
     validation.check_allocated_gf_matches_input_gf(
-        pudl_out, gen_fuel_allocated, threshold_percent=0.01
+        pudl_out, gen_fuel_allocated, threshold_percent=0.001
     )
 
     # manually update energy source code when OTH

From 7fd11a45d24a72c3bb195cfedd9ff97834984668 Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Sat, 11 Feb 2023 13:46:19 -0800
Subject: [PATCH 05/27] fix issue with indexes

---
 notebooks/validation/validate_vs_egrid.ipynb | 136 +++-
 .../GH279_missing_cems_data.ipynb            | 731 ++++++++++++++++++
 src/data_cleaning.py                         |  14 +
 src/emissions.py                             |   7 +-
 src/validation.py                            |  82 ++
 5 files changed, 929 insertions(+), 41 deletions(-)
 create mode 100644 notebooks/work_in_progress/GH279_missing_cems_data.ipynb

diff --git a/notebooks/validation/validate_vs_egrid.ipynb b/notebooks/validation/validate_vs_egrid.ipynb
index 689c0901..70ec8518 100644
--- a/notebooks/validation/validate_vs_egrid.ipynb
+++ b/notebooks/validation/validate_vs_egrid.ipynb
@@ -245,6 +245,36 @@
     "]\n"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Evaluate Plant-level discrepancies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# evaluate all plants\n",
+    "comparison_count, compared = validation.compare_plant_level_results_to_egrid(\n",
+    "    annual_plant_results, egrid_plant, PLANTS_MISSING_FROM_EGRID\n",
+    ")\n",
+    "comparison_count"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "compared[(compared[\"ba_code\"] == \"SOCO\") & (compared[\"co2_mass_lb_status\"] != \"!exact\")]"
+   ]
+  },
   {
    "cell_type": "markdown",
"metadata": {}, @@ -404,23 +434,32 @@ "metadata": {}, "outputs": [], "source": [ - "year = 2020\n", + "year = 2021\n", "path_prefix = year\n", "\n", + "DATA_COLUMNS = [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# Load the eGRID plant table\n", "egrid_plant = validation.load_egrid_plant_file(year)\n", "\n", "egrid_ba = validation.load_egrid_ba_file(year)\n", "\n", "# aggregate the plant data up to the BA level\n", - "data_columns = [\n", - " \"net_generation_mwh\",\n", - " \"fuel_consumed_mmbtu\",\n", - " \"fuel_consumed_for_electricity_mmbtu\",\n", - " \"co2_mass_lb\",\n", - " \"co2_mass_lb_for_electricity_adjusted\",\n", - "]\n", - "egrid_plant_ba_agg = egrid_plant.groupby([\"ba_code\"]).sum()[data_columns].reset_index()\n" + "egrid_plant_ba_agg = egrid_plant.groupby([\"ba_code\"]).sum()[DATA_COLUMNS].reset_index()\n", + "\n", + "egrid_plant_ba_agg[\"generated_co2_rate_lb_per_mwh\"] = egrid_plant_ba_agg[\"co2_mass_lb\"] / egrid_plant_ba_agg[\"net_generation_mwh\"]" ] }, { @@ -430,14 +469,6 @@ "outputs": [], "source": [ "# load our annual ba data\n", - "DATA_COLUMNS = [\n", - " \"net_generation_mwh\",\n", - " \"fuel_consumed_mmbtu\",\n", - " \"fuel_consumed_for_electricity_mmbtu\",\n", - " \"co2_mass_lb\",\n", - " \"co2_mass_lb_adjusted\",\n", - "]\n", - "\n", "calculated_ba = []\n", "\n", "for filename in os.listdir(\n", @@ -455,7 +486,9 @@ " ba_data = ba_data[[\"ba_code\"] + DATA_COLUMNS]\n", " calculated_ba.append(ba_data)\n", "\n", - "calculated_ba = pd.concat(calculated_ba, axis=0)\n" + "calculated_ba = pd.concat(calculated_ba, axis=0)\n", + "\n", + "calculated_ba[\"generated_co2_rate_lb_per_mwh\"] = calculated_ba[\"co2_mass_lb\"] / calculated_ba[\"net_generation_mwh\"]\n" ] }, { @@ -473,15 +506,6 @@ ").round(2)\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "percent_diff_from_egrid.sort_values(by=\"net_generation_mwh\")\n" - ] - }, { "cell_type": "code", "execution_count": null, @@ -497,21 +521,59 @@ " .sort_values(by=\"co2_mass_lb\")\n", " .round(3)\n", ")\n", + "ba_metric = ba_metric - 1\n", "\n", "total = pd.DataFrame(\n", - " calculated_ba[data_columns]\n", + " calculated_ba[DATA_COLUMNS + [\"generated_co2_rate_lb_per_mwh\"]]\n", " .sum()\n", - " .div(egrid_plant_ba_agg[data_columns].sum())\n", + " .div(egrid_plant_ba_agg[DATA_COLUMNS + [\"generated_co2_rate_lb_per_mwh\"]].sum())\n", " .rename(\"Total\")\n", ").T\n", + "total = total - 1\n", "\n", "# calculate the difference in the number of plants in each region\n", - "# plant_count = (plant_annual_total.groupby('ba_code', dropna=False).count()['plant_id_egrid'] - egrid_plant.groupby('ba_code', dropna=False).count()['plant_id_egrid']).rename('num_plants')\n", - "# ba_metric = ba_metric.merge(plant_count, how='left', left_index=True, right_index=True).drop(columns=['plant_id_egrid']).sort_index()\n", + "plant_count = (\n", + " annual_plant_results[\n", + " ~(\n", + " annual_plant_results[\n", + " [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " ]\n", + " ].sum(axis=1)\n", + " == 0\n", + " )\n", + " ]\n", + " .groupby(\"ba_code\", dropna=False)[\"plant_id_eia\"]\n", + " .nunique()\n", + " - egrid_plant[\n", + " ~(\n", + " egrid_plant[\n", + " [\n", + " 
\"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " ]\n", + " ].sum(axis=1)\n", + " == 0\n", + " )\n", + " ]\n", + " .groupby(\"ba_code\", dropna=False)[\"plant_id_eia\"]\n", + " .nunique()\n", + ").rename(\"num_plants\")\n", + "\n", + "ba_metric = ba_metric.merge(\n", + " plant_count, how=\"left\", left_index=True, right_index=True\n", + ").sort_index()\n", + "\n", + "ba_metric = ba_metric.sort_values(by=[\"generated_co2_rate_lb_per_mwh\"], ascending=True)\n", "\n", "ba_metric = pd.concat([ba_metric, total], axis=0).round(2)\n", "\n", - "ba_metric = ba_metric[data_columns]\n", + "ba_metric = ba_metric[DATA_COLUMNS + [\"generated_co2_rate_lb_per_mwh\", \"num_plants\"]]\n", "\n", "columns_to_check = [\n", " \"net_generation_mwh\",\n", @@ -521,19 +583,15 @@ "]\n", "\n", "with pd.option_context(\"display.max_rows\", None, \"display.max_columns\", None):\n", - " display(ba_metric[~(ba_metric[columns_to_check] == 1).all(axis=1)])\n" + " display(ba_metric[~(ba_metric[columns_to_check] == 0).all(axis=1)])\n" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Explore specific plants\n", - "\n", - "### Notes\n", - "\n", - "BA Totals\n", - " - TEPC and SRP are off because the Gila River Generator is shared between SRP and TEPC, and eGRID reports all generation from this project belonging to TEPC\n" + "## Explore specific plants\n" ] }, { diff --git a/notebooks/work_in_progress/GH279_missing_cems_data.ipynb b/notebooks/work_in_progress/GH279_missing_cems_data.ipynb new file mode 100644 index 00000000..904e85ac --- /dev/null +++ b/notebooks/work_in_progress/GH279_missing_cems_data.ipynb @@ -0,0 +1,731 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import packages\n", + "import pandas as pd\n", + "\n", + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# # Tell python where to look for modules.\n", + "import sys\n", + "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "from column_checks import get_dtypes\n", + "from filepaths import *\n", + "import load_data\n", + "from data_cleaning import *\n", + "import validation\n", + "import emissions\n", + "\n", + "year = 2021" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What does the cleaned CEMS data look like" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load data from csv\n", + "year = 2021\n", + "path_prefix = f\"{year}/\"\n", + "\n", + "cems = pd.read_csv(outputs_folder(f\"{path_prefix}/cems_cleaned_{year}.csv\"), dtype=get_dtypes())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems[cems[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems[cems[\"plant_id_eia\"] == 3].sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eia923_allocated = pd.read_csv(outputs_folder(f\"{path_prefix}/eia923_allocated_{year}.csv\"), dtype=get_dtypes())\n", + "eia923_allocated[eia923_allocated[\"plant_id_eia\"] == 
3].groupby([\"plant_id_eia\",\"subplant_id\",\"report_date\"]).sum(numeric_only=True).head(20)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test where data is being dropped" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " eia923_allocated,\n", + " primary_fuel_table,\n", + " subplant_emission_factors,\n", + ") = clean_eia923(year, False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# does the raw cems match this?\n", + "cems_raw = load_data.load_cems_data(year)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "barry.sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# remove non-grid connected plants\n", + "cems_raw = remove_plants(\n", + " cems_raw,\n", + " non_grid_connected=True,\n", + " remove_states=[\"PR\"],\n", + " steam_only_plants=False,\n", + " distribution_connected_plants=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# manually remove steam-only units\n", + "cems_raw = manually_remove_steam_units(cems_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add a report date\n", + "cems_raw = load_data.add_report_date(cems_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# remove data for any unit-months where there are incomplete data reported\n", + "# this is generally when there is a single observation reported for an entire month\n", + "cems_raw = remove_incomplete_unit_months(cems_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add subplant id\n", + "subplant_crosswalk = (\n", + " pd.read_csv(\n", + " outputs_folder(f\"{year}/subplant_crosswalk_{year}.csv\"),\n", + " dtype=get_dtypes(),\n", + " )[[\"plant_id_eia\", \"emissions_unit_id_epa\", \"subplant_id\"]]\n", + 
" .drop_duplicates()\n", + " .dropna(subset=\"emissions_unit_id_epa\")\n", + ")\n", + "cems_raw = cems_raw.merge(\n", + " subplant_crosswalk,\n", + " how=\"left\",\n", + " on=[\"plant_id_eia\", \"emissions_unit_id_epa\"],\n", + " validate=\"m:1\",\n", + ")\n", + "validation.test_for_missing_subplant_id(cems_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add a fuel type to each observation\n", + "cems_raw = assign_fuel_type_to_cems(cems_raw, year, primary_fuel_table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# fill in missing hourly emissions data using the fuel type and heat input\n", + "validation.test_for_missing_energy_source_code(cems_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# save a copy of the cems data at this point to test later\n", + "cems_test = cems_raw.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw = emissions.fill_cems_missing_co2(cems_test, year, subplant_emission_factors)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw = emissions.calculate_ghg_emissions_from_fuel_consumption(\n", + " df=cems_raw, year=year, include_co2=False, include_ch4=True, include_n2o=True\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw = remove_cems_with_zero_monthly_data(cems_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Investigate emissions filling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems_test[cems_test[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "barry = cems_test.copy() #[(cems_test[\"plant_id_eia\"] == 3)]\n", + "barry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add a new categorical option to the mass measurement code\n", + "barry[\"co2_mass_measurement_code\"] = barry[\n", + " \"co2_mass_measurement_code\"\n", + "].cat.add_categories(\"Imputed\")\n", + "\n", + "# replace all \"missing\" CO2 values with zero\n", + "barry[\"co2_mass_lb\"] = barry[\"co2_mass_lb\"].fillna(0)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# replace 0 reported CO2 values with missing values, if there was reported heat input\n", + "barry.loc[\n", + " (barry[\"co2_mass_lb\"] == 0) & (barry[\"fuel_consumed_mmbtu\"] > 0),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# replace 0 reported CO2 values with missing values, if there was reported heat input\n", + "barry.loc[\n", + " (barry[\"co2_mass_lb\"] == 0) & (barry[\"fuel_consumed_mmbtu\"] > 0),\n", + " \"co2_mass_lb\",\n", + "] = np.NaN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a new df with all observations with missing co2 data\n", + "missing_co2 = barry[barry[\"co2_mass_lb\"].isnull()]\n", + "missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "unit_months_missing_co2 = missing_co2[\n", + " [\"plant_id_eia\", \"emissions_unit_id_epa\", \"report_date\"]\n", + " ].drop_duplicates()\n", + "unit_months_missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get non-missing data from cems for these unit months\n", + "unit_months_missing_co2 = unit_months_missing_co2.merge(\n", + " barry[\n", + " [\n", + " \"plant_id_eia\",\n", + " \"emissions_unit_id_epa\",\n", + " \"report_date\",\n", + " \"co2_mass_lb\",\n", + " \"fuel_consumed_mmbtu\",\n", + " ]\n", + " ],\n", + " how=\"left\",\n", + " on=[\"plant_id_eia\", \"emissions_unit_id_epa\", \"report_date\"],\n", + " validate=\"1:m\",\n", + ")\n", + "unit_months_missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "unit_months_missing_co2 = unit_months_missing_co2[\n", + " (unit_months_missing_co2[\"co2_mass_lb\"] > 0)\n", + " & (unit_months_missing_co2[\"fuel_consumed_mmbtu\"] > 0)\n", + " ]\n", + "unit_months_missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# calculate total fuel consumption and emissions by month\n", + "unit_month_efs = (\n", + " unit_months_missing_co2.groupby(\n", + " [\"plant_id_eia\", \"emissions_unit_id_epa\", \"report_date\"], dropna=False\n", + " )\n", + " .sum()\n", + " .reset_index()\n", + ")\n", + "unit_month_efs[\"co2_lb_per_mmbtu\"] = (\n", + " unit_month_efs[\"co2_mass_lb\"] / unit_month_efs[\"fuel_consumed_mmbtu\"]\n", + ")\n", + "unit_month_efs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# merge these EFs into the missing cems data\n", + "missing_co2 = missing_co2.merge(\n", + " unit_month_efs[\n", + " 
[\"plant_id_eia\", \"report_date\", \"emissions_unit_id_epa\", \"co2_lb_per_mmbtu\"]\n", + " ],\n", + " how=\"left\",\n", + " on=[\"plant_id_eia\", \"report_date\", \"emissions_unit_id_epa\"],\n", + " validate=\"m:1\",\n", + ").set_index(missing_co2.index)\n", + "missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# only keep observations where there is a non-missing ef\n", + "missing_co2 = missing_co2[~missing_co2[\"co2_lb_per_mmbtu\"].isna()]\n", + "\n", + "# calculate missing co2 data\n", + "missing_co2[\"co2_mass_lb\"] = (\n", + " missing_co2[\"fuel_consumed_mmbtu\"] * missing_co2[\"co2_lb_per_mmbtu\"]\n", + ")\n", + "missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# update in CEMS table\n", + "barry.update(missing_co2[[\"co2_mass_lb\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# update the co2 mass measurement code\n", + "barry.loc[missing_co2.index, \"co2_mass_measurement_code\"] = \"Imputed\"\n", + "\n", + "# identify all observations that are still missing co2 data\n", + "missing_co2 = barry[barry[\"co2_mass_lb\"].isnull()]\n", + "missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# merge the weighted ef into the missing data\n", + "missing_co2 = missing_co2.merge(\n", + " subplant_emission_factors[\n", + " [\"plant_id_eia\", \"report_date\", \"subplant_id\", \"co2_lb_per_mmbtu\"]\n", + " ],\n", + " how=\"left\",\n", + " on=[\"plant_id_eia\", \"report_date\", \"subplant_id\"],\n", + " validate=\"m:1\",\n", + ").set_index(missing_co2.index)\n", + "missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# only keep observations where there is a non-missing ef\n", + "missing_co2 = missing_co2[~missing_co2[\"co2_lb_per_mmbtu\"].isna()]\n", + "\n", + "# calculate missing co2 data\n", + "missing_co2[\"co2_mass_lb\"] = (\n", + " missing_co2[\"fuel_consumed_mmbtu\"] * missing_co2[\"co2_lb_per_mmbtu\"]\n", + ")\n", + "missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# update in barry table\n", + "barry.update(missing_co2[[\"co2_mass_lb\"]])\n", + "\n", + "# update the co2 mass measurement code\n", + "barry.loc[missing_co2.index, \"co2_mass_measurement_code\"] = \"Imputed\"\n", + "\n", + "# identify all observations that are still missing co2 data\n", + "missing_co2 = barry[barry[\"co2_mass_lb\"].isnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "missing_co2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for rows that have a successful fuel code match, move to a temporary dataframe to hold the data\n", + "co2_to_fill = missing_co2.copy()[~missing_co2[\"energy_source_code\"].isna()]\n", + "fill_index = co2_to_fill.index\n", + "co2_to_fill" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# calculate emissions based on fuel type\n", + "co2_to_fill = emissions.calculate_ghg_emissions_from_fuel_consumption(\n", + " df=co2_to_fill,\n", + " year=year,\n", + " include_co2=True,\n", + " include_ch4=False,\n", + " include_n2o=False,\n", + 
").set_index(fill_index)\n", + "\n", + "co2_to_fill" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# fill this data into the original cems data\n", + "barry.update(co2_to_fill[[\"co2_mass_lb\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "barry[[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\", \"co2_mass_lb\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_fill = cems_test.loc[cems_test[\"co2_mass_lb\"] > 0,[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\", \"co2_mass_lb\"]]\n", + "test_fill = test_fill.merge(barry[[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\", \"co2_mass_lb\"]], how=\"left\", on=[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\"], validate=\"1:1\", suffixes=(\"_original\",\"_postfill\"))\n", + "test_fill[\"diff\"] = test_fill[\"co2_mass_lb_postfill\"] - test_fill[\"co2_mass_lb_original\"]\n", + "test_fill[test_fill[\"diff\"] != 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "open_grid_emissions", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "25e36f192ecdbe5da57d9bea009812e7b11ef0e0053366a845a2802aae1b29d2" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/data_cleaning.py b/src/data_cleaning.py index 6ce26ca7..dd88e073 100644 --- a/src/data_cleaning.py +++ b/src/data_cleaning.py @@ -1298,6 +1298,20 @@ def remove_cems_with_zero_monthly_data(cems): print( f" Removing {len(cems[cems['missing_data_flag'] == 'remove'])} observations from cems for unit-months where no data reported" ) + check_that_data_is_zero = cems[ + cems["missing_data_flag"] == "remove", + [ + "gross_generation_mwh", + "steam_load_1000_lb", + "fuel_consumed_mmbtu", + "co2_mass_lb", + "nox_mass_lb", + "so2_mass_lb", + ], + ].sum(numeric_only=True) + if check_that_data_is_zero.sum() > 0: + print("WARNING: Some data being removed has non-zero data associated with it:") + print(check_that_data_is_zero) cems = cems[cems["missing_data_flag"] != "remove"] # drop the missing data flag column cems = cems.drop(columns="missing_data_flag") diff --git a/src/emissions.py b/src/emissions.py index 0987b70d..01245150 100644 --- a/src/emissions.py +++ b/src/emissions.py @@ -1753,7 +1753,7 @@ def fill_cems_missing_co2(cems, year, subplant_emission_factors): how="left", on=["plant_id_eia", "report_date", "emissions_unit_id_epa"], validate="m:1", - ) + ).set_index(missing_co2.index) # only keep observations where there is a non-missing ef missing_co2 = missing_co2[~missing_co2["co2_lb_per_mmbtu"].isna()] @@ -1783,7 +1783,7 @@ def fill_cems_missing_co2(cems, year, subplant_emission_factors): how="left", on=["plant_id_eia", "report_date", "subplant_id"], validate="m:1", - ) + ).set_index(missing_co2.index) # only keep observations where there is a non-missing ef missing_co2 = 
missing_co2[~missing_co2["co2_lb_per_mmbtu"].isna()] @@ -1831,4 +1831,7 @@ def fill_cems_missing_co2(cems, year, subplant_emission_factors): "There are still misssing CO2 values remaining after filling missing CO2 values in CEMS" ) + # check that no non-missing co2 values were modified during filling + validation.check_non_missing_cems_co2_values_unchanged(cems, year) + return cems diff --git a/src/validation.py b/src/validation.py index 15d05827..5564e225 100644 --- a/src/validation.py +++ b/src/validation.py @@ -265,6 +265,38 @@ def test_for_missing_energy_source_code(df): return missing_esc_test +def check_non_missing_cems_co2_values_unchanged(cems, year): + """Checks that no non-missing CO2 values were modified during the process of filling.""" + print(" Checking that original CO2 data in CEMS was not modified by filling missing values...", end="") + # re-load the raw cems data + cems_original = load_data.load_cems_data(year) + # only keep non-zero and non-missing co2 values, since these should have not been modified + cems_original = cems_original.loc[ + cems_original["co2_mass_lb"] > 0, + ["plant_id_eia", "emissions_unit_id_epa", "datetime_utc", "co2_mass_lb"], + ] + test_fill = cems_original.merge( + cems[["plant_id_eia", "emissions_unit_id_epa", "datetime_utc", "co2_mass_lb"]], + how="left", + on=["plant_id_eia", "emissions_unit_id_epa", "datetime_utc"], + validate="1:1", + suffixes=("_original", "_postfill"), + ) + test_fill["diff"] = ( + test_fill["co2_mass_lb_postfill"] - test_fill["co2_mass_lb_original"] + ) + if len(test_fill[test_fill["diff"] != 0]) > 0: + print(" ") + print( + f"WARNING: There are {len(test_fill[test_fill["diff"] != 0])} non-missing CO2 CEMS records that were modified by `fill_cems_missing_co2` in error" + ) + else: + print("OK") + + del cems_original + + + def test_for_missing_subplant_id(df): """Checks if any records are missing a `subplant_id`.""" print(" Checking that all data has an associated `subplant_id`... 
", end="") @@ -1831,6 +1863,56 @@ def compare_plant_level_results_to_egrid( [comparison_count, pd.DataFrame(comparison_count.sum().rename("Total")).T], axis=0, ) + + compared = compared_merged.merge( + compared[ + [ + "plant_name_eia", + "ba_code", + "state", + "net_generation_mwh_status", + "fuel_consumed_mmbtu_status", + "fuel_consumed_for_electricity_mmbtu_status", + "co2_mass_lb_for_electricity_adjusted_status", + "co2_mass_lb_status", + "so2_mass_lb_status", + "nox_mass_lb_status", + ] + ], + how="left", + left_index=True, + right_index=True, + ) + + compared = compared[ + [ + "plant_name_eia", + "ba_code", + "state", + "net_generation_mwh_status", + "net_generation_mwh_calc", + "net_generation_mwh_egrid", + "fuel_consumed_mmbtu_status", + "fuel_consumed_mmbtu_calc", + "fuel_consumed_mmbtu_egrid", + "fuel_consumed_for_electricity_mmbtu_status", + "fuel_consumed_for_electricity_mmbtu_calc", + "fuel_consumed_for_electricity_mmbtu_egrid", + "co2_mass_lb_status", + "co2_mass_lb_calc", + "co2_mass_lb_egrid", + "nox_mass_lb_status", + "nox_mass_lb_calc", + "nox_mass_lb_egrid", + "so2_mass_lb_status", + "so2_mass_lb_calc", + "so2_mass_lb_egrid", + "co2_mass_lb_for_electricity_adjusted_status", + "co2_mass_lb_for_electricity_adjusted_calc", + "co2_mass_lb_for_electricity_adjusted_egrid", + ] + ] + return comparison_count, compared From 6127925965e6236f9dfb27e6ad5fc20f8470ef6b Mon Sep 17 00:00:00 2001 From: grgmiller Date: Sat, 11 Feb 2023 13:55:02 -0800 Subject: [PATCH 06/27] move validation function --- src/data_cleaning.py | 15 +-------------- src/validation.py | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/data_cleaning.py b/src/data_cleaning.py index dd88e073..30294715 100644 --- a/src/data_cleaning.py +++ b/src/data_cleaning.py @@ -1298,20 +1298,7 @@ def remove_cems_with_zero_monthly_data(cems): print( f" Removing {len(cems[cems['missing_data_flag'] == 'remove'])} observations from cems for unit-months where no data reported" ) - check_that_data_is_zero = cems[ - cems["missing_data_flag"] == "remove", - [ - "gross_generation_mwh", - "steam_load_1000_lb", - "fuel_consumed_mmbtu", - "co2_mass_lb", - "nox_mass_lb", - "so2_mass_lb", - ], - ].sum(numeric_only=True) - if check_that_data_is_zero.sum() > 0: - print("WARNING: Some data being removed has non-zero data associated with it:") - print(check_that_data_is_zero) + validation.check_removed_data_is_empty(cems) cems = cems[cems["missing_data_flag"] != "remove"] # drop the missing data flag column cems = cems.drop(columns="missing_data_flag") diff --git a/src/validation.py b/src/validation.py index 5564e225..64f87203 100644 --- a/src/validation.py +++ b/src/validation.py @@ -295,7 +295,22 @@ def check_non_missing_cems_co2_values_unchanged(cems, year): del cems_original - +def check_removed_data_is_empty(cems): + """Checks that the rows removed by `data_cleaning.remove_cems_with_zero_monthly_data()` don't actually contain non-zero data""" + check_that_data_is_zero = cems[ + cems["missing_data_flag"] == "remove", + [ + "gross_generation_mwh", + "steam_load_1000_lb", + "fuel_consumed_mmbtu", + "co2_mass_lb", + "nox_mass_lb", + "so2_mass_lb", + ], + ].sum(numeric_only=True) + if check_that_data_is_zero.sum() > 0: + print("WARNING: Some data being removed has non-zero data associated with it:") + print(check_that_data_is_zero) def test_for_missing_subplant_id(df): """Checks if any records are missing a `subplant_id`.""" From c9604d6acdb299f58a22cdfedc5c08848480b99e Mon Sep 17 00:00:00 2001 From: 
grgmiller Date: Sat, 11 Feb 2023 14:00:15 -0800 Subject: [PATCH 07/27] update format --- src/validation.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/validation.py b/src/validation.py index 64f87203..090de123 100644 --- a/src/validation.py +++ b/src/validation.py @@ -267,7 +267,10 @@ def test_for_missing_energy_source_code(df): def check_non_missing_cems_co2_values_unchanged(cems, year): """Checks that no non-missing CO2 values were modified during the process of filling.""" - print(" Checking that original CO2 data in CEMS was not modified by filling missing values...", end="") + print( + " Checking that original CO2 data in CEMS was not modified by filling missing values...", + end="", + ) # re-load the raw cems data cems_original = load_data.load_cems_data(year) # only keep non-zero and non-missing co2 values, since these should have not been modified @@ -285,16 +288,18 @@ def check_non_missing_cems_co2_values_unchanged(cems, year): test_fill["diff"] = ( test_fill["co2_mass_lb_postfill"] - test_fill["co2_mass_lb_original"] ) - if len(test_fill[test_fill["diff"] != 0]) > 0: + num_nonzero_rows = len(test_fill[test_fill["diff"] != 0]) + if num_nonzero_rows > 0: print(" ") print( - f"WARNING: There are {len(test_fill[test_fill["diff"] != 0])} non-missing CO2 CEMS records that were modified by `fill_cems_missing_co2` in error" + f"WARNING: There are {num_nonzero_rows} non-missing CO2 CEMS records that were modified by `fill_cems_missing_co2` in error" ) else: print("OK") del cems_original + def check_removed_data_is_empty(cems): """Checks that the rows removed by `data_cleaning.remove_cems_with_zero_monthly_data()` don't actually contain non-zero data""" check_that_data_is_zero = cems[ @@ -312,6 +317,7 @@ def check_removed_data_is_empty(cems): print("WARNING: Some data being removed has non-zero data associated with it:") print(check_that_data_is_zero) + def test_for_missing_subplant_id(df): """Checks if any records are missing a `subplant_id`.""" print(" Checking that all data has an associated `subplant_id`... 
", end="") From 9ff5541e617c84779adbcdaefffcc1bb2b48a3b0 Mon Sep 17 00:00:00 2001 From: grgmiller Date: Sat, 11 Feb 2023 15:55:43 -0800 Subject: [PATCH 08/27] add diff notebook --- .../validation/diff_output_versions.ipynb | 208 ++++++++++++++++++ .../GH279_missing_cems_data.ipynb | 78 ++++++- src/emissions.py | 5 +- src/validation.py | 8 +- 4 files changed, 290 insertions(+), 9 deletions(-) create mode 100644 notebooks/validation/diff_output_versions.ipynb diff --git a/notebooks/validation/diff_output_versions.ipynb b/notebooks/validation/diff_output_versions.ipynb new file mode 100644 index 00000000..19f6a649 --- /dev/null +++ b/notebooks/validation/diff_output_versions.ipynb @@ -0,0 +1,208 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import packages\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import plotly.express as px\n", + "import zipfile\n", + "\n", + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# # Tell python where to look for modules.\n", + "import sys\n", + "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "import download_data\n", + "import load_data\n", + "from column_checks import get_dtypes\n", + "from filepaths import *\n", + "import impute_hourly_profiles\n", + "import data_cleaning\n", + "import output_data\n", + "import emissions\n", + "import validation\n", + "import gross_to_net_generation\n", + "import eia930\n", + "\n", + "year = 2021\n", + "path_prefix = f\"{year}/\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## About this notebook\n", + "This notebook can be used to identify differences between one version of OGE data and another. \n", + "This is useful if you want to identify how much a code update affects the output results.\n", + "\n", + "This notebook compares files in the `outputs` and `results` directory against archived data in the `zenodo` or `s3_upload` directories. \n", + "This assumes that the previous, stable version of the data outputs are archived on your computer." 
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compare plant data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load archived data\n",
+    "data_type = \"plant_data\"\n",
+    "resolution = \"annual\"\n",
+    "\n",
+    "# unzip archived data\n",
+    "if not os.path.exists(data_folder(\"diff\")):\n",
+    "    os.mkdir(data_folder(\"diff\"))\n",
+    "with zipfile.ZipFile(data_folder(f\"s3_upload/{year}_{data_type}_{resolution}_us_units.zip\"), \"r\") as zip_to_unzip:\n",
+    "    zip_to_unzip.extractall(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units\"))\n",
+    "\n",
+    "# load archived data\n",
+    "prev_data = pd.read_csv(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units/plant_data.csv\"), dtype=get_dtypes()).round(0)\n",
+    "\n",
+    "# load new data\n",
+    "new_data = pd.read_csv(results_folder(f\"{year}/{data_type}/{resolution}/us_units/plant_data.csv\"), dtype=get_dtypes()).round(0)\n",
+    "\n",
+    "# load plant attributes\n",
+    "plant_attributes = pd.read_csv(outputs_folder(f\"{year}/plant_static_attributes_{year}.csv\"), dtype=get_dtypes())\n",
+    "\n",
+    "prev_data = prev_data.merge(plant_attributes[[\"plant_id_eia\",\"ba_code\",\"fuel_category\"]], how=\"left\", on=\"plant_id_eia\")\n",
+    "new_data = new_data.merge(plant_attributes[[\"plant_id_eia\",\"ba_code\",\"fuel_category\"]], how=\"left\", on=\"plant_id_eia\")\n",
+    "\n",
+    "key_cols = [\"plant_id_eia\",\"ba_code\",\"fuel_category\"]\n",
+    "comparison = prev_data.set_index(key_cols).compare(new_data.set_index(key_cols), result_names=(\"previous\",\"new\"))\n",
+    "\n",
+    "# get difference\n",
+    "diff = comparison.groupby(level=0, axis=1).diff().rename(columns={\"new\":\"pct_diff\"}).drop(columns=[\"previous\"], level=1)\n",
+    "comparison = pd.concat([comparison, diff], axis=1).sort_index(axis=1, level=0, ascending=True, sort_remaining=False)\n",
+    "comparison.iloc[:, comparison.columns.get_level_values(1)=='pct_diff'] = (comparison.iloc[:, comparison.columns.get_level_values(1)=='pct_diff'].values / comparison.iloc[:, comparison.columns.get_level_values(1)=='previous'].values).round(2)\n",
+    "\n",
+    "comparison\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Compare BA data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load archived data\n",
+    "data_type = \"power_sector_data\"\n",
+    "resolution = \"annual\"\n",
+    "\n",
+    "# unzip archived data\n",
+    "if not os.path.exists(data_folder(\"diff\")):\n",
+    "    os.mkdir(data_folder(\"diff\"))\n",
+    "with zipfile.ZipFile(data_folder(f\"s3_upload/{year}_{data_type}_{resolution}_us_units.zip\"), \"r\") as zip_to_unzip:\n",
+    "    zip_to_unzip.extractall(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units\"))\n",
+    "\n",
+    "# load archived data\n",
+    "prev_data = []\n",
+    "for ba in os.listdir(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units\")):\n",
+    "    df = pd.read_csv(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units/{ba}\"), dtype=get_dtypes())\n",
+    "    df[\"ba_code\"] = ba.split(\".\")[0]\n",
+    "    prev_data.append(df)\n",
+    "\n",
+    "prev_data = pd.concat(prev_data, axis=0).reset_index(drop=True)\n",
+    "\n",
+    "# load data\n",
+    "new_data = []\n",
+    "for ba in os.listdir(results_folder(f\"{year}/{data_type}/{resolution}/us_units\")):\n",
+    "    df = pd.read_csv(results_folder(f\"{year}/{data_type}/{resolution}/us_units/{ba}\"), dtype=get_dtypes())\n",
+    "    df[\"ba_code\"] = ba.split(\".\")[0]\n",
+    "    new_data.append(df)\n",
+    "\n",
+    "new_data = pd.concat(new_data, axis=0).reset_index(drop=True)\n",
+    "\n",
+    "key_cols = [\"ba_code\", \"fuel_category\"]\n",
+    "comparison = prev_data.set_index(key_cols).compare(new_data.set_index(key_cols), result_names=(\"previous\",\"new\"))\n",
+    "comparison\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compare intermediate outputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load archived data\n",
+    "file = \"cems_cleaned\",\n",
+    "key_cols = [\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\"]\n",
+    "\n",
+    "# unzip archived data\n",
+    "if not os.path.exists(data_folder(f\"diff/outputs_{year}\")):\n",
+    "    os.mkdir(data_folder(f\"diff/outputs_{year}\"))\n",
+    "    with zipfile.ZipFile(data_folder(f\"zenodo/outputs_{year}.zip\"), \"r\") as zip_to_unzip:\n",
+    "        zip_to_unzip.extractall(data_folder(f\"diff/outputs_{year}\"))\n",
+    "\n",
+    "# load archived data\n",
+    "prev_data = pd.read_csv(data_folder(f\"diff/outputs_{year}/{file}_{year}.csv\"), dtype=get_dtypes())\n",
+    "\n",
+    "# load new data\n",
+    "new_data = pd.read_csv(outputs_folder(f\"{year}/{file}_{year}.csv\"), dtype=get_dtypes())\n",
+    "\n",
+    "comparison = prev_data.set_index(key_cols).compare(new_data.set_index(key_cols), result_names=(\"previous\",\"new\"))\n",
+    "comparison\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "open_grid_emissions",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "25e36f192ecdbe5da57d9bea009812e7b11ef0e0053366a845a2802aae1b29d2"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/work_in_progress/GH279_missing_cems_data.ipynb b/notebooks/work_in_progress/GH279_missing_cems_data.ipynb
index 904e85ac..c90d0608 100644
--- a/notebooks/work_in_progress/GH279_missing_cems_data.ipynb
+++ b/notebooks/work_in_progress/GH279_missing_cems_data.ipynb
@@ -686,19 +686,93 @@
     "test_fill[test_fill[\"diff\"] != 0]"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Why are non-missing values being removed?"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "(\n",
+    "    eia923_allocated,\n",
+    "    primary_fuel_table,\n",
+    "    subplant_emission_factors,\n",
+    ") = clean_eia923(year, False)\n",
+    "\n",
+    "# load the CEMS data\n",
+    "cems = load_data.load_cems_data(year)\n",
+    "\n",
+    "\n",
+    "# remove non-grid connected plants\n",
+    "cems = remove_plants(\n",
+    "    cems,\n",
+    "    non_grid_connected=True,\n",
+    "    remove_states=[\"PR\"],\n",
+    "    steam_only_plants=False,\n",
+    "    distribution_connected_plants=False,\n",
+    ")\n",
+    "\n",
+    "# manually remove steam-only units\n",
+    "cems = manually_remove_steam_units(cems)\n",
+    "\n",
+    "# add a report date\n",
+    "cems = load_data.add_report_date(cems)\n",
+    "\n",
+    "# remove data for any unit-months where there are incomplete data reported\n",
+    "# this is generally when there is a single observation reported for an entire month\n",
+    "cems = remove_incomplete_unit_months(cems)\n",
+    "\n",
+    "# TODO: identify and remove any hourly values that appear to be outliers\n",
+    "# See: https://github.com/singularity-energy/open-grid-emissions/issues/50\n",
+    "\n",
+    "# add subplant id\n",
+    "subplant_crosswalk = (\n",
+    "    pd.read_csv(\n",
+    "        outputs_folder(f\"{year}/subplant_crosswalk_{year}.csv\"),\n",
+    "        dtype=get_dtypes(),\n",
+    "    )[[\"plant_id_eia\", \"emissions_unit_id_epa\", \"subplant_id\"]]\n",
+    "    .drop_duplicates()\n",
+    "    .dropna(subset=\"emissions_unit_id_epa\")\n",
+    ")\n",
+    "cems = cems.merge(\n",
+    "    subplant_crosswalk,\n",
+    "    how=\"left\",\n",
+    "    on=[\"plant_id_eia\", \"emissions_unit_id_epa\"],\n",
+    "    validate=\"m:1\",\n",
+    ")\n",
+    "validation.test_for_missing_subplant_id(cems)\n",
+    "\n",
+    "# add a fuel type to each observation\n",
+    "cems = assign_fuel_type_to_cems(cems, year, primary_fuel_table)\n",
+    "\n",
+    "# fill in missing hourly emissions data using the fuel type and heat input\n",
+    "validation.test_for_missing_energy_source_code(cems)"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "cems_test = cems.copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cems_test = emissions.fill_cems_missing_co2(cems_test, year, subplant_emission_factors)"
+   ]
   }
  ],
  "metadata": {
diff --git a/src/emissions.py b/src/emissions.py
index 01245150..966b87d3 100644
--- a/src/emissions.py
+++ b/src/emissions.py
@@ -1688,6 +1688,8 @@ def fill_cems_missing_co2(cems, year, subplant_emission_factors):
     3. For any remaining missing values, calculate emissions based on the subplant
         primary fuel and fuel consumption
     """
+    # make a copy of the cems data so that we can validate the outputs
+    cems_original = cems.copy()
     # add a new categorical option to the mass measurement code
     cems["co2_mass_measurement_code"] = cems[
         "co2_mass_measurement_code"
@@ -1832,6 +1834,7 @@ def fill_cems_missing_co2(cems, year, subplant_emission_factors):
     )
 
     # check that no non-missing co2 values were modified during filling
-    validation.check_non_missing_cems_co2_values_unchanged(cems, year)
+    validation.check_non_missing_cems_co2_values_unchanged(cems_original, cems)
+    del cems_original
 
     return cems
diff --git a/src/validation.py b/src/validation.py
index 090de123..0f2fcd86 100644
--- a/src/validation.py
+++ b/src/validation.py
@@ -265,14 +265,12 @@ def test_for_missing_energy_source_code(df):
     return missing_esc_test
 
 
-def check_non_missing_cems_co2_values_unchanged(cems, year):
+def check_non_missing_cems_co2_values_unchanged(cems_original, cems):
     """Checks that no non-missing CO2 values were modified during the process of filling."""
     print(
         "    Checking that original CO2 data in CEMS was not modified by filling missing values...",
         end="",
     )
-    # re-load the raw cems data
-    cems_original = load_data.load_cems_data(year)
     # only keep non-zero and non-missing co2 values, since these should have not been modified
     cems_original = cems_original.loc[
         cems_original["co2_mass_lb"] > 0,
@@ -297,12 +295,10 @@ def check_non_missing_cems_co2_values_unchanged(cems, year):
     else:
         print("OK")
 
-    del cems_original
-
 
 def check_removed_data_is_empty(cems):
     """Checks that the rows removed by `data_cleaning.remove_cems_with_zero_monthly_data()` don't actually contain non-zero data"""
-    check_that_data_is_zero = cems[
+    check_that_data_is_zero = cems.loc[
         cems["missing_data_flag"] == "remove",
         [
             "gross_generation_mwh",

From 4e0f919fe641bbf49b30a496ab31436b47720ae8 Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Sat, 11 Feb 2023 17:08:43 -0800
Subject: [PATCH 09/27] clean up notebook

---
 .../validation/diff_output_versions.ipynb | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/notebooks/validation/diff_output_versions.ipynb b/notebooks/validation/diff_output_versions.ipynb
index 19f6a649..b6c8d200 100644
--- a/notebooks/validation/diff_output_versions.ipynb
+++ b/notebooks/validation/diff_output_versions.ipynb
@@ -8,9 +8,7 @@
    "source": [
     "# import packages\n",
     "import pandas as pd\n",
-    "import numpy as np\n",
     "import os\n",
-    "import plotly.express as px\n",
     "import zipfile\n",
     "\n",
     "%reload_ext autoreload\n",
@@ -20,17 +18,10 @@
     "import sys\n",
     "sys.path.append('../../../open-grid-emissions/src/')\n",
     "\n",
-    "import download_data\n",
     "import load_data\n",
     "from column_checks import get_dtypes\n",
     "from filepaths import *\n",
-    "import impute_hourly_profiles\n",
-    "import data_cleaning\n",
-    "import output_data\n",
-    "import emissions\n",
-    "import validation\n",
-    "import gross_to_net_generation\n",
-    "import eia930\n",
+    "\n",
     "\n",
     "year = 2021\n",
     "path_prefix = f\"{year}/\""
@@ -96,6 +87,15 @@
     "comparison\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "comparison[comparison.loc[:,(\"co2_mass_lb_for_electricity\",\"pct_diff\")] > 0.001]#.groupby(\"ba_code\").sum().sum()"
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",

From 390318693e7020245baf08858384291b2d0796 Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Wed, 15 Feb 2023 12:47:32 -0800
Subject: [PATCH 10/27] use isclose

---
 src/validation.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/validation.py b/src/validation.py
index 4fa35e5e..7ef45054 100644
--- a/src/validation.py
+++ b/src/validation.py
@@ -40,18 +40,12 @@ def validate_year(year):
         raise UserWarning(year_warning)
 
 
-def check_allocated_gf_matches_input_gf(
-    pudl_out, gen_fuel_allocated, threshold_percent=0.001
-):
+def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated):
     """
     Checks that the allocated generation and fuel from EIA-923 matches the input totals.
 
-    Because there might be small rounding errors in the allocation that make the
-    allocated total slightly off from the input data, we allow the user to specify a
-    threshold percentage above which mismatched data is flagged. The default value is
-    0.1%, so that if either the allocated total fuel consumption or allocated total net
-    generation is more than +/-0.1% different from the total input generation or fuel,
-    the record is flagged.
+    We use np.isclose() to identify any values that are off by more than 1e-9% different
+    from the total input generation or fuel.
     """
     gf = pudl_out.gf_eia923()
     plant_total_gf = gf.groupby("plant_id_eia")[
@@ -69,11 +63,13 @@ def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated):
         ]
     ].sum()
     # calculate the percentage difference between the values
-    plant_total_diff = (plant_total_alloc - plant_total_gf) / plant_total_gf
+    plant_total_diff = ((plant_total_alloc - plant_total_gf) / plant_total_gf).dropna(
+        how="any", axis=0
+    )
     # flag rows where the absolute percentage difference is greater than our threshold
     mismatched_allocation = plant_total_diff[
-        (abs(plant_total_diff["fuel_consumed_mmbtu"]) > threshold_percent)
-        | (abs(plant_total_diff["net_generation_mwh"]) > threshold_percent)
+        (~np.isclose(plant_total_diff["fuel_consumed_mmbtu"], 0))
+        | (~np.isclose(plant_total_diff["net_generation_mwh"], 0))
     ]
     if len(mismatched_allocation) > 0:
         print(

From 50977c50145826da5cad87f4b4fa1be23725bc54 Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Wed, 15 Feb 2023 12:48:53 -0800
Subject: [PATCH 11/27] update validation parameters

---
 src/data_cleaning.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/data_cleaning.py b/src/data_cleaning.py
index 558c4b7d..6ce26ca7 100644
--- a/src/data_cleaning.py
+++ b/src/data_cleaning.py
@@ -401,9 +401,7 @@ def clean_eia923(
     )
 
     # test to make sure allocated totals match input totals
-    validation.check_allocated_gf_matches_input_gf(
-        pudl_out, gen_fuel_allocated, threshold_percent=0.001
-    )
+    validation.check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated)
 
     # manually update energy source code when OTH
     gen_fuel_allocated = update_energy_source_codes(gen_fuel_allocated)

From c9472bc17c72a64eb3fa2a3118b3121dd96e4746 Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Wed, 15 Feb 2023 12:56:20 -0800
Subject: [PATCH 12/27] remove WIP notebook

---
 .../GH279_missing_cems_data.ipynb | 805 ------------------
 1 file changed, 805 deletions(-)
 delete mode 100644 notebooks/work_in_progress/GH279_missing_cems_data.ipynb

diff --git a/notebooks/work_in_progress/GH279_missing_cems_data.ipynb b/notebooks/work_in_progress/GH279_missing_cems_data.ipynb
deleted file mode 100644
index c90d0608..00000000
--- a/notebooks/work_in_progress/GH279_missing_cems_data.ipynb
+++ /dev/null
@@ -1,805 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# import packages\n",
-    "import pandas as pd\n",
-    "\n",
-    "%reload_ext autoreload\n",
-    "%autoreload 2\n",
-    "\n",
-    "# # Tell python where to look for modules.\n",
-    "import sys\n",
-    "sys.path.append('../../../open-grid-emissions/src/')\n",
-    "\n",
-    "from column_checks import get_dtypes\n",
-    "from filepaths import *\n",
-    "import load_data\n",
-    "from data_cleaning import *\n",
-    "import validation\n",
-    "import emissions\n",
-    "\n",
-    "year = 2021"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## What does the cleaned CEMS data look like"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# load data from csv\n",
-    "year = 2021\n",
-    "path_prefix = f\"{year}/\"\n",
-    "\n",
-    "cems = pd.read_csv(outputs_folder(f\"{path_prefix}/cems_cleaned_{year}.csv\"), dtype=get_dtypes())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cems[cems[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cems[cems[\"plant_id_eia\"] == 3].sum(numeric_only=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "eia923_allocated = pd.read_csv(outputs_folder(f\"{path_prefix}/eia923_allocated_{year}.csv\"), dtype=get_dtypes())\n",
-    "eia923_allocated[eia923_allocated[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"subplant_id\",\"report_date\"]).sum(numeric_only=True).head(20)"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Test where data is being dropped"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "(\n",
-    "    eia923_allocated,\n",
-    "    primary_fuel_table,\n",
-    "    subplant_emission_factors,\n",
-    ") = clean_eia923(year, False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# does the raw cems match this?\n",
-    "cems_raw = load_data.load_cems_data(year)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "barry.sum(numeric_only=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# remove non-grid connected plants\n",
-    "cems_raw = remove_plants(\n",
-    "    cems_raw,\n",
-    "    non_grid_connected=True,\n",
-    "    remove_states=[\"PR\"],\n",
-    "    steam_only_plants=False,\n",
-    "    distribution_connected_plants=False,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# manually remove steam-only units\n",
-    "cems_raw = manually_remove_steam_units(cems_raw)"
-   ]
-  },
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add a report date\n", - "cems_raw = load_data.add_report_date(cems_raw)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# remove data for any unit-months where there are incomplete data reported\n", - "# this is generally when there is a single observation reported for an entire month\n", - "cems_raw = remove_incomplete_unit_months(cems_raw)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add subplant id\n", - "subplant_crosswalk = (\n", - " pd.read_csv(\n", - " outputs_folder(f\"{year}/subplant_crosswalk_{year}.csv\"),\n", - " dtype=get_dtypes(),\n", - " )[[\"plant_id_eia\", \"emissions_unit_id_epa\", \"subplant_id\"]]\n", - " .drop_duplicates()\n", - " .dropna(subset=\"emissions_unit_id_epa\")\n", - ")\n", - "cems_raw = cems_raw.merge(\n", - " subplant_crosswalk,\n", - " how=\"left\",\n", - " on=[\"plant_id_eia\", \"emissions_unit_id_epa\"],\n", - " validate=\"m:1\",\n", - ")\n", - "validation.test_for_missing_subplant_id(cems_raw)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add a fuel type to each observation\n", - "cems_raw = assign_fuel_type_to_cems(cems_raw, year, primary_fuel_table)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# fill in missing hourly emissions data using the fuel type and heat input\n", - "validation.test_for_missing_energy_source_code(cems_raw)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# save a copy of the cems data at this point to test later\n", - "cems_test = cems_raw.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw = emissions.fill_cems_missing_co2(cems_test, year, subplant_emission_factors)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw = emissions.calculate_ghg_emissions_from_fuel_consumption(\n", - " df=cems_raw, year=year, include_co2=False, include_ch4=True, include_n2o=True\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw = remove_cems_with_zero_monthly_data(cems_raw)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_raw[cems_raw[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Investigate emissions filling" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_test[cems_test[\"plant_id_eia\"] == 3].groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",]).sum(numeric_only=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "barry = cems_test.copy() #[(cems_test[\"plant_id_eia\"] == 3)]\n", - "barry" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add a new categorical option to the mass measurement code\n", - "barry[\"co2_mass_measurement_code\"] = barry[\n", - " \"co2_mass_measurement_code\"\n", - "].cat.add_categories(\"Imputed\")\n", - "\n", - "# replace all \"missing\" CO2 values with zero\n", - "barry[\"co2_mass_lb\"] = barry[\"co2_mass_lb\"].fillna(0)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# replace 0 reported CO2 values with missing values, if there was reported heat input\n", - "barry.loc[\n", - " (barry[\"co2_mass_lb\"] == 0) & (barry[\"fuel_consumed_mmbtu\"] > 0),\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# replace 0 reported CO2 values with missing values, if there was reported heat input\n", - "barry.loc[\n", - " (barry[\"co2_mass_lb\"] == 0) & (barry[\"fuel_consumed_mmbtu\"] > 0),\n", - " \"co2_mass_lb\",\n", - "] = np.NaN" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create a new df with all observations with missing co2 data\n", - "missing_co2 = barry[barry[\"co2_mass_lb\"].isnull()]\n", - "missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unit_months_missing_co2 = missing_co2[\n", - " [\"plant_id_eia\", \"emissions_unit_id_epa\", \"report_date\"]\n", - " ].drop_duplicates()\n", - "unit_months_missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - 
"source": [ - "# get non-missing data from cems for these unit months\n", - "unit_months_missing_co2 = unit_months_missing_co2.merge(\n", - " barry[\n", - " [\n", - " \"plant_id_eia\",\n", - " \"emissions_unit_id_epa\",\n", - " \"report_date\",\n", - " \"co2_mass_lb\",\n", - " \"fuel_consumed_mmbtu\",\n", - " ]\n", - " ],\n", - " how=\"left\",\n", - " on=[\"plant_id_eia\", \"emissions_unit_id_epa\", \"report_date\"],\n", - " validate=\"1:m\",\n", - ")\n", - "unit_months_missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unit_months_missing_co2 = unit_months_missing_co2[\n", - " (unit_months_missing_co2[\"co2_mass_lb\"] > 0)\n", - " & (unit_months_missing_co2[\"fuel_consumed_mmbtu\"] > 0)\n", - " ]\n", - "unit_months_missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# calculate total fuel consumption and emissions by month\n", - "unit_month_efs = (\n", - " unit_months_missing_co2.groupby(\n", - " [\"plant_id_eia\", \"emissions_unit_id_epa\", \"report_date\"], dropna=False\n", - " )\n", - " .sum()\n", - " .reset_index()\n", - ")\n", - "unit_month_efs[\"co2_lb_per_mmbtu\"] = (\n", - " unit_month_efs[\"co2_mass_lb\"] / unit_month_efs[\"fuel_consumed_mmbtu\"]\n", - ")\n", - "unit_month_efs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# merge these EFs into the missing cems data\n", - "missing_co2 = missing_co2.merge(\n", - " unit_month_efs[\n", - " [\"plant_id_eia\", \"report_date\", \"emissions_unit_id_epa\", \"co2_lb_per_mmbtu\"]\n", - " ],\n", - " how=\"left\",\n", - " on=[\"plant_id_eia\", \"report_date\", \"emissions_unit_id_epa\"],\n", - " validate=\"m:1\",\n", - ").set_index(missing_co2.index)\n", - "missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# only keep observations where there is a non-missing ef\n", - "missing_co2 = missing_co2[~missing_co2[\"co2_lb_per_mmbtu\"].isna()]\n", - "\n", - "# calculate missing co2 data\n", - "missing_co2[\"co2_mass_lb\"] = (\n", - " missing_co2[\"fuel_consumed_mmbtu\"] * missing_co2[\"co2_lb_per_mmbtu\"]\n", - ")\n", - "missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# update in CEMS table\n", - "barry.update(missing_co2[[\"co2_mass_lb\"]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# update the co2 mass measurement code\n", - "barry.loc[missing_co2.index, \"co2_mass_measurement_code\"] = \"Imputed\"\n", - "\n", - "# identify all observations that are still missing co2 data\n", - "missing_co2 = barry[barry[\"co2_mass_lb\"].isnull()]\n", - "missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# merge the weighted ef into the missing data\n", - "missing_co2 = missing_co2.merge(\n", - " subplant_emission_factors[\n", - " [\"plant_id_eia\", \"report_date\", \"subplant_id\", \"co2_lb_per_mmbtu\"]\n", - " ],\n", - " how=\"left\",\n", - " on=[\"plant_id_eia\", \"report_date\", \"subplant_id\"],\n", - " validate=\"m:1\",\n", - ").set_index(missing_co2.index)\n", - "missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# only keep observations where there is a 
non-missing ef\n", - "missing_co2 = missing_co2[~missing_co2[\"co2_lb_per_mmbtu\"].isna()]\n", - "\n", - "# calculate missing co2 data\n", - "missing_co2[\"co2_mass_lb\"] = (\n", - " missing_co2[\"fuel_consumed_mmbtu\"] * missing_co2[\"co2_lb_per_mmbtu\"]\n", - ")\n", - "missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# update in barry table\n", - "barry.update(missing_co2[[\"co2_mass_lb\"]])\n", - "\n", - "# update the co2 mass measurement code\n", - "barry.loc[missing_co2.index, \"co2_mass_measurement_code\"] = \"Imputed\"\n", - "\n", - "# identify all observations that are still missing co2 data\n", - "missing_co2 = barry[barry[\"co2_mass_lb\"].isnull()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "missing_co2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# for rows that have a successful fuel code match, move to a temporary dataframe to hold the data\n", - "co2_to_fill = missing_co2.copy()[~missing_co2[\"energy_source_code\"].isna()]\n", - "fill_index = co2_to_fill.index\n", - "co2_to_fill" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# calculate emissions based on fuel type\n", - "co2_to_fill = emissions.calculate_ghg_emissions_from_fuel_consumption(\n", - " df=co2_to_fill,\n", - " year=year,\n", - " include_co2=True,\n", - " include_ch4=False,\n", - " include_n2o=False,\n", - ").set_index(fill_index)\n", - "\n", - "co2_to_fill" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# fill this data into the original cems data\n", - "barry.update(co2_to_fill[[\"co2_mass_lb\"]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "barry[[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\", \"co2_mass_lb\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_fill = cems_test.loc[cems_test[\"co2_mass_lb\"] > 0,[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\", \"co2_mass_lb\"]]\n", - "test_fill = test_fill.merge(barry[[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\", \"co2_mass_lb\"]], how=\"left\", on=[\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\"], validate=\"1:1\", suffixes=(\"_original\",\"_postfill\"))\n", - "test_fill[\"diff\"] = test_fill[\"co2_mass_lb_postfill\"] - test_fill[\"co2_mass_lb_original\"]\n", - "test_fill[test_fill[\"diff\"] != 0]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Why are non-missing values being removed?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "(\n", - " eia923_allocated,\n", - " primary_fuel_table,\n", - " subplant_emission_factors,\n", - ") = clean_eia923(year, False)\n", - "\n", - "# load the CEMS data\n", - "cems = load_data.load_cems_data(year)\n", - "\n", - "\n", - "# remove non-grid connected plants\n", - "cems = remove_plants(\n", - " cems,\n", - " non_grid_connected=True,\n", - " remove_states=[\"PR\"],\n", - " steam_only_plants=False,\n", - " distribution_connected_plants=False,\n", - ")\n", - "\n", - "# manually remove steam-only units\n", - "cems = manually_remove_steam_units(cems)\n", - "\n", - "# add a report date\n", - "cems = load_data.add_report_date(cems)\n", - "\n", - "# remove data for any unit-months where there are incomplete data reported\n", - "# this is generally when there is a single observation reported for an entire month\n", - "cems = remove_incomplete_unit_months(cems)\n", - "\n", - "# TODO: identify and remove any hourly values that appear to be outliers\n", - "# See: https://github.com/singularity-energy/open-grid-emissions/issues/50\n", - "\n", - "# add subplant id\n", - "subplant_crosswalk = (\n", - " pd.read_csv(\n", - " outputs_folder(f\"{year}/subplant_crosswalk_{year}.csv\"),\n", - " dtype=get_dtypes(),\n", - " )[[\"plant_id_eia\", \"emissions_unit_id_epa\", \"subplant_id\"]]\n", - " .drop_duplicates()\n", - " .dropna(subset=\"emissions_unit_id_epa\")\n", - ")\n", - "cems = cems.merge(\n", - " subplant_crosswalk,\n", - " how=\"left\",\n", - " on=[\"plant_id_eia\", \"emissions_unit_id_epa\"],\n", - " validate=\"m:1\",\n", - ")\n", - "validation.test_for_missing_subplant_id(cems)\n", - "\n", - "# add a fuel type to each observation\n", - "cems = assign_fuel_type_to_cems(cems, year, primary_fuel_table)\n", - "\n", - "# fill in missing hourly emissions data using the fuel type and heat input\n", - "validation.test_for_missing_energy_source_code(cems)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_test = cems.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems_test = emissions.fill_cems_missing_co2(cems_test, year, subplant_emission_factors)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "open_grid_emissions", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "25e36f192ecdbe5da57d9bea009812e7b11ef0e0053366a845a2802aae1b29d2" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From ac8175c665f737ed86c233e6793eb4f1aa635295 Mon Sep 17 00:00:00 2001 From: Milo Knowles Date: Thu, 23 Feb 2023 15:35:16 -0500 Subject: [PATCH 13/27] Make PUDL logging show up, and set up logging in OGE (#285) * Fix logging and show example * WIP * Use logger everywhere * Remove leading spaces * Print dataframes properly * Address comments * Remove empty logging --------- Co-authored-by: Greg Miller <45949268+grgmiller@users.noreply.github.com> --- .gitignore | 3 +- environment.yml | 1 + src/__init__.py | 5 + src/column_checks.py | 13 +- src/consumed.py | 27 ++-- src/data_cleaning.py | 36 +++--- src/data_pipeline.py | 63 
+++++----- src/download_data.py | 17 +-- src/eia930.py | 15 ++- src/emissions.py | 74 ++++++----- src/filepaths.py | 3 +- src/gross_to_net_generation.py | 15 ++- src/impute_hourly_profiles.py | 24 ++-- src/load_data.py | 23 ++-- src/logging_util.py | 49 ++++++++ src/output_data.py | 20 +-- src/validation.py | 217 +++++++++++++++------------------ src/visualization.py | 5 +- test/test_logging.py | 33 +++++ 19 files changed, 364 insertions(+), 279 deletions(-) create mode 100644 src/logging_util.py create mode 100644 test/test_logging.py diff --git a/.gitignore b/.gitignore index 2f35d2c3..093d108f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,13 +3,14 @@ data/* example/.ipynb_checkpoints/ test/__pycache__/ +test/*.txt src/__pycache__/ CHANGELOG.md notebooks/visualization/outputs/* -# Python +# Python notebooks/.ipynb_checkpoints notebooks/*/.ipynb_checkpoints .hypothesis/ diff --git a/environment.yml b/environment.yml index 505de70c..6b6ca3f8 100644 --- a/environment.yml +++ b/environment.yml @@ -27,6 +27,7 @@ dependencies: - sqlalchemy - sqlite # used for pudl - statsmodels + - coloredlogs # used for prettier logging - pip: # --editable ../pudl #NOTE: this is for development use diff --git a/src/__init__.py b/src/__init__.py index e69de29b..efd6d259 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -0,0 +1,5 @@ +# Set up the OGE logging configuration once. +import logging +from .logging_util import configure_root_logger +from .filepaths import outputs_folder +configure_root_logger(outputs_folder("logfile.txt"), logging.INFO) diff --git a/src/column_checks.py b/src/column_checks.py index 545c7bdf..5fb8fd03 100644 --- a/src/column_checks.py +++ b/src/column_checks.py @@ -17,6 +17,9 @@ After any change, re-run data_pipeline to regenerate all files and re-run these checks. """ +from logging_util import get_logger +logger = get_logger(__name__) + COLUMNS = { "eia923_allocated": { @@ -348,8 +351,8 @@ def check_columns(df, file_name): # Check for extra columns. Warning not exception extras = cols - expected_cols if len(extras) > 0: - print( - f"WARNING: columns {extras} in {file_name} are not guaranteed by column_checks.py" + logger.warning( + f"columns {extras} in {file_name} are not guaranteed by column_checks.py" ) # Raise exception for missing columns @@ -464,8 +467,8 @@ def apply_dtypes(df): if (col not in dtypes) and (col not in datetime_columns) ] if len(cols_missing_dtypes) > 0: - print( - "WARNING: The following columns do not have dtypes assigned in `column_checks.get_dtypes()`" + logger.warning( + "The following columns do not have dtypes assigned in `column_checks.get_dtypes()`" ) - print(cols_missing_dtypes) + logger.warning(cols_missing_dtypes) return df.astype({col: dtypes[col] for col in df.columns if col in dtypes}) diff --git a/src/consumed.py b/src/consumed.py index 7f164b89..65f0f037 100644 --- a/src/consumed.py +++ b/src/consumed.py @@ -6,6 +6,7 @@ from gridemissions.load import BaData from gridemissions.eia_api import KEYS, SRC from filepaths import outputs_folder, manual_folder, results_folder +from logging_util import get_logger from output_data import ( GENERATED_EMISSION_RATE_COLS, @@ -14,11 +15,13 @@ TIME_RESOLUTIONS, ) -""" For these BAs, there are significant and systematic differences -between our net_generation_mwh and EIA-930 net generation and interchange, -so we cannot combine our net generation and 930 interchange to get net_consumed. -Instead, we use 930 demand as net_consumed. Note: there may be issues with the 930 -demand! 
But it is better than combining inconsistent generation and interchange, +logger = get_logger(__name__) + +""" For these BAs, there are significant and systematic differences +between our net_generation_mwh and EIA-930 net generation and interchange, +so we cannot combine our net generation and 930 interchange to get net_consumed. +Instead, we use 930 demand as net_consumed. Note: there may be issues with the 930 +demand! But it is better than combining inconsistent generation and interchange, which results in unreasonable profiles with many negative hours. """ # Identify the BAs for which we need to use demand data for the consumed calculation @@ -118,8 +121,8 @@ def get_average_emission_factors(prefix: str, year: int): for fuel in SRC: column = get_rate_column(pol, adjustment, generated=True) if FUEL_TYPE_MAP[fuel] not in genavg.index: - print( - f"WARNING: fuel {FUEL_TYPE_MAP[fuel]} not found in file annual_generation_averages_by_fuel_{year}.csv, using average" + logger.warning( + f"fuel {FUEL_TYPE_MAP[fuel]} not found in file annual_generation_averages_by_fuel_{year}.csv, using average" ) efs[pol][adjustment][fuel] = genavg.loc["total", column] else: @@ -288,7 +291,7 @@ def output_results(self): if (ba in self.import_regions) or (ba in self.generation_regions): continue if ba in BA_930_INCONSISTENCY[self.year]: - print(f"Using D instead of (G-TI) for consumed calc in {ba}") + logger.warning(f"Using D instead of (G-TI) for consumed calc in {ba}") self.results[ba]["net_consumed_mwh"] = self.eia930.df[ KEYS["E"]["D"] % ba ][self.generation.index] @@ -325,8 +328,8 @@ def output_results(self): time_cols = ["datetime_utc", "datetime_local"] missing_hours = time_dat[time_dat.isna().any(axis=1)] if len(missing_hours) > 0: - print( - f"WARNING: {len(missing_hours)} hours are missing in {ba} consumed data" + logger.warning( + f"{len(missing_hours)} hours are missing in {ba} consumed data" ) elif time_resolution == "monthly": time_dat["month"] = time_dat.datetime_local.dt.month @@ -513,6 +516,6 @@ def run(self): for (i, r) in enumerate(self.regions): self.results[r].loc[date, col] = consumed_emissions[i] if total_failed > 0: - print( - f"Warning: {total_failed} hours failed to solve for consumed {pol} {adj} emissions." + logger.warning( + f"{total_failed} hours failed to solve for consumed {pol} {adj} emissions." ) diff --git a/src/data_cleaning.py b/src/data_cleaning.py index 30294715..f11330ac 100644 --- a/src/data_cleaning.py +++ b/src/data_cleaning.py @@ -13,7 +13,9 @@ from emissions import CLEAN_FUELS from column_checks import get_dtypes, apply_dtypes from filepaths import manual_folder, outputs_folder, downloads_folder +from logging_util import get_logger +logger = get_logger(__name__) DATA_COLUMNS = [ "net_generation_mwh", @@ -52,11 +54,11 @@ def identify_subplants(year, number_of_years=5): end_year = year # load 5 years of monthly data from CEMS - print(" loading CEMS ids") + logger.info(" loading CEMS ids") cems_ids = load_data.load_cems_ids(start_year, end_year) # add subplant ids to the data - print(" identifying unique subplants") + logger.info(" identifying unique subplants") generate_subplant_ids(start_year, end_year, cems_ids) @@ -543,14 +545,12 @@ def update_energy_source_codes(df): (df["energy_source_code"] == "OTH") & (df["fuel_consumed_mmbtu"] > 0) ] if len(plants_with_other_fuel) > 0: - print( - "WARNING: After cleaning energy source codes, some fuel consumption is still associated with an 'OTH' fuel type." 
+        logger.warning(f"""
+            After cleaning energy source codes, some fuel consumption is still associated with an 'OTH' fuel type.
+            This will lead to incorrect emissions calculations.
+            Check the following plants: {list(plants_with_other_fuel.plant_id_eia.unique())}
+            Assign a fuel type in `data_cleaning.update_energy_source_codes`"""
         )
-        print("This will lead to incorrect emissions calculations.")
-        print(
-            f"Check the following plants: {list(plants_with_other_fuel.plant_id_eia.unique())}"
-        )
-        print("Assign a fuel type in `data_cleaning.update_energy_source_codes`")
     return df
@@ -735,7 +735,7 @@ def calculate_aggregated_primary_fuel(
         plants_with_no_primary_fuel = agg_primary_fuel[
             agg_primary_fuel[f"{level}_primary_fuel"].isna()
         ]
-        print(
+        logger.warning(
             f"Check the following plants: {list(plants_with_no_primary_fuel.plant_id_eia.unique())}"
         )
         raise UserWarning(
@@ -882,7 +882,7 @@ def remove_plants(
             plant_states["state"].isin(remove_states)
         ].plant_id_eia.unique()
     )
-    print(
+    logger.info(
         f" Removing {len(plants_in_states_to_remove)} plants located in the following states: {remove_states}"
     )
     df = df[~df["plant_id_eia"].isin(plants_in_states_to_remove)]
@@ -918,7 +918,7 @@ def remove_non_grid_connected_plants(df):
             "plant_id_eia"
         ].unique()
     )
-    print(f" Removing {num_plants} plants that are not grid-connected")
+    logger.info(f" Removing {num_plants} plants that are not grid-connected")

     df = df[~df["plant_id_eia"].isin(ngc_plants)]
@@ -1005,7 +1005,7 @@ def clean_cems(year: int, small: bool, primary_fuel_table, subplant_emission_fac

 def smallerize_test_data(df, random_seed=None):
-    print(" Randomly selecting 5% of plants for faster test run.")
+    logger.info(" Randomly selecting 5% of plants for faster test run.")
     # Select 5% of plants
     selected_plants = df.plant_id_eia.unique()
     if random_seed is not None:
@@ -1030,7 +1030,7 @@ def manually_remove_steam_units(df):
         dtype=get_dtypes(),
     )[["plant_id_eia", "emissions_unit_id_epa"]]

-    print(
+    logger.info(
         f" Removing {len(units_to_remove)} units that only produce steam and do not report to EIA"
     )
@@ -1062,7 +1062,7 @@ def remove_incomplete_unit_months(cems):
         unit_hours_in_month["datetime_utc"] < 600
     ].drop(columns="datetime_utc")

-    print(
+    logger.info(
         f" Removing {len(unit_months_to_remove)} unit-months with incomplete hourly data"
     )
@@ -1295,7 +1295,7 @@ def remove_cems_with_zero_monthly_data(cems):
         validate="m:1",
     )
     # remove any observations with the missing data flag
-    print(
+    logger.info(
         f" Removing {len(cems[cems['missing_data_flag'] == 'remove'])} observations from cems for unit-months where no data reported"
     )
     validation.check_removed_data_is_empty(cems)
@@ -1960,8 +1960,8 @@ def assign_ba_code_to_plant(df, year):
     df = df.merge(plant_ba, how="left", on="plant_id_eia", validate="m:1")

     if len(df[df["ba_code"].isna()]) > 0:
-        print(" WARNING: the following plants are missing ba_code:")
-        print(df[df["ba_code"].isna()])
+        logger.warning(" the following plants are missing ba_code:")
+        logger.warning("\n" + df[df["ba_code"].isna()].to_string())

     # replace missing ba codes with NA
     df["ba_code"] = df["ba_code"].fillna("NA")

diff --git a/src/data_pipeline.py b/src/data_pipeline.py
index bd092eaa..d30cefa1 100644
--- a/src/data_pipeline.py
+++ b/src/data_pipeline.py
@@ -6,16 +6,11 @@
 Optional arguments are --year (default 2021), --shape_individual_plants (default True)
 Optional arguments for development are --small, --flat, and --skip_outputs
 """
-
-
-# import packages
 import argparse
 import os
 import shutil

 # import local modules
-# import
local modules -# # # Tell python where to look for modules. import download_data import data_cleaning import emissions @@ -26,11 +21,17 @@ import output_data import consumed from filepaths import downloads_folder, outputs_folder, results_folder +from logging_util import get_logger, configure_root_logger -def get_args(): - """ - Specify arguments here. +# Log the print statements to a file for debugging. +configure_root_logger(logfile=outputs_folder("data_pipeline.log")) +logger = get_logger("data_pipeline") + + +def get_args() -> argparse.Namespace: + """Specify arguments here. + Returns dictionary of {arg_name: arg_value} """ parser = argparse.ArgumentParser() @@ -63,8 +64,10 @@ def get_args(): def main(): + """Runs the OGE data pipeline.""" args = get_args() year = args.year + logger.info(f'Running data pipeline for year {year}') validation.validate_year(year) @@ -99,7 +102,7 @@ def main(): # 1. Download data #################################################################################### - print("1. Downloading data") + logger.info("1. Downloading data") # PUDL download_data.download_pudl_data( zenodo_url="https://zenodo.org/record/7472137/files/pudl-v2022.11.30.tgz" @@ -131,12 +134,12 @@ def main(): # 2. Identify subplants #################################################################################### - print("2. Identifying subplant IDs") + logger.info("2. Identifying subplant IDs") data_cleaning.identify_subplants(year) # 3. Clean EIA-923 Generation and Fuel Data at the Monthly Level #################################################################################### - print("3. Cleaning EIA-923 data") + logger.info("3. Cleaning EIA-923 data") ( eia923_allocated, primary_fuel_table, @@ -152,7 +155,7 @@ def main(): # 4. Clean Hourly Data from CEMS #################################################################################### - print("4. Cleaning CEMS data") + logger.info("4. Cleaning CEMS data") cems = data_cleaning.clean_cems( year, args.small, primary_fuel_table, subplant_emission_factors ) @@ -178,14 +181,14 @@ def main(): # 5. Assign static characteristics to CEMS and EIA data to aid in aggregation #################################################################################### - print("5. Loading plant static attributes") + logger.info("5. Loading plant static attributes") plant_attributes = data_cleaning.create_plant_attributes_table( cems, eia923_allocated, year, primary_fuel_table ) # 6. Crosswalk CEMS and EIA data #################################################################################### - print("6. Identifying source for hourly data") + logger.info("6. Identifying source for hourly data") eia923_allocated = data_cleaning.identify_hourly_data_source( eia923_allocated, cems, year ) @@ -207,13 +210,13 @@ def main(): # 7. Aggregating CEMS data to subplant #################################################################################### - print("7. Aggregating CEMS data from unit to subplant") + logger.info("7. Aggregating CEMS data from unit to subplant") # aggregate cems data to subplant level cems = data_cleaning.aggregate_cems_to_subplant(cems) # 8. Calculate hourly data for partial_cems plants #################################################################################### - print("8. Shaping partial CEMS data") + logger.info("8. Shaping partial CEMS data") # shape partial CEMS plant data partial_cems_plant = impute_hourly_profiles.shape_partial_cems_plants( cems, eia923_allocated @@ -251,7 +254,7 @@ def main(): # 9. 
Convert CEMS Hourly Gross Generation to Hourly Net Generation #################################################################################### - print("9. Converting CEMS gross generation to net generation") + logger.info("9. Converting CEMS gross generation to net generation") cems, gtn_conversions = gross_to_net_generation.convert_gross_to_net_generation( cems, eia923_allocated, plant_attributes, year ) @@ -273,7 +276,7 @@ def main(): # 10. Adjust CEMS emission data for CHP #################################################################################### - print("10. Adjusting CEMS emissions for CHP") + logger.info("10. Adjusting CEMS emissions for CHP") cems = data_cleaning.adjust_cems_for_chp(cems, eia923_allocated) cems = emissions.calculate_co2e_mass( cems, year, gwp_horizon=100, ar5_climate_carbon_feedback=True @@ -290,7 +293,7 @@ def main(): # 11. Export monthly and annual plant-level results #################################################################################### - print("11. Exporting monthly and annual plant-level results") + logger.info("11. Exporting monthly and annual plant-level results") # create a separate dataframe containing only the EIA data that is missing from cems monthly_eia_data_to_shape = eia923_allocated[ (eia923_allocated["hourly_data_source"] == "eia") @@ -327,14 +330,14 @@ def main(): # 12. Clean and Reconcile EIA-930 data #################################################################################### - print("12. Cleaning EIA-930 data") + logger.info("12. Cleaning EIA-930 data") # Scrapes and cleans data in data/downloads, outputs cleaned file at EBA_elec.csv if args.flat: - print(" Not running 930 cleaning because we'll be using a flat profile.") + logger.info(" Not running 930 cleaning because we'll be using a flat profile.") elif not (os.path.exists(outputs_folder(f"{path_prefix}/eia930/eia930_elec.csv"))): eia930.clean_930(year, small=args.small, path_prefix=path_prefix) else: - print( + logger.info( f" Not re-running 930 cleaning. If you'd like to re-run, please delete data/outputs/{path_prefix}/eia930/" ) @@ -351,7 +354,7 @@ def main(): # 13. Calculate hourly profiles for monthly EIA data #################################################################################### - print("13. Estimating hourly profiles for EIA data") + logger.info("13. Estimating hourly profiles for EIA data") hourly_profiles = impute_hourly_profiles.calculate_hourly_profiles( cems, partial_cems_subplant, @@ -384,7 +387,7 @@ def main(): # 14. Export hourly plant-level data #################################################################################### - print("14. Exporting Hourly Plant-level data for each BA") + logger.info("14. Exporting Hourly Plant-level data for each BA") if args.shape_individual_plants and not args.small: impute_hourly_profiles.combine_and_export_hourly_plant_data( cems, @@ -398,16 +401,16 @@ def main(): region_to_group="ba_code", ) else: - print( + logger.info( " Not shaping and exporting individual plant data since `shape_individual_plants` is False." ) - print( + logger.info( " Plants that only report to EIA will be aggregated to the fleet level before shaping." ) # 15. Shape fleet-level data #################################################################################### - print("15. Assigning hourly profiles to monthly EIA-923 data") + logger.info("15. 
Assigning hourly profiles to monthly EIA-923 data") hourly_profiles = impute_hourly_profiles.convert_profile_to_percent( hourly_profiles, group_keys=["ba_code", "fuel_category", "profile_method"], @@ -465,7 +468,7 @@ def main(): # 16. Combine plant-level data from all sources #################################################################################### - print("16. Combining plant-level hourly data") + logger.info("16. Combining plant-level hourly data") # write metadata outputs output_data.write_plant_metadata( plant_attributes, @@ -511,7 +514,7 @@ def main(): # 17. Aggregate CEMS data to BA-fuel and write power sector results #################################################################################### - print("17. Creating and exporting BA-level power sector results") + logger.info("17. Creating and exporting BA-level power sector results") ba_fuel_data = data_cleaning.aggregate_plant_data_to_ba_fuel( combined_plant_data, plant_attributes ) @@ -525,7 +528,7 @@ def main(): # 18. Calculate consumption-based emissions and write carbon accounting results #################################################################################### - print("18. Calculating and exporting consumption-based results") + logger.info("18. Calculating and exporting consumption-based results") hourly_consumed_calc = consumed.HourlyConsumed( clean_930_file, path_prefix, diff --git a/src/download_data.py b/src/download_data.py index d50eaf60..151564fb 100644 --- a/src/download_data.py +++ b/src/download_data.py @@ -7,6 +7,9 @@ import zipfile from filepaths import downloads_folder, data_folder +from logging_util import get_logger + +logger = get_logger(__name__) def download_helper( @@ -38,11 +41,11 @@ def download_helper( # If the file already exists, do not re-download it. final_destination = output_path if output_path is not None else download_path if os.path.exists(final_destination): - print(f" {final_destination.split('/')[-1]} already downloaded, skipping.") + logger.info(f" {final_destination.split('/')[-1]} already downloaded, skipping.") return False # Otherwise, download to the file in chunks. - print(f" Downloading {final_destination.split('/')[-1]}") + logger.info(f" Downloading {final_destination.split('/')[-1]}") r = requests.get(input_url, stream=True) with open(download_path, "wb") as fd: for chunk in r.iter_content(chunk_size=chunk_size): @@ -94,10 +97,10 @@ def download_pudl_data(zenodo_url: str): with open(pudl_version_file, "r") as f: existing_version = f.readlines()[0].replace("\n", "") if pudl_version == existing_version: - print(" PUDL version already downloaded") + logger.info(" PUDL version already downloaded") return else: - print(" Downloading new version of pudl") + logger.info(" Downloading new version of pudl") shutil.rmtree(downloads_folder("pudl")) download_pudl(zenodo_url, pudl_version) @@ -117,10 +120,10 @@ def download_pudl(zenodo_url, pudl_version): ) fd.write(chunk) downloaded += block_size - print(" Downloading PUDL. Progress: 100.0%") + logger.info(" Downloading PUDL. Progress: 100.0%") # extract the tgz file - print(" Extracting PUDL data...") + logger.info(" Extracting PUDL data...") with tarfile.open(downloads_folder("pudl.tgz")) as tar: tar.extractall(data_folder()) @@ -268,7 +271,7 @@ def download_raw_eia860(year): Downloads raw EIA-860 data (zip files), and unzips them to the downloads folder. 
""" if year < 2005: - raise NotImplementedError(f"WARNING: We haven't tested EIA-860 for '{year}'.") + raise NotImplementedError(f"We haven't tested EIA-860 for '{year}'.") os.makedirs(downloads_folder("eia860"), exist_ok=True) url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}.zip" archive_url = ( diff --git a/src/eia930.py b/src/eia930.py index d2a11f54..36aa7c1e 100644 --- a/src/eia930.py +++ b/src/eia930.py @@ -7,12 +7,15 @@ import load_data from column_checks import get_dtypes from filepaths import top_folder, downloads_folder, outputs_folder, manual_folder +from logging_util import get_logger # Tell gridemissions where to find config before we load gridemissions os.environ["GRIDEMISSIONS_CONFIG_FILE_PATH"] = top_folder("config/gridemissions.json") from gridemissions.workflows import make_dataset +logger = get_logger(__name__) + def convert_balance_file_to_gridemissions_format(year: int, small: bool = False): """Converts downloaded EIA-930 Balance files to gridemissions format.""" @@ -142,14 +145,14 @@ def clean_930(year: int, small: bool = False, path_prefix: str = ""): df = df.loc[start:end] # Don't worry about processing everything # Adjust - print(" Adjusting EIA-930 time stamps") + logger.info(" Adjusting EIA-930 time stamps") df = manual_930_adjust(df) df.to_csv( join(data_folder, "eia930_raw.csv") ) # Will be read by gridemissions workflow # Run cleaning - print(" Running physics-based data cleaning") + logger.info(" Running physics-based data cleaning") make_dataset( start, end, @@ -171,17 +174,17 @@ def reformat_chalendar(raw): """ # where we have variable (NG = net generation) and fuel type target_cols = [c for c in raw.columns if len(c.split(".")) == 5] - print("Filtering") + logger.info("Filtering") cleaned = ( raw.loc[:, target_cols] .melt(ignore_index=False, value_name="generation", var_name="variable") .reset_index() ) - print("Expanding cols") + logger.info("Expanding cols") cleaned[["dtype", "BA", "other BA", "var", "fuel", "interval"]] = cleaned[ "variable" ].str.split(r"[.-]", expand=True, regex=True) - print("Dropping and renaming") + logger.info("Dropping and renaming") cleaned = cleaned.drop(columns=["dtype", "var", "interval", "other BA"]) cleaned = cleaned.rename(columns={"index": "datetime_utc"}) return cleaned @@ -286,7 +289,7 @@ def remove_imputed_ones(eia930_data): filter = eia930_data["net_generation_mwh_930"].abs() < 1.5 # replace all 1.0 values with zero - print(f" replacing {sum(filter)} imputed 1 values with 0") + logger.info(f" replacing {sum(filter)} imputed 1 values with 0") eia930_data.loc[filter, "net_generation_mwh_930"] = 0 return eia930_data diff --git a/src/emissions.py b/src/emissions.py index 966b87d3..3460ad27 100644 --- a/src/emissions.py +++ b/src/emissions.py @@ -1,11 +1,11 @@ import pandas as pd import numpy as np - import load_data import validation from column_checks import get_dtypes from filepaths import manual_folder +from logging_util import get_logger from pudl.analysis.allocate_net_gen import ( distribute_annually_reported_data_to_months_if_annual, @@ -13,6 +13,8 @@ CLEAN_FUELS = ["SUN", "MWH", "WND", "WAT", "WH", "PUR", "NUC"] +logger = get_logger(__name__) + def calculate_ghg_emissions_from_fuel_consumption( df, year, include_co2=True, include_ch4=True, include_n2o=True @@ -477,9 +479,9 @@ def calculate_nox_from_fuel_consumption( & ~gen_fuel_allocated["energy_source_code"].isin(CLEAN_FUELS) ] if len(missing_ef) > 0: - print("WARNING: NOx emission factors are missing for the following records") - 
print("Missing factors for FC prime movers are currently expected") - print( + logger.warning("NOx emission factors are missing for the following records") + logger.warning("Missing factors for FC prime movers are currently expected") + logger.warning("\n" + missing_ef[ [ "report_date", @@ -488,7 +490,7 @@ def calculate_nox_from_fuel_consumption( "prime_mover_code", "generator_id", ] - ].drop_duplicates() + ].drop_duplicates().to_string() ) gen_fuel_allocated["nox_mass_lb"] = ( gen_fuel_allocated["fuel_consumed_mmbtu"] @@ -654,13 +656,11 @@ def calculate_generator_nox_ef_per_unit_from_boiler_type( ) ) if len(missing_nox_efs) > 0: - print(" ") - print( - "WARNING: NOx emission factors are missing for the following boiler types. A prime mover-fuel level factor will be used if available." + logger.warning( + "NOx emission factors are missing for the following boiler types. A prime mover-fuel level factor will be used if available." ) - print("Missing factors for FC prime movers are currently expected") - print(missing_nox_efs) - print(" ") + logger.warning("Missing factors for FC prime movers are currently expected") + logger.warning("\n" + missing_nox_efs.to_string()) gen_nox_factors = fill_missing_factors_based_on_pm_fuel( nox_emission_factors, gen_nox_factors ) @@ -687,13 +687,13 @@ def calculate_generator_nox_ef_per_unit_from_boiler_type( ) ) if len(missing_nox_efs) > 0: - print(" ") - print( - "WARNING: After filling with PM-fuel factors, NOx emission factors are still missing for the following boiler types. An emission factor of zero will be used for these boilers." + logger.warning(""" + After filling with PM-fuel factors, NOx emission factors are still missing for the following boiler types. + An emission factor of zero will be used for these boilers. 
+ Missing factors for FC prime movers are currently expected.""" ) - print("Missing factors for FC prime movers are currently expected") - print(missing_nox_efs) - print(" ") + logger.warning("\n" + missing_nox_efs.to_string()) + gen_nox_factors["emission_factor"] = gen_nox_factors["emission_factor"].fillna(0) # average the emission factors for all boilers associated with each generator @@ -848,8 +848,8 @@ def convert_ef_to_lb_per_mmbtu(gen_emission_factors, pudl_out, pollutant): & (gen_emission_factors["emission_factor_denominator"] != "mmbtu") ] if len(missing_fuel_content) > 0: - print( - f"WARNING: The heat content for the following fuels is missing and NOx emissions will not be calculated for these fuel:{list(missing_fuel_content.energy_source_code.unique())}" + logger.warning( + f"The heat content for the following fuels is missing and NOx emissions will not be calculated for these fuel:{list(missing_fuel_content.energy_source_code.unique())}" ) # convert emission factors from lb per unit to lb per mmbtu if the factor is not already in units of lb/mmbtu @@ -1212,9 +1212,9 @@ def calculate_so2_from_fuel_consumption(gen_fuel_allocated, pudl_out, year): & ~gen_fuel_allocated["energy_source_code"].isin(CLEAN_FUELS) ] if len(missing_ef) > 0: - print("WARNING: SO2 emission factors are missing for the above records") - print("Missing factors for FC prime movers are currently expected") - print( + logger.warning("SO2 emission factors are missing for the above records") + logger.warning("Missing factors for FC prime movers are currently expected") + logger.warning("\n" + missing_ef[ [ "report_date", @@ -1223,7 +1223,7 @@ def calculate_so2_from_fuel_consumption(gen_fuel_allocated, pudl_out, year): "prime_mover_code", "generator_id", ] - ].drop_duplicates() + ].drop_duplicates().to_string() ) gen_fuel_allocated["so2_mass_lb"] = ( gen_fuel_allocated["fuel_consumed_mmbtu"] @@ -1375,13 +1375,11 @@ def calculate_generator_so2_ef_per_unit_from_boiler_type( ) ) if len(missing_so2_efs) > 0: - print(" ") - print( - "WARNING: SO2 emission factors are missing for the following boiler types. A prime mover-fuel level factor will be used if available." + logger.warning( + "SO2 emission factors are missing for the following boiler types. A prime mover-fuel level factor will be used if available." ) - print("Missing factors for FC prime movers are currently expected") - print(missing_so2_efs) - print(" ") + logger.warning("Missing factors for FC prime movers are currently expected") + logger.warning("\n" + missing_so2_efs.to_string()) gen_so2_factors = fill_missing_factors_based_on_pm_fuel( so2_emission_factors, gen_so2_factors ) @@ -1406,13 +1404,11 @@ def calculate_generator_so2_ef_per_unit_from_boiler_type( ) ) if len(missing_so2_efs) > 0: - print(" ") - print( - "WARNING: SO2 emission factors are missing for the following boiler types. An emission factor of zero will be used for these boilers." + logger.warning( + "SO2 emission factors are missing for the following boiler types. An emission factor of zero will be used for these boilers." 
) - print("Missing factors for FC prime movers are currently expected") - print(missing_so2_efs) - print(" ") + logger.warning("Missing factors for FC prime movers are currently expected") + logger.warning("\n" + missing_so2_efs.to_string()) gen_so2_factors["emission_factor"] = gen_so2_factors["emission_factor"].fillna(0) gen_so2_factors["multiply_by_sulfur_content"] = gen_so2_factors[ "multiply_by_sulfur_content" @@ -1564,8 +1560,8 @@ def adjust_so2_efs_for_fuel_sulfur_content(uncontrolled_so2_factors, pudl_out): & (uncontrolled_so2_factors["multiply_by_sulfur_content"] == 1) ] if len(missing_sulfur_content) > 0: - print("WARNING: Sulfur content data is missing in EIA-923 for the above units.") - print( + logger.warning("Sulfur content data is missing in EIA-923 for the above units.") + logger.warning("\n" + missing_sulfur_content[ [ "plant_id_eia", @@ -1573,7 +1569,7 @@ def adjust_so2_efs_for_fuel_sulfur_content(uncontrolled_so2_factors, pudl_out): "prime_mover_code", "energy_source_code", ] - ].drop_duplicates() + ].drop_duplicates().to_string() ) uncontrolled_so2_factors.loc[ uncontrolled_so2_factors["sulfur_content_pct"].isna() @@ -1637,7 +1633,7 @@ def load_so2_control_efficiencies(year): ] if len(bad_efficiencies) > 0: raise UserWarning( - "WARNING: certain loaded SO2 removal efficiencies are either negative or > 100%" + "certain loaded SO2 removal efficiencies are either negative or > 100%" ) return so2_efficiency diff --git a/src/filepaths.py b/src/filepaths.py index 066203ae..ec143b07 100644 --- a/src/filepaths.py +++ b/src/filepaths.py @@ -1,7 +1,6 @@ +"""Convenience functions for paths.""" import os -# Convenience functions for paths. - def top_folder(rel=""): """ diff --git a/src/gross_to_net_generation.py b/src/gross_to_net_generation.py index 4cb41781..0d6ed96c 100644 --- a/src/gross_to_net_generation.py +++ b/src/gross_to_net_generation.py @@ -15,6 +15,9 @@ import validation from column_checks import get_dtypes from filepaths import outputs_folder +from logging_util import get_logger + +logger = get_logger(__name__) def convert_gross_to_net_generation(cems, eia923_allocated, plant_attributes, year): @@ -89,10 +92,10 @@ def convert_gross_to_net_generation(cems, eia923_allocated, plant_attributes, ye & (cems["default_gtn_ratio"].isna()) ] if len(missing_defaults) > 0: - print( - "WARNING: The following subplants are missing default GTN ratios. Using a default value of 0.97" + logger.warning( + "The following subplants are missing default GTN ratios. 
Using a default value of 0.97" ) - print(missing_defaults[["plant_id_eia", "subplant_id"]].drop_duplicates()) + logger.warning("\n" + missing_defaults[["plant_id_eia", "subplant_id"]].drop_duplicates().to_string()) # if there is a missing default gtn ratio, fill with 0.97 cems["default_gtn_ratio"] = cems["default_gtn_ratio"].fillna(0.97) cems["net_generation_mwh"] = cems["net_generation_mwh"].fillna( @@ -721,12 +724,12 @@ def calculate_multiyear_gtn_factors(year, number_of_years): ) # add subplant ids to the data - print("Creating subplant IDs") + logger.info("Creating subplant IDs") cems_monthly, gen_fuel_allocated = data_cleaning.generate_subplant_ids( start_year, end_year, cems_monthly, gen_fuel_allocated ) - print("Calculating Gross to Net regressions and ratios") + logger.info("Calculating Gross to Net regressions and ratios") # perform regression at subplant level gross_to_net_regression( gross_gen_data=cems_monthly, @@ -772,7 +775,7 @@ def load_monthly_gross_and_net_generation(start_year, end_year): ) # allocate net generation and heat input to each generator-fuel grouping - print(" Allocating EIA-923 generation data") + logger.info(" Allocating EIA-923 generation data") gen_fuel_allocated = allocate_gen_fuel.allocate_gen_fuel_by_generator_energy_source( pudl_out, drop_interim_cols=True ) diff --git a/src/impute_hourly_profiles.py b/src/impute_hourly_profiles.py index 5d2cb9b7..2d9bb64c 100644 --- a/src/impute_hourly_profiles.py +++ b/src/impute_hourly_profiles.py @@ -7,6 +7,10 @@ from filepaths import manual_folder import validation import output_data +from logging_util import get_logger + +logger = get_logger(__name__) + # specify the ba numbers with leading zeros FUEL_NUMBERS = { @@ -112,7 +116,7 @@ def calculate_hourly_profiles( hourly_profiles["profile"] = hourly_profiles["flat_profile"] hourly_profiles["profile_method"] = "flat_profile" - print( + logger.info( "Summary of methods used to estimate missing hourly profiles (count of ba-months):" ) summary_table = ( @@ -144,7 +148,7 @@ def calculate_hourly_profiles( :, profile_methods, ] - print(summary_table) + logger.info("\n" + summary_table.to_string()) return hourly_profiles @@ -290,10 +294,10 @@ def aggregate_for_residual( (cems["fuel_category_eia930"].isna()) & (cems["net_generation_mwh"] != 0) ] if len(missing_fuel_category) > 0: - print( - "WARNING: The following cems subplants are missing fuel categories and will lead to incorrect residual calculations:" + logger.warning( + "The following cems subplants are missing fuel categories and will lead to incorrect residual calculations:" ) - print(missing_fuel_category[["plant_id_eia", "subplant_id"]].drop_duplicates()) + logger.warning("\n" + missing_fuel_category[["plant_id_eia", "subplant_id"]].drop_duplicates().to_string()) raise UserWarning( "The missing fuel categories must be fixed before proceeding." 
) @@ -706,7 +710,7 @@ def average_diba_wind_solar_profiles( ] if len(df_temporary) == 0 and not validation_run: # if this error is raised, we might have to implement an approach that uses average values for the wider region - print(f" There is no {fuel} data in the DIBAs for {ba}: {ba_dibas}") + logger.warning(f" There is no {fuel} data in the DIBAs for {ba}: {ba_dibas}") df_temporary = average_national_wind_solar_profiles( residual_profiles, ba, fuel, report_date ) @@ -1318,8 +1322,8 @@ def shape_partial_cems_plants(cems, eia923_allocated): | shaped_partial_plants["fuel_profile"].isna() ] if len(missing_profiles) > 0: - print( - "WARNING: Certain partial CEMS plants are missing hourly profile data. This will result in inaccurate results" + logger.warning( + "Certain partial CEMS plants are missing hourly profile data. This will result in inaccurate results" ) # check that all profiles add to 1 for each month incorrect_profiles = ( @@ -1334,8 +1338,8 @@ def shape_partial_cems_plants(cems, eia923_allocated): | (~np.isclose(incorrect_profiles["fuel_profile"], 1)) ] if len(incorrect_profiles) > 0: - print( - "WARNING: Certain partial CEMS profiles do not add to 100%. This will result in inaccurate results" + logger.warning( + "Certain partial CEMS profiles do not add to 100%. This will result in inaccurate results" ) # shape the profiles diff --git a/src/load_data.py b/src/load_data.py index c5b19a88..798c480b 100644 --- a/src/load_data.py +++ b/src/load_data.py @@ -9,6 +9,9 @@ from column_checks import get_dtypes from filepaths import downloads_folder, manual_folder, outputs_folder +from logging_util import get_logger + +logger = get_logger(__name__) def correct_epa_eia_plant_id_mapping(df): @@ -153,7 +156,7 @@ def load_cems_gross_generation(start_year, end_year): cems_all = [] for year in range(start_year, end_year + 1): - print(f" loading {year} CEMS data") + logger.info(f" loading {year} CEMS data") # specify the path to the CEMS data cems_path = downloads_folder( "pudl/pudl_data/parquet/epacems/hourly_emissions_epacems/" @@ -774,10 +777,10 @@ def load_emissions_controls_eia923(year: int): parse_dates=["report_date", "pm_test_date", "so2_test_date"], ) else: - print( - "WARNING: Emissions control data prior to 2014 has not been integrated into the data pipeline." + logger.warning( + "Emissions control data prior to 2014 has not been integrated into the data pipeline." ) - print( + logger.warning( "This may overestimate SO2 and NOx emissions calculated from EIA-923 data." ) emissions_controls_eia923 = pd.DataFrame( @@ -826,10 +829,10 @@ def load_boiler_control_id_association_eia860(year, pollutant): ) # return a blank dataframe if the data is not available else: - print( - "WARNING: Environmental association data prior to 2013 have not been integrated into the data pipeline." + logger.warning( + "Environmental association data prior to 2013 have not been integrated into the data pipeline." ) - print("This may result in less accurate pollutant emissions calculations.") + logger.warning("This may result in less accurate pollutant emissions calculations.") boiler_control_id_association_eia860 = pd.DataFrame( columns=boiler_association_eia860_names ) @@ -875,10 +878,10 @@ def load_boiler_design_parameters_eia860(year): ) # return a blank dataframe if the data is not available else: - print( - "WARNING: Boiler Design data prior to 2013 have not been integrated into the data pipeline." + logger.warning( + "Boiler Design data prior to 2013 have not been integrated into the data pipeline." 
)
-        print("This may result in less accurate NOx and SO2 emissions calculations.")
+        logger.warning("This may result in less accurate NOx and SO2 emissions calculations.")
         boiler_design_parameters_eia860 = pd.DataFrame(
             columns=list(boiler_design_parameters_eia860_names.values())
         )
diff --git a/src/logging_util.py b/src/logging_util.py
new file mode 100644
index 00000000..79bbaf12
--- /dev/null
+++ b/src/logging_util.py
@@ -0,0 +1,54 @@
+"""Configure logging for the OGE codebase."""
+import logging
+import coloredlogs
+
+
+def get_logger(name: str) -> logging.Logger:
+    """Helper function to prepend `oge` to the logger name and return a logger.
+
+    As a result, all returned loggers are children of the top-level `oge` logger.
+    """
+    return logging.getLogger(f"oge.{name}")
+
+
+def configure_root_logger(logfile: str | None = None, level: str = "INFO"):
+    """Configure the OGE logger to print to the console, and optionally to a file.
+
+    This function is safe to call multiple times, since it will check if logging
+    handlers have already been installed and skip them if so.
+
+    Logging is printed with the same format as PUDL:
+    ```
+    2023-02-21 16:10:44 [INFO] oge.test:21 This is an example
+    ```
+    """
+    root_logger = logging.getLogger()
+
+    # Unfortunately, the `gridemissions` package adds a handler to the root logger
+    # which means that the output of other loggers propagates up and is printed
+    # twice. Remove the root handlers to avoid this.
+    for handler in root_logger.handlers[:]:  # iterate over a copy, since we mutate the list
+        root_logger.removeHandler(handler)
+
+    oge_logger = logging.getLogger("oge")
+    log_format = "%(asctime)s [%(levelname)4s] %(name)s:%(lineno)s %(message)s"
+
+    # Direct the output of the OGE logger to the terminal (and color it). Make
+    # sure this hasn't been done already to avoid adding duplicate handlers.
+    if len(oge_logger.handlers) == 0:
+        coloredlogs.install(fmt=log_format, level=level, logger=oge_logger)
+        oge_logger.addHandler(logging.NullHandler())
+
+    # Send everything to the log file by adding a file handler to the root logger.
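+    # Note: the console handler is attached to the `oge` logger (so only OGE output
+    # is colored), while the file handler below goes on the root logger so that
+    # records from PUDL and other third-party loggers are captured in the file too.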
+    if logfile is not None:
+        file_logger = logging.FileHandler(logfile, mode='w')
+        file_logger.setFormatter(logging.Formatter(log_format))
+
+        # a freshly constructed handler is never `in` root_logger.handlers, so look
+        # for an existing FileHandler to keep repeated calls from adding duplicates
+        if not any(isinstance(h, logging.FileHandler) for h in root_logger.handlers):
+            root_logger.addHandler(file_logger)
+
diff --git a/src/output_data.py b/src/output_data.py
index 2319ad5e..23eb9824 100644
--- a/src/output_data.py
+++ b/src/output_data.py
@@ -7,6 +7,10 @@ import column_checks
 import validation
 from filepaths import outputs_folder, results_folder, data_folder
+from logging_util import get_logger
+
+logger = get_logger(__name__)
+
 
 GENERATED_EMISSION_RATE_COLS = [
     "generated_co2_rate_lb_per_mwh_for_electricity",
@@ -71,7 +75,7 @@ def zip_results_for_s3(year):
                         # skip the metric hourly plant data since we do not create those outputs
                         pass
                     else:
-                        print(f"zipping {year}_{data_type}_{aggregation}_{unit} for s3")
+                        logger.info(f"zipping {year}_{data_type}_{aggregation}_{unit} for s3")
                         folder = (
                             f"{results_folder()}/{year}/{data_type}/{aggregation}/{unit}"
                         )
@@ -101,7 +105,7 @@ def zip_data_for_zenodo(year):
     """
     os.makedirs(data_folder("zenodo"), exist_ok=True)
     for directory in ["outputs", "results"]:
-        print(f"zipping {directory}_{year} for zenodo")
+        logger.info(f"zipping {directory}_{year} for zenodo")
         shutil.make_archive(
             data_folder(f"zenodo/{directory}_{year}"),
             "zip",
@@ -113,7 +117,7 @@ def zip_data_for_zenodo(year):
 def output_intermediate_data(df, file_name, path_prefix, year, skip_outputs):
     column_checks.check_columns(df, file_name)
     if not skip_outputs:
-        print(f"   Exporting {file_name} to data/outputs")
+        logger.info(f"   Exporting {file_name} to data/outputs")
         df.to_csv(outputs_folder(f"{path_prefix}{file_name}_{year}.csv"), index=False)
 
 
@@ -122,7 +126,9 @@ def output_to_results(
 ):
     # Always check columns that should not be negative.
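+    # a "small" test run writes to data/results/small/, so it is detected from the
+    # output path prefix and used to soften the negative-value checks below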
small = "small" in path_prefix - print(f" Exporting {file_name} to data/results/{path_prefix}{subfolder}") + logger.info(f" Exporting {file_name} to data/results/{path_prefix}{subfolder}") if include_metric: metric = convert_results(df) @@ -149,7 +153,7 @@ def output_to_results( def output_data_quality_metrics(df, file_name, path_prefix, skip_outputs): if not skip_outputs: - print( + logger.info( f" Exporting {file_name} to data/results/{path_prefix}data_quality_metrics" ) @@ -412,7 +416,7 @@ def round_table(table): decimals[c] = abs(math.floor(math.log10(val))) + 2 # Always 3 sigfigs (for median) except ValueError: - print(val) + logger.error(val) raise Exception return table.round(decimals) @@ -455,8 +459,8 @@ def write_power_sector_results(ba_fuel_data, path_prefix, skip_outputs): if not skip_outputs: for ba in list(ba_fuel_data.ba_code.unique()): if type(ba) is not str: - print( - f"WARNING: not aggregating {sum(ba_fuel_data.ba_code.isna())} plants with numeric BA {ba}" + logger.warning( + f"not aggregating {sum(ba_fuel_data.ba_code.isna())} plants with numeric BA {ba}" ) continue diff --git a/src/validation.py b/src/validation.py index cb4574d7..83af09be 100644 --- a/src/validation.py +++ b/src/validation.py @@ -6,6 +6,9 @@ from emissions import CLEAN_FUELS from column_checks import get_dtypes from filepaths import downloads_folder, manual_folder +from logging_util import get_logger + +logger = get_logger(__name__) # DATA PIPELINE VALIDATION FUNCTIONS @@ -21,17 +24,17 @@ def validate_year(year): if year < earliest_validated_year: year_warning = f""" ################################################################################ - WARNING: The data pipeline has only been validated to work for years {earliest_validated_year}-{latest_validated_year}. + The data pipeline has only been validated to work for years {earliest_validated_year}-{latest_validated_year}. Running the pipeline for {year} may cause it to fail or may lead to poor-quality or anomalous results. To check on the progress of validating additional years of data, see: https://github.com/singularity-energy/open-grid-emissions/issues/117 ################################################################################ """ - print(year_warning) + logger.warning(year_warning) elif year > latest_validated_year: year_warning = f""" ################################################################################ - WARNING: The most recent available year of input data is currently {latest_validated_year}. + The most recent available year of input data is currently {latest_validated_year}. Input data for {year} should be available from the EIA in Fall {year+1} and we will work to validate that the pipeline works with {year} data as soon as possible after the data is released. 
@@ -72,20 +75,19 @@ def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated): | (~np.isclose(plant_total_diff["net_generation_mwh"], 0)) ] if len(mismatched_allocation) > 0: - print( - "WARNING: Allocated EIA-923 data doesn't match input data for the following plants:" - ) - print("Percentage Difference:") - print(mismatched_allocation) - print("EIA-923 Input Totals:") - print(plant_total_gf.loc[mismatched_allocation.index, :]) - print("Allocated Totals:") - print(plant_total_alloc.loc[mismatched_allocation.index, :]) + logger.warning("Allocated EIA-923 doesn't match input data for plants:") + logger.warning("Percentage Difference:") + logger.warning("\n" + mismatched_allocation.to_string()) + logger.warning("EIA-923 Input Totals:") + logger.warning("\n" + plant_total_gf.loc[mismatched_allocation.index, :].to_string()) + logger.warning("Allocated Totals:") + logger.warning("\n" + plant_total_alloc.loc[mismatched_allocation.index, :].to_string()) + def test_for_negative_values(df, small: bool = False): """Checks that there are no unexpected negative values in the data.""" - print(" Checking that fuel and emissions values are positive... ", end="") + logger.info("Checking that fuel and emissions values are positive... ") columns_that_should_be_positive = [ "fuel_consumed_mmbtu", "fuel_consumed_for_electricity_mmbtu", @@ -145,29 +147,26 @@ def test_for_negative_values(df, small: bool = False): for column in columns_to_test: negative_test = df[df[column] < 0] if not negative_test.empty: - print(" ") - print( - f"WARNING: There are {len(negative_test)} records where {column} is negative." + logger.warning( + f"There are {len(negative_test)} records where {column} is negative." ) negative_warnings += 1 if negative_warnings > 0: if small: - print( + logger.warning( " Found negative values during small run, these may be fixed with full data" ) else: - print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") - print("WARNING: The above negative values are errors and must be fixed") - print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + logger.warning("The above negative values are errors and must be fixed!") # raise UserWarning("The above negative values are errors and must be fixed") else: - print("OK") + logger.info("OK") return negative_test def test_for_missing_values(df, small: bool = False): """Checks that there are no unexpected missing values in the output data.""" - print(" Checking that no values are missing... ", end="") + logger.info("Checking that no values are missing... ") columns_that_should_be_complete = [ "plant_id_eia", "fuel_category", @@ -221,60 +220,53 @@ def test_for_missing_values(df, small: bool = False): for column in columns_to_test: missing_test = df[df[column].isna()] if not missing_test.empty: - print(" ") - print( - f"WARNING: There are {len(missing_test)} records where {column} is missing." + logger.warning( + f"There are {len(missing_test)} records where {column} is missing." 
) missing_warnings += 1 if missing_warnings > 0: if small: - print( + logger.warning( " Found missing values during small run, these may be fixed with full data" ) else: - print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") - print("WARNING: The above missing values are errors and must be fixed") - print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + logger.warning("The above missing values are errors and must be fixed") + logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") else: - print("OK") + logger.info("OK") return missing_test def test_chp_allocation(df): """Checks that the CHP allocation didn't create any anomalous values.""" - print( - " Checking that total fuel consumed >= fuel consumed for electricity... ", - end="", - ) + logger.info("Checking that total fuel consumed >= fuel consumed for electricity... ") chp_allocation_test = df[ df["fuel_consumed_for_electricity_mmbtu"] > df["fuel_consumed_mmbtu"] ] if not chp_allocation_test.empty: raise UserWarning( - f"WARNING: There are {len(chp_allocation_test)} records where fuel consumed for electricity is greater than total fuel consumption. Check `chp_allocation_test` for complete list" + f"There are {len(chp_allocation_test)} records where fuel consumed for electricity is greater than total fuel consumption. Check `chp_allocation_test` for complete list" ) else: - print("OK") + logger.info("OK") return chp_allocation_test def test_for_missing_energy_source_code(df): """Checks that there are no missing energy source codes associated with non-zero fuel consumption.""" - print( - " Checking that there are no missing energy source codes associated with non-zero fuel consumption... ", - end="", - ) + logger.info( + "Checking that there are no missing energy source codes associated with non-zero fuel consumption... ") missing_esc_test = df[ (df["energy_source_code"].isna()) & (df["fuel_consumed_mmbtu"] > 0) ] if not missing_esc_test.empty: - print(" ") - print( - f"WARNING: There are {len(missing_esc_test)} records where there is a missing energy source code associated with non-zero fuel consumption. Check `missing_esc_test` for complete list" + logger.warning( + f"There are {len(missing_esc_test)} records where there is a missing energy source code associated with non-zero fuel consumption. Check `missing_esc_test` for complete list" ) else: - print("OK") + logger.info("OK") return missing_esc_test @@ -330,24 +322,20 @@ def check_removed_data_is_empty(cems): def test_for_missing_subplant_id(df): """Checks if any records are missing a `subplant_id`.""" - print(" Checking that all data has an associated `subplant_id`... ", end="") + logger.info("Checking that all data has an associated `subplant_id`... ") missing_subplant_test = df[df["subplant_id"].isna()] if not missing_subplant_test.empty: - print(" ") - print( - f"WARNING: There are {len(missing_subplant_test)} records for {len(missing_subplant_test[['plant_id_eia']].drop_duplicates())} plants without a subplant ID. See `missing_subplant_test` for details" + logger.warning( + f"There are {len(missing_subplant_test)} records for {len(missing_subplant_test[['plant_id_eia']].drop_duplicates())} plants without a subplant ID. 
See `missing_subplant_test` for details" ) else: - print("OK") + logger.info("OK") return missing_subplant_test def validate_gross_to_net_conversion(cems, eia923_allocated): """checks whether the calculated net generation matches the reported net generation from EIA-923 at the annual plant level.""" - print( - " Checking that calculated net generation matches reported net generation in EIA-923... ", - end="", - ) + logger.info("Checking that calculated net generation matches reported net generation in EIA-923... ") # merge together monthly subplant totals from EIA and calculated from CEMS eia_netgen = ( eia923_allocated.groupby( @@ -389,22 +377,18 @@ def validate_gross_to_net_conversion(cems, eia923_allocated): cems_net_not_equal_to_eia = validated_ng[validated_ng["pct_error"] != 0] if len(cems_net_not_equal_to_eia) > 0: - print(" ") - print( - f"WARNING: There are {len(cems_net_not_equal_to_eia)} plants where calculated annual net generation does not match EIA annual net generation." + logger.warning( + f"There are {len(cems_net_not_equal_to_eia)} plants where calculated annual net generation does not match EIA annual net generation." ) - print(cems_net_not_equal_to_eia) + logger.warning("\n" + cems_net_not_equal_to_eia.to_string()) else: - print("OK") + logger.info("OK") def test_emissions_adjustments(df): """For each emission, tests that mass_lb >= mass_lb_for_electricity >= mass_lb_for_electricity_adjusted.""" - print( - " Checking that adjusted emission values are less than total emissions... ", - end="", - ) + logger.info("Checking that adjusted emission values are less than total emissions... ") pollutants = ["co2", "ch4", "n2o", "co2e", "nox", "so2"] @@ -416,8 +400,8 @@ def test_emissions_adjustments(df): (df[f"{pollutant}_mass_lb"] < df[f"{pollutant}_mass_lb_for_electricity"]) ] if len(bad_adjustment) > 0: - print( - f"WARNING: There are {len(bad_adjustment)} records where {pollutant}_mass_lb_for_electricity > {pollutant}_mass_lb" + logger.warning( + f"There are {len(bad_adjustment)} records where {pollutant}_mass_lb_for_electricity > {pollutant}_mass_lb" ) bad_adjustment += 1 @@ -426,8 +410,8 @@ def test_emissions_adjustments(df): (df[f"{pollutant}_mass_lb"] < df[f"{pollutant}_mass_lb_adjusted"]) ] if len(bad_adjustment) > 0: - print( - f"WARNING: There are {len(bad_adjustment)} records where {pollutant}_mass_lb_adjusted > {pollutant}_mass_lb" + logger.warning( + f"There are {len(bad_adjustment)} records where {pollutant}_mass_lb_adjusted > {pollutant}_mass_lb" ) bad_adjustment += 1 @@ -439,9 +423,8 @@ def test_emissions_adjustments(df): ) ] if len(bad_adjustment) > 0: - print(" ") - print( - f"WARNING: There are {len(bad_adjustment)} records where {pollutant}_mass_lb_for_electricity_adjusted > {pollutant}_mass_lb_for_electricity" + logger.warning( + f"There are {len(bad_adjustment)} records where {pollutant}_mass_lb_for_electricity_adjusted > {pollutant}_mass_lb_for_electricity" ) bad_adjustment += 1 @@ -449,7 +432,7 @@ def test_emissions_adjustments(df): if bad_adjustments > 0: raise UserWarning("The above issues with emissions adjustments must be fixed.") else: - print("OK") + logger.info("OK") def ensure_non_overlapping_data_from_all_sources( @@ -457,7 +440,7 @@ def ensure_non_overlapping_data_from_all_sources( ): """Ensures that there is no duplicated subplant-months from each of the four sources of cleaned data.""" - print(" Checking that all data to be combined is unique... ", end="") + logger.info("Checking that all data to be combined is unique... 
") if "hourly_data_source" in eia_data.columns: eia_only_data = eia_data.loc[ @@ -520,69 +503,62 @@ def ensure_non_overlapping_data_from_all_sources( (data_overlap["in_eia"] == 1) & (data_overlap["in_cems"] == 1) ] if len(eia_cems_overlap) > 0: - print(" ") - print( - f"WARNING: There are {len(eia_cems_overlap)} subplant-months that exist in both shaped EIA data and CEMS" + logger.warning( + f"There are {len(eia_cems_overlap)} subplant-months that exist in both shaped EIA data and CEMS" ) eia_pcs_overlap = data_overlap[ (data_overlap["in_eia"] == 1) & (data_overlap["in_partial_cems_subplant"] == 1) ] if len(eia_pcs_overlap) > 0: - print(" ") - print( - f"WARNING: There are {len(eia_pcs_overlap)} subplant-months that exist in both shaped EIA data and partial CEMS data" + logger.warning( + f"There are {len(eia_pcs_overlap)} subplant-months that exist in both shaped EIA data and partial CEMS data" ) cems_pcs_overlap = data_overlap[ (data_overlap["in_cems"] == 1) & (data_overlap["in_partial_cems_subplant"] == 1) ] if len(cems_pcs_overlap) > 0: - print(" ") - print( - f"WARNING: There are {len(cems_pcs_overlap)} subplant-months that exist in both CEMS data and partial CEMS data" + logger.warning( + f"There are {len(cems_pcs_overlap)} subplant-months that exist in both CEMS data and partial CEMS data" ) eia_pcp_overlap = data_overlap[ (data_overlap["in_eia"] == 1) & (data_overlap["in_partial_cems_plant"] == 1) ] if len(eia_pcp_overlap) > 0: - print(" ") - print( - f"WARNING: There are {len(eia_pcp_overlap)} subplant-months that exist in both shaped EIA data and partial CEMS data" + logger.warning( + f"There are {len(eia_pcp_overlap)} subplant-months that exist in both shaped EIA data and partial CEMS data" ) cems_pcp_overlap = data_overlap[ (data_overlap["in_cems"] == 1) & (data_overlap["in_partial_cems_plant"] == 1) ] if len(cems_pcp_overlap) > 0: - print(" ") - print( - f"WARNING: There are {len(cems_pcp_overlap)} subplant-months that exist in both CEMS data and partial CEMS data" + logger.warning( + f"There are {len(cems_pcp_overlap)} subplant-months that exist in both CEMS data and partial CEMS data" ) pcs_pcp_overlap = data_overlap[ (data_overlap["in_partial_cems_subplant"] == 1) & (data_overlap["in_partial_cems_plant"] == 1) ] if len(pcs_pcp_overlap) > 0: - print(" ") - print( - f"WARNING: There are {len(pcs_pcp_overlap)} subplant-months that exist in both CEMS data and partial CEMS data" + logger.warning( + f"There are {len(pcs_pcp_overlap)} subplant-months that exist in both CEMS data and partial CEMS data" ) all_overlap = data_overlap[data_overlap["number_of_locations"] == 4] if len(all_overlap) > 0: - print(" ") - print( - f"WARNING: There are {len(all_overlap)} subplant-months that exist in shaped EIA data, CEMS data, and partial CEMS data." + logger.warning( + f"There are {len(all_overlap)} subplant-months that exist in shaped EIA data, CEMS data, and partial CEMS data." ) raise UserWarning("The above overlaps must be fixed before proceeding.") else: - print("OK") + logger.info("OK") def validate_shaped_totals(shaped_eia_data, monthly_eia_data_to_shape, group_keys): """Checks that any shaped monthly data still adds up to the monthly total after shaping.""" - print(" Checking that shaped hourly data matches monthly totals... ", end="") + logger.info("Checking that shaped hourly data matches monthly totals... 
") monthly_group_keys = group_keys + ["report_date"] @@ -598,18 +574,17 @@ def validate_shaped_totals(shaped_eia_data, monthly_eia_data_to_shape, group_key compare = (shaped_data_agg - eia_data_agg).round(0) if compare.sum().sum() > 0: - print(" ") - print( + logger.warning("\n" + compare[ (compare["net_generation_mwh"] != 0) | (compare["fuel_consumed_mmbtu"] != 0) - ] + ].to_string() ) raise UserWarning( "The data shaping process is changing the monthly total values compared to reported EIA values. This process should only shape the data, not alter it." ) else: - print("OK") + logger.info("OK") def validate_unique_datetimes(df, df_name, keys): @@ -626,7 +601,7 @@ def validate_unique_datetimes(df, df_name, keys): df.duplicated(subset=(keys + [datetime_column]), keep=False) ] if len(duplicate_dt) > 0: - print(duplicate_dt) + logger.warning("\n" + duplicate_dt.to_string()) raise UserWarning( f"The dataframe {df_name} contains duplicate {datetime_column} values within each group of {keys}. See above output" ) @@ -840,7 +815,7 @@ def identify_percent_of_data_by_input_source( source_of_input_data = [] for name, df in data_sources.items(): if len(df) == 0: # Empty df. May occur when running `small` - print(f"WARNING: data source {name} has zero entries") + logger.warning(f"data source {name} has zero entries") continue if name == "eia": subplant_data = df.groupby( @@ -1380,8 +1355,8 @@ def check_for_anomalous_co2_factors( on="plant_id_eia", validate="m:1", ) - print("Potentially anomalous co2 factors detected for the following plants:") - print( + logger.warning("Potentially anomalous co2 factors detected for the following plants:") + logger.warning("\n" + factor_anomaly[ [ "plant_id_eia", @@ -1391,7 +1366,7 @@ def check_for_anomalous_co2_factors( f"{pollutant}_mass_lb_for_electricity", factor, ] - ].sort_values(by=factor) + ].sort_values(by=factor).to_string() ) @@ -1408,8 +1383,8 @@ def test_for_missing_fuel(df, generation_column): ) ] if not missing_fuel_test.empty: - print( - f"WARNING: There are {len(missing_fuel_test)} records where {generation_column} is positive but no fuel consumption is reported. Check `missing_fuel_test` for complete list" + logger.warning( + f"There are {len(missing_fuel_test)} records where {generation_column} is positive but no fuel consumption is reported. Check `missing_fuel_test` for complete list" ) return missing_fuel_test @@ -1418,8 +1393,8 @@ def test_for_missing_fuel(df, generation_column): def test_for_missing_co2(df): missing_co2_test = df[df["co2_mass_lb"].isna() & ~df["fuel_consumed_mmbtu"].isna()] if not missing_co2_test.empty: - print( - f"WARNING: There are {len(missing_co2_test)} records where co2 data is missing. Check `missing_co2_test` for complete list" + logger.warning( + f"There are {len(missing_co2_test)} records where co2 data is missing. Check `missing_co2_test` for complete list" ) return missing_co2_test @@ -1427,8 +1402,8 @@ def test_for_missing_co2(df): def test_for_missing_data(df, columns_to_test): missing_data_test = df[df[columns_to_test].isnull().all(axis=1)] if not missing_data_test.empty: - print( - f"WARNING: There are {len(missing_data_test)} records for which no data was reported. Check `missing_data_test` for complete list" + logger.warning( + f"There are {len(missing_data_test)} records for which no data was reported. 
Check `missing_data_test` for complete list" ) return missing_data_test @@ -1453,15 +1428,15 @@ def test_for_missing_incorrect_prime_movers(df, year): != incorrect_pm_test["prime_mover_code_eia860"] ] if not incorrect_pm_test.empty: - print( - f"WARNING: There are {len(incorrect_pm_test)} records for which the allocated prime mover does not match the reported prime mover. Check `incorrect_pm_test` for complete list" + logger.warning( + f"There are {len(incorrect_pm_test)} records for which the allocated prime mover does not match the reported prime mover. Check `incorrect_pm_test` for complete list" ) # check for missing PM code missing_pm_test = df[df["prime_mover_code"].isna()] if not missing_pm_test.empty: - print( - f"WARNING: There are {len(missing_pm_test)} records for which no prime mover was assigned. Check `missing_pm_test` for complete list" + logger.warning( + f"There are {len(missing_pm_test)} records for which no prime mover was assigned. Check `missing_pm_test` for complete list" ) return incorrect_pm_test, missing_pm_test @@ -1469,7 +1444,7 @@ def test_for_missing_incorrect_prime_movers(df, year): def test_for_outlier_heat_rates(df): # check heat rates - print("Heat Rate Test") + logger.warning("Heat Rate Test") # remove non-fossil fuel types thermal_generators = df[ ~df["energy_source_code"].isin(["SUN", "MWH", "WND", "WAT", "WH", "PUR"]) @@ -1508,10 +1483,10 @@ def test_for_outlier_heat_rates(df): ) ] if not heat_rate_test.empty: - print( - f" WARNING: {len(heat_rate_test)} of {len(generators_with_pm)} records for {fuel_type} generators with {pm} prime mover have heat rate of zero or > {outlier_threshold.round(2)} mmbtu/MWh" + logger.warning( + f"{len(heat_rate_test)} of {len(generators_with_pm)} records for {fuel_type} generators with {pm} prime mover have heat rate of zero or > {outlier_threshold.round(2)} mmbtu/MWh" ) - print( + logger.warning( f' median = {heat_rate_stats["50%"].round(2)}, max = {heat_rate_stats["max"].round(2)}, min = {heat_rate_stats["min"].round(2)}' ) heat_rate_test_all.append(heat_rate_test) @@ -1541,8 +1516,8 @@ def test_for_zero_data(df, columns_to_test): & (df[columns_to_test].sum(axis=1) == 0) ] if not zero_data_test.empty: - print( - f"WARNING: There are {len(zero_data_test)} records where all operating data are zero. Check `zero_data_test` for complete list" + logger.warning( + f"There are {len(zero_data_test)} records where all operating data are zero. Check `zero_data_test` for complete list" ) return zero_data_test @@ -1550,8 +1525,8 @@ def test_for_zero_data(df, columns_to_test): def test_gtn_results(df): gtn_test = df[df["net_generation_mwh"] > df["gross_generation_mwh"]] if not gtn_test.empty: - print( - f"WARNING: There are {round(len(gtn_test)/len(df)*100, 1)}% of records where net generation > gross generation. See `gtn_test` for details" + logger.warning( + f"There are {round(len(gtn_test)/len(df)*100, 1)}% of records where net generation > gross generation. 
See `gtn_test` for details" ) return gtn_test diff --git a/src/visualization.py b/src/visualization.py index 3121adf6..18c626bf 100644 --- a/src/visualization.py +++ b/src/visualization.py @@ -1,7 +1,4 @@ -""" -Helper functions for visualization - -""" +"""Helper functions for visualization.""" import pandas as pd import plotly.express as px diff --git a/test/test_logging.py b/test/test_logging.py new file mode 100644 index 00000000..f905175f --- /dev/null +++ b/test/test_logging.py @@ -0,0 +1,33 @@ +import sys +import logging + +import pandas as pd + +sys.path.append('../src') +sys.path.append('..') + +import src.eia930 as eia930 +from src.filepaths import top_folder + +from src.logging_util import get_logger, configure_root_logger + +pudl_logger = logging.getLogger(name="catalystcoop.pudl") + +configure_root_logger(logfile=top_folder('test/test_logfile.txt'), level=logging.INFO) +# If you call this again, nothing bad should happen. Logging statements should +# still only show up once. +configure_root_logger(logfile=top_folder('test/test_logfile.txt'), level=logging.INFO) +logger = get_logger('test') + + +def main(): + """These statements should each be printed once in a nice format.""" + logger.info('This is the OGE logger') + pudl_logger.info('This is the PUDL logger') + + df = pd.DataFrame({"a": [1,2,3], "b": [4,5,6]}) + logger.info("\n" + df.to_string()) + + +if __name__ == '__main__': + main() From 6c9d636eebf2e81aa905cfb58a9db73bf0dbc78c Mon Sep 17 00:00:00 2001 From: grgmiller Date: Thu, 23 Feb 2023 19:19:25 -0800 Subject: [PATCH 14/27] convert print to logger --- src/consumed.py | 8 ++++---- src/download_data.py | 9 +-------- src/validation.py | 14 ++++++-------- 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/src/consumed.py b/src/consumed.py index 65f0f037..38b25855 100644 --- a/src/consumed.py +++ b/src/consumed.py @@ -183,9 +183,9 @@ def consumption_emissions(F, P, ID): for j in perturbed: if X[j] != 0.0: - print(b[j]) - print(np.abs(A[j, :]).sum()) - print(np.abs(A[:, j]).sum()) + logger.warning("\n" + b[j].to_string()) + logger.warning("\n" + np.abs(A[j, :]).sum()) + logger.warning("\n" + np.abs(A[:, j]).sum()) raise ValueError("X[%d] is %.2f instead of 0" % (j, X[j])) return X, len(perturbed) @@ -486,7 +486,7 @@ def run(self): for adj in ADJUSTMENTS: total_failed = 0 col = get_rate_column(pol, adjustment=adj, generated=False) - print(f"{pol}, {adj}", end="...") + logger.info(f"Solving consumed {pol} {adj} emissions...") # Calculate emissions for date in self.generation.index: if self.small and ( diff --git a/src/download_data.py b/src/download_data.py index 151564fb..07491f51 100644 --- a/src/download_data.py +++ b/src/download_data.py @@ -109,18 +109,11 @@ def download_pudl_data(zenodo_url: str): def download_pudl(zenodo_url, pudl_version): r = requests.get(zenodo_url, params={"download": "1"}, stream=True) # specify parameters for progress bar - total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 * 1024 * 10 # 10 MB - downloaded = 0 + logger.info(" Downloading PUDL data...") with open(downloads_folder("pudl.tgz"), "wb") as fd: for chunk in r.iter_content(chunk_size=block_size): - print( - f" Downloading PUDL. Progress: {(round(downloaded/total_size_in_bytes*100,2))}% \r", - end="", - ) fd.write(chunk) - downloaded += block_size - logger.info(" Downloading PUDL. 
Progress: 100.0%")
 
     # extract the tgz file
     logger.info("    Extracting PUDL data...")
diff --git a/src/validation.py b/src/validation.py
index 83af09be..c0f4d94f 100644
--- a/src/validation.py
+++ b/src/validation.py
@@ -273,9 +273,8 @@ def test_for_missing_energy_source_code(df):
 
 def check_non_missing_cems_co2_values_unchanged(cems_original, cems):
     """Checks that no non-missing CO2 values were modified during the process of filling."""
-    print(
+    logger.info(
         " Checking that original CO2 data in CEMS was not modified by filling missing values...",
-        end="",
     )
     # only keep non-zero and non-missing co2 values, since these should have not been modified
    cems_original = cems_original.loc[
@@ -294,12 +293,11 @@ def check_non_missing_cems_co2_values_unchanged(cems_original, cems):
     )
     num_nonzero_rows = len(test_fill[test_fill["diff"] != 0])
     if num_nonzero_rows > 0:
-        print(" ")
-        print(
-            f"WARNING: There are {num_nonzero_rows} non-missing CO2 CEMS records that were modified by `fill_cems_missing_co2` in error"
+        logger.warning(
+            f"There are {num_nonzero_rows} non-missing CO2 CEMS records that were modified by `fill_cems_missing_co2` in error"
         )
     else:
-        print("OK")
+        logger.info("OK")
 
 
 def check_removed_data_is_empty(cems):
@@ -316,8 +314,8 @@ def check_removed_data_is_empty(cems):
         ],
     ].sum(numeric_only=True)
     if check_that_data_is_zero.sum() > 0:
-        print("WARNING: Some data being removed has non-zero data associated with it:")
-        print(check_that_data_is_zero)
+        logger.warning("Some data being removed has non-zero data associated with it:")
+        logger.warning("\n" + check_that_data_is_zero.to_string())
 
From f9a5a8feb9553d22fba6ce42de05896ee4d4720d Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Thu, 23 Feb 2023 19:23:24 -0800
Subject: [PATCH 15/27] remove exclamation

---
 src/consumed.py   | 6 +++---
 src/validation.py | 4 +---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/consumed.py b/src/consumed.py
index 38b25855..5ff7cce8 100644
--- a/src/consumed.py
+++ b/src/consumed.py
@@ -184,8 +184,8 @@ def consumption_emissions(F, P, ID):
         for j in perturbed:
             if X[j] != 0.0:
-                logger.warning("\n" + b[j].to_string())
-                logger.warning("\n" + np.abs(A[j, :]).sum())
-                logger.warning("\n" + np.abs(A[:, j]).sum())
+                logger.warning("\n" + str(b[j]))
+                logger.warning("\n" + str(np.abs(A[j, :]).sum()))
+                logger.warning("\n" + str(np.abs(A[:, j]).sum()))
                 raise ValueError("X[%d] is %.2f instead of 0" % (j, X[j]))
 
     return X, len(perturbed)
diff --git a/src/validation.py b/src/validation.py
index c0f4d94f..a50535a0 100644
--- a/src/validation.py
+++ b/src/validation.py
@@ -227,12 +227,10 @@ def test_for_missing_values(df, small: bool = False):
     if missing_warnings > 0:
         if small:
             logger.warning(
-                " Found missing values during small run, these may be fixed with full data"
+                "Found missing values during small run, these may be fixed with full data"
             )
         else:
-            logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
             logger.warning("The above missing values are errors and must be fixed")
-            logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
 
From 80923b689cc65e81308837e646ed56c60257c683 Mon Sep 17 00:00:00 2001
From: grgmiller
Date: Thu, 23 Feb 2023 20:22:04 -0800
Subject: [PATCH 16/27] expand validation coverage

---
 src/data_cleaning.py           |   9 +-
 src/gross_to_net_generation.py |   8 +-
 src/impute_hourly_profiles.py  |  26 +++--
 src/load_data.py               |  11 ++-
 src/output_data.py             |   7 +-
 src/validation.py              | 174 
+++++++++------------------------ 6 files changed, 95 insertions(+), 140 deletions(-) diff --git a/src/data_cleaning.py b/src/data_cleaning.py index f11330ac..263f15ea 100644 --- a/src/data_cleaning.py +++ b/src/data_cleaning.py @@ -545,7 +545,8 @@ def update_energy_source_codes(df): (df["energy_source_code"] == "OTH") & (df["fuel_consumed_mmbtu"] > 0) ] if len(plants_with_other_fuel) > 0: - logger.warning(f""" + logger.warning( + f""" After cleaning energy source codes, some fuel consumption is still associated with an 'OTH' fuel type. This will lead to incorrect emissions calculations. Check the following plants: {list(plants_with_other_fuel.plant_id_eia.unique())} @@ -574,6 +575,7 @@ def create_primary_fuel_table(gen_fuel_allocated, pudl_out, add_subplant_id, yea on=["plant_id_eia", "generator_id"], validate="m:1", ) + validation.test_for_missing_subplant_id(gen_fuel_allocated) # get a table of primary energy source codes gen_primary_fuel = gen_fuel_allocated[ @@ -763,6 +765,7 @@ def calculate_capacity_based_primary_fuel(pudl_out, agg_keys, year): on=["plant_id_eia", "generator_id"], validate="m:1", ) + validation.test_for_missing_subplant_id(gen_capacity) gen_capacity = ( gen_capacity.groupby(agg_keys + ["energy_source_code_1"], dropna=False)[ @@ -811,6 +814,7 @@ def calculate_subplant_efs(gen_fuel_allocated, year): on=["plant_id_eia", "generator_id"], validate="m:1", ) + validation.test_for_missing_subplant_id(subplant_efs) # calculate the total emissions and fuel consumption by subplant-month subplant_efs = subplant_efs.groupby( @@ -998,6 +1002,9 @@ def clean_cems(year: int, small: bool, primary_fuel_table, subplant_emission_fac cems = remove_cems_with_zero_monthly_data(cems) validation.test_for_negative_values(cems) + validation.validate_unique_datetimes( + cems, "cems", ["plant_id_eia", "emissions_unit_id_epa"] + ) cems = apply_dtypes(cems) diff --git a/src/gross_to_net_generation.py b/src/gross_to_net_generation.py index 0d6ed96c..b574ec20 100644 --- a/src/gross_to_net_generation.py +++ b/src/gross_to_net_generation.py @@ -95,7 +95,12 @@ def convert_gross_to_net_generation(cems, eia923_allocated, plant_attributes, ye logger.warning( "The following subplants are missing default GTN ratios. 
Using a default value of 0.97" ) - logger.warning("\n" + missing_defaults[["plant_id_eia", "subplant_id"]].drop_duplicates().to_string()) + logger.warning( + "\n" + + missing_defaults[["plant_id_eia", "subplant_id"]] + .drop_duplicates() + .to_string() + ) # if there is a missing default gtn ratio, fill with 0.97 cems["default_gtn_ratio"] = cems["default_gtn_ratio"].fillna(0.97) cems["net_generation_mwh"] = cems["net_generation_mwh"].fillna( @@ -425,6 +430,7 @@ def calculate_subplant_nameplate_capacity(year): on=["plant_id_eia", "generator_id"], validate="1:1", ) + validation.test_for_missing_subplant_id(gen_capacity) subplant_capacity = ( gen_capacity.groupby(["plant_id_eia", "subplant_id"])["capacity_mw"] .sum() diff --git a/src/impute_hourly_profiles.py b/src/impute_hourly_profiles.py index 2d9bb64c..6c064a65 100644 --- a/src/impute_hourly_profiles.py +++ b/src/impute_hourly_profiles.py @@ -282,6 +282,9 @@ def aggregate_for_residual( # add the partial cems data cems = pd.concat([cems, partial_cems_subplant, partial_cems_plant], axis=0) + validation.validate_unique_datetimes( + cems, "cems_for_residual", ["plant_id_eia", "subplant_id"] + ) # merge in plant attributes cems = cems.merge(plant_attributes, how="left", on="plant_id_eia", validate="m:1") @@ -297,7 +300,12 @@ def aggregate_for_residual( logger.warning( "The following cems subplants are missing fuel categories and will lead to incorrect residual calculations:" ) - logger.warning("\n" + missing_fuel_category[["plant_id_eia", "subplant_id"]].drop_duplicates().to_string()) + logger.warning( + "\n" + + missing_fuel_category[["plant_id_eia", "subplant_id"]] + .drop_duplicates() + .to_string() + ) raise UserWarning( "The missing fuel categories must be fixed before proceeding." ) @@ -657,6 +665,9 @@ def impute_missing_hourly_profiles( hourly_profiles["datetime_utc"] = pd.to_datetime( hourly_profiles["datetime_utc"], utc=True ) + validation.validate_unique_datetimes( + hourly_profiles, "hourly_profiles", ["ba_code", "fuel_category"] + ) return hourly_profiles @@ -1035,12 +1046,6 @@ def combine_and_export_hourly_plant_data( df_name="shaped_eia_data", keys=["plant_id_eia"], ) - # validate that the shaping did not alter data at the monthly level - validation.validate_shaped_totals( - shaped_eia_region_data, - eia_region, - group_keys=[region_to_group, "fuel_category"], - ) # concat all of the data together combined_plant_data = pd.concat( @@ -1230,6 +1235,13 @@ def shape_monthly_eia_data_as_hourly(monthly_eia_data_to_shape, hourly_profiles) [col for col in column_order if col in shaped_monthly_data.columns] ] + # validate that the shaping did not alter data at the monthly level + validation.validate_shaped_totals( + shaped_monthly_data, + monthly_eia_data_to_shape, + group_keys=["ba_code", "fuel_category"], + ) + return shaped_monthly_data diff --git a/src/load_data.py b/src/load_data.py index 798c480b..81c84e3d 100644 --- a/src/load_data.py +++ b/src/load_data.py @@ -9,6 +9,7 @@ from column_checks import get_dtypes from filepaths import downloads_folder, manual_folder, outputs_folder +from validation import validate_unique_datetimes from logging_util import get_logger logger = get_logger(__name__) @@ -112,6 +113,8 @@ def load_cems_data(year): } ) + validate_unique_datetimes(cems, "cems", ["plant_id_eia", "emissions_unit_id_epa"]) + return cems @@ -832,7 +835,9 @@ def load_boiler_control_id_association_eia860(year, pollutant): logger.warning( "Environmental association data prior to 2013 have not been integrated into the data pipeline." 
) - logger.warning("This may result in less accurate pollutant emissions calculations.") + logger.warning( + "This may result in less accurate pollutant emissions calculations." + ) boiler_control_id_association_eia860 = pd.DataFrame( columns=boiler_association_eia860_names ) @@ -881,7 +886,9 @@ def load_boiler_design_parameters_eia860(year): logger.warning( "Boiler Design data prior to 2013 have not been integrated into the data pipeline." ) - logger.warning("This may result in less accurate NOx and SO2 emissions calculations.") + logger.warning( + "This may result in less accurate NOx and SO2 emissions calculations." + ) boiler_design_parameters_eia860 = pd.DataFrame( columns=list(boiler_design_parameters_eia860_names.values()) ) diff --git a/src/output_data.py b/src/output_data.py index 23eb9824..6626cc96 100644 --- a/src/output_data.py +++ b/src/output_data.py @@ -75,7 +75,9 @@ def zip_results_for_s3(year): # skip the metric hourly plant data since we do not create those outputs pass else: - logger.info(f"zipping {year}_{data_type}_{aggregation}_{unit} for s3") + logger.info( + f"zipping {year}_{data_type}_{aggregation}_{unit} for s3" + ) folder = ( f"{results_folder()}/{year}/{data_type}/{aggregation}/{unit}" ) @@ -176,6 +178,9 @@ def output_plant_data(df, path_prefix, resolution, skip_outputs, plant_attribute if not skip_outputs: if resolution == "hourly": # output hourly data + validation.validate_unique_datetimes( + df, "individual_plant_data", ["plant_id_eia"] + ) # Separately save real and aggregate plants output_to_results( df[df.plant_id_eia > 900000], diff --git a/src/validation.py b/src/validation.py index 83af09be..5ab57e8f 100644 --- a/src/validation.py +++ b/src/validation.py @@ -79,78 +79,35 @@ def check_allocated_gf_matches_input_gf(pudl_out, gen_fuel_allocated): logger.warning("Percentage Difference:") logger.warning("\n" + mismatched_allocation.to_string()) logger.warning("EIA-923 Input Totals:") - logger.warning("\n" + plant_total_gf.loc[mismatched_allocation.index, :].to_string()) + logger.warning( + "\n" + plant_total_gf.loc[mismatched_allocation.index, :].to_string() + ) logger.warning("Allocated Totals:") - logger.warning("\n" + plant_total_alloc.loc[mismatched_allocation.index, :].to_string()) - + logger.warning( + "\n" + plant_total_alloc.loc[mismatched_allocation.index, :].to_string() + ) def test_for_negative_values(df, small: bool = False): """Checks that there are no unexpected negative values in the data.""" logger.info("Checking that fuel and emissions values are positive... 
") - columns_that_should_be_positive = [ - "fuel_consumed_mmbtu", - "fuel_consumed_for_electricity_mmbtu", - "co2_mass_lb", - "ch4_mass_lb", - "n2o_mass_lb", - "co2e_mass_lb", - "nox_mass_lb", - "so2_mass_lb", - "co2_mass_lb_for_electricity", - "ch4_mass_lb_for_electricity", - "n2o_mass_lb_for_electricity", - "co2e_mass_lb_for_electricity", - "nox_mass_lb_for_electricity", - "so2_mass_lb_for_electricity", - "co2_mass_lb_adjusted", - "ch4_mass_lb_adjusted", - "n2o_mass_lb_adjusted", - "co2e_mass_lb_adjusted", - "nox_mass_lb_adjusted", - "so2_mass_lb_adjusted", - "co2_mass_lb_for_electricity_adjusted", - "ch4_mass_lb_for_electricity_adjusted", - "n2o_mass_lb_for_electricity_adjusted", - "co2e_mass_lb_for_electricity_adjusted", - "nox_mass_lb_for_electricity_adjusted", - "so2_mass_lb_for_electricity_adjusted", - "generated_co2_rate_lb_per_mwh_for_electricity", - "generated_ch4_rate_lb_per_mwh_for_electricity", - "generated_n2o_rate_lb_per_mwh_for_electricity", - "generated_co2e_rate_lb_per_mwh_for_electricity", - "generated_nox_rate_lb_per_mwh_for_electricity", - "generated_so2_rate_lb_per_mwh_for_electricity", - "generated_co2_rate_lb_per_mwh_for_electricity_adjusted", - "generated_ch4_rate_lb_per_mwh_for_electricity_adjusted", - "generated_n2o_rate_lb_per_mwh_for_electricity_adjusted", - "generated_co2e_rate_lb_per_mwh_for_electricity_adjusted", - "generated_nox_rate_lb_per_mwh_for_electricity_adjusted", - "generated_so2_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_co2_rate_lb_per_mwh_for_electricity", - "consumed_ch4_rate_lb_per_mwh_for_electricity", - "consumed_n2o_rate_lb_per_mwh_for_electricity", - "consumed_co2e_rate_lb_per_mwh_for_electricity", - "consumed_nox_rate_lb_per_mwh_for_electricity", - "consumed_so2_rate_lb_per_mwh_for_electricity", - "consumed_co2_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_ch4_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_n2o_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_co2e_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_nox_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_so2_rate_lb_per_mwh_for_electricity_adjusted", - ] - columns_to_test = [ - col for col in columns_that_should_be_positive if col in df.columns - ] + columns_that_can_be_negative = ["net_generation_mwh"] negative_warnings = 0 - for column in columns_to_test: - negative_test = df[df[column] < 0] - if not negative_test.empty: - logger.warning( - f"There are {len(negative_test)} records where {column} is negative." - ) - negative_warnings += 1 + for column in df.columns: + # if the column is allowed to be negative, skip the test + if column in columns_that_can_be_negative: + pass + else: + # if the column is not numeric, skip the test + if pd.api.types.is_numeric_dtype(df[column].dtype): + negative_test = df[df[column] < 0] + if not negative_test.empty: + logger.warning( + f"There are {len(negative_test)} records where {column} is negative." 
+ ) + negative_warnings += 1 + else: + pass if negative_warnings > 0: if small: logger.warning( @@ -158,7 +115,6 @@ def test_for_negative_values(df, small: bool = False): ) else: logger.warning("The above negative values are errors and must be fixed!") - # raise UserWarning("The above negative values are errors and must be fixed") else: logger.info("OK") return negative_test @@ -167,57 +123,8 @@ def test_for_negative_values(df, small: bool = False): def test_for_missing_values(df, small: bool = False): """Checks that there are no unexpected missing values in the output data.""" logger.info("Checking that no values are missing... ") - columns_that_should_be_complete = [ - "plant_id_eia", - "fuel_category", - "datetime_local", - "datetime_utc", - "month", - "net_generation_mwh", - "fuel_consumed_mmbtu", - "fuel_consumed_for_electricity_mmbtu", - "co2_mass_lb", - "ch4_mass_lb", - "n2o_mass_lb", - "co2e_mass_lb", - "nox_mass_lb", - "so2_mass_lb", - "co2_mass_lb_for_electricity", - "ch4_mass_lb_for_electricity", - "n2o_mass_lb_for_electricity", - "co2e_mass_lb_for_electricity", - "nox_mass_lb_for_electricity", - "so2_mass_lb_for_electricity", - "co2_mass_lb_adjusted", - "ch4_mass_lb_adjusted", - "n2o_mass_lb_adjusted", - "co2e_mass_lb_adjusted", - "nox_mass_lb_adjusted", - "so2_mass_lb_adjusted", - "co2_mass_lb_for_electricity_adjusted", - "ch4_mass_lb_for_electricity_adjusted", - "n2o_mass_lb_for_electricity_adjusted", - "co2e_mass_lb_for_electricity_adjusted", - "nox_mass_lb_for_electricity_adjusted", - "so2_mass_lb_for_electricity_adjusted", - "consumed_co2_rate_lb_per_mwh_for_electricity", - "consumed_ch4_rate_lb_per_mwh_for_electricity", - "consumed_n2o_rate_lb_per_mwh_for_electricity", - "consumed_co2e_rate_lb_per_mwh_for_electricity", - "consumed_nox_rate_lb_per_mwh_for_electricity", - "consumed_so2_rate_lb_per_mwh_for_electricity", - "consumed_co2_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_ch4_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_n2o_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_co2e_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_nox_rate_lb_per_mwh_for_electricity_adjusted", - "consumed_so2_rate_lb_per_mwh_for_electricity_adjusted", - ] - columns_to_test = [ - col for col in columns_that_should_be_complete if col in df.columns - ] missing_warnings = 0 - for column in columns_to_test: + for column in df.columns: missing_test = df[df[column].isna()] if not missing_test.empty: logger.warning( @@ -230,9 +137,7 @@ def test_for_missing_values(df, small: bool = False): " Found missing values during small run, these may be fixed with full data" ) else: - logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") logger.warning("The above missing values are errors and must be fixed") - logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") else: logger.info("OK") return missing_test @@ -240,7 +145,9 @@ def test_for_missing_values(df, small: bool = False): def test_chp_allocation(df): """Checks that the CHP allocation didn't create any anomalous values.""" - logger.info("Checking that total fuel consumed >= fuel consumed for electricity... ") + logger.info( + "Checking that total fuel consumed >= fuel consumed for electricity... 
" + ) chp_allocation_test = df[ df["fuel_consumed_for_electricity_mmbtu"] > df["fuel_consumed_mmbtu"] ] @@ -257,7 +164,8 @@ def test_chp_allocation(df): def test_for_missing_energy_source_code(df): """Checks that there are no missing energy source codes associated with non-zero fuel consumption.""" logger.info( - "Checking that there are no missing energy source codes associated with non-zero fuel consumption... ") + "Checking that there are no missing energy source codes associated with non-zero fuel consumption... " + ) missing_esc_test = df[ (df["energy_source_code"].isna()) & (df["fuel_consumed_mmbtu"] > 0) ] @@ -335,7 +243,9 @@ def test_for_missing_subplant_id(df): def validate_gross_to_net_conversion(cems, eia923_allocated): """checks whether the calculated net generation matches the reported net generation from EIA-923 at the annual plant level.""" - logger.info("Checking that calculated net generation matches reported net generation in EIA-923... ") + logger.info( + "Checking that calculated net generation matches reported net generation in EIA-923... " + ) # merge together monthly subplant totals from EIA and calculated from CEMS eia_netgen = ( eia923_allocated.groupby( @@ -388,7 +298,9 @@ def validate_gross_to_net_conversion(cems, eia923_allocated): def test_emissions_adjustments(df): """For each emission, tests that mass_lb >= mass_lb_for_electricity >= mass_lb_for_electricity_adjusted.""" - logger.info("Checking that adjusted emission values are less than total emissions... ") + logger.info( + "Checking that adjusted emission values are less than total emissions... " + ) pollutants = ["co2", "ch4", "n2o", "co2e", "nox", "so2"] @@ -574,8 +486,9 @@ def validate_shaped_totals(shaped_eia_data, monthly_eia_data_to_shape, group_key compare = (shaped_data_agg - eia_data_agg).round(0) if compare.sum().sum() > 0: - logger.warning("\n" + - compare[ + logger.warning( + "\n" + + compare[ (compare["net_generation_mwh"] != 0) | (compare["fuel_consumed_mmbtu"] != 0) ].to_string() @@ -1355,9 +1268,12 @@ def check_for_anomalous_co2_factors( on="plant_id_eia", validate="m:1", ) - logger.warning("Potentially anomalous co2 factors detected for the following plants:") - logger.warning("\n" + - factor_anomaly[ + logger.warning( + "Potentially anomalous co2 factors detected for the following plants:" + ) + logger.warning( + "\n" + + factor_anomaly[ [ "plant_id_eia", "plant_primary_fuel", @@ -1366,7 +1282,9 @@ def check_for_anomalous_co2_factors( f"{pollutant}_mass_lb_for_electricity", factor, ] - ].sort_values(by=factor).to_string() + ] + .sort_values(by=factor) + .to_string() ) From 690538c2fdbca9445a9ee80111e8227e0d1b8a66 Mon Sep 17 00:00:00 2001 From: Milo Knowles Date: Fri, 24 Feb 2023 10:11:43 -0500 Subject: [PATCH 17/27] Improve flag handling and log args --- src/data_pipeline.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/data_pipeline.py b/src/data_pipeline.py index d30cefa1..c4c66574 100644 --- a/src/data_pipeline.py +++ b/src/data_pipeline.py @@ -39,33 +39,43 @@ def get_args() -> argparse.Namespace: parser.add_argument( "--shape_individual_plants", help="Assign an hourly profile to each individual plant with EIA-only data, instead of aggregating to the fleet level before shaping.", - type=bool, default=True, + action=argparse.BooleanOptionalAction ) parser.add_argument( "--small", help="Run on subset of data for quicker testing, outputs to outputs/small and results to results/small.", - type=bool, default=False, + 
action=argparse.BooleanOptionalAction
     )
     parser.add_argument(
         "--flat",
         help="Use flat hourly profiles?",
+        default=False,
+        action=argparse.BooleanOptionalAction
     )
     parser.add_argument(
         "--skip_outputs",
         help="Skip outputting data to csv files for quicker testing.",
-        type=bool,
         default=False,
+        action=argparse.BooleanOptionalAction
     )
 
     args = parser.parse_args()
+
     return args
 
 
+def print_args(args: argparse.Namespace):
+    """Print out the command line arguments."""
+    s = "\n".join([f" * {argname} = {argvalue}" for argname, argvalue in vars(args).items()])
+    logger.info(f"\n\nRunning with the following options:\n{s}\n")
+
+
 def main():
     """Runs the OGE data pipeline."""
     args = get_args()
+    print_args(args)
 
     year = args.year
     logger.info(f'Running data pipeline for year {year}')

From 1a091c41255bd88ac9dcf7b0b449865d4c468f1e Mon Sep 17 00:00:00 2001
From: Milo Knowles
Date: Fri, 24 Feb 2023 10:39:04 -0500
Subject: [PATCH 18/27] WIP

---
 src/data_pipeline.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/data_pipeline.py b/src/data_pipeline.py
index c4c66574..5418ea75 100644
--- a/src/data_pipeline.py
+++ b/src/data_pipeline.py
@@ -68,14 +68,15 @@ def get_args() -> argparse.Namespace:
 
 def print_args(args: argparse.Namespace):
     """Print out the command line arguments."""
-    s = "\n".join([f" * {argname} = {argvalue}" for argname, argvalue in vars(args).items()])
-    logger.info(f"\n\nRunning with the following options:\n{s}\n")
+    argstring = "\n".join([f" * {k} = {v}" for k, v in vars(args).items()])
+    logger.info(f"\n\nRunning with the following options:\n{argstring}\n")
 
 
 def main():
     """Runs the OGE data pipeline."""
     args = get_args()
     print_args(args)
+
     year = args.year
     logger.info(f'Running data pipeline for year {year}')

From 121a1f1de6dcfb6d3e7d035233801cf32de8f29b Mon Sep 17 00:00:00 2001
From: Milo Knowles
Date: Fri, 24 Feb 2023 11:21:55 -0500
Subject: [PATCH 19/27] Make sure the folder where logs go exists

---
 src/filepaths.py    | 10 ++++++++++
 src/logging_util.py |  5 +++++
 2 files changed, 15 insertions(+)

diff --git a/src/filepaths.py b/src/filepaths.py
index ec143b07..bc8d9c60 100644
--- a/src/filepaths.py
+++ b/src/filepaths.py
@@ -32,3 +32,13 @@
 
 def outputs_folder(rel=""):
     return os.path.join(data_folder("outputs"), rel)
+
+
+def containing_folder(filepath: str) -> str:
+    """Returns the folder containing `filepath`."""
+    return os.path.dirname(os.path.realpath(filepath))
+
+
+def make_containing_folder(filepath: str):
+    """Make sure that the folder where `filepath` goes exists."""
+    os.makedirs(containing_folder(filepath), exist_ok=True)
diff --git a/src/logging_util.py b/src/logging_util.py
index 79bbaf12..9f1fba28 100644
--- a/src/logging_util.py
+++ b/src/logging_util.py
@@ -2,6 +2,8 @@
 import logging
 import coloredlogs
 
+from filepaths import make_containing_folder
+
 
 def get_logger(name: str) -> logging.Logger:
@@ -41,9 +43,12 @@ def configure_root_logger(logfile: str | None = None, level: str = "INFO"):
 
     # Send everything to the log file by adding a file handler to the root logger.
     # Note: the console handler is attached to the `oge` logger (so only OGE output
     # is colored), while the file handler below goes on the root logger so that
     # records from PUDL and other third-party loggers are captured in the file too.
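+    # the folder that will hold the log file may not exist yet on a fresh checkout,
+    # so it is created before the FileHandler tries to open the file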
if logfile is not None: + make_containing_folder(logfile) file_logger = logging.FileHandler(logfile, mode='w') file_logger.setFormatter(logging.Formatter(log_format)) From 83817233009cca981b6d4cd25d2de1c5de4dc7f1 Mon Sep 17 00:00:00 2001 From: grgmiller Date: Fri, 24 Feb 2023 10:03:24 -0800 Subject: [PATCH 20/27] add pudl download tracker back --- src/download_data.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/download_data.py b/src/download_data.py index 07491f51..cb84a588 100644 --- a/src/download_data.py +++ b/src/download_data.py @@ -41,7 +41,9 @@ def download_helper( # If the file already exists, do not re-download it. final_destination = output_path if output_path is not None else download_path if os.path.exists(final_destination): - logger.info(f" {final_destination.split('/')[-1]} already downloaded, skipping.") + logger.info( + f" {final_destination.split('/')[-1]} already downloaded, skipping." + ) return False # Otherwise, download to the file in chunks. @@ -109,11 +111,19 @@ def download_pudl_data(zenodo_url: str): def download_pudl(zenodo_url, pudl_version): r = requests.get(zenodo_url, params={"download": "1"}, stream=True) # specify parameters for progress bar + total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 * 1024 * 10 # 10 MB + downloaded = 0 logger.info(" Downloading PUDL data...") with open(downloads_folder("pudl.tgz"), "wb") as fd: for chunk in r.iter_content(chunk_size=block_size): + print( + f" Progress: {(round(downloaded/total_size_in_bytes*100,2))}% \r", + end="", + ) fd.write(chunk) + downloaded += block_size + print(" Progress: 100.0%") # extract the tgz file logger.info(" Extracting PUDL data...") From f678d0b235d9e8d8e60ec3881db6196c96d7ff84 Mon Sep 17 00:00:00 2001 From: grgmiller Date: Sat, 25 Feb 2023 15:47:29 -0800 Subject: [PATCH 21/27] change log location --- src/data_cleaning.py | 20 ++++----- src/data_pipeline.py | 23 ++++++---- src/download_data.py | 16 +++---- src/eia930.py | 6 +-- src/gross_to_net_generation.py | 2 +- src/impute_hourly_profiles.py | 2 +- src/load_data.py | 2 +- src/logging_util.py | 79 +++++++++++++++++----------------- src/output_data.py | 6 +-- src/validation.py | 2 +- 10 files changed, 81 insertions(+), 77 deletions(-) diff --git a/src/data_cleaning.py b/src/data_cleaning.py index f11330ac..1dd13449 100644 --- a/src/data_cleaning.py +++ b/src/data_cleaning.py @@ -54,11 +54,11 @@ def identify_subplants(year, number_of_years=5): end_year = year # load 5 years of monthly data from CEMS - logger.info(" loading CEMS ids") + logger.info("loading CEMS ids") cems_ids = load_data.load_cems_ids(start_year, end_year) # add subplant ids to the data - logger.info(" identifying unique subplants") + logger.info("identifying unique subplants") generate_subplant_ids(start_year, end_year, cems_ids) @@ -883,7 +883,7 @@ def remove_plants( ].plant_id_eia.unique() ) logger.info( - f" Removing {len(plants_in_states_to_remove)} plants located in the following states: {remove_states}" + f"Removing {len(plants_in_states_to_remove)} plants located in the following states: {remove_states}" ) df = df[~df["plant_id_eia"].isin(plants_in_states_to_remove)] if steam_only_plants: @@ -918,7 +918,7 @@ def remove_non_grid_connected_plants(df): "plant_id_eia" ].unique() ) - logger.info(f" Removing {num_plants} plants that are not grid-connected") + logger.info(f"Removing {num_plants} plants that are not grid-connected") df = df[~df["plant_id_eia"].isin(ngc_plants)] @@ -1005,7 +1005,7 @@ def 
clean_cems(year: int, small: bool, primary_fuel_table, subplant_emission_fac def smallerize_test_data(df, random_seed=None): - logger.info(" Randomly selecting 5% of plants for faster test run.") + logger.info("Randomly selecting 5% of plants for faster test run.") # Select 5% of plants selected_plants = df.plant_id_eia.unique() if random_seed is not None: @@ -1031,7 +1031,7 @@ def manually_remove_steam_units(df): )[["plant_id_eia", "emissions_unit_id_epa"]] logger.info( - f" Removing {len(units_to_remove)} units that only produce steam and do not report to EIA" + f"Removing {len(units_to_remove)} units that only produce steam and do not report to EIA" ) df = df.merge( @@ -1063,7 +1063,7 @@ def remove_incomplete_unit_months(cems): ].drop(columns="datetime_utc") logger.info( - f" Removing {len(unit_months_to_remove)} unit-months with incomplete hourly data" + f"Removing {len(unit_months_to_remove)} unit-months with incomplete hourly data" ) cems = cems.merge( @@ -1296,7 +1296,7 @@ def remove_cems_with_zero_monthly_data(cems): ) # remove any observations with the missing data flag logger.info( - f" Removing {len(cems[cems['missing_data_flag'] == 'remove'])} observations from cems for unit-months where no data reported" + f"Removing {len(cems[cems['missing_data_flag'] == 'remove'])} observations from cems for unit-months where no data reported" ) validation.check_removed_data_is_empty(cems) cems = cems[cems["missing_data_flag"] != "remove"] @@ -1670,7 +1670,7 @@ def identify_partial_cems_plants(all_data): # likely resulting from mixed fuel types. # If subplant_id assignment is working, there shouldn't be any raise Exception( - f" ERROR: {len(mixed_method_subplants)} subplant-months have multiple hourly methods assigned." + f"ERROR: {len(mixed_method_subplants)} subplant-months have multiple hourly methods assigned." 
) # remove the intermediate indicator column @@ -1960,7 +1960,7 @@ def assign_ba_code_to_plant(df, year): df = df.merge(plant_ba, how="left", on="plant_id_eia", validate="m:1") if len(df[df["ba_code"].isna()]) > 0: - logger.warning(" the following plants are missing ba_code:") + logger.warning("the following plants are missing ba_code:") logger.warning("\n" + df[df["ba_code"].isna()].tostring()) # replace missing ba codes with NA diff --git a/src/data_pipeline.py b/src/data_pipeline.py index 5418ea75..ab87e491 100644 --- a/src/data_pipeline.py +++ b/src/data_pipeline.py @@ -40,25 +40,25 @@ def get_args() -> argparse.Namespace: "--shape_individual_plants", help="Assign an hourly profile to each individual plant with EIA-only data, instead of aggregating to the fleet level before shaping.", default=True, - action=argparse.BooleanOptionalAction + action=argparse.BooleanOptionalAction, ) parser.add_argument( "--small", help="Run on subset of data for quicker testing, outputs to outputs/small and results to results/small.", default=False, - action=argparse.BooleanOptionalAction + action=argparse.BooleanOptionalAction, ) parser.add_argument( "--flat", help="Use flat hourly profiles?", default=False, - action=argparse.BooleanOptionalAction + action=argparse.BooleanOptionalAction, ) parser.add_argument( "--skip_outputs", help="Skip outputting data to csv files for quicker testing.", default=False, - action=argparse.BooleanOptionalAction + action=argparse.BooleanOptionalAction, ) args = parser.parse_args() @@ -78,7 +78,7 @@ def main(): print_args(args) year = args.year - logger.info(f'Running data pipeline for year {year}') + logger.info(f"Running data pipeline for year {year}") validation.validate_year(year) @@ -344,12 +344,12 @@ def main(): logger.info("12. Cleaning EIA-930 data") # Scrapes and cleans data in data/downloads, outputs cleaned file at EBA_elec.csv if args.flat: - logger.info(" Not running 930 cleaning because we'll be using a flat profile.") + logger.info("Not running 930 cleaning because we'll be using a flat profile.") elif not (os.path.exists(outputs_folder(f"{path_prefix}/eia930/eia930_elec.csv"))): eia930.clean_930(year, small=args.small, path_prefix=path_prefix) else: logger.info( - f" Not re-running 930 cleaning. If you'd like to re-run, please delete data/outputs/{path_prefix}/eia930/" + f"Not re-running 930 cleaning. If you'd like to re-run, please delete data/outputs/{path_prefix}/eia930/" ) # If running small, we didn't clean the whole year, so need to use the Chalender file to build residual profiles. @@ -413,10 +413,10 @@ def main(): ) else: logger.info( - " Not shaping and exporting individual plant data since `shape_individual_plants` is False." + "Not shaping and exporting individual plant data since `shape_individual_plants` is False." ) logger.info( - " Plants that only report to EIA will be aggregated to the fleet level before shaping." + "Plants that only report to EIA will be aggregated to the fleet level before shaping." ) # 15. 
Shape fleet-level data @@ -550,6 +550,11 @@ def main(): hourly_consumed_calc.run() hourly_consumed_calc.output_results() + # move the log file into the specific year output folder + shutil.move( + outputs_folder("data_pipeline.log"), outputs_folder(f"{year}/data_pipeline.log") + ) + if __name__ == "__main__": main() diff --git a/src/download_data.py b/src/download_data.py index cb84a588..10763e06 100644 --- a/src/download_data.py +++ b/src/download_data.py @@ -42,12 +42,12 @@ def download_helper( final_destination = output_path if output_path is not None else download_path if os.path.exists(final_destination): logger.info( - f" {final_destination.split('/')[-1]} already downloaded, skipping." + f"{final_destination.split('/')[-1]} already downloaded, skipping." ) return False # Otherwise, download to the file in chunks. - logger.info(f" Downloading {final_destination.split('/')[-1]}") + logger.info(f"Downloading {final_destination.split('/')[-1]}") r = requests.get(input_url, stream=True) with open(download_path, "wb") as fd: for chunk in r.iter_content(chunk_size=chunk_size): @@ -99,10 +99,10 @@ def download_pudl_data(zenodo_url: str): with open(pudl_version_file, "r") as f: existing_version = f.readlines()[0].replace("\n", "") if pudl_version == existing_version: - logger.info(" PUDL version already downloaded") + logger.info("PUDL version already downloaded") return else: - logger.info(" Downloading new version of pudl") + logger.info("Downloading new version of pudl") shutil.rmtree(downloads_folder("pudl")) download_pudl(zenodo_url, pudl_version) @@ -114,19 +114,19 @@ def download_pudl(zenodo_url, pudl_version): total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 * 1024 * 10 # 10 MB downloaded = 0 - logger.info(" Downloading PUDL data...") + logger.info("Downloading PUDL data...") with open(downloads_folder("pudl.tgz"), "wb") as fd: for chunk in r.iter_content(chunk_size=block_size): print( - f" Progress: {(round(downloaded/total_size_in_bytes*100,2))}% \r", + f"Progress: {(round(downloaded/total_size_in_bytes*100,2))}% \r", end="", ) fd.write(chunk) downloaded += block_size - print(" Progress: 100.0%") + print("Progress: 100.0%") # extract the tgz file - logger.info(" Extracting PUDL data...") + logger.info("Extracting PUDL data...") with tarfile.open(downloads_folder("pudl.tgz")) as tar: tar.extractall(data_folder()) diff --git a/src/eia930.py b/src/eia930.py index 36aa7c1e..42ae2d6e 100644 --- a/src/eia930.py +++ b/src/eia930.py @@ -145,14 +145,14 @@ def clean_930(year: int, small: bool = False, path_prefix: str = ""): df = df.loc[start:end] # Don't worry about processing everything # Adjust - logger.info(" Adjusting EIA-930 time stamps") + logger.info("Adjusting EIA-930 time stamps") df = manual_930_adjust(df) df.to_csv( join(data_folder, "eia930_raw.csv") ) # Will be read by gridemissions workflow # Run cleaning - logger.info(" Running physics-based data cleaning") + logger.info("Running physics-based data cleaning") make_dataset( start, end, @@ -289,7 +289,7 @@ def remove_imputed_ones(eia930_data): filter = eia930_data["net_generation_mwh_930"].abs() < 1.5 # replace all 1.0 values with zero - logger.info(f" replacing {sum(filter)} imputed 1 values with 0") + logger.info(f"Replacing {sum(filter)} imputed 1 values with 0") eia930_data.loc[filter, "net_generation_mwh_930"] = 0 return eia930_data diff --git a/src/gross_to_net_generation.py b/src/gross_to_net_generation.py index 0d6ed96c..3c80d6e0 100644 --- a/src/gross_to_net_generation.py +++ 
b/src/gross_to_net_generation.py @@ -775,7 +775,7 @@ def load_monthly_gross_and_net_generation(start_year, end_year): ) # allocate net generation and heat input to each generator-fuel grouping - logger.info(" Allocating EIA-923 generation data") + logger.info("Allocating EIA-923 generation data") gen_fuel_allocated = allocate_gen_fuel.allocate_gen_fuel_by_generator_energy_source( pudl_out, drop_interim_cols=True ) diff --git a/src/impute_hourly_profiles.py b/src/impute_hourly_profiles.py index 2d9bb64c..fa642eb8 100644 --- a/src/impute_hourly_profiles.py +++ b/src/impute_hourly_profiles.py @@ -710,7 +710,7 @@ def average_diba_wind_solar_profiles( ] if len(df_temporary) == 0 and not validation_run: # if this error is raised, we might have to implement an approach that uses average values for the wider region - logger.warning(f" There is no {fuel} data in the DIBAs for {ba}: {ba_dibas}") + logger.warning(f"There is no {fuel} data in the DIBAs for {ba}: {ba_dibas}") df_temporary = average_national_wind_solar_profiles( residual_profiles, ba, fuel, report_date ) diff --git a/src/load_data.py b/src/load_data.py index 798c480b..98164b6f 100644 --- a/src/load_data.py +++ b/src/load_data.py @@ -156,7 +156,7 @@ def load_cems_gross_generation(start_year, end_year): cems_all = [] for year in range(start_year, end_year + 1): - logger.info(f" loading {year} CEMS data") + logger.info(f"loading {year} CEMS data") # specify the path to the CEMS data cems_path = downloads_folder( "pudl/pudl_data/parquet/epacems/hourly_emissions_epacems/" diff --git a/src/logging_util.py b/src/logging_util.py index 9f1fba28..3ad47428 100644 --- a/src/logging_util.py +++ b/src/logging_util.py @@ -6,47 +6,46 @@ def get_logger(name: str) -> logging.Logger: - """Helper function to append `oge` to the logger name and return a logger. + """Helper function to append `oge` to the logger name and return a logger. - As a result, all returned loggers a children of the top-level `oge` logger. - """ - return logging.getLogger(f"oge.{name}") + As a result, all returned loggers a children of the top-level `oge` logger. + """ + return logging.getLogger(f"oge.{name}") def configure_root_logger(logfile: str | None = None, level: str = "INFO"): - """Configure the OGE logger to print to the console, and optionally to a file. - - This function is safe to call multiple times, since it will check if logging - handlers have already been installed and skip them if so. - - Logging is printed with the same format as PUDL: - ``` - 2023-02-21 16:10:44 [INFO] oge.test:21 This is an example - ``` - """ - root_logger = logging.getLogger() - - # Unfortunately, the `gridemissions` package adds a handler to the root logger - # which means that the output of other loggers propagates up and is printed - # twice. Remove the root handlers to avoid this. - for handler in root_logger.handlers: - root_logger.removeHandler(handler) - - oge_logger = logging.getLogger("oge") - log_format = "%(asctime)s [%(levelname)4s] %(name)s:%(lineno)s %(message)s" - - # Direct the output of the OGE logger to the terminal (and color it). Make - # sure this hasn't been done already to avoid adding duplicate handlers. - if len(oge_logger.handlers) == 0: - coloredlogs.install(fmt=log_format, level=level, logger=oge_logger) - oge_logger.addHandler(logging.NullHandler()) - - # Send everything to the log file by adding a file handler to the root logger. 
- if logfile is not None: - make_containing_folder(logfile) - file_logger = logging.FileHandler(logfile, mode='w') - file_logger.setFormatter(logging.Formatter(log_format)) - - if file_logger not in root_logger.handlers: - root_logger.addHandler(file_logger) - + """Configure the OGE logger to print to the console, and optionally to a file. + + This function is safe to call multiple times, since it will check if logging + handlers have already been installed and skip them if so. + + Logging is printed with the same format as PUDL: + ``` + 2023-02-21 16:10:44 [INFO] oge.test:21 This is an example + ``` + """ + root_logger = logging.getLogger() + + # Unfortunately, the `gridemissions` package adds a handler to the root logger + # which means that the output of other loggers propagates up and is printed + # twice. Remove the root handlers to avoid this. + for handler in root_logger.handlers: + root_logger.removeHandler(handler) + + oge_logger = logging.getLogger("oge") + log_format = "%(asctime)s [%(levelname)4s] %(name)s:%(lineno)s %(message)s" + + # Direct the output of the OGE logger to the terminal (and color it). Make + # sure this hasn't been done already to avoid adding duplicate handlers. + if len(oge_logger.handlers) == 0: + coloredlogs.install(fmt=log_format, level=level, logger=oge_logger) + oge_logger.addHandler(logging.NullHandler()) + + # Send everything to the log file by adding a file handler to the root logger. + if logfile is not None: + make_containing_folder(logfile) + file_logger = logging.FileHandler(logfile, mode="w") + file_logger.setFormatter(logging.Formatter(log_format)) + + if file_logger not in root_logger.handlers: + root_logger.addHandler(file_logger) diff --git a/src/output_data.py b/src/output_data.py index 23eb9824..8769abed 100644 --- a/src/output_data.py +++ b/src/output_data.py @@ -117,7 +117,7 @@ def zip_data_for_zenodo(year): def output_intermediate_data(df, file_name, path_prefix, year, skip_outputs): column_checks.check_columns(df, file_name) if not skip_outputs: - logger.info(f" Exporting {file_name} to data/outputs") + logger.info(f"Exporting {file_name} to data/outputs") df.to_csv(outputs_folder(f"{path_prefix}{file_name}_{year}.csv"), index=False) @@ -126,7 +126,7 @@ def output_to_results( ): # Always check columns that should not be negative. 
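     # ("small" test runs are detected from the output path: data_pipeline.py
     # sets path_prefix to start with "small/" when --small is passed.)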
small = "small" in path_prefix - logger.info(f" Exporting {file_name} to data/results/{path_prefix}{subfolder}") + logger.info(f"Exporting {file_name} to data/results/{path_prefix}{subfolder}") if include_metric: metric = convert_results(df) @@ -154,7 +154,7 @@ def output_to_results( def output_data_quality_metrics(df, file_name, path_prefix, skip_outputs): if not skip_outputs: logger.info( - f" Exporting {file_name} to data/results/{path_prefix}data_quality_metrics" + f"Exporting {file_name} to data/results/{path_prefix}data_quality_metrics" ) # TODO: Add column checks diff --git a/src/validation.py b/src/validation.py index a50535a0..1f176020 100644 --- a/src/validation.py +++ b/src/validation.py @@ -272,7 +272,7 @@ def test_for_missing_energy_source_code(df): def check_non_missing_cems_co2_values_unchanged(cems_original, cems): """Checks that no non-missing CO2 values were modified during the process of filling.""" logger.info( - " Checking that original CO2 data in CEMS was not modified by filling missing values...", + "Checking that original CO2 data in CEMS was not modified by filling missing values...", ) # only keep non-zero and non-missing co2 values, since these should have not been modified cems_original = cems_original.loc[ From 4af25b1eeb341626c4139551cda1d7ec7833e25f Mon Sep 17 00:00:00 2001 From: grgmiller Date: Tue, 28 Feb 2023 11:20:58 -0800 Subject: [PATCH 22/27] update logger configuration location --- src/data_pipeline.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/data_pipeline.py b/src/data_pipeline.py index ab87e491..c687a385 100644 --- a/src/data_pipeline.py +++ b/src/data_pipeline.py @@ -24,11 +24,6 @@ from logging_util import get_logger, configure_root_logger -# Log the print statements to a file for debugging. -configure_root_logger(logfile=outputs_folder("data_pipeline.log")) -logger = get_logger("data_pipeline") - - def get_args() -> argparse.Namespace: """Specify arguments here. @@ -66,7 +61,7 @@ def get_args() -> argparse.Namespace: return args -def print_args(args: argparse.Namespace): +def print_args(args: argparse.Namespace, logger): """Print out the command line arguments.""" argstring = "\n".join([f" * {k} = {v}" for k, v in vars(args).items()]) logger.info(f"\n\nRunning with the following options:\n{argstring}\n") @@ -75,9 +70,14 @@ def print_args(args: argparse.Namespace): def main(): """Runs the OGE data pipeline.""" args = get_args() - print_args(args) - year = args.year + + # Log the print statements to a file for debugging. 
+ configure_root_logger(logfile=outputs_folder(f"{year}/data_pipeline.log")) + logger = get_logger("data_pipeline") + + print_args(args, logger) + logger.info(f"Running data pipeline for year {year}") validation.validate_year(year) @@ -550,11 +550,6 @@ def main(): hourly_consumed_calc.run() hourly_consumed_calc.output_results() - # move the log file into the specific year output folder - shutil.move( - outputs_folder("data_pipeline.log"), outputs_folder(f"{year}/data_pipeline.log") - ) - if __name__ == "__main__": main() From 7610ac80daeabac0a63f50aa1f2c1124562ac6c2 Mon Sep 17 00:00:00 2001 From: grgmiller Date: Tue, 28 Feb 2023 13:06:03 -0800 Subject: [PATCH 23/27] change order of fillna --- src/output_data.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/output_data.py b/src/output_data.py index 1222dcd3..7d9bc86e 100644 --- a/src/output_data.py +++ b/src/output_data.py @@ -275,12 +275,9 @@ def write_generated_averages(ba_fuel_data, year, path_prefix, skip_outputs): avg_fuel_type_production[f"{emission}_mass_lb{emission_type}"] / avg_fuel_type_production["net_generation_mwh"] ) - .fillna(0) .replace(np.inf, np.NaN) .replace(-np.inf, np.NaN) - .replace( - np.NaN, 0 - ) # TODO: temporary placeholder while solar is broken. Eventually there should be no NaNs. + .fillna(0) # TODO: temporary placeholder while solar is broken. Eventually there should be no NaNs. ) output_intermediate_data( avg_fuel_type_production, @@ -515,9 +512,9 @@ def add_generated_emission_rate_columns(df): df[f"{emission}_mass_lb{emission_type}"] / df["net_generation_mwh"] ) - .fillna(0) .replace(np.inf, np.NaN) .replace(-np.inf, np.NaN) + .fillna(0) ) # Set negative rates to zero, following eGRID methodology df.loc[df[col_name] < 0, col_name] = 0 From 60d6374b39fc01e20685a45626d5e5d3aaa0a308 Mon Sep 17 00:00:00 2001 From: grgmiller Date: Tue, 28 Feb 2023 13:25:36 -0800 Subject: [PATCH 24/27] fill zeros only when in denominator --- src/output_data.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/output_data.py b/src/output_data.py index 7d9bc86e..a3d891af 100644 --- a/src/output_data.py +++ b/src/output_data.py @@ -277,7 +277,7 @@ def write_generated_averages(ba_fuel_data, year, path_prefix, skip_outputs): ) .replace(np.inf, np.NaN) .replace(-np.inf, np.NaN) - .fillna(0) # TODO: temporary placeholder while solar is broken. Eventually there should be no NaNs. + .fillna(0) ) output_intermediate_data( avg_fuel_type_production, @@ -514,8 +514,15 @@ def add_generated_emission_rate_columns(df): ) .replace(np.inf, np.NaN) .replace(-np.inf, np.NaN) - .fillna(0) ) + # where the rate is missing because of a divide by zero (i.e. + # net_generation_mwh is zero), replace the emission rate with + # zero. 
We want to keep all other NAs so that they get flagged + # by our validation checks since this indicates an unexpected + # issue + df.loc[df["net_generation_mwh"] == 0, col_name] = df.loc[ + df["net_generation_mwh"] == 0, col_name + ].fillna(0) # Set negative rates to zero, following eGRID methodology df.loc[df[col_name] < 0, col_name] = 0 return df From a588334144f00bb4a20bd161a92725d9053f372f Mon Sep 17 00:00:00 2001 From: grgmiller Date: Wed, 1 Mar 2023 08:48:43 -0800 Subject: [PATCH 25/27] change logfile location --- src/data_pipeline.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/data_pipeline.py b/src/data_pipeline.py index c687a385..e9735407 100644 --- a/src/data_pipeline.py +++ b/src/data_pipeline.py @@ -71,15 +71,6 @@ def main(): """Runs the OGE data pipeline.""" args = get_args() year = args.year - - # Log the print statements to a file for debugging. - configure_root_logger(logfile=outputs_folder(f"{year}/data_pipeline.log")) - logger = get_logger("data_pipeline") - - print_args(args, logger) - - logger.info(f"Running data pipeline for year {year}") - validation.validate_year(year) # 0. Set up directory structure @@ -111,6 +102,17 @@ def main(): exist_ok=True, ) + # configure the logger + # Log the print statements to a file for debugging. + configure_root_logger( + logfile=results_folder(f"{year}/data_quality_metrics/data_pipeline.log") + ) + logger = get_logger("data_pipeline") + + print_args(args, logger) + + logger.info(f"Running data pipeline for year {year}") + # 1. Download data #################################################################################### logger.info("1. Downloading data") From 396a937efaceab64755ef89c1df98f09f37942e4 Mon Sep 17 00:00:00 2001 From: grgmiller Date: Wed, 1 Mar 2023 08:53:22 -0800 Subject: [PATCH 26/27] move year validation after logger config --- src/data_pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/data_pipeline.py b/src/data_pipeline.py index e9735407..88678e2f 100644 --- a/src/data_pipeline.py +++ b/src/data_pipeline.py @@ -71,7 +71,6 @@ def main(): """Runs the OGE data pipeline.""" args = get_args() year = args.year - validation.validate_year(year) # 0. Set up directory structure path_prefix = "" if not args.small else "small/" @@ -108,9 +107,9 @@ def main(): logfile=results_folder(f"{year}/data_quality_metrics/data_pipeline.log") ) logger = get_logger("data_pipeline") - print_args(args, logger) + validation.validate_year(year) logger.info(f"Running data pipeline for year {year}") # 1. Download data From 4bd4c54a1887e9a9cdd6465ec700164c028f1c76 Mon Sep 17 00:00:00 2001 From: grgmiller Date: Wed, 1 Mar 2023 10:58:23 -0800 Subject: [PATCH 27/27] change directory creation order --- src/data_pipeline.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/data_pipeline.py b/src/data_pipeline.py index 88678e2f..dc78b996 100644 --- a/src/data_pipeline.py +++ b/src/data_pipeline.py @@ -72,6 +72,17 @@ def main(): args = get_args() year = args.year + # configure the logger + # Log the print statements to a file for debugging. + configure_root_logger( + logfile=results_folder(f"{year}/data_quality_metrics/data_pipeline.log") + ) + logger = get_logger("data_pipeline") + print_args(args, logger) + + logger.info(f"Running data pipeline for year {year}") + validation.validate_year(year) + # 0. 
Set up directory structure path_prefix = "" if not args.small else "small/" path_prefix += "flat/" if args.flat else "" @@ -101,17 +112,6 @@ def main(): exist_ok=True, ) - # configure the logger - # Log the print statements to a file for debugging. - configure_root_logger( - logfile=results_folder(f"{year}/data_quality_metrics/data_pipeline.log") - ) - logger = get_logger("data_pipeline") - print_args(args, logger) - - validation.validate_year(year) - logger.info(f"Running data pipeline for year {year}") - # 1. Download data #################################################################################### logger.info("1. Downloading data")
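
Note on the emission-rate handling in patches 23 and 24: the intended order of
operations is easier to see outside the diff context. Below is a minimal,
self-contained sketch; the toy data and `col_name` value are illustrative,
while the column names and the replace/fill/clip sequence mirror the diffs:

```python
import numpy as np
import pandas as pd

# Toy stand-in for the plant-level output frame.
df = pd.DataFrame(
    {
        "net_generation_mwh": [100.0, 0.0, 0.0, 50.0],
        "co2_mass_lb": [2000.0, 0.0, np.nan, np.nan],
    }
)

col_name = "generated_co2_rate_lb_per_mwh"

# 1. Compute the rate; +/-inf (x/0) becomes NaN, and the blanket fillna(0)
#    no longer runs before the inf replacement (patch 23).
df[col_name] = (
    (df["co2_mass_lb"] / df["net_generation_mwh"])
    .replace(np.inf, np.nan)
    .replace(-np.inf, np.nan)
)

# 2. Fill zero only where the denominator is zero (patch 24); a NaN rate with
#    nonzero generation is kept so downstream validation can flag it.
mask = df["net_generation_mwh"] == 0
df.loc[mask, col_name] = df.loc[mask, col_name].fillna(0)

# 3. Clip negative rates to zero, following eGRID methodology.
df.loc[df[col_name] < 0, col_name] = 0
```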