Merge pull request #432 from cal-adapt/upgrade-gen-wl-tbls

Improve readability of `generate_gwl_tables.py`
cal-adapt · Sep 19, 2024 · 0b3067d · 0b3067d
2 parents c7e6291 + 60d75d5
commit 0b3067d
Showing 1 changed file with 220 additions and 46 deletions.
diff --git a/climakitae/util/generate_gwl_tables.py b/climakitae/util/generate_gwl_tables.py
@@ -14,9 +14,14 @@
 
 
 def main():
-    """Call everything needed to write the global warming level reference files
-    for all of the available GCMs."""
+    """
+    Generates global warming level (GWL) reference files for all available CMIP6 GCMs and CESM2-LENS.
 
+    This includes:
+    - Connecting to AWS S3 storage to access CMIP6 and CESM2-LENS data.
+    - Filtering and processing data to create global temperature time series.
+    - Generating and saving warming level tables in CSV format for different reference periods.
+    """
     # Connect to AWS S3 storage
     fs = s3fs.S3FileSystem(anon=True)
 
@@ -52,39 +57,100 @@ def main():
     }
     ens_mem_cesm_rev = dict([(v, k) for k, v in ens_mems_cesm.items()])
 
+    def make_weighted_timeseries(temp):
+        """
+        Creates a spatially-weighted single-dimension time series of global temperature.
+
+        The function weights the latitude grids by size and averages across all longitudes,
+        resulting in a single time series object.
+
+        Parameters:
+        ----------
+        temp : xarray.DataArray
+            An xarray DataArray of global temperature with latitude and longitude coordinates.
+
+        Returns:
+        -------
+        xarray.DataArray
+            A time series of global temperature that is spatially weighted across latitudes and averaged
+            across all longitudes.
+        """
+        # Find variable names for latitude and longitude to make code more readable
+        lat, lon = "lat", "lon"
+        if "lat" not in temp.coords and "lon" not in temp.coords:
+            lat, lon = "latitude", "longitude"
+
+        # Weight latitude grids by size, then average across all longitudes to create single time-series object
+        weightlat = np.sqrt(np.cos(np.deg2rad(temp[lat])))
+        weightlat = weightlat / np.sum(weightlat)
+        timeseries = (temp * weightlat).sum(lat).mean(lon)
+        return timeseries
+
     def buildDFtimeSeries_cesm2(variable, model, ens_mem, scenarios):
+        """
+        Builds a global temperature time series by weighting latitudes and averaging longitudes
+        for the CESM2 model across specified scenarios from 1980 to 2100.
+
+        Parameters:
+        ----------
+        variable : str
+            The variable to process, hardcoded to 'tas' (Air Temperature at 2m).
+        model : str
+            The model name, e.g., 'CESM2'.
+        ens_mem : str
+            The ensemble member ID.
+        scenarios : list
+            A list of scenario names to include in the time series, e.g., ['historical', 'ssp370'].
+
+        Returns:
+        -------
+        xarray.Dataset
+            A dataset containing the global temperature time series for each scenario.
+        """
         temp = cesm2_lens.sel(member_id=ens_mem)
         data_one_model = xr.Dataset()
         for scenario in scenarios:
-            weightlat = np.sqrt(np.cos(np.deg2rad(temp.lat)))
-            weightlat = weightlat / np.sum(weightlat)
-            timeseries = (temp * weightlat).sum("lat").mean("lon")
+
+            # Create global weighted time-series of variable
+            timeseries = make_weighted_timeseries(temp)
             timeseries = timeseries.sortby("time")
+
             data_one_model[scenario] = timeseries
         return data_one_model
 
     def build_timeseries(variable, model, ens_mem, scenarios):
-        """Builds an xarray Dataset with only a time dimension, for the appended
-        historical+ssp timeseries for all the scenarios of a particular
-        model/variant combo. Works for all of the models(/GCMs) in the list
-        models_for_now, which appear in the current data catalog of WRF
-        downscaling."""
+        """
+        Builds an xarray Dataset with a time dimension, containing the concatenated historical
+        and SSP time series for all specified scenarios of a given model and ensemble member.
+        Works for all of the models(/GCMs) in the list `models`, which appear in the current
+        data catalog of WRF downscaling.
+
+        Parameters:
+        ----------
+        variable : str
+            The variable to process, e.g., `tas`. `tas` is the only variable used in this file currently.
+        model : str
+            The model name.
+        ens_mem : str
+            The ensemble member ID.
+        scenarios : list
+            A list of scenario names to include, e.g., ['historical', 'ssp585', 'ssp370'].
+
+        Returns:
+        -------
+        xarray.Dataset
+            A dataset with time as the dimension, containing the appended historical and SSP time series.
+        """
         scenario = "historical"
         data_historical = xr.Dataset()
         df_scenario = df_subset[
             (df_subset.source_id == model) & (df_subset.member_id == ens_mem)
         ]
         with xr.open_zarr(fs.get_mapper(df_scenario.zstore.values[0])) as temp:
-            try:
-                weightlat = np.sqrt(np.cos(np.deg2rad(temp.lat)))
-                weightlat = weightlat / np.sum(weightlat)
-                data_historical = (temp[variable] * weightlat).sum("lat").mean("lon")
-            except:
-                weightlat = np.sqrt(np.cos(np.deg2rad(temp.latitude)))
-                weightlat = weightlat / np.sum(weightlat)
-                data_historical = (
-                    (temp[variable] * weightlat).sum("latitude").mean("longitude")
-                )
+
+            # Create global weighted time-series of variable
+            data_historical = make_weighted_timeseries(temp[variable])
+
             if model == "FGOALS-g3" or (
                 model == "EC-Earth3-Veg" and ens_mem == "r5i1p1f1"
             ):
@@ -107,20 +173,11 @@ def build_timeseries(variable, model, ens_mem, scenarios):
                     ) as temp:  # BUG: Some scenarios not returning a full predictive period of values (i.e. not returning time period of 2015-2100 of data)
                         temp = temp.isel(time=slice(0, 1032))
                         temp = xr.decode_cf(temp)
-                        try:
-                            weightlat = np.sqrt(np.cos(np.deg2rad(temp.lat)))
-                            weightlat = weightlat / np.sum(weightlat)
-                            timeseries = (
-                                (temp[variable] * weightlat).sum("lat").mean("lon")
-                            )
-                        except:
-                            weightlat = np.sqrt(np.cos(np.deg2rad(temp.latitude)))
-                            weightlat = weightlat / np.sum(weightlat)
-                            timeseries = (
-                                (temp[variable] * weightlat)
-                                .sum("latitude")
-                                .mean("longitude")
-                            )
+
+                        # Create global weighted time-series of variable
+                        timeseries = make_weighted_timeseries(temp[variable])
+
+                        # Clean data and append to `data_one_model`
                         timeseries = timeseries.sortby(
                             "time"
                         )  # needed for MPI-ESM1-2-LR
@@ -130,9 +187,23 @@ def build_timeseries(variable, model, ens_mem, scenarios):
         return data_one_model
 
     def get_gwl(smoothed, degree):
-        """Takes a smoothed timeseries of global mean temperature for multiple
-        scenarios, and returns a small table of the timestamp that a given
-        global warming level is reached."""
+        """
+        Computes the timestamp when a given GWL is first reached.
+        Takes a smoothed time series of global mean temperature of different scenarios for a model
+        and returns a table indicating the timestamp at which the specified warming level is reached.
+
+        Parameters:
+        ----------
+        smoothed : pandas.DataFrame
+            A DataFrame containing a global mean temperature time series for a model for multiple scenarios.
+        degree : float
+            The global warming level to detect, e.g., 1.5, 2, etc.
+
+        Returns:
+        -------
+        pandas.DataFrame
+            A table containing timestamps for when each scenario first crosses the specified warming level.
+        """
 
         def get_wl_timestamp(scenario, degree):
             """
@@ -151,7 +222,29 @@ def get_wl_timestamp(scenario, degree):
     def get_table_one_cesm2(
         variable, model, ens_mem, scenarios, start_year="18500101", end_year="19000101"
     ):
-        """Generates GWL information for one ensemble member of CESM2."""
+        """
+        Generates a GWL lookup table for one ensemble member of CESM2.
+
+        Parameters:
+        ----------
+        variable : str
+            The variable to process, e.g., 'tas'.
+        model : str
+            The model name, e.g., 'CESM2'.
+        ens_mem : str
+            The ensemble member ID.
+        scenarios : list
+            A list of scenario names, e.g., ['historical', 'ssp370'].
+        start_year : str, optional
+            The start year for the reference period in the format 'YYYYMMDD'.
+        end_year : str, optional
+            The end year for the reference period in the format 'YYYYMMDD'.
+
+        Returns:
+        -------
+        tuple
+            A DataFrame of warming levels and a DataFrame of global mean temperature time series.
+        """
         data_one_model = buildDFtimeSeries_cesm2(variable, model, ens_mem, scenarios)
         anom = data_one_model - data_one_model.sel(
             time=slice(start_year, end_year)
@@ -174,7 +267,27 @@ def get_table_one_cesm2(
     def get_table_cesm2(
         variable, model, scenarios, start_year="18500101", end_year="19000101"
     ):
-        """Generates GWL table for CESM2 model."""
+        """
+        Generates a GWL table for the CESM2 model.
+
+        Parameters:
+        ----------
+        variable : str
+            The variable to process, e.g., 'tas'.
+        model : str
+            The model name, e.g., 'CESM2'.
+        scenarios : list
+            A list of scenario names to include, e.g., ['ssp370'].
+        start_year : str, optional
+            The start year for the reference period in the format 'YYYYMMDD'.
+        end_year : str, optional
+            The end year for the reference period in the format 'YYYYMMDD'.
+
+        Returns:
+        -------
+        tuple
+            A DataFrame of warming levels and a DataFrame of global mean temperature time series for the CESM2 model.
+        """
         ens_mem_list = [v for k, v in ens_mems_cesm.items()]
         gwlevels_tbl, wl_data_tbls = [], []
         for ens_mem in ens_mem_list:
@@ -198,9 +311,32 @@ def get_table_cesm2(
     def get_gwl_table_one(
         variable, model, ens_mem, scenarios, start_year="18500101", end_year="19000101"
     ):
-        """Loops through global warming levels, and returns an aggregate table
-        for all warming levels (1.5, 2, 2.5, 3, and 4 degrees) for all scenarios of
-        the model/variant requested."""
+        """
+        Generates a GWL table for a single model and ensemble member.
+
+        Loops through various global warming levels (1.5, 2, 2.5, 3, and 4 degrees)
+        for the requested model/variant and scenarios.
+
+        Parameters:
+        ----------
+        variable : str
+            The variable to process, e.g., 'tas'.
+        model : str
+            The model name.
+        ens_mem : str
+            The ensemble member ID.
+        scenarios : list
+            A list of scenario names to include, e.g., ['historical', 'ssp585', 'ssp370'].
+        start_year : str, optional
+            The start year for the reference period in the format 'YYYYMMDD'.
+        end_year : str, optional
+            The end year for the reference period in the format 'YYYYMMDD'.
+
+        Returns:
+        -------
+        tuple
+            A DataFrame containing warming levels and a DataFrame with global mean temperature time series.
+        """
         data_one_model = build_timeseries(variable, model, ens_mem, scenarios)
         try:
             anom = data_one_model - data_one_model.sel(
@@ -241,10 +377,31 @@ def get_gwl_table_one(
     def get_gwl_table(
         variable, model, scenarios, start_year="18500101", end_year="19000101"
     ):
-        """Generates GWL table for a given model and scenarios. Used on all GCMs available in AWS catalog, not including CESM2-LENS."""
-        ens_mem_list = sims_on_aws.T[model]["historical"]
+        """
+        Generates a GWL table for a given model and scenarios.
+
+        Parameters:
+        ----------
+        variable : str
+            The variable to process, e.g., 'tas'.
+        model : str
+            The model name.
+        scenarios : list
+            A list of scenario names to include, e.g., ['ssp585', 'ssp370', 'ssp245'].
+        start_year : str, optional
+            The start year for the reference period in the format 'YYYYMMDD'.
+        end_year : str, optional
+            The end year for the reference period in the format 'YYYYMMDD'.
+
+        Returns:
+        -------
+        tuple
+            A DataFrame containing warming levels and a DataFrame with global mean temperature time series.
+            To be exported into `gwl_[time period]ref.csv` and `gwl_[time period]ref_timeidx.csv`.
+        """
+        ens_mem_list = sims_on_aws.T[model]["historical"].copy()
         if (model == "EC-Earth3") or (model == "EC-Earth3-Veg"):
-            for ens_mem in ens_mem_list:
+            for ens_mem in ens_mem_list[:]:
                 if int(ens_mem.split("r")[1].split("i")[0]) > 100:
                     # These ones were branched off another at 1970
                     ens_mem_list.remove(ens_mem)
@@ -355,7 +512,24 @@ def get_gwl_table(
 
 def get_sims_on_aws(df):
     """
-    Make a table of all of the relevant CMIP6 simulations on AWS.
+    Generates a pandas DataFrame listing all relevant CMIP6 simulations available on AWS.
+
+    This function filters the input DataFrame `df` and identifies and lists CMIP6 model simulations
+    for historical and various SSP (Shared Socioeconomic Pathway) scenarios. It only includes
+    models that have both historical and at least one SSP ensemble member. Additionally, it ensures
+    that only historical ensemble members with variants in at least one SSP are kept.
+
+    Parameters:
+    ----------
+    df : pandas.DataFrame
+        A DataFrame containing metadata for CMIP6 simulations.
+
+    Returns:
+    -------
+    pandas.DataFrame
+        A DataFrame indexed by model names (source_id) and columns corresponding to scenarios
+        ('historical', 'ssp585', 'ssp370', 'ssp245', 'ssp126'). Each cell contains a list of
+        ensemble member IDs available on AWS for that model and scenario.
     """
     df_subset = df[
         (df.table_id == "Amon")