diff --git a/climakitae/util/generate_gwl_tables.py b/climakitae/util/generate_gwl_tables.py index 2548b8f2..20cd7511 100644 --- a/climakitae/util/generate_gwl_tables.py +++ b/climakitae/util/generate_gwl_tables.py @@ -14,9 +14,14 @@ def main(): - """Call everything needed to write the global warming level reference files - for all of the available GCMs.""" + """ + Generates global warming level (GWL) reference files for all available CMIP6 GCMs and CESM2-LENS. + This includes: + - Connecting to AWS S3 storage to access CMIP6 and CESM2-LENS data. + - Filtering and processing data to create global temperature time series. + - Generating and saving warming level tables in CSV format for different reference periods. + """ # Connect to AWS S3 storage fs = s3fs.S3FileSystem(anon=True) @@ -52,39 +57,100 @@ def main(): } ens_mem_cesm_rev = dict([(v, k) for k, v in ens_mems_cesm.items()]) + def make_weighted_timeseries(temp): + """ + Creates a spatially-weighted single-dimension time series of global temperature. + + The function weights the latitude grids by size and averages across all longitudes, + resulting in a single time series object. + + Parameters: + ---------- + temp : xarray.DataArray + An xarray DataArray of global temperature with latitude and longitude coordinates. + + Returns: + ------- + xarray.DataArray + A time series of global temperature that is spatially weighted across latitudes and averaged + across all longitudes. + """ + # Find variable names for latitude and longitude to make code more readable + lat, lon = "lat", "lon" + if "lat" not in temp.coords and "lon" not in temp.coords: + lat, lon = "latitude", "longitude" + + # Weight latitude grids by size, then average across all longitudes to create single time-series object + weightlat = np.sqrt(np.cos(np.deg2rad(temp[lat]))) + weightlat = weightlat / np.sum(weightlat) + timeseries = (temp * weightlat).sum(lat).mean(lon) + return timeseries + def buildDFtimeSeries_cesm2(variable, model, ens_mem, scenarios): + """ + Builds a global temperature time series by weighting latitudes and averaging longitudes + for the CESM2 model across specified scenarios from 1980 to 2100. + + Parameters: + ---------- + variable : str + The variable to process, hardcoded to 'tas' (Air Temperature at 2m). + model : str + The model name, e.g., 'CESM2'. + ens_mem : str + The ensemble member ID. + scenarios : list + A list of scenario names to include in the time series, e.g., ['historical', 'ssp370']. + + Returns: + ------- + xarray.Dataset + A dataset containing the global temperature time series for each scenario. + """ temp = cesm2_lens.sel(member_id=ens_mem) data_one_model = xr.Dataset() for scenario in scenarios: - weightlat = np.sqrt(np.cos(np.deg2rad(temp.lat))) - weightlat = weightlat / np.sum(weightlat) - timeseries = (temp * weightlat).sum("lat").mean("lon") + + # Create global weighted time-series of variable + timeseries = make_weighted_timeseries(temp) timeseries = timeseries.sortby("time") + data_one_model[scenario] = timeseries return data_one_model def build_timeseries(variable, model, ens_mem, scenarios): - """Builds an xarray Dataset with only a time dimension, for the appended - historical+ssp timeseries for all the scenarios of a particular - model/variant combo. Works for all of the models(/GCMs) in the list - models_for_now, which appear in the current data catalog of WRF - downscaling.""" + """ + Builds an xarray Dataset with a time dimension, containing the concatenated historical + and SSP time series for all specified scenarios of a given model and ensemble member. + Works for all of the models(/GCMs) in the list `models`, which appear in the current + data catalog of WRF downscaling. + + Parameters: + ---------- + variable : str + The variable to process, e.g., `tas`. `tas` is the only variable used in this file currently. + model : str + The model name. + ens_mem : str + The ensemble member ID. + scenarios : list + A list of scenario names to include, e.g., ['historical', 'ssp585', 'ssp370']. + + Returns: + ------- + xarray.Dataset + A dataset with time as the dimension, containing the appended historical and SSP time series. + """ scenario = "historical" data_historical = xr.Dataset() df_scenario = df_subset[ (df_subset.source_id == model) & (df_subset.member_id == ens_mem) ] with xr.open_zarr(fs.get_mapper(df_scenario.zstore.values[0])) as temp: - try: - weightlat = np.sqrt(np.cos(np.deg2rad(temp.lat))) - weightlat = weightlat / np.sum(weightlat) - data_historical = (temp[variable] * weightlat).sum("lat").mean("lon") - except: - weightlat = np.sqrt(np.cos(np.deg2rad(temp.latitude))) - weightlat = weightlat / np.sum(weightlat) - data_historical = ( - (temp[variable] * weightlat).sum("latitude").mean("longitude") - ) + + # Create global weighted time-series of variable + data_historical = make_weighted_timeseries(temp[variable]) + if model == "FGOALS-g3" or ( model == "EC-Earth3-Veg" and ens_mem == "r5i1p1f1" ): @@ -107,20 +173,11 @@ def build_timeseries(variable, model, ens_mem, scenarios): ) as temp: # BUG: Some scenarios not returning a full predictive period of values (i.e. not returning time period of 2015-2100 of data) temp = temp.isel(time=slice(0, 1032)) temp = xr.decode_cf(temp) - try: - weightlat = np.sqrt(np.cos(np.deg2rad(temp.lat))) - weightlat = weightlat / np.sum(weightlat) - timeseries = ( - (temp[variable] * weightlat).sum("lat").mean("lon") - ) - except: - weightlat = np.sqrt(np.cos(np.deg2rad(temp.latitude))) - weightlat = weightlat / np.sum(weightlat) - timeseries = ( - (temp[variable] * weightlat) - .sum("latitude") - .mean("longitude") - ) + + # Create global weighted time-series of variable + timeseries = make_weighted_timeseries(temp[variable]) + + # Clean data and append to `data_one_model` timeseries = timeseries.sortby( "time" ) # needed for MPI-ESM1-2-LR @@ -130,9 +187,23 @@ def build_timeseries(variable, model, ens_mem, scenarios): return data_one_model def get_gwl(smoothed, degree): - """Takes a smoothed timeseries of global mean temperature for multiple - scenarios, and returns a small table of the timestamp that a given - global warming level is reached.""" + """ + Computes the timestamp when a given GWL is first reached. + Takes a smoothed time series of global mean temperature of different scenarios for a model + and returns a table indicating the timestamp at which the specified warming level is reached. + + Parameters: + ---------- + smoothed : pandas.DataFrame + A DataFrame containing a global mean temperature time series for a model for multiple scenarios. + degree : float + The global warming level to detect, e.g., 1.5, 2, etc. + + Returns: + ------- + pandas.DataFrame + A table containing timestamps for when each scenario first crosses the specified warming level. + """ def get_wl_timestamp(scenario, degree): """ @@ -151,7 +222,29 @@ def get_wl_timestamp(scenario, degree): def get_table_one_cesm2( variable, model, ens_mem, scenarios, start_year="18500101", end_year="19000101" ): - """Generates GWL information for one ensemble member of CESM2.""" + """ + Generates a GWL lookup table for one ensemble member of CESM2. + + Parameters: + ---------- + variable : str + The variable to process, e.g., 'tas'. + model : str + The model name, e.g., 'CESM2'. + ens_mem : str + The ensemble member ID. + scenarios : list + A list of scenario names, e.g., ['historical', 'ssp370']. + start_year : str, optional + The start year for the reference period in the format 'YYYYMMDD'. + end_year : str, optional + The end year for the reference period in the format 'YYYYMMDD'. + + Returns: + ------- + tuple + A DataFrame of warming levels and a DataFrame of global mean temperature time series. + """ data_one_model = buildDFtimeSeries_cesm2(variable, model, ens_mem, scenarios) anom = data_one_model - data_one_model.sel( time=slice(start_year, end_year) @@ -174,7 +267,27 @@ def get_table_one_cesm2( def get_table_cesm2( variable, model, scenarios, start_year="18500101", end_year="19000101" ): - """Generates GWL table for CESM2 model.""" + """ + Generates a GWL table for the CESM2 model. + + Parameters: + ---------- + variable : str + The variable to process, e.g., 'tas'. + model : str + The model name, e.g., 'CESM2'. + scenarios : list + A list of scenario names to include, e.g., ['ssp370']. + start_year : str, optional + The start year for the reference period in the format 'YYYYMMDD'. + end_year : str, optional + The end year for the reference period in the format 'YYYYMMDD'. + + Returns: + ------- + tuple + A DataFrame of warming levels and a DataFrame of global mean temperature time series for the CESM2 model. + """ ens_mem_list = [v for k, v in ens_mems_cesm.items()] gwlevels_tbl, wl_data_tbls = [], [] for ens_mem in ens_mem_list: @@ -198,9 +311,32 @@ def get_table_cesm2( def get_gwl_table_one( variable, model, ens_mem, scenarios, start_year="18500101", end_year="19000101" ): - """Loops through global warming levels, and returns an aggregate table - for all warming levels (1.5, 2, 2.5, 3, and 4 degrees) for all scenarios of - the model/variant requested.""" + """ + Generates a GWL table for a single model and ensemble member. + + Loops through various global warming levels (1.5, 2, 2.5, 3, and 4 degrees) + for the requested model/variant and scenarios. + + Parameters: + ---------- + variable : str + The variable to process, e.g., 'tas'. + model : str + The model name. + ens_mem : str + The ensemble member ID. + scenarios : list + A list of scenario names to include, e.g., ['historical', 'ssp585', 'ssp370']. + start_year : str, optional + The start year for the reference period in the format 'YYYYMMDD'. + end_year : str, optional + The end year for the reference period in the format 'YYYYMMDD'. + + Returns: + ------- + tuple + A DataFrame containing warming levels and a DataFrame with global mean temperature time series. + """ data_one_model = build_timeseries(variable, model, ens_mem, scenarios) try: anom = data_one_model - data_one_model.sel( @@ -241,10 +377,31 @@ def get_gwl_table_one( def get_gwl_table( variable, model, scenarios, start_year="18500101", end_year="19000101" ): - """Generates GWL table for a given model and scenarios. Used on all GCMs available in AWS catalog, not including CESM2-LENS.""" - ens_mem_list = sims_on_aws.T[model]["historical"] + """ + Generates a GWL table for a given model and scenarios. + + Parameters: + ---------- + variable : str + The variable to process, e.g., 'tas'. + model : str + The model name. + scenarios : list + A list of scenario names to include, e.g., ['ssp585', 'ssp370', 'ssp245']. + start_year : str, optional + The start year for the reference period in the format 'YYYYMMDD'. + end_year : str, optional + The end year for the reference period in the format 'YYYYMMDD'. + + Returns: + ------- + tuple + A DataFrame containing warming levels and a DataFrame with global mean temperature time series. + To be exported into `gwl_[time period]ref.csv` and `gwl_[time period]ref_timeidx.csv`. + """ + ens_mem_list = sims_on_aws.T[model]["historical"].copy() if (model == "EC-Earth3") or (model == "EC-Earth3-Veg"): - for ens_mem in ens_mem_list: + for ens_mem in ens_mem_list[:]: if int(ens_mem.split("r")[1].split("i")[0]) > 100: # These ones were branched off another at 1970 ens_mem_list.remove(ens_mem) @@ -355,7 +512,24 @@ def get_gwl_table( def get_sims_on_aws(df): """ - Make a table of all of the relevant CMIP6 simulations on AWS. + Generates a pandas DataFrame listing all relevant CMIP6 simulations available on AWS. + + This function filters the input DataFrame `df` and identifies and lists CMIP6 model simulations + for historical and various SSP (Shared Socioeconomic Pathway) scenarios. It only includes + models that have both historical and at least one SSP ensemble member. Additionally, it ensures + that only historical ensemble members with variants in at least one SSP are kept. + + Parameters: + ---------- + df : pandas.DataFrame + A DataFrame containing metadata for CMIP6 simulations. + + Returns: + ------- + pandas.DataFrame + A DataFrame indexed by model names (source_id) and columns corresponding to scenarios + ('historical', 'ssp585', 'ssp370', 'ssp245', 'ssp126'). Each cell contains a list of + ensemble member IDs available on AWS for that model and scenario. """ df_subset = df[ (df.table_id == "Amon")