Add Nudging decay (including reading LastObs files) (#310)
* added last obs df including discharge and ids

* lastobs nc file folder

* formatting and cleanup

* added both discharge and model discharge

* added prediction delta timesteps

* da decay with exp decay incorporated, unblackened for readability

* added check between obs file results and our fvd output

* removed pdbs and set to run on lastobs

* added verbose statements

* restructuring of lastobs df to simplify process

* generalized last timestep index call to automatically determine from various input sizes

* github requested fixes to if statements and cleanup

* updated variable

* working NCAR da decay prototype; need to bring in real lastobs data inside the equation

* getting da values in order through classic da assim technique

* pushing ids and values to mc reach

* fixed gage id matching; cython is broken and can't compile correctly, need to print values to check whether da is working properly

* saving changes, trying to fix old DA function

* restructuring da timeslice file read to use datetime; it was not generalized before

* added last obs df including discharge and ids

* lastobs nc file folder

* formatting and cleanup

* added both discharge and model discharge

* added prediction delta timesteps

* da decay with exp decay incorporated, unblackened for readability

* added check between obs file results and our fvd output

* removed pdbs and set to run on lastobs

* added verbose statements

* restructuring of lastobs df to simplify process

* generalized last timestep index call to automatically determine from various input sizes

* github requested fixes to if statements and cleanup

* updated variable

* generalized da naming convention and date timeframe with improved interpolation

* removed extra comments

* remove dependence on not-yet-created flowveldepth

* name "last_obs_file"

* include data_assimilation_parameters to yaml

* include empty dict for data_assimilation_parameters in yaml

* black

* added paths to shared drive locations and blackened files

* quick merge changes

* files working

* sync for merge

* sync for merge

* sync merge to upstream

* move last obs function next to usgs_da function

* add TODOs

* add lastobs to other parallel modes

* move last_obs above usgs_df

* cimports and cdefs for last_obs

* fixed broken usgs_df; names were changed to folder in some areas and filter in others

* moved da functions into single wrapper in nhd_network_utilities

also harmonized inputs a bit.

* add da yaml file (DATA NOT YET ADDED)

* drop non-5-minute entries from time_slices

Also harmonizing inputs for merge.

* add function for finding the tailwater for a given segment

* add masks

* use pandas date_range

* cleanup

* add comment showing possible handling of extended DA

* Revert "add comment showing possible handling of extended DA"

This reverts commit 0455466.

* temporarily disable last_obs

* Update example yaml with inputs that work

* temporarily disable last_obs

* update comment

* adjust DA for perfect match

* removed filter list

* use efficient shape call for usgs_positions_list length

* add gage_maxtime and pseudocode for lastobs

* Identified major hard-code issue in structured- and structured-obj

* update lastobs comments and pseudocode

* update da test yaml file with additional gage options

* use "reindex" to fill/eliminate columns for usgs_df

* functions in place for decay, but last obs file is behaving incorrectly and changing the parity check even when all uses are turned off

* added decay timestep count

* Use new fields in flowveldepth to simplify initial condition handling

To facilitate this, added a constant qvd_ts_w (flowveldepth timestep width)
to define the standard column width.

* add two additional segments for parity checking

* reconfigure reach splitting to consider gages

* update diffusive call signature

* yaml updates for test

* black

Co-authored-by: James Halgren <james.halgren@noaa.gov>
jhrehanoaa and jameshalgren authored May 28, 2021
1 parent fa93df6 commit e24d032
Showing 8 changed files with 430 additions and 73 deletions.
132 changes: 112 additions & 20 deletions src/python_framework_v02/troute/nhd_io.py
@@ -8,6 +8,9 @@
import numpy as np
from toolz import compose
import dask.array as da
import sys
import math
from datetime import datetime, timedelta


def read_netcdf(geo_file_path):
@@ -186,7 +189,6 @@ def get_ql_from_wrf_hydro_mf(qlat_files, index_col="feature_id", value_col="q_la
2018-01-01 13:00:00 4186117 41.233807 -75.413895 0.006496
```
"""
filter_list = None

with xr.open_mfdataset(
qlat_files,
@@ -339,6 +341,92 @@ def preprocess_time_station_index(xd):
)


def build_last_obs_df(lastobsfile, routelink, wrf_last_obs_flag):
# open routelink_file and extract discharges

ds1 = xr.open_dataset(routelink)
df = ds1.to_dataframe()
df2 = df.loc[df["gages"] != b" "]
df2["gages"] = df2["gages"].astype("int")
df2 = df2[["gages", "to"]]
df2 = df2.reset_index()
df2 = df2.set_index("gages")

with xr.open_dataset(lastobsfile) as ds:
df_model_discharges = ds["model_discharge"].to_dataframe()
df_discharges = ds["discharge"].to_dataframe()
last_ts = df_model_discharges.index.get_level_values("timeInd")[-1]
model_discharge_last_ts = df_model_discharges[
df_model_discharges.index.get_level_values("timeInd") == last_ts
]
discharge_last_ts = df_discharges[
df_discharges.index.get_level_values("timeInd") == last_ts
]
df1 = ds["stationId"].to_dataframe()
df1 = df1.astype(int)
model_discharge_last_ts = model_discharge_last_ts.join(df1)
model_discharge_last_ts = model_discharge_last_ts.join(discharge_last_ts)
model_discharge_last_ts = model_discharge_last_ts.loc[
model_discharge_last_ts["model_discharge"] != -9999.0
]
model_discharge_last_ts = model_discharge_last_ts.reset_index().set_index(
"stationId"
)
model_discharge_last_ts = model_discharge_last_ts.drop(
["stationIdInd", "timeInd"], axis=1
)
model_discharge_last_ts["discharge"] = model_discharge_last_ts[
"discharge"
].to_frame()
# If predict from last_obs file use last obs file results
# if last_obs_file == "error-based":
# elif last_obs_file == "obs-based": # the wrf-hydro default
if wrf_last_obs_flag:
model_discharge_last_ts["last_nudge"] = (
model_discharge_last_ts["discharge"]
- model_discharge_last_ts["model_discharge"]
)
final_df = df2.join(model_discharge_last_ts["discharge"])
final_df = final_df.reset_index()
final_df = final_df.set_index("to")
final_df = final_df.drop(["feature_id", "gages"], axis=1)
final_df = final_df.dropna()

# Else predict from the model outputs from t-route if index doesn't match interrupt computation as the results won't be valid
# else:
# fvd_df = fvd_df
# if len(model_discharge_last_ts.index) == len(fvd_df.index):
# model_discharge_last_ts["last_nudge"] = (
# model_discharge_last_ts["discharge"] - fvd_df[fvd_df.columns[0]]
# )
# else:
# print("THE NUDGING FILE IDS DO NOT MATCH THE FLOWVELDEPTH IDS")
# sys.exit()
# # Predictions created with continuously decreasing deltas until near 0 difference
# a = 120
# prediction_df = pd.DataFrame(index=model_discharge_last_ts.index)

# for time in range(0, 720, 5):
# weight = math.exp(time / -a)
# delta = pd.DataFrame(
# model_discharge_last_ts["last_nudge"] / weight)

# if time == 0:
# prediction_df[str(time)] = model_discharge_last_ts["last_nudge"]
# weight_diff = prediction_df[str(time)] - prediction_df[str(time)]
# else:
# if weight > 0.1:
# prediction_df[str(time)] = (
# delta["last_nudge"] + model_discharge_last_ts["model_discharge"]
# )
# elif weight < -0.1:
# prediction_df[str(time)] = (
# delta["last_nudge"] + model_discharge_last_ts["model_discharge"]
# )
# prediction_df["0"] = model_discharge_last_ts["model_discharge"]
return final_df
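
For reference, a minimal runnable sketch of the decay behavior the commented pseudocode above is aiming for. Note the pseudocode divides the nudge by the weight, whereas a decaying nudge multiplies by exp(-t/a); the sketch below multiplies. The function name `decay_nudge` and the Series-based signature are hypothetical; a = 120 and the 5-minute step over a 720-minute horizon come from the commented values.

```python
import math

import pandas as pd


def decay_nudge(last_nudge, model_discharge, a=120.0, horizon=720, dt=5):
    # Hypothetical sketch: last_nudge and model_discharge are Series
    # indexed by station id. The nudge shrinks by exp(-t / a), so the
    # prediction relaxes from the last observation back toward the model.
    prediction = pd.DataFrame(index=last_nudge.index)
    for t in range(0, horizon, dt):
        weight = math.exp(-t / a)  # 1.0 at t=0, ~0.0025 at t=720
        prediction[str(t)] = model_discharge + weight * last_nudge
    return prediction
```

At t = 0 the prediction equals the last observation; after a few multiples of `a` it is effectively the raw model discharge.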


def get_usgs_from_time_slices_csv(routelink_subset_file, usgs_csv):

df2 = pd.read_csv(usgs_csv, index_col=0)
@@ -421,32 +509,36 @@ def get_usgs_from_time_slices_folder(
usgs_df = usgs_df.drop(["gages", "ascendingIndex", "to"], axis=1)
columns_list = usgs_df.columns

for i in range(0, (len(columns_list) * 3) - 12, 12):
original_string = usgs_df.columns[i]
original_string_shortened = original_string[:-5]
temp_name1 = original_string_shortened + str("05:00")
temp_name2 = original_string_shortened + str("10:00")
temp_name3 = original_string_shortened + str("20:00")
temp_name4 = original_string_shortened + str("25:00")
temp_name5 = original_string_shortened + str("35:00")
temp_name6 = original_string_shortened + str("40:00")
temp_name7 = original_string_shortened + str("50:00")
temp_name8 = original_string_shortened + str("55:00")
usgs_df.insert(i + 1, temp_name1, np.nan)
usgs_df.insert(i + 2, temp_name2, np.nan)
usgs_df.insert(i + 4, temp_name3, np.nan)
usgs_df.insert(i + 5, temp_name4, np.nan)
usgs_df.insert(i + 7, temp_name5, np.nan)
usgs_df.insert(i + 8, temp_name6, np.nan)
usgs_df.insert(i + 10, temp_name7, np.nan)
usgs_df.insert(i + 11, temp_name8, np.nan)
original_string_first = usgs_df.columns[0]
date_time_str = original_string_first[:10] + " " + original_string_first[11:]
date_time_obj_start = datetime.strptime(date_time_str, "%Y-%m-%d %H:%M:%S")

original_string_last = usgs_df.columns[-1]
date_time_str = original_string_last[:10] + " " + original_string_last[11:]
date_time_obj_end = datetime.strptime(date_time_str, "%Y-%m-%d %H:%M:%S")

dates = []
# for j in pd.date_range(date_time_obj_start, date_time_obj_end + timedelta(1), freq="5min"):
for j in pd.date_range(date_time_obj_start, date_time_obj_end, freq="5min"):
dates.append(j.strftime("%Y-%m-%d_%H:%M:00"))

"""
# dates_to_drop = ~usgs_df.columns.isin(dates)
OR
# dates_to_drop = usgs_df.columns.difference(dates)
# dates_to_add = pd.Index(dates).difference(usgs_df.columns)
"""

usgs_df = usgs_df.reindex(columns=dates)

usgs_df = usgs_df.interpolate(method="linear", axis=1)
usgs_df = usgs_df.interpolate(method="linear", axis=1, limit_direction="backward")
usgs_df.drop(usgs_df[usgs_df.iloc[:, 0] == -999999.000000].index, inplace=True)

return usgs_df
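
As a self-contained illustration of the date_range/reindex/interpolate pattern above (station ids and values invented):

```python
import pandas as pd

# Two 15-minute observation columns; reindex onto a 5-minute grid,
# then fill the inserted NaN columns by linear interpolation.
obs = pd.DataFrame(
    [[1.0, 4.0], [10.0, 40.0]],
    index=[4186117, 4186169],
    columns=["2021-05-28_00:00:00", "2021-05-28_00:15:00"],
)
grid = pd.date_range("2021-05-28 00:00", "2021-05-28 00:15", freq="5min")
obs = obs.reindex(columns=[t.strftime("%Y-%m-%d_%H:%M:00") for t in grid])
obs = obs.interpolate(method="linear", axis=1)
# the 00:05 and 00:10 columns now hold 2.0/3.0 and 20.0/30.0
```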


# TODO: Move channel restart above usgs to keep order with execution script
def get_channel_restart_from_csv(
channel_initial_states_file,
index_col=0,
25 changes: 25 additions & 0 deletions src/python_framework_v02/troute/nhd_network.py
@@ -88,6 +88,31 @@ def reverse_network(N):
return rg


def find_tw_for_node(reaches_bytw, node):
# TODO: extend this function (or write a new one) to handle a list of nodes.
# Such functionality might be useful for finding networks corresponding to a
# list of gages, for instance.
"""
reaches_bytw is a dictionary of lists of the form
tw 1:
[ [ seg1, seg2, seg3, ... segn ], # reach 1
[ sega, segb, segc, ... segz ], # reach 2
.
.
.
[ ... ] ] reach n
tw 2:
etc.
"""
for tw, rs in reaches_bytw.items():
for r in rs:
if node in r:
return tw

return None # Node not in reach set.
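
A quick usage sketch for `find_tw_for_node` (segment and tailwater ids invented):

```python
reaches_bytw = {
    10: [[1, 2, 3], [4, 5]],  # two reaches draining to tailwater 10
    20: [[6, 7]],
}
assert find_tw_for_node(reaches_bytw, 5) == 10
assert find_tw_for_node(reaches_bytw, 99) is None  # node in no reach
```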


def junctions(N):
c = Counter(chain.from_iterable(N.values()))
return {k for k, v in c.items() if v > 1}
57 changes: 50 additions & 7 deletions src/python_framework_v02/troute/nhd_network_utilities_v02.py
@@ -419,10 +419,31 @@ def build_connections(supernetwork_parameters, dt):

param_df["dt"] = dt
param_df = param_df.rename(columns=reverse_dict(cols))

wbodies = {}
if "waterbody" in cols:
wbodies = build_waterbodies(
param_df[["waterbody"]], supernetwork_parameters, "waterbody"
)
param_df = param_df.drop("waterbody", axis=1)

gages = {}
if "gages" in cols:
gages = build_gages(param_df[["gages"]])
param_df = param_df.drop("gages", axis=1)

param_df = param_df.astype("float32")

# datasub = data[['dt', 'bw', 'tw', 'twcc', 'dx', 'n', 'ncc', 'cs', 's0']]
return connections, param_df
return connections, param_df, wbodies, gages
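
With this change, callers unpack four values instead of two; a hedged sketch of the updated call site (the surrounding variables are assumed to exist as before):

```python
# build_connections now also returns waterbody and gage mappings;
# supernetwork_parameters and dt are assumed to be defined by the caller.
connections, param_df, wbodies, gages = build_connections(
    supernetwork_parameters, dt
)
```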


def build_gages(segment_gage_df,):
gage_list = list(map(bytes.strip, segment_gage_df.gages.values))
gage_mask = list(map(bytes.isdigit, gage_list))
gages = segment_gage_df.loc[gage_mask, "gages"].to_dict()

return gages
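
To see what the digit mask in `build_gages` keeps and drops, a small invented example:

```python
import pandas as pd

# Gage ids in the routelink are space-padded bytes; blank entries strip
# to b"" and fail bytes.isdigit, so only segments with real gages survive.
segment_gage_df = pd.DataFrame(
    {"gages": [b"01234567       ", b"               ", b"06879650       "]},
    index=[101, 102, 103],
)
gages = build_gages(segment_gage_df)
# {101: b'01234567       ', 103: b'06879650       '}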


def build_waterbodies(
@@ -569,20 +590,42 @@ def build_data_assimilation(data_assimilation_parameters):
data_assimilation_csv = data_assimilation_parameters.get(
"data_assimilation_csv", None
)
data_assimilation_filter = data_assimilation_parameters.get(
"data_assimilation_filter", None
data_assimilation_folder = data_assimilation_parameters.get(
"data_assimilation_timeslices_folder", None
)
# TODO: Fix the Logic here according to the following.

# If there are any observations for data assimilation, there
# needs to be a complete set in the first time set or else
# there must be a "LastObs". If there is a complete set in
# the first time step, the LastObs is optional. If there are
# no observations for assimilation, there can be a LastObs
# with an empty usgs dataframe.

last_obs_file = data_assimilation_parameters.get("wrf_hydro_last_obs_file", None)
last_obs_type = data_assimilation_parameters.get("wrf_last_obs_type", "error-based")
last_obs_crosswalk_file = data_assimilation_parameters.get(
"wrf_hydro_da_channel_ID_crosswalk_file", None
)

last_obs_df = pd.DataFrame()

if last_obs_file:
last_obs_df = nhd_io.build_last_obs_df(
last_obs_file, last_obs_crosswalk_file, last_obs_type,
)

if data_assimilation_csv:
usgs_df = build_data_assimilation_csv(data_assimilation_parameters)
elif data_assimilation_filter:
elif data_assimilation_folder:
usgs_df = build_data_assimilation_folder(data_assimilation_parameters)
return usgs_df
return usgs_df, last_obs_df
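
Based only on the keys read above, a hypothetical parameter dict for exercising the wrapper; every path and filename below is a placeholder, not project data, so the call only succeeds with real files in place:

```python
data_assimilation_parameters = {
    "data_assimilation_timeslices_folder": "usgs_timeslices",  # placeholder
    "data_assimilation_filter": "*.usgsTimeSlice.ncdf",        # placeholder
    "wrf_hydro_da_channel_ID_crosswalk_file": "RouteLink.nc",  # placeholder
    "wrf_hydro_last_obs_file": "nudgingLastObs.nc",            # placeholder
    "wrf_last_obs_type": "obs-based",
}
usgs_df, last_obs_df = build_data_assimilation(data_assimilation_parameters)
```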


def build_data_assimilation_csv(data_assimilation_parameters):

usgs_df = nhd_io.get_usgs_from_time_slices_csv(
data_assimilation_parameters["data_assimilation_parameters_file"],
data_assimilation_parameters["wrf_hydro_da_channel_ID_crosswalk_file"],
data_assimilation_parameters["data_assimilation_csv"],
)

@@ -597,7 +640,7 @@ def build_data_assimilation_folder(data_assimilation_parameters):
).resolve()

usgs_df = nhd_io.get_usgs_from_time_slices_folder(
data_assimilation_parameters["data_assimilation_parameters_file"],
data_assimilation_parameters["wrf_hydro_da_channel_ID_crosswalk_file"],
usgs_timeslices_folder,
data_assimilation_parameters["data_assimilation_filter"],
)