Merge pull request #35 from blaylockbk/handle_different_idx_styles

Handle different idx styles
blaylockbk · Jan 21, 2022 · a49904a · a49904a
2 parents 07a65bb + 9364e31
commit a49904a
Show file tree

Hide file tree

Showing 16 changed files with 3,400 additions and 204 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -39,9 +39,8 @@
       }
     ]
   },
-  "python.testing.pytestArgs": [
-    "tests"
-  ],
+  "python.testing.pytestArgs": ["tests"],
   "python.testing.unittestEnabled": false,
-  "python.testing.pytestEnabled": true
+  "python.testing.pytestEnabled": true,
+  "cSpell.words": ["NCEI"]
 }
diff --git a/docs/user_guide/notebooks/data_hrrr.ipynb b/docs/user_guide/notebooks/data_hrrr.ipynb
diff --git a/docs/user_guide/notebooks/data_rap.ipynb b/docs/user_guide/notebooks/data_rap.ipynb
diff --git a/herbie/archive.py b/herbie/archive.py
@@ -167,7 +167,6 @@ def __init__(
         model=config["default"].get("model"),
         fxx=config["default"].get("fxx"),
         product=config["default"].get("product"),
-        member=config["default"].get("member", 1),
         priority=config["default"].get("priority"),
         save_dir=config["default"].get("save_dir"),
         overwrite=config["default"].get("overwrite", False),
@@ -190,28 +189,29 @@ def __init__(
             self.date = self.valid_date - timedelta(hours=self.fxx)
 
         self.model = model.lower()
-        self.member = member
         self.product = product
 
         self.priority = priority
         self.save_dir = Path(save_dir).expand()
         self.overwrite = overwrite
 
-        # Some model templates may require kwargs not listed (e.g., "nest").
+        # Some model templates may require kwargs not listed (e.g., `nest=`, `member=`).
         for key, value in kwargs.items():
             # TODO: Check if the kwarg is a config default.
+            # TODO: e.g. if a user primarily works with RRFS, they may
+            # TODO: want to configure "member" as a default argument.
             setattr(self, key, value)
 
         # Get details from the template of the specified model.
         # This attaches the details from the `models.<model>.template`
         # class to this Herbie object.
         # This line is equivalent to `models_template.gfs.template(self)`.
-        # We do it this way because the model name is a variable.
+        # I do it this way because the model name is a variable.
         # (see https://stackoverflow.com/a/7936588/2383070 for what I'm doing here)
         getattr(models_template, self.model).template(self)
 
         if product is None:
-            # The user didn't specify a product, so lets use the first
+            # The user didn't specify a product, so let's use the first
             # product in the model template.
             self.product = list(self.PRODUCTS)[0]
             warnings.warn(f'`product` not specified. Will use ["{self.product}"].')
@@ -220,8 +220,10 @@ def __init__(
 
         self.product_description = self.PRODUCTS[self.product]
 
-        # Default value is .idx, but some have weird suffix (.inv for NCEI files).
-        self.IDX_SUFFIX = getattr(self, "IDX_SUFFIX", ".idx")
+        # Specify the suffix for the inventory index files.
+        # Default value is `.grib2.idx`, but some sources use a different
+        # suffix; e.g., archived RAP files on NCEI use `.grb2.inv`.
+        self.IDX_SUFFIX = getattr(self, "IDX_SUFFIX", [".grib2.idx"])
 
         # Check the user input
         self._validate()
@@ -244,9 +246,9 @@ def __init__(
 
         if list(self.SOURCES)[0] == "local":
             # TODO: Experimental special case, not very elegant yet.
-            self.idx = Path(str(self.grib) + self.IDX_SUFFIX)
+            self.idx = Path(str(self.grib) + self.IDX_SUFFIX[0])
             if not self.idx.exists():
-                self.idx = Path(str(self.grib).replace(".grb2", self.IDX_SUFFIX))
+                self.idx = Path(str(self.grib).replace(".grb2", self.IDX_SUFFIX[0]))
             return None
 
         # If priority list is set, we want to search SOURCES in that
@@ -278,8 +280,7 @@ def __init__(
                 self.grib = url
                 self.grib_source = source
             idx_exists, idx_url = self._check_idx(url)
-            print(url)
-            print(idx_url)
+
             if idx_exists:
                 found_idx = True
                 self.idx = idx_url
@@ -299,14 +300,15 @@ def __init__(
                 break
 
         # After searching each source, print some info about what we found...
+        # (ANSI colors added for style points)
         if verbose:
             if any([self.grib is not None, self.idx is not None]):
                 print(
                     f"🏋🏻‍♂️ Found",
                     f"\033[32m{self.date:%Y-%b-%d %H:%M UTC} F{self.fxx:02d}\033[m",
                     f"[{self.model.upper()}] [product={self.product}]",
-                    f"GRIB2 file from \033[38;5;202m{self.grib_source}\033[m and",
-                    f"index file from \033[38;5;202m{self.idx_source}\033[m.",
+                    f"GRIB2 file from \033[31m{self.grib_source}\033[m and",
+                    f"index file from \033[31m{self.idx_source}\033[m.",
                     f'{" ":150s}',
                 )
             else:
@@ -382,17 +384,32 @@ def _check_grib(self, url):
         else:
             return False
 
-    def _check_idx(self, url):
+    def _check_idx(self, url, verbose=False):
         """Check if an index file exist for the GRIB2 URL."""
-        if not url.endswith(self.IDX_SUFFIX):
-            url += self.IDX_SUFFIX
-        url_exists = requests.head(url).ok
-        # Check for index files where .inv replaces grb2 rather than being appended
-        url_rep = url
-        if not url_exists:
-            url_rep = url.replace(".grb2" + self.IDX_SUFFIX, self.IDX_SUFFIX)
-            url_exists = requests.head(url_rep).ok
-        return url_exists, url_rep
+        # To check inventory files with slightly different URL structure
+        # we will loop through the IDX_SUFFIX.
+        if not isinstance(self.IDX_SUFFIX, list):
+            self.IDX_SUFFIX = [self.IDX_SUFFIX]
+
+        if verbose:
+            print(f"🐜 {self.IDX_SUFFIX=}")
+
+        # Loop through IDX_SUFFIX options until we find one that exists
+        for i in self.IDX_SUFFIX:
+            idx_url = url.rsplit(".", maxsplit=1)[0] + i
+            idx_exists = requests.head(idx_url).ok
+            if verbose:
+                print(f"🐜 {idx_url=}")
+                print(f"🐜 {idx_exists=}")
+            if idx_exists:
+                return idx_exists, idx_url
+
+        if verbose:
+            print(
+                f"⚠ Herbie didn't find any inventory files that",
+                f"exists from {self.IDX_SUFFIX}",
+            )
+        return False, None
 
     @property
     def get_remoteFileName(self, source=None):
@@ -409,7 +426,7 @@ def get_localFileName(self):
     def get_localFilePath(self, searchString=None):
         """Get path to local file"""
         if list(self.SOURCES)[0] == "local":
-            # TODO: An experimental special case
+            # TODO: An experimental special case for locally stored GRIB2.
             outFile = Path(self.SOURCES["local"]).expand()
         else:
             outFile = (
@@ -488,15 +505,14 @@ def read_idx(self, searchString=None):
         # Format the DataFrame
         df["grib_message"] = df["grib_message"].astype(float)
         # ^ float because RAP idx files have some decimal grib message numbers
+        # TODO: ^ how can I address issue #32?
         df["reference_time"] = pd.to_datetime(df.reference_time, format="d=%Y%m%d%H")
         df["valid_time"] = df["reference_time"] + pd.to_timedelta(f"{self.fxx}H")
         df["start_byte"] = df["start_byte"].astype(int)
         df["end_byte"] = df["start_byte"].shift(-1, fill_value="")
         # TODO: Check this works: Assign the ending byte for the last row...
-        # TODO: df["end_byte"] = df["start_byte"].shift(-1, fill_value=requests.get(self.idx, stream=True).headers['Content-length'])
-        # TODO: From Karl Schnieder
-        # TODO: Get the actual end byte with requests
-        # TODO: df['byte_end'].values[-1] = requests.get(URL+url_ext, stream=True).headers['Content-length']
+        # TODO: df["end_byte"] = df["start_byte"].shift(-1, fill_value=requests.get(self.grib, stream=True).headers['Content-Length'])
+        # TODO: Based on what Karl Schnieder did.
         df["range"] = df.start_byte.astype(str) + "-" + df.end_byte.astype(str)
         df = df.set_index("grib_message")
         df = df.reindex(

diff --git a/herbie/models/__init__.py b/herbie/models/__init__.py
@@ -13,7 +13,7 @@
 except:
     pass
 try:
-    # Brian's personal local special case.
+    # Brian's personal local special case (hidden).
     from .local_B import *
 except:
     pass
diff --git a/herbie/models/hrrr.py b/herbie/models/hrrr.py
@@ -11,42 +11,47 @@
 
 Requirements
 ------------
-1. Model must be available via https
+1. Model GRIB2 file must be available via https
 2. Preferably, an .idx file should be available.
 3. URL must be consistent across time and products.
 
 Properties
 ----------
 DESCRIPTION : str
     A description of the model. Give the full name and the
-    domain, if relevant.
+    domain, if relevant. Just info for the user.
 DETAILS : dict
     Some additional details about the model. Provide links
-    to web documentation.
+    to web documentation. Just info for the user.
 PRODUCTS : dict
     Models usually have different product types. The keys are
     used in building the GRIB2 source URL.
     ORDER MATTERS -- If product is None, then Herbie uses the first
     as default.
+    *ONLY ONE IS USED (FIRST IS USED IF NOT SET)*
 SOURCES : dict
     Build the URL for the GRIB2 file for different sources.
     The parameters are from arguments passed into the
     ``herbie.archive.Herbie()`` class.
     ORDER MATTERS -- If priority is None, then Herbie searches the
     sources in the order given here.
+    *LOOP THROUGH ALL SOURCES*
 LOCALFILE : str
     The local file to save the model output. The file will be saved in
     ``save_dir/model/YYYYmmdd/localFile.grib2``
     It is sometimes necessary to add details to maintain unique
     filenames (e.g., rrfs needs to have the member number in LOCALFILE).
-TODO
-EXPECT_IDX_FILE : {'remote', 'local', 'none'}
-    (Not implemented, but might be in the future)
-    Where to expect the inventory index file, on the 'remote' server,
-    on the 'local' disk, or 'none' for non-grib files.
-    Default will be set to 'remote'
-"""
 
+Optional
+--------
+IDX_SUFFIX : list
+    Default value is [".grib2.idx"], which is pretty standard.
+    But for some, like RAP, the idx files are messy and could be a few
+    different styles.
+    self.IDX_SUFFIX = [".grb2.inv", ".inv", ".grb.inv"]
+    *LOOP THROUGH ALL SUFFIXES TO FIND AN INDEX FILE*
+"""
+from datetime import datetime
 
 class hrrr:
     def template(self):
@@ -69,9 +74,24 @@ def template(self):
             "pando": f"https://pando-rgw01.chpc.utah.edu/{self.model}/{self.product}/{self.date:%Y%m%d}/{self.model}.t{self.date:%H}z.wrf{self.product}f{self.fxx:02d}.grib2",
             "pando2": f"https://pando-rgw02.chpc.utah.edu/{self.model}/{self.product}/{self.date:%Y%m%d}/{self.model}.t{self.date:%H}z.wrf{self.product}f{self.fxx:02d}.grib2",
         }
-        self.EXPECT_IDX_FILE = 'remote'
+        self.EXPECT_IDX_FILE = "remote"
         self.LOCALFILE = f"{self.get_remoteFileName}"
 
+        # ----------
+        # CONDITIONS
+        # ----------
+
+        # Fix Issue #34 (not pretty, but gets the job done for now)
+        # TODO: Allow Herbie to specify the format of the SOURCE manually
+        if self.product == "subh" and self.date <= datetime(2018, 9, 16):
+            # The subhourly filenames are different for older files.
+            # prepend the self.SOURCES dict with the old filename format.
+            # This requires an additional arg for `fxx_subh` when calling Herbie
+            self.SOURCES = {
+                "aws_old_subh": f"https://noaa-hrrr-bdp-pds.s3.amazonaws.com/hrrr.{self.date:%Y%m%d}/conus/hrrr.t{self.date:%H}z.wrf{self.product}f{self.fxx:02d}{self.fxx_subh:02d}.grib2",
+                **self.SOURCES
+            }
+
 
 class hrrrak:
     def template(self):
@@ -93,5 +113,5 @@ def template(self):
             "pando": f"https://pando-rgw01.chpc.utah.edu/{self.model}/{self.product}/{self.date:%Y%m%d}/{self.model}.t{self.date:%H}z.wrf{self.product}f{self.fxx:02d}.grib2",
             "pando2": f"https://pando-rgw02.chpc.utah.edu/{self.model}/{self.product}/{self.date:%Y%m%d}/{self.model}.t{self.date:%H}z.wrf{self.product}f{self.fxx:02d}.grib2",
         }
-        self.EXPECT_IDX_FILE = 'remote'
+        self.EXPECT_IDX_FILE = "remote"
         self.LOCALFILE = f"{self.get_remoteFileName}"
diff --git a/herbie/models/rap.py b/herbie/models/rap.py
@@ -3,8 +3,12 @@
 
 
 class rap:
+    """
+    For NOMADS and Big Data Program RAP archive
+    """
+
     def template(self):
-        self.DESCRIPTION = "Rapid Refresh"
+        self.DESCRIPTION = "Rapid Refresh (RAP) from NOMADS and Big Data Program"
         self.DETAILS = {
             "nomads product description": "https://www.nco.ncep.noaa.gov/pmb/products/rap",
         }
@@ -31,29 +35,76 @@ def template(self):
         self.LOCALFILE = f"{self.get_remoteFileName}"
 
 
+########################################################################
+# The RAP record at NCEI is very different than the Big Data Program.
+# Files are separated into historical/, rap-130-13km/, rap-252-20km/,
+# analysis/, and forecast/ directories. These are inconsistent in years
+# that are archived and have incomplete archived datetime groups.
+# In a nutshell, NCEI's archive is very messy. Why anyone would want to
+# use historical RAP is beyond me. Because the NCEI archive is so messy,
+# Herbie may not be configured to find all possible file names in each
+# year.
+########################################################################
+
+# TODO: Set LOCALFILE name to match modern filename structure.
+
+class rap_historical:
+    """
+    The RAP and RUC historical record at NCEI. (files older than 2020)
+
+    Grid 130 = 13 km
+    Grid 252 = 20 km
+    Grid 236 = ?? km
+    Grid 211 = ?? km
+    """
+
+    def template(self):
+        self.DESCRIPTION = "Rapid Refresh - NCEI Historical"
+        self.DETAILS = {
+            "nomads product description": "https://www.ncei.noaa.gov/products/weather-climate-models/rapid-refresh-update",
+        }
+        self.PRODUCTS = {
+            "analysis": "RAP",
+            "forecast": "RAP",
+        }
+        self.SOURCES = {
+            "rap_130": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/historical/{self.product}/{self.date:%Y%m/%Y%m%d}/rap_130_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
+            "rap_252": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/historical/{self.product}/{self.date:%Y%m/%Y%m%d}/rap_252_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
+            "ruc_252": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/historical/{self.product}/{self.date:%Y%m/%Y%m%d}/ruc2_252_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb",
+            "ruc_anl_252": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/historical/{self.product}/{self.date:%Y%m/%Y%m%d}/ruc2anl_252_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb",
+            "ruc_236": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/historical/{self.product}/{self.date:%Y%m/%Y%m%d}/ruc2_236_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb",
+            "ruc_211": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/historical/{self.product}/{self.date:%Y%m/%Y%m%d}/ruc_211_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb",
+        }
+        self.IDX_SUFFIX = [".inv", ".grb2.inv", "grb.inv"]
+        self.LOCALFILE = f"{self.get_remoteFileName}"
+
+
 class rap_ncei:
     """
-    The RAP record at NCEI is very different than other sources.
+    The RAP record at NCEI.
 
-    This isn't implemented super well.
+    Analysis: longer archive; May 2012 through a few days ago
+    Forecast: short archive; 2021 through a few days ago
     """
 
     def template(self):
-        self.DESCRIPTION = "Rapid Refresh"
+        self.DESCRIPTION = "Rapid Refresh 13 km - NCEI"
         self.DETAILS = {
             "nomads product description": "https://www.ncei.noaa.gov/products/weather-climate-models/rapid-refresh-update",
         }
         self.PRODUCTS = {
-            "historical/analysis": "RAP 13 km",
-            "rap-130-13km/analysis": "RAP 13 km",  # longer archive
-            "rap-130-13km/forecast": "RAP 13 km",  # very short archive
-            "rap-252-20km/analysis": "RAP 20 km",
-            "rap-252-20km/forecast": "RAP 20 km",
-            "historical/forecast": "RAP 20 km",
+            "rap-130-13km": "RAP 13 km",
+            "rap-252-20km": "RAP 20 km",
         }
+        # Well, it's either loop through the two sources and look for
+        # files or create a separate class. I elected to just loop
+        # through different URLs. Might not be the fastest, but it'll
+        # work. The user can always specify the priority order.
         self.SOURCES = {
-            "ncei": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/{self.product}/{self.date:%Y%m/%Y%m%d}/rap_130_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
-            "ncei_20km": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/{self.product}/{self.date:%Y%m/%Y%m%d}/rap_252_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
+            "ncei_13km_analysis": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/{self.product}/analysis/{self.date:%Y%m/%Y%m%d}/rap_130_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
+            "ncei_13km_forecast": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/{self.product}/forecast/{self.date:%Y%m/%Y%m%d}/rap_130_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
+            "ncei_20km_analysis": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/{self.product}/analysis/{self.date:%Y%m/%Y%m%d}/rap_252_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
+            "ncei_20km_forecast": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/{self.product}/forecast/{self.date:%Y%m/%Y%m%d}/rap_252_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
         }
-        self.IDX_SUFFIX = ".inv"  # it is not .idx
+        self.IDX_SUFFIX = [".grb2.inv", ".inv", ".grb.inv"]
         self.LOCALFILE = f"{self.get_remoteFileName}"