Skip to content

Commit

Permalink
Merge pull request #35 from blaylockbk/handle_different_idx_styles
Browse files Browse the repository at this point in the history
Handle different idx styles
  • Loading branch information
blaylockbk authored Jan 21, 2022
2 parents 07a65bb + 9364e31 commit a49904a
Show file tree
Hide file tree
Showing 16 changed files with 3,400 additions and 204 deletions.
7 changes: 3 additions & 4 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,8 @@
}
]
},
"python.testing.pytestArgs": [
"tests"
],
"python.testing.pytestArgs": ["tests"],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
"python.testing.pytestEnabled": true,
"cSpell.words": ["NCEI"]
}
114 changes: 52 additions & 62 deletions docs/user_guide/notebooks/data_hrrr.ipynb

Large diffs are not rendered by default.

976 changes: 965 additions & 11 deletions docs/user_guide/notebooks/data_rap.ipynb

Large diffs are not rendered by default.

72 changes: 44 additions & 28 deletions herbie/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,6 @@ def __init__(
model=config["default"].get("model"),
fxx=config["default"].get("fxx"),
product=config["default"].get("product"),
member=config["default"].get("member", 1),
priority=config["default"].get("priority"),
save_dir=config["default"].get("save_dir"),
overwrite=config["default"].get("overwrite", False),
Expand All @@ -190,28 +189,29 @@ def __init__(
self.date = self.valid_date - timedelta(hours=self.fxx)

self.model = model.lower()
self.member = member
self.product = product

self.priority = priority
self.save_dir = Path(save_dir).expand()
self.overwrite = overwrite

# Some model templates may require kwargs not listed (e.g., "nest").
# Some model templates may require kwargs not listed (e.g., `nest=`, `member=`).
for key, value in kwargs.items():
# TODO: Check if the kwarg is a config default.
# TODO: e.g. if a user primarily works with RRFS, they may
# TODO: want to configure "member" as a default argument.
setattr(self, key, value)

# Get details from the template of the specified model.
# This attaches the details from the `models.<model>.template`
# class to this Herbie object.
# This line is equivalent to `models_template.gfs.template(self)`.
# We do it this way because the model name is a variable.
# I do it this way because the model name is a variable.
# (see https://stackoverflow.com/a/7936588/2383070 for what I'm doing here)
getattr(models_template, self.model).template(self)

if product is None:
# The user didn't specify a product, so lets use the first
# The user didn't specify a product, so let's use the first
# product in the model template.
self.product = list(self.PRODUCTS)[0]
warnings.warn(f'`product` not specified. Will use ["{self.product}"].')
Expand All @@ -220,8 +220,10 @@ def __init__(

self.product_description = self.PRODUCTS[self.product]

# Default value is .idx, but some have weird suffix (.inv for NCEI files).
self.IDX_SUFFIX = getattr(self, "IDX_SUFFIX", ".idx")
# Specify the suffix for the inventory index files.
# The default is `.grib2.idx`, but some archives use a different suffix;
# for example, archived RAP files on NCEI use `.grb2.inv`.
self.IDX_SUFFIX = getattr(self, "IDX_SUFFIX", [".grib2.idx"])

# Check the user input
self._validate()
Expand All @@ -244,9 +246,9 @@ def __init__(

if list(self.SOURCES)[0] == "local":
# TODO: Experimental special case, not very elegant yet.
self.idx = Path(str(self.grib) + self.IDX_SUFFIX)
self.idx = Path(str(self.grib) + self.IDX_SUFFIX[0])
if not self.idx.exists():
self.idx = Path(str(self.grib).replace(".grb2", self.IDX_SUFFIX))
self.idx = Path(str(self.grib).replace(".grb2", self.IDX_SUFFIX[0]))
return None

# If priority list is set, we want to search SOURCES in that
Expand Down Expand Up @@ -278,8 +280,7 @@ def __init__(
self.grib = url
self.grib_source = source
idx_exists, idx_url = self._check_idx(url)
print(url)
print(idx_url)

if idx_exists:
found_idx = True
self.idx = idx_url
Expand All @@ -299,14 +300,15 @@ def __init__(
break

# After searching each source, print some info about what we found...
# (ANSI colors added for style points)
if verbose:
if any([self.grib is not None, self.idx is not None]):
print(
f"🏋🏻‍♂️ Found",
f"\033[32m{self.date:%Y-%b-%d %H:%M UTC} F{self.fxx:02d}\033[m",
f"[{self.model.upper()}] [product={self.product}]",
f"GRIB2 file from \033[38;5;202m{self.grib_source}\033[m and",
f"index file from \033[38;5;202m{self.idx_source}\033[m.",
f"GRIB2 file from \033[31m{self.grib_source}\033[m and",
f"index file from \033[31m{self.idx_source}\033[m.",
f'{" ":150s}',
)
else:
Expand Down Expand Up @@ -382,17 +384,32 @@ def _check_grib(self, url):
else:
return False

def _check_idx(self, url):
def _check_idx(self, url, verbose=False):
"""Check if an index file exist for the GRIB2 URL."""
if not url.endswith(self.IDX_SUFFIX):
url += self.IDX_SUFFIX
url_exists = requests.head(url).ok
# Check for index files where .inv replaces grb2 rather than being appended
url_rep = url
if not url_exists:
url_rep = url.replace(".grb2" + self.IDX_SUFFIX, self.IDX_SUFFIX)
url_exists = requests.head(url_rep).ok
return url_exists, url_rep
# To check inventory files with slightly different URL structure
# we will loop through the IDX_SUFFIX.
if not isinstance(self.IDX_SUFFIX, list):
self.IDX_SUFFIX = [self.IDX_SUFFIX]

if verbose:
print(f"🐜 {self.IDX_SUFFIX=}")

# Loop through IDX_SUFFIX options until we find one that exists
for i in self.IDX_SUFFIX:
idx_url = url.rsplit(".", maxsplit=1)[0] + i
idx_exists = requests.head(idx_url).ok
if verbose:
print(f"🐜 {idx_url=}")
print(f"🐜 {idx_exists=}")
if idx_exists:
return idx_exists, idx_url

if verbose:
print(
f"⚠ Herbie didn't find any inventory files that",
f"exists from {self.IDX_SUFFIX}",
)
return False, None

@property
def get_remoteFileName(self, source=None):
Expand All @@ -409,7 +426,7 @@ def get_localFileName(self):
def get_localFilePath(self, searchString=None):
"""Get path to local file"""
if list(self.SOURCES)[0] == "local":
# TODO: An experimental special case
# TODO: An experimental special case for locally stored GRIB2.
outFile = Path(self.SOURCES["local"]).expand()
else:
outFile = (
Expand Down Expand Up @@ -488,15 +505,14 @@ def read_idx(self, searchString=None):
# Format the DataFrame
df["grib_message"] = df["grib_message"].astype(float)
# ^ float because RAP idx files have some decimal grib message numbers
# TODO: ^ how can I address issue #32?
df["reference_time"] = pd.to_datetime(df.reference_time, format="d=%Y%m%d%H")
df["valid_time"] = df["reference_time"] + pd.to_timedelta(f"{self.fxx}H")
df["start_byte"] = df["start_byte"].astype(int)
df["end_byte"] = df["start_byte"].shift(-1, fill_value="")
# TODO: Check this works: Assign the ending byte for the last row...
# TODO: df["end_byte"] = df["start_byte"].shift(-1, fill_value=requests.get(self.idx, stream=True).headers['Content-length'])
# TODO: From Karl Schnieder
# TODO: Get the actual end byte with requests
# TODO: df['byte_end'].values[-1] = requests.get(URL+url_ext, stream=True).headers['Content-length']
# TODO: df["end_byte"] = df["start_byte"].shift(-1, fill_value=requests.get(self.grib, stream=True).headers['Content-Length'])
# TODO: Based on what Karl Schnieder did.
df["range"] = df.start_byte.astype(str) + "-" + df.end_byte.astype(str)
df = df.set_index("grib_message")
df = df.reindex(
Expand Down
2 changes: 1 addition & 1 deletion herbie/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
except:
pass
try:
# Brian's personal local special case.
# Brian's personal local special case (hidden).
from .local_B import *
except:
pass
44 changes: 32 additions & 12 deletions herbie/models/hrrr.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,42 +11,47 @@
Requirements
------------
1. Model must be available via https
1. Model GRIB2 file must be available via https
2. Preferably, an .idx file should be available.
3. URL must be consistent across time and products.
Properties
----------
DESCRIPTION : str
A description of the model. Give the full name and the
domain, if relevant.
domain, if relevant. Just info for the user.
DETAILS : dict
Some additional details about the model. Provide links
to web documentation.
to web documentation. Just info for the user.
PRODUCTS : dict
Models usually have different product types. The keys are
used in building the GRIB2 source URL.
ORDER MATTERS -- If product is None, then Herbie uses the first
as default.
*ONLY ONE IS USED (FIRST IS USED IF NOT SET)*
SOURCES : dict
Build the URL for the GRIB2 file for different sources.
The parameters are from arguments passed into the
``herbie.archive.Herbie()`` class.
ORDER MATTERS -- If priority is None, then Herbie searches the
sources in the order given here.
*LOOP THROUGH ALL SOURCES*
LOCALFILE : str
The local file to save the model output. The file will be saved in
``save_dir/model/YYYYmmdd/localFile.grib2``
It is sometimes necessary to add details to maintain unique
filenames (e.g., rrfs needs to have the member number in LOCALFILE).
TODO
EXPECT_IDX_FILE : {'remote', 'local', 'none'}
(Not implemented, but might be in the future)
Where to expect the inventory index file, on the 'remote' server,
on the 'local' disk, or 'none' for non-grib files.
Default will be set to 'remote'
"""
Optional
--------
IDX_SUFFIX : list
Default value is [".grib2.idx"], which is pretty standard.
But for some, like RAP, the idx files are messy and could be a few
different styles.
self.IDX_SUFFIX = [".grb2.inv", ".inv", ".grb.inv"]
*LOOP THROUGH ALL SUFFIXES TO FIND AN INDEX FILE*
"""
from datetime import datetime

class hrrr:
def template(self):
Expand All @@ -69,9 +74,24 @@ def template(self):
"pando": f"https://pando-rgw01.chpc.utah.edu/{self.model}/{self.product}/{self.date:%Y%m%d}/{self.model}.t{self.date:%H}z.wrf{self.product}f{self.fxx:02d}.grib2",
"pando2": f"https://pando-rgw02.chpc.utah.edu/{self.model}/{self.product}/{self.date:%Y%m%d}/{self.model}.t{self.date:%H}z.wrf{self.product}f{self.fxx:02d}.grib2",
}
self.EXPECT_IDX_FILE = 'remote'
self.EXPECT_IDX_FILE = "remote"
self.LOCALFILE = f"{self.get_remoteFileName}"

# ----------
# CONDITIONS
# ----------

# Fix Issue #34 (not pretty, but gets the job done for now)
# TODO: Allow Herbie to specify the format of the SOURCE manually
if self.product == "subh" and self.date <= datetime(2018, 9, 16):
# The subhourly filenames are different for older files.
# prepend the self.SOURCES dict with the old filename format.
# This requires an additional arg for `fxx_subh` when calling Herbie
self.SOURCES = {
"aws_old_subh": f"https://noaa-hrrr-bdp-pds.s3.amazonaws.com/hrrr.{self.date:%Y%m%d}/conus/hrrr.t{self.date:%H}z.wrf{self.product}f{self.fxx:02d}{self.fxx_subh:02d}.grib2",
**self.SOURCES
}


class hrrrak:
def template(self):
Expand All @@ -93,5 +113,5 @@ def template(self):
"pando": f"https://pando-rgw01.chpc.utah.edu/{self.model}/{self.product}/{self.date:%Y%m%d}/{self.model}.t{self.date:%H}z.wrf{self.product}f{self.fxx:02d}.grib2",
"pando2": f"https://pando-rgw02.chpc.utah.edu/{self.model}/{self.product}/{self.date:%Y%m%d}/{self.model}.t{self.date:%H}z.wrf{self.product}f{self.fxx:02d}.grib2",
}
self.EXPECT_IDX_FILE = 'remote'
self.EXPECT_IDX_FILE = "remote"
self.LOCALFILE = f"{self.get_remoteFileName}"
77 changes: 64 additions & 13 deletions herbie/models/rap.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@


class rap:
"""
For NOMADS and Big Data Program RAP archive
"""

def template(self):
self.DESCRIPTION = "Rapid Refresh"
self.DESCRIPTION = "Rapid Refresh (RAP) from NOMADS and Big Data Program"
self.DETAILS = {
"nomads product description": "https://www.nco.ncep.noaa.gov/pmb/products/rap",
}
Expand All @@ -31,29 +35,76 @@ def template(self):
self.LOCALFILE = f"{self.get_remoteFileName}"


########################################################################
# The RAP record at NCEI is very different than the Big Data Program.
# Files are separated into historical/, rap-130-13km/, rap-252-20km/,
# analysis/, and forecast/ directories. These are inconsistent in years
# that are archived and have incomplete archived datetime groups.
# In a nutshell, NCEI's archive is very messy. Why anyone would want to
# use historical RAP is beyond me. Because the NCEI archive is so messy,
# Herbie may not be configured to find all possible file names in each
# year.
########################################################################

# TODO: Set LOCALFILE name to match modern filename structure.

class rap_historical:
    """
    The RAP and RUC historical record at NCEI. (files older than 2020)

    Grid 130 = 13 km
    Grid 252 = 20 km
    Grid 236 = ?? km
    Grid 211 = ?? km
    """

    def template(self):
        """Attach the NCEI historical RAP/RUC template details to ``self``.

        This is called on the class with the object being configured passed
        in as ``self`` (i.e., ``rap_historical.template(obj)``), so ``self``
        is not an instance of ``rap_historical``.

        Reads ``self.product``, ``self.date``, ``self.fxx`` and
        ``self.get_remoteFileName``.

        Sets ``DESCRIPTION``, ``DETAILS``, ``PRODUCTS``, ``SOURCES``,
        ``IDX_SUFFIX`` and ``LOCALFILE`` on ``self``.
        """
        self.DESCRIPTION = "Rapid Refresh - NCEI Historical"
        self.DETAILS = {
            "nomads product description": "https://www.ncei.noaa.gov/products/weather-climate-models/rapid-refresh-update",
        }
        # The product key is used as a directory name in every source URL.
        self.PRODUCTS = {
            "analysis": "RAP",
            "forecast": "RAP",
        }
        # One candidate URL per grid/filename style found in the archive.
        # RAP files end in `.grb2`; the older RUC files end in `.grb`.
        self.SOURCES = {
            "rap_130": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/historical/{self.product}/{self.date:%Y%m/%Y%m%d}/rap_130_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
            "rap_252": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/historical/{self.product}/{self.date:%Y%m/%Y%m%d}/rap_252_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
            "ruc_252": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/historical/{self.product}/{self.date:%Y%m/%Y%m%d}/ruc2_252_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb",
            "ruc_anl_252": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/historical/{self.product}/{self.date:%Y%m/%Y%m%d}/ruc2anl_252_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb",
            "ruc_236": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/historical/{self.product}/{self.date:%Y%m/%Y%m%d}/ruc2_236_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb",
            "ruc_211": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/historical/{self.product}/{self.date:%Y%m/%Y%m%d}/ruc_211_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb",
        }
        # Candidate inventory-file suffixes, tried in this order.
        # Every entry must start with "." because the index URL is built by
        # stripping the GRIB file's last extension and appending the suffix;
        # without the dot, "<name>.grb" would become "<name>grb.inv".
        self.IDX_SUFFIX = [".inv", ".grb2.inv", ".grb.inv"]
        self.LOCALFILE = f"{self.get_remoteFileName}"


class rap_ncei:
"""
The RAP record at NCEI is very different than other sources.
The RAP record at NCEI.
This isn't implemented super well.
Analysis: longer archive; May 2012 through a few days ago
Forecast: short archive; 2021 through a few days ago
"""

def template(self):
self.DESCRIPTION = "Rapid Refresh"
self.DESCRIPTION = "Rapid Refresh 13 km - NCEI"
self.DETAILS = {
"nomads product description": "https://www.ncei.noaa.gov/products/weather-climate-models/rapid-refresh-update",
}
self.PRODUCTS = {
"historical/analysis": "RAP 13 km",
"rap-130-13km/analysis": "RAP 13 km", # longer archive
"rap-130-13km/forecast": "RAP 13 km", # very short archive
"rap-252-20km/analysis": "RAP 20 km",
"rap-252-20km/forecast": "RAP 20 km",
"historical/forecast": "RAP 20 km",
"rap-130-13km": "RAP 13 km",
"rap-252-20km": "RAP 20 km",
}
# Well, it's either loop through the two sources and look for
# files or create a separate class. I elected to just loop
# through different URLs. Might not be the fastest, but it'll
# work. The user can always specify the priority order.
self.SOURCES = {
"ncei": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/{self.product}/{self.date:%Y%m/%Y%m%d}/rap_130_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
"ncei_20km": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/{self.product}/{self.date:%Y%m/%Y%m%d}/rap_252_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
"ncei_13km_analysis": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/{self.product}/analysis/{self.date:%Y%m/%Y%m%d}/rap_130_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
"ncei_13km_forecast": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/{self.product}/forecast/{self.date:%Y%m/%Y%m%d}/rap_130_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
"ncei_20km_analysis": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/{self.product}/analysis/{self.date:%Y%m/%Y%m%d}/rap_252_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
"ncei_20km_forecast": f"https://www.ncei.noaa.gov/data/rapid-refresh/access/{self.product}/forecast/{self.date:%Y%m/%Y%m%d}/rap_252_{self.date:%Y%m%d_%H%M}_{self.fxx:03d}.grb2",
}
self.IDX_SUFFIX = ".inv" # it is not .idx
self.IDX_SUFFIX = [".grb2.inv", ".inv", ".grb.inv"]
self.LOCALFILE = f"{self.get_remoteFileName}"
Loading

0 comments on commit a49904a

Please sign in to comment.