Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Catalog schema updates #130

Merged
merged 8 commits into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 50 additions & 23 deletions cats/gfdl_template.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,71 +3,98 @@
"attributes": [
{
"column_name": "activity_id",
"vocabulary": ""
"vocabulary": "",
"required": false
},
{
"column_name": "institution_id",
"vocabulary": ""
"vocabulary": "",
"required": false
},
{
"column_name": "source_id",
"vocabulary": ""
"vocabulary": "",
"required": false
},
{
"column_name": "experiment_id",
"vocabulary": ""
"vocabulary": "",
"required": true
},
{
"column_name": "frequency",
"vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/CMIP6_frequency.json"
"vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/CMIP6_frequency.json",
"required": true
},
{
"column_name": "modeling_realm",
"vocabulary": ""
"column_name": "realm",
"vocabulary": "",
"required": true
},
{
"column_name": "table_id",
"vocabulary": ""
"vocabulary": "",
"required": false
},
{
"column_name": "member_id",
"vocabulary": ""
"vocabulary": "",
"required": false
},
{
"column_name": "grid_label",
"vocabulary": ""
"vocabulary": "",
"required": false
},
{
"column_name": "variable_id",
"vocabulary": ""
"vocabulary": "",
"required": true
},
{
"column_name": "temporal_subset",
"vocabulary": ""
"column_name": "time_range",
"vocabulary": "",
"required": true
},
{
"column_name": "chunk_freq",
"vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/GFDL_chunk_freq.json"
"vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/GFDL_chunk_freq.json",
"required": false
},
{
"column_name": "grid_label",
"vocabulary": ""
"column_name":"platform",
"vocabulary": "",
"required": false
},
{
"column_name":"platform",
"vocabulary": ""
"column_name":"target",
"vocabulary": "",
"required": false
},
{
"column_name": "cell_methods",
"vocabulary": ""
"vocabulary": "",
"required": "enhanced"
},
{
"column_name": "path",
"vocabulary": ""
"vocabulary": "",
"required": true
},
{
"column_name": "dimensions",
"vocabulary": "",
"required": "enhanced"
},
{
"column_name": "version_id",
"vocabulary": ""
"vocabulary": "",
"required": false
},
{
"column_name": "standard_name",
"vocabulary": "",
"required": "enhanced"
}
],
"assets": {
Expand All @@ -83,7 +110,7 @@
"frequency",
"table_id",
"grid_label",
"modeling_realm",
"realm",
"member_id",
"chunk_freq"
],
Expand All @@ -95,7 +122,7 @@
},
{
"type": "join_existing",
"attribute_name": "temporal_subset",
"attribute_name": "time_range",
"options": {
"dim": "time",
"coords": "minimal",
Expand Down
10 changes: 4 additions & 6 deletions intakebuilder/builderconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#with the ESM collection specification standards and the appropriate workflows.

headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "modeling_realm", "table_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]

Expand All @@ -28,10 +28,8 @@
#for the fourth value.


#output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']

output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']
output_file_template = ['modeling_realm','temporal_subset','variable_id']
output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']
output_file_template = ['realm','temporal_subset','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
Expand All @@ -46,7 +44,7 @@

dictFilter = {}
dictFilterIgnore = {}
dictFilter["modeling_realm"]= 'atmos_cmip'
dictFilter["realm"]= 'atmos_cmip'
dictFilter["frequency"] = "monthly"
dictFilter["chunk_freq"] = "5yr"
dictFilterIgnore["remove"]= 'DO_NOT_USE'
2 changes: 1 addition & 1 deletion intakebuilder/catalogcols.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "modeling_realm", "table_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
6 changes: 3 additions & 3 deletions intakebuilder/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#with the ESM collection specification standards and the appropriate workflows.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "modeling_realm", "table_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]

Expand All @@ -27,9 +27,9 @@ headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
#for the fourth value.

output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']
output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

output_file_template: ['modeling_realm','temporal_subset','variable_id']
output_file_template: ['realm','temporal_subset','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
Expand Down
24 changes: 4 additions & 20 deletions intakebuilder/getinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ def getinfoFromYAML(dictInfo,yamlfile,miptable=None):
except KeyError:
dictInfo["frequency"] = "NA"
try:
dictInfo["modeling_realm"] = mappings[miptable]["modeling_realm"]
dictInfo["realm"] = mappings[miptable]["realm"]
except KeyError:
dictInfo["modeling_realm"] = "NA"
dictInfo["realm"] = "NA"
return(dictInfo)

def getStem(dirpath,projectdir):
Expand Down Expand Up @@ -112,7 +112,7 @@ def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo,configyaml):
:return:
'''
# we need these dict keys "project", "institute", "model", "experiment_id",
# "frequency", "modeling_realm", "mip_table",
# "frequency", "realm", "mip_table",
# "ensemble_member", "grid_label", "variable",
# "temporal subset", "version", "path"]

Expand Down Expand Up @@ -153,22 +153,6 @@ def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo,configyaml):
#print("Skipping non-timeseries data")
return {}
return dictInfo
'''
if stemdir[len(stemdir)-3] == "ts":
dictInfo['experiment_id'] = stemdir[len(stemdir)-7]
dictInfo['frequency'] = stemdir[len(stemdir)-2]
dictInfo['member_id'] = 'n/a'
dictInfo['modeling_realm'] = stemdir[len(stemdir)-4]

#Finds last (a) and second to last (b) periods and grabs value between them
a = dictInfo['path'].rfind('.')
b = dictInfo['path'].rfind('.',0,dictInfo['path'].rfind('.')-1)
dictInfo['variable_id'] = dictInfo['path'][b+1:a]
dictInfo['chunk_frequency'] = stemdir[len(stemdir)-1]


return dictInfo
'''

def getInfoFromDRS(dirpath,projectdir,dictInfo):
'''
Expand Down Expand Up @@ -215,7 +199,7 @@ def getInfoFromGlobalAtts(fname,dictInfo,filexra=None):
version = "NA"
dictInfo["version"] = version
realm = filexra["realm"]
dictInfo["modeling_realm"] = realm
dictInfo["realm"] = realm
frequency = filexra["frequency"]
dictInfo["frequency"] = frequency
return dictInfo
Expand Down
4 changes: 2 additions & 2 deletions intakebuilder/gfdlcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml):
'''
listfiles = []
pat = None
if("modeling_realm" in dictFilter.keys()) & (("frequency") in dictFilter.keys()):
pat = re.compile('({}/{}/{}/{})'.format(dictFilter["modeling_realm"],"ts",dictFilter["frequency"],dictFilter["chunk_freq"]))
if("realm" in dictFilter.keys()) & (("frequency") in dictFilter.keys()):
pat = re.compile('({}/{}/{}/{})'.format(dictFilter["realm"],"ts",dictFilter["frequency"],dictFilter["chunk_freq"]))

orig_pat = pat

Expand Down
6 changes: 3 additions & 3 deletions intakebuilder/table.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
Amon:
frequency: mon
modeling_realm: atmos
realm: atmos
Omon:
frequency: mon
modeling_realm: ocean
realm: ocean
3hr:
frequency: 3hr
modeling_realm: na
realm: na
12 changes: 8 additions & 4 deletions tests/test_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@ def main(json_path,json_template_path,test_failure):
comp = (diff(j,json_template))
for key in comp.keys():
if key != 'catalog_file':
sys.exit(key, 'section of JSON does not reflect template')
if test_failure:
print(key + ' section of JSON does not reflect template')
else:
sys.exit(key + ' section of JSON does not reflect template')

#Get CSV from JSON and open it
csv_path = j["catalog_file"]
Expand All @@ -51,9 +54,10 @@ def main(json_path,json_template_path,test_failure):
print(f"The required column '{column}' does not exist in the csv. In other words, there is some inconsistency between the json and the csv file. Please check out info listed under aggregation_control and groupby_attrs in your json file and verify if those columns show up in the csv as well.")
errors += 1

if(catalog[column].isnull().values.any()):
print(f"'{column}' contains empty values.")
errors += 1
if column in catalog.columns:
if(catalog[column].isnull().values.any()):
print(f"'{column}' contains empty values.")
errors += 1

if errors > 0:
if test_failure:
Expand Down
Loading