Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Catalog schema updates #130

Merged
merged 8 commits into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 50 additions & 23 deletions cats/gfdl_template.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,71 +3,98 @@
"attributes": [
{
"column_name": "activity_id",
"vocabulary": ""
"vocabulary": "",
"required": false
},
{
"column_name": "institution_id",
"vocabulary": ""
"vocabulary": "",
"required": false
},
{
"column_name": "source_id",
"vocabulary": ""
"vocabulary": "",
"required": false
},
{
"column_name": "experiment_id",
"vocabulary": ""
"vocabulary": "",
"required": true
},
{
"column_name": "frequency",
"vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/CMIP6_frequency.json"
"vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/CMIP6_frequency.json",
"required": true
},
{
"column_name": "modeling_realm",
"vocabulary": ""
"column_name": "realm",
"vocabulary": "",
"required": true
},
{
"column_name": "table_id",
"vocabulary": ""
"vocabulary": "",
"required": false
},
{
"column_name": "member_id",
"vocabulary": ""
"vocabulary": "",
"required": false
},
{
"column_name": "grid_label",
"vocabulary": ""
"vocabulary": "",
"required": false
},
{
"column_name": "variable_id",
"vocabulary": ""
"vocabulary": "",
"required": true
},
{
"column_name": "temporal_subset",
"vocabulary": ""
"column_name": "time_range",
"vocabulary": "",
"required": true
},
{
"column_name": "chunk_freq",
"vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/GFDL_chunk_freq.json"
"vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/GFDL_chunk_freq.json",
"required": false
},
{
"column_name": "grid_label",
"vocabulary": ""
"column_name":"platform",
"vocabulary": "",
"required": false
},
{
"column_name":"platform",
"vocabulary": ""
"column_name":"target",
"vocabulary": "",
"required": false
},
{
"column_name": "cell_methods",
"vocabulary": ""
"vocabulary": "",
"required": "enhanced"
},
{
"column_name": "path",
"vocabulary": ""
"vocabulary": "",
"required": true
},
{
"column_name": "dimensions",
"vocabulary": "",
"required": "enhanced"
},
{
"column_name": "version_id",
"vocabulary": ""
"vocabulary": "",
"required": false
},
{
"column_name": "standard_name",
"vocabulary": "",
"required": "enhanced"
}
],
"assets": {
Expand All @@ -83,7 +110,7 @@
"frequency",
"table_id",
"grid_label",
"modeling_realm",
"realm",
"member_id",
"chunk_freq"
],
Expand All @@ -95,7 +122,7 @@
},
{
"type": "join_existing",
"attribute_name": "temporal_subset",
"attribute_name": "time_range",
"options": {
"dim": "time",
"coords": "minimal",
Expand Down
10 changes: 4 additions & 6 deletions intakebuilder/builderconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#with the ESM collection specification standards and the appropriate workflows.

headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "modeling_realm", "table_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]

Expand All @@ -28,10 +28,8 @@
#for the fourth value.


#output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']

output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']
output_file_template = ['modeling_realm','temporal_subset','variable_id']
output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']
output_file_template = ['realm','temporal_subset','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
Expand All @@ -46,7 +44,7 @@

dictFilter = {}
dictFilterIgnore = {}
dictFilter["modeling_realm"]= 'atmos_cmip'
dictFilter["realm"]= 'atmos_cmip'
dictFilter["frequency"] = "monthly"
dictFilter["chunk_freq"] = "5yr"
dictFilterIgnore["remove"]= 'DO_NOT_USE'
2 changes: 1 addition & 1 deletion intakebuilder/catalogcols.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "modeling_realm", "table_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
6 changes: 3 additions & 3 deletions intakebuilder/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#with the ESM collection specification standards and the appropriate workflows.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "modeling_realm", "table_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]

Expand All @@ -27,9 +27,9 @@ headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
#for the fourth value.

output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']
output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

output_file_template: ['modeling_realm','temporal_subset','variable_id']
output_file_template: ['realm','temporal_subset','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
Expand Down
24 changes: 4 additions & 20 deletions intakebuilder/getinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ def getinfoFromYAML(dictInfo,yamlfile,miptable=None):
except KeyError:
dictInfo["frequency"] = "NA"
try:
dictInfo["modeling_realm"] = mappings[miptable]["modeling_realm"]
dictInfo["realm"] = mappings[miptable]["realm"]
except KeyError:
dictInfo["modeling_realm"] = "NA"
dictInfo["realm"] = "NA"
return(dictInfo)

def getStem(dirpath,projectdir):
Expand Down Expand Up @@ -112,7 +112,7 @@ def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo,configyaml):
:return:
'''
# we need these dict keys "project", "institute", "model", "experiment_id",
# "frequency", "modeling_realm", "mip_table",
# "frequency", "realm", "mip_table",
# "ensemble_member", "grid_label", "variable",
# "temporal subset", "version", "path"]

Expand Down Expand Up @@ -153,22 +153,6 @@ def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo,configyaml):
#print("Skipping non-timeseries data")
return {}
return dictInfo
'''
if stemdir[len(stemdir)-3] == "ts":
dictInfo['experiment_id'] = stemdir[len(stemdir)-7]
dictInfo['frequency'] = stemdir[len(stemdir)-2]
dictInfo['member_id'] = 'n/a'
dictInfo['modeling_realm'] = stemdir[len(stemdir)-4]

#Finds last (a) and second to last (b) periods and grabs value between them
a = dictInfo['path'].rfind('.')
b = dictInfo['path'].rfind('.',0,dictInfo['path'].rfind('.')-1)
dictInfo['variable_id'] = dictInfo['path'][b+1:a]
dictInfo['chunk_frequency'] = stemdir[len(stemdir)-1]


return dictInfo
'''

def getInfoFromDRS(dirpath,projectdir,dictInfo):
'''
Expand Down Expand Up @@ -215,7 +199,7 @@ def getInfoFromGlobalAtts(fname,dictInfo,filexra=None):
version = "NA"
dictInfo["version"] = version
realm = filexra["realm"]
dictInfo["modeling_realm"] = realm
dictInfo["realm"] = realm
frequency = filexra["frequency"]
dictInfo["frequency"] = frequency
return dictInfo
Expand Down
4 changes: 2 additions & 2 deletions intakebuilder/gfdlcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml):
'''
listfiles = []
pat = None
if("modeling_realm" in dictFilter.keys()) & (("frequency") in dictFilter.keys()):
pat = re.compile('({}/{}/{}/{})'.format(dictFilter["modeling_realm"],"ts",dictFilter["frequency"],dictFilter["chunk_freq"]))
if("realm" in dictFilter.keys()) & (("frequency") in dictFilter.keys()):
pat = re.compile('({}/{}/{}/{})'.format(dictFilter["realm"],"ts",dictFilter["frequency"],dictFilter["chunk_freq"]))

orig_pat = pat

Expand Down
6 changes: 3 additions & 3 deletions intakebuilder/table.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
Amon:
frequency: mon
modeling_realm: atmos
realm: atmos
Omon:
frequency: mon
modeling_realm: ocean
realm: ocean
3hr:
frequency: 3hr
modeling_realm: na
realm: na
12 changes: 8 additions & 4 deletions tests/test_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@ def main(json_path,json_template_path,test_failure):
comp = (diff(j,json_template))
for key in comp.keys():
if key != 'catalog_file':
sys.exit(key, 'section of JSON does not reflect template')
if test_failure:
print(key + ' section of JSON does not reflect template')
else:
sys.exit(key + ' section of JSON does not reflect template')

#Get CSV from JSON and open it
csv_path = j["catalog_file"]
Expand All @@ -51,9 +54,10 @@ def main(json_path,json_template_path,test_failure):
print(f"The required column '{column}' does not exist in the csv. In other words, there is some inconsistency between the json and the csv file. Please check out info listed under aggregation_control and groupby_attrs in your json file and verify if those columns show up in the csv as well.")
errors += 1

if(catalog[column].isnull().values.any()):
print(f"'{column}' contains empty values.")
errors += 1
if column in catalog.columns:
if(catalog[column].isnull().values.any()):
print(f"'{column}' contains empty values.")
errors += 1

if errors > 0:
if test_failure:
Expand Down
Loading