Bypass preprocessor (#439)
* add cli option no_pp to cli_plugins and default_tests config files

* add No_pp data_manager to cli_plugins

* add NoPPDataSource and placeholder MultirunNoPPDataSource that override the _PreprocessorClass with a new NullPreprocessor class

* Add NullPreprocessor class that overrides init method and implements dummy required methods

* fix typos and move the preprocessing log info in preprocess_data to the process method attached to the pod object

* add logic block to single-run case loop to skip case.request_data if the case is a NoPPDataSource type

* add NoPPVarlist, NoPPVarlistEntry, and NoPPDiagnostic classes to handle variable information for the non-preprocessed data
move log info for preprocessed data to process methods in DefaultPreprocessor classes
change the _DiagnosticClass to NoPPDiagnostic in NoPPDataSource class

* finalize file linking method for single-run no-PP mode
add more log info about no-PP mode
modify rename_input_files to copy original files instead of symlinking, since no-PP mode creates symlinks, and linking to links causes segfaults

* remove no_pp cli flag from cli_plugins

* remove unused no_pp flag from default_tests.jsonc

* add documentation for the No_pp data source and a ref tag to rename_input_files.rst

* add preliminary no preprocessor data manager and classes for multirun mode

* updated the link_input_data_to_wkdir method for multirun to replace TAS_FILE with original location in input dir
added logic to core.py to instantiate the noPP version of multirun diagnostic based on the parent object _PreprocessorClass
added note to choke point in environment manager where exception is being caught
added _children property and redefined preprocessorclass in MultirunNoPPDataSource
may not need alternate no-PP multirun varlist and varlistentry classes, but need to fix another issue first

* clean up spacing in mdtf_framework.py

* remove unused MultirunNoPPVarlist and MultirunNoPPVarlistEntry classes
redefine variable dest_path to point to raw input file location in link_input_data_to_wkdir

* add qfp.MultirunDataSourceQFPMixin class to MultirunNoPPDataSource so that the iter_vars_only method is available

* remove ref to deleted MultirunNoPPVarlist class from MultirunNoPPDataSource

* remove debug comment from environment_manager

* update ref_data_sources.rst

* reorder import of preprocessor and diagnostic modules in data_sources.py

* revert changes to rename_input_files.py and docs. Moving changes to a separate PR

* reorder module import statements in data_manager and data_sources

* move varlist and varlistentry utilities required by other modules to separate modules to help
prevent circular imports

* add varlist and varlistentry attributes from new modules to diagnostic.py
wrongkindofdoctor authored Jan 12, 2023
1 parent 4e5b1f0 commit b12e764
Showing 14 changed files with 847 additions and 410 deletions.
25 changes: 25 additions & 0 deletions doc/sphinx/ref_data_sources.rst
@@ -150,3 +150,28 @@

* Variables that don't have global coverage (e.g., are restricted to the Greenland or Antarctic regions) or are zonally or otherwise spatially averaged are excluded from the search, as no POD is currently designed to use these types of data.

.. _ref-data-source-nopp:

No preprocessor
++++++++++++++++++++++++++++
.. Important:: The ``No_pp`` data source is a development feature intended to simplify POD debugging. Finalized PODs must function with the preprocessor enabled in the framework.

Selected via ``--data-manager="No_pp"``.

This data source bypasses the preprocessor entirely. Model input data must adhere to the `Local_File` naming convention
``<CASENAME>.<frequency>.<variable name>.nc`` and be located in the directory
``[Input directory root]/[CASENAME]/[output frequency]``. If ``data_type=single_run``, files in the input data directories
are symbolically linked to the working directory. If ``data_type=multi_run``, the data file paths point directly to the
input data location because symbolic linking breaks the framework. Thus, for the ``multi_run`` configuration, the `index.html`
file generated in the POD output directory will not work. However, the `[POD_NAME].html` file in the POD output directory
will properly display the output.

Data must have the variable names, units, convention, and dimensionality specified in the POD settings file.
Users can use the :ref:`rename_input_files.py<ref-rename-input-files>` tool to create copies of files in the `Local_File` format.

The ``No_pp`` data source differs from passing the ``--disable-preprocessor`` option, which still renames variables
to match the desired convention, crops the date range to match the ``FIRSTYR`` and ``LASTYR`` specified
in the runtime configuration file, and writes copies of the modified files to the working directory.
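
To make the required layout concrete, here is a minimal sketch in Python of the path the ``No_pp`` data source expects for each file (the casename, frequency, and variable name below are hypothetical examples, not part of this commit):

import os

# Convention: [Input directory root]/[CASENAME]/[output frequency]/
#             <CASENAME>.<frequency>.<variable name>.nc
input_root = "/path/to/inputdata/model"  # hypothetical input directory root
casename = "QBOi.EXP1.AMIP.001"          # example CASENAME
frequency = "mon"                        # output frequency
var_name = "tas"                         # variable name from the POD settings file

expected_path = os.path.join(
    input_root, casename, frequency,
    f"{casename}.{frequency}.{var_name}.nc"
)
print(expected_path)
# /path/to/inputdata/model/QBOi.EXP1.AMIP.001/mon/QBOi.EXP1.AMIP.001.mon.tas.nc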
1 change: 1 addition & 0 deletions mdtf_framework.py
@@ -62,6 +62,7 @@ def main(argv):
exit_code = framework.main()
return exit_code


if __name__ == '__main__':
argv = sys.argv[1::] if len(sys.argv[1::]) >= 2 else sys.argv
exit_code = main(argv)
14 changes: 14 additions & 0 deletions src/cli_plugins.jsonc
@@ -19,6 +19,20 @@
]
}
},
"No_pp": {
"help": "DataManager for working with input model data files that are already present on a local filesystem, for example the PODs' sample model data.",
"entry_point": ["src.data_sources:NoPPDataSource", "src.data_sources:MultirunNoPPDataSource"],
"cli": {
"arguments": [
{
"name": ["sample_dataset", "experiment"],
"short_name" : "e",
"help": "Name of sample model data source.",
"default" : ""
}
]
}
},
"Explicit_file": {
"help": "DataManager which allows the user to explicitly specify what data files should be used to supply each variable in the PODs' data request.",
"entry_point": ["src.data_sources:ExplicitFileDataSource","src.data_sources:MultirunExplicitFileDataSource"],
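
The ``entry_point`` values above use a ``module:Class`` specification. As a rough sketch of how such a string can be resolved to a class (a generic Python pattern shown for illustration, not the framework's actual plugin loader):

import importlib

def resolve_entry_point(spec: str):
    """Resolve a 'module:Class' string, e.g.
    'src.data_sources:NoPPDataSource', to the class object."""
    module_name, _, class_name = spec.partition(":")
    module = importlib.import_module(module_name)  # import the module
    return getattr(module, class_name)             # look up the class

# e.g., data_source_cls = resolve_entry_point("src.data_sources:NoPPDataSource")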
41 changes: 28 additions & 13 deletions src/core.py
@@ -1146,9 +1146,15 @@ def main(self):

for case_name, case in self.cases.items():
if not case.failed:
_log.info("### %s: requesting data for case '%s'.",
self.full_name, case_name)
case.request_data()
if type(case).__name__ == 'NoPPDataSource':
_log.info("### %s: skipping data preprocessing for case '%s'. "
"Variables will not be renamed, and level extraction "
"will not be performed on 4-D fields.",
self.full_name, case_name)
else:
_log.info("### %s: requesting data for case '%s'.",
self.full_name, case_name)
case.request_data()
else:
_log.info(("### %s: initialization for case '%s' failed; skipping "
f"data request."), self.full_name, case_name)
@@ -1172,28 +1178,37 @@
else:
# Import multirun methods here to avoid circular import problems
# e.g., multirun.py inherits from diagnostic.py which inherits from core.py
from src.diagnostic import MultirunDiagnostic
from src.diagnostic import MultirunDiagnostic, MultirunNoPPDiagnostic
pod_dict = dict.fromkeys(self.pod_list, [])
for pod in pod_dict.keys():
if self.DataSource._PreprocessorClass.__name__ != 'MultirunNullPreprocessor':
pod_dict[pod] = MultirunDiagnostic.from_config(pod, parent=self)
# Initialize the pod as a MultirunDiagnostic object
# Attach the caselist dict, and append case-specific attributes to each case object
# Set the POD attributes including paths, pod_env_vars, and the convention
# Append the varlist and import variable information from the pod settings file
pod_dict[pod] = MultirunDiagnostic.from_config(pod, parent=self)
else: # initialize noPP object
pod_dict[pod] = MultirunNoPPDiagnostic.from_config(pod, parent=self)
# Translate varlist variables and metadata
# Perform data preprocessing
pod_dict[pod].setup_pod()
# query the data
# request the data
util.transfer_log_cache(close=True)
for case_name, case in pod_dict[pod].cases.items():
if not case.failed:
_log.info("### %s: requesting data for case '%s'.",
self.full_name, case_name)
case.request_data(pod_dict[pod])
else:
_log.info(("### %s: initialization for case '%s' failed; skipping "
f"data request."), self.full_name, case_name)
if type(pod_dict[pod]).__name__ == 'MultirunNoPPDiagnostic':
_log.info("### %s: skipping data preprocessing for POD '%s'. "
"Variables will not be renamed, and level extraction "
"will not be performed on 4-D fields.",
self.full_name, pod)
else:
for case_name, case in pod_dict[pod].cases.items():
if not case.failed:
_log.info("### %s: requesting data for case '%s'.",
self.full_name, case_name)
case.request_data(pod_dict[pod])
else:
_log.info(("### %s: initialization for case '%s' failed; skipping "
f"data request."), self.full_name, case_name)
self.pods = pod_dict
if not any(p.failed for p in self.pods.values()):
_log.info("### %s: running pods '%s'.", self.full_name, [p for p in pod_dict.keys()])
4 changes: 2 additions & 2 deletions src/data_manager.py
@@ -11,7 +11,7 @@
from abc import ABC
import pandas as pd

from src import util, core, diagnostic, preprocessor, pod_setup
from src import util, core, varlistentry_util, diagnostic, pod_setup, preprocessor
from src import query_fetch_preprocess as qfp
_log = logging.getLogger(__name__)

@@ -562,7 +562,7 @@ def _expt_df(self, obj, var_iterator, col_group, parent_id=None, obj_name=None):
obj_name = obj.name

for v in var_iterator:
if v.stage < diagnostic.VarlistEntryStage.QUERIED:
if v.stage < varlistentry_util.VarlistEntryStage.QUERIED:
continue
rows = set([])
for d_key in v.iter_data_keys():
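
The commit message notes that varlist and varlistentry utilities were moved to separate modules to prevent circular imports; the ``varlistentry_util.VarlistEntryStage`` reference above comes from that change. A minimal sketch of the extraction pattern (the stage names other than QUERIED are illustrative assumptions; the real module defines more than this):

# varlistentry_util.py (sketch): shared definitions live in a leaf module
# with no framework imports, so data_manager and diagnostic can both
# import it without importing each other.
import enum

class VarlistEntryStage(enum.IntEnum):
    """Ordered stages a varlist entry passes through."""
    NOTSET = enum.auto()
    INITED = enum.auto()
    QUERIED = enum.auto()       # referenced in the data_manager diff above
    FETCHED = enum.auto()
    PREPROCESSED = enum.auto()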
2 changes: 1 addition & 1 deletion src/data_model.py
@@ -9,7 +9,7 @@
import itertools
import typing
from src import util
import src.units # fully qualify name to reduce confusion with "units" attributes
import src.units # fully qualify name to reduce confusion with "units" attributes
import src.core
import logging
_log = logging.getLogger(__name__)
48 changes: 43 additions & 5 deletions src/data_sources.py
@@ -5,7 +5,7 @@
import os
import collections
import dataclasses
from src import util, multirun, core, diagnostic, xr_parser, preprocessor, cmip6
from src import util, multirun, core, diagnostic, preprocessor, xr_parser, cmip6
from src import data_manager as dm
from src import query_fetch_preprocess as qfp
import pandas as pd
@@ -29,7 +29,7 @@


@util.regex_dataclass(sample_data_regex)
class SampleDataFile():
class SampleDataFile:
"""Dataclass describing catalog entries for sample model data files.
"""
sample_dataset: str = util.MANDATORY
@@ -117,14 +117,25 @@ def CATALOG_DIR(self):
# ----------------------------------------------------------------------------


class NoPPDataSource(SampleLocalFileDataSource):
"""DataSource for handling POD sample model data stored on a local filesystem.
"""
# _FileRegexClass = SampleDataFile
# _AttributesClass = SampleDataAttributes
# col_spec = sampleLocalFileDataSource_col_spec
_DiagnosticClass = diagnostic.NoPPDiagnostic
_PreprocessorClass = preprocessor.NullPreprocessor


# ----------------------------------------------------------------------------

class MultirunSampleLocalFileDataSource(multirun.MultirunSingleLocalFileDataSource, SampleLocalFileDataSource):
"""DataSource for handling POD sample model data stored on a local filesystem.
Duplicate of SampleLocalFileDataSource, but need to route to multirun parent data source classes
"""
# No-op -- just inherit attributes, properties, and route to __init__ methods in parent classes
pass


class MultirunLocalFileDataSource(MultirunSampleLocalFileDataSource,
qfp.MultirunDataSourceQFPMixin
):
@@ -133,8 +144,8 @@ class MultirunLocalFileDataSource(MultirunSampleLocalFileDataSource,
# _FileRegexClass = SampleDataFile # fields inherited from SampleLocalFileDataSource
# _AttributesClass = SampleDataAttributes
# col_spec = sampleLocalFileDataSource_col_spec
_DiagnosticClass = diagnostic.MultirunDiagnostic
varlist: diagnostic.MultirunVarlist = None
_DiagnosticClass = diagnostic.MultirunDiagnostic
# Override data_manager:DataSourceBase init method

def __init__(self, case_dict, parent):
@@ -157,6 +168,33 @@ def _children(self):
yield from self.varlist.iter_vars()


class MultirunNoPPDataSource(MultirunSampleLocalFileDataSource, qfp.MultirunDataSourceQFPMixin):
"""DataSource for handling Multirun POD data that won't be preprocessed
"""
# No-op -- just inherit attributes, properties, and route to __init__ methods in parent classes
_PreprocessorClass = preprocessor.MultirunNullPreprocessor
varlist: diagnostic.MultirunVarlist = None

def __init__(self, case_dict, parent):
# _id = util.MDTF_ID() # attrs inherited from core.MDTFObjectBase
# name: str
# _parent: object
# log = util.MDTFObjectLogger
# status: ObjectStatus
# initialize data source atts and methods from parent classes
super(MultirunNoPPDataSource, self).__init__(case_dict, parent)

core.MDTFObjectBase.__init__(
self, name=case_dict['CASENAME'], _parent=parent
)

@property
def _children(self):
"""Iterable of the multirun varlist that is associated with the data source object
"""
yield from self.varlist.iter_vars()


class MetadataRewriteParser(xr_parser.DefaultDatasetParser):
"""After loading and parsing the metadata on dataset *ds* but before
applying the preprocessing functions, update attrs on *ds* with the new
@@ -686,7 +724,7 @@ def resolve_var_expt(self, df, obj):
# NB need to pass list to iloc to get a pd.DataFrame instead of pd.Series
df = df.sort_values(col_name).iloc[[0]]
obj.log.debug("Selected experiment attribute '%s'='%s' for %s.",
col_name, df[col_name].iloc[0], obj.name)
col_name, df[col_name].iloc[0], obj.name)
return df


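
src/preprocessor.py, where the commit message says the NullPreprocessor and MultirunNullPreprocessor classes referenced above are added, is among the changed files not shown in this view. A minimal sketch of the idea (method names and signatures here are assumptions for illustration, not the framework's actual preprocessor API):

class NullPreprocessor:
    """Sketch of a preprocessor that does nothing, so raw model files
    reach the POD untouched."""

    def __init__(self, data_mgr, pod):
        # Skip the setup a real preprocessor performs; keep references
        # so required attributes exist.
        self.data_mgr = data_mgr
        self.pod = pod

    def edit_request(self, data_mgr, pod):
        """No-op: leave the POD's data request unmodified."""
        pass

    def process(self, var):
        """No-op: variables are not renamed, levels are not extracted,
        and the date range is not cropped."""
        pass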