
Commit

Merge pull request #2 from WEgeophysics/dev
config hydro-learn
earthai-tech committed Sep 20, 2023
2 parents fc85aa7 + 8bb537d commit 4d6ca91
Showing 6 changed files with 357 additions and 15 deletions.
15 changes: 6 additions & 9 deletions README.md
@@ -4,23 +4,23 @@
## Overview

*Hydro-learn* is a Python-based package for solving hydro-geology engineering issues. From methodologies based on
-Machine Learning,It brings novel approaches for reducing numerous losses during the hydrogeological
+Machine Learning, it brings novel approaches for reducing numerous losses during the hydrogeological
exploration projects. It allows you to:
-- reduce the cost of permeability coefficient (k) data collection during the engineering projects,
+- reduce the cost of hydraulic conductivity (K) data collection during the engineering projects,
- guide drillers in locating the drilling operations,
- predict the water content in the well, such as the level of water inrush, ...

## Licence

-*WATex* is under [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause) License.
+*hydro-learn* is under [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause) License.

## Installation

Python 3.10+ is recommended.

-## Demos
+## Demo

-### Predict permeability coefficient ``K`` from logging dataset using MXS approach
+### Predict hydraulic conductivity ``K`` from a logging dataset using the MXS approach

MXS stands for mixture learning strategy. It uses upstream unsupervised learning for
``k``-aquifer similarity label prediction and supervised learning for
@@ -52,8 +52,6 @@
Out[4]: array([1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2])
ymxs[62:74]
Out[5]: array([ 0, 0, 0, 0, 12, 12, 12, 12, 12, 12, 12, 12])
```
-To understand the transformation from NGA to MXS target (``ymxs``), please, have a look
-of the following [paper](http://dx.doi.org/10.2139/ssrn.4326365).
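The steps collapsed in the hunk above boil down to fetching the logging data and building the MXS target. A minimal sketch follows; `load_mxs`, `predict_NGA_labels`, and `make_MXS_labels` are the names exported by the package, but the exact signatures and keyword names used here are assumptions, not the confirmed API:

```python
from hlearn.datasets import load_mxs
from hlearn.utils import make_MXS_labels, predict_NGA_labels

# Load predictors X and true k-labels y (``return_X_y`` is an assumed keyword).
X, y = load_mxs(return_X_y=True)
# Upstream unsupervised step: predict the NGA similarity labels.
y_nga = predict_NGA_labels(X)                   # signature assumed
# Merge the NGA labels and the true labels into the MXS target.
ymxs = make_MXS_labels(y_true=y, y_nga=y_nga)   # keyword names assumed
```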
Once the MXS target is predicted, we call the ``make_naive_pipe`` function to
impute, scale, and transform the predictor ``X`` at once into a compressed sparse
matrix ready for final prediction using the [support vector machines](https://ieeexplore.ieee.org/document/708428) and
@@ -78,8 +76,7 @@
Out[8]: 0.9636363636363636
```
As we can see, the results of ``k`` prediction are quite satisfactory for our
toy example using only two boreholes data. Note that things can become more
-interesting when using many boreholes data. For more in
-depth, visit our [examples page](https://watex.readthedocs.io/en/latest/glr_examples/index.html).
+interesting when using many boreholes data.
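For completeness, the collapsed prediction step can be sketched as follows, reusing ``X`` and ``ymxs`` from the snippet above. Here scikit-learn's `SVC` stands in for the cited support vector machines, and the `make_naive_pipe(X, transform=True)` call pattern is an assumption rather than the confirmed signature:

```python
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from hlearn.utils import make_naive_pipe

# Impute, scale, and transform X into a compressed sparse matrix in one call
# (``transform=True`` is an assumed keyword).
Xt = make_naive_pipe(X, transform=True)

X_train, X_test, y_train, y_test = train_test_split(Xt, ymxs, random_state=42)
clf = SVC().fit(X_train, y_train)
clf.score(X_test, y_test)   # ~0.96 on the two-borehole toy example above
```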


## Contributions
224 changes: 224 additions & 0 deletions hlearn/__init__.py
@@ -0,0 +1,224 @@
# -*- coding: utf-8 -*-
# License: BSD-3-Clause
# Author: L. Kouadio <etanoyau@gmail.com>

from __future__ import annotations
import os
import sys
import logging
import random
import warnings

# set the package name for consistency checker
sys.path.insert(0, os.path.dirname(__file__))
for p in ('.', '..', './hlearn'):
    sys.path.insert(0, os.path.abspath(p))

# assert package
if __package__ is None:
    sys.path.append(os.path.dirname(__file__))
    __package__ = 'hlearn'

# configure the logger file
# from ._hlearnlog import hlearnlog
try:
    conffile = os.path.join(
        os.path.dirname(__file__), "hlearn/hlog.yml")
    if not os.path.isfile(conffile):
        # no config at the nested path: fall back to the package root below
        raise FileNotFoundError(conffile)
except Exception:
    conffile = os.path.join(
        os.path.dirname(__file__), "hlog.yml")

# generated version by setuptools_scm
__version__ = '0.1.0'

# set logging level
logging.getLogger(__name__)  # .setLevel(logging.WARNING)
# disable the matplotlib font manager logger.
logging.getLogger('matplotlib.font_manager').disabled = True
# or just suppress the DEBUG messages, but not the others, from that logger:
# logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR)

# setting up
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True")

# Workaround issue discovered in intel-openmp 2019.5:
# https://github.com/ContinuumIO/anaconda-issues/issues/11294
os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")

# https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/
try:
    # This variable is injected in the __builtins__ by the build process.
    __HLEARN_SETUP__  # type: ignore
except NameError:
    __HLEARN_SETUP__ = False

if __HLEARN_SETUP__:
    sys.stderr.write("Partial import of hlearn during the build process.\n")
else:
    from . import _distributor_init  # noqa: F401
    from . import _build  # noqa: F401
    from .utils._show_versions import show_versions

#https://github.com/pandas-dev/pandas
# Let users know if they're missing any of our hard dependencies
_main_dependencies = ("numpy", "scipy", "sklearn", "matplotlib",
                      "pandas", "seaborn")
_missing_dependencies = []

for _dependency in _main_dependencies:
    try:
        __import__(_dependency)
    except ImportError as _e:  # pragma: no cover
        _missing_dependencies.append(
            f"{'scikit-learn' if _dependency == 'sklearn' else _dependency}: {_e}")

if _missing_dependencies:  # pragma: no cover
    raise ImportError(
        "Unable to import required dependencies:\n"
        + "\n".join(_missing_dependencies)
    )
del _main_dependencies, _dependency, _missing_dependencies

# Try to suppress pandas future warnings
# and reduce verbosity.
# Setup hlearn public API
with warnings.catch_warnings():
    warnings.filterwarnings(action='ignore', category=UserWarning)
    import hlearn.externals as sklearn

from .datasets import (
    fetch_data,
)
from .methods import (
    Structural,
    Structures,
    MXS,
)

from .view import (
    EvalPlot,
    plotLearningInspections,
    plotSilhouette,
    plotDendrogram,
    plotProjection,
)

from .utils import (
    read_data,
    cleaner,
    reshape,
    to_numeric_dtypes,
    smart_label_classifier,
    select_base_stratum,
    reduce_samples,
    make_MXS_labels,
    predict_NGA_labels,
    classify_k,
    plot_elbow,
    plot_clusters,
    plot_pca_components,
    plot_naive_dendrogram,
    plot_learning_curves,
    plot_confusion_matrices,
    plot_sbs_feature_selection,
    plot_regularization_path,
    plot_rf_feature_importances,
    plot_logging,
    plot_silhouette,
    plot_profiling,
    plot_confidence_in,
)

try:
    from .utils import (
        selectfeatures,
        naive_imputer,
        naive_scaler,
        make_naive_pipe,
        bi_selector,
    )
except ImportError:
    pass

def setup_module(module):
    """Fixture for the tests to assure globally controllable seeding of RNGs."""
    import numpy as np

    # Check if a random seed exists in the environment; if not, create one.
    _random_seed = os.environ.get("hlearn_SEED", None)
    if _random_seed is None:
        _random_seed = np.random.uniform() * np.iinfo(np.int32).max
    _random_seed = int(_random_seed)
    print("I: Seeding RNGs with %r" % _random_seed)
    np.random.seed(_random_seed)
    random.seed(_random_seed)

__doc__= """\
hydro-learn: An intelligent solver for hydrogeology engineering issues
=======================================================================
Hydro-learn is a Python-based package for solving hydro-geology engineering
issues. From methodologies based on Machine Learning,It brings novel
approaches for reducing numerous losses during the hydrogeological
exploration projects. It allows to:
- reduce the cost of permeability coefficient (k) data collection during the
engineering projects,
- guide drillers for to locating the drilling operations,
- predict the water content in the well such as the level of water inrush, ...
.. _hlearn: https://github.com/WEgeophysics/hydro-learn/
"""
# __all__ is used to expose a subset of the public API;
# the public API is determined based on the documentation.

__all__ = [
    "sklearn",
    "fetch_data",
    "Structural",
    "Structures",
    "MXS",
    "EvalPlot",
    "plotLearningInspections",
    "plotSilhouette",
    "plotDendrogram",
    "plotProjection",
    "plotAnomaly",
    "vesSelector",
    "erpSelector",
    "read_data",
    "erpSmartDetector",
    "plot_confidence_in",
    "reshape",
    "to_numeric_dtypes",
    "smart_label_classifier",
    "select_base_stratum",
    "reduce_samples",
    "make_MXS_labels",
    "predict_NGA_labels",
    "classify_k",
    "plot_elbow",
    "plot_clusters",
    "plot_pca_components",
    "plot_naive_dendrogram",
    "plot_learning_curves",
    "plot_confusion_matrices",
    "plot_sbs_feature_selection",
    "plot_regularization_path",
    "plot_rf_feature_importances",
    "plot_logging",
    "plot_silhouette",
    "plot_profiling",
    "selectfeatures",
    "naive_imputer",
    "naive_scaler",
    "make_naive_pipe",
    "bi_selector",
    "show_versions",
    "cleaner",
]
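Since the import-time dependency check above raises immediately on any missing hard dependency, a quick post-install smoke test is simply importing the package and printing versions. `show_versions` is re-exported above; its output format is assumed to follow the scikit-learn helper it is modeled on:

```python
import hlearn

print(hlearn.__version__)   # '0.1.0' at this commit
hlearn.show_versions()      # Python, hlearn, and dependency versions (format assumed)
```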

16 changes: 16 additions & 0 deletions hlearn/datasets/__init__.py
@@ -0,0 +1,16 @@
"""
Dataset subpackage is used to fetch data from the local machine.
"""
from .sets import (
    load_hlogs,
    load_nlogs,
    load_mxs,
    fetch_data,
)

__all__ = [
    "load_hlogs",
    "load_nlogs",
    "load_mxs",
    "fetch_data",
]
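These four names are the whole surface of the subpackage; a minimal usage sketch follows (the no-argument loader call is an assumption, since the signatures live in `hlearn.datasets.sets`):

```python
from hlearn.datasets import fetch_data, load_hlogs

# Fetch by tag; dispatches to the matching load_* function (see _config.py below).
data = fetch_data("hlogs")
# Or call a loader directly (defaults assumed).
hlogs = load_hlogs()
```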
105 changes: 105 additions & 0 deletions hlearn/datasets/_config.py
@@ -0,0 +1,105 @@
# -*- coding: utf-8 -*-
# License: BSD-3-Clause
# Author: LKouadio <etanoyau@gmail.com>

"""
Set up all datasets.
"""
from warnings import warn

from ..utils.funcutils import smart_format
from ..exceptions import DatasetError
from .._hlearnlog import hlearnlog

_logger = hlearnlog().get_hlearn_logger(__name__)

_DTAGS = (
    "hlogs",
    "nlogs",
    "mxs",
)

from .dload import (
    load_hlogs,
    load_nlogs,
    load_mxs,
)

__all__ = [
    "load_hlogs",
    "load_nlogs",
    "fetch_data",
    "load_mxs",
]

def fetch_data(tag, **kws):
    tag = _parse_tags(tag, multi_kind_dataset='nanshan')
    funcs = (load_hlogs, load_nlogs, load_mxs)
    funcns = list(map(lambda f: f.__name__.replace('load_', ''), funcs))
    if tag in funcns:
        func = funcs[funcns.index(tag)]
    else:
        raise DatasetError(
            f"Unknown data set {tag!r}. Expect {smart_format(funcns)}")

    return func(tag=tag, data_names=funcns, **kws) if callable(func) else None


fetch_data.__doc__ ="""\
Fetch dataset from `tag`.
A tag corresponds to the name area of data collection or each
level of data processing.
Parameters
------------
tag: str, ['nlogs', 'hlogs', 'mxs', ]
name of the area of data to fetch.
Returns
-------
dict, X, y : frame of :class:`~hlearn.utils.box.Boxspace` object
"""

def _parse_tags(tag, multi_kind_dataset='nanshan'):
    """Parse and sanitize tag to match the different types of datasets.

    In principle, only the 'Bagoue' dataset is allowed to contain a tag
    composed of two words, i.e. 'Bagoue' + '<kind_of_data>'. For instance,
    ``bagoue pipe`` fetches only the pipeline used for the Bagoue case-study
    data preprocessing, and so on. However, for any other kind of dataset, if
    a second word <kind_of_data> is passed, it is merely discarded.
    """
    tag = str(tag)
    t = tag.strip().split()

    if len(t) == 1:
        if t[0].lower() not in _DTAGS:
            tag = multi_kind_dataset + ' ' + t[0]

            warn(f"Fetching {multi_kind_dataset.title()!r} data without"
                 " explicitly prefixing the kind of data with the area"
                 " name will raise an error. In future, the argument"
                 f" should be '{tag}' instead.", FutureWarning
                 )
    elif len(t) > 1:
        # only the multi-kind dataset is allowed
        # to contain two words for fetching data
        if t[0].lower() != multi_kind_dataset:
            tag = t[0].lower()  # skip the second word
    return tag

from ..utils.funcutils import listing_items_format

_l = ["{:<7}: {:<7}()".format(s.upper(), 'load_' + s) for s in _DTAGS]
_LST = listing_items_format(
    _l,
    "Fetch data using 'load_<type_of_data|area_name>'-like functions",
    " or using the ufunc 'fetch_data(<type_of_data|area_name>)'.",
    inline=True, verbose=False,
)
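To make the tag parsing concrete, here is how `_parse_tags` behaves under the rules in its docstring (a worked illustration, not code from the module):

```python
from hlearn.datasets._config import _parse_tags

_parse_tags("hlogs")       # known tag -> returned unchanged: 'hlogs'
_parse_tags("mxs extra")   # two words, area != 'nanshan' -> 'mxs' (second word dropped)
_parse_tags("b1")          # unknown single word -> 'nanshan b1', with a FutureWarning
```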

File renamed without changes.