From 8bb537de2acc07cc088c4e895134c438681eb2b4 Mon Sep 17 00:00:00 2001 From: WEgeophysics Date: Wed, 20 Sep 2023 18:46:15 +0800 Subject: [PATCH] config hydro-learn --- README.md | 15 +- hlearn/__init__.py | 224 +++++++++++++++++++++++ hlearn/datasets/__init__.py | 16 ++ hlearn/datasets/_config.py | 105 +++++++++++ hlearn/methods/{hydrophy.py => hydro.py} | 0 hlearn/utils/baseutils.py | 12 +- 6 files changed, 357 insertions(+), 15 deletions(-) create mode 100644 hlearn/datasets/__init__.py create mode 100644 hlearn/datasets/_config.py rename hlearn/methods/{hydrophy.py => hydro.py} (100%) diff --git a/README.md b/README.md index 811af8d..2afe9a7 100644 --- a/README.md +++ b/README.md @@ -4,23 +4,23 @@ ## Overview *Hydro-learn* is a Python-based package for solving hydro-geology engineering issues. From methodologies based on -Machine Learning,It brings novel approaches for reducing numerous losses during the hydrogeological +Machine Learning, it brings novel approaches for reducing numerous losses during the hydrogeological exploration projects. It allows to: -- reduce the cost of permeability coefficient (k) data collection during the engineering projects, +- reduce the cost of hydraulic conductivity (K) data collection during the engineering projects, - Guide drillers for to locating the drilling operations, - predict the water content in the well such as the level of water inrush, ... ## Licence -*WATex* is under [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause) License. +*hydro-learn* is under [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause) License. ## Installation The system requires preferably Python 3.10+. -## Demos +## Demo -### Predict permeability coefficient ``K`` from logging dataset using MXS approach +### Predict hydraulic conductivity ``K`` from logging dataset using MXS approach MXS stands for mixture learning strategy. 
It uses upstream unsupervised learning for ``k`` -aquifer similarity label prediction and the supervising learning for @@ -52,8 +52,6 @@ Out[4]: array([1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2]) ymxs[62:74] Out[5]: array([ 0, 0, 0, 0, 12, 12, 12, 12, 12, 12, 12, 12]) ``` -To understand the transformation from NGA to MXS target (``ymxs``), please, have a look -of the following [paper](http://dx.doi.org/10.2139/ssrn.4326365). Once the MXS target is predicted, we call the ``make_naive_pipe`` function to impute, scale, and transform the predictor ``X`` at once into a compressed sparse matrix ready for final prediction using the [support vector machines](https://ieeexplore.ieee.org/document/708428) and @@ -78,8 +76,7 @@ Out[8]: 0.9636363636363636 ``` As we can see, the results of ``k`` prediction are quite satisfactory for our toy example using only two boreholes data. Note that things can become more -interesting when using many boreholes data. For more in -depth, visit our [examples page](https://watex.readthedocs.io/en/latest/glr_examples/index.html). +interesting when using many boreholes data. ## Contributions diff --git a/hlearn/__init__.py b/hlearn/__init__.py index e69de29..2e6d2f5 100644 --- a/hlearn/__init__.py +++ b/hlearn/__init__.py @@ -0,0 +1,224 @@ +# -*- coding: utf-8 -*- +# Licence:BSD-3-Clause +# Author: L. Kouadio + +from __future__ import annotations +import os +import sys +import logging +import random +import warnings + +# set the package name for consistency checker +sys.path.insert(0, os.path.dirname(__file__)) +for p in ('.','..' 
,'./hlearn'): + sys.path.insert(0, os.path.abspath(p)) + +# assert package +if __package__ is None: + sys.path.append( os.path.dirname(__file__)) + __package__ ='hlearn' + +# configure the logger file +# from ._hlearnlog import hlearnlog +try: + conffile = os.path.join( + os.path.dirname(__file__), "hlearn/hlog.yml") + if not os.path.isfile (conffile ): + raise +except: + conffile = os.path.join( + os.path.dirname(__file__), "hlog.yml") + +# generated version by setuptools_scm +__version__ = '0.1.0' + +# # set logging level +logging.getLogger(__name__)#.setLevel(logging.WARNING) +# disable the matplotlib font manager logger. +logging.getLogger('matplotlib.font_manager').disabled = True +# or just suppress the DEBUG messages but not the others from that logger. +# logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR) + +# setting up +os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True") + +# Workaround issue discovered in intel-openmp 2019.5: +# https://github.com/ContinuumIO/anaconda-issues/issues/11294 +os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE") + +# https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/ +try: + # This variable is injected in the __builtins__ by the build process. + __HLEARN_SETUP__ # type: ignore +except NameError: + __HLEARN_SETUP__ = False + +if __HLEARN_SETUP__ : + sys.stderr.write("Partial import of hlearn during the build process.\n") +else: + from . import _distributor_init # noqa: F401 + from . 
import _build # noqa: F401 + from .utils._show_versions import show_versions + +#https://github.com/pandas-dev/pandas +# Let users know if they're missing any of our hard dependencies +_main_dependencies = ("numpy", "scipy", "sklearn", "matplotlib", + "pandas","seaborn") +_missing_dependencies = [] + +for _dependency in _main_dependencies: + try: + __import__(_dependency) + except ImportError as _e: # pragma: no cover + _missing_dependencies.append( + f"{'scikit-learn' if _dependency=='sklearn' else _dependency }: {_e}") + +if _missing_dependencies: # pragma: no cover + raise ImportError( + "Unable to import required dependencies:\n" + "\n".join(_missing_dependencies) + ) +del _main_dependencies, _dependency, _missing_dependencies + +# Try to suppress pandas future warnings +# and reduce verbosity. +# Setup hlearn public API +with warnings.catch_warnings(): + warnings.filterwarnings(action='ignore', category=UserWarning) + import hlearn.externals as sklearn + +from .datasets import ( + fetch_data, + ) +from .methods import ( + Structural, + Structures, + MXS, + ) + +from .view import ( + EvalPlot, + plotLearningInspections, + plotSilhouette, + plotDendrogram, + plotProjection, + ) + +from .utils import ( + read_data, + cleaner, + reshape, + to_numeric_dtypes, + smart_label_classifier, + select_base_stratum , + reduce_samples , + make_MXS_labels, + predict_NGA_labels, + classify_k, + plot_elbow, + plot_clusters, + plot_pca_components, + plot_naive_dendrogram, + plot_learning_curves, + plot_confusion_matrices, + plot_sbs_feature_selection, + plot_regularization_path, + plot_rf_feature_importances, + plot_logging, + plot_silhouette, + plot_profiling, + plot_confidence_in, + ) + +try : + from .utils import ( + selectfeatures, + naive_imputer, + naive_scaler, + make_naive_pipe, + bi_selector, + ) +except ImportError : + pass + +def setup_module(module): + """Fixture for the tests to assure globally controllable seeding of RNGs""" + + import numpy as np + + # Check if a 
random seed exists in the environment, if not create one. + _random_seed = os.environ.get("hlearn_SEED", None) + if _random_seed is None: + _random_seed = np.random.uniform() * np.iinfo(np.int32).max + _random_seed = int(_random_seed) + print("I: Seeding RNGs with %r" % _random_seed) + np.random.seed(_random_seed) + random.seed(_random_seed) + +__doc__= """\ +hydro-learn: An intelligent solver for hydrogeology engineering issues +======================================================================= + +Hydro-learn is a Python-based package for solving hydro-geology engineering +issues. From methodologies based on Machine Learning,It brings novel +approaches for reducing numerous losses during the hydrogeological +exploration projects. It allows to: + +- reduce the cost of permeability coefficient (k) data collection during the + engineering projects, +- guide drillers for to locating the drilling operations, +- predict the water content in the well such as the level of water inrush, ... + +.. _hlearn: https://github.com/WEgeophysics/hydro-learn/ + +""" +# __all__ is used to display a few public API. +# the public API is determined +# based on the documentation. 
+ +__all__ = [ + "sklearn", + "fetch_data", + "Structural", + "Structures", + "MXS", + "EvalPlot", + "plotLearningInspections", + "plotSilhouette", + "plotDendrogram", + "plotProjection", + "plotAnomaly", + "vesSelector", + "erpSelector", + "read_data", + "erpSmartDetector", + "plot_confidence_in", + "reshape", + "to_numeric_dtypes", + "smart_label_classifier", + "select_base_stratum" , + "reduce_samples" , + "make_MXS_labels", + "predict_NGA_labels", + "classify_k", + "plot_elbow", + "plot_clusters", + "plot_pca_components", + "plot_naive_dendrogram", + "plot_learning_curves", + "plot_confusion_matrices", + "plot_sbs_feature_selection", + "plot_regularization_path", + "plot_rf_feature_importances", + "plot_logging", + "plot_silhouette", + "plot_profiling", + "selectfeatures", + "naive_imputer", + "naive_scaler", + "make_naive_pipe", + "bi_selector", + "show_versions", + "cleaner", + ] + diff --git a/hlearn/datasets/__init__.py b/hlearn/datasets/__init__.py new file mode 100644 index 0000000..01f519a --- /dev/null +++ b/hlearn/datasets/__init__.py @@ -0,0 +1,16 @@ +""" +Dataset subpackage is used to fetch data from the local machine. +""" +from .sets import ( + load_hlogs, + load_nlogs, + load_mxs, + fetch_data, + ) + +__all__=[ + "load_hlogs", + "load_nlogs", + "load_mxs", + "fetch_data", + ] \ No newline at end of file diff --git a/hlearn/datasets/_config.py b/hlearn/datasets/_config.py new file mode 100644 index 0000000..7679743 --- /dev/null +++ b/hlearn/datasets/_config.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- +# License: BSD-3-Clause +# Author: LKouadio + +""" +Set all dataset. 
+""" +from warnings import warn + +from ..utils.funcutils import ( + smart_format + ) +from ..exceptions import DatasetError +from .._hlearnlog import hlearnlog + +_logger = hlearnlog().get_hlearn_logger(__name__) + +_DTAGS=( + "hlogs", + "nlogs", + "mxs", + ) + +from .dload import ( + load_hlogs, + load_nlogs, + load_mxs, + ) + +__all__=[ + + "load_hlogs", + "load_nlogs", + "fetch_data", + "load_mxs", + + ] + +def fetch_data (tag, **kws): + tag = _parse_tags(tag, multi_kind_dataset='nanshan') + funcs= ( load_hlogs, load_nlogs, load_mxs ) + funcns = list (map(lambda f: f.__name__.replace('load_', ''), funcs)) + if tag in (funcns): + func = funcs[funcns.index (tag)] + else : raise DatasetError( + f"Unknown data set {tag!r}. Expect {smart_format( funcns)}") + + return func (tag=tag, data_names=funcns, **kws) if callable (func) else None + + +fetch_data.__doc__ ="""\ +Fetch dataset from `tag`. + +A tag corresponds to the name area of data collection or each +level of data processing. + +Parameters +------------ +tag: str, ['nlogs', 'hlogs', 'mxs', ] + name of the area of data to fetch. + +Returns +------- +dict, X, y : frame of :class:`~hlearn.utils.box.Boxspace` object + +""" + +def _parse_tags (tag, multi_kind_dataset ='nanshan'): + """ Parse and sanitize tag to match the different types of datasets. + + In principle, only the 'Bagoue' dataset is allowed to contain a tag + composed of two words i.e. 'Bagoue' + ''. For instance + ``bagoue pipe`` fetches only the pipeline used for Bagoue case study + data preprocessing and so on. + However, for other types of dataset, if a second word is + passed, it should merely be discarded. + """ + tag = str(tag); t = tag.strip().split() + + if len(t) ==1 : + if t[0].lower() not in _DTAGS: + tag = multi_kind_dataset +' ' + t[0] + + warn(f"Fetching {multi_kind_dataset.title()!r} data without" + " explicitly prefixing the kind of data with the area" + " name will raise an error. 
In future, the argument" + f" should be '{tag}' instead.", FutureWarning + ) + elif len(t) >1 : + # only the multi kind dataset is allowed + # to contain two words for fetching data + if t[0].lower() !=multi_kind_dataset: + tag = t[0].lower() # skip the second word + return tag + +from ..utils.funcutils import listing_items_format + +_l=[ "{:<7}: {:<7}()".format(s.upper() , 'load_'+s ) for s in _DTAGS ] +_LST = listing_items_format( + _l, + "Fetch data using 'load_'like", + " or using ufunc 'fetch_data ()'.", + inline=True , verbose= False, +) + diff --git a/hlearn/methods/hydrophy.py b/hlearn/methods/hydro.py similarity index 100% rename from hlearn/methods/hydrophy.py rename to hlearn/methods/hydro.py diff --git a/hlearn/utils/baseutils.py b/hlearn/utils/baseutils.py index 7159d17..552a325 100644 --- a/hlearn/utils/baseutils.py +++ b/hlearn/utils/baseutils.py @@ -106,7 +106,7 @@ def read_data ( load text file. np.load Load uncompressed or compressed numpy `.npy` and `.npz` formats. - watex.utils.baseutils.save_or_load: + hlearn.utils.baseutils.save_or_load: Save or load numpy arrays. """ @@ -285,7 +285,7 @@ def array2hdf5 ( Examples ---------- >>> import numpy as np - >>> from watex.utils.baseutils import array2hdf5 + >>> from hlearn.utils.baseutils import array2hdf5 >>> data = np.random.randn (100, 27 ) >>> array2hdf5 ('test.h5', data ) >>> load_data = array2hdf5 ( 'test.h5', data, task ='load') @@ -329,7 +329,7 @@ def lowertify (*values, strip = True, return_origin: bool =... ): :return: value in lowercase and original value. 
:Example: - >>> from watex.utils.baseutils import lowertify + >>> from hlearn.utils.baseutils import lowertify >>> lowertify ( 'KIND') Out[19]: ('kind',) >>> lowertify ( "KIND", return_origin =True ) @@ -409,7 +409,7 @@ def save_or_load( Examples ---------- >>> import numpy as np - >>> from watex.utils.baseutils import save_or_load + >>> from hlearn.utils.baseutils import save_or_load >>> data = np.random.randn (2, 7) >>> # save to txt >>> save_or_load ( "test.txt" , data) @@ -535,8 +535,8 @@ def get_remote_data( rfile: str or PathLike-object Full path to the remote file. It can be the path to the repository root toward the file name. For instance, to retrieve the file - ``'AGSO.csv'`` which is located in ``watex/etc/`` directory then the - full path should be ``'watex/etc/AGSO.csv'`` + ``'AGSO.csv'`` which is located in ``hlearn/etc/`` directory then the + full path should be ``'hlearn/etc/AGSO.csv'`` savepath: str, optional Full path to place where to downloaded files should be located.