
Commit

Merge pull request #2 from WEgeophysics/dev
config hydro-learn
earthai-tech committed Sep 20, 2023
2 parents fc85aa7 + 8bb537d commit 4d6ca91
Showing 6 changed files with 357 additions and 15 deletions.
15 changes: 6 additions & 9 deletions README.md
@@ -4,23 +4,23 @@
## Overview

*Hydro-learn* is a Python-based package for solving hydro-geology engineering issues. From methodologies based on
-Machine Learning,It brings novel approaches for reducing numerous losses during the hydrogeological
+Machine Learning, it brings novel approaches for reducing numerous losses during the hydrogeological
exploration projects. It allows you to:
-- reduce the cost of permeability coefficient (k) data collection during the engineering projects,
+- reduce the cost of hydraulic conductivity (K) data collection during the engineering projects,
- guide drillers in locating the drilling operations,
- predict the water content in the well, such as the level of water inrush, ...

## Licence

-*WATex* is under [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause) License.
+*hydro-learn* is under [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause) License.

## Installation

Python 3.10+ is recommended.

-## Demos
+## Demo

-### Predict permeability coefficient ``K`` from logging dataset using MXS approach
+### Predict hydraulic conductivity ``K`` from a logging dataset using the MXS approach

MXS stands for mixture learning strategy. It uses upstream unsupervised learning for
``k``-aquifer similarity label prediction and supervised learning for
@@ -52,8 +52,6 @@
Out[4]: array([1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2])
ymxs[62:74]
Out[5]: array([ 0, 0, 0, 0, 12, 12, 12, 12, 12, 12, 12, 12])
```
-To understand the transformation from NGA to MXS target (``ymxs``), please, have a look
-of the following [paper](http://dx.doi.org/10.2139/ssrn.4326365).
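The steps collapsed in the hunk above boil down to fetching the logging data and building the MXS target. A minimal sketch follows; `load_mxs`, `predict_NGA_labels`, and `make_MXS_labels` are the names exported by the package, but the exact signatures and keyword names used here are assumptions, not the confirmed API:

```python
from hlearn.datasets import load_mxs
from hlearn.utils import make_MXS_labels, predict_NGA_labels

# Load predictors X and true k-labels y (``return_X_y`` is an assumed keyword).
X, y = load_mxs(return_X_y=True)
# Upstream unsupervised step: predict the NGA similarity labels.
y_nga = predict_NGA_labels(X)                   # signature assumed
# Merge the NGA labels and the true labels into the MXS target.
ymxs = make_MXS_labels(y_true=y, y_nga=y_nga)   # keyword names assumed
```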
Once the MXS target is predicted, we call the ``make_naive_pipe`` function to
impute, scale, and transform the predictor ``X`` at once into a compressed sparse
matrix ready for final prediction using the [support vector machines](https://ieeexplore.ieee.org/document/708428) and
@@ -78,8 +76,7 @@
Out[8]: 0.9636363636363636
```
As we can see, the results of ``k`` prediction are quite satisfactory for our
toy example using only two boreholes data. Note that things can become more
-interesting when using many boreholes data. For more in
-depth, visit our [examples page](https://watex.readthedocs.io/en/latest/glr_examples/index.html).
+interesting when using many boreholes data.
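For completeness, the collapsed prediction step can be sketched as follows, reusing ``X`` and ``ymxs`` from the snippet above. Here scikit-learn's `SVC` stands in for the cited support vector machines, and the `make_naive_pipe(X, transform=True)` call pattern is an assumption rather than the confirmed signature:

```python
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from hlearn.utils import make_naive_pipe

# Impute, scale, and transform X into a compressed sparse matrix in one call
# (``transform=True`` is an assumed keyword).
Xt = make_naive_pipe(X, transform=True)

X_train, X_test, y_train, y_test = train_test_split(Xt, ymxs, random_state=42)
clf = SVC().fit(X_train, y_train)
clf.score(X_test, y_test)   # ~0.96 on the two-borehole toy example above
```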


## Contributions
224 changes: 224 additions & 0 deletions hlearn/__init__.py
@@ -0,0 +1,224 @@
# -*- coding: utf-8 -*-
# License: BSD-3-Clause
# Author: L. Kouadio <etanoyau@gmail.com>

from __future__ import annotations
import os
import sys
import logging
import random
import warnings

# set the package name for consistency checker
sys.path.insert(0, os.path.dirname(__file__))
for p in ('.', '..', './hlearn'):
    sys.path.insert(0, os.path.abspath(p))

# assert package
if __package__ is None:
    sys.path.append(os.path.dirname(__file__))
    __package__ = 'hlearn'

# configure the logger file
# from ._hlearnlog import hlearnlog
try:
    conffile = os.path.join(
        os.path.dirname(__file__), "hlearn/hlog.yml")
    if not os.path.isfile(conffile):
        # no config at the nested path: fall back to the package root below
        raise FileNotFoundError(conffile)
except Exception:
    conffile = os.path.join(
        os.path.dirname(__file__), "hlog.yml")

# generated version by setuptools_scm
__version__ = '0.1.0'

# set logging level
logging.getLogger(__name__)  # .setLevel(logging.WARNING)
# disable the matplotlib font manager logger.
logging.getLogger('matplotlib.font_manager').disabled = True
# or just suppress the DEBUG messages, but not the others, from that logger:
# logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR)

# setting up
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True")

# Workaround issue discovered in intel-openmp 2019.5:
# https://github.com/ContinuumIO/anaconda-issues/issues/11294
os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")

# https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/
try:
    # This variable is injected in the __builtins__ by the build process.
    __HLEARN_SETUP__  # type: ignore
except NameError:
    __HLEARN_SETUP__ = False

if __HLEARN_SETUP__:
    sys.stderr.write("Partial import of hlearn during the build process.\n")
else:
    from . import _distributor_init  # noqa: F401
    from . import _build  # noqa: F401
    from .utils._show_versions import show_versions

#https://github.com/pandas-dev/pandas
# Let users know if they're missing any of our hard dependencies
_main_dependencies = ("numpy", "scipy", "sklearn", "matplotlib",
                      "pandas", "seaborn")
_missing_dependencies = []

for _dependency in _main_dependencies:
    try:
        __import__(_dependency)
    except ImportError as _e:  # pragma: no cover
        _missing_dependencies.append(
            f"{'scikit-learn' if _dependency == 'sklearn' else _dependency}: {_e}")

if _missing_dependencies:  # pragma: no cover
    raise ImportError(
        "Unable to import required dependencies:\n"
        + "\n".join(_missing_dependencies)
    )
del _main_dependencies, _dependency, _missing_dependencies

# Try to suppress pandas future warnings
# and reduce verbosity.
# Setup hlearn public API
with warnings.catch_warnings():
    warnings.filterwarnings(action='ignore', category=UserWarning)
    import hlearn.externals as sklearn

from .datasets import (
    fetch_data,
)
from .methods import (
    Structural,
    Structures,
    MXS,
)

from .view import (
    EvalPlot,
    plotLearningInspections,
    plotSilhouette,
    plotDendrogram,
    plotProjection,
)

from .utils import (
    read_data,
    cleaner,
    reshape,
    to_numeric_dtypes,
    smart_label_classifier,
    select_base_stratum,
    reduce_samples,
    make_MXS_labels,
    predict_NGA_labels,
    classify_k,
    plot_elbow,
    plot_clusters,
    plot_pca_components,
    plot_naive_dendrogram,
    plot_learning_curves,
    plot_confusion_matrices,
    plot_sbs_feature_selection,
    plot_regularization_path,
    plot_rf_feature_importances,
    plot_logging,
    plot_silhouette,
    plot_profiling,
    plot_confidence_in,
)

try:
    from .utils import (
        selectfeatures,
        naive_imputer,
        naive_scaler,
        make_naive_pipe,
        bi_selector,
    )
except ImportError:
    pass

def setup_module(module):
    """Fixture for the tests to assure globally controllable seeding of RNGs."""
    import numpy as np

    # Check if a random seed exists in the environment; if not, create one.
    _random_seed = os.environ.get("hlearn_SEED", None)
    if _random_seed is None:
        _random_seed = np.random.uniform() * np.iinfo(np.int32).max
    _random_seed = int(_random_seed)
    print("I: Seeding RNGs with %r" % _random_seed)
    np.random.seed(_random_seed)
    random.seed(_random_seed)

__doc__= """\
hydro-learn: An intelligent solver for hydrogeology engineering issues
=======================================================================
Hydro-learn is a Python-based package for solving hydro-geology engineering
issues. From methodologies based on Machine Learning,It brings novel
approaches for reducing numerous losses during the hydrogeological
exploration projects. It allows to:
- reduce the cost of permeability coefficient (k) data collection during the
engineering projects,
- guide drillers for to locating the drilling operations,
- predict the water content in the well such as the level of water inrush, ...
.. _hlearn: https://github.com/WEgeophysics/hydro-learn/
"""
# __all__ is used to expose a subset of the public API;
# the public API is determined based on the documentation.

__all__ = [
    "sklearn",
    "fetch_data",
    "Structural",
    "Structures",
    "MXS",
    "EvalPlot",
    "plotLearningInspections",
    "plotSilhouette",
    "plotDendrogram",
    "plotProjection",
    "plotAnomaly",
    "vesSelector",
    "erpSelector",
    "read_data",
    "erpSmartDetector",
    "plot_confidence_in",
    "reshape",
    "to_numeric_dtypes",
    "smart_label_classifier",
    "select_base_stratum",
    "reduce_samples",
    "make_MXS_labels",
    "predict_NGA_labels",
    "classify_k",
    "plot_elbow",
    "plot_clusters",
    "plot_pca_components",
    "plot_naive_dendrogram",
    "plot_learning_curves",
    "plot_confusion_matrices",
    "plot_sbs_feature_selection",
    "plot_regularization_path",
    "plot_rf_feature_importances",
    "plot_logging",
    "plot_silhouette",
    "plot_profiling",
    "selectfeatures",
    "naive_imputer",
    "naive_scaler",
    "make_naive_pipe",
    "bi_selector",
    "show_versions",
    "cleaner",
]
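Since the import-time dependency check above raises immediately on any missing hard dependency, a quick post-install smoke test is simply importing the package and printing versions. `show_versions` is re-exported above; its output format is assumed to follow the scikit-learn helper it is modeled on:

```python
import hlearn

print(hlearn.__version__)   # '0.1.0' at this commit
hlearn.show_versions()      # Python, hlearn, and dependency versions (format assumed)
```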

16 changes: 16 additions & 0 deletions hlearn/datasets/__init__.py
@@ -0,0 +1,16 @@
"""
Dataset subpackage is used to fetch data from the local machine.
"""
from .sets import (
    load_hlogs,
    load_nlogs,
    load_mxs,
    fetch_data,
)

__all__ = [
    "load_hlogs",
    "load_nlogs",
    "load_mxs",
    "fetch_data",
]
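These four names are the whole surface of the subpackage; a minimal usage sketch follows (the no-argument loader call is an assumption, since the signatures live in `hlearn.datasets.sets`):

```python
from hlearn.datasets import fetch_data, load_hlogs

# Fetch by tag; dispatches to the matching load_* function (see _config.py below).
data = fetch_data("hlogs")
# Or call a loader directly (defaults assumed).
hlogs = load_hlogs()
```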
105 changes: 105 additions & 0 deletions hlearn/datasets/_config.py
@@ -0,0 +1,105 @@
# -*- coding: utf-8 -*-
# License: BSD-3-Clause
# Author: LKouadio <etanoyau@gmail.com>

"""
Set up all datasets.
"""
from warnings import warn

from ..utils.funcutils import smart_format
from ..exceptions import DatasetError
from .._hlearnlog import hlearnlog

_logger = hlearnlog().get_hlearn_logger(__name__)

_DTAGS = (
    "hlogs",
    "nlogs",
    "mxs",
)

from .dload import (
    load_hlogs,
    load_nlogs,
    load_mxs,
)

__all__ = [
    "load_hlogs",
    "load_nlogs",
    "fetch_data",
    "load_mxs",
]

def fetch_data(tag, **kws):
    tag = _parse_tags(tag, multi_kind_dataset='nanshan')
    funcs = (load_hlogs, load_nlogs, load_mxs)
    funcns = list(map(lambda f: f.__name__.replace('load_', ''), funcs))
    if tag in funcns:
        func = funcs[funcns.index(tag)]
    else:
        raise DatasetError(
            f"Unknown data set {tag!r}. Expect {smart_format(funcns)}")

    return func(tag=tag, data_names=funcns, **kws) if callable(func) else None


fetch_data.__doc__ ="""\
Fetch dataset from `tag`.
A tag corresponds to the name area of data collection or each
level of data processing.
Parameters
------------
tag: str, ['nlogs', 'hlogs', 'mxs', ]
name of the area of data to fetch.
Returns
-------
dict, X, y : frame of :class:`~hlearn.utils.box.Boxspace` object
"""

def _parse_tags(tag, multi_kind_dataset='nanshan'):
    """Parse and sanitize tag to match the different types of datasets.

    In principle, only the 'Bagoue' dataset is allowed to contain a tag
    composed of two words, i.e. 'Bagoue' + '<kind_of_data>'. For instance,
    ``bagoue pipe`` fetches only the pipeline used for the Bagoue case-study
    data preprocessing, and so on. However, for any other kind of dataset, if
    a second word <kind_of_data> is passed, it is merely discarded.
    """
    tag = str(tag)
    t = tag.strip().split()

    if len(t) == 1:
        if t[0].lower() not in _DTAGS:
            tag = multi_kind_dataset + ' ' + t[0]

            warn(f"Fetching {multi_kind_dataset.title()!r} data without"
                 " explicitly prefixing the kind of data with the area"
                 " name will raise an error. In future, the argument"
                 f" should be '{tag}' instead.", FutureWarning
                 )
    elif len(t) > 1:
        # only the multi-kind dataset is allowed
        # to contain two words for fetching data
        if t[0].lower() != multi_kind_dataset:
            tag = t[0].lower()  # skip the second word
    return tag

from ..utils.funcutils import listing_items_format

_l = ["{:<7}: {:<7}()".format(s.upper(), 'load_' + s) for s in _DTAGS]
_LST = listing_items_format(
    _l,
    "Fetch data using 'load_<type_of_data|area_name>'-like functions",
    " or using the ufunc 'fetch_data(<type_of_data|area_name>)'.",
    inline=True, verbose=False,
)
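To make the tag parsing concrete, here is how `_parse_tags` behaves under the rules in its docstring (a worked illustration, not code from the module):

```python
from hlearn.datasets._config import _parse_tags

_parse_tags("hlogs")       # known tag -> returned unchanged: 'hlogs'
_parse_tags("mxs extra")   # two words, area != 'nanshan' -> 'mxs' (second word dropped)
_parse_tags("b1")          # unknown single word -> 'nanshan b1', with a FutureWarning
```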

File renamed without changes.