Merge branch 'main' into 44-improve-airpls-and-arpls-performance-spar…

…se-matrix-operations
paucablop · May 20, 2024 · 02f59f2 · 02f59f2
2 parents 4f1eb69 + a291245
commit 02f59f2
Show file tree

Hide file tree

Showing 15 changed files with 1,076 additions and 802 deletions.
diff --git a/Pipfile b/Pipfile
@@ -10,6 +10,8 @@ pytest = "*"
 dtuprosys = "*"
 cairocffi = "*"
 numba = "*"
+polars = "*"
+pyarrow = "*"
 
 [dev-packages]
 black = "*"

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/chemotools/datasets/_base.py b/chemotools/datasets/_base.py
@@ -1,14 +1,22 @@
-import pandas as pd
 import os
 
+
+import pandas as pd
+import polars as pl
+
 PACKAGE_DIRECTORY = os.path.dirname(os.path.abspath(__file__))
 
 
-def load_fermentation_train():
+def load_fermentation_train(set_output="pandas"):
     """
-    Loads the training data of the fermentation dataset. This data corresponds to a synthetic dataset measured 
+    Loads the training data of the fermentation dataset. This data corresponds to a synthetic dataset measured
     off-line. This dataset is designed to represent the variability of real fermentation data.
 
+    Arguments
+    -------
+    set_output: str, default='pandas'
+        The output format of the data. It can be 'pandas' or 'polars'. If 'polars', the data is returned as a polars DataFrame.
+
     Returns
     -------
     train_spectra: pd.DataFrame A pandas DataFrame containing the synthetic spectra measured to train the model.
@@ -20,17 +28,32 @@ def load_fermentation_train():
     Mauricio Iglesias Miguel, Gernaey Krist V. Transforming data into information:
     A parallel hybrid model for real-time state estimation in lignocellulose ethanol fermentations.
     """
-    train_spectra = pd.read_csv(PACKAGE_DIRECTORY + "/data/train_spectra.csv")
-    train_spectra.columns = train_spectra.columns.astype(float)
-    train_hplc = pd.read_csv(PACKAGE_DIRECTORY + "/data/train_hplc.csv")
+    if set_output == "pandas":
+        train_spectra = pd.read_csv(PACKAGE_DIRECTORY + "/data/train_spectra.csv")
+        train_spectra.columns = train_spectra.columns.astype(float)
+        train_hplc = pd.read_csv(PACKAGE_DIRECTORY + "/data/train_hplc.csv")
+        return train_spectra, train_hplc
 
-    return train_spectra, train_hplc
+    if set_output == "polars":
+        train_spectra = pl.read_csv(PACKAGE_DIRECTORY + "/data/train_spectra.csv")
+        train_hplc = pl.read_csv(PACKAGE_DIRECTORY + "/data/train_hplc.csv")
+        return train_spectra, train_hplc
 
+    else:
+        raise ValueError(
+            "Invalid value for set_output. Please use 'pandas' or 'polars'."
+        )
 
-def load_fermentation_test():
+
+def load_fermentation_test(set_output="pandas"):
     """
     Loads the testing data of the fermentation dataset. This data corresponds to real fermentation data measured
-    on-line during a fermentation process. 
+    on-line during a fermentation process.
+
+    Arguments
+    -------
+    set_output: str, default='pandas'
+        The output format of the data. It can be 'pandas' or 'polars'. If 'polars', the data is returned as a polars DataFrame.
 
     Returns
     -------
@@ -43,27 +66,57 @@ def load_fermentation_test():
     Mauricio Iglesias Miguel, Gernaey Krist V. Transforming data into information:
     A parallel hybrid model for real-time state estimation in lignocellulose ethanol fermentations.
     """
-    fermentation_spectra = pd.read_csv(
-        PACKAGE_DIRECTORY + "/data/fermentation_spectra.csv"
-    )
-    fermentation_spectra.columns = fermentation_spectra.columns.astype(float)
-    fermentation_hplc = pd.read_csv(PACKAGE_DIRECTORY + "/data/fermentation_hplc.csv")
-
-    return fermentation_spectra, fermentation_hplc
-
-
-def load_coffee():
+    if set_output == "pandas":
+        fermentation_spectra = pd.read_csv(
+            PACKAGE_DIRECTORY + "/data/fermentation_spectra.csv"
+        )
+        fermentation_spectra.columns = fermentation_spectra.columns.astype(float)
+        fermentation_hplc = pd.read_csv(
+            PACKAGE_DIRECTORY + "/data/fermentation_hplc.csv"
+        )
+        return fermentation_spectra, fermentation_hplc
+
+    if set_output == "polars":
+        fermentation_spectra = pl.read_csv(
+            PACKAGE_DIRECTORY + "/data/fermentation_spectra.csv"
+        )
+        fermentation_hplc = pl.read_csv(
+            PACKAGE_DIRECTORY + "/data/fermentation_hplc.csv"
+        )
+        return fermentation_spectra, fermentation_hplc
+
+    else:
+        raise ValueError(
+            "Invalid value for set_output. Please use 'pandas' or 'polars'."
+        )
+
+
+def load_coffee(set_output="pandas"):
     """
-    Loads the coffee dataset. This data corresponds to a coffee spectra from three different origins 
+    Loads the coffee dataset. This data corresponds to coffee spectra from three different origins
     measured off-line using attenuated total reflectance Fourier transform infrared spectroscopy (ATR-FTIR).
 
+    Arguments
+    -------
+    set_output: str, default='pandas'
+        The output format of the data. It can be 'pandas' or 'polars'. If 'polars', the data is returned as a polars DataFrame.
+
     Returns
     -------
     coffee_spectra: pd.DataFrame A pandas DataFrame containing the coffee spectra.
     coffee_labels: pd.DataFrame A pandas DataFrame containing the corresponding labels.
     """
-
-    coffee_spectra = pd.read_csv(PACKAGE_DIRECTORY + "/data/coffee_spectra.csv")
-    coffee_labels = pd.read_csv(PACKAGE_DIRECTORY + "/data/coffee_labels.csv")
-
-    return coffee_spectra, coffee_labels
+    if set_output == "pandas":
+        coffee_spectra = pd.read_csv(PACKAGE_DIRECTORY + "/data/coffee_spectra.csv")
+        coffee_labels = pd.read_csv(PACKAGE_DIRECTORY + "/data/coffee_labels.csv")
+        return coffee_spectra, coffee_labels
+
+    if set_output == "polars":
+        coffee_spectra = pl.read_csv(PACKAGE_DIRECTORY + "/data/coffee_spectra.csv")
+        coffee_labels = pl.read_csv(PACKAGE_DIRECTORY + "/data/coffee_labels.csv")
+        return coffee_spectra, coffee_labels
+
+    else:
+        raise ValueError(
+            "Invalid value for set_output. Please use 'pandas' or 'polars'."
+        )
diff --git a/chemotools/feature_selection/_range_cut.py b/chemotools/feature_selection/_range_cut.py
@@ -34,6 +34,8 @@ class RangeCut(BaseEstimator, SelectorMixin):
     end_index_ : int
         The index of the end of the range. It is -1 if the wavenumbers are not provided.
 
+    wavenumbers_ : array-like
+        The cut wavenumbers of the input data.
 
     Methods
     -------
@@ -75,9 +77,11 @@ def fit(self, X: np.ndarray, y=None) -> "RangeCut":
         if self.wavenumbers is None:
             self.start_index_ = self.start
             self.end_index_ = self.end
+            self.wavenumbers_ = None
         else:
             self.start_index_ = self._find_index(self.start)
             self.end_index_ = self._find_index(self.end)
+            self.wavenumbers_ = self.wavenumbers[self.start_index_ : self.end_index_]
 
         return self
 

diff --git a/docs/variable_selection.md → docs/feature_selection.md b/docs/variable_selection.md → docs/feature_selection.md
@@ -1,13 +1,13 @@
 ---
-title: Variable selection
+title: Feature selection
 layout: default
 parent: Docs
 ---
 
-# __Variable selection__
-Variable selection is a preprocessing technique in spectroscopy that selects the most relevant variables. The following algorithms are available:
+# __Feature selection__
+Feature selection is a preprocessing technique in spectroscopy that selects the most relevant features. The following algorithms are available:
 - [Range cut](#range-cut)
-- [SelectFeatures](#range-cut-by-wavenumber)
+- [IndexSelector](#index-selector)
 
 {: .note }
 > The feature selection algorithms implemented in ```chemotools``` allow you to select a subset of variables/features from the spectra. They are not designed to find the most relevant variables/features for a given task. 
@@ -31,7 +31,7 @@ Range cut by index is a preprocessing technique in spectroscopy that selects all
 #### __Case 1: Range cut by index__
 
 ```python
-from chemotools.variable_selection import RangeCut
+from chemotools.feature_selection import RangeCut
 
 rcbi = RangeCut(0, 200)
 spectra_rcbi = rcbi.fit_transform(spectra)
@@ -41,20 +41,24 @@ spectra_rcbi = rcbi.fit_transform(spectra)
 
 
 ```python
-from chemotools.variable_selection import RangeCut
+from chemotools.feature_selection import RangeCut
 
 rcbw = RangeCut(950, 1100, wavenumbers=wn)
 spectra_rcbw = rcbw.fit_transform(spectra)
 ```
 
+After fitting the method with the wavenumbers, the selected wavenumbers can be accessed using the ```wavenumbers_``` attribute.
+
+
 ### __Plotting example__:
 
 <iframe src="figures/range_cut_by_index.html" width="800px" height="400px" style="border: none;"></iframe>
 
 <iframe src="figures/range_cut_by_wavenumber.html" width="800px" height="400px" style="border: none;"></iframe>
 
-## __SelectFeatures__
-SelectFeatures is a preprocessing technique in spectroscopy that selects the most relevant variables. The selected features do not need to be continuous in the spectra, but they can be located at different locations. The algorithm allows selecting the features by imputing a list of indices or wavenumbers.
+
+## __Index selector__
+IndexSelector is a preprocessing technique in spectroscopy that selects the most relevant variables. The selected features do not need to be contiguous in the spectra; they can be located at different positions. The algorithm allows selecting the features by providing a list of indices or wavenumbers.
 
 ### __Arguments__:
 
@@ -72,9 +76,9 @@ In the example below, the selected wavenumbers ```wn_select``` are used to selec
 
 
 ```python
-from chemotools.variable_selection import SelectFeatures
+from chemotools.feature_selection import IndexSelector
 
-sfbw = SelectFeatures(features=wn_select,wavenumbers=wn)
+sfbw = IndexSelector(features=wn_select,wavenumbers=wn)
 spectra_sfbw = sfbw.fit_transform(spectra)
 ```
 

diff --git a/get-started/brewing_regressor.md b/get-started/brewing_regressor.md
@@ -10,13 +10,16 @@ nav_order: 4
 
 ## What will you learn?
 
-- [Get familiar with the Fermentation dataset](#introduction)
-- [Load the fermentation dataset](#loading-the-training-dataset)
-- [Explore the fermentation dataset](#exploring-the-training-dataset)
-- [Visualize the fermentation dataset](#visualizing-the-training-dataset)
-- [Preprocess the spectra using pipelines](#preprocessing-the-training-spectra)
-- [Train a PLS model](#training-a-pls-model)
-- [Apply the model to the testing dataset](#applying-the-model-to-the-testing-dataset)
+- [__Brewing a PLS regressor__](#brewing-a-pls-regressor)
+  - [What will you learn?](#what-will-you-learn)
+  - [__Introduction__](#introduction)
+  - [__Loading the training dataset__](#loading-the-training-dataset)
+  - [__Exploring the training dataset__](#exploring-the-training-dataset)
+  - [__Visualizing the training dataset__](#visualizing-the-training-dataset)
+  - [__Preprocessing the training spectra__](#preprocessing-the-training-spectra)
+  - [__Training a PLS model__](#training-a-pls-model)
+  - [__Applying the model to the testing dataset__](#applying-the-model-to-the-testing-dataset)
+  - [__Recap__](#recap)
 
 ## __Introduction__
 Welcome to the world of spectroscopic data analysis, where we provide you with a unique insight into lignocellulosic ethanol fermentation in real-time. Our dataset comprises spectra obtained through attenuated total reflectance, mid-infrared (ATR-MIR) spectroscopy, combined with high-performance liquid chromatography (HPLC) reference data to ensure precision and accuracy.
@@ -47,6 +50,10 @@ The ```load_fermentation_train()``` function returns two ```pandas.DataFrame```:
 
 - ```hplc```: Here, you'll find HPLC measurements, specifically glucose concentrations (in g/L), stored in a single column labeled ```glucose```.
 
+{: .highlight }
+> If you are interested in working with ```polars.DataFrame``` you can simply use ```load_fermentation_train(set_output="polars")``` (chemotools>=0.1.5). Note that if you choose to work with ```polars.DataFrame``` the wavenumbers are given in the column names as ```str``` and not as ```float```. This is because ```polars``` does not support column names with types other than ```str```. Since ```df.columns``` of a ```polars.DataFrame``` is a plain Python list of ```str```, you can extract the wavenumbers as ```float``` with ```np.array(df.columns, dtype=np.float64)```.
+
+
 ## __Exploring the training dataset__
 
 Before diving into data modeling, it's essential to get familiar with your data. Start by answering basic questions: _How many samples are there?_, and _how many wavenumbers are available?_
@@ -99,7 +106,7 @@ To better understand our dataset, we employ visualization. We will plot the trai
 Up until now, we have used ```pandas.DataFrame``` to represent the dataset. ```pandas.DataFrame``` are great for storing and manipulating many large datasets. However, I often find it more convenient to use ```numpy.ndarray``` to work with spectral data. Therefore, we will convert the ```pandas.DataFrame``` to ```numpy.ndarray``` using the ```pandas.DataFrame.to_numpy()``` method.
 
 {: .note }
-> Pandas lover 🐼 ❤️? No problem! ```chemotools``` also supports working with ```pandas.DataFrame``` by implementing the latest ```set_output()``` API from ```scikit-learn```. If you are more interested in working with ```pandas```, take a look at the documentation [here](https://paucablop.github.io/chemotools/get-started/scikit_learn_integration.html#working-with-pandas-dataframes).
+> Pandas 🐼 or polars 🐻‍❄️ lover ❤️? No problem! ```chemotools``` also supports working with ```pandas.DataFrame``` or ```polars.DataFrame``` by implementing the latest ```set_output()``` API from ```scikit-learn```. If you are more interested in working with ```pandas``` or ```polars```, take a look at the documentation [here](https://paucablop.github.io/chemotools/get-started/scikit_learn_integration.html#working-with-dataframes).
 
 So our first step will be to transform our ```pandas.DataFrame``` to ```numpy.ndarray```:
 
@@ -113,7 +120,7 @@ spectra_np = spectra.to_numpy()
 wavenumbers = spectra.columns.to_numpy(dtype=np.float64)
 
 # Convert the hplc pandas.DataFrame to numpy.ndarray
-hplc = hplc.to_numpy()
+hplc_np = hplc.to_numpy()
 ```
 
 Now that we have our data in the right format, we can start plotting. We will define a function to plot the spectra, where each spectrum will be color-coded according to its glucose concentration. We will use the ```matplotlib.colors.Normalize``` class to normalize the glucose concentrations between 0 and 1. Then, we will use the ```matplotlib.cm.ScalarMappable``` class to create a colorbar.
@@ -128,7 +135,7 @@ def plot_spectra(spectra: np.ndarray, wavenumbers: np.ndarray, hplc: np.ndarray)
     cmap = plt.get_cmap("jet")
 
     # Define a normalization function to scale glucose concentrations between 0 and 1
-    norm = Normalize(vmin=hplc.min(), vmax=hplc.max())
+    normalize = Normalize(vmin=hplc.min(), vmax=hplc.max())
     colors = [cmap(normalize(value)) for value in hplc]
 
     # Plot the spectra
@@ -152,7 +159,7 @@ def plot_spectra(spectra: np.ndarray, wavenumbers: np.ndarray, hplc: np.ndarray)
 Then, we can use this function to plot the training dataset:
 
 ```python
-plot_spectra(spectra, hplc)
+plot_spectra(spectra_np, wavenumbers, hplc_np)
 ```
 
 which should result in the following plot:
@@ -169,7 +176,7 @@ Now that you've explored the dataset, it's time to preprocess the spectral data.
 
 We will preprocess the spectra using the following steps:
 
-- __[Range Cut](https://paucablop.github.io/chemotools/docs/variable_selection.html#range-cut)__: to remove the wavenumbers outside the range between 950 and 1550 cm-1.
+- __[Range Cut](https://paucablop.github.io/chemotools/docs/feature_selection.html#range-cut)__: to remove the wavenumbers outside the range between 950 and 1550 cm-1.
 
 - __[Linear Correction](https://paucablop.github.io/chemotools/docs/baseline.html#linear-baseline-correction)__: to remove the linear baseline shift. 
 
@@ -182,7 +189,7 @@ We will chain the preprocessing steps using the [```make_pipeline()```](https://
 
 
 ```python
-from chemotools.variable_selection import RangeCut
+from chemotools.feature_selection import RangeCut
 from chemotools.baseline import LinearCorrection
 from chemotools.derivative import SavitzkyGolay
 
@@ -191,7 +198,7 @@ from sklearn.pipeline import make_pipeline
 
 # create a pipeline that scales the data
 preprocessing = make_pipeline(
-    RangeCut(start=950, end=1500, wavelength=wavenumbers),
+    RangeCut(start=950, end=1500, wavenumbers=wavenumbers),
     LinearCorrection(),
     SavitzkyGolay(window_size=15, polynomial_order=2, derivate_order=1),
     StandardScaler(with_std=False)
@@ -208,9 +215,8 @@ Finally, we can plot the preprocessed spectra:
 
 ```python
 # get the wavenumbers after the range cut
-start_index = preprocessing.named_steps['rangecut'].start
-end_index = preprocessing.named_steps['rangecut'].end
-wavenumbers_cut = wavenumbers[start_index:end_index]
+wavenumbers_cut = preprocessing.named_steps['rangecut'].wavenumbers_
+
 
 # plot the preprocessed spectra
 plot_spectra(spectra_preprocessed, wavenumbers_cut, hplc_np)
@@ -295,7 +301,7 @@ hplc_pred = pls.predict(spectra_preprocessed)
 
 # plot the predictions
 fig, ax = plt.subplots(figsize=(4, 4))
-ax.scatter(hplc_np, predictions, color='blue')
+ax.scatter(hplc_np, hplc_pred, color='blue')
 ax.plot([0, 40], [0, 40], color='magenta')
 ax.set_xlabel('Measured glucose (g/L)')
 ax.set_ylabel('Predicted glucose (g/L)')
@@ -352,12 +358,12 @@ Now we can compare the predicted glucose concentrations with the off-line HPLC m
 
 ```python
 # make linspace of length of predictions
-time = np.linspace(0, len(predictions_test), len(predictions_test),) * 1.25 / 60
+time = np.linspace(0, len(glucose_test_pred), len(glucose_test_pred),) * 1.25 / 60
 
 # plot the predictions
 fig, ax = plt.subplots(figsize=(10, 4))
 
-ax.plot(time, predictions_test,  color='blue', label='Predicted')
+ax.plot(time, glucose_test_pred,  color='blue', label='Predicted')
 ax.plot(hplc_test.index, hplc_test['glucose']+4, 'o', color='red', label='Measured')
 ax.set_xlabel('Time (h)')
 ax.set_ylabel('Glucose (g/L)')