Skip to content

Commit

Permalink
Merge branch 'main' into 44-improve-airpls-and-arpls-performance-spar…
Browse files Browse the repository at this point in the history
…se-matrix-operations
  • Loading branch information
MothNik committed May 20, 2024
2 parents 4f1eb69 + a291245 commit 02f59f2
Show file tree
Hide file tree
Showing 15 changed files with 1,076 additions and 802 deletions.
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ pytest = "*"
dtuprosys = "*"
cairocffi = "*"
numba = "*"
polars = "*"
pyarrow = "*"

[dev-packages]
black = "*"
Expand Down
1,460 changes: 750 additions & 710 deletions Pipfile.lock

Large diffs are not rendered by default.

103 changes: 78 additions & 25 deletions chemotools/datasets/_base.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
import pandas as pd
import os


import pandas as pd
import polars as pl

PACKAGE_DIRECTORY = os.path.dirname(os.path.abspath(__file__))


def load_fermentation_train():
def load_fermentation_train(set_output="pandas"):
"""
Loads the training data of the fermentation dataset. This data corresponds to a synthetic dataset measured
Loads the training data of the fermentation dataset. This data corresponds to a synthetic dataset measured
off-line. This dataset is designed to represent the variability of real fermentation data.
Arguments
-------
set_output: str, default='pandas'
The output format of the data. It can be 'pandas' or 'polars'. If 'polars', the data is returned as a polars DataFrame.
Returns
-------
train_spectra: pd.DataFrame or pl.DataFrame A pandas (default) or polars DataFrame, depending on set_output, containing the synthetic spectra measured to train the model.
Expand All @@ -20,17 +28,32 @@ def load_fermentation_train():
Mauricio Iglesias Miguel, Gernaey Krist V. Transforming data into information:
A parallel hybrid model for real-time state estimation in lignocellulose ethanol fermentations.
"""
train_spectra = pd.read_csv(PACKAGE_DIRECTORY + "/data/train_spectra.csv")
train_spectra.columns = train_spectra.columns.astype(float)
train_hplc = pd.read_csv(PACKAGE_DIRECTORY + "/data/train_hplc.csv")
if set_output == "pandas":
train_spectra = pd.read_csv(PACKAGE_DIRECTORY + "/data/train_spectra.csv")
train_spectra.columns = train_spectra.columns.astype(float)
train_hplc = pd.read_csv(PACKAGE_DIRECTORY + "/data/train_hplc.csv")
return train_spectra, train_hplc

return train_spectra, train_hplc
if set_output == "polars":
train_spectra = pl.read_csv(PACKAGE_DIRECTORY + "/data/train_spectra.csv")
train_hplc = pl.read_csv(PACKAGE_DIRECTORY + "/data/train_hplc.csv")
return train_spectra, train_hplc

else:
raise ValueError(
"Invalid value for set_output. Please use 'pandas' or 'polars'."
)

def load_fermentation_test():

def load_fermentation_test(set_output="pandas"):
"""
Loads the testing data of the fermentation dataset. This data corresponds to real fermentation data measured
on-line during a fermentation process.
on-line during a fermentation process.
Arguments
-------
set_output: str, default='pandas'
The output format of the data. It can be 'pandas' or 'polars'. If 'polars', the data is returned as a polars DataFrame.
Returns
-------
Expand All @@ -43,27 +66,57 @@ def load_fermentation_test():
Mauricio Iglesias Miguel, Gernaey Krist V. Transforming data into information:
A parallel hybrid model for real-time state estimation in lignocellulose ethanol fermentations.
"""
fermentation_spectra = pd.read_csv(
PACKAGE_DIRECTORY + "/data/fermentation_spectra.csv"
)
fermentation_spectra.columns = fermentation_spectra.columns.astype(float)
fermentation_hplc = pd.read_csv(PACKAGE_DIRECTORY + "/data/fermentation_hplc.csv")

return fermentation_spectra, fermentation_hplc


def load_coffee():
if set_output == "pandas":
fermentation_spectra = pd.read_csv(
PACKAGE_DIRECTORY + "/data/fermentation_spectra.csv"
)
fermentation_spectra.columns = fermentation_spectra.columns.astype(float)
fermentation_hplc = pd.read_csv(
PACKAGE_DIRECTORY + "/data/fermentation_hplc.csv"
)
return fermentation_spectra, fermentation_hplc

if set_output == "polars":
fermentation_spectra = pl.read_csv(
PACKAGE_DIRECTORY + "/data/fermentation_spectra.csv"
)
fermentation_hplc = pl.read_csv(
PACKAGE_DIRECTORY + "/data/fermentation_hplc.csv"
)
return fermentation_spectra, fermentation_hplc

else:
raise ValueError(
"Invalid value for set_output. Please use 'pandas' or 'polars'."
)


def load_coffee(set_output="pandas"):
"""
Loads the coffee dataset. This data corresponds to a coffee spectra from three different origins
Loads the coffee dataset. This data corresponds to a coffee spectra from three different origins
measured off-line using attenuated total reflectance Fourier transform infrared spectroscopy (ATR-FTIR).
Arguments
-------
set_output: str, default='pandas'
The output format of the data. It can be 'pandas' or 'polars'. If 'polars', the data is returned as a polars DataFrame.
Returns
-------
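coffee_spectra: pd.DataFrame or pl.DataFrame A pandas (default) or polars DataFrame, depending on set_output, containing the coffee spectra.
coffee_labels: pd.DataFrame or pl.DataFrame A pandas (default) or polars DataFrame, depending on set_output, containing the corresponding labels.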
"""

coffee_spectra = pd.read_csv(PACKAGE_DIRECTORY + "/data/coffee_spectra.csv")
coffee_labels = pd.read_csv(PACKAGE_DIRECTORY + "/data/coffee_labels.csv")

return coffee_spectra, coffee_labels
if set_output == "pandas":
coffee_spectra = pd.read_csv(PACKAGE_DIRECTORY + "/data/coffee_spectra.csv")
coffee_labels = pd.read_csv(PACKAGE_DIRECTORY + "/data/coffee_labels.csv")
return coffee_spectra, coffee_labels

if set_output == "polars":
coffee_spectra = pl.read_csv(PACKAGE_DIRECTORY + "/data/coffee_spectra.csv")
coffee_labels = pl.read_csv(PACKAGE_DIRECTORY + "/data/coffee_labels.csv")
return coffee_spectra, coffee_labels

else:
raise ValueError(
"Invalid value for set_output. Please use 'pandas' or 'polars'."
)
4 changes: 4 additions & 0 deletions chemotools/feature_selection/_range_cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ class RangeCut(BaseEstimator, SelectorMixin):
end_index_ : int
The index of the end of the range. It is -1 if the wavenumbers are not provided.
wavenumbers_ : array-like
The cut wavenumbers of the input data.
Methods
-------
Expand Down Expand Up @@ -75,9 +77,11 @@ def fit(self, X: np.ndarray, y=None) -> "RangeCut":
if self.wavenumbers is None:
self.start_index_ = self.start
self.end_index_ = self.end
self.wavenumbers_ = None
else:
self.start_index_ = self._find_index(self.start)
self.end_index_ = self._find_index(self.end)
self.wavenumbers_ = self.wavenumbers[self.start_index_ : self.end_index_]

return self

Expand Down
24 changes: 14 additions & 10 deletions docs/variable_selection.md → docs/feature_selection.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
---
title: Variable selection
title: Feature selection
layout: default
parent: Docs
---

# __Variable selection__
Variable selection is a preprocessing technique in spectroscopy that selects the most relevant variables. The following algorithms are available:
# __Feature selection__
Feature selection is a preprocessing technique in spectroscopy that selects the most relevant features. The following algorithms are available:
- [Range cut](#range-cut)
- [SelectFeatures](#range-cut-by-wavenumber)
- [IndexSelector](#index-selector)

{: .note }
> The feature selection algorithms implemented in ```chemotools``` allow you to select a subset of variables/features from the spectra. They are not designed to find the most relevant variables/features for a given task.
Expand All @@ -31,7 +31,7 @@ Range cut by index is a preprocessing technique in spectroscopy that selects all
#### __Case 1: Range cut by index__

```python
from chemotools.variable_selection import RangeCut
from chemotools.feature_selection import RangeCut

rcbi = RangeCut(0, 200)
spectra_rcbi = rcbi.fit_transform(spectra)
Expand All @@ -41,20 +41,24 @@ spectra_rcbi = rcbi.fit_transform(spectra)


```python
from chemotools.variable_selection import RangeCut
from chemotools.feature_selection import RangeCut

rcbw = RangeCut(950, 1100, wavenumbers=wn)
spectra_rcbw = rcbw.fit_transform(spectra)
```

After fitting the method with the wavenumbers, the selected wavenumbers can be accessed using the ```wavenumbers_``` attribute.


### __Plotting example__:

<iframe src="figures/range_cut_by_index.html" width="800px" height="400px" style="border: none;"></iframe>

<iframe src="figures/range_cut_by_wavenumber.html" width="800px" height="400px" style="border: none;"></iframe>

## __SelectFeatures__
SelectFeatures is a preprocessing technique in spectroscopy that selects the most relevant variables. The selected features do not need to be continuous in the spectra, but they can be located at different locations. The algorithm allows selecting the features by imputing a list of indices or wavenumbers.

## __Index selector__
IndexSelector is a preprocessing technique in spectroscopy that selects the most relevant variables. The selected features do not need to be contiguous in the spectra; they can be located at different positions. The algorithm allows selecting the features by providing a list of indices or wavenumbers.

### __Arguments__:

Expand All @@ -72,9 +76,9 @@ In the example below, the selected wavenumbers ```wn_select``` are used to selec


```python
from chemotools.variable_selection import SelectFeatures
from chemotools.feature_selection import IndexSelector

sfbw = SelectFeatures(features=wn_select,wavenumbers=wn)
sfbw = IndexSelector(features=wn_select,wavenumbers=wn)
spectra_sfbw = sfbw.fit_transform(spectra)
```

Expand Down
46 changes: 26 additions & 20 deletions get-started/brewing_regressor.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,16 @@ nav_order: 4

## What will you learn?

- [Get familiar with the Fermentation dataset](#introduction)
- [Load the fermentation dataset](#loading-the-training-dataset)
- [Explore the fermentation dataset](#exploring-the-training-dataset)
- [Visualize the fermentation dataset](#visualizing-the-training-dataset)
- [Preprocess the spectra using pipelines](#preprocessing-the-training-spectra)
- [Train a PLS model](#training-a-pls-model)
- [Apply the model to the testing dataset](#applying-the-model-to-the-testing-dataset)
- [__Brewing a PLS regressor__](#brewing-a-pls-regressor)
- [What will you learn?](#what-will-you-learn)
- [__Introduction__](#introduction)
- [__Loading the training dataset__](#loading-the-training-dataset)
- [__Exploring the training dataset__](#exploring-the-training-dataset)
- [__Visualizing the training dataset__](#visualizing-the-training-dataset)
- [__Preprocessing the training spectra__](#preprocessing-the-training-spectra)
- [__Training a PLS model__](#training-a-pls-model)
- [__Applying the model to the testing dataset__](#applying-the-model-to-the-testing-dataset)
- [__Recap__](#recap)

## __Introduction__
Welcome to the world of spectroscopic data analysis, where we provide you with a unique insight into lignocellulosic ethanol fermentation in real-time. Our dataset comprises spectra obtained through attenuated total reflectance, mid-infrared (ATR-MIR) spectroscopy, combined with high-performance liquid chromatography (HPLC) reference data to ensure precision and accuracy.
Expand Down Expand Up @@ -47,6 +50,10 @@ The ```load_fermentation_train()``` function returns two ```pandas.DataFrame```:

- ```hplc```: Here, you'll find HPLC measurements, specifically glucose concentrations (in g/L), stored in a single column labeled ```glucose```.

{: .highlight }
> If you are interested in working with ```polars.DataFrame``` you can simply use ```load_fermentation_train(set_output="polars")``` (chemotools>=0.1.5). Note that if you choose to work with ```polars.DataFrame``` the wavenumbers are given in the column names as ```str``` and not as ```float```. This is because ```polars``` does not support column names with types other than ```str```. To extract the wavenumbers as ```float``` from the ```polars.DataFrame``` you can use ```np.array(df.columns, dtype=np.float64)```, since ```df.columns``` is a plain Python list of ```str``` in ```polars```.

## __Exploring the training dataset__

Before diving into data modeling, it's essential to get familiar with your data. Start by answering basic questions: _How many samples are there?_, and _how many wavenumbers are available?_
Expand Down Expand Up @@ -99,7 +106,7 @@ To better understand our dataset, we employ visualization. We will plot the trai
Up until now, we have used ```pandas.DataFrame``` to represent the dataset. ```pandas.DataFrame``` are great for storing and manipulating many large datasets. However, I often find it more convenient to use ```numpy.ndarray``` to work with spectral data. Therefore, we will convert the ```pandas.DataFrame``` to ```numpy.ndarray``` using the ```pandas.DataFrame.to_numpy()``` method.

{: .note }
> Pandas lover 🐼 ❤️? No problem! ```chemotools``` also supports working with ```pandas.DataFrame``` by implementing the latest ```set_output()``` API from ```scikit-learn```. If you are more interested in working with ```pandas```, take a look at the documentation [here](https://paucablop.github.io/chemotools/get-started/scikit_learn_integration.html#working-with-pandas-dataframes).
> Pandas 🐼 or polars 🐻‍❄️ lover ❤️? No problem! ```chemotools``` also supports working with ```pandas.DataFrame``` or ```polars.DataFrame``` by implementing the latest ```set_output()``` API from ```scikit-learn```. If you are more interested in working with ```pandas``` or ```polars```, take a look at the documentation [here](https://paucablop.github.io/chemotools/get-started/scikit_learn_integration.html#working-with-dataframes).
So our first step will be to transform our ```pandas.DataFrame``` to ```numpy.ndarray```:

Expand All @@ -113,7 +120,7 @@ spectra_np = spectra.to_numpy()
wavenumbers = spectra.columns.to_numpy(dtype=np.float64)

# Convert the hplc pandas.DataFrame to numpy.ndarray
hplc = hplc.to_numpy()
hplc_np = hplc.to_numpy()
```

Now that we have our data in the right format, we can start plotting. We will define a function to plot the spectra, where each spectrum will be color-coded according to its glucose concentration. We will use the ```matplotlib.colors.Normalize``` class to normalize the glucose concentrations between 0 and 1. Then, we will use the ```matplotlib.cm.ScalarMappable``` class to create a colorbar.
Expand All @@ -128,7 +135,7 @@ def plot_spectra(spectra: np.ndarray, wavenumbers: np.ndarray, hplc: np.ndarray)
cmap = plt.get_cmap("jet")

# Define a normalization function to scale glucose concentrations between 0 and 1
norm = Normalize(vmin=hplc.min(), vmax=hplc.max())
normalize = Normalize(vmin=hplc.min(), vmax=hplc.max())
colors = [cmap(normalize(value)) for value in hplc]

# Plot the spectra
Expand All @@ -152,7 +159,7 @@ def plot_spectra(spectra: np.ndarray, wavenumbers: np.ndarray, hplc: np.ndarray)
Then, we can use this function to plot the training dataset:

```python
plot_spectra(spectra, hplc)
plot_spectra(spectra_np, wavenumbers, hplc_np)
```

which should result in the following plot:
Expand All @@ -169,7 +176,7 @@ Now that you've explored the dataset, it's time to preprocess the spectral data.

We will preprocess the spectra using the following steps:

- __[Range Cut](https://paucablop.github.io/chemotools/docs/variable_selection.html#range-cut)__: to remove the wavenumbers outside the range between 950 and 1550 cm-1.
- __[Range Cut](https://paucablop.github.io/chemotools/docs/feature_selection.html#range-cut)__: to remove the wavenumbers outside the range between 950 and 1500 cm-1.

- __[Linear Correction](https://paucablop.github.io/chemotools/docs/baseline.html#linear-baseline-correction)__: to remove the linear baseline shift.

Expand All @@ -182,7 +189,7 @@ We will chain the preprocessing steps using the [```make_pipeline()```](https://


```python
from chemotools.variable_selection import RangeCut
from chemotools.feature_selection import RangeCut
from chemotools.baseline import LinearCorrection
from chemotools.derivative import SavitzkyGolay

Expand All @@ -191,7 +198,7 @@ from sklearn.pipeline import make_pipeline

# create a pipeline that scales the data
preprocessing = make_pipeline(
RangeCut(start=950, end=1500, wavelength=wavenumbers),
RangeCut(start=950, end=1500, wavenumbers=wavenumbers),
LinearCorrection(),
SavitzkyGolay(window_size=15, polynomial_order=2, derivate_order=1),
StandardScaler(with_std=False)
Expand All @@ -208,9 +215,8 @@ Finally, we can plot the preprocessed spectra:

```python
# get the wavenumbers after the range cut
start_index = preprocessing.named_steps['rangecut'].start
end_index = preprocessing.named_steps['rangecut'].end
wavenumbers_cut = wavenumbers[start_index:end_index]
wavenumbers_cut = preprocessing.named_steps['rangecut'].wavenumbers_


# plot the preprocessed spectra
plot_spectra(spectra_preprocessed, wavenumbers_cut, hplc_np)
Expand Down Expand Up @@ -295,7 +301,7 @@ hplc_pred = pls.predict(spectra_preprocessed)

# plot the predictions
fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(hplc_np, predictions, color='blue')
ax.scatter(hplc_np, hplc_pred, color='blue')
ax.plot([0, 40], [0, 40], color='magenta')
ax.set_xlabel('Measured glucose (g/L)')
ax.set_ylabel('Predicted glucose (g/L)')
Expand Down Expand Up @@ -352,12 +358,12 @@ Now we can compare the predicted glucose concentrations with the off-line HPLC m

```python
# make linspace of length of predictions
time = np.linspace(0, len(predictions_test), len(predictions_test),) * 1.25 / 60
time = np.linspace(0, len(glucose_test_pred), len(glucose_test_pred),) * 1.25 / 60

# plot the predictions
fig, ax = plt.subplots(figsize=(10, 4))

ax.plot(time, predictions_test, color='blue', label='Predicted')
ax.plot(time, glucose_test_pred, color='blue', label='Predicted')
ax.plot(hplc_test.index, hplc_test['glucose']+4, 'o', color='red', label='Measured')
ax.set_xlabel('Time (h)')
ax.set_ylabel('Glucose (g/L)')
Expand Down
Loading

0 comments on commit 02f59f2

Please sign in to comment.