From b7d1939884d7c92f1515fda6e8cc252d3aef3c15 Mon Sep 17 00:00:00 2001
From: Matthew Evans <7916000+ml-evs@users.noreply.github.com>
Date: Tue, 7 May 2024 15:06:51 +0100
Subject: [PATCH] Usability tweaks & new featurizer preset (#215)

* Add ability to disable o-o-b remapping
* Better debugging of target names
* Add fast featurizer preset
* Allow structure IDs to be passed via featurized df
* Linting
---
 .../featurizers/presets/matminer_2024_fast.py | 219 ++++++++++++++++++
 modnet/models/ensemble.py                     |  13 +-
 modnet/models/vanilla.py                      |  38 +--
 modnet/preprocessing.py                       |  26 ++-
 4 files changed, 270 insertions(+), 26 deletions(-)
 create mode 100644 modnet/featurizers/presets/matminer_2024_fast.py

diff --git a/modnet/featurizers/presets/matminer_2024_fast.py b/modnet/featurizers/presets/matminer_2024_fast.py
new file mode 100644
index 0000000..f1f6f09
--- /dev/null
+++ b/modnet/featurizers/presets/matminer_2024_fast.py
@@ -0,0 +1,219 @@
+"""This submodule contains the `Matminer2024FastFeaturizer` class. """
+
+import numpy as np
+import modnet.featurizers
+import contextlib
+
+
+class Matminer2024FastFeaturizer(modnet.featurizers.MODFeaturizer):
+    """A set of efficient featurizers for features implemented in matminer
+    at time of creation (matminer v0.9.2 from 2024).
+
+    Removes featurizers that are known to be slow (i.e., orders of magnitude
+    more intensive to compute than the rest of the featurizers).
+
+    """
+
+    def __init__(
+        self,
+        fast_oxid: bool = True,
+        continuous_only: bool = True,
+    ):
+        """Creates the featurizer and imports all featurizer functions.
+
+        Parameters:
+            fast_oxid: Whether to use the accelerated oxidation state parameters within
+                pymatgen when constructing features that constrain oxidation states such
+                that all sites with the same species in a structure will have the same
+                oxidation state (recommended if featurizing any structure
+                with large unit cells).
+            continuous_only: Whether to keep only the features that are continuous
+                with respect to the composition (only for composition featurizers).
+                Discontinuous features may lead to discontinuities in the model predictions.
+
+        """
+
+        super().__init__()
+        self.drop_allnan = False
+        self.fast_oxid = fast_oxid
+        self.continuous_only = continuous_only
+        self.load_featurizers()
+
+    def load_featurizers(self):
+        with contextlib.redirect_stdout(None):
+            from matminer.featurizers.composition import (
+                BandCenter,
+                ElementFraction,
+                ElementProperty,
+                Stoichiometry,
+                TMetalFraction,
+                ValenceOrbital,
+            )
+            from matminer.featurizers.structure import (
+                DensityFeatures,
+                EwaldEnergy,
+                GlobalSymmetryFeatures,
+                StructuralComplexity,
+            )
+            from matminer.utils.data import (
+                DemlData,
+                PymatgenData,
+            )
+
+            pymatgen_features = [
+                "block",
+                "mendeleev_no",
+                "electrical_resistivity",
+                "velocity_of_sound",
+                "thermal_conductivity",
+                "bulk_modulus",
+                "coefficient_of_linear_thermal_expansion",
+            ]
+
+            deml_features = [
+                "atom_radius",
+                "molar_vol",
+                "heat_fusion",
+                "boiling_point",
+                "heat_cap",
+                "first_ioniz",
+                "electric_pol",
+                "GGAU_Etot",
+                "mus_fere",
+                "FERE correction",
+            ]
+
+            magpie_featurizer = ElementProperty.from_preset("magpie")
+            magpie_featurizer.stats = ["mean", "avg_dev"]
+
+            pymatgen_featurizer = ElementProperty(
+                data_source=PymatgenData(),
+                stats=["mean", "avg_dev"],
+                features=pymatgen_features,
+            )
+
+            deml_featurizer = ElementProperty(
+                data_source=DemlData(),
+                stats=["mean", "avg_dev"],
+                features=deml_features,
+            )
+
+            self.composition_featurizers = (
+                BandCenter(),
+                ElementFraction(),
+                magpie_featurizer,
+                pymatgen_featurizer,
+                deml_featurizer,
+                Stoichiometry(p_list=[2, 3, 5, 7, 10]),
+                TMetalFraction(),
+                ValenceOrbital(props=["frac"]),
+            )
+
+            self.oxid_composition_featurizers = []
+
+            self.structure_featurizers = (
+                DensityFeatures(),
+                EwaldEnergy(),
+                GlobalSymmetryFeatures(),
+                StructuralComplexity(),
+            )
+
+            self.site_featurizers = []
+
+    def featurize_composition(self, df):
+        """Applies the preset composition featurizers to the input dataframe,
+        renames some fields and cleans the output dataframe.
+
+        """
+        from pymatgen.core.periodic_table import Element
+
+        df = super().featurize_composition(df)
+
+        if not self.continuous_only and "AtomicOrbitals|HOMO_character" in df.columns:
+            _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
+            df["AtomicOrbitals|HOMO_character"] = df[
+                "AtomicOrbitals|HOMO_character"
+            ].map(_orbitals)
+            df["AtomicOrbitals|LUMO_character"] = df[
+                "AtomicOrbitals|LUMO_character"
+            ].map(_orbitals)
+
+            df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply(
+                lambda x: -1 if not isinstance(x, str) else Element(x).Z
+            )
+            df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply(
+                lambda x: -1 if not isinstance(x, str) else Element(x).Z
+            )
+
+        if self.continuous_only:
+            # These are additional features that have shown discontinuities in my tests.
+            # Hopefully, I got them all...
+            df.drop(
+                columns=[
+                    "ElementProperty|DemlData mean electric_pol",
+                    "ElementProperty|DemlData mean FERE correction",
+                    "ElementProperty|DemlData mean GGAU_Etot",
+                    "ElementProperty|DemlData mean heat_fusion",
+                    "ElementProperty|DemlData mean mus_fere",
+                ],
+                inplace=True,
+                errors="ignore",
+            )
+
+        if self.oxid_composition_featurizers:
+            df.drop(columns=["IonProperty|max ionic char"], inplace=True)
+
+        return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)
+
+    def featurize_structure(self, df):
+        """Applies the preset structural featurizers to the input dataframe,
+        renames some fields and cleans the output dataframe.
+
+        """
+
+        if self.structure_featurizers:
+            df = super().featurize_structure(df)
+
+            _crystal_system = {
+                "cubic": 1,
+                "tetragonal": 2,
+                "orthorhombic": 3,
+                "hexagonal": 4,
+                "trigonal": 5,
+                "monoclinic": 6,
+                "triclinic": 7,
+            }
+
+            def _int_map(x):
+                if isinstance(x, float) and np.isnan(x):  # `x == np.nan` is never True
+                    return 0
+                elif x:
+                    return 1
+                else:
+                    return 0
+
+            df["GlobalSymmetryFeatures|crystal_system"] = df[
+                "GlobalSymmetryFeatures|crystal_system"
+            ].map(_crystal_system)
+            df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
+                "GlobalSymmetryFeatures|is_centrosymmetric"
+            ].map(_int_map)
+
+        return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)
+
+    def featurize_site(self, df):
+        """Applies the preset site featurizers to the input dataframe,
+        renames some fields and cleans the output dataframe.
+
+        """
+
+        # rename some features for backwards compatibility with pretrained models
+        aliases = {
+            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
+            "AGNIFingerprints": "AGNIFingerPrint",
+            "BondOrientationalParameter": "BondOrientationParameter",
+        }
+        df = super().featurize_site(df, aliases=aliases)
+        df = df.loc[:, (df != 0).any(axis=0)]
+
+        return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)
diff --git a/modnet/models/ensemble.py b/modnet/models/ensemble.py
index 4fb0813..a87954b 100644
--- a/modnet/models/ensemble.py
+++ b/modnet/models/ensemble.py
@@ -144,7 +144,11 @@ def fit(
             pool.join()

     def predict(
-        self, test_data: MODData, return_unc=False, return_prob=False
+        self,
+        test_data: MODData,
+        return_unc: bool = False,
+        return_prob: bool = False,
+        remap_out_of_bounds: bool = True,
     ) -> pd.DataFrame:
         """Predict the target values for the passed MODData.

@@ -154,6 +158,7 @@ def predict(
             return_prob: For a classification task only: whether to return the probability of each
                 class OR only return the most probable class.
             return_unc: whether to return a second dataframe containing the uncertainties
+            remap_out_of_bounds: whether to resample out-of-bounds predictions uniformly within the training target range.

         Returns:
             A `pandas.DataFrame` containing the predicted values of the targets.
@@ -163,7 +168,11 @@ class OR only return the most probable class.
         all_predictions = []

         for i in range(self.n_models):
-            p = self.models[i].predict(test_data, return_prob=return_prob)
+            p = self.models[i].predict(
+                test_data,
+                return_prob=return_prob,
+                remap_out_of_bounds=remap_out_of_bounds,
+            )
             all_predictions.append(p.values)

         p_mean = np.array(all_predictions).mean(axis=0)
diff --git a/modnet/models/vanilla.py b/modnet/models/vanilla.py
index b310f1f..1350f54 100644
--- a/modnet/models/vanilla.py
+++ b/modnet/models/vanilla.py
@@ -693,7 +693,12 @@ def fit_preset(

         return models, val_losses, best_learning_curve, learning_curves, best_preset

-    def predict(self, test_data: MODData, return_prob=False) -> pd.DataFrame:
+    def predict(
+        self,
+        test_data: MODData,
+        return_prob: bool = False,
+        remap_out_of_bounds: bool = True,
+    ) -> pd.DataFrame:
         """Predict the target values for the passed MODData.

         Parameters:
@@ -701,6 +706,7 @@ def predict(self, test_data: MODData, return_prob=False) -> pd.DataFrame:
                 object containing the descriptors used in training.
             return_prob: For a classification tasks only: whether to return the probability of each
                 class OR only return the most probable class.
+            remap_out_of_bounds: Whether to remap out-of-bounds predictions to the training data distribution.

         Returns:
             A `pandas.DataFrame` containing the predicted values of the targets.
@@ -724,20 +730,22 @@ class OR only return the most probable class.
             p = [p]

         # post-process based on training data
-        if max(self.num_classes.values()) <= 2:  # regression
-            for i, vals in enumerate(p):
-                yrange = self.max_y[i] - self.min_y[i]
-                upper_bound = self.max_y[i] + 0.25 * yrange
-                lower_bound = self.min_y[i] - 0.25 * yrange
-                for j in range(len(self.targets_groups[i])):
-                    out_of_range_idxs = np.where(
-                        (vals[:, j] < lower_bound[j]) | (vals[:, j] > upper_bound[j])
-                    )
-                    vals[out_of_range_idxs, j] = (
-                        np.random.uniform(0, 1, size=len(out_of_range_idxs[0]))
-                        * (yrange[j])
-                        + self.min_y[i][j]
-                    )
+        if remap_out_of_bounds:
+            if max(self.num_classes.values()) <= 2:  # regression
+                for i, vals in enumerate(p):
+                    yrange = self.max_y[i] - self.min_y[i]
+                    upper_bound = self.max_y[i] + 0.25 * yrange
+                    lower_bound = self.min_y[i] - 0.25 * yrange
+                    for j in range(len(self.targets_groups[i])):
+                        out_of_range_idxs = np.where(
+                            (vals[:, j] < lower_bound[j])
+                            | (vals[:, j] > upper_bound[j])
+                        )
+                        vals[out_of_range_idxs, j] = (
+                            np.random.uniform(0, 1, size=len(out_of_range_idxs[0]))
+                            * (yrange[j])
+                            + self.min_y[i][j]
+                        )

         p_dic = {}

diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py
index c5e8f8a..784e04b 100644
--- a/modnet/preprocessing.py
+++ b/modnet/preprocessing.py
@@ -664,8 +664,12 @@ def __init__(
             LOG.info(f"Loaded {self.featurizer.__class__.__name__} featurizer.")

         if target_names is not None:
+            if isinstance(target_names, str):
+                target_names = [target_names]
             if np.shape(targets)[-1] != len(target_names):
-                raise ValueError("Target names must be supplied for every target.")
+                raise ValueError(
+                    f"Target names must be supplied for every target: {np.shape(targets)} vs {target_names=}"
+                )
         elif targets is not None:
             if len(np.shape(targets)) == 1:
                 target_names = ["prop0"]
@@ -681,16 +685,20 @@ def __init__(
                     "List of IDs (`structure_ids`) provided must be unique."
                 )

-            if len(structure_ids) != len(materials):
-                raise ValueError(
-                    "List of IDs (`structure_ids`) must have same length as list of structure."
-                )
+            if materials is not None:
+                if len(structure_ids) != len(materials):
+                    raise ValueError(
+                        "List of IDs (`structure_ids`) must have same length as list of structure."
+                    )

         else:
-            num_entries = (
-                len(materials) if materials is not None else len(df_featurized)
-            )
-            structure_ids = [f"id{i}" for i in range(num_entries)]
+            if df_featurized is not None:
+                structure_ids = df_featurized.index
+            else:
+                num_entries = (
+                    len(materials) if materials is not None else len(df_featurized)
+                )
+                structure_ids = [f"id{i}" for i in range(num_entries)]

         if targets is not None:
             # set up dataframe for targets with columns (id, property_1, ..., property_n)