kjappelbaum · FMcil · Nov 14, 2023 · Aug 27, 2024 · Aug 29, 2024 · Sep 11, 2024
diff --git a/docs/source/featurizers/global/raspa.rst b/docs/source/featurizers/global/raspa.rst
@@ -47,3 +47,21 @@ The Henry coefficient is strongly connected to the energy grid and might sometim
     :scalar: False
 
     We use the implementation in [Dubbeldam]_
+
+
+
+Loading calculation
+..................
+
+The loading is the pure component uptake on a MOF.
+
+.. featurizer::  Loading
+    :id: Loading
+    :considers_geometry: True
+    :considers_structure_graph: False
+    :encodes_chemistry: True
+    :scope: global
+    :scalar: True
+
+    We use the implementation in [Dubbeldam]_
+
diff --git a/src/mofdscribe/featurizers/__init__.py b/src/mofdscribe/featurizers/__init__.py
@@ -22,6 +22,7 @@
 from .chemistry import RACS  # noqa: F401
 from .chemistry import EnergyGridHistogram  # noqa: F401
 from .chemistry import Henry  # noqa: F401
+from .chemistry import Loading  # noqa: F401
 from .chemistry import PartialChargeHistogram  # noqa: F401
 from .chemistry import PartialChargeStats  # noqa: F401
 from .chemistry import PriceLowerBound  # noqa: F401

diff --git a/src/mofdscribe/featurizers/chemistry/__init__.py b/src/mofdscribe/featurizers/chemistry/__init__.py
@@ -4,6 +4,7 @@
 from .aprdf import APRDF  # noqa: F401
 from .energygrid import EnergyGridHistogram  # noqa: F401
 from .henry import Henry  # noqa: F401
+from .loading import Loading  # noqa: F401
 from .partialchargehistogram import PartialChargeHistogram  # noqa: F401
 from .partialchargestats import PartialChargeStats  # noqa: F401
 from .price import PriceLowerBound  # noqa: F401

diff --git a/src/mofdscribe/featurizers/chemistry/energygrid.py b/src/mofdscribe/featurizers/chemistry/energygrid.py
@@ -178,10 +178,10 @@ def __init__(
         super().__init__(primitive=primitive)
 
     def fit_transform(self, structures: List[Union[Structure, IStructure]]):
-        ...
+        pass
 
     def fit(self, structure: Union[Structure, IStructure]):
-        ...
+        pass
 
     def _get_grid(self):
         return np.arange(self.min_energy_vdw, self.max_energy_vdw, self.bin_size_vdw)

diff --git a/src/mofdscribe/featurizers/chemistry/loading.py b/src/mofdscribe/featurizers/chemistry/loading.py
@@ -0,0 +1,227 @@
+# -*- coding: utf-8 -*-
+"""Featurizer that runs RASPA to calculate the Henry coefficient."""
+import os
+from glob import glob
+from typing import List, Union
+
+import numpy as np
+from pymatgen.core import IStructure, Structure
+
+from mofdscribe.featurizers.base import MOFBaseFeaturizer
+from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
+from mofdscribe.featurizers.utils.raspa.base_parser import parse_base_output
+from mofdscribe.featurizers.utils.raspa.resize_uc import resize_unit_cell
+from mofdscribe.featurizers.utils.raspa.run_raspa import detect_raspa_dir, run_raspa
+
+__all__ = ["Loading"]
+LOADING_INPUT_TEMPLATE = """SimulationType                MonteCarlo
+NumberOfCycles                {cycles}
+NumberOfInitializationCycles  4000
+PrintEvery                    {print_every}
+RestartFile                   no
+
+Forcefield                    Local
+UseChargesFromCIFFile         yes
+
+Framework 0
+FrameworkName input
+UnitCells {unit_cells}
+HeliumVoidFraction 0.29
+ExternalTemperature {temp}
+ExternalPressure {press}
+
+CutOff                        {cutoff}
+
+Component 0 MoleculeName             {molname}
+            MoleculeDefinition       Local
+            TranslationProbability   0.5
+            RotationProbability      0.5
+            ReinsertionProbability   0.5
+            SwapProbability          1.0
+            CreateNumberOfMolecules  0
+
+
+"""
+
+
+def parse_loading(directory: Union[str, os.PathLike]) -> dict:
+    """Parse the widom output files in the given directory."""
+    outputs = glob(os.path.join(directory, "Output", "System_0", "*.data"))
+    if len(outputs) != 1:
+        raise ValueError("Expected one output file, got {}".format(len(outputs)))
+
+    parsed_output, _ = parse_base_output(outputs[0], system_name="System_0", ncomponents=1)
+
+    component_results = list(parsed_output["components"].values())[0]
+
+    return [
+        component_results["loading_molar_absolute_average"],
+        component_results["loading_molar_excess_average"],
+    ], [
+        component_results["loading_molar_absolute_dev"],
+        component_results["loading_molar_excess_dev"],
+    ]
+
+
+@operates_on_structure
+@operates_on_istructure
+class Loading(MOFBaseFeaturizer):
+    """Computes the uptake for a given molecule using the RASPA [1]_ program.
+
+    While loading are sometimes the targets of ML
+    algorithms, they are sometimes also used as features.
+    """
+
+    def __init__(
+        self,
+        raspa_dir: Union[str, os.PathLike, None] = None,
+        cycles: int = 6000,
+        temperature: float = 298,
+        pressure: float = 101325,
+        cutoff: float = 12,
+        mof_ff: str = "UFF",
+        mol_ff: str = "TraPPE",
+        mol_name: str = "CO2",
+        tail_corrections: bool = True,
+        mixing_rule: str = "Lorentz-Berthelot",
+        shifted: bool = False,
+        separate_interactions: bool = True,
+        run_eqeq: bool = True,
+        return_std: bool = False,
+        primitive: bool = False,
+    ):
+        """Initialize the featurizer.
+
+        Args:
+            raspa_dir (Union[str, PathLike, None]): Path to the raspa
+                directory (with lib, bin, share) subdirectories.
+                If `None` we will look for the `RASPA_DIR` environment variable.
+                Defaults to None.
+            cycles (int): Number of simulation cycles.
+                Defaults to 5_000.
+            temperature (float): Simulation temperature in
+                Kelvin. Defaults to 300.
+            pressure (float): Simluation pressure in Pascals.
+                Defaults to 101325.
+            cutoff (float): Cutoff for simulation in Angstrom.
+                Defaults to 12.
+            mof_ff (str): Name of the forcefield used for the framework.
+                Defaults to "UFF".
+            mol_ff (str): Name of the forcefield used for the guest molecule.
+                Defaults to "TraPPE".
+            mol_name (str): Name of the guest molecule. Defaults to "CO2".
+            tail_corrections (bool): If true, use analytical tail-correction
+                for the contribution of the interaction potential after the
+                cutoff. Defaults to True.
+            mixing_rule (str): Mixing rule for framework and guest
+                molecule force field. Available options are `Jorgenson` and
+                `Lorentz-Berthelot`. Defaults to "Lorentz-Berthelot".
+            shifted (bool): If true, shifts the potential to equal to zero at the
+                cutoff. Defaults to False.
+            separate_interactions (bool): If True use framework's force field
+                for framework-molecule interactions.
+                Defaults to True.
+            run_eqeq (bool): If true, runs EqEq to compute charges.
+                Defaults to True.
+            return_std (bool): If true, return the standard deviations.
+                Defaults to False.
+            primitive (bool): If true, use the primitive unit cell.
+                Defaults to False.
+
+        Raises:
+            ValueError: If the `RASPA_DIR` environment variable is not set.
+        """
+        self.raspa_dir = raspa_dir if raspa_dir else os.environ.get("RASPA_DIR")
+        if self.raspa_dir is None:
+            try:
+                self.raspa_dir = detect_raspa_dir()
+            except ValueError:
+                raise ValueError(
+                    "Please set the RASPA_DIR environment variable or provide the path for the class initialization."
+                )
+        self.cycles = cycles
+        self.cutoff = cutoff
+        self.mof_ff = mof_ff
+        self.mol_ff = mol_ff
+        self.mol_name = mol_name
+        self.tail_corrections = tail_corrections
+        self.mixing_rule = mixing_rule
+        self.shifted = shifted
+        self.separate_interactions = separate_interactions
+        self.temperature = temperature
+        self.pressure = pressure
+        self.run_eqeq = run_eqeq
+        self.return_std = return_std
+        super().__init__(primitive=primitive)
+
+    def _featurize(self, s: Union[Structure, IStructure]) -> np.array:
+        ff_molecules = {self.mol_name: self.mol_ff}
+
+        parameters = {
+            "ff_framework": self.mof_ff,
+            "ff_molecules": ff_molecules,
+            "shifted": self.shifted,
+            "tail_corrections": self.tail_corrections,
+            "mixing_rule": self.mixing_rule,
+            "separate_interactions": self.separate_interactions,
+        }
+        replicas = resize_unit_cell(s, self.cutoff)
+        ucells = f"{replicas[0]} {replicas[1]} {replicas[2]}"
+        simulation_script = LOADING_INPUT_TEMPLATE.format(
+            cycles=self.cycles,
+            unit_cells=ucells,
+            cutoff=self.cutoff,
+            print_every=self.cycles // 10,
+            molname=self.mol_name,
+            temp=self.temperature,
+            press=self.pressure,
+        )
+
+        res, deviations = run_raspa(
+            s,
+            self.raspa_dir,
+            simulation_script,
+            parameters,
+            parse_loading,
+            self.run_eqeq,
+        )
+
+        if self.return_std:
+            res.extend(deviations)
+        return np.array(res)
+
+    def feature_labels(self) -> List[str]:
+        feat = [
+            f"loading_{self.mol_name}_{self.temperature}_{self.pressure}_mol/kg",
+            f"loading_excess_{self.mol_name}_{self.temperature}_{self.pressure}_mol/kg",
+        ]
+        if self.return_std:
+            feat.extend(
+                [
+                    f"loading_std_{self.mol_name}_{self.temperature}_{self.pressure}_mol/kg",
+                    f"loading_excess_std_{self.mol_name}_{self.temperature}_{self.pressure}_mol/kg",
+                ]
+            )
+        return feat
+
+    def implementors(self) -> List[str]:
+        return ["Fergus Mcilwaine"]
+
+    def citations(self) -> List[str]:
+        return [
+            "@article{Dubbeldam2015,"
+            "doi = {10.1080/08927022.2015.1010082},"
+            "url = {https://doi.org/10.1080/08927022.2015.1010082},"
+            "year = {2015},"
+            "month = feb,"
+            "publisher = {Informa {UK} Limited},"
+            "volume = {42},"
+            "number = {2},"
+            "pages = {81--101},"
+            r"author = {David Dubbeldam and Sof{'{\i}}a Calero and "
+            "Donald E. Ellis and Randall Q. Snurr},"
+            "title = {{RASPA}: molecular simulation software for adsorption "
+            "and diffusion in flexible nanoporous materials},"
+            "journal = {Molecular Simulation}"
+            "}"
+        ]
diff --git a/src/mofdscribe/featurizers/utils/raspa/base_parser.py b/src/mofdscribe/featurizers/utils/raspa/base_parser.py
@@ -316,6 +316,18 @@ def parse_base_output(output_abs_path, system_name, ncomponents):  # noqa: C901
                 res_per_component[icomponent]["loading_excess_average"] = float(line.split()[5])
                 res_per_component[icomponent]["loading_excess_dev"] = float(line.split()[7])
                 res_per_component[icomponent]["loading_excess_unit"] = "molecules/unit cell"
+            elif "Average loading absolute [mol/kg framework]" in line:
+                res_per_component[icomponent]["loading_molar_absolute_average"] = float(
+                    line.split()[5]
+                )
+                res_per_component[icomponent]["loading_molar_absolute_dev"] = float(line.split()[7])
+                res_per_component[icomponent]["loading_molar_absolute_unit"] = "mol/kg framework"
+            elif "Average loading excess [mol/kg framework]" in line:
+                res_per_component[icomponent]["loading_molar_excess_average"] = float(
+                    line.split()[5]
+                )
+                res_per_component[icomponent]["loading_molar_excess_dev"] = float(line.split()[7])
+                res_per_component[icomponent]["loading_molar_excess_unit"] = "mol/kg framework"
                 icomponent += 1
             if icomponent >= ncomponents:
                 break

diff --git a/tests/featurizers/bu/test_rdkit_adaptor.py b/tests/featurizers/bu/test_rdkit_adaptor.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 """Test the RDkitAdaptor."""
-import numpy as np
 from rdkit.Chem.Descriptors3D import Asphericity
 
 from mofdscribe.featurizers.bu.rdkitadaptor import RDKitAdaptor

diff --git a/tests/featurizers/chemistry/test_loading.py b/tests/featurizers/chemistry/test_loading.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+"""Test the Loading featurizer."""
+from mofdscribe.featurizers.chemistry.loading import Loading
+
+from ..helpers import is_jsonable
+
+
+def test_loadinghoa(hkust_structure, irmof_structure):
+    """Make sure that the featurization works for typical MOFs and the number of features is as expected."""
+    featurizer = Loading(cycles=10)
+    features = featurizer.featurize(hkust_structure)
+    labels = featurizer.feature_labels()
+    assert len(features) == len(labels)
+    features_irmof = featurizer.featurize(irmof_structure)
+    assert sum(abs(features - features_irmof)) > 0
+    assert is_jsonable(dict(zip(featurizer.feature_labels(), features)))
+    assert features_irmof.ndim == 1
+
+    featurizer = Loading(cycles=500, return_std=True)
+    features = featurizer.featurize(hkust_structure)
+    assert len(features) == len(featurizer.feature_labels()) == 4
+
+    # make sure we indeed use pyeqeq
+    featurizer = Loading(cycles=500, return_std=True, run_eqeq=False)
+    features_no_charge = featurizer.featurize(hkust_structure)
+    assert features_no_charge[0] < features[0]