
[WIP] SamTov measure memory scaling #476

Open
wants to merge 52 commits into base: main
Changes from 13 commits
Commits (52)
7909e35
remove GPU keyword
PythonFZ Jan 21, 2022
99ff3d4
Add GPU check and include it in the memory_manager
PythonFZ Jan 21, 2022
5abee31
Merge branch 'main' into gpu_batching
SamTov Jan 21, 2022
d1add32
Merge branch 'main' into gpu_batching
PythonFZ Jan 24, 2022
ab1a4ce
Merge branch 'main' into gpu_batching
PythonFZ Jan 24, 2022
2d85c89
start memory measurement modules.
SamTov Jan 24, 2022
4cd6d00
Initial commit to scaling function updates.
SamTov Jan 25, 2022
eb6283b
Merge branch 'main' into SamTov_Measure_Memory_Scaling
SamTov Jan 25, 2022
bee4387
run black and isort
SamTov Jan 25, 2022
00f5cd2
Merge remote-tracking branch 'origin/SamTov_Measure_Memory_Scaling' i…
SamTov Jan 25, 2022
83d24f6
remove file call in CI.
SamTov Jan 25, 2022
81c867c
Fix additional flake8 import complaint
SamTov Jan 25, 2022
505acff
add config memory testing and include an override for batching.
SamTov Jan 25, 2022
e320755
remove config argument.
SamTov Jan 25, 2022
d898394
resolve flake8 complaint.
SamTov Jan 25, 2022
c974a78
CI profiling
PythonFZ Feb 1, 2022
25d0fe3
CI profiling
PythonFZ Feb 1, 2022
2b97375
update sqlite
PythonFZ Feb 1, 2022
7f51983
typo
PythonFZ Feb 1, 2022
4b23a73
patch ubuntu version
PythonFZ Feb 1, 2022
4729e28
try conda for newer sqlite version
PythonFZ Feb 1, 2022
43ed753
try conda for newer sqlite version
PythonFZ Feb 1, 2022
a83e48e
update sqlite version
PythonFZ Feb 1, 2022
a8c8af8
bugfix
PythonFZ Feb 1, 2022
883474d
add a plot
PythonFZ Feb 1, 2022
8d77a09
plot everything
PythonFZ Feb 1, 2022
0601c84
plot everything
PythonFZ Feb 1, 2022
aedc169
run ADF memory test
PythonFZ Feb 1, 2022
b9ca27f
run ADF memory test
PythonFZ Feb 1, 2022
492370f
Update test_memory.py
PythonFZ Feb 1, 2022
719de34
Update test_memory.py
PythonFZ Feb 1, 2022
3542778
reduce size even further
PythonFZ Feb 1, 2022
1ebdc2f
Update test_memory.py
PythonFZ Feb 1, 2022
e12368c
Update test_memory.py
PythonFZ Feb 1, 2022
869c047
remove print
PythonFZ Feb 2, 2022
4bf249c
Merge branch 'main' into SamTov_Measure_Memory_Scaling
PythonFZ Feb 2, 2022
1713f8a
clean up a bit
PythonFZ Feb 2, 2022
7898a6b
fix black / flake8
PythonFZ Feb 2, 2022
8743d23
add plot function
PythonFZ Feb 2, 2022
327e538
add update to not spam to PR
PythonFZ Feb 2, 2022
427fd13
only run on push
PythonFZ Feb 2, 2022
f49303e
add package
PythonFZ Feb 2, 2022
3b8ad22
small code cleanup + update
PythonFZ Feb 2, 2022
8f14b22
Update lint.yaml
PythonFZ Feb 2, 2022
963580e
add diffusion + fix plots
PythonFZ Feb 2, 2022
de1ef0d
Merge remote-tracking branch 'origin/SamTov_Measure_Memory_Scaling' i…
PythonFZ Feb 2, 2022
b5cf14c
add continue-on-error to still gather the plot at the end.
PythonFZ Feb 2, 2022
d66ca6e
add GK diffusion
PythonFZ Feb 2, 2022
91e06b9
deselect memory by default
PythonFZ Feb 2, 2022
1eddad1
enable memory management
PythonFZ Feb 2, 2022
a262669
add einstein data range test
PythonFZ Feb 2, 2022
a756c6b
run with / without fixture
PythonFZ Feb 2, 2022
@@ -30,6 +30,7 @@
from zinchub import DataHub

import mdsuite as mds
mds.config.memory_fraction = 1
from mdsuite.utils.testing import assertDeepAlmostEqual
SamTov marked this conversation as resolved.


128 changes: 128 additions & 0 deletions CI/memory_scaling/test_scaling_coefficients.py
@@ -0,0 +1,128 @@
"""
MDSuite: A Zincwarecode package.

License
-------
This program and the accompanying materials are made available under the terms
of the Eclipse Public License v2.0 which accompanies this distribution, and is
available at https://www.eclipse.org/legal/epl-v20.html

SPDX-License-Identifier: EPL-2.0

Copyright Contributors to the Zincwarecode Project.

Contact Information
-------------------
email: zincwarecode@gmail.com
github: https://github.com/zincware
web: https://zincwarecode.com/

Citation
--------
If you use this module please cite us with:

Summary
-------
Module to test scaling coefficients.
"""
import sqlite3

import numpy as np
import pandas as pd
import pytest

import mdsuite
import mdsuite.transformations


def _build_atomwise(data_scaling: int, system: bool = False):
"""
Build a numpy array of atom-wise data in steps of MBs.

Parameters
----------
data_scaling : int
Number of atoms in the data, i.e., the size of the zeroth axis of the
array. One atom corresponds to 1/10 of a MB of data.
system : bool
If true, the returned array should be (n_confs, 3)

Returns
-------
data_array : np.ndarray
A numpy array of ones whose size comes close to 1/10 * data_scaling MB
(~98% of that figure).

Notes
-----
TODO: When moved to (confs, n_atoms, dim), this will need to be updated to take the
first column as atoms otherwise the memory scaling will be wrong.

"""
if system:
return np.ones((data_scaling * 4096, 3))
else:
return np.ones((data_scaling, 4096, 3))
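The sizing claim in the docstring can be checked directly; a minimal sketch (the helper name is hypothetical, mirroring `_build_atomwise` above):

```python
import numpy as np


def build_atomwise(data_scaling: int, system: bool = False) -> np.ndarray:
    """Mirror of _build_atomwise: one unit of data_scaling corresponds to
    4096 configurations of 3 float64 components."""
    if system:
        return np.ones((data_scaling * 4096, 3))
    return np.ones((data_scaling, 4096, 3))


# One "atom" of data: 4096 * 3 * 8 bytes = 98,304 bytes, i.e. ~98% of a
# decimal 1/10 MB (100,000 bytes), matching the docstring's ~98% note.
print(build_atomwise(1).nbytes)  # 98304
print(build_atomwise(2, system=True).shape)  # (8192, 3)
```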


@pytest.fixture()
def mdsuite_project(tmp_path) -> mdsuite.Project:
"""
Build an MDSuite project with all data stored in a temp directory for easier
cleanup after the test.

Returns
-------
project : mdsuite.Project
MDSuite project to be used in the tests.
"""
project = mdsuite.Project(storage_path=tmp_path.as_posix())

# Planned scaling sizes for the test data sets (currently unused).
scaling_sizes = [10, 100, 500, 1000]

return project


def get_memory_usage(database: str, callable_name: str) -> float:
"""
Get the memory used from the dumped sql database.

Parameters
----------
database : str
Path to the sqlite database that will be read.
callable_name : str
Name of the function being measured and therefore, what memory value to
return.

Returns
-------
memory : float
memory used during the calculation.
"""
with sqlite3.connect(database) as db:
data = pd.read_sql_query("SELECT * from TEST_METRICS", db)

data = data.loc[data["ITEM"] == callable_name]

return float(data["MEM_USAGE"].iloc[0])
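The same lookup can be sketched without the pandas dependency, against a throwaway database using the `TEST_METRICS` layout assumed above (table and column names are taken from the helper; the sample value is invented):

```python
import sqlite3

# Throwaway in-memory database with the TEST_METRICS layout the helper queries.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE TEST_METRICS (ITEM TEXT, MEM_USAGE REAL)")
conn.execute("INSERT INTO TEST_METRICS VALUES ('test_rdf_memory', 512.0)")


def get_memory_usage(db: sqlite3.Connection, callable_name: str) -> float:
    """Return the memory recorded for one measured function."""
    row = db.execute(
        "SELECT MEM_USAGE FROM TEST_METRICS WHERE ITEM = ?", (callable_name,)
    ).fetchone()
    return row[0]


print(get_memory_usage(conn, "test_rdf_memory"))  # 512.0
```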


def test_rdf_memory(mdsuite_project):
"""
Test the memory of the RDF.

Parameters
----------
mdsuite_project : mdsuite.Project
An mdsuite project with stored files in a tmp directory.

Returns
-------

"""
memory_array = np.zeros((2,))
mdsuite_project.run.RadialDistributionFunction(plot=False)
memory = get_memory_usage("pymon.db", test_rdf_memory.__name__)
memory_array[0] = memory

print(memory_array)
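Given measurements for each size in `scaling_sizes`, the scaling coefficient the module title refers to could be fitted as below (a sketch; the memory values are invented for illustration, real values would come from the pymon.db dump):

```python
import numpy as np

# Scaling sizes from the fixture; memory values (MB) are hypothetical.
sizes = np.array([10.0, 100.0, 500.0, 1000.0])
memory = np.array([12.0, 105.0, 510.0, 1015.0])

# Fit memory ~= slope * size + intercept; a slope near 1 MB per unit
# size would indicate linear memory scaling of the calculator.
slope, intercept = np.polyfit(sizes, memory, deg=1)
print(f"slope ~ {slope:.2f} MB per unit size")
```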
6 changes: 3 additions & 3 deletions CI/unit_tests/memory_manager/test_memory_manager.py
@@ -22,7 +22,7 @@
import unittest

import numpy as np

import mdsuite
from mdsuite.memory_management.memory_manager import MemoryManager


@@ -146,7 +146,6 @@ def test_get_batch_size(self):
# Test correct returns for 1 batch
self.memory_manager.database = TestDatabase(data_size=500, rows=10, columns=10)
self.memory_manager.data_path = ["Test/Path"]
self.memory_manager.memory_fraction = 0.5
self.memory_manager.machine_properties["memory"] = 50000
batch_size, number_of_batches, remainder = self.memory_manager.get_batch_size(
system=False
@@ -188,7 +187,8 @@ def test_get_optimal_batch_size(self):
the same value that is passed to it.
"""
data = self.memory_manager._get_optimal_batch_size(10)
self.assertEqual(data, data) # Todo: no shit, sherlock
self.assertEqual(data, 10) # Todo: no shit, sherlock
mdsuite.config.memory_scaling_test = True

def test_compute_atomwise_minibatch(self):
"""
2 changes: 1 addition & 1 deletion mdsuite/experiment/experiment.py
@@ -607,7 +607,7 @@ def _store_metadata(self, metadata: TrajectoryMetadata, update_with_pubchempy=Fa
----------
metadata: TrajectoryMetadata
update_with_pubchempy: bool
Load data from pubchempy and add it to fill missing infomration
Load data from pubchempy and add it to fill missing information.
"""
# new trajectory: store all metadata and construct a new database
self.temperature = metadata.temperature
38 changes: 26 additions & 12 deletions mdsuite/memory_management/memory_manager.py
@@ -23,12 +23,12 @@

Summary
-------
Module to manage the memory use of MDSuite operations.
"""
import logging
from typing import Tuple

import numpy as np
import tensorflow as tf

from mdsuite.database.simulation_database import Database
from mdsuite.utils.meta_functions import get_machine_properties, gpu_available
@@ -38,6 +38,7 @@
polynomial_scale_function,
quadratic_scale_function,
)
from mdsuite.utils import config

log = logging.getLogger(__name__)

@@ -58,11 +59,20 @@ class MemoryManager:
Attributes
----------
data_path : list
Path to reference the data in the hdf5 database.
database : Database
Database to look through.
parallel : bool
If true, batch sizes should take into account the use of multiple machines
with shared memory. TODO: This is outdated.
memory_fraction : float
Fraction of the available memory to use. TODO: with perfect scaling,
this could be 100 % of the free memory.
scale_function : dict
Function to use to describe how the memory scaling changes with changing
data size.
gpu : bool
If true, a gpu is available.
"""

def __init__(
@@ -93,7 +103,8 @@ def __init__(
scale_function : dict
Scaling function to compute the memory scaling of a calculator.
gpu : bool
If true, gpu should be used.
If true, a GPU has been detected and the available memory will be
calculated from the GPU.
offset : int
If data is being loaded from a non-zero point in the database the
offset is used to take this into account. For example, expanding a
@@ -104,7 +115,6 @@
self.data_path = data_path
self.parallel = parallel
self.database = database
self.memory_fraction = memory_fraction
self.offset = offset

self.machine_properties = get_machine_properties()
@@ -115,9 +125,6 @@
memory = self.machine_properties["gpu"][item]["memory"]

self.machine_properties["memory"] = memory * 1e6
tf.device("gpu")
else:
tf.device("cpu")

self.batch_size = None
self.n_batches = None
@@ -209,13 +216,13 @@ def get_batch_size(self, system: bool = False) -> tuple:
)
maximum_loaded_configurations = int(
np.clip(
(self.memory_fraction * self.machine_properties["memory"])
(config.memory_fraction * self.machine_properties["memory"])
/ per_configuration_memory,
1,
n_configs - self.offset,
)
)
batch_size = self._get_optimal_batch_size(maximum_loaded_configurations)
batch_size = self._get_optimal_batch_size(maximum_loaded_configurations, n_configs)
number_of_batches, remainder = divmod((n_configs - self.offset), batch_size)
self.batch_size = batch_size
self.n_batches = number_of_batches
@@ -241,23 +248,30 @@ def hdf5_load_time(n: int):
return np.log(n)

@staticmethod
def _get_optimal_batch_size(naive_size):
def _get_optimal_batch_size(naive_size, n_configs: int):
"""
Use the open/close and read speeds of the hdf5 database_path as well as the
operation being performed to get an optimal batch size.

This is where the memory scaling test will be enforced.

Parameters
----------
naive_size : int
Naive batch size to be optimized
n_configs : int
Total number of configurations in the database.

Returns
-------
batch_size : int
An optimized batch size
"""
# db_io_time = self.database.get_load_time()
return naive_size
if config.memory_scaling_test:
return n_configs
else:
return naive_size
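Taken together, the clip in `get_batch_size` and the scaling-test override behave roughly as follows (a standalone sketch; the function name and memory figures are illustrative):

```python
import numpy as np


def batch_size(per_config_memory: float, n_configs: int,
               available_memory: float, memory_fraction: float = 0.5,
               memory_scaling_test: bool = False, offset: int = 0) -> int:
    # Scaling-test override: load every configuration in a single batch
    # so the peak memory of the calculator is actually reached.
    if memory_scaling_test:
        return n_configs
    # Otherwise clip the naive estimate to [1, n_configs - offset].
    return int(np.clip(
        (memory_fraction * available_memory) / per_config_memory,
        1,
        n_configs - offset,
    ))


print(batch_size(1e6, 1000, 100e6))  # 50 (half of 100 MB at 1 MB per config)
print(batch_size(1e6, 1000, 100e6, memory_scaling_test=True))  # 1000
```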

def _compute_atomwise_minibatch(self, data_range: int):
"""
@@ -310,7 +324,7 @@ def _compute_atomwise_minibatch(self, data_range: int):
)
batch_size = int(
np.clip(
self.memory_fraction
config.memory_fraction
* self.machine_properties["memory"]
/ per_atom_memory,
1,
@@ -323,7 +337,7 @@
atom_batch_memory = fraction * per_atom_memory
batch_size = int(
np.clip(
self.memory_fraction
config.memory_fraction
* self.machine_properties["memory"]
/ atom_batch_memory,
1,
14 changes: 14 additions & 0 deletions mdsuite/utils/config.py
@@ -23,6 +23,8 @@

Summary
-------
A set of configuration parameters for the MDSuite framework. Includes information
regarding memory fraction, scaling test state, jupyter use and so on.
"""
from dataclasses import dataclass

@@ -36,10 +38,22 @@ class Config:
bokeh_sizing_mode: str
The way bokeh scales plots.
see bokeh / sizing_mode for more information
jupyter : bool
If true, jupyter is being used.
GPU: bool
TODO I think this is outdated.
memory_scaling_test : bool
If true, a scaling test is being performed, so the full data set is
loaded as a single batch. Should typically be accompanied by the
memory fraction being set to 1 as well.
memory_fraction : float
The fraction of the available memory to be used.
"""

jupyter: bool = False
GPU: bool = False
memory_scaling_test: bool = False
memory_fraction: float = 0.5
Comment on lines +55 to +56

Member: Maybe we could expand the config to be config.memory.scaling_test = True instead of config.memory_scaling_test = True, with additional dataclasses. This way it could be more structured.

Member (Author): Yeah, that's an interesting idea. Managing how configuration settings are made in general is a nice thing to discuss, as it can be quite involved. I think having data classes for different things like you mention here would be very nice.
bokeh_sizing_mode: str = "stretch_both"
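A scaling run would flip both knobs together, as the docstring suggests (sketched here with a local copy of the dataclass; the real object lives at `mdsuite.config`):

```python
from dataclasses import dataclass


@dataclass
class Config:
    # Local mirror of mdsuite/utils/config.py after this diff.
    jupyter: bool = False
    GPU: bool = False
    memory_scaling_test: bool = False
    memory_fraction: float = 0.5
    bokeh_sizing_mode: str = "stretch_both"


config = Config()

# Memory-scaling runs disable batching and allow the full memory budget,
# matching what the CI test file sets (mds.config.memory_fraction = 1).
config.memory_scaling_test = True
config.memory_fraction = 1.0
print(config)
```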

