diff --git a/docs/source/index.rst b/docs/source/index.rst index 8029dfa7c3..5fa8c304be 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -53,6 +53,7 @@ Projects projects/compression.rst projects/second_order.rst projects/monodomain.rst + projects/GPU.rst API documentation diff --git a/docs/source/projects/GPU.rst b/docs/source/projects/GPU.rst new file mode 100644 index 0000000000..d6935d722b --- /dev/null +++ b/docs/source/projects/GPU.rst @@ -0,0 +1 @@ +.. include:: /../../pySDC/projects/GPU/README.rst diff --git a/pySDC/implementations/datatype_classes/cupy_mesh.py b/pySDC/implementations/datatype_classes/cupy_mesh.py index 0b27524f32..f3149cfedd 100644 --- a/pySDC/implementations/datatype_classes/cupy_mesh.py +++ b/pySDC/implementations/datatype_classes/cupy_mesh.py @@ -5,6 +5,11 @@ except ImportError: MPI = None +try: + from pySDC.helpers.NCCL_communicator import NCCLComm +except ImportError: + NCCLComm = None + class cupy_mesh(cp.ndarray): """ @@ -31,7 +36,7 @@ def __new__(cls, init, val=0.0, **kwargs): obj[:] = init[:] elif ( isinstance(init, tuple) - and (init[1] is None or isinstance(init[1], MPI.Intracomm)) + and (init[1] is None or isinstance(init[1], MPI.Intracomm) or isinstance(init[1], NCCLComm)) and isinstance(init[2], cp.dtype) ): obj = cp.ndarray.__new__(cls, init[0], dtype=init[2], **kwargs) @@ -62,12 +67,15 @@ def __abs__(self): float: absolute maximum of all mesh values """ # take absolute values of the mesh values - local_absval = float(cp.amax(cp.ndarray.__abs__(self))) + local_absval = cp.max(cp.ndarray.__abs__(self)) if self.comm is not None: if self.comm.Get_size() > 1: - global_absval = 0.0 - global_absval = max(self.comm.allreduce(sendobj=local_absval, op=MPI.MAX), global_absval) + global_absval = local_absval * 0 + if isinstance(self.comm, NCCLComm): + self.comm.Allreduce(sendbuf=local_absval, recvbuf=global_absval, op=MPI.MAX) + else: + global_absval = self.comm.allreduce(sendobj=float(local_absval), op=MPI.MAX) else: global_absval = local_absval else: diff --git a/pySDC/implementations/problem_classes/GrayScott_MPIFFT.py b/pySDC/implementations/problem_classes/GrayScott_MPIFFT.py index 26bf60cacf..ba14ea762d 100644 --- a/pySDC/implementations/problem_classes/GrayScott_MPIFFT.py +++ b/pySDC/implementations/problem_classes/GrayScott_MPIFFT.py @@ -1,10 +1,8 @@ import scipy.sparse as sp -from mpi4py import MPI -from mpi4py_fft import PFFT from pySDC.core.errors import ProblemError -from pySDC.core.problem import Problem, WorkCounter -from pySDC.implementations.datatype_classes.mesh import mesh, imex_mesh, comp2_mesh +from pySDC.core.problem import WorkCounter +from pySDC.implementations.datatype_classes.mesh import comp2_mesh from pySDC.implementations.problem_classes.generic_MPIFFT_Laplacian import IMEX_Laplacian_MPIFFT from mpi4py_fft import newDistArray @@ -48,6 +46,8 @@ class grayscott_imex_diffusion(IMEX_Laplacian_MPIFFT): Denotes the period of the function to be approximated for the Fourier transform. comm : COMM_WORLD, optional Communicator for ``mpi4py-fft``. + num_blobs : int, optional + Number of blobs in the initial conditions. Negative values give rectangles. Attributes ---------- @@ -71,18 +71,35 @@ class grayscott_imex_diffusion(IMEX_Laplacian_MPIFFT): .. 
[3] https://www.chebfun.org/examples/pde/GrayScott.html """ - def __init__(self, Du=1.0, Dv=0.01, A=0.09, B=0.086, **kwargs): - kwargs['L'] = 2.0 - super().__init__(dtype='d', alpha=1.0, x0=-kwargs['L'] / 2.0, **kwargs) + def __init__( + self, + Du=1.0, + Dv=0.01, + A=0.09, + B=0.086, + L=2.0, + num_blobs=1, + **kwargs, + ): + super().__init__(dtype='d', alpha=1.0, x0=-L / 2.0, L=L, **kwargs) # prepare the array with two components shape = (2,) + (self.init[0]) self.iU = 0 self.iV = 1 self.ncomp = 2 # needed for transfer class - self.init = (shape, self.comm, self.xp.dtype('float')) - self._makeAttributeAndRegister('Du', 'Dv', 'A', 'B', localVars=locals(), readOnly=True) + self.init = (shape, self.comm, self.xp.dtype('complex') if self.spectral else self.xp.dtype('float')) + + self._makeAttributeAndRegister( + 'Du', + 'Dv', + 'A', + 'B', + 'num_blobs', + localVars=locals(), + readOnly=True, + ) # prepare "Laplacians" self.Ku = -self.Du * self.K2 @@ -168,7 +185,7 @@ def solve_system(self, rhs, factor, u0, t): return me - def u_exact(self, t): + def u_exact(self, t, seed=10700000): r""" Routine to compute the exact solution at time :math:`t = 0`, see [3]_. @@ -185,19 +202,135 @@ def u_exact(self, t): assert t == 0.0, 'Exact solution only valid as initial condition' assert self.ndim == 2, 'The initial conditions are 2D for now..' - me = self.dtype_u(self.init, val=0.0) + xp = self.xp + + _u = xp.zeros_like(self.X[0]) + _v = xp.zeros_like(self.X[0]) + + rng = xp.random.default_rng(seed) + + if self.num_blobs < 0: + """ + Rectangles with stationary background, see arXiv:1501.01990 + """ + F, k = self.A, self.B - self.A + A = xp.sqrt(F) / (F + k) + + # set stable background state from Equation 2 + assert 2 * k < xp.sqrt(F) - 2 * F, 'Kill rate is too large to facilitate stable background' + _u[...] = (A - xp.sqrt(A**2 - 4)) / (2 * A) + _v[...] = xp.sqrt(F) * (A + xp.sqrt(A**2 - 4)) / 2 + + for _ in range(-self.num_blobs): + x0, y0 = rng.random(size=2) * self.L[0] - self.L[0] / 2 + lx, ly = rng.random(size=2) * self.L[0] / self.nvars[0] * 30 + + mask_x = xp.logical_and(self.X[0] > x0, self.X[0] < x0 + lx) + mask_y = xp.logical_and(self.X[1] > y0, self.X[1] < y0 + ly) + mask = xp.logical_and(mask_x, mask_y) + + _u[mask] = rng.random() + _v[mask] = rng.random() + + elif self.num_blobs > 0: + """ + Blobs as in https://www.chebfun.org/examples/pde/GrayScott.html + """ + + inc = self.L[0] / (self.num_blobs + 1) + + for i in range(1, self.num_blobs + 1): + for j in range(1, self.num_blobs + 1): + signs = (-1) ** rng.integers(low=0, high=2, size=2) + + # This assumes that the box is [-L/2, L/2]^2 + _u[...] += -xp.exp( + -80.0 + * ( + (self.X[0] + self.x0 + inc * i + signs[0] * 0.05) ** 2 + + (self.X[1] + self.x0 + inc * j + signs[1] * 0.02) ** 2 + ) + ) + _v[...] += xp.exp( + -80.0 + * ( + (self.X[0] + self.x0 + inc * i - signs[0] * 0.05) ** 2 + + (self.X[1] + self.x0 + inc * j - signs[1] * 0.02) ** 2 + ) + ) + + _u += 1 + else: + raise NotImplementedError - # This assumes that the box is [-L/2, L/2]^2 + u = self.u_init if self.spectral: - tmp = 1.0 - self.xp.exp(-80.0 * ((self.X[0] + 0.05) ** 2 + (self.X[1] + 0.02) ** 2)) - me[0, ...] = self.fft.forward(tmp) - tmp = self.xp.exp(-80.0 * ((self.X[0] - 0.05) ** 2 + (self.X[1] - 0.02) ** 2)) - me[1, ...] = self.fft.forward(tmp) + u[0, ...] = self.fft.forward(_u) + u[1, ...] = self.fft.forward(_v) else: - me[0, ...] = 1.0 - self.xp.exp(-80.0 * ((self.X[0] + 0.05) ** 2 + (self.X[1] + 0.02) ** 2)) - me[1, ...] 
= self.xp.exp(-80.0 * ((self.X[0] - 0.05) ** 2 + (self.X[1] - 0.02) ** 2))
+            u[0, ...] = _u
+            u[1, ...] = _v

-        return me
+        return u
+
+    def get_fig(self, n_comps=2):  # pragma: no cover
+        """
+        Get a figure suitable to plot the solution of this problem
+
+        Args:
+            n_comps (int): Number of components that fit in the solution
+
+        Returns
+        -------
+        self.fig : matplotlib.pyplot.figure.Figure
+        """
+        import matplotlib.pyplot as plt
+        from mpl_toolkits.axes_grid1 import make_axes_locatable
+
+        plt.rcParams['figure.constrained_layout.use'] = True
+
+        if n_comps == 2:
+            self.fig, axs = plt.subplots(1, 2, sharex=True, sharey=True, figsize=((6, 3)))
+            divider = make_axes_locatable(axs[1])
+            self.cax = divider.append_axes('right', size='3%', pad=0.03)
+        else:
+            self.fig, ax = plt.subplots(1, 1, figsize=((6, 5)))
+            divider = make_axes_locatable(ax)
+            self.cax = divider.append_axes('right', size='3%', pad=0.03)
+        return self.fig
+
+    def plot(self, u, t=None, fig=None):  # pragma: no cover
+        r"""
+        Plot the solution. Please supply a figure with the same structure as returned by ``self.get_fig``.
+
+        Parameters
+        ----------
+        u : dtype_u
+            Solution to be plotted
+        t : float
+            Time to display at the top of the figure
+        fig : matplotlib.pyplot.figure.Figure
+            Figure with the correct structure
+
+        Returns
+        -------
+        None
+        """
+        fig = self.get_fig(n_comps=2) if fig is None else fig
+        axs = fig.axes
+
+        vmin = u.min()
+        vmax = u.max()
+        for i, label in zip([self.iU, self.iV], [r'$u$', r'$v$']):
+            im = axs[i].pcolormesh(self.X[0], self.X[1], u[i], vmin=vmin, vmax=vmax)
+            axs[i].set_aspect(1)
+            axs[i].set_title(label)
+
+        if t is not None:
+            fig.suptitle(f't = {t:.2e}')
+        axs[0].set_xlabel(r'$x$')
+        axs[0].set_ylabel(r'$y$')
+        fig.colorbar(im, self.cax)


 class grayscott_imex_linear(grayscott_imex_diffusion):
diff --git a/pySDC/implementations/problem_classes/generic_MPIFFT_Laplacian.py b/pySDC/implementations/problem_classes/generic_MPIFFT_Laplacian.py
index 215c45336d..e1fc8f6812 100644
--- a/pySDC/implementations/problem_classes/generic_MPIFFT_Laplacian.py
+++ b/pySDC/implementations/problem_classes/generic_MPIFFT_Laplacian.py
@@ -66,8 +66,6 @@ def setup_GPU(cls):
     def __init__(
         self, nvars=None, spectral=False, L=2 * np.pi, alpha=1.0, comm=MPI.COMM_WORLD, dtype='d', useGPU=False, x0=0.0
     ):
-        """Initialization routine"""
-
         if useGPU:
             self.setup_GPU()

@@ -98,7 +96,7 @@ def __init__(
         # invoke super init, passing the communicator and the local dimensions as init
         super().__init__(init=(tmp_u.shape, comm, tmp_u.dtype))
         self._makeAttributeAndRegister(
-            'nvars', 'spectral', 'L', 'alpha', 'comm', 'x0', localVars=locals(), readOnly=True
+            'nvars', 'spectral', 'L', 'alpha', 'comm', 'x0', 'useGPU', localVars=locals(), readOnly=True
         )

         # get local mesh
diff --git a/pySDC/projects/GPU/README.md b/pySDC/projects/GPU/README.rst
similarity index 55%
rename from pySDC/projects/GPU/README.md
rename to pySDC/projects/GPU/README.rst
index a806c50120..b0fe8173aa 100644
--- a/pySDC/projects/GPU/README.md
+++ b/pySDC/projects/GPU/README.rst
@@ -1,8 +1,9 @@
 pySDC using GPUs
-===================
+================
+
 Installation
 ------------
-In order to start playing on GPU, install `pySDC` and it's dependencies, ideally in developer mode.
+In order to start playing on GPU, install `pySDC` and its dependencies, ideally in developer mode.
 First start by setting up a virtual environment, e.g. by using `Miniconda <https://docs.conda.io/en/latest/miniconda.html>`_.
Then also add the CuPy Package (the cuda-toolkit will be installed automatically):
@@ -13,28 +14,49 @@ When this is done (and it can take a while), you have your setup to run `pySDC`
 on the GPU.

 Changes in the problem_classes
-------------
+------------------------------
 Now you have to change a few things in the problem_classes. The first and easy step is to change the datatype.
 To use pySDC on the GPU with CuPy you must use the `cupy-datatype <../../implementations/datatype_classes/cupy_mesh.py>`_.
 The next step is to import cupy in the problem_class. In the following you have to exchange the NumPy/SciPy functions
 with the CuPy functions. A `Comparison Table <https://docs.cupy.dev/en/latest/reference/comparison.html>`_ is provided
 by CuPy for this.
-For Exmaple: The above steps can be traced using the files
+For example: The above steps can be traced using the files
 `HeatEquation_ND_FD_forced_periodic.py <../../implementations/problem_classes/HeatEquation_ND_FD_forced_periodic.py>`_ and
 `HeatEquation_ND_FD_forced_periodic_gpu.py <../../implementations/problem_classes/HeatEquation_ND_FD_forced_periodic_gpu.py>`_

 Now you are ready to run `pySDC` on the GPU.

 Run pySDC on the GPU
-------------
-You have to configure a Script to run it. You can see at the file [heat.py](heat.py) that the parameters are the
+--------------------
+You have to configure a script to run it. The file `heat.py <heat.py>`_ shows that the parameters are the
 same for GPU and CPU. Only the import for the problem_class changed.
-
-
 More examples
-----------
+-------------
 Further examples can be found with Allen-Cahn:

 * problem: `AllenCahn_2D_FD.py <../../implementations/problem_classes/AllenCahn_2D_FD.py>`_ and `AllenCahn_2D_FD_gpu.py <../../implementations/problem_classes/AllenCahn_2D_FD_gpu.py>`_
 * problem: `AllenCahn_2D_FFT.py <../../implementations/problem_classes/AllenCahn_2D_FFT.py>`_ and `AllenCahn_2D_FFT_gpu.py <../../implementations/problem_classes/AllenCahn_2D_FFT_gpu.py>`_
 * Script to run pySDC: `ac-fft.py <ac-fft.py>`_

+Running large problems on GPU
+-----------------------------
+This project contains some infrastructure for running and plotting specific problems.
+The main file is `run_experiment.py` and can be configured using command line arguments.
+For instance, use
+
+.. code-block:: bash
+
+    srun -n 4 python run_experiment.py --config=GS_USkate --procs=1/1/4 --useGPU=True --mode=run
+    mpirun -np 8 python run_experiment.py --config=GS_USkate --procs=1/1/4 --useGPU=True --mode=plot
+    python run_experiment.py --config=GS_USkate --procs=1/1/4 --useGPU=True --mode=video
+
+to first run the problem, then make plots and then make a video for Gray-Scott with the U-Skate configuration (see arXiv:1501.01990).
+
+To do a parallel scaling test, you can go to JUWELS Booster and use, for instance,
+
+.. code-block:: bash
+
+    python analysis_scripts/parallel_scaling.py --mode=run --scaling=strong --space_time=True --XPU=GPU --problem=GS
+    srun python analysis_scripts/parallel_scaling.py --mode=plot --scaling=strong --space_time=True --XPU=GPU --problem=GS
+
+This will generate jobscripts and submit the jobs. Note that you have to wait for the jobs to complete before you can plot them.
+
+To learn more about the options for the scripts, run them with `--help`.
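Editorial note on the datatype swap described above: the following is a minimal, self-contained sketch of the CPU/GPU switch (not part of the diff; the conditional import is an assumed pattern, while the ``(shape, communicator, dtype)`` init tuple follows the ``cupy_mesh.__new__`` signature shown earlier in this diff):

```python
# Hedged sketch: pick CuPy if it is installed, otherwise fall back to NumPy.
try:
    import cupy as xp  # GPU backend; swap NumPy/SciPy calls per CuPy's comparison table
    from pySDC.implementations.datatype_classes.cupy_mesh import cupy_mesh as mesh
except ImportError:
    import numpy as xp  # CPU fallback
    from pySDC.implementations.datatype_classes.mesh import mesh

# both mesh classes accept an init tuple of (shape, communicator, dtype)
u = mesh(((128,), None, xp.dtype('float64')), val=0.0)
u[:] = xp.exp(-80.0 * xp.linspace(-1.0, 1.0, 128) ** 2)
print(type(u).__name__, float(abs(u)))  # __abs__ reduces to the (global) max magnitude
```

On a GPU node this picks CuPy automatically; on a machine without CuPy the same script runs on NumPy, which is convenient for local testing.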
diff --git a/pySDC/projects/GPU/analysis_scripts/parallel_scaling.py b/pySDC/projects/GPU/analysis_scripts/parallel_scaling.py new file mode 100644 index 0000000000..6358ee66dc --- /dev/null +++ b/pySDC/projects/GPU/analysis_scripts/parallel_scaling.py @@ -0,0 +1,195 @@ +import matplotlib.pyplot as plt +import numpy as np +import pickle +from pySDC.helpers.stats_helper import get_sorted +from pySDC.projects.GPU.configs.base_config import get_config +from pySDC.projects.GPU.etc.generate_jobscript import write_jobscript, PROJECT_PATH +from pySDC.helpers.plot_helper import setup_mpl, figsize_by_journal + +setup_mpl() + + +class ScalingConfig(object): + cluster = None + config = '' + base_resolution = -1 + base_resolution_weak = -1 + useGPU = False + partition = None + tasks_per_node = None + ndim = 2 + tasks_time = 1 + max_steps_space = None + max_steps_space_weak = None + sbatch_options = [] + max_nodes = 9999 + + def __init__(self, space_time_parallel): + if space_time_parallel in ['False', False]: + self._tasks_time = 1 + else: + self._tasks_time = self.tasks_time + + def get_resolution_and_tasks(self, strong, i): + if strong: + return self.base_resolution, [1, self._tasks_time, 2**i] + else: + return self.base_resolution_weak * int(self._tasks_time ** (1.0 / self.ndim)) * (2**i), [ + 1, + self._tasks_time, + (2 * self.ndim) ** i, + ] + + def run_scaling_test(self, strong=True): + max_steps = self.max_steps_space if strong else self.max_steps_space_weak + for i in range(max_steps): + res, procs = self.get_resolution_and_tasks(strong, i) + + _nodes = np.prod(procs) // self.tasks_per_node + if _nodes > self.max_nodes: + break + + sbatch_options = [ + f'-n {np.prod(procs)}', + f'-p {self.partition}', + f'--tasks-per-node={self.tasks_per_node}', + ] + self.sbatch_options + srun_options = [f'--tasks-per-node={self.tasks_per_node}'] + if self.useGPU: + srun_options += ['--cpus-per-task=4', '--gpus-per-task=1'] + sbatch_options += ['--cpus-per-task=4', '--gpus-per-task=1'] + + procs = (''.join(f'{me}/' for me in procs))[:-1] + command = f'run_experiment.py --mode=run --res={res} --config={self.config} --procs={procs}' + + if self.useGPU: + command += ' --useGPU=True' + + write_jobscript(sbatch_options, srun_options, command, self.cluster) + + def plot_scaling_test(self, strong, ax, plot_ideal=False, **plotting_params): # pragma: no cover + timings = {} + + max_steps = self.max_steps_space if strong else self.max_steps_space_weak + for i in range(max_steps): + res, procs = self.get_resolution_and_tasks(strong, i) + + args = {'useGPU': self.useGPU, 'config': self.config, 'res': res, 'procs': procs, 'mode': None} + + config = get_config(args) + + path = f'data/{config.get_path(ranks=[me -1 for me in procs])}-stats-whole-run.pickle' + try: + with open(path, 'rb') as file: + stats = pickle.load(file) + + timing_step = get_sorted(stats, type='timing_step') + timings[np.prod(procs) / self.tasks_per_node] = np.mean([me[1] for me in timing_step]) + except FileNotFoundError: + pass + + if plot_ideal: + if strong: + ax.loglog( + timings.keys(), + list(timings.values())[0] * list(timings.keys())[0] / np.array(list(timings.keys())), + ls='--', + color='grey', + label='ideal', + ) + ax.loglog(timings.keys(), timings.values(), **plotting_params) + ax.set_xlabel(r'$N_\mathrm{nodes}$') + ax.set_ylabel(r'$t_\mathrm{step}$') + + +class CPUConfig(ScalingConfig): + cluster = 'jusuf' + partition = 'batch' + tasks_per_node = 16 + max_nodes = 144 + + +class GPUConfig(ScalingConfig): + cluster = 'booster' + partition = 
'booster' + tasks_per_node = 4 + useGPU = True + max_nodes = 936 + + +class GrayScottSpaceScalingCPU(CPUConfig, ScalingConfig): + base_resolution = 8192 + base_resolution_weak = 512 + config = 'GS_scaling' + max_steps_space = 11 + max_steps_space_weak = 11 + tasks_time = 4 + sbatch_options = ['--time=3:30:00'] + + +class GrayScottSpaceScalingGPU(GPUConfig, ScalingConfig): + base_resolution_weak = 1024 + base_resolution = 8192 + config = 'GS_scaling' + max_steps_space = 7 + max_steps_space_weak = 5 + tasks_time = 4 + max_nodes = 64 + + +def plot_scalings(strong, problem, kwargs): # pragma: no cover + if problem == 'GS': + fig, ax = plt.subplots(figsize=figsize_by_journal('JSC_beamer', 1, 0.45)) + + plottings_params = [ + {'plot_ideal': True, 'marker': 'x', 'label': 'CPU space parallel'}, + {'marker': '>', 'label': 'CPU space time parallel'}, + {'marker': '^', 'label': 'GPU space parallel'}, + {'marker': '<', 'label': 'GPU space time parallel'}, + ] + configs = [ + GrayScottSpaceScalingCPU(space_time_parallel=False), + GrayScottSpaceScalingCPU(space_time_parallel=True), + GrayScottSpaceScalingGPU(space_time_parallel=False), + GrayScottSpaceScalingGPU(space_time_parallel=True), + ] + + for config, params in zip(configs, plottings_params): + config.plot_scaling_test(strong=strong, ax=ax, **params) + ax.legend(frameon=False) + fig.savefig(f'{PROJECT_PATH}/plots/{"strong" if strong else "weak"}_scaling_{problem}.pdf', bbox_inches='tight') + else: + raise NotImplementedError + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--scaling', type=str, choices=['strong', 'weak'], default='strong') + parser.add_argument('--mode', type=str, choices=['run', 'plot'], default='run') + parser.add_argument('--problem', type=str, default='GS') + parser.add_argument('--XPU', type=str, choices=['CPU', 'GPU'], default='CPU') + parser.add_argument('--space_time', type=str, choices=['True', 'False'], default='False') + + args = parser.parse_args() + + strong = args.scaling == 'strong' + + if args.problem == 'GS': + if args.XPU == 'CPU': + configClass = GrayScottSpaceScalingCPU + else: + configClass = GrayScottSpaceScalingGPU + else: + raise NotImplementedError(f'Don\'t know problem {args.problem!r}') + + kwargs = {'space_time_parallel': args.space_time} + config = configClass(**kwargs) + + if args.mode == 'run': + config.run_scaling_test(strong=strong) + elif args.mode == 'plot': + plot_scalings(strong=strong, problem=args.problem, kwargs=kwargs) + else: + raise NotImplementedError(f'Don\'t know mode {args.mode!r}') diff --git a/pySDC/projects/GPU/analysis_scripts/plot_step_size.py b/pySDC/projects/GPU/analysis_scripts/plot_step_size.py new file mode 100644 index 0000000000..ed107f3bc6 --- /dev/null +++ b/pySDC/projects/GPU/analysis_scripts/plot_step_size.py @@ -0,0 +1,51 @@ +import pickle +import numpy as np +import matplotlib.pyplot as plt +from pySDC.projects.GPU.configs.base_config import get_config +from pySDC.projects.GPU.run_experiment import parse_args +from pySDC.helpers.stats_helper import get_sorted + + +def plot_step_size_single(args, ax, **plotting_params): # pragma: no cover + + config = get_config(args) + + path = f'data/{config.get_path(ranks=[me -1 for me in args["procs"]])}-stats-whole-run.pickle' + with open(path, 'rb') as file: + stats = pickle.load(file) + + dt = get_sorted(stats, type='dt', recomputed=False) + + plotting_params = { + **plotting_params, + } + ax.plot([me[0] for me in dt], [me[1] for me in dt], **plotting_params) + 
ax.legend(frameon=False)
+    ax.set_ylabel(r'$\Delta t$')
+    ax.set_xlabel('$t$')
+
+
+def plot_step_size(args):  # pragma: no cover
+    fig, ax = plt.subplots()
+    plot_step_size_single(args, ax)
+
+    config = get_config(args)
+    path = f'plots/{config.get_path(ranks=[me -1 for me in args["procs"]])}-dt.pdf'
+    fig.savefig(path, bbox_inches='tight')
+    plt.show()
+
+
+def plot_step_size_GS(args):  # pragma: no cover
+    fig, ax = plt.subplots()
+    for config in ['GS_GoL', 'GS_dt']:
+        plot_step_size_single({**args, 'config': config}, ax, label=config)
+
+    path = 'plots/GrayScott-dt.pdf'
+    fig.savefig(path, bbox_inches='tight')
+    plt.show()
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    # plot_step_size(args)
+    plot_step_size_GS(args)
diff --git a/pySDC/projects/GPU/configs/GS_configs.py b/pySDC/projects/GPU/configs/GS_configs.py
new file mode 100644
index 0000000000..6ee1374d5d
--- /dev/null
+++ b/pySDC/projects/GPU/configs/GS_configs.py
@@ -0,0 +1,211 @@
+from pySDC.projects.GPU.configs.base_config import Config
+
+
+def get_config(args):
+    name = args['config']
+
+    if name == 'GS':
+        return GrayScott(args)
+    elif name == 'GS_dt':
+        return GrayScott_dt_adaptivity(args)
+    elif name == 'GS_GoL':
+        return GrayScott_GoL(args)
+    elif name == 'GS_USkate':
+        return GrayScott_USkate(args)
+    elif name == 'GS_scaling':
+        return GrayScottScaling(args)
+    else:
+        raise NotImplementedError(f'Don\'t know config {name}')
+
+
+def get_A_B_from_f_k(f, k):
+    return {'A': f, 'B': f + k}
+
+
+class GrayScott(Config):
+    Tend = 6000
+    num_frames = 200
+    sweeper_type = 'IMEX'
+    res_per_blob = 2**7
+
+    def get_LogToFile(self, ranks=None):
+        import numpy as np
+        from pySDC.implementations.hooks.log_solution import LogToFileAfterXs as LogToFile
+
+        LogToFile.path = './data/'
+        LogToFile.file_name = f'{self.get_path(ranks=ranks)}-solution'
+        LogToFile.time_increment = self.Tend / self.num_frames
+
+        def process_solution(L):
+            P = L.prob
+
+            if P.spectral:
+                uend = P.itransform(L.uend)
+            else:
+                uend = L.uend
+
+            if P.useGPU:
+                return {
+                    't': L.time + L.dt,
+                    'u': uend[0].get().view(np.ndarray),
+                    'v': uend[1].get().view(np.ndarray),
+                    'X': L.prob.X[0].get().view(np.ndarray),
+                    'Y': L.prob.X[1].get().view(np.ndarray),
+                }
+            else:
+                return {
+                    't': L.time + L.dt,
+                    'u': uend[0],
+                    'v': uend[1],
+                    'X': L.prob.X[0],
+                    'Y': L.prob.X[1],
+                }
+
+        def logging_condition(L):
+            sweep = L.sweep
+            if hasattr(sweep, 'comm'):
+                if sweep.comm.rank == sweep.comm.size - 1:
+                    return True
+                else:
+                    return False
+            else:
+                return True
+
+        LogToFile.process_solution = process_solution
+        LogToFile.logging_condition = logging_condition
+        return LogToFile
+
+    def plot(self, P, idx, n_procs_list):  # pragma: no cover
+        import numpy as np
+        from matplotlib import ticker as tkr
+
+        fig = P.get_fig(n_comps=1)
+        cax = P.cax
+        ax = fig.get_axes()[0]
+
+        buffer = {}
+        vmin = {'u': np.inf, 'v': np.inf}
+        vmax = {'u': -np.inf, 'v': -np.inf}
+
+        for rank in range(n_procs_list[2]):
+            ranks = [0, 0] + [rank]
+            LogToFile = self.get_LogToFile(ranks=ranks)
+
+            buffer[f'u-{rank}'] = LogToFile.load(idx)
+
+            vmin['v'] = min([vmin['v'], buffer[f'u-{rank}']['v'].real.min()])
+            vmax['v'] = max([vmax['v'], buffer[f'u-{rank}']['v'].real.max()])
+            vmin['u'] = min([vmin['u'], buffer[f'u-{rank}']['u'].real.min()])
+            vmax['u'] = max([vmax['u'], buffer[f'u-{rank}']['u'].real.max()])
+
+        for rank in range(n_procs_list[2]):
+            im = ax.pcolormesh(
+                buffer[f'u-{rank}']['X'],
+                buffer[f'u-{rank}']['Y'],
+                buffer[f'u-{rank}']['v'].real,
+                vmin=vmin['v'],
+                vmax=vmax['v'],
+                cmap='binary',
+            )
+            fig.colorbar(im, cax, format=tkr.FormatStrFormatter('%.1f'))
+            ax.set_title(f't={buffer[f"u-{rank}"]["t"]:.2f}')
+            ax.set_xlabel('$x$')
+            ax.set_ylabel('$y$')
+            ax.set_aspect(1.0)
+        return fig
+
+    def get_description(self, *args, res=-1, **kwargs):
+        from pySDC.implementations.problem_classes.GrayScott_MPIFFT import grayscott_imex_diffusion
+
+        desc = super().get_description(*args, **kwargs)
+
+        desc['step_params']['maxiter'] = 5
+
+        desc['level_params']['dt'] = 1e0
+        desc['level_params']['restol'] = -1
+
+        desc['sweeper_params']['quad_type'] = 'RADAU-RIGHT'
+        desc['sweeper_params']['num_nodes'] = 3
+        desc['sweeper_params']['QI'] = 'MIN-SR-S'
+        desc['sweeper_params']['QE'] = 'PIC'
+
+        desc['problem_params']['nvars'] = (2**8 if res == -1 else res,) * 2
+        desc['problem_params']['Du'] = 0.00002
+        desc['problem_params']['Dv'] = 0.00001
+        desc['problem_params']['A'] = 0.04
+        desc['problem_params']['B'] = 0.1
+        desc['problem_params']['L'] = 2 * desc['problem_params']['nvars'][0] // self.res_per_blob
+        desc['problem_params']['num_blobs'] = desc['problem_params']['nvars'][0] // self.res_per_blob
+
+        desc['problem_class'] = grayscott_imex_diffusion
+
+        return desc
+
+
+class GrayScott_dt_adaptivity(GrayScott):
+    """
+    Configuration with dt adaptivity added to base configuration
+    """
+
+    def get_description(self, *args, **kwargs):
+        from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity
+
+        desc = super().get_description(*args, **kwargs)
+        desc['convergence_controllers'][Adaptivity] = {'e_tol': 1e-5}
+        return desc
+
+
+class GrayScott_GoL(GrayScott):
+    '''
+    This configuration shows gliders that are similar in complexity to Conway's Game of Life.
+    '''
+
+    num_frames = 400
+    res_per_blob = 2**8
+
+    def get_description(self, *args, **kwargs):
+        from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity
+
+        desc = super().get_description(*args, **kwargs)
+        desc['problem_params'] = {**desc['problem_params'], **get_A_B_from_f_k(f=0.010, k=0.049)}
+        desc['convergence_controllers'][Adaptivity] = {'e_tol': 1e-5}
+        self.Tend = 10000
+        return desc
+
+
+class GrayScott_USkate(GrayScott):
+    '''
+    See arXiv:1501.01990 or http://www.mrob.com/sci/papers/2009smp-figs/index.html
+    '''
+
+    num_frames = 400
+    res_per_blob = 2**7
+
+    def get_description(self, *args, **kwargs):
+        from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity
+
+        desc = super().get_description(*args, **kwargs)
+        desc['problem_params'] = {**desc['problem_params'], **get_A_B_from_f_k(f=0.062, k=0.0609)}
+        desc['problem_params']['num_blobs'] = -12 * desc['problem_params']['L'] ** 2
+        desc['problem_params']['Du'] = 2e-5
+        desc['problem_params']['Dv'] = 1e-5
+        desc['convergence_controllers'][Adaptivity] = {'e_tol': 1e-3}
+        self.Tend = 200000
+        return desc
+
+
+class GrayScottScaling(GrayScott):
+    def get_description(self, *args, **kwargs):
+        desc = super().get_description(*args, **kwargs)
+        desc['problem_params']['L'] = 2
+        desc['problem_params']['num_blobs'] = 4
+        desc['sweeper_params']['skip_residual_computation'] = ('IT_CHECK', 'IT_DOWN', 'IT_UP', 'IT_FINE', 'IT_COARSE')
+        desc['sweeper_params']['num_nodes'] = 4
+        self.Tend = 50 * desc['level_params']['dt']
+        return desc
+
+    def get_controller_params(self, *args, **kwargs):
+        params = super().get_controller_params(*args, **kwargs)
+        params['hook_class'] = []
+        return params
diff --git a/pySDC/projects/GPU/configs/base_config.py 
b/pySDC/projects/GPU/configs/base_config.py new file mode 100644 index 0000000000..cdf89eec00 --- /dev/null +++ b/pySDC/projects/GPU/configs/base_config.py @@ -0,0 +1,191 @@ +from pySDC.core.convergence_controller import ConvergenceController +import pickle +import numpy as np + + +def get_config(args): + name = args['config'] + if name[:2] == 'GS': + from pySDC.projects.GPU.configs.GS_configs import get_config + + return get_config(args) + else: + raise NotImplementedError(f'There is no configuration called {name!r}!') + + +def get_comms(n_procs_list, comm_world=None, _comm=None, _tot_rank=0, _rank=None, useGPU=False): + from mpi4py import MPI + + comm_world = MPI.COMM_WORLD if comm_world is None else comm_world + _comm = comm_world if _comm is None else _comm + _rank = comm_world.rank if _rank is None else _rank + + if len(n_procs_list) > 0: + color = _tot_rank + _rank // n_procs_list[0] + new_comm = comm_world.Split(color) + + assert new_comm.size == n_procs_list[0] + + if useGPU: + import cupy_backends + + try: + import cupy + from pySDC.helpers.NCCL_communicator import NCCLComm + + new_comm = NCCLComm(new_comm) + except ( + ImportError, + cupy_backends.cuda.api.runtime.CUDARuntimeError, + cupy_backends.cuda.libs.nccl.NcclError, + ): + print('Warning: Communicator is MPI instead of NCCL in spite of using GPUs!') + + return [new_comm] + get_comms( + n_procs_list[1:], + comm_world, + _comm=new_comm, + _tot_rank=_tot_rank + _comm.size * new_comm.rank, + _rank=_comm.rank // new_comm.size, + useGPU=useGPU, + ) + else: + return [] + + +class Config(object): + sweeper_type = None + Tend = None + + def __init__(self, args, comm_world=None): + from mpi4py import MPI + + self.args = args + self.comm_world = MPI.COMM_WORLD if comm_world is None else comm_world + self.n_procs_list = args["procs"] + if args['mode'] == 'run': + self.comms = get_comms(n_procs_list=self.n_procs_list, useGPU=args['useGPU'], comm_world=self.comm_world) + else: + self.comms = [self.comm_world, self.comm_world, self.comm_world] + self.ranks = [me.rank for me in self.comms] + + def get_description(self, *args, MPIsweeper=False, useGPU=False, **kwargs): + description = {} + description['problem_class'] = None + description['problem_params'] = {'useGPU': useGPU, 'comm': self.comms[2]} + description['sweeper_class'] = self.get_sweeper(useMPI=MPIsweeper) + description['sweeper_params'] = {'initial_guess': 'copy'} + description['level_params'] = {} + description['step_params'] = {} + description['convergence_controllers'] = {} + + if self.get_LogToFile(): + description['convergence_controllers'][LogStats] = {} + + if MPIsweeper: + description['sweeper_params']['comm'] = self.comms[1] + return description + + def get_controller_params(self, *args, logger_level=15, **kwargs): + from pySDC.implementations.hooks.log_work import LogWork + + controller_params = {} + controller_params['logger_level'] = logger_level if self.comm_world.rank == 0 else 40 + controller_params['hook_class'] = [LogWork] + logToFile = self.get_LogToFile() + if logToFile: + controller_params['hook_class'] += [logToFile] + controller_params['mssdc_jac'] = False + return controller_params + + def get_sweeper(self, useMPI): + if useMPI and self.sweeper_type == 'IMEX': + from pySDC.implementations.sweeper_classes.imex_1st_order_MPI import imex_1st_order_MPI as sweeper + elif not useMPI and self.sweeper_type == 'IMEX': + from pySDC.implementations.sweeper_classes.imex_1st_order import imex_1st_order as sweeper + elif useMPI and self.sweeper_type == 
'generic_implicit': + from pySDC.implementations.sweeper_classes.generic_implicit_MPI import generic_implicit_MPI as sweeper + elif not useMPI and self.sweeper_type == 'generic_implicit': + from pySDC.implementations.sweeper_classes.generic_implicit import generic_implicit as sweeper + else: + raise NotImplementedError(f'Don\'t know the sweeper for {self.sweeper_type=}') + + return sweeper + + def get_path(self, *args, ranks=None, **kwargs): + ranks = self.ranks if ranks is None else ranks + return f'{type(self).__name__}{self.args_to_str()}-{ranks[0]}-{ranks[2]}' + + def args_to_str(self, args=None): + args = self.args if args is None else args + name = '' + + name = f'{name}-res_{args["res"]}' + name = f'{name}-useGPU_{args["useGPU"]}' + name = f'{name}-procs_{args["procs"][0]}_{args["procs"][1]}_{args["procs"][2]}' + return name + + def plot(self, P, idx, num_procs_list): + raise NotImplementedError + + def get_initial_condition(self, P, *args, restart_idx=0, **kwargs): + if restart_idx > 0: + LogToFile = self.get_LogToFile() + file = LogToFile.load(restart_idx) + LogToFile.counter = restart_idx + u0 = P.u_init + if hasattr(P, 'spectral_space'): + if P.spectral_space: + u0[...] = P.transform(file['u']) + else: + u0[...] = file['u'] + else: + u0[...] = file['u'] + return u0, file['t'] + else: + return P.u_exact(t=0), 0 + + def get_previous_stats(self, P, restart_idx): + if restart_idx == 0: + return {} + else: + hook = self.get_LogToFile() + path = LogStats.get_stats_path(hook, counter_offset=0) + with open(path, 'rb') as file: + return pickle.load(file) + + def get_LogToFile(self): + return None + + +class LogStats(ConvergenceController): + + @staticmethod + def get_stats_path(hook, counter_offset=-1): + return f'{hook.path}/{hook.file_name}_{hook.format_index(hook.counter+counter_offset)}-stats.pickle' + + def setup(self, controller, params, *args, **kwargs): + params['control_order'] = 999 + if 'hook' not in params.keys(): + from pySDC.implementations.hooks.log_solution import LogToFileAfterXs + + params['hook'] = LogToFileAfterXs + + self.counter = params['hook'].counter + + return super().setup(controller, params, *args, **kwargs) + + def post_step_processing(self, controller, S, **kwargs): + hook = self.params.hook + + for _hook in controller.hooks: + _hook.post_step(S, 0) + + if self.counter < hook.counter: + path = self.get_stats_path(hook) + stats = controller.return_stats() + if hook.logging_condition(S.levels[0]): + with open(path, 'wb') as file: + pickle.dump(stats, file) + self.log(f'Stored stats in {path!r}', S) + self.counter = hook.counter diff --git a/pySDC/projects/GPU/etc/generate_jobscript.py b/pySDC/projects/GPU/etc/generate_jobscript.py new file mode 100644 index 0000000000..ff877bfa1e --- /dev/null +++ b/pySDC/projects/GPU/etc/generate_jobscript.py @@ -0,0 +1,66 @@ +PROJECT_PATH = '/p/project1/ccstma/baumann7/pySDC/pySDC/projects/GPU' +DEFAULT_SBATCH_OPTIONS = ['-A cstma', '--threads-per-core=1', f'--output={PROJECT_PATH}/etc/slurm-out/%j.txt'] +DEFAULT_SRUN_OPTIONS = ['--cpu-bind=sockets'] + + +def generate_directories(): + ''' + Initialize directories for jobscripts and slurm output + ''' + import os + + for name in ['jobscripts', 'slurm-out']: + path = f'{PROJECT_PATH}/etc/{name}' + os.makedirs(path, exist_ok=True) + + +def get_jobscript_text(sbatch_options, srun_options, command, cluster): + """ + Generate the text for a jobscript + + Args: + sbatch_options (list): List of options for sbatch + srun_options (list): Options for the srun command + command (str): 
python (!) command. Will be prefaced by `python /` + cluster (str): Name of the cluster you want to run on + + Returns: + str: Content of jobscript + """ + msg = '#!/usr/bin/bash\n\n' + for op in DEFAULT_SBATCH_OPTIONS + sbatch_options: + msg += f'#SBATCH {op}\n' + + msg += f'\nsource {PROJECT_PATH}/etc/venv_{cluster.lower()}/activate.sh\n' + + srun_cmd = 'srun' + for op in DEFAULT_SRUN_OPTIONS + srun_options: + srun_cmd += f' {op}' + + msg += f'\n{srun_cmd} python {PROJECT_PATH}/{command}' + return msg + + +def write_jobscript(sbatch_options, srun_options, command, cluster, submit=True): + """ + Generate a jobscript. + + Args: + sbatch_options (list): List of options for sbatch + srun_options (list): Options for the srun command + command (str): python (!) command. Will be prefaced by `python /` + cluster (str): Name of the cluster you want to run on + submit (bool): If yes, the script will be submitted to SLURM after it is written + """ + generate_directories() + + text = get_jobscript_text(sbatch_options, srun_options, command, cluster) + + path = f'{PROJECT_PATH}/etc/jobscripts/{command.replace(" ", "").replace("/", "_")}-{cluster}.sh' + with open(path, 'w') as file: + file.write(text) + + if submit: + import os + + os.system(f'sbatch {path}') diff --git a/pySDC/projects/GPU/etc/venv_booster/activate.sh b/pySDC/projects/GPU/etc/venv_booster/activate.sh new file mode 100644 index 0000000000..1ebb7bc0ad --- /dev/null +++ b/pySDC/projects/GPU/etc/venv_booster/activate.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# See https://stackoverflow.com/a/28336473 +SOURCE_PATH="${BASH_SOURCE[0]:-${(%):-%x}}" + +RELATIVE_PATH="$(dirname "$SOURCE_PATH")" +ABSOLUTE_PATH="$(realpath "${RELATIVE_PATH}")" + +[[ "$0" != "${SOURCE_PATH}" ]] && echo "The activation script must be sourced, otherwise the virtual environment will not work." || ( echo "Vars script must be sourced." && exit 1) ; + +source "${ABSOLUTE_PATH}"/config.sh +source "${ABSOLUTE_PATH}"/modules.sh + +export PYTHONPATH="$(echo "${ENV_DIR}"/lib/python*/site-packages):${PYTHONPATH}" + +source "${ENV_DIR}"/bin/activate diff --git a/pySDC/projects/GPU/etc/venv_booster/config.sh b/pySDC/projects/GPU/etc/venv_booster/config.sh new file mode 100644 index 0000000000..a2e933acab --- /dev/null +++ b/pySDC/projects/GPU/etc/venv_booster/config.sh @@ -0,0 +1,12 @@ +SOURCE_PATH="${BASH_SOURCE[0]:-${(%):-%x}}" + +## Check if this script is sourced +[[ "$0" != "${SOURCE_PATH}" ]] && echo "Setting vars" || ( echo "Vars script must be sourced." 
&& exit 1) ;
+## Determine location of this file
+RELATIVE_PATH="$(dirname "$SOURCE_PATH")"
+ABSOLUTE_PATH="$(realpath "${RELATIVE_PATH}")"
+####################################
+
+### User Configuration
+export ENV_NAME="GPU" # Default Name of the venv is the directory that contains this file
+export ENV_DIR="${ABSOLUTE_PATH}"/venv # Default location of this VENV is "./venv"
diff --git a/pySDC/projects/GPU/etc/venv_booster/create_kernel.sh b/pySDC/projects/GPU/etc/venv_booster/create_kernel.sh
new file mode 100755
index 0000000000..61be48923f
--- /dev/null
+++ b/pySDC/projects/GPU/etc/venv_booster/create_kernel.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+SOURCE_PATH="${BASH_SOURCE[0]:-${(%):-%x}}"
+
+RELATIVE_PATH="$(dirname "$SOURCE_PATH")"
+ABSOLUTE_PATH="$(realpath "${RELATIVE_PATH}")"
+source "${ABSOLUTE_PATH}"/config.sh
+
+KERNELFILE="${ENV_DIR}"/kernel.sh
+
+echo the name is "$ENV_NAME"
+
+echo "Setting up the kernel script in the following dir: " "${KERNELFILE}"
+echo "If you use multiprocessing, edit this file to remove the srun call from the kernel and run it again."
+echo '#!/bin/bash
+
+source "'"${ABSOLUTE_PATH}"'"/activate.sh
+
+hostname=$(hostname)
+
+if [[ $hostname == *"login"* || $hostname == *"jsfl"* ]]; then
+    exec python -m ipykernel "$@"
+else
+    srun python -m ipykernel "$@"
+fi
+' > "${KERNELFILE}"
+
+chmod a+x "${KERNELFILE}"
+
+mkdir -p ~/.local/share/jupyter/kernels/"${ENV_NAME}"
+echo '{
+ "argv": [
+   "'"${KERNELFILE}"'",
+   "-f",
+   "{connection_file}"
+ ],
+ "display_name": "'"${ENV_NAME}"'",
+ "language": "python"
+}' > ~/.local/share/jupyter/kernels/"${ENV_NAME}"/kernel.json
diff --git a/pySDC/projects/GPU/etc/venv_booster/create_python_for_vscode.sh b/pySDC/projects/GPU/etc/venv_booster/create_python_for_vscode.sh
new file mode 100755
index 0000000000..aa3e98584a
--- /dev/null
+++ b/pySDC/projects/GPU/etc/venv_booster/create_python_for_vscode.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+SOURCE_PATH="${BASH_SOURCE[0]:-${(%):-%x}}"
+
+RELATIVE_PATH="$(dirname "$SOURCE_PATH")"
+ABSOLUTE_PATH="$(realpath "${RELATIVE_PATH}")"
+source "${ABSOLUTE_PATH}"/config.sh
+PYTHONWRAPPER="${ABSOLUTE_PATH}"/python
+
+echo '#!/bin/bash
+module purge 2> /dev/null
+deactivate 2> /dev/null
+source '"'${ABSOLUTE_PATH}'"'/activate.sh
+python "$@"
+' > "${PYTHONWRAPPER}"
+
+chmod a+x "${PYTHONWRAPPER}"
diff --git a/pySDC/projects/GPU/etc/venv_booster/modules.sh b/pySDC/projects/GPU/etc/venv_booster/modules.sh
new file mode 100644
index 0000000000..bbadd908dd
--- /dev/null
+++ b/pySDC/projects/GPU/etc/venv_booster/modules.sh
@@ -0,0 +1,13 @@
+module --force purge
+module load Stages/2024
+module load GCC
+module load ParaStationMPI
+module load NCCL
+module load MPI-settings/CUDA
+module load UCX-settings/RC-CUDA
+module load Python
+module load CuPy
+module load FFTW
+module load mpi4py
+module load FFmpeg/.6.0
+module load SciPy-Stack
diff --git a/pySDC/projects/GPU/etc/venv_booster/readme.md b/pySDC/projects/GPU/etc/venv_booster/readme.md
new file mode 100644
index 0000000000..0e114403da
--- /dev/null
+++ b/pySDC/projects/GPU/etc/venv_booster/readme.md
@@ -0,0 +1,54 @@
+Supercomputing Environment Template using Python Virtual Environments
+=================
+
+# Idea
+This project contains a lightweight set of scripts to easily create Python working environments on
+typical supercomputer setups, including creating Jupyter Kernels.
+
+On supercomputers, a basic environment based on **Environment Modules** is typically provided. This setup is carefully
+curated and optimized, including compilers, MPI version etc. Extra Python packages can be installed
+with pip into user space. This, however, does not create a reproducible environment that can be used
+by other users as well.
+
+Conceptually, with Virtual Environments, it is easily possible to create project-based virtual environments.
+These scripts streamline the creation and usage of such environments and make it easy for users to share a setup
+and to put it under version control with the main code.
+
+Furthermore, in a typical compute setup of scientific projects, one or more packages may be in active
+development. In the context of these setups, it is intended to include them as submodules and integrate
+them into the workflow. This can e.g. mean that a compilation step is added in the setup step and
+setting appropriate environment variables is included in the activation step.
+
+# Details
+The setup is configured in the bash script `config.sh`. The user can define a name for the venv and the directory
+where the venv files are stored. These default to the name of the containing folder and the `./venv` folder
+next to the scripts. Please **edit** this file if you want a custom name and location for the venv.
+
+The modules on top of which the venv should be built are defined in `modules.sh`. Please **edit** the file
+to your needs.
+
+The file `requirements.txt` contains a list of packages to be installed during the setup process. Add required
+packages to this file to reproducibly add them to the venv.
+
+The script `setup.sh` creates the venv according to the config given in `config.sh`. Please **edit** this
+file to add a setup step for submodules (e.g. compilation of libraries). If only plain venvs are used, this file
+can remain unchanged. Note that the script *must* be run at least once after the above configurations to actually create the environment.
+
+The script `activate.sh` sets the environment variables such that the venv can be used. Please **edit** this file
+to add environment variables for submodules. Note that the script must be *sourced* to take effect. Example:
+```bash
+source /activate.sh
+```
+
+The script `create_kernel.sh` will create a kernel json file in the user's home directory, where Jupyter can find it,
+and a helper script in the virtual environment folder.
+
+
+
+# Intended Workflow
+1. Edit `config.sh` to change name and location of the venv if required.
+2. Edit `modules.sh` to change the modules loaded prior to the creation of the venv.
+3. Edit `requirements.txt` to change the packages to be installed during setup.
+4. Edit `setup.sh` and `activate.sh` to add extra steps for custom modules.
+5. Create the environment with `bash setup.sh`.
+6. Create a kernel with `bash create_kernel.sh`.
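As a usage sketch of the intended workflow above (not part of the diff; it assumes you start from the pySDC repository root and uses only the scripts added here):

```bash
cd pySDC/projects/GPU/etc/venv_booster
bash setup.sh          # create the venv from modules.sh and requirements.txt (run once)
source activate.sh     # must be sourced: loads the modules and activates the venv
bash create_kernel.sh  # optional: register a Jupyter kernel for this venv
```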
diff --git a/pySDC/projects/GPU/etc/venv_booster/requirements.txt b/pySDC/projects/GPU/etc/venv_booster/requirements.txt new file mode 100644 index 0000000000..bd2a63cd8a --- /dev/null +++ b/pySDC/projects/GPU/etc/venv_booster/requirements.txt @@ -0,0 +1,5 @@ +# Add here the pip packages you would like to install on this virtual environment / kernel +numpy +matplotlib>=3.0 +dill>=0.2.6 +scipy>=1.14 diff --git a/pySDC/projects/GPU/etc/venv_booster/setup.sh b/pySDC/projects/GPU/etc/venv_booster/setup.sh new file mode 100755 index 0000000000..1e0644490c --- /dev/null +++ b/pySDC/projects/GPU/etc/venv_booster/setup.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +SOURCE_PATH="${BASH_SOURCE[0]:-${(%):-%x}}" + +RELATIVE_PATH="$(dirname "$SOURCE_PATH")" +ABSOLUTE_PATH="$(realpath "${RELATIVE_PATH}")" + +source "${ABSOLUTE_PATH}"/config.sh +source "${ABSOLUTE_PATH}"/modules.sh + +python3 -m venv --prompt "$ENV_NAME" --system-site-packages "${ENV_DIR}" + +source "${ABSOLUTE_PATH}"/activate.sh + +# FFTW_LIBRARY_DIR="/p/software/jusuf/stages/2024/software/FFTW/3.3.10-GCC-12.3.0/lib64" python3 -m pip install -e /p/project/ccstma/baumann7/mpi4py-fft +# FFTW_LIBRARY_DIR="/p/software/juwels/stages/2024/software/FFTW/3.3.10-GCC-12.3.0/lib64/" python3 -m pip install -e /p/project/ccstma/baumann7/mpi4py-fft +FFTW_LIBRARY_DIR="/p/software/juwelsbooster/stages/2024/software/FFTW/3.3.10-GCC-12.3.0/lib64/" python3 -m pip install -e /p/project1/ccstma/baumann7/mpi4py-fft +python3 -m pip install -e /p/project1/ccstma/baumann7/qmat +python3 -m pip install -r "${ABSOLUTE_PATH}"/requirements.txt +python3 -m pip install -e /p/project1/ccstma/baumann7/pySDC/ + diff --git a/pySDC/projects/GPU/etc/venv_jusuf/activate.sh b/pySDC/projects/GPU/etc/venv_jusuf/activate.sh new file mode 100644 index 0000000000..1ebb7bc0ad --- /dev/null +++ b/pySDC/projects/GPU/etc/venv_jusuf/activate.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# See https://stackoverflow.com/a/28336473 +SOURCE_PATH="${BASH_SOURCE[0]:-${(%):-%x}}" + +RELATIVE_PATH="$(dirname "$SOURCE_PATH")" +ABSOLUTE_PATH="$(realpath "${RELATIVE_PATH}")" + +[[ "$0" != "${SOURCE_PATH}" ]] && echo "The activation script must be sourced, otherwise the virtual environment will not work." || ( echo "Vars script must be sourced." && exit 1) ; + +source "${ABSOLUTE_PATH}"/config.sh +source "${ABSOLUTE_PATH}"/modules.sh + +export PYTHONPATH="$(echo "${ENV_DIR}"/lib/python*/site-packages):${PYTHONPATH}" + +source "${ENV_DIR}"/bin/activate diff --git a/pySDC/projects/GPU/etc/venv_jusuf/config.sh b/pySDC/projects/GPU/etc/venv_jusuf/config.sh new file mode 100644 index 0000000000..a2e933acab --- /dev/null +++ b/pySDC/projects/GPU/etc/venv_jusuf/config.sh @@ -0,0 +1,12 @@ +SOURCE_PATH="${BASH_SOURCE[0]:-${(%):-%x}}" + +## Check if this script is sourced +[[ "$0" != "${SOURCE_PATH}" ]] && echo "Setting vars" || ( echo "Vars script must be sourced." 
&& exit 1) ;
+## Determine location of this file
+RELATIVE_PATH="$(dirname "$SOURCE_PATH")"
+ABSOLUTE_PATH="$(realpath "${RELATIVE_PATH}")"
+####################################
+
+### User Configuration
+export ENV_NAME="GPU" # Default Name of the venv is the directory that contains this file
+export ENV_DIR="${ABSOLUTE_PATH}"/venv # Default location of this VENV is "./venv"
diff --git a/pySDC/projects/GPU/etc/venv_jusuf/modules.sh b/pySDC/projects/GPU/etc/venv_jusuf/modules.sh
new file mode 100644
index 0000000000..85ec025c8e
--- /dev/null
+++ b/pySDC/projects/GPU/etc/venv_jusuf/modules.sh
@@ -0,0 +1,13 @@
+module --force purge
+module load Stages/2024
+module load GCC
+module load ParaStationMPI
+# module load NCCL
+# module load MPI-settings/CUDA
+# module load UCX-settings/RC-CUDA
+module load Python
+# module load CuPy
+module load FFTW
+module load mpi4py
+module load FFmpeg/.6.0
+module load SciPy-Stack
diff --git a/pySDC/projects/GPU/etc/venv_jusuf/readme.md b/pySDC/projects/GPU/etc/venv_jusuf/readme.md
new file mode 100644
index 0000000000..0e114403da
--- /dev/null
+++ b/pySDC/projects/GPU/etc/venv_jusuf/readme.md
@@ -0,0 +1,54 @@
+Supercomputing Environment Template using Python Virtual Environments
+=================
+
+# Idea
+This project contains a lightweight set of scripts to easily create Python working environments on
+typical supercomputer setups, including creating Jupyter Kernels.
+
+On supercomputers, a basic environment based on **Environment Modules** is typically provided. This setup is carefully
+curated and optimized, including compilers, MPI version etc. Extra Python packages can be installed
+with pip into user space. This, however, does not create a reproducible environment that can be used
+by other users as well.
+
+Conceptually, with Virtual Environments, it is easily possible to create project-based virtual environments.
+These scripts streamline the creation and usage of such environments and make it easy for users to share a setup
+and to put it under version control with the main code.
+
+Furthermore, in a typical compute setup of scientific projects, one or more packages may be in active
+development. In the context of these setups, it is intended to include them as submodules and integrate
+them into the workflow. This can e.g. mean that a compilation step is added in the setup step and
+setting appropriate environment variables is included in the activation step.
+
+# Details
+The setup is configured in the bash script `config.sh`. The user can define a name for the venv and the directory
+where the venv files are stored. These default to the name of the containing folder and the `./venv` folder
+next to the scripts. Please **edit** this file if you want a custom name and location for the venv.
+
+The modules on top of which the venv should be built are defined in `modules.sh`. Please **edit** the file
+to your needs.
+
+The file `requirements.txt` contains a list of packages to be installed during the setup process. Add required
+packages to this file to reproducibly add them to the venv.
+
+The script `setup.sh` creates the venv according to the config given in `config.sh`. Please **edit** this
+file to add a setup step for submodules (e.g. compilation of libraries). If only plain venvs are used, this file
+can remain unchanged. Note that the script *must* be run at least once after the above configurations to actually create the environment.
+
+The script `activate.sh` sets the environment variables such that the venv can be used. 
Please **edit** this file
+to add environment variables for submodules. Note that the script must be *sourced* to take effect. Example:
+```bash
+source /activate.sh
+```
+
+The script `create_kernel.sh` will create a kernel json file in the user's home directory, where Jupyter can find it,
+and a helper script in the virtual environment folder.
+
+
+
+# Intended Workflow
+1. Edit `config.sh` to change name and location of the venv if required.
+2. Edit `modules.sh` to change the modules loaded prior to the creation of the venv.
+3. Edit `requirements.txt` to change the packages to be installed during setup.
+4. Edit `setup.sh` and `activate.sh` to add extra steps for custom modules.
+5. Create the environment with `bash setup.sh`.
+6. Create a kernel with `bash create_kernel.sh`.
diff --git a/pySDC/projects/GPU/etc/venv_jusuf/requirements.txt b/pySDC/projects/GPU/etc/venv_jusuf/requirements.txt
new file mode 100644
index 0000000000..bd2a63cd8a
--- /dev/null
+++ b/pySDC/projects/GPU/etc/venv_jusuf/requirements.txt
@@ -0,0 +1,5 @@
+# Add here the pip packages you would like to install on this virtual environment / kernel
+numpy
+matplotlib>=3.0
+dill>=0.2.6
+scipy>=1.14
diff --git a/pySDC/projects/GPU/etc/venv_jusuf/setup.sh b/pySDC/projects/GPU/etc/venv_jusuf/setup.sh
new file mode 100755
index 0000000000..7a1378cdf0
--- /dev/null
+++ b/pySDC/projects/GPU/etc/venv_jusuf/setup.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+SOURCE_PATH="${BASH_SOURCE[0]:-${(%):-%x}}"
+
+RELATIVE_PATH="$(dirname "$SOURCE_PATH")"
+ABSOLUTE_PATH="$(realpath "${RELATIVE_PATH}")"
+
+source "${ABSOLUTE_PATH}"/config.sh
+source "${ABSOLUTE_PATH}"/modules.sh
+
+python3 -m venv --prompt "$ENV_NAME" --system-site-packages "${ENV_DIR}"
+
+source "${ABSOLUTE_PATH}"/activate.sh
+
+FFTW_LIBRARY_DIR="/p/software/jusuf/stages/2024/software/FFTW/3.3.10-GCC-12.3.0/lib64" python3 -m pip install -e /p/project/ccstma/baumann7/mpi4py-fft
+python3 -m pip install -e /p/project1/ccstma/baumann7/qmat
+python3 -m pip install -r "${ABSOLUTE_PATH}"/requirements.txt
+python3 -m pip install -e /p/project1/ccstma/baumann7/pySDC/
+
diff --git a/pySDC/projects/GPU/run_experiment.py b/pySDC/projects/GPU/run_experiment.py
new file mode 100644
index 0000000000..e095e07ad7
--- /dev/null
+++ b/pySDC/projects/GPU/run_experiment.py
@@ -0,0 +1,120 @@
+def parse_args():
+    import argparse
+
+    def cast_to_bool(me):
+        return False if me in ['False', '0', 0] else True
+
+    def str_to_procs(me):
+        procs = me.split('/')
+        assert len(procs) == 3
+        return [int(p) for p in procs]
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--useGPU', type=cast_to_bool, help='Toggle for GPUs', default=False)
+    parser.add_argument(
+        '--mode', type=str, help='Mode for this script', default=None, choices=['run', 'plot', 'render', 'video']
+    )
+    parser.add_argument('--config', type=str, help='Configuration to load', default=None)
+    parser.add_argument('--restart_idx', type=int, help='Restart from file by index', default=0)
+    parser.add_argument('--procs', type=str_to_procs, help='Processes in steps/sweeper/space', default='1/1/1')
+    parser.add_argument('--res', type=int, help='Space resolution along first axis', default=-1)
+    parser.add_argument(
+        '--logger_level', type=int, help='Logger level on the first rank in space and in the sweeper', default=15
+    )
+
+    return vars(parser.parse_args())
+
+
+def run_experiment(args, config, **kwargs):
+    import pickle
+
+    # from pySDC.implementations.controller_classes.controller_MPI import controller_MPI
+    from 
pySDC.implementations.controller_classes.controller_nonMPI import controller_nonMPI + from pySDC.helpers.stats_helper import filter_stats + + description = config.get_description( + useGPU=args['useGPU'], MPIsweeper=args['procs'][1] > 1, res=args['res'], **kwargs + ) + controller_params = config.get_controller_params(logger_level=args['logger_level']) + + # controller = controller_MPI(controller_params, description, config.comms[0]) + assert ( + config.comms[0].size == 1 + ), 'Have not figured out how to do MPI controller with GPUs yet because I need NCCL for that!' + controller = controller_nonMPI(num_procs=1, controller_params=controller_params, description=description) + prob = controller.MS[0].levels[0].prob + + u0, t0 = config.get_initial_condition(prob, restart_idx=args['restart_idx']) + + previous_stats = config.get_previous_stats(prob, restart_idx=args['restart_idx']) + + uend, stats = controller.run(u0=u0, t0=t0, Tend=config.Tend) + + combined_stats = {**previous_stats, **stats} + combined_stats = filter_stats(combined_stats, comm=config.comm_world) + + if config.comm_world.rank == config.comm_world.size - 1: + path = f'data/{config.get_path()}-stats-whole-run.pickle' + with open(path, 'wb') as file: + pickle.dump(combined_stats, file) + print(f'Stored stats in {path}', flush=True) + + return uend + + +def plot_experiment(args, config): # pragma: no cover + import gc + import matplotlib.pyplot as plt + + description = config.get_description() + + P = description['problem_class'](**description['problem_params']) + + comm = config.comm_world + + for idx in range(args['restart_idx'], 9999, comm.size): + try: + fig = config.plot(P=P, idx=idx + comm.rank, n_procs_list=args['procs']) + except FileNotFoundError: + break + + path = f'simulation_plots/{config.get_path(ranks=[0,0,0])}-{idx+comm.rank:06d}.png' + fig.savefig(path, dpi=300, bbox_inches='tight') + print(f'{comm.rank} Stored figure {path!r}', flush=True) + + if args['mode'] == 'render': + plt.pause(1e-9) + + plt.close(fig) + del fig + gc.collect() + + +def make_video(args, config): # pragma: no cover + comm = config.comm_world + if comm.rank > 0: + return None + + import subprocess + + path = f'simulation_plots/{config.get_path(ranks=[0,0,0])}-%06d.png' + path_target = f'videos/{args["config"]}.mp4' + + cmd = f'ffmpeg -i {path} -pix_fmt yuv420p -r 9 -s 2048:1536 {path_target}'.split() + + subprocess.run(cmd) + + +if __name__ == '__main__': + from pySDC.projects.GPU.configs.base_config import get_config + + args = parse_args() + + config = get_config(args) + + if args['mode'] == 'run': + run_experiment(args, config) + elif args['mode'] in ['plot', 'render']: # pragma: no cover + plot_experiment(args, config) + elif args['mode'] == 'video': # pragma: no cover + make_video(args, config) diff --git a/pySDC/projects/GPU/tests/test_configs.py b/pySDC/projects/GPU/tests/test_configs.py new file mode 100644 index 0000000000..f21759712d --- /dev/null +++ b/pySDC/projects/GPU/tests/test_configs.py @@ -0,0 +1,118 @@ +import pytest + + +@pytest.mark.mpi4py +def test_get_comms(launch=True): + if launch: + import subprocess + import os + + # Set python path once + my_env = os.environ.copy() + my_env['PYTHONPATH'] = '../../..:.' 
+ my_env['COVERAGE_PROCESS_START'] = 'pyproject.toml' + + cmd = f"mpirun -np 24 python {__file__} --test=get_comms".split() + + p = subprocess.Popen(cmd, env=my_env, cwd=".") + + p.wait() + assert p.returncode == 0, f'ERROR: did not get return code 0, got {p.returncode}' + else: + from pySDC.projects.GPU.configs.base_config import get_comms + import numpy as np + + n_procs_list = [2, 3, 4] + comms = get_comms(n_procs_list=n_procs_list) + assert np.allclose([me.size for me in comms], n_procs_list) + + +def create_directories(): + import os + + dir_path = __file__.split('/') + path = ''.join([me + '/' for me in dir_path[:-1]]) + 'data' + os.makedirs(path, exist_ok=True) + + +@pytest.mark.order(1) +def test_run_experiment(restart_idx=0): + from pySDC.projects.GPU.configs.base_config import Config + from pySDC.projects.GPU.run_experiment import run_experiment, parse_args + from pySDC.helpers.stats_helper import get_sorted + import pickle + import numpy as np + + create_directories() + + class VdPConfig(Config): + sweeper_type = 'generic_implicit' + Tend = 1 + + def get_description(self, *args, **kwargs): + from pySDC.implementations.problem_classes.Van_der_Pol_implicit import vanderpol + + desc = super().get_description(*args, **kwargs) + desc['problem_class'] = vanderpol + desc['problem_params'].pop('useGPU') + desc['problem_params'].pop('comm') + desc['sweeper_params']['num_nodes'] = 2 + desc['sweeper_params']['quad_type'] = 'RADAU-RIGHT' + desc['sweeper_params']['QI'] = 'LU' + desc['level_params']['dt'] = 0.1 + desc['step_params']['maxiter'] = 3 + return desc + + def get_LogToFile(self, ranks=None): + from pySDC.implementations.hooks.log_solution import LogToFileAfterXs as LogToFile + + LogToFile.path = './data/' + LogToFile.file_name = f'{self.get_path(ranks=ranks)}-solution' + LogToFile.time_increment = 2e-1 + + def logging_condition(L): + sweep = L.sweep + if hasattr(sweep, 'comm'): + if sweep.comm.rank == sweep.comm.size - 1: + return True + else: + return False + else: + return True + + LogToFile.logging_condition = logging_condition + return LogToFile + + args = {'procs': [1, 1, 1], 'useGPU': False, 'res': -1, 'logger_level': 15, 'restart_idx': restart_idx} + config = VdPConfig(args) + run_experiment(args, config) + + with open(f'data/{config.get_path()}-stats-whole-run.pickle', 'rb') as file: + stats = pickle.load(file) + + k_Newton = get_sorted(stats, type='work_newton') + assert len(k_Newton) == 10 + assert sum([me[1] for me in k_Newton]) == 91 + + +@pytest.mark.order(2) +def test_restart(): + test_run_experiment(3) + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--test', type=str) + + args = parser.parse_args() + + if args.test == 'get_comms': + test_get_comms(False) + elif args.test == 'run_experiment': + test_run_experiment() + elif args.test == 'restart': + test_restart() + else: + raise NotImplementedError
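As a closing illustration of how the `--procs` argument relates to the communicators built in `base_config.get_comms` (a hedged sketch mirroring the assertion in `test_get_comms` above; the file name is hypothetical and not part of the diff):

```python
# launch with e.g.: mpirun -np 24 python procs_demo.py
from pySDC.projects.GPU.configs.base_config import get_comms

# --procs=2/3/4 is parsed by run_experiment.parse_args into [2, 3, 4]:
# processes for steps (controller), sweeper nodes, and space, in that order
comms = get_comms(n_procs_list=[2, 3, 4])
assert [comm.size for comm in comms] == [2, 3, 4]
```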