diff --git a/tests/modules/layers/test_activation.py b/tests/modules/layers/test_activation.py
index 4e8a7010..8a0b42aa 100644
--- a/tests/modules/layers/test_activation.py
+++ b/tests/modules/layers/test_activation.py
@@ -7,7 +7,7 @@
 import torch
 
 from tests.test_utils import assert_expected
-from torchmultimodal.modules.layers.activation import SiLU
+from torchmultimodal.modules.layers.activation import GEGLU, SiLU
 
 
 def test_sigmoid_linear_unit():
@@ -15,3 +15,10 @@ def test_sigmoid_linear_unit():
     actual = silu(torch.ones(3))
     expected = torch.tensor([0.8458, 0.8458, 0.8458])
     assert_expected(actual, expected)
+
+
+def test_geglu():
+    geglu = GEGLU()
+    actual = geglu(torch.ones(10))
+    expected = torch.tensor([0.8413, 0.8413, 0.8413, 0.8413, 0.8413])
+    assert_expected(actual, expected, atol=1e-4, rtol=1e-5)
diff --git a/tests/utils/test_common.py b/tests/utils/test_common.py
index fda1918f..c5340947 100644
--- a/tests/utils/test_common.py
+++ b/tests/utils/test_common.py
@@ -13,6 +13,7 @@ from torch.utils.checkpoint import checkpoint
 
 from torchmultimodal.utils.common import (
     checkpoint_wrapper,
+    init_module_parameters_to_zero,
     shift_dim,
     tensor_slice,
     to_tuple_tuple,
@@ -30,6 +31,13 @@ def test_shift_dim():
     assert_expected(actual, expected)
 
 
+def test_init_module_parameters_to_zero():
+    module = nn.Conv2d(10, 10, kernel_size=1)
+    init_module_parameters_to_zero(module)
+    for p in module.parameters():
+        assert_expected(p, torch.zeros_like(p))
+
+
 class TestTensorSlice:
     @pytest.fixture(scope="class")
     def test_input(self):
diff --git a/torchmultimodal/modules/layers/activation.py b/torchmultimodal/modules/layers/activation.py
index e3f4c374..3ca3615b 100644
--- a/torchmultimodal/modules/layers/activation.py
+++ b/torchmultimodal/modules/layers/activation.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
+import torch.nn.functional as F
 from torch import nn, Tensor
 
 
@@ -22,3 +23,22 @@ class SiLU(nn.Module):
 
     def forward(self, x: Tensor) -> Tensor:
         return torch.sigmoid(1.702 * x) * x
+
+
+class GEGLU(nn.Module):
+    r"""Gated Linear Unit with GELU activation function
+
+    .. math:: \text{GEGLU}(a,b) = a * \text{GELU}(b)
+
+    where :math:`a` is the first half of the input tensor and :math:`b` is
+    the second half, as described in the paper:
+    `"GLU Variants Improve Transformer"`.
+    """
+
+    def __init__(self, dim: int = -1):
+        super().__init__()
+        self.split_dim = dim
+
+    def forward(self, x: Tensor) -> Tensor:
+        x, gate = x.chunk(2, dim=self.split_dim)
+        return x * F.gelu(gate)
diff --git a/torchmultimodal/utils/common.py b/torchmultimodal/utils/common.py
index 7cff4fb6..5d4fc4c3 100644
--- a/torchmultimodal/utils/common.py
+++ b/torchmultimodal/utils/common.py
@@ -187,3 +187,12 @@ def custom_forward(*inputs: Any) -> Callable:
 
 def get_clones(module: nn.Module, n: int) -> nn.ModuleList:
     return nn.ModuleList([deepcopy(module) for i in range(n)])
+
+
+def init_module_parameters_to_zero(module: nn.Module) -> None:
+    """
+    Sets the parameters of a module to zero. This is a commonly used trick
+    from Fixup initialization to stabilize training of residual networks.
+    """
+    for p in module.parameters():
+        nn.init.zeros_(p)
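Usage sketch (not part of this diff): the snippet below shows how the two additions could be combined in a gated feed-forward block. It assumes the patch above has been applied so `GEGLU` and `init_module_parameters_to_zero` are importable from the paths touched here; the `GEGLUFeedForward` class, its parameter names, and the chosen sizes are hypothetical and only for illustration.

```python
import torch
from torch import nn

# Assumes this diff is applied/installed; these are the symbols added above.
from torchmultimodal.modules.layers.activation import GEGLU
from torchmultimodal.utils.common import init_module_parameters_to_zero


class GEGLUFeedForward(nn.Module):
    """Hypothetical feed-forward block: the input projection doubles the
    hidden size so GEGLU can split it into a value half and a gate half,
    and the output projection is zero-initialized (Fixup-style) so the
    block starts as a no-op when used inside a residual sum."""

    def __init__(self, dim: int, hidden_dim: int) -> None:
        super().__init__()
        self.proj_in = nn.Linear(dim, 2 * hidden_dim)  # 2x width for the GEGLU split
        self.act = GEGLU()  # splits along the last dim by default
        self.proj_out = nn.Linear(hidden_dim, dim)
        init_module_parameters_to_zero(self.proj_out)  # zero weight and bias

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.proj_out(self.act(self.proj_in(x)))


ff = GEGLUFeedForward(dim=16, hidden_dim=32)
out = ff(torch.randn(4, 16))
print(out.shape)               # torch.Size([4, 16])
print(out.abs().sum().item())  # 0.0 -- output projection starts at zero
```

Because GEGLU halves the tensor along `split_dim`, the layer feeding it must produce twice the desired hidden width, which is why the test above passes a length-10 input and expects a length-5 output.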