Add LayerNorm
parsiad committed Aug 21, 2024
1 parent b0c1c58 commit 468e5ab
Showing 3 changed files with 76 additions and 1 deletion.
15 changes: 14 additions & 1 deletion src/micrograd_pp/__init__.py
@@ -1,6 +1,18 @@
from ._expr import Constant, Expr, Parameter, is_grad_enabled, maximum, no_grad, relu
from ._func import cat, cross_entropy_loss, softmax
from ._nn import BatchNorm1d, Dropout, Embedding, Linear, Module, MultiheadAttention, ReLU, Sequential, eval, is_eval
from ._nn import (
    BatchNorm1d,
    Dropout,
    Embedding,
    LayerNorm,
    Linear,
    Module,
    MultiheadAttention,
    ReLU,
    Sequential,
    eval,
    is_eval,
)
from ._opt import SGD

from . import datasets
@@ -11,6 +23,7 @@
"Dropout",
"Embedding",
"Expr",
"LayerNorm",
"Linear",
"Module",
"MultiheadAttention",
53 changes: 53 additions & 0 deletions src/micrograd_pp/_nn.py
@@ -161,6 +161,59 @@ def __repr__(self) -> str:
return f"Embedding({self._a.shape[0]}, {self._a.shape[1]})"


class LayerNorm:
    """Layer normalization.

    Parameters
    ----------
    normalized_shape
        Shape of the last D dimensions to normalize over, where D is the length of normalized_shape (an integer is
        promoted to a singleton tuple)
    bias
        Whether or not to learn a bias (requires elementwise_affine to be True)
    dtype
        Data type for the learnable scale and shift parameters
    elementwise_affine
        Whether to use a learnable scale parameter
    eps
        When standardizing, this quantity is added to the denominator for numerical stability
    """

    def __init__(
        self,
        normalized_shape: int | tuple[int, ...],
        bias: bool = True,
        dtype: type = np.float32,
        elementwise_affine: bool = True,
        eps: float = 1e-5,
    ) -> None:
        if not elementwise_affine and bias:
            msg = f"{LayerNorm.__name__} does not support learnable bias without a learnable scale"
            raise ValueError(msg)
        self._eps = eps
        self._scale = Parameter(np.ones(normalized_shape, dtype=dtype)) if elementwise_affine else None
        self._shift = Parameter(np.zeros(normalized_shape, dtype=dtype)) if bias else None
        self._normalized_shape = (normalized_shape,) if isinstance(normalized_shape, int) else normalized_shape

    def __call__(self, x: Expr) -> Expr:
        dims = tuple(range(-len(self._normalized_shape), 0))
        mean = x.mean(dims, keepdim=True)
        var = x.var(dims, keepdim=True)
        retval = (x - mean) / ((var + self._eps) ** 0.5)
        if self._scale is not None:
            retval = self._scale * retval
        if self._shift is not None:
            retval = retval + self._shift
        return retval

    def __repr__(self) -> str:
        return (
            f"LayerNorm({self._normalized_shape}, eps={self._eps}, "
            f"elementwise_affine={self._scale is not None}, "
            f"bias={self._shift is not None})"
        )


class Linear:
"""Linear layer.
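For context, a minimal usage sketch of the new module (not part of this commit), assuming micrograd_pp is installed and imported as mpp:

import numpy as np

import micrograd_pp as mpp

# Normalize each row of a (batch, features) array over its trailing dimension:
# y = (x - mean) / sqrt(var + eps), then optionally scale and shift.
x = mpp.Constant(np.random.randn(32, 64))
ln = mpp.LayerNorm(64)
y = ln(x)

print(ln)  # LayerNorm((64,), eps=1e-05, elementwise_affine=True, bias=True)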
9 changes: 9 additions & 0 deletions tests/test_nn.py
@@ -78,6 +78,15 @@ def test_embedding() -> None:
    assert y.shape == x.shape + (NUM_FEATURES,)


def test_layer_norm() -> None:
    normalized_shape = (4, 3)
    x = mpp.Constant(np.random.randn(BATCH_SZ, *normalized_shape))
    ln = mpp.LayerNorm(normalized_shape, eps=0.0)
    y = ln(x)
    np.testing.assert_allclose(y.mean((-1, -2)).value, 0.0, atol=1e-12)
    np.testing.assert_allclose(y.var((-1, -2)).value, 1.0)


@pytest.mark.parametrize("is_causal", (False, True))
def test_multihead_attention(is_causal: bool) -> None: # Test against PyTorch implementation
    torch_attn_mask = None
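As an aside, a sketch of how the new layer could additionally be cross-checked against PyTorch, in the style of the multihead attention test above; this hypothetical test is not part of the commit and assumes torch is already imported in this module and that Expr.var computes the biased (population) variance used by torch.nn.functional.layer_norm:

def test_layer_norm_against_torch() -> None:  # hypothetical test, not part of this commit
    normalized_shape = (4, 3)
    np_x = np.random.randn(BATCH_SZ, *normalized_shape)
    ln = mpp.LayerNorm(normalized_shape)  # default eps=1e-5 matches torch's default
    actual = ln(mpp.Constant(np_x)).value
    # weight/bias default to None (no affine transform); mpp's scale/shift start at 1 and 0
    expected = torch.nn.functional.layer_norm(torch.from_numpy(np_x), normalized_shape).numpy()
    np.testing.assert_allclose(actual, expected, atol=1e-6)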
