Add LayerNorm
parsiad committed Aug 21, 2024
1 parent b0c1c58 commit 468e5ab
Showing 3 changed files with 76 additions and 1 deletion.
15 changes: 14 additions & 1 deletion src/micrograd_pp/__init__.py
@@ -1,6 +1,18 @@
from ._expr import Constant, Expr, Parameter, is_grad_enabled, maximum, no_grad, relu
from ._func import cat, cross_entropy_loss, softmax
from ._nn import BatchNorm1d, Dropout, Embedding, Linear, Module, MultiheadAttention, ReLU, Sequential, eval, is_eval
from ._nn import (
    BatchNorm1d,
    Dropout,
    Embedding,
    LayerNorm,
    Linear,
    Module,
    MultiheadAttention,
    ReLU,
    Sequential,
    eval,
    is_eval,
)
from ._opt import SGD

from . import datasets
@@ -11,6 +23,7 @@
"Dropout",
"Embedding",
"Expr",
"LayerNorm",
"Linear",
"Module",
"MultiheadAttention",
53 changes: 53 additions & 0 deletions src/micrograd_pp/_nn.py
@@ -161,6 +161,59 @@ def __repr__(self) -> str:
return f"Embedding({self._a.shape[0]}, {self._a.shape[1]})"


class LayerNorm:
    """Layer normalization.

    Parameters
    ----------
    normalized_shape
        Shape of the last D dimensions to normalize over, where D is the length of normalized_shape (an integer is
        promoted to a singleton tuple)
    bias
        Whether or not to learn a bias (requires elementwise_affine to be True)
    dtype
        Data type for the learnable scale and shift parameters
    elementwise_affine
        Whether to use a learnable scale parameter
    eps
        When standardizing, this quantity is added to the denominator for numerical stability
    """

    def __init__(
        self,
        normalized_shape: int | tuple[int, ...],
        bias: bool = True,
        dtype: type = np.float32,
        elementwise_affine: bool = True,
        eps: float = 1e-5,
    ) -> None:
        if not elementwise_affine and bias:
            msg = f"{LayerNorm.__name__} does not support learnable bias without a learnable scale"
            raise ValueError(msg)
        self._eps = eps
        self._scale = Parameter(np.ones(normalized_shape, dtype=dtype)) if elementwise_affine else None
        self._shift = Parameter(np.zeros(normalized_shape, dtype=dtype)) if bias else None
        self._normalized_shape = (normalized_shape,) if isinstance(normalized_shape, int) else normalized_shape

    def __call__(self, x: Expr) -> Expr:
        dims = tuple(range(-len(self._normalized_shape), 0))
        mean = x.mean(dims, keepdim=True)
        var = x.var(dims, keepdim=True)
        retval = (x - mean) / ((var + self._eps) ** 0.5)
        if self._scale is not None:
            retval = self._scale * retval
        if self._shift is not None:
            retval = retval + self._shift
        return retval

    def __repr__(self) -> str:
        return (
            f"LayerNorm({self._normalized_shape}, eps={self._eps}, "
            f"elementwise_affine={self._scale is not None}, "
            f"bias={self._shift is not None})"
        )


class Linear:
"""Linear layer.
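For context, a minimal usage sketch of the new module (not part of this commit), assuming micrograd_pp is installed and imported as mpp:

import numpy as np

import micrograd_pp as mpp

# Normalize each row of a (batch, features) array over its trailing dimension:
# y = (x - mean) / sqrt(var + eps), then optionally scale and shift.
x = mpp.Constant(np.random.randn(32, 64))
ln = mpp.LayerNorm(64)
y = ln(x)

print(ln)  # LayerNorm((64,), eps=1e-05, elementwise_affine=True, bias=True)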
9 changes: 9 additions & 0 deletions tests/test_nn.py
@@ -78,6 +78,15 @@ def test_embedding() -> None:
    assert y.shape == x.shape + (NUM_FEATURES,)


def test_layer_norm() -> None:
    normalized_shape = (4, 3)
    x = mpp.Constant(np.random.randn(BATCH_SZ, *normalized_shape))
    ln = mpp.LayerNorm(normalized_shape, eps=0.0)
    y = ln(x)
    np.testing.assert_allclose(y.mean((-1, -2)).value, 0.0, atol=1e-12)
    np.testing.assert_allclose(y.var((-1, -2)).value, 1.0)


@pytest.mark.parametrize("is_causal", (False, True))
def test_multihead_attention(is_causal: bool) -> None: # Test against PyTorch implementation
    torch_attn_mask = None
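As an aside, a sketch of how the new layer could additionally be cross-checked against PyTorch, in the style of the multihead attention test above; this hypothetical test is not part of the commit and assumes torch is already imported in this module and that Expr.var computes the biased (population) variance used by torch.nn.functional.layer_norm:

def test_layer_norm_against_torch() -> None:  # hypothetical test, not part of this commit
    normalized_shape = (4, 3)
    np_x = np.random.randn(BATCH_SZ, *normalized_shape)
    ln = mpp.LayerNorm(normalized_shape)  # default eps=1e-5 matches torch's default
    actual = ln(mpp.Constant(np_x)).value
    # weight/bias default to None (no affine transform); mpp's scale/shift start at 1 and 0
    expected = torch.nn.functional.layer_norm(torch.from_numpy(np_x), normalized_shape).numpy()
    np.testing.assert_allclose(actual, expected, atol=1e-6)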
