
Commit

Reorganize and clean MNIST + CIFAR10 examples. (#113)
balancap committed Jun 17, 2024
1 parent b3ed77c commit 1b41e27
Showing 17 changed files with 194 additions and 510 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -68,9 +68,9 @@ As presented in the code above, the model state is represented as a JAX PyTree o

A full collection of examples is available:
* [Scalify quickstart notebook](./examples/scalify-quickstart.ipynb): basics of `ScaledArray` and `scalify` transform;
-* [MNIST FP16 training example](./experiments/mnist/mnist_classifier_from_scratch.py): adapting JAX MNIST example to `scalify`;
-* [MNIST FP8 training example](./experiments/mnist/mnist_classifier_from_scratch.py): easy FP8 support in `scalify`;
-* [CIFAR10 training](./experiments/mnist/cifar_training.py): `scalify` CIFAR10 training, with Optax optimizer integration;
+* [MNIST FP16 training example](./examples/mnist/mnist_classifier_from_scratch.py): adapting JAX MNIST example to `scalify`;
+* [MNIST FP8 training example](./examples/mnist/mnist_classifier_from_scratch_fp8.py): easy FP8 support in `scalify`;
+* [MNIST Flax example](./examples/mnist/flax): `scalify` Flax training, with Optax optimizer integration;
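For orientation, the example scripts touched by this commit share one training-step pattern; below is a rough, self-contained sketch of it (toy layer shapes and loss, and `scalify` assumed to be exported as `jsa.scalify` — the exact imports are not shown in this diff):

```python
import jax
import jax.numpy as jnp
import numpy as np

import jax_scalify as jsa

step_size = 0.1
# Toy single-layer model and batch, placeholders only.
params = [(jnp.ones((4, 2), np.float32), jnp.zeros((2,), np.float32))]
batch = (jnp.ones((8, 4), np.float32), jnp.zeros((8, 2), np.float32))

# Wrap parameters into `ScaledArray` and cast to the low-precision training dtype.
params = jsa.as_scaled_array(params, scale=np.float32(1.0))
params = jax.tree_util.tree_map(lambda v: v.astype(np.float16), params, is_leaf=jsa.core.is_scaled_leaf)


@jax.jit
@jsa.scalify
def update(params, batch):
    inputs, targets = batch

    def loss(params):
        ((w, b),) = params
        preds = jnp.dot(inputs, w) + b
        return jnp.mean((preds - targets) ** 2)

    grads = jax.grad(loss)(params)
    # Plain SGD step, keeping the `ScaledArray` structure of the parameters.
    return [(w - step_size * dw, b - step_size * db) for (w, b), (dw, db) in zip(params, grads)]


# Scale the batch as well, then run the update under an explicit scalify config.
batch = jsa.as_scaled_array(batch, scale=np.float32(1.0))
with jsa.ScalifyConfig(rounding_mode=jsa.Pow2RoundMode.DOWN, scale_dtype=np.float32):
    params = update(params, batch)
```

The MNIST and CIFAR10 scripts below follow the same structure, with an MLP and cross-entropy loss in place of the toy model.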


## Installation
@@ -11,16 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified by Graphcore Ltd 2024.

"""A basic MNIST example using Numpy and JAX.
"""A basic CIFAR10 example using Numpy and JAX.
The primary aim here is simplicity and minimal dependencies.
CIFAR10 training using MLP network + raw SGD optimizer.
"""


import time

import datasets
import dataset_cifar10
import jax
import jax.numpy as jnp
import numpy as np
@@ -100,7 +99,7 @@ def accuracy(params, batch):
training_dtype = np.float16
scale_dtype = np.float32

train_images, train_labels, test_images, test_labels = datasets.cifar()
train_images, train_labels, test_images, test_labels = dataset_cifar10.cifar()
num_train = train_images.shape[0]
num_complete_batches, leftover = divmod(num_train, batch_size)
num_batches = num_complete_batches + bool(leftover)
@@ -118,7 +117,7 @@ def data_stream():
# Transform parameters to `ScaledArray` and proper dtype.
if use_scalify:
params = jsa.as_scaled_array(params, scale=scale_dtype(param_scale))
params = jax.tree_map(lambda v: v.astype(training_dtype), params, is_leaf=jsa.core.is_scaled_leaf)
params = jax.tree_util.tree_map(lambda v: v.astype(training_dtype), params, is_leaf=jsa.core.is_scaled_leaf)

@jit
@scalify
@@ -133,7 +132,7 @@ def update(params, batch):
# Scaled micro-batch + training dtype cast.
if use_scalify:
batch = jsa.as_scaled_array(batch, scale=scale_dtype(param_scale))
batch = jax.tree_map(lambda v: v.astype(training_dtype), batch, is_leaf=jsa.core.is_scaled_leaf)
batch = jax.tree_util.tree_map(lambda v: v.astype(training_dtype), batch, is_leaf=jsa.core.is_scaled_leaf)

with jsa.ScalifyConfig(rounding_mode=jsa.Pow2RoundMode.DOWN, scale_dtype=scale_dtype):
params = update(params, batch)
@@ -11,16 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified by Graphcore Ltd 2024.

"""A basic MNIST example using Numpy and JAX.
The primary aim here is simplicity and minimal dependencies.
"""A basic CIFAR10 example using Numpy and JAX.
"""


import time

import datasets
import dataset_cifar10
import jax
import jax.numpy as jnp
import numpy as np
@@ -65,10 +64,6 @@ def predict(params, inputs):

final_w, final_b = params[-1]
logits = jnp.dot(activations, final_w) + final_b

# jsa.ops.debug_callback(partial(print_mean_std, "Logits"), logits)
# (logits,) = jsa.ops.debug_callback_grad(partial(print_mean_std, "LogitsGrad"), logits)

# Dynamic rescaling of the gradient, as logits gradient not properly scaled.
logits = jsa.ops.dynamic_rescale_l2_grad(logits)
output = logits - logsumexp(logits, axis=1, keepdims=True)
@@ -102,7 +97,7 @@ def accuracy(params, batch):
batch_size = 128
scale_dtype = np.float32

train_images, train_labels, test_images, test_labels = datasets.cifar()
train_images, train_labels, test_images, test_labels = dataset_cifar10.cifar()
num_train = train_images.shape[0]
num_complete_batches, leftover = divmod(num_train, batch_size)
num_batches = num_complete_batches + bool(leftover)
154 changes: 154 additions & 0 deletions examples/cifar10/dataset_cifar10.py
@@ -0,0 +1,154 @@
# Copyright 2018 The JAX Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified by Graphcore Ltd 2024.

"""Datasets used in examples."""


import array
import gzip
import os
import pickle
import struct
import tarfile
import urllib.request
from os import path

import numpy as np

_DATA = "/tmp/jax_example_data/"


def _download(url, filename):
    """Download a url to a file in the JAX data temp directory."""
    if not path.exists(_DATA):
        os.makedirs(_DATA)
    out_file = path.join(_DATA, filename)
    if not path.isfile(out_file):
        urllib.request.urlretrieve(url, out_file)
        print(f"downloaded {url} to {_DATA}")


def _partial_flatten(x):
    """Flatten all but the first dimension of an ndarray."""
    return np.reshape(x, (x.shape[0], -1))


def _one_hot(x, k, dtype=np.float32):
    """Create a one-hot encoding of x of size k."""
    return np.array(x[:, None] == np.arange(k), dtype)


def _unzip(file):
    file = tarfile.open(file)
    file.extractall(_DATA)
    file.close()
    return


def _unpickle(file):
    with open(file, "rb") as fo:
        dict = pickle.load(fo, encoding="bytes")
    return dict


def mnist_raw():
    """Download and parse the raw MNIST dataset."""
    # CVDF mirror of http://yann.lecun.com/exdb/mnist/
    base_url = "https://storage.googleapis.com/cvdf-datasets/mnist/"

    def parse_labels(filename):
        with gzip.open(filename, "rb") as fh:
            _ = struct.unpack(">II", fh.read(8))
            return np.array(array.array("B", fh.read()), dtype=np.uint8)

    def parse_images(filename):
        with gzip.open(filename, "rb") as fh:
            _, num_data, rows, cols = struct.unpack(">IIII", fh.read(16))
            return np.array(array.array("B", fh.read()), dtype=np.uint8).reshape(num_data, rows, cols)

    for filename in [
        "train-images-idx3-ubyte.gz",
        "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz",
        "t10k-labels-idx1-ubyte.gz",
    ]:
        _download(base_url + filename, filename)

    train_images = parse_images(path.join(_DATA, "train-images-idx3-ubyte.gz"))
    train_labels = parse_labels(path.join(_DATA, "train-labels-idx1-ubyte.gz"))
    test_images = parse_images(path.join(_DATA, "t10k-images-idx3-ubyte.gz"))
    test_labels = parse_labels(path.join(_DATA, "t10k-labels-idx1-ubyte.gz"))

    return train_images, train_labels, test_images, test_labels


def mnist(permute_train=False):
    """Download, parse and process MNIST data to unit scale and one-hot labels."""
    train_images, train_labels, test_images, test_labels = mnist_raw()

    train_images = _partial_flatten(train_images) / np.float32(255.0)
    test_images = _partial_flatten(test_images) / np.float32(255.0)
    train_labels = _one_hot(train_labels, 10)
    test_labels = _one_hot(test_labels, 10)

    if permute_train:
        perm = np.random.RandomState(0).permutation(train_images.shape[0])
        train_images = train_images[perm]
        train_labels = train_labels[perm]

    return train_images, train_labels, test_images, test_labels


def cifar_raw():
    """Download, unzip and parse the raw cifar dataset."""

    filename = "cifar-10-python.tar.gz"
    url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
    _download(url, filename)
    _unzip(path.join(_DATA, filename))

    data_batches = ["data_batch_1", "data_batch_2", "data_batch_3", "data_batch_4", "data_batch_5"]
    data = []
    labels = []
    for batch in data_batches:
        tmp_dict = _unpickle(path.join(_DATA, "cifar-10-batches-py", batch))
        data.append(tmp_dict[b"data"])
        labels.append(tmp_dict[b"labels"])
    train_images = np.concatenate(data)
    train_labels = np.concatenate(labels)

    test_dict = _unpickle(path.join(_DATA, "cifar-10-batches-py", "test_batch"))
    test_images = test_dict[b"data"]
    test_labels = np.array(test_dict[b"labels"])

    return train_images, train_labels, test_images, test_labels


def cifar(permute_train=False):
    """Download, parse and process cifar data to unit scale and one-hot labels."""

    train_images, train_labels, test_images, test_labels = cifar_raw()

    train_images = train_images / np.float32(255.0)
    test_images = test_images / np.float32(255.0)
    train_labels = _one_hot(train_labels, 10)
    test_labels = _one_hot(test_labels, 10)

    if permute_train:
        perm = np.random.RandomState(0).permutation(train_images.shape[0])
        train_images = train_images[perm]
        train_labels = train_labels[perm]

    return train_images, train_labels, test_images, test_labels
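A quick usage sketch for this new helper (the shapes below are the standard CIFAR-10 splits, stated as an assumption rather than taken from the diff):

```python
import dataset_cifar10

# Downloads to /tmp/jax_example_data/ on first call, then loads NumPy arrays.
train_images, train_labels, test_images, test_labels = dataset_cifar10.cifar()

# Images are flat pixel rows rescaled to [0, 1]; labels are one-hot float32.
print(train_images.shape, train_labels.shape)  # (50000, 3072) (50000, 10)
print(test_images.shape, test_labels.shape)    # (10000, 3072) (10000, 10)
```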
2 changes: 2 additions & 0 deletions experiments/mnist/datasets.py → examples/mnist/datasets.py
@@ -11,6 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified by Graphcore Ltd 2024.


"""Datasets used in examples."""

File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -19,7 +19,8 @@
"""

import jax
import tensorflow as tf

# import tensorflow as tf
import train
from absl import app, flags, logging
from clu import platform
@@ -42,7 +43,7 @@ def main(argv):

# Hide any GPUs from TensorFlow. Otherwise TF might reserve memory and make
# it unavailable to JAX.
tf.config.experimental.set_visible_devices([], "GPU")
# tf.config.experimental.set_visible_devices([], "GPU")

logging.info("JAX process: %d / %d", jax.process_index(), jax.process_count())
logging.info("JAX local devices: %r", jax.local_devices())
File renamed without changes.
@@ -28,8 +28,9 @@
import optax
import tensorflow_datasets as tfds
from absl import logging
from flax import linen as nn
from flax.metrics import tensorboard
from flax import linen as nn # type:ignore

# from flax.metrics import tensorboard
from flax.training import train_state

import jax_scalify as jsa
@@ -143,8 +144,8 @@ def train_and_evaluate(config: ml_collections.ConfigDict, workdir: str) -> train
train_ds, test_ds = get_datasets()
rng = jax.random.key(0)

summary_writer = tensorboard.SummaryWriter(workdir)
summary_writer.hparams(dict(config))
# summary_writer = tensorboard.SummaryWriter(workdir)
# summary_writer.hparams(dict(config))

rng, init_rng = jax.random.split(rng)
init_rng = jax.random.PRNGKey(1)
@@ -173,10 +174,10 @@ def train_and_evaluate(config: ml_collections.ConfigDict, workdir: str) -> train
)
)

summary_writer.scalar("train_loss", train_loss, epoch)
summary_writer.scalar("train_accuracy", train_accuracy, epoch)
summary_writer.scalar("test_loss", test_loss, epoch)
summary_writer.scalar("test_accuracy", test_accuracy, epoch)
# summary_writer.scalar("train_loss", train_loss, epoch)
# summary_writer.scalar("train_accuracy", train_accuracy, epoch)
# summary_writer.scalar("test_loss", test_loss, epoch)
# summary_writer.scalar("test_accuracy", test_accuracy, epoch)

summary_writer.flush()
# summary_writer.flush()
return state
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified by Graphcore Ltd 2024.

"""A basic MNIST example using Numpy and JAX.
@@ -78,9 +79,9 @@ def accuracy(params, batch):


if __name__ == "__main__":
layer_sizes = [784, 1024, 1024, 10]
param_scale = 1.0
step_size = 0.001
layer_sizes = [784, 512, 512, 10]
param_scale = 0.1
step_size = 0.1
num_epochs = 10
batch_size = 128

@@ -125,7 +126,7 @@ def update(params, batch):

epoch_time = time.time() - start_time

# Evaluation in float32, for consistency.
# Evaluation in normal/unscaled float32, for consistency.
raw_params = jsa.asarray(params, dtype=np.float32)
train_acc = accuracy(raw_params, (train_images, train_labels))
test_acc = accuracy(raw_params, (test_images, test_labels))
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified by Graphcore Ltd 2024.

"""A basic MNIST example using Numpy and JAX.
@@ -34,6 +35,7 @@


def print_mean_std(name, v):
"""Debugging method/tool for JAX Scalify."""
data, scale = jsa.lax.get_data_scale(v)
# Always use np.float32, to avoid floating errors in descaling + stats.
data = jsa.asarray(data, dtype=np.float32)
@@ -105,9 +107,9 @@ def accuracy(params, batch):


if __name__ == "__main__":
layer_sizes = [784, 1024, 1024, 10]
param_scale = 1.0
step_size = 0.001
layer_sizes = [784, 512, 512, 10]
param_scale = 0.1
step_size = 0.1
num_epochs = 10
batch_size = 128

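For completeness, a sketch of how the `print_mean_std` debugging helper above can be hooked into a `scalify`-transformed function; the print format is illustrative, and only `get_data_scale`, `asarray` and the `debug_callback` ops appear in this diff:

```python
from functools import partial

import numpy as np

import jax_scalify as jsa


def print_mean_std(name, v):
    """Print data statistics and scale of a ScaledArray during training."""
    data, scale = jsa.lax.get_data_scale(v)
    # Always use np.float32, to avoid floating errors in descaling + stats.
    data = jsa.asarray(data, dtype=np.float32)
    print(f"{name}: mean={np.mean(data):.4f}, std={np.std(data):.4f}, scale={np.asarray(scale)}")


# Inside the forward pass, e.g. on the logits tensor:
# jsa.ops.debug_callback(partial(print_mean_std, "Logits"), logits)
# (logits,) = jsa.ops.debug_callback_grad(partial(print_mean_std, "LogitsGrad"), logits)
```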