From c418bde3a2ea68df31ca62541401523eb0531528 Mon Sep 17 00:00:00 2001
From: icppWorld <124377669+icppWorld@users.noreply.github.com>
Date: Thu, 7 Sep 2023 05:19:21 +0530
Subject: [PATCH] icpp_llama2_sizer, to calculate resource requirements (#10)

---
 ...EADME_icpp_llama2_resource_requirements.md |  71 +++++
 icpp_llama2/scripts/icpp_llama2_sizer.py      | 266 ++++++++++++++++++
 2 files changed, 337 insertions(+)
 create mode 100644 icpp_llama2/README_icpp_llama2_resource_requirements.md
 create mode 100644 icpp_llama2/scripts/icpp_llama2_sizer.py

diff --git a/icpp_llama2/README_icpp_llama2_resource_requirements.md b/icpp_llama2/README_icpp_llama2_resource_requirements.md
new file mode 100644
index 0000000..b0e6880
--- /dev/null
+++ b/icpp_llama2/README_icpp_llama2_resource_requirements.md
@@ -0,0 +1,71 @@
+# Canister resource requirements for icpp_llama2.
+
+Do not edit this file. It is created with the command:
+```bash
+python -m scripts.icpp_llama2_sizer
+```
+
+### Tokenizer Memory (per model)
+
+Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
+--- | --- | --- | --- | ---
+vocab_memory | 0.00 | 0.12 | 0.12 | 0.12
+vocab_scores_memory | 0.00 | 0.12 | 0.12 | 0.12
+Total | 0.00 | 0.24 | 0.24 | 0.24
+
+
+### TransformerWeights Memory (per model)
+
+Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
+--- | --- | --- | --- | ---
+token_embedding_table | 0.12 | 35.16 | 62.50 | 93.75
+rms_att_weight | 0.00 | 0.01 | 0.02 | 0.04
+wq | 0.08 | 1.90 | 8.00 | 27.00
+wk | 0.04 | 1.90 | 8.00 | 27.00
+wv | 0.04 | 1.90 | 8.00 | 27.00
+wo | 0.08 | 1.90 | 8.00 | 27.00
+rms_ffn_weight | 0.00 | 0.01 | 0.02 | 0.04
+w1 | 0.21 | 5.06 | 21.50 | 72.00
+w2 | 0.21 | 5.06 | 21.50 | 72.00
+w3 | 0.21 | 5.06 | 21.50 | 72.00
+rms_final_weight | 0.00 | 0.00 | 0.00 | 0.00
+wcls | 0.12 | 35.16 | 62.50 | 93.75
+Total | 1.12 | 93.11 | 221.53 | 511.57
+
+
+### RunState Memory (per user)
+
+Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
+--- | --- | --- | --- | ---
+x | 0.00 | 0.00 | 0.00 | 0.00
+xb | 0.00 | 0.00 | 0.00 | 0.00
+xb2 | 0.00 | 0.00 | 0.00 | 0.00
+hb | 0.00 | 0.00 | 0.01 | 0.01
+hb2 | 0.00 | 0.00 | 0.01 | 0.01
+q | 0.00 | 0.00 | 0.00 | 0.00
+k | 0.00 | 0.00 | 0.00 | 0.00
+v | 0.00 | 0.00 | 0.00 | 0.00
+att | 0.02 | 0.01 | 0.03 | 0.05
+logits | 0.00 | 0.12 | 0.12 | 0.12
+key_cache | 0.31 | 1.69 | 16.00 | 36.00
+value_cache | 0.31 | 1.69 | 16.00 | 36.00
+Total | 0.65 | 3.52 | 32.18 | 72.20
+
+
+### Total Memory
+
+Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
+--- | --- | --- | --- | ---
+Total Tokenizer Memory (per model) | 0.00 | 0.24 | 0.24 | 0.24
+Total TransformerWeights Memory (per model) | 1.12 | 93.11 | 221.53 | 511.57
+Total RunState Memory (per user) | 0.65 | 3.52 | 32.18 | 72.20
+Overall Total Memory | 1.76 | 96.62 | 253.71 | 583.78
+
+
+### Canister Metrics
+
+Canister Metrics | 260K | 15M | 42M | 110M
+--- | --- | --- | --- | ---
+Max number of concurrent users | 4760 | 847 | 88 | 35
+
+
diff --git a/icpp_llama2/scripts/icpp_llama2_sizer.py b/icpp_llama2/scripts/icpp_llama2_sizer.py
new file mode 100644
index 0000000..4d82357
--- /dev/null
+++ b/icpp_llama2/scripts/icpp_llama2_sizer.py
@@ -0,0 +1,266 @@
+"""Calculates the required resources to deploy a Llama2 model to an IC canister"""
+# pylint: disable=invalid-name
+import sys
+import struct
+from pathlib import Path
+from typing import TextIO
+
+ROOT_PATH = Path(__file__).parent.parent
+
+# For a 32-bit system
+SIZE_OF_FLOAT = 4  # bytes
+SIZE_OF_POINTER = 4  # bytes
+SIZE_OF_BYTE_PIECES = 512  # bytes (static size)
+
+
+def read_config_from_file(file_path: Path) -> dict[str, int]:
+    """
+    Reads the Config structure from a binary file and returns it as a dictionary.
+    """
+    with open(file_path, "rb") as f:
+        # Read the data corresponding to the Config struct
+        data: bytes = f.read(struct.calcsize("7i"))
+        config_values = struct.unpack("7i", data)
+
+        config: dict[str, int] = {
+            "dim": config_values[0],
+            "hidden_dim": config_values[1],
+            "n_layers": config_values[2],
+            "n_heads": config_values[3],
+            "n_kv_heads": config_values[4],
+            "vocab_size": abs(
+                config_values[5]
+            ),  # account for possible negative vocab_size
+            "seq_len": config_values[6],
+        }
+    return config
+
+
+def calculate_memory(config: dict[str, int]) -> dict[str, dict[str, float]]:
+    """Calculate required memory for all the LLM components"""
+    # Tokenizer
+    vocab_memory = config["vocab_size"] * SIZE_OF_POINTER
+    vocab_scores_memory = config["vocab_size"] * SIZE_OF_FLOAT
+
+    # TransformerWeights
+    head_size = config["dim"] / config["n_heads"]
+    n_layers = config["n_layers"]
+
+    token_embedding_table = config["vocab_size"] * config["dim"] * SIZE_OF_FLOAT
+    rms_att_weight = n_layers * config["dim"] * SIZE_OF_FLOAT
+    wq = n_layers * config["dim"] * (config["n_heads"] * head_size) * SIZE_OF_FLOAT
+    wk = n_layers * config["dim"] * (config["n_kv_heads"] * head_size) * SIZE_OF_FLOAT
+    wv = wk  # Same as wk
+    wo = n_layers * (config["n_heads"] * head_size) * config["dim"] * SIZE_OF_FLOAT
+    rms_ffn_weight = n_layers * config["dim"] * SIZE_OF_FLOAT
+    w1 = n_layers * config["dim"] * config["hidden_dim"] * SIZE_OF_FLOAT
+    w2 = n_layers * config["hidden_dim"] * config["dim"] * SIZE_OF_FLOAT
+    w3 = w1  # Same as w1
+    rms_final_weight = config["dim"] * SIZE_OF_FLOAT
+    wcls = config["vocab_size"] * config["dim"] * SIZE_OF_FLOAT
+
+    # RunState
+    kv_dim = (config["dim"] * config["n_kv_heads"]) / config["n_heads"]
+    x = config["dim"] * SIZE_OF_FLOAT
+    xb = x  # Same as x
+    xb2 = x  # Same as x
+    hb = config["hidden_dim"] * SIZE_OF_FLOAT
+    hb2 = hb  # Same as hb
+    q = x  # Same as x
+    k = kv_dim * SIZE_OF_FLOAT
+    v = k  # Same as k
+    att = config["n_heads"] * config["seq_len"] * SIZE_OF_FLOAT
+    logits = config["vocab_size"] * SIZE_OF_FLOAT
+    key_cache = n_layers * config["seq_len"] * kv_dim * SIZE_OF_FLOAT
+    value_cache = key_cache  # Same as key_cache
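+    # Note: key_cache and value_cache dominate the per-user RunState memory;
+    # each holds n_layers * seq_len * kv_dim floats, so it grows linearly with seq_len.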
+
+    # Calculate total memory usage for Tokenizer, TransformerWeights and RunState
+    total_tokenizer = vocab_memory + vocab_scores_memory + SIZE_OF_BYTE_PIECES
+
+    total_transformer_weights = sum(
+        [
+            token_embedding_table,
+            rms_att_weight,
+            wq,
+            wk,
+            wv,
+            wo,
+            rms_ffn_weight,
+            w1,
+            w2,
+            w3,
+            rms_final_weight,
+            wcls,
+        ]
+    )
+    total_run_state = sum(
+        [x, xb, xb2, hb, hb2, q, k, v, att, logits, key_cache, value_cache]
+    )
+
+    # Collate the results in a dictionary
+    data: dict[str, dict[str, float]] = {
+        "Tokenizer Memory (per model)": {
+            "vocab_memory": vocab_memory / (1024 * 1024),
+            "vocab_scores_memory": vocab_scores_memory / (1024 * 1024),
+        },
+        "TransformerWeights Memory (per model)": {
+            "token_embedding_table": token_embedding_table / (1024 * 1024),
+            "rms_att_weight": rms_att_weight / (1024 * 1024),
+            "wq": wq / (1024 * 1024),
+            "wk": wk / (1024 * 1024),
+            "wv": wv / (1024 * 1024),
+            "wo": wo / (1024 * 1024),
+            "rms_ffn_weight": rms_ffn_weight / (1024 * 1024),
+            "w1": w1 / (1024 * 1024),
+            "w2": w2 / (1024 * 1024),
+            "w3": w3 / (1024 * 1024),
+            "rms_final_weight": rms_final_weight / (1024 * 1024),
+            "wcls": wcls / (1024 * 1024),
+        },
+        "RunState Memory (per user)": {
+            "x": x / (1024 * 1024),
+            "xb": xb / (1024 * 1024),
+            "xb2": xb2 / (1024 * 1024),
+            "hb": hb / (1024 * 1024),
+            "hb2": hb2 / (1024 * 1024),
+            "q": q / (1024 * 1024),
+            "k": k / (1024 * 1024),
+            "v": v / (1024 * 1024),
+            "att": att / (1024 * 1024),
+            "logits": logits / (1024 * 1024),
+            "key_cache": key_cache / (1024 * 1024),
+            "value_cache": value_cache / (1024 * 1024),
+        },
+        "Total Memory": {
+            "Total Tokenizer Memory (per model)": total_tokenizer / (1024 * 1024),
+            "Total TransformerWeights Memory (per model)": total_transformer_weights
+            / (1024 * 1024),
+            "Total RunState Memory (per user)": total_run_state / (1024 * 1024),
+            "Overall Total Memory": (total_transformer_weights + total_run_state)
+            / (1024 * 1024),
+        },
+    }
+    return data
+
+
+def write_data(file: TextIO, title: str, data: dict[str, dict[str, float]]) -> None:
+    """Writes one table in Markdown format to the output file"""
(MB)" for model in data.keys()] + + # Write the table name + file.write(f"### {title}\n\n") + + # Write the headers + file.write(" | ".join(headers) + "\n") + file.write(" | ".join(["---"] * len(headers)) + "\n") + + # Assuming that all models have the same memory types, + # using the first model to get the list of memory types + memory_types = list(data[next(iter(data))].keys()) + + totals = {model: 0.0 for model in data.keys()} + + for mtype in memory_types: + row_data = [mtype] + [ + f"{model_data[mtype]:.2f}" for model_data in data.values() + ] + file.write(" | ".join(row_data) + "\n") + + # Accumulate totals for the first three tables + if title in [ + "Tokenizer Memory (per model)", + "TransformerWeights Memory (per model)", + "RunState Memory (per user)", + ]: + for model, value in zip( + data.keys(), + [model_data[mtype] for model_data in data.values()], + ): + totals[model] += value + + if title in [ + "Tokenizer Memory (per model)", + "TransformerWeights Memory (per model)", + "RunState Memory (per user)", + ]: + # Add the totals to the table + total_row = ["Total"] + [f"{totals[model]:.2f}" for model in data.keys()] + file.write(" | ".join(total_row) + "\n") + else: + # Calculate max users for each model + # Calculate number of users for each model and add it to the data + number_of_users = {} + for model, values in data.items(): + total_available_memory = 3 * 1024 # Available canister memory in MB + total_tokenizer_memory = values["Total Tokenizer Memory (per model)"] + total_transformer_weights_memory = values[ + "Total TransformerWeights Memory (per model)" + ] + total_runstate_memory = values["Total RunState Memory (per user)"] + + number_of_users[model] = int( + ( + total_available_memory + - total_tokenizer_memory + - total_transformer_weights_memory + ) + / total_runstate_memory + ) + + # Write the markdown table for number of users + file.write("\n\n") + # Get the models for headers + headers = ["Canister Metrics"] + [f"{model}
(MB)" for model in data.keys()] + + # Write the table name + file.write("### Canister Metrics\n\n") + + # Write the headers + file.write(" | ".join(headers) + "\n") + file.write(" | ".join(["---"] * len(headers)) + "\n") + + row_data = ["Max number of concurrent users"] + [ + f"{number_of_users[model]}" for model in data.keys() + ] + file.write(" | ".join(row_data) + "\n") + + file.write("\n\n") + + +def main() -> int: + """Reads the model.bin files and summarizes the resource requirements.""" + file_paths: dict[str, Path] = { + "260K": ROOT_PATH / "stories260K/stories260K.bin", + "15M": ROOT_PATH / "models/stories15M.bin", + "42M": ROOT_PATH / "models/stories42M.bin", + "110M": ROOT_PATH / "models/stories110M.bin", + } + + data = {} + for key, file_path in file_paths.items(): + config: dict[str, int] = read_config_from_file(file_path) + data[key] = calculate_memory(config) + + output_path = ROOT_PATH / "README_icpp_llama2_resource_requirements.md" + with open(output_path, "w") as file: + file.write("# Canister resource requirements for icpp_llama2.") + file.write("\n") + file.write("\nDo not edit this file. It is created with the command: ") + file.write("\n```bash") + file.write("\npython -m scripts.icpp_llama2_sizer") + file.write("\n```\n\n") + for key in [ + "Tokenizer Memory (per model)", + "TransformerWeights Memory (per model)", + "RunState Memory (per user)", + "Total Memory", + ]: + subset_data = {k: v[key] for k, v in data.items()} + write_data(file, key, subset_data) + + return 0 + + +if __name__ == "__main__": + sys.exit(main())