From c418bde3a2ea68df31ca62541401523eb0531528 Mon Sep 17 00:00:00 2001
From: icppWorld <124377669+icppWorld@users.noreply.github.com>
Date: Thu, 7 Sep 2023 05:19:21 +0530
Subject: [PATCH] icpp_llama2_sizer, to calculate resource requirements (#10)

---
 ...EADME_icpp_llama2_resource_requirements.md |  71 +++++
 icpp_llama2/scripts/icpp_llama2_sizer.py      | 266 ++++++++++++++++++
 2 files changed, 337 insertions(+)
 create mode 100644 icpp_llama2/README_icpp_llama2_resource_requirements.md
 create mode 100644 icpp_llama2/scripts/icpp_llama2_sizer.py

diff --git a/icpp_llama2/README_icpp_llama2_resource_requirements.md b/icpp_llama2/README_icpp_llama2_resource_requirements.md
new file mode 100644
index 0000000..b0e6880
--- /dev/null
+++ b/icpp_llama2/README_icpp_llama2_resource_requirements.md
@@ -0,0 +1,71 @@
+# Canister resource requirements for icpp_llama2.
+
+Do not edit this file. It is created with the command:
+```bash
+python -m scripts.icpp_llama2_sizer
+```
+
+### Tokenizer Memory (per model)
+
+Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
+--- | --- | --- | --- | ---
+vocab_memory | 0.00 | 0.12 | 0.12 | 0.12
+vocab_scores_memory | 0.00 | 0.12 | 0.12 | 0.12
+Total | 0.00 | 0.24 | 0.24 | 0.24
+
+
+### TransformerWeights Memory (per model)
+
+Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
+--- | --- | --- | --- | ---
+token_embedding_table | 0.12 | 35.16 | 62.50 | 93.75
+rms_att_weight | 0.00 | 0.01 | 0.02 | 0.04
+wq | 0.08 | 1.90 | 8.00 | 27.00
+wk | 0.04 | 1.90 | 8.00 | 27.00
+wv | 0.04 | 1.90 | 8.00 | 27.00
+wo | 0.08 | 1.90 | 8.00 | 27.00
+rms_ffn_weight | 0.00 | 0.01 | 0.02 | 0.04
+w1 | 0.21 | 5.06 | 21.50 | 72.00
+w2 | 0.21 | 5.06 | 21.50 | 72.00
+w3 | 0.21 | 5.06 | 21.50 | 72.00
+rms_final_weight | 0.00 | 0.00 | 0.00 | 0.00
+wcls | 0.12 | 35.16 | 62.50 | 93.75
+Total | 1.12 | 93.11 | 221.53 | 511.57
+
+
+### RunState Memory (per user)
+
+Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
+--- | --- | --- | --- | ---
+x | 0.00 | 0.00 | 0.00 | 0.00
+xb | 0.00 | 0.00 | 0.00 | 0.00
+xb2 | 0.00 | 0.00 | 0.00 | 0.00
+hb | 0.00 | 0.00 | 0.01 | 0.01
+hb2 | 0.00 | 0.00 | 0.01 | 0.01
+q | 0.00 | 0.00 | 0.00 | 0.00
+k | 0.00 | 0.00 | 0.00 | 0.00
+v | 0.00 | 0.00 | 0.00 | 0.00
+att | 0.02 | 0.01 | 0.03 | 0.05
+logits | 0.00 | 0.12 | 0.12 | 0.12
+key_cache | 0.31 | 1.69 | 16.00 | 36.00
+value_cache | 0.31 | 1.69 | 16.00 | 36.00
+Total | 0.65 | 3.52 | 32.18 | 72.20
+
+
+### Total Memory
+
+Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
+--- | --- | --- | --- | ---
+Total Tokenizer Memory (per model) | 0.00 | 0.24 | 0.24 | 0.24
+Total TransformerWeights Memory (per model) | 1.12 | 93.11 | 221.53 | 511.57
+Total RunState Memory (per user) | 0.65 | 3.52 | 32.18 | 72.20
+Overall Total Memory | 1.76 | 96.62 | 253.71 | 583.78
+
+
+### Canister Metrics
+
+Canister Metrics | 260K | 15M | 42M | 110M
+--- | --- | --- | --- | ---
+Max number of concurrent users | 4760 | 847 | 88 | 35
+
+
diff --git a/icpp_llama2/scripts/icpp_llama2_sizer.py b/icpp_llama2/scripts/icpp_llama2_sizer.py
new file mode 100644
index 0000000..4d82357
--- /dev/null
+++ b/icpp_llama2/scripts/icpp_llama2_sizer.py
@@ -0,0 +1,266 @@
+"""Calculates the required resources to deploy a Llama2 model to an IC canister"""
+# pylint: disable=invalid-name
+import sys
+import struct
+from pathlib import Path
+from typing import TextIO
+
+ROOT_PATH = Path(__file__).parent.parent
+
+# For a 32-bit system
+SIZE_OF_FLOAT = 4  # bytes
+SIZE_OF_POINTER = 4  # bytes
+SIZE_OF_BYTE_PIECES = 512  # bytes (static size)
+
+
+def read_config_from_file(file_path: Path) -> dict[str, int]:
+    """
+    Reads the Config structure from a binary file and returns it as a dictionary.
+    """
+    with open(file_path, "rb") as f:
+        # Read the data corresponding to the Config struct
+        data: bytes = f.read(struct.calcsize("7i"))
+        config_values = struct.unpack("7i", data)
+
+        config: dict[str, int] = {
+            "dim": config_values[0],
+            "hidden_dim": config_values[1],
+            "n_layers": config_values[2],
+            "n_heads": config_values[3],
+            "n_kv_heads": config_values[4],
+            "vocab_size": abs(
+                config_values[5]
+            ),  # account for possible negative vocab_size
+            "seq_len": config_values[6],
+        }
+    return config
+
+
+def calculate_memory(config: dict[str, int]) -> dict[str, dict[str, float]]:
+    """Calculate required memory for all the LLM components"""
+    # Tokenizer
+    vocab_memory = config["vocab_size"] * SIZE_OF_POINTER
+    vocab_scores_memory = config["vocab_size"] * SIZE_OF_FLOAT
+
+    # TransformerWeights
+    head_size = config["dim"] / config["n_heads"]
+    n_layers = config["n_layers"]
+
+    token_embedding_table = config["vocab_size"] * config["dim"] * SIZE_OF_FLOAT
+    rms_att_weight = n_layers * config["dim"] * SIZE_OF_FLOAT
+    wq = n_layers * config["dim"] * (config["n_heads"] * head_size) * SIZE_OF_FLOAT
+    wk = n_layers * config["dim"] * (config["n_kv_heads"] * head_size) * SIZE_OF_FLOAT
+    wv = wk  # Same as wk
+    wo = n_layers * (config["n_heads"] * head_size) * config["dim"] * SIZE_OF_FLOAT
+    rms_ffn_weight = n_layers * config["dim"] * SIZE_OF_FLOAT
+    w1 = n_layers * config["dim"] * config["hidden_dim"] * SIZE_OF_FLOAT
+    w2 = n_layers * config["hidden_dim"] * config["dim"] * SIZE_OF_FLOAT
+    w3 = w1  # Same as w1
+    rms_final_weight = config["dim"] * SIZE_OF_FLOAT
+    wcls = config["vocab_size"] * config["dim"] * SIZE_OF_FLOAT
+
+    # RunState
+    kv_dim = (config["dim"] * config["n_kv_heads"]) / config["n_heads"]
+    x = config["dim"] * SIZE_OF_FLOAT
+    xb = x  # Same as x
+    xb2 = x  # Same as x
+    hb = config["hidden_dim"] * SIZE_OF_FLOAT
+    hb2 = hb  # Same as hb
+    q = x  # Same as x
+    k = kv_dim * SIZE_OF_FLOAT
+    v = k  # Same as k
+    att = config["n_heads"] * config["seq_len"] * SIZE_OF_FLOAT
+    logits = config["vocab_size"] * SIZE_OF_FLOAT
+    key_cache = n_layers * config["seq_len"] * kv_dim * SIZE_OF_FLOAT
+    value_cache = key_cache  # Same as key_cache
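+    # Note: key_cache and value_cache dominate the per-user RunState memory;
+    # each holds n_layers * seq_len * kv_dim floats, so it grows linearly with seq_len.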
+
+    # Calculate total memory usage for Tokenizer, TransformerWeights and RunState
+    total_tokenizer = vocab_memory + vocab_scores_memory + SIZE_OF_BYTE_PIECES
+
+    total_transformer_weights = sum(
+        [
+            token_embedding_table,
+            rms_att_weight,
+            wq,
+            wk,
+            wv,
+            wo,
+            rms_ffn_weight,
+            w1,
+            w2,
+            w3,
+            rms_final_weight,
+            wcls,
+        ]
+    )
+    total_run_state = sum(
+        [x, xb, xb2, hb, hb2, q, k, v, att, logits, key_cache, value_cache]
+    )
+
+    # Collate the results in a dictionary
+    data: dict[str, dict[str, float]] = {
+        "Tokenizer Memory (per model)": {
+            "vocab_memory": vocab_memory / (1024 * 1024),
+            "vocab_scores_memory": vocab_scores_memory / (1024 * 1024),
+        },
+        "TransformerWeights Memory (per model)": {
+            "token_embedding_table": token_embedding_table / (1024 * 1024),
+            "rms_att_weight": rms_att_weight / (1024 * 1024),
+            "wq": wq / (1024 * 1024),
+            "wk": wk / (1024 * 1024),
+            "wv": wv / (1024 * 1024),
+            "wo": wo / (1024 * 1024),
+            "rms_ffn_weight": rms_ffn_weight / (1024 * 1024),
+            "w1": w1 / (1024 * 1024),
+            "w2": w2 / (1024 * 1024),
+            "w3": w3 / (1024 * 1024),
+            "rms_final_weight": rms_final_weight / (1024 * 1024),
+            "wcls": wcls / (1024 * 1024),
+        },
+        "RunState Memory (per user)": {
+            "x": x / (1024 * 1024),
+            "xb": xb / (1024 * 1024),
+            "xb2": xb2 / (1024 * 1024),
+            "hb": hb / (1024 * 1024),
+            "hb2": hb2 / (1024 * 1024),
+            "q": q / (1024 * 1024),
+            "k": k / (1024 * 1024),
+            "v": v / (1024 * 1024),
+            "att": att / (1024 * 1024),
+            "logits": logits / (1024 * 1024),
+            "key_cache": key_cache / (1024 * 1024),
+            "value_cache": value_cache / (1024 * 1024),
+        },
+        "Total Memory": {
+            "Total Tokenizer Memory (per model)": total_tokenizer / (1024 * 1024),
+            "Total TransformerWeights Memory (per model)": total_transformer_weights
+            / (1024 * 1024),
+            "Total RunState Memory (per user)": total_run_state / (1024 * 1024),
+            "Overall Total Memory": (total_transformer_weights + total_run_state)
+            / (1024 * 1024),
+        },
+    }
+    return data
+
+
+def write_data(file: TextIO, title: str, data: dict[str, dict[str, float]]) -> None:
+    """Writes one table in Markdown format to the output file"""
(MB)" for model in data.keys()] + + # Write the table name + file.write(f"### {title}\n\n") + + # Write the headers + file.write(" | ".join(headers) + "\n") + file.write(" | ".join(["---"] * len(headers)) + "\n") + + # Assuming that all models have the same memory types, + # using the first model to get the list of memory types + memory_types = list(data[next(iter(data))].keys()) + + totals = {model: 0.0 for model in data.keys()} + + for mtype in memory_types: + row_data = [mtype] + [ + f"{model_data[mtype]:.2f}" for model_data in data.values() + ] + file.write(" | ".join(row_data) + "\n") + + # Accumulate totals for the first three tables + if title in [ + "Tokenizer Memory (per model)", + "TransformerWeights Memory (per model)", + "RunState Memory (per user)", + ]: + for model, value in zip( + data.keys(), + [model_data[mtype] for model_data in data.values()], + ): + totals[model] += value + + if title in [ + "Tokenizer Memory (per model)", + "TransformerWeights Memory (per model)", + "RunState Memory (per user)", + ]: + # Add the totals to the table + total_row = ["Total"] + [f"{totals[model]:.2f}" for model in data.keys()] + file.write(" | ".join(total_row) + "\n") + else: + # Calculate max users for each model + # Calculate number of users for each model and add it to the data + number_of_users = {} + for model, values in data.items(): + total_available_memory = 3 * 1024 # Available canister memory in MB + total_tokenizer_memory = values["Total Tokenizer Memory (per model)"] + total_transformer_weights_memory = values[ + "Total TransformerWeights Memory (per model)" + ] + total_runstate_memory = values["Total RunState Memory (per user)"] + + number_of_users[model] = int( + ( + total_available_memory + - total_tokenizer_memory + - total_transformer_weights_memory + ) + / total_runstate_memory + ) + + # Write the markdown table for number of users + file.write("\n\n") + # Get the models for headers + headers = ["Canister Metrics"] + [f"{model}
(MB)" for model in data.keys()] + + # Write the table name + file.write("### Canister Metrics\n\n") + + # Write the headers + file.write(" | ".join(headers) + "\n") + file.write(" | ".join(["---"] * len(headers)) + "\n") + + row_data = ["Max number of concurrent users"] + [ + f"{number_of_users[model]}" for model in data.keys() + ] + file.write(" | ".join(row_data) + "\n") + + file.write("\n\n") + + +def main() -> int: + """Reads the model.bin files and summarizes the resource requirements.""" + file_paths: dict[str, Path] = { + "260K": ROOT_PATH / "stories260K/stories260K.bin", + "15M": ROOT_PATH / "models/stories15M.bin", + "42M": ROOT_PATH / "models/stories42M.bin", + "110M": ROOT_PATH / "models/stories110M.bin", + } + + data = {} + for key, file_path in file_paths.items(): + config: dict[str, int] = read_config_from_file(file_path) + data[key] = calculate_memory(config) + + output_path = ROOT_PATH / "README_icpp_llama2_resource_requirements.md" + with open(output_path, "w") as file: + file.write("# Canister resource requirements for icpp_llama2.") + file.write("\n") + file.write("\nDo not edit this file. It is created with the command: ") + file.write("\n```bash") + file.write("\npython -m scripts.icpp_llama2_sizer") + file.write("\n```\n\n") + for key in [ + "Tokenizer Memory (per model)", + "TransformerWeights Memory (per model)", + "RunState Memory (per user)", + "Total Memory", + ]: + subset_data = {k: v[key] for k, v in data.items()} + write_data(file, key, subset_data) + + return 0 + + +if __name__ == "__main__": + sys.exit(main())