# icpp_llama2_sizer, to calculate resource requirements (#10)
Showing 2 changed files with 337 additions and 0 deletions.
`README_icpp_llama2_resource_requirements.md` (new file)

@@ -0,0 +1,71 @@
# Canister resource requirements for icpp_llama2.

Do not edit this file. It is created with the command:
```bash
python -m scripts.icpp_llama2_sizer
```

### Tokenizer Memory (per model)

Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
--- | --- | --- | --- | ---
vocab_memory | 0.00 | 0.12 | 0.12 | 0.12
vocab_scores_memory | 0.00 | 0.12 | 0.12 | 0.12
Total | 0.00 | 0.24 | 0.24 | 0.24


### TransformerWeights Memory (per model)

Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
--- | --- | --- | --- | ---
token_embedding_table | 0.12 | 35.16 | 62.50 | 93.75
rms_att_weight | 0.00 | 0.01 | 0.02 | 0.04
wq | 0.08 | 1.90 | 8.00 | 27.00
wk | 0.04 | 1.90 | 8.00 | 27.00
wv | 0.04 | 1.90 | 8.00 | 27.00
wo | 0.08 | 1.90 | 8.00 | 27.00
rms_ffn_weight | 0.00 | 0.01 | 0.02 | 0.04
w1 | 0.21 | 5.06 | 21.50 | 72.00
w2 | 0.21 | 5.06 | 21.50 | 72.00
w3 | 0.21 | 5.06 | 21.50 | 72.00
rms_final_weight | 0.00 | 0.00 | 0.00 | 0.00
wcls | 0.12 | 35.16 | 62.50 | 93.75
Total | 1.12 | 93.11 | 221.53 | 511.57


### RunState Memory (per user)

Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
--- | --- | --- | --- | ---
x | 0.00 | 0.00 | 0.00 | 0.00
xb | 0.00 | 0.00 | 0.00 | 0.00
xb2 | 0.00 | 0.00 | 0.00 | 0.00
hb | 0.00 | 0.00 | 0.01 | 0.01
hb2 | 0.00 | 0.00 | 0.01 | 0.01
q | 0.00 | 0.00 | 0.00 | 0.00
k | 0.00 | 0.00 | 0.00 | 0.00
v | 0.00 | 0.00 | 0.00 | 0.00
att | 0.02 | 0.01 | 0.03 | 0.05
logits | 0.00 | 0.12 | 0.12 | 0.12
key_cache | 0.31 | 1.69 | 16.00 | 36.00
value_cache | 0.31 | 1.69 | 16.00 | 36.00
Total | 0.65 | 3.52 | 32.18 | 72.20


### Total Memory

Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
--- | --- | --- | --- | ---
Total Tokenizer Memory (per model) | 0.00 | 0.24 | 0.24 | 0.24
Total TransformerWeights Memory (per model) | 1.12 | 93.11 | 221.53 | 511.57
Total RunState Memory (per user) | 0.65 | 3.52 | 32.18 | 72.20
Overall Total Memory | 1.76 | 96.62 | 253.71 | 583.78


### Canister Metrics

Canister Metrics | 260K | 15M | 42M | 110M
--- | --- | --- | --- | ---
Max number of concurrent users | 4760 | 847 | 88 | 35
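
The max-users row follows directly from the three totals above. A minimal
sketch for the 110M column (the `3 * 1024` MB budget is the value hard-coded in
the sizer script below; the script divides unrounded byte counts, so results
from the rounded table values can land a digit or two away for the smaller
models):

```python
# Reproduce "Max number of concurrent users" for the 110M model (values in MB).
available = 3 * 1024   # usable canister memory assumed by the sizer script
tokenizer = 0.24       # Total Tokenizer Memory (per model)
weights = 511.57       # Total TransformerWeights Memory (per model)
run_state = 72.20      # Total RunState Memory (per user)

print(int((available - tokenizer - weights) / run_state))  # -> 35
```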
`scripts/icpp_llama2_sizer.py` (new file)

@@ -0,0 +1,266 @@
"""Calculates the require resources to deploy a Llama2 model to an IC canister""" | ||
# pylint: disable=invalid-name | ||
import sys | ||
import struct | ||
from pathlib import Path | ||
from typing import TextIO | ||
|
||
ROOT_PATH = Path(__file__).parent.parent | ||
|
||
# For 32 bit system | ||
SIZE_OF_FLOAT = 4 # bytes | ||
SIZE_OF_POINTER = 4 # bytes | ||
SIZE_OF_BYTE_PIECES = 512 # bytes (static size) | ||
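# (IC canisters run 32-bit WebAssembly, so pointers and C floats are 4 bytes
# each; the 512 bytes of byte pieces correspond to llama2.c's static
# `unsigned char byte_pieces[512]` tokenizer buffer.)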


def read_config_from_file(file_path: Path) -> dict[str, int]:
    """
    Reads the Config structure from a binary file and returns it as a dictionary.
    """
    with open(file_path, "rb") as f:
        # Read the data corresponding to the Config struct
        data: bytes = f.read(struct.calcsize("7i"))
        config_values = struct.unpack("7i", data)

    config: dict[str, int] = {
        "dim": config_values[0],
        "hidden_dim": config_values[1],
        "n_layers": config_values[2],
        "n_heads": config_values[3],
        "n_kv_heads": config_values[4],
        "vocab_size": abs(
            config_values[5]
        ),  # account for possible negative vocab_size
        "seq_len": config_values[6],
    }
    return config
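
# Note: the "7i" header mirrors llama2.c's Config struct: seven 32-bit ints
# (dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len),
# 28 bytes in total. For example, a header with values matching the 15M
# tables above could be written with:
#     struct.pack("7i", 288, 768, 6, 6, 6, 32000, 256)
# In llama2.c, a negative vocab_size signals unshared classifier weights,
# hence the abs() above.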


def calculate_memory(config: dict[str, int]) -> dict[str, dict[str, float]]:
    """Calculate required memory for all the LLM components"""
    # Tokenizer
    vocab_memory = config["vocab_size"] * SIZE_OF_POINTER
    vocab_scores_memory = config["vocab_size"] * SIZE_OF_FLOAT

    # TransformerWeights
    head_size = config["dim"] / config["n_heads"]
    n_layers = config["n_layers"]

    token_embedding_table = config["vocab_size"] * config["dim"] * SIZE_OF_FLOAT
    rms_att_weight = n_layers * config["dim"] * SIZE_OF_FLOAT
    wq = n_layers * config["dim"] * (config["n_heads"] * head_size) * SIZE_OF_FLOAT
    wk = n_layers * config["dim"] * (config["n_kv_heads"] * head_size) * SIZE_OF_FLOAT
    wv = wk  # Same as wk
    wo = n_layers * (config["n_heads"] * head_size) * config["dim"] * SIZE_OF_FLOAT
    rms_ffn_weight = n_layers * config["dim"] * SIZE_OF_FLOAT
    w1 = n_layers * config["dim"] * config["hidden_dim"] * SIZE_OF_FLOAT
    w2 = n_layers * config["hidden_dim"] * config["dim"] * SIZE_OF_FLOAT
    w3 = w1  # Same as w1
    rms_final_weight = config["dim"] * SIZE_OF_FLOAT
    wcls = config["vocab_size"] * config["dim"] * SIZE_OF_FLOAT

    # RunState
    kv_dim = (config["dim"] * config["n_kv_heads"]) / config["n_heads"]
    x = config["dim"] * SIZE_OF_FLOAT
    xb = x  # Same as x
    xb2 = x  # Same as x
    hb = config["hidden_dim"] * SIZE_OF_FLOAT
    hb2 = hb  # Same as hb
    q = x  # Same as x
    k = kv_dim * SIZE_OF_FLOAT
    v = k  # Same as k
    att = config["n_heads"] * config["seq_len"] * SIZE_OF_FLOAT
    logits = config["vocab_size"] * SIZE_OF_FLOAT
    key_cache = n_layers * config["seq_len"] * kv_dim * SIZE_OF_FLOAT
    value_cache = key_cache  # Same as key_cache
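
    # Note: kv_dim reflects grouped-query attention (n_kv_heads can be smaller
    # than n_heads); the key and value caches, n_layers * seq_len * kv_dim
    # floats each, dominate the per-user RunState memory (see the tables above).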

    # Calculate total memory usage for Tokenizer, TransformerWeights and RunState
    total_tokenizer = vocab_memory + vocab_scores_memory + SIZE_OF_BYTE_PIECES

    total_transformer_weights = sum(
        [
            token_embedding_table,
            rms_att_weight,
            wq,
            wk,
            wv,
            wo,
            rms_ffn_weight,
            w1,
            w2,
            w3,
            rms_final_weight,
            wcls,
        ]
    )
    total_run_state = sum(
        [x, xb, xb2, hb, hb2, q, k, v, att, logits, key_cache, value_cache]
    )

    # Collate the results in a dictionary, converted from bytes to MB
    data: dict[str, dict[str, float]] = {
        "Tokenizer Memory (per model)": {
            "vocab_memory": vocab_memory / (1024 * 1024),
            "vocab_scores_memory": vocab_scores_memory / (1024 * 1024),
        },
        "TransformerWeights Memory (per model)": {
            "token_embedding_table": token_embedding_table / (1024 * 1024),
            "rms_att_weight": rms_att_weight / (1024 * 1024),
            "wq": wq / (1024 * 1024),
            "wk": wk / (1024 * 1024),
            "wv": wv / (1024 * 1024),
            "wo": wo / (1024 * 1024),
            "rms_ffn_weight": rms_ffn_weight / (1024 * 1024),
            "w1": w1 / (1024 * 1024),
            "w2": w2 / (1024 * 1024),
            "w3": w3 / (1024 * 1024),
            "rms_final_weight": rms_final_weight / (1024 * 1024),
            "wcls": wcls / (1024 * 1024),
        },
        "RunState Memory (per user)": {
            "x": x / (1024 * 1024),
            "xb": xb / (1024 * 1024),
            "xb2": xb2 / (1024 * 1024),
            "hb": hb / (1024 * 1024),
            "hb2": hb2 / (1024 * 1024),
            "q": q / (1024 * 1024),
            "k": k / (1024 * 1024),
            "v": v / (1024 * 1024),
            "att": att / (1024 * 1024),
            "logits": logits / (1024 * 1024),
            "key_cache": key_cache / (1024 * 1024),
            "value_cache": value_cache / (1024 * 1024),
        },
        "Total Memory": {
            "Total Tokenizer Memory (per model)": total_tokenizer / (1024 * 1024),
            "Total TransformerWeights Memory (per model)": total_transformer_weights
            / (1024 * 1024),
            "Total RunState Memory (per user)": total_run_state / (1024 * 1024),
            # Note: tokenizer memory is not included in this overall total
            "Overall Total Memory": (total_transformer_weights + total_run_state)
            / (1024 * 1024),
        },
    }
    return data


def write_data(file: TextIO, title: str, data: dict[str, dict[str, float]]) -> None:
    """Writes one table of the data to the Markdown file"""
    # Get the models for headers
    headers = ["Memory Type"] + [f"{model}<br>(MB)" for model in data.keys()]

    # Write the table name
    file.write(f"### {title}\n\n")

    # Write the headers
    file.write(" | ".join(headers) + "\n")
    file.write(" | ".join(["---"] * len(headers)) + "\n")

    # Assuming that all models have the same memory types,
    # use the first model to get the list of memory types
    memory_types = list(data[next(iter(data))].keys())

    totals = {model: 0.0 for model in data.keys()}

    for mtype in memory_types:
        row_data = [mtype] + [
            f"{model_data[mtype]:.2f}" for model_data in data.values()
        ]
        file.write(" | ".join(row_data) + "\n")

        # Accumulate totals for the first three tables
        if title in [
            "Tokenizer Memory (per model)",
            "TransformerWeights Memory (per model)",
            "RunState Memory (per user)",
        ]:
            for model, value in zip(
                data.keys(),
                [model_data[mtype] for model_data in data.values()],
            ):
                totals[model] += value

    if title in [
        "Tokenizer Memory (per model)",
        "TransformerWeights Memory (per model)",
        "RunState Memory (per user)",
    ]:
        # Add the totals to the table
        total_row = ["Total"] + [f"{totals[model]:.2f}" for model in data.keys()]
        file.write(" | ".join(total_row) + "\n")
    else:
        # Calculate the max number of concurrent users for each model
        number_of_users = {}
        for model, values in data.items():
            total_available_memory = 3 * 1024  # Available canister memory in MB
            total_tokenizer_memory = values["Total Tokenizer Memory (per model)"]
            total_transformer_weights_memory = values[
                "Total TransformerWeights Memory (per model)"
            ]
            total_runstate_memory = values["Total RunState Memory (per user)"]

            number_of_users[model] = int(
                (
                    total_available_memory
                    - total_tokenizer_memory
                    - total_transformer_weights_memory
                )
                / total_runstate_memory
            )

        # Write the markdown table for number of users
        file.write("\n\n")
        # Get the models for headers (no "(MB)" suffix; these metrics are counts)
        headers = ["Canister Metrics"] + list(data.keys())

        # Write the table name
        file.write("### Canister Metrics\n\n")

        # Write the headers
        file.write(" | ".join(headers) + "\n")
        file.write(" | ".join(["---"] * len(headers)) + "\n")

        row_data = ["Max number of concurrent users"] + [
            f"{number_of_users[model]}" for model in data.keys()
        ]
        file.write(" | ".join(row_data) + "\n")

    file.write("\n\n")


def main() -> int:
    """Reads the model.bin files and summarizes the resource requirements."""
    file_paths: dict[str, Path] = {
        "260K": ROOT_PATH / "stories260K/stories260K.bin",
        "15M": ROOT_PATH / "models/stories15M.bin",
        "42M": ROOT_PATH / "models/stories42M.bin",
        "110M": ROOT_PATH / "models/stories110M.bin",
    }

    data = {}
    for key, file_path in file_paths.items():
        config: dict[str, int] = read_config_from_file(file_path)
        data[key] = calculate_memory(config)

    output_path = ROOT_PATH / "README_icpp_llama2_resource_requirements.md"
    with open(output_path, "w") as file:
        file.write("# Canister resource requirements for icpp_llama2.")
        file.write("\n")
        file.write("\nDo not edit this file. It is created with the command: ")
        file.write("\n```bash")
        file.write("\npython -m scripts.icpp_llama2_sizer")
        file.write("\n```\n\n")
        for key in [
            "Tokenizer Memory (per model)",
            "TransformerWeights Memory (per model)",
            "RunState Memory (per user)",
            "Total Memory",
        ]:
            subset_data = {k: v[key] for k, v in data.items()}
            write_data(file, key, subset_data)

    return 0


if __name__ == "__main__":
    sys.exit(main())
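
For a quick sanity check of `calculate_memory`, the sketch below feeds it a
hand-written config and compares two entries against the 15M column above. The
seven values are assumptions chosen to match the stories15M checkpoint (they
reproduce the table numbers), and the import path assumes the module lives at
`scripts/icpp_llama2_sizer.py`, as the `python -m scripts.icpp_llama2_sizer`
invocation suggests.

```python
# Sanity-check calculate_memory() against the 15M column of the tables above.
from scripts.icpp_llama2_sizer import calculate_memory

config = {
    "dim": 288,
    "hidden_dim": 768,
    "n_layers": 6,
    "n_heads": 6,
    "n_kv_heads": 6,
    "vocab_size": 32000,
    "seq_len": 256,
}
data = calculate_memory(config)

weights = data["TransformerWeights Memory (per model)"]
print(f"{weights['token_embedding_table']:.2f} MB")     # 35.16, as in the table
print(f"{sum(weights.values()):.2f} MB total weights")  # 93.11, as in the table
```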