icpp_llama2_sizer, to calculate resource requirements (#10)
icppWorld authored Sep 6, 2023
1 parent 8143435 commit c418bde
Showing 2 changed files with 337 additions and 0 deletions.
71 changes: 71 additions & 0 deletions icpp_llama2/README_icpp_llama2_resource_requirements.md
@@ -0,0 +1,71 @@
# Canister resource requirements for icpp_llama2

Do not edit this file. It is created with the command:
```bash
python -m scripts.icpp_llama2_sizer
```

### Tokenizer Memory (per model)

Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
--- | --- | --- | --- | ---
vocab_memory | 0.00 | 0.12 | 0.12 | 0.12
vocab_scores_memory | 0.00 | 0.12 | 0.12 | 0.12
Total | 0.00 | 0.24 | 0.24 | 0.24


### TransformerWeights Memory (per model)

Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
--- | --- | --- | --- | ---
token_embedding_table | 0.12 | 35.16 | 62.50 | 93.75
rms_att_weight | 0.00 | 0.01 | 0.02 | 0.04
wq | 0.08 | 1.90 | 8.00 | 27.00
wk | 0.04 | 1.90 | 8.00 | 27.00
wv | 0.04 | 1.90 | 8.00 | 27.00
wo | 0.08 | 1.90 | 8.00 | 27.00
rms_ffn_weight | 0.00 | 0.01 | 0.02 | 0.04
w1 | 0.21 | 5.06 | 21.50 | 72.00
w2 | 0.21 | 5.06 | 21.50 | 72.00
w3 | 0.21 | 5.06 | 21.50 | 72.00
rms_final_weight | 0.00 | 0.00 | 0.00 | 0.00
wcls | 0.12 | 35.16 | 62.50 | 93.75
Total | 1.12 | 93.11 | 221.53 | 511.57


### RunState Memory (per user)

Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
--- | --- | --- | --- | ---
x | 0.00 | 0.00 | 0.00 | 0.00
xb | 0.00 | 0.00 | 0.00 | 0.00
xb2 | 0.00 | 0.00 | 0.00 | 0.00
hb | 0.00 | 0.00 | 0.01 | 0.01
hb2 | 0.00 | 0.00 | 0.01 | 0.01
q | 0.00 | 0.00 | 0.00 | 0.00
k | 0.00 | 0.00 | 0.00 | 0.00
v | 0.00 | 0.00 | 0.00 | 0.00
att | 0.02 | 0.01 | 0.03 | 0.05
logits | 0.00 | 0.12 | 0.12 | 0.12
key_cache | 0.31 | 1.69 | 16.00 | 36.00
value_cache | 0.31 | 1.69 | 16.00 | 36.00
Total | 0.65 | 3.52 | 32.18 | 72.20


### Total Memory

Memory Type | 260K<br>(MB) | 15M<br>(MB) | 42M<br>(MB) | 110M<br>(MB)
--- | --- | --- | --- | ---
Total Tokenizer Memory (per model) | 0.00 | 0.24 | 0.24 | 0.24
Total TransformerWeights Memory (per model) | 1.12 | 93.11 | 221.53 | 511.57
Total RunState Memory (per user) | 0.65 | 3.52 | 32.18 | 72.20
Overall Total Memory | 1.76 | 96.62 | 253.71 | 583.78


### Canister Metrics

Canister Metrics | 260K | 15M | 42M | 110M
--- | --- | --- | --- | ---
Max number of concurrent users | 4760 | 847 | 88 | 35
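
These figures follow from the tables above: max users = (available canister memory - total Tokenizer memory - total TransformerWeights memory) / RunState memory per user, rounded down, with 3 * 1024 MB of canister memory assumed available (see `scripts/icpp_llama2_sizer.py`). For the 110M model, for example: (3072 - 0.24 - 511.57) / 72.20 ≈ 35.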


266 changes: 266 additions & 0 deletions icpp_llama2/scripts/icpp_llama2_sizer.py
@@ -0,0 +1,266 @@
"""Calculates the require resources to deploy a Llama2 model to an IC canister"""
# pylint: disable=invalid-name
import sys
import struct
from pathlib import Path
from typing import TextIO

ROOT_PATH = Path(__file__).parent.parent

# For a 32-bit system
SIZE_OF_FLOAT = 4 # bytes
SIZE_OF_POINTER = 4 # bytes
SIZE_OF_BYTE_PIECES = 512 # bytes (static size)
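
# Note: the "7i" format used in read_config_from_file() below matches the
# Config struct at the head of a llama2.c model.bin file: seven consecutive
# int32 values, in this order:
#   dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len
# SIZE_OF_BYTE_PIECES corresponds to llama2.c's byte_pieces[512] buffer
# (256 single-byte tokenizer pieces of 2 bytes each).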


def read_config_from_file(file_path: Path) -> dict[str, int]:
"""
Reads the Config structure from a binary file and returns it as a dictionary.
"""
with open(file_path, "rb") as f:
# Read the data corresponding to the Config struct
data: bytes = f.read(struct.calcsize("7i"))
config_values = struct.unpack("7i", data)

config: dict[str, int] = {
"dim": config_values[0],
"hidden_dim": config_values[1],
"n_layers": config_values[2],
"n_heads": config_values[3],
"n_kv_heads": config_values[4],
"vocab_size": abs(
config_values[5]
), # account for possible negative vocab_size
"seq_len": config_values[6],
}
return config
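

# For reference, stories15M.bin unpacks to
#   {"dim": 288, "hidden_dim": 768, "n_layers": 6, "n_heads": 6,
#    "n_kv_heads": 6, "vocab_size": 32000, "seq_len": 256}
# which reproduces the 15M column of the tables in the README.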


def calculate_memory(config: dict[str, int]) -> dict[str, dict[str, float]]:
"""Calculate required memory for all the LLM components"""
# Tokenizer
vocab_memory = config["vocab_size"] * SIZE_OF_POINTER
vocab_scores_memory = config["vocab_size"] * SIZE_OF_FLOAT

# TransformerWeights
head_size = config["dim"] / config["n_heads"]
n_layers = config["n_layers"]

token_embedding_table = config["vocab_size"] * config["dim"] * SIZE_OF_FLOAT
rms_att_weight = n_layers * config["dim"] * SIZE_OF_FLOAT
wq = n_layers * config["dim"] * (config["n_heads"] * head_size) * SIZE_OF_FLOAT
wk = n_layers * config["dim"] * (config["n_kv_heads"] * head_size) * SIZE_OF_FLOAT
wv = wk # Same as wk
wo = n_layers * (config["n_heads"] * head_size) * config["dim"] * SIZE_OF_FLOAT
rms_ffn_weight = n_layers * config["dim"] * SIZE_OF_FLOAT
w1 = n_layers * config["dim"] * config["hidden_dim"] * SIZE_OF_FLOAT
w2 = n_layers * config["hidden_dim"] * config["dim"] * SIZE_OF_FLOAT
w3 = w1 # Same as w1
rms_final_weight = config["dim"] * SIZE_OF_FLOAT
wcls = config["vocab_size"] * config["dim"] * SIZE_OF_FLOAT

# RunState
kv_dim = (config["dim"] * config["n_kv_heads"]) / config["n_heads"]
x = config["dim"] * SIZE_OF_FLOAT
xb = x # Same as x
xb2 = x # Same as x
hb = config["hidden_dim"] * SIZE_OF_FLOAT
hb2 = hb # Same as hb
q = x # Same as x
k = kv_dim * SIZE_OF_FLOAT
v = k # Same as k
att = config["n_heads"] * config["seq_len"] * SIZE_OF_FLOAT
logits = config["vocab_size"] * SIZE_OF_FLOAT
key_cache = n_layers * config["seq_len"] * kv_dim * SIZE_OF_FLOAT
value_cache = key_cache # Same as key_cache

# Calculate total memory usage for Tokenizer, TransformerWeights and RunState
total_tokenizer = vocab_memory + vocab_scores_memory + SIZE_OF_BYTE_PIECES

total_transformer_weights = sum(
[
token_embedding_table,
rms_att_weight,
wq,
wk,
wv,
wo,
rms_ffn_weight,
w1,
w2,
w3,
rms_final_weight,
wcls,
]
)
total_run_state = sum(
[x, xb, xb2, hb, hb2, q, k, v, att, logits, key_cache, value_cache]
)

# Collate the results in a dictionary
data: dict[str, dict[str, float]] = {
"Tokenizer Memory (per model)": {
"vocab_memory": vocab_memory / (1024 * 1024),
"vocab_scores_memory": vocab_scores_memory / (1024 * 1024),
},
"TransformerWeights Memory (per model)": {
"token_embedding_table": token_embedding_table / (1024 * 1024),
"rms_att_weight": rms_att_weight / (1024 * 1024),
"wq": wq / (1024 * 1024),
"wk": wk / (1024 * 1024),
"wv": wv / (1024 * 1024),
"wo": wo / (1024 * 1024),
"rms_ffn_weight": rms_ffn_weight / (1024 * 1024),
"w1": w1 / (1024 * 1024),
"w2": w2 / (1024 * 1024),
"w3": w3 / (1024 * 1024),
"rms_final_weight": rms_final_weight / (1024 * 1024),
"wcls": wcls / (1024 * 1024),
},
"RunState Memory (per user)": {
"x": x / (1024 * 1024),
"xb": xb / (1024 * 1024),
"xb2": xb2 / (1024 * 1024),
"hb": hb / (1024 * 1024),
"hb2": hb2 / (1024 * 1024),
"q": q / (1024 * 1024),
"k": k / (1024 * 1024),
"v": v / (1024 * 1024),
"att": att / (1024 * 1024),
"logits": logits / (1024 * 1024),
"key_cache": key_cache / (1024 * 1024),
"value_cache": value_cache / (1024 * 1024),
},
"Total Memory": {
"Total Tokenizer Memory (per model)": total_tokenizer / (1024 * 1024),
"Total TransformerWeights Memory (per model)": total_transformer_weights
/ (1024 * 1024),
"Total RunState Memory (per user)": total_run_state / (1024 * 1024),
"Overall Total Memory": (total_transformer_weights + total_run_state)
/ (1024 * 1024),
},
}
return data


def write_data(file: TextIO, title: str, data: dict[str, dict[str, float]]) -> None:
"""Writes it all to a Markdown file"""
# Get the models for headers
headers = ["Memory Type"] + [f"{model}<br>(MB)" for model in data.keys()]

# Write the table name
file.write(f"### {title}\n\n")

# Write the headers
file.write(" | ".join(headers) + "\n")
file.write(" | ".join(["---"] * len(headers)) + "\n")

# Assuming that all models have the same memory types,
# using the first model to get the list of memory types
memory_types = list(data[next(iter(data))].keys())

totals = {model: 0.0 for model in data.keys()}

    # The three component tables get a computed "Total" row; the
    # "Total Memory" table instead gets the Canister Metrics table appended.
    is_component_table = title in (
        "Tokenizer Memory (per model)",
        "TransformerWeights Memory (per model)",
        "RunState Memory (per user)",
    )

    for mtype in memory_types:
        row_data = [mtype] + [
            f"{model_data[mtype]:.2f}" for model_data in data.values()
        ]
        file.write(" | ".join(row_data) + "\n")

        # Accumulate per-model totals for the component tables
        if is_component_table:
            for model, model_data in data.items():
                totals[model] += model_data[mtype]

    if is_component_table:
        # Add the totals row to the table
        total_row = ["Total"] + [f"{totals[model]:.2f}" for model in data.keys()]
        file.write(" | ".join(total_row) + "\n")
    else:
        # Estimate how many concurrent users fit in the canister memory that
        # remains after the model is loaded
        number_of_users = {}
for model, values in data.items():
total_available_memory = 3 * 1024 # Available canister memory in MB
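            # NOTE: 3 GB is an assumed budget, presumably leaving headroom
            # below the canister's 4 GiB wasm32 heap limit.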
total_tokenizer_memory = values["Total Tokenizer Memory (per model)"]
total_transformer_weights_memory = values[
"Total TransformerWeights Memory (per model)"
]
total_runstate_memory = values["Total RunState Memory (per user)"]

number_of_users[model] = int(
(
total_available_memory
- total_tokenizer_memory
- total_transformer_weights_memory
)
/ total_runstate_memory
)
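            # e.g. for the 110M model:
            #   int((3 * 1024 - 0.24 - 511.57) / 72.20) = 35 concurrent users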

# Write the markdown table for number of users
file.write("\n\n")
        # Get the models for headers (no MB unit here: these are user counts)
        headers = ["Canister Metrics"] + list(data.keys())

# Write the table name
file.write("### Canister Metrics\n\n")

# Write the headers
file.write(" | ".join(headers) + "\n")
file.write(" | ".join(["---"] * len(headers)) + "\n")

row_data = ["Max number of concurrent users"] + [
f"{number_of_users[model]}" for model in data.keys()
]
file.write(" | ".join(row_data) + "\n")

file.write("\n\n")


def main() -> int:
"""Reads the model.bin files and summarizes the resource requirements."""
file_paths: dict[str, Path] = {
"260K": ROOT_PATH / "stories260K/stories260K.bin",
"15M": ROOT_PATH / "models/stories15M.bin",
"42M": ROOT_PATH / "models/stories42M.bin",
"110M": ROOT_PATH / "models/stories110M.bin",
}

data = {}
for key, file_path in file_paths.items():
config: dict[str, int] = read_config_from_file(file_path)
data[key] = calculate_memory(config)

output_path = ROOT_PATH / "README_icpp_llama2_resource_requirements.md"
with open(output_path, "w") as file:
file.write("# Canister resource requirements for icpp_llama2.")
file.write("\n")
file.write("\nDo not edit this file. It is created with the command: ")
file.write("\n```bash")
file.write("\npython -m scripts.icpp_llama2_sizer")
file.write("\n```\n\n")
for key in [
"Tokenizer Memory (per model)",
"TransformerWeights Memory (per model)",
"RunState Memory (per user)",
"Total Memory",
]:
subset_data = {k: v[key] for k, v in data.items()}
write_data(file, key, subset_data)

return 0


if __name__ == "__main__":
sys.exit(main())
