-
Notifications
You must be signed in to change notification settings - Fork 8
/
get_tokens.py
36 lines (29 loc) · 967 Bytes
/
get_tokens.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Extract the token vocabulary stored in sp.dat and write it to vocab.txt,
# one token per line (no trailing newline).

SP_DAT_PATH = "/System/Library/LinguisticData/RequiredAssets_en.bundle/AssetData/en.lm/unilm.bundle/sp.dat"
VOCAB_PATH = "vocab.txt"
PAD_TOKEN = b"<pad>"
MAX_TOKENS = 15000


def find_first_token_offset(data, marker=PAD_TOKEN):
    """Return the offset in *data* where the vocabulary starts.

    The vocabulary begins with the ``<pad>`` token. The first occurrence of
    the marker bytes is skipped and the second occurrence is taken as the
    start of the vocabulary.

    Args:
        data: Raw bytes of the sp.dat file.
        marker: Byte string identifying the first vocabulary token.

    Returns:
        Index of the second occurrence of *marker* in *data*.

    Raises:
        ValueError: If *marker* does not occur at least twice.
    """
    first_hit = data.find(marker)
    # When first_hit is -1 this searches from 0 and still yields -1.
    offset = data.find(marker, first_hit + 1)
    if offset == -1:
        raise ValueError(
            "Could not find <pad> token. You may need to update to macOS Sonoma."
        )
    return offset


def parse_tokens(data, start, max_tokens=MAX_TOKENS):
    """Decode NUL-terminated UTF-8 tokens from *data*, beginning at *start*.

    Args:
        data: Raw bytes containing the tokens.
        start: Offset of the first byte of the first token.
        max_tokens: Stop after this many tokens have been collected.

    Returns:
        A list of at most *max_tokens* strings. Empty tokens (two adjacent
        NUL bytes) are kept. Bytes after the last NUL are not terminated
        and are therefore not returned.

    Raises:
        UnicodeDecodeError: If a token is not valid UTF-8.
    """
    pieces = data[start:].split(b"\x00", max_tokens)
    # The last piece is whatever follows the final split point: either an
    # unterminated fragment or everything past the token limit. Drop it.
    return [piece.decode("utf-8") for piece in pieces[:-1]]


def write_vocab(tokens, path=VOCAB_PATH):
    """Write *tokens* to *path*, newline-separated, without a trailing newline.

    The file is written as UTF-8 explicitly so the output does not depend on
    the locale's default encoding.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(tokens))


def main():
    """Read sp.dat, extract the vocabulary, and write it to vocab.txt."""
    with open(SP_DAT_PATH, "rb") as f:
        data = f.read()
    tokens = parse_tokens(data, find_first_token_offset(data))
    write_vocab(tokens)


if __name__ == "__main__":
    main()