tokenizer.py
import string

from nltk.corpus import stopwords
from iteration_utilities import deepflatten

ex = "This is 3 tokeni#zer example w?ith shreeny a_mounts of words"


class Tokenizer:  # For English only
    def __init__(self, texts: list[str], vocab_size=100000) -> None:
        self.texts = texts
        self.vocab_size = vocab_size  # currently unused
        self.stopwords = set(stopwords.words("english"))
        self.corpus = {}  # word -> frequency across all texts
        self.operators = ["</w>", "<BOS>", "<EOS>"]
        self.preproc()
        self.mapping()
        self.indexer()

    def preproc(self):
        ## Reject a bare string: a single text must still be wrapped in a list
        if isinstance(self.texts, str):
            raise TypeError("Given string must be encapsulated in a list")
        ## Lowercase, strip punctuation, drop stopwords, and wrap each text
        ## in <BOS>/<EOS> markers while counting word frequencies
        punct_table = str.maketrans("", "", string.punctuation)
        processed_texts = []
        for text in self.texts:
            words = text.lower().split()
            filtered = ["<BOS>"]
            for word in words:
                # Strip punctuation first so e.g. "w?ith" matches the stopword "with"
                word = word.translate(punct_table)
                if not word or word in self.stopwords:
                    continue
                filtered.append(word)
                self.corpus[word] = self.corpus.get(word, 0) + 1
            filtered.append("<EOS>")
            processed_texts.append(filtered)
        self.preproced = processed_texts

    def mapping(self):
        ## Split every word into characters terminated by an end-of-word marker
        tokenized_texts = []
        for text in self.preproced:
            processed_chars = []
            for word in text:
                if word in self.operators:
                    processed_chars.append(word)
                    continue
                processed_chars.append([*word, "</w>"])
            tokenized_texts.append(processed_chars)
        # Flatten all texts (not only the first) into one character stream;
        # depth=2 unwraps the per-text lists and the per-word character lists
        self.tokenized_text = list(deepflatten(tokenized_texts, depth=2, types=list))
        # Character counts, ignoring the special operator tokens
        chars = {}
        for char in self.tokenized_text:
            if char in self.operators:
                continue
            chars[char] = chars.get(char, 0) + 1
        # Map characters to integer ids, most frequent first
        sorted_chars = sorted(chars, key=chars.get, reverse=True)
        self.charmap = {char: i for i, char in enumerate(sorted_chars)}
        self.inv_charmap = {i: char for char, i in self.charmap.items()}

    def indexer(self):
        # Replace each character with its id; operator tokens stay as strings
        self.indexed = [
            self.charmap[i] if i not in self.operators else i
            for i in self.tokenized_text
        ]
        print(self.indexed)


token = Tokenizer([ex])
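
# Illustrative decoding sketch (not part of the original file): `inv_charmap`
# maps ids back to characters, so the indexed stream can be rebuilt by
# joining characters until an "</w>" end-of-word marker is reached.
decoded_words = []
current = []
for tok in token.indexed:
    char = tok if isinstance(tok, str) else token.inv_charmap[tok]
    if char in ("<BOS>", "<EOS>"):
        continue
    if char == "</w>":
        decoded_words.append("".join(current))
        current = []
    else:
        current.append(char)
print(decoded_words)
# Expected for the example text above:
# ['3', 'tokenizer', 'example', 'shreeny', 'amounts', 'words']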