Skip to content

Commit

Permalink
add OPT tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
RaymondWang0 committed May 21, 2023
1 parent 8404c56 commit 3813bb9
Show file tree
Hide file tree
Showing 6 changed files with 281 additions and 954 deletions.
4 changes: 3 additions & 1 deletion experimental/transformer/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ CXX = g++
CXXFLAGS = -std=c++17 -mavx2 -mfma -pthread -O3

# Executable and source files
TARGET = test_ops test_Int8OPTAttention test_Int8OPTDecoderLayer test_Int8OPTDecoder test_OPTForCausalLM profile_OPTForCausalLM test_ops_layer5_1.3B
TARGET = test_ops test_Int8OPTAttention test_Int8OPTDecoderLayer test_Int8OPTDecoder test_OPTForCausalLM profile_OPTForCausalLM test_ops_layer5_1.3B test_OPTTokenizer

LIB_DIR = ../matmul_optimization/src
LIB_SRC = $(wildcard $(LIB_DIR)/lib/*.cc)
Expand Down Expand Up @@ -42,6 +42,8 @@ test_ops_layer5_1.3B:
profile_OPTForCausalLM:
$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -D PROFILER -o profile_OPTForCausalLM tests/test_OPTForCausalLM.cc $(SRC)

# Builds the OPT tokenizer unit test binary from tests/test_OPTTokenizer.cc.
# NOTE(review): `-D PROFILER` looks copy-pasted from the profile_OPTForCausalLM
# target above — confirm the tokenizer *test* build really wants profiling enabled.
test_OPTTokenizer:
	$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -D PROFILER -o test_OPTTokenizer tests/test_OPTTokenizer.cc $(SRC)

# Clean up
clean:
Expand Down
44 changes: 40 additions & 4 deletions experimental/transformer/include/OPTTokenizer.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#ifndef OPT_TOKENIZER_H
#define OPT_TOKENIZER_H

#include <cstdio>
#include <vector>
#include <string>
Expand All @@ -14,10 +17,14 @@
#include <sstream>
#include <fstream>
#include <utility>
//#include <boost/regex.hpp>
//#include <nlohmann/json.hpp>
#include <regex>
#include <boost/regex.hpp>
#include <nlohmann/json.hpp>

//#include "OPT.h"

//std::vector<int> OPT_tokenize(const OPT_vocab & vocab, const std::string & text, bool add_bos);

#include "OPT.h"

/* TODO */
inline int n_ctx = 1024;
Expand All @@ -35,4 +42,33 @@ inline int OPT_token_nl() {
return 13;
}

std::vector<int> OPT_tokenize(const OPT_vocab & vocab, const std::string & text, bool add_bos);
/* Hash functor for std::pair keys (used by Encoder::bpe_ranks).
 *
 * Combines the two element hashes asymmetrically in the style of
 * boost::hash_combine. The original `h1 ^ h2` is a poor combiner:
 * it is symmetric — (a, b) and (b, a) always collide — and any pair
 * of equal elements hashes to 0, which degrades the unordered_map of
 * BPE merge ranks into long collision chains. */
struct pair_hash {
    template <class T1, class T2>
    std::size_t operator () (const std::pair<T1,T2> &p) const {
        const std::size_t h1 = std::hash<T1>{}(p.first);
        const std::size_t h2 = std::hash<T2>{}(p.second);
        // boost::hash_combine: golden-ratio constant plus shifts mixes
        // h2 into h1 so element order affects the result.
        return h1 ^ (h2 + 0x9e3779b9u + (h1 << 6) + (h1 >> 2));
    }
};

/* GPT-2-style byte-pair-encoding tokenizer interface for OPT.
 * Declaration only — implementations live in the corresponding .cc file.
 * Holds the token vocabulary (string <-> id), the byte-level unicode
 * mapping, and the BPE merge ranks used to split text into subword tokens. */
class Encoder {
public:
    // Takes the string->id vocabulary and the ordered list of BPE merge
    // pairs (earlier entries = higher-priority merges).
    Encoder(std::map<std::string, int> encoder, std::vector<std::pair<std::string, std::string>> bpe_merges);
    // Byte value -> printable unicode string mapping (GPT-2 byte encoder scheme).
    std::unordered_map<int, std::string> bytes_to_unicode();
    // All adjacent symbol pairs occurring in `word` — candidates for the next merge.
    std::set<std::pair<std::string, std::string>> get_pairs(std::vector<std::string> word);
    // Applies BPE merges to a single pre-tokenized unit; result is cached.
    std::string bpe(std::string token);
    // Text -> token ids.
    std::vector<int> encode(std::string text);
    // Token ids -> text (inverse of encode).
    std::string decode(std::vector<int> tokens);

private:
    std::map<std::string, int> encoder;                 // token string -> id
    std::map<int, std::string> decoder;                 // id -> token string (inverse of encoder)
    std::unordered_map<int, std::string> byte_encoder;  // raw byte -> unicode surrogate string
    std::unordered_map<std::string, int> byte_decoder;  // unicode surrogate string -> raw byte
    // Merge pair -> rank (lower rank = merge earlier); keyed with pair_hash.
    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;
    std::unordered_map<std::string, std::string> cache; // memoized bpe() results
};

Encoder get_encoder(std::string vocab_file, std::string bpe_file);

#endif
Loading

0 comments on commit 3813bb9

Please sign in to comment.