Skip to content

Commit

Permalink
add OPT tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
RaymondWang0 committed May 21, 2023
1 parent 8404c56 commit 3813bb9
Show file tree
Hide file tree
Showing 6 changed files with 281 additions and 954 deletions.
4 changes: 3 additions & 1 deletion experimental/transformer/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ CXX = g++
CXXFLAGS = -std=c++17 -mavx2 -mfma -pthread -O3

# Executable and source files
TARGET = test_ops test_Int8OPTAttention test_Int8OPTDecoderLayer test_Int8OPTDecoder test_OPTForCausalLM profile_OPTForCausalLM test_ops_layer5_1.3B
TARGET = test_ops test_Int8OPTAttention test_Int8OPTDecoderLayer test_Int8OPTDecoder test_OPTForCausalLM profile_OPTForCausalLM test_ops_layer5_1.3B test_OPTTokenizer

LIB_DIR = ../matmul_optimization/src
LIB_SRC = $(wildcard $(LIB_DIR)/lib/*.cc)
Expand Down Expand Up @@ -42,6 +42,8 @@ test_ops_layer5_1.3B:
profile_OPTForCausalLM:
$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -D PROFILER -o profile_OPTForCausalLM tests/test_OPTForCausalLM.cc $(SRC)

# Builds the OPT tokenizer unit test binary from tests/test_OPTTokenizer.cc.
# NOTE(review): `-D PROFILER` looks copy-pasted from the profile_OPTForCausalLM
# target above — confirm the tokenizer *test* build really wants profiling enabled.
test_OPTTokenizer:
	$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -D PROFILER -o test_OPTTokenizer tests/test_OPTTokenizer.cc $(SRC)

# Clean up
clean:
Expand Down
44 changes: 40 additions & 4 deletions experimental/transformer/include/OPTTokenizer.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#ifndef OPT_TOKENIZER_H
#define OPT_TOKENIZER_H

#include <cstdio>
#include <vector>
#include <string>
Expand All @@ -14,10 +17,14 @@
#include <sstream>
#include <fstream>
#include <utility>
//#include <boost/regex.hpp>
//#include <nlohmann/json.hpp>
#include <regex>
#include <boost/regex.hpp>
#include <nlohmann/json.hpp>

//#include "OPT.h"

//std::vector<int> OPT_tokenize(const OPT_vocab & vocab, const std::string & text, bool add_bos);

#include "OPT.h"

/* TODO */
inline int n_ctx = 1024;
Expand All @@ -35,4 +42,33 @@ inline int OPT_token_nl() {
return 13;
}

std::vector<int> OPT_tokenize(const OPT_vocab & vocab, const std::string & text, bool add_bos);
/* Hash functor for std::pair keys (used by Encoder::bpe_ranks).
 *
 * Combines the two element hashes asymmetrically in the style of
 * boost::hash_combine. The original `h1 ^ h2` is a poor combiner:
 * it is symmetric — (a, b) and (b, a) always collide — and any pair
 * of equal elements hashes to 0, which degrades the unordered_map of
 * BPE merge ranks into long collision chains. */
struct pair_hash {
    template <class T1, class T2>
    std::size_t operator () (const std::pair<T1,T2> &p) const {
        const std::size_t h1 = std::hash<T1>{}(p.first);
        const std::size_t h2 = std::hash<T2>{}(p.second);
        // boost::hash_combine: golden-ratio constant plus shifts mixes
        // h2 into h1 so element order affects the result.
        return h1 ^ (h2 + 0x9e3779b9u + (h1 << 6) + (h1 >> 2));
    }
};

/* GPT-2-style byte-pair-encoding tokenizer interface for OPT.
 * Declaration only — implementations live in the corresponding .cc file.
 * Holds the token vocabulary (string <-> id), the byte-level unicode
 * mapping, and the BPE merge ranks used to split text into subword tokens. */
class Encoder {
public:
    // Takes the string->id vocabulary and the ordered list of BPE merge
    // pairs (earlier entries = higher-priority merges).
    Encoder(std::map<std::string, int> encoder, std::vector<std::pair<std::string, std::string>> bpe_merges);
    // Byte value -> printable unicode string mapping (GPT-2 byte encoder scheme).
    std::unordered_map<int, std::string> bytes_to_unicode();
    // All adjacent symbol pairs occurring in `word` — candidates for the next merge.
    std::set<std::pair<std::string, std::string>> get_pairs(std::vector<std::string> word);
    // Applies BPE merges to a single pre-tokenized unit; result is cached.
    std::string bpe(std::string token);
    // Text -> token ids.
    std::vector<int> encode(std::string text);
    // Token ids -> text (inverse of encode).
    std::string decode(std::vector<int> tokens);

private:
    std::map<std::string, int> encoder;                 // token string -> id
    std::map<int, std::string> decoder;                 // id -> token string (inverse of encoder)
    std::unordered_map<int, std::string> byte_encoder;  // raw byte -> unicode surrogate string
    std::unordered_map<std::string, int> byte_decoder;  // unicode surrogate string -> raw byte
    // Merge pair -> rank (lower rank = merge earlier); keyed with pair_hash.
    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;
    std::unordered_map<std::string, std::string> cache; // memoized bpe() results
};

Encoder get_encoder(std::string vocab_file, std::string bpe_file);

#endif
Loading

0 comments on commit 3813bb9

Please sign in to comment.