
Commit

hook up the opt model inference interface into text generate
meenchen committed May 23, 2023
1 parent 222ad20 commit db54093
Showing 3 changed files with 141 additions and 146 deletions.
61 changes: 32 additions & 29 deletions experimental/transformer/include/OPTGenerate.h
@@ -1,14 +1,15 @@
+#include <algorithm>
+#include <cassert>
#include <cstdio>
-#include <vector>
-#include <string>
-#include <iostream>
#include <queue>
-#include <cassert>
-#include <unordered_map>
#include <random>
-#include <algorithm>
+#include <iostream>
+#include <string>
+#include <unordered_map>
+#include <vector>

#include "OPTForCausalLM.h"
#include "OPTTokenizer.h"
#include "operators.h"
#include "utils.h"

@@ -27,29 +28,29 @@ typedef struct OPT_token_data_array {
} OPT_token_data_array;

struct opt_params {
-    int32_t seed = -1; // RNG seed
-    int32_t n_threads = 1; // TODO: fix this
-    int32_t n_predict = 128; // new tokens to predict
-    int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx = 512; // context size
-    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_vocab = 50272; // vocabulary size
+    int32_t seed = -1;        // RNG seed
+    int32_t n_threads = 1;    // TODO: fix this
+    int32_t n_predict = 128;  // new tokens to predict
+    int32_t n_parts = -1;     // amount of model parts (-1 = determine from model dimensions)
+    int32_t n_ctx = 512;      // context size
+    int32_t n_batch = 512;    // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep = 0;       // number of tokens to keep from initial prompt
+    int32_t n_vocab = 50272;  // vocabulary size

    // sampling parameters
-    std::unordered_map<int, float> logit_bias; // logit bias for specific tokens
-    int32_t top_k = 40; // <= 0 to use vocab size
-    float top_p = 0.95f; // 1.0 = disabled
-    float tfs_z = 1.00f; // 1.0 = disabled
-    float typical_p = 1.00f; // 1.0 = disabled
-    float temp = 0.80f; // 1.0 = disabled
-    float repeat_penalty = 1.10f; // 1.0 = disabled
-    int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float frequency_penalty = 0.00f; // 0.0 = disabled
-    float presence_penalty = 0.00f; // 0.0 = disabled
-    int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float mirostat_tau = 5.00f; // target entropy
-    float mirostat_eta = 0.10f; // learning rate
+    std::unordered_map<int, float> logit_bias;  // logit bias for specific tokens
+    int32_t top_k = 40;                         // <= 0 to use vocab size
+    float top_p = 0.95f;                        // 1.0 = disabled
+    float tfs_z = 1.00f;                        // 1.0 = disabled
+    float typical_p = 1.00f;                    // 1.0 = disabled
+    float temp = 0.80f;                         // 1.0 = disabled
+    float repeat_penalty = 1.10f;               // 1.0 = disabled
+    int32_t repeat_last_n = 64;                 // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float frequency_penalty = 0.00f;            // 0.0 = disabled
+    float presence_penalty = 0.00f;             // 0.0 = disabled
+    int mirostat = 0;                           // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau = 5.00f;                 // target entropy
+    float mirostat_eta = 0.10f;                 // learning rate
};

void OPT_sample_repetition_penalty(OPT_token_data_array* candidates, const int* last_tokens, size_t last_tokens_size,
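
The OPT_sample_* declarations above are llama.cpp-style samplers driven by the opt_params fields. As a rough illustration of two common paths, here is a minimal, self-contained sketch of repetition penalty plus temperature/top-k selection; the Candidate struct is a hypothetical stand-in, since the fields of OPT_token_data_array are not visible in this diff:

#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

// Hypothetical stand-in for one entry of OPT_token_data_array (fields not shown here).
struct Candidate { int id; float logit; };

// Repetition penalty in the llama.cpp style: weaken the logits of recently
// emitted tokens so the sampler is less likely to repeat them.
void apply_repetition_penalty(std::vector<Candidate> &cands,
                              const std::vector<int> &last_tokens, float penalty) {
    for (auto &c : cands)
        for (int tok : last_tokens)
            if (c.id == tok) {
                c.logit = c.logit > 0 ? c.logit / penalty : c.logit * penalty;
                break;
            }
}

// Temperature + top-k: divide logits by temp, keep the top_k largest,
// softmax the survivors, then draw one token id.
int sample_top_k_temp(std::vector<Candidate> cands, float temp, int top_k, std::mt19937 &rng) {
    for (auto &c : cands) c.logit /= temp;
    if (top_k > 0 && top_k < (int)cands.size()) {
        std::partial_sort(cands.begin(), cands.begin() + top_k, cands.end(),
                          [](const Candidate &a, const Candidate &b) { return a.logit > b.logit; });
        cands.resize(top_k);
    }
    float max_logit = cands[0].logit;
    for (const auto &c : cands) max_logit = std::max(max_logit, c.logit);
    std::vector<float> probs;
    float sum = 0.0f;
    for (const auto &c : cands) { probs.push_back(std::exp(c.logit - max_logit)); sum += probs.back(); }
    for (auto &p : probs) p /= sum;  // normalize into a probability distribution
    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return cands[dist(rng)].id;
}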
@@ -79,6 +80,8 @@ void OPT_sample_typical(OPT_token_data_array* candidates, float p, size_t min_keep);

void OPT_sample_top_p(OPT_token_data_array* candidates, float p, size_t min_keep);

+std::vector<int> OPTGenerate(OPTForCausalLM model, std::vector<int> input_ids,
+                             const struct opt_params generation_config);

-std::vector<int> OPTGenerate(std::vector<int> input_ids,
-                             const struct opt_params generation_config);
+void OPTGenerate_interactive(OPTForCausalLM model, std::vector<int> input_ids,
+                             const struct opt_params generation_config, Encoder encoder);
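
With the model threaded through the interface, generation can now be driven end to end. A hypothetical usage sketch follows; only the OPTGenerate signature above comes from this header, while the Encoder::encode/decode calls and the model/tokenizer setup are assumptions about OPTTokenizer.h and OPTForCausalLM.h, which this diff does not show:

#include <iostream>
#include <string>
#include <vector>

#include "OPTGenerate.h"

// Hypothetical driver: encode a prompt, generate with default opt_params,
// and decode the result. Encoder::encode/decode are assumed APIs.
void generate_once(OPTForCausalLM &model, Encoder &encoder, const std::string &prompt) {
    opt_params params;  // header defaults: n_predict = 128, top_k = 40, top_p = 0.95, temp = 0.8
    std::vector<int> input_ids = encoder.encode(prompt);
    std::vector<int> output_ids = OPTGenerate(model, input_ids, params);
    std::cout << encoder.decode(output_ids) << std::endl;
}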
