
Commit

hook up the opt model inference interface into text generate
meenchen committed May 23, 2023
1 parent 222ad20 commit db54093
Showing 3 changed files with 141 additions and 146 deletions.
61 changes: 32 additions & 29 deletions experimental/transformer/include/OPTGenerate.h
@@ -1,14 +1,15 @@
+#include <algorithm>
+#include <cassert>
#include <cstdio>
-#include <vector>
-#include <string>
-#include <iostream>
#include <queue>
-#include <cassert>
-#include <unordered_map>
#include <random>
-#include <algorithm>
+#include <iostream>
+#include <string>
+#include <unordered_map>
+#include <vector>

#include "OPTForCausalLM.h"
#include "OPTTokenizer.h"
#include "operators.h"
#include "utils.h"

@@ -27,29 +28,29 @@ typedef struct OPT_token_data_array {
} OPT_token_data_array;

struct opt_params {
-    int32_t seed = -1; // RNG seed
-    int32_t n_threads = 1; // TODO: fix this
-    int32_t n_predict = 128; // new tokens to predict
-    int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx = 512; // context size
-    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_vocab = 50272; // vocabulary size
+    int32_t seed = -1;        // RNG seed
+    int32_t n_threads = 1;    // TODO: fix this
+    int32_t n_predict = 128;  // new tokens to predict
+    int32_t n_parts = -1;     // amount of model parts (-1 = determine from model dimensions)
+    int32_t n_ctx = 512;      // context size
+    int32_t n_batch = 512;    // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep = 0;       // number of tokens to keep from initial prompt
+    int32_t n_vocab = 50272;  // vocabulary size

    // sampling parameters
-    std::unordered_map<int, float> logit_bias; // logit bias for specific tokens
-    int32_t top_k = 40; // <= 0 to use vocab size
-    float top_p = 0.95f; // 1.0 = disabled
-    float tfs_z = 1.00f; // 1.0 = disabled
-    float typical_p = 1.00f; // 1.0 = disabled
-    float temp = 0.80f; // 1.0 = disabled
-    float repeat_penalty = 1.10f; // 1.0 = disabled
-    int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float frequency_penalty = 0.00f; // 0.0 = disabled
-    float presence_penalty = 0.00f; // 0.0 = disabled
-    int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float mirostat_tau = 5.00f; // target entropy
-    float mirostat_eta = 0.10f; // learning rate
+    std::unordered_map<int, float> logit_bias;  // logit bias for specific tokens
+    int32_t top_k = 40;                         // <= 0 to use vocab size
+    float top_p = 0.95f;                        // 1.0 = disabled
+    float tfs_z = 1.00f;                        // 1.0 = disabled
+    float typical_p = 1.00f;                    // 1.0 = disabled
+    float temp = 0.80f;                         // 1.0 = disabled
+    float repeat_penalty = 1.10f;               // 1.0 = disabled
+    int32_t repeat_last_n = 64;                 // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float frequency_penalty = 0.00f;            // 0.0 = disabled
+    float presence_penalty = 0.00f;             // 0.0 = disabled
+    int mirostat = 0;                           // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau = 5.00f;                 // target entropy
+    float mirostat_eta = 0.10f;                 // learning rate
};

void OPT_sample_repetition_penalty(OPT_token_data_array* candidates, const int* last_tokens, size_t last_tokens_size,
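
The OPT_sample_* declarations above are llama.cpp-style samplers driven by the opt_params fields. As a rough illustration of two common paths, here is a minimal, self-contained sketch of repetition penalty plus temperature/top-k selection; the Candidate struct is a hypothetical stand-in, since the fields of OPT_token_data_array are not visible in this diff:

#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

// Hypothetical stand-in for one entry of OPT_token_data_array (fields not shown here).
struct Candidate { int id; float logit; };

// Repetition penalty in the llama.cpp style: weaken the logits of recently
// emitted tokens so the sampler is less likely to repeat them.
void apply_repetition_penalty(std::vector<Candidate> &cands,
                              const std::vector<int> &last_tokens, float penalty) {
    for (auto &c : cands)
        for (int tok : last_tokens)
            if (c.id == tok) {
                c.logit = c.logit > 0 ? c.logit / penalty : c.logit * penalty;
                break;
            }
}

// Temperature + top-k: divide logits by temp, keep the top_k largest,
// softmax the survivors, then draw one token id.
int sample_top_k_temp(std::vector<Candidate> cands, float temp, int top_k, std::mt19937 &rng) {
    for (auto &c : cands) c.logit /= temp;
    if (top_k > 0 && top_k < (int)cands.size()) {
        std::partial_sort(cands.begin(), cands.begin() + top_k, cands.end(),
                          [](const Candidate &a, const Candidate &b) { return a.logit > b.logit; });
        cands.resize(top_k);
    }
    float max_logit = cands[0].logit;
    for (const auto &c : cands) max_logit = std::max(max_logit, c.logit);
    std::vector<float> probs;
    float sum = 0.0f;
    for (const auto &c : cands) { probs.push_back(std::exp(c.logit - max_logit)); sum += probs.back(); }
    for (auto &p : probs) p /= sum;  // normalize into a probability distribution
    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return cands[dist(rng)].id;
}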
@@ -79,6 +80,8 @@ void OPT_sample_typical(OPT_token_data_array* candidates, float p, size_t min_keep);

void OPT_sample_top_p(OPT_token_data_array* candidates, float p, size_t min_keep);

+std::vector<int> OPTGenerate(OPTForCausalLM model, std::vector<int> input_ids,
+                             const struct opt_params generation_config);

-std::vector<int> OPTGenerate(std::vector<int> input_ids,
-                             const struct opt_params generation_config);
+void OPTGenerate_interactive(OPTForCausalLM model, std::vector<int> input_ids,
+                             const struct opt_params generation_config, Encoder encoder);
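
With the model threaded through the interface, generation can now be driven end to end. A hypothetical usage sketch follows; only the OPTGenerate signature above comes from this header, while the Encoder::encode/decode calls and the model/tokenizer setup are assumptions about OPTTokenizer.h and OPTForCausalLM.h, which this diff does not show:

#include <iostream>
#include <string>
#include <vector>

#include "OPTGenerate.h"

// Hypothetical driver: encode a prompt, generate with default opt_params,
// and decode the result. Encoder::encode/decode are assumed APIs.
void generate_once(OPTForCausalLM &model, Encoder &encoder, const std::string &prompt) {
    opt_params params;  // header defaults: n_predict = 128, top_k = 40, top_p = 0.95, temp = 0.8
    std::vector<int> input_ids = encoder.encode(prompt);
    std::vector<int> output_ids = OPTGenerate(model, input_ids, params);
    std::cout << encoder.decode(output_ids) << std::endl;
}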
