From 6b15cfde6d0b1c7fd9178cd4caa47e8ea5ff6640 Mon Sep 17 00:00:00 2001 From: rexwang8 Date: Tue, 16 Jul 2024 11:35:34 -0400 Subject: [PATCH 1/6] refactor: separate file load and process --- gpt_bpe.go | 56 +++----- gpt_bpe_test.go | 23 +++ resources/resolver.go | 314 ++++++++++++++++++++++++++--------------- resources/resources.go | 2 + 4 files changed, 241 insertions(+), 154 deletions(-) diff --git a/gpt_bpe.go b/gpt_bpe.go index 6ce00be..927639c 100644 --- a/gpt_bpe.go +++ b/gpt_bpe.go @@ -147,8 +147,8 @@ func NewMistralEncoder() GPTEncoder { // Returns a GPTEncoder with the tokenizer data loaded for that vocabulary // id. func NewEncoder(vocabId string) (*GPTEncoder, error) { - hfConfig, resourcesPtr, vocabErr := resources.ResolveVocabId(vocabId, - "") + hfConfig, resourcesPtr, vocabErr := resources.ResolveVocabId(vocabId, "") + if vocabErr != nil { return nil, vocabErr } @@ -176,32 +176,6 @@ func NewEncoder(vocabId string) (*GPTEncoder, error) { } } - tokenizerSpecialConfig := resources.TokenizerSpecialsConfig{ - AddBosToken: false, - AddEosToken: false, - PadToken: "", - } - altMistralSpecialsConfig := resources.MistralSpecialsConfig{ - AddBosToken: false, - AddEosToken: false, - PadToken: "", - } - if special, ok := (rsrcs)["tokenizer_config.json"]; ok { - if special.Data != nil { - err := json.Unmarshal(*special.Data, &tokenizerSpecialConfig) - if err != nil { - err = json.Unmarshal(*special.Data, &altMistralSpecialsConfig) - if err != nil { - log.Fatal("Error unmarshalling tokenizer_config.json") - } - //populate the tokenizerSpecialConfig from the altMistralSpecialsConfig - tokenizerSpecialConfig.AddBosToken = altMistralSpecialsConfig.AddBosToken - tokenizerSpecialConfig.AddEosToken = altMistralSpecialsConfig.AddEosToken - tokenizerSpecialConfig.PadToken = altMistralSpecialsConfig.PadToken - } - } - } - puncRunes := make([]rune, 0) if specialConfig.PuncRunes != nil { for _, r := range specialConfig.PuncRunes { @@ -364,23 +338,27 @@ func NewEncoder(vocabId 
string) (*GPTEncoder, error) { } if specialConfig.EncloseEosBos { - tokenizerSpecialConfig.AddBosToken = true - tokenizerSpecialConfig.AddEosToken = true + eosBosBool := true + hfConfig.AddBosToken = &eosBosBool + hfConfig.AddEosToken = &eosBosBool } // Add in default pad token if not already set - padTokenNotFound := (tokenizerSpecialConfig.PadToken == "" && hfConfig.PadTokenStr == nil) + padTokenNotFound := (hfConfig.PadTokenStr == nil) if padTokenNotFound { // Inject the pad token into the encoder to uintmax16, // throw an error if vocab is larger than uintmax16 - if len(encoderTokens) >= math.MaxInt16 { - log.Fatalf("Vocab size is larger than uint16 max, default pad token cannot be added." + - "Please specify a pad token in the vocab file.") + if len(encoderTokens) >= math.MaxUint16 { + log.Fatalf("Vocab size of %d is larger than uint16 max of %d. "+ + "Please specify a pad token in the vocab file.", + len(encoderTokens), math.MaxUint16) } - encoderTokens[defaultPadTokenString] = math.MaxUint16 - tokenizerSpecialConfig.PadToken = defaultPadTokenString - hfConfig.PadTokenStr = &tokenizerSpecialConfig.PadToken + padToken := defaultPadTokenString + encoderTokens[padToken] = math.MaxUint16 + hfConfig.PadTokenStr = &padToken } + + // Create the encoder encoder := &GPTEncoder{ encoderTokens, tokensEncoder, @@ -403,8 +381,8 @@ func NewEncoder(vocabId string) (*GPTEncoder, error) { encoderTokens[*hfConfig.EosTokenStr], encoderTokens[*hfConfig.PadTokenStr], specialConfig.EncloseEosBos, - tokenizerSpecialConfig.AddBosToken, - tokenizerSpecialConfig.AddEosToken, + *hfConfig.AddBosToken, + *hfConfig.AddEosToken, specialConfig.PrefixSpace, specialConfig.LowerCase, specialConfig.EndOfWord, diff --git a/gpt_bpe_test.go b/gpt_bpe_test.go index 379b8f3..56c4a95 100644 --- a/gpt_bpe_test.go +++ b/gpt_bpe_test.go @@ -884,6 +884,29 @@ func TestReadTokenizerConfig(t *testing.T) { fmt.Println("All Exists - Looks good.") } +func TestPythiaRemoteDownloadTokenizer(t *testing.T) { + // 
Tests the ability to download a tokenizer from a remote model + // and use it to encode and decode strings + modelId := "EleutherAI/pythia-70m" + destPath := "./TestPythiaRemoteDownloadTokenizer" + defer os.RemoveAll(destPath) + encoderPythia, err := NewEncoder(modelId) + if err != nil { + t.Errorf("Error creating encoder: %v", err) + } + + // Attempt to tokenize + testString := "The fox jumped over the hare.\nThe turtle is faster than the hare." + + // Encode the string + encoded := encoderPythia.Encode(&testString) + // Check that the encoded string is the same as the expected - Reference from python's transformers lib + expected := Tokens{510, 30013, 16780, 689, 253, 419, 250, 15, 187, 510, 45993, 310, 7938, 685, 253, 419, 250, 15} + if !assert.Equal(t, expected, *encoded) { + t.Errorf("Expected: %v\nActual: %v", expected, *encoded) + } +} + func TestGPTDecoder_Decode(t *testing.T) { // TBD } diff --git a/resources/resolver.go b/resources/resolver.go index 1573261..988d5c9 100644 --- a/resources/resolver.go +++ b/resources/resolver.go @@ -677,6 +677,8 @@ type HFConfig struct { VocabSize *uint16 `json:"vocab_size,omitempty"` Newlinemode *string `json:"newlinemode,omitempty"` TokenizerClass *string `json:"tokenizer_class"` + AddBosToken *bool `json:"add_bos_token,omitempty"` + AddEosToken *bool `json:"add_eos_token,omitempty"` } // Additional special tokenizer configuration. 
@@ -691,35 +693,6 @@ type SpecialConfig struct { SplitRegex *string `json:"split_regex"` } -// TokenizerConfig file, new HF format -type TokenizerSpecialsConfig struct { - AddBosToken bool `json:"add_bos_token,omitempty"` - BosToken TokenizerSpecials `json:"bos_token,omitempty"` - EosToken TokenizerSpecials `json:"eos_token,omitempty"` - AddEosToken bool `json:"add_eos_token,omitempty"` - PadToken string `json:"pad_token,omitempty"` -} - -// sub type of TokenizerSpecialsConfig, for eos, bos, pad tokens -type TokenizerSpecials struct { - Type string `json:"__type,omitempty"` - Content string `json:"content,omitempty"` - Lstrip bool `json:"lstrip,omitempty"` - Normalized bool `json:"normalized,omitempty"` - Rstrip bool `json:"rstrip,omitempty"` - Single_word bool `json:"single_word,omitempty"` -} - -// MistralConfig contains an additional level, we use a seperate struct to -// unmarshal the config file. -type MistralSpecialsConfig struct { - AddBosToken bool `json:"add_bos_token,omitempty"` - AddEosToken bool `json:"add_eos_token,omitempty"` - BosToken string `json:"bos_token,omitempty"` - EosToken string `json:"eos_token,omitempty"` - PadToken string `json:"pad_token,omitempty"` -} - // Processor stores config to process one step of the pipeline type Processor struct { ProcessorType string @@ -736,14 +709,13 @@ func (p *Processor) Process(input interface{}) (interface{}, error) { } } -// ResolveConfig +// ResolveResourcesFromRemoteOrLocal // Resolves a given vocabulary id, and returns the corresponding HuggingFace // configuration, and the resources for the tokenizer. 
-func ResolveConfig(vocabId string, token string) (config *HFConfig, - resources *Resources, err error) { +func ResolveResourcesFromRemoteOrLocal(vocabId string, token string) (resources *Resources, err error) { dir, dirErr := ioutil.TempDir("", "resources") if dirErr != nil { - return nil, nil, dirErr + return nil, dirErr } defer os.RemoveAll(dir) rslvdResources, rsrcErr := ResolveResources( @@ -753,47 +725,11 @@ func ResolveConfig(vocabId string, token string) (config *HFConfig, RESOURCETYPE_TRANSFORMERS, token) if rsrcErr != nil { - return nil, nil, rsrcErr + return nil, rsrcErr } else { resources = rslvdResources } - - var hfConfig *HFConfig - if configErr := json.Unmarshal(*((*resources)["config.json"]).Data, - &hfConfig); configErr != nil { - resources.Cleanup() - return nil, nil, errors.New(fmt.Sprintf( - "error unmarshalling config.json: %s", configErr)) - } - - specialTokens, specialsErr := resources.ResolveSpecialTokens(dir) - if specialsErr != nil { - resources.Cleanup() - return nil, nil, specialsErr - } - defaultTkn := "<|endoftext|>" - eosToken, ok := specialTokens["eos_token"] - if !ok { - eosToken = defaultTkn - } - hfConfig.EosTokenStr = &eosToken - padToken, ok := specialTokens["pad_token"] - if !ok { - padToken = defaultTkn - } - hfConfig.PadTokenStr = &padToken - bosToken, ok := specialTokens["bos_token"] - if !ok { - bosToken = defaultTkn - } - hfConfig.BosTokenStr = &bosToken - - hfConfig, err = ResolveHFFromResources(resources, hfConfig) - if err != nil { - return nil, nil, err - } - - return hfConfig, resources, nil + return resources, nil } @@ -814,6 +750,114 @@ func ResolveHFFromResources(resources *Resources, hfConfig *HFConfig) (*HFConfig if err != nil { return nil, err } + + // Resolve Vocab size from vocab.json or encoder.json + hfConfig, err = resolveVocabSize(resources, hfConfig) + if err != nil { + return nil, err + } + + // Sometimes TokenIDs are not properly resolved, so we need to check + if hfConfig != nil { + if 
hfConfig.EosTokenId == nil || hfConfig.BosTokenId == nil || hfConfig.PadTokenId == nil { + hfConfig, err = resolveTokenIds(resources, hfConfig) + if err != nil { + return nil, err + } + } + } + return hfConfig, nil +} + +// resolveTokenIds +// Resolve token ids for eos, bos, and pad tokens from resources. +func resolveTokenIds(resources *Resources, hfConfig *HFConfig) (*HFConfig, error) { + // Use interfaces to unmarshal the vocab file from resources + var vocab interface{} + if _, err := resources.GetFile("vocab.json"); err == nil { + if err := json.Unmarshal(*((*resources)["vocab.json"]).Data, &vocab); err != nil { + fmt.Errorf("Error unmarshalling vocab.json: %s", err) + return nil, err + } + } else { + if _, err := resources.GetFile("encoder.json"); err == nil { + if err := json.Unmarshal(*((*resources)["encoder.json"]).Data, &vocab); err != nil { + fmt.Errorf("Error unmarshalling encoder.json: %s", err) + return nil, err + } + } else { + log.Printf("Vocab file not found, will attempt to skip\n") + } + } + + // Get the token ids for eos, bos, and pad tokens + if vocab != nil { + if vocabMap, ok := vocab.(map[string]interface{}); ok { + if hfConfig.EosTokenStr != nil { + if eosToken, ok := vocabMap[*(hfConfig.EosTokenStr)]; ok { + if eosTokenInt, ok := eosToken.(float64); ok { + hfConfig.EosTokenId = new(uint16) + *hfConfig.EosTokenId = uint16(eosTokenInt) + } + } + } + if hfConfig.BosTokenStr != nil { + if bosToken, ok := vocabMap[*(hfConfig.BosTokenStr)]; ok { + if bosTokenInt, ok := bosToken.(float64); ok { + hfConfig.BosTokenId = new(uint16) + *hfConfig.BosTokenId = uint16(bosTokenInt) + } + } + } + + if hfConfig.PadTokenStr != nil { + if padToken, ok := vocabMap[*(hfConfig.PadTokenStr)]; ok { + if padTokenInt, ok := padToken.(float64); ok { + hfConfig.PadTokenId = new(uint16) + *hfConfig.PadTokenId = uint16(padTokenInt) + } + } + } + } + } + + return hfConfig, nil +} + +// resolveVocabSize +// Resolve vocab size from resources. 
+// Used to be able to resolve both embedded and local resources. +// Continuation of ResolveHFFromResources. +func resolveVocabSize(resources *Resources, hfConfig *HFConfig) (*HFConfig, error) { + // Use interfaces to unmarshal the vocab file + var vocab interface{} + // If exists, unmarshal vocab.json, else + // use GetFile to get the file, then unmarshal it + if _, err := resources.GetFile("vocab.json"); err == nil { + if err := json.Unmarshal(*((*resources)["vocab.json"]).Data, &vocab); err != nil { + fmt.Errorf("Error unmarshalling vocab.json: %s", err) + return nil, err + } + } else { + if _, err := resources.GetFile("encoder.json"); err == nil { + if err := json.Unmarshal(*((*resources)["encoder.json"]).Data, &vocab); err != nil { + fmt.Errorf("Error unmarshalling encoder.json: %s", err) + return nil, err + } + } else { + log.Printf("Vocab file not found, will attempt to skip\n") + } + } + + // Get length of vocab + var vocabLen *uint16 + if vocab != nil { + if vocabMap, ok := vocab.(map[string]interface{}); ok { + vocabLen = new(uint16) + *vocabLen = uint16(len(vocabMap)) + } + } + hfConfig.VocabSize = vocabLen return hfConfig, nil } @@ -850,7 +894,10 @@ func resolveConfigAndTokenizerConfig(resources *Resources, hfConfig *HFConfig) ( // If not, try to unmarshal to the tokenizerSpecials // that llama 2 has, else try mistral format if config != nil || tokenizerConfig != nil { - hasReadConfig := false + hasReadForEOSBOS := false + hasReadForVocabSize := false + + // Read config.json if config != nil { // Using interfaces, first check if bos_token is in string format if bosToken, ok := config.(map[string]interface{})["bos_token"].(string); ok { @@ -861,24 +908,47 @@ func resolveConfigAndTokenizerConfig(resources *Resources, hfConfig *HFConfig) ( if padToken, ok := config.(map[string]interface{})["pad_token"].(string); ok { hfConfig.PadTokenStr = &padToken } - hasReadConfig = true + hasReadForEOSBOS = true } - } - if tokenizerConfig != nil && !hasReadConfig { - 
// Using interfaces, first check if bos_token is in string format - if bosToken, ok := tokenizerConfig.(map[string]interface{})["bos_token"].(string); ok { - hfConfig.BosTokenStr = &bosToken - if eosToken, ok := tokenizerConfig.(map[string]interface{})["eos_token"].(string); ok { - hfConfig.EosTokenStr = &eosToken - } - if padToken, ok := tokenizerConfig.(map[string]interface{})["pad_token"].(string); ok { - hfConfig.PadTokenStr = &padToken + + // Read for EOS BOS token ID + if eosTokenId, ok := config.(map[string]interface{})["eos_token_id"].(float64); ok { + eosTokenIdInt := uint16(eosTokenId) + hfConfig.EosTokenId = &eosTokenIdInt + } + if bosTokenId, ok := config.(map[string]interface{})["bos_token_id"].(float64); ok { + bosTokenIdInt := uint16(bosTokenId) + hfConfig.BosTokenId = &bosTokenIdInt + } + + // Read for vocab size + if !hasReadForVocabSize { + if vocabSize, ok := config.(map[string]interface{})["vocab_size"].(float64); ok { + vocabSizeInt := uint16(vocabSize) + hfConfig.VocabSize = &vocabSizeInt + hasReadForVocabSize = true } - hasReadConfig = true + } + } + + // Read tokenizer_config.json + if tokenizerConfig != nil { + if !hasReadForEOSBOS { + // Using interfaces, first check if bos_token is in string format + if bosToken, ok := tokenizerConfig.(map[string]interface{})["bos_token"].(string); ok { + hfConfig.BosTokenStr = &bosToken + if eosToken, ok := tokenizerConfig.(map[string]interface{})["eos_token"].(string); ok { + hfConfig.EosTokenStr = &eosToken + } + if padToken, ok := tokenizerConfig.(map[string]interface{})["pad_token"].(string); ok { + hfConfig.PadTokenStr = &padToken + } + hasReadForEOSBOS = true + } } // If not, assume llama2 format and try to unmarshal - if !hasReadConfig { + if !hasReadForEOSBOS { cfg := tokenizerConfig.(map[string]interface{}) if bosToken, ok := cfg["bos_token"].(map[string]interface{}); ok { if content, ok := bosToken["content"].(string); ok { @@ -895,7 +965,7 @@ func resolveConfigAndTokenizerConfig(resources 
*Resources, hfConfig *HFConfig) ( } } // If that doesn't work, assume mistral format - if !hasReadConfig { + if !hasReadForEOSBOS { if bosToken, ok := tokenizerConfig.(map[string]interface{})["bos_token"].(string); ok { hfConfig.BosTokenStr = &bosToken } @@ -906,6 +976,15 @@ func resolveConfigAndTokenizerConfig(resources *Resources, hfConfig *HFConfig) ( hfConfig.PadTokenStr = &padToken } } + + // Read for enclose eos bos + if encloseEos, ok := tokenizerConfig.(map[string]interface{})["add_bos_token"].(bool); ok { + hfConfig.AddBosToken = &encloseEos + } + + if encloseBos, ok := tokenizerConfig.(map[string]interface{})["add_eos_token"].(bool); ok { + hfConfig.AddEosToken = &encloseBos + } } } @@ -952,20 +1031,12 @@ func resolveSpecialsAndSpecialTokens(resources *Resources, hfConfig *HFConfig) ( return hfConfig, nil } -// ResolveVocabId -// Resolves a vocabulary id to a set of resources, from embedded, -// local filesystem, or remote. -func ResolveVocabId(vocabId string, token string) (*HFConfig, *Resources, error) { - var resolvedVocabId string - log.Printf("Resolving vocab id: %s\n", vocabId) +// ResolveResourcesList +// Resolves a list of resources, and checks if they exist in the given directory. +// If they don't exist, they are downloaded. +func ResolveResourcesList(vocabId string, token string) (*Resources, error) { + // Resolve the vocab id - Embedded resources if _, vocabErr := EmbeddedDirExists(vocabId); vocabErr == nil { - endOfText := "<|endoftext|>" - bosText := "<|startoftext|>" - hf := &HFConfig{ - ModelId: &vocabId, - BosTokenStr: &bosText, - EosTokenStr: &endOfText, - } resources := make(Resources, 0) if config := GetEmbeddedResource(vocabId + "/encoder." 
+ @@ -993,33 +1064,46 @@ func ResolveVocabId(vocabId string, token string) (*HFConfig, *Resources, error) if tokenizerJson != nil { resources["tokenizer.json"] = *tokenizerJson } - tokenizer_specials_config := GetEmbeddedResource(vocabId + "/tokenizer_config.json") if tokenizer_specials_config != nil { resources["tokenizer_config.json"] = *tokenizer_specials_config } + return &resources, nil - hf, err := ResolveHFFromResources(&resources, hf) - if err != nil { - return nil, nil, err - } + } + // Local resources + resources, err := ResolveResourcesFromRemoteOrLocal(vocabId, token) + if err != nil { + return nil, err + } + return resources, nil - return hf, &resources, nil +} + +// ResolveVocabId +// Resolves a vocabulary id to a set of resources, from embedded, +// local filesystem, or remote, and applies processing to the resources. +func ResolveVocabId(vocabId string, token string) (*HFConfig, *Resources, error) { + rsc, err := ResolveResourcesList(vocabId, token) + if err != nil { + return nil, nil, err } - if isValidUrl(vocabId) { - u, _ := url.Parse(vocabId) - basePath := path.Base(u.Path) - resolvedVocabId = basePath - } else { - resolvedVocabId = vocabId + + endOfText := "<|endoftext|>" + bosText := "<|startoftext|>" + eosbosDefault := false + hf := &HFConfig{ + ModelId: &vocabId, + BosTokenStr: &bosText, + EosTokenStr: &endOfText, + AddBosToken: &eosbosDefault, + AddEosToken: &eosbosDefault, } - config, resources, err := ResolveConfig(vocabId, token) + hf, err = ResolveHFFromResources(rsc, hf) if err != nil { return nil, nil, err - } else { - config.ModelId = &resolvedVocabId - return config, resources, nil } + return hf, rsc, nil } func ExtractModelFromTokenizer(dir *string) (map[string]interface{}, error) { diff --git a/resources/resources.go b/resources/resources.go index e486a68..e24dc98 100644 --- a/resources/resources.go +++ b/resources/resources.go @@ -92,6 +92,8 @@ func FetchHTTP(uri string, rsrc string, auth string) (io.ReadCloser, error) { 
resp.StatusCode)) } return resp.Body, nil + + // testing copilot: } // SizeHTTP From 1545cbcf09cb936ebd7e8fcf8c7d951fc8f46810 Mon Sep 17 00:00:00 2001 From: rexwang8 Date: Tue, 16 Jul 2024 11:43:05 -0400 Subject: [PATCH 2/6] chore: minor comment change --- resources/resources.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/resources/resources.go b/resources/resources.go index e24dc98..e486a68 100644 --- a/resources/resources.go +++ b/resources/resources.go @@ -92,8 +92,6 @@ func FetchHTTP(uri string, rsrc string, auth string) (io.ReadCloser, error) { resp.StatusCode)) } return resp.Body, nil - - // testing copilot: } // SizeHTTP From 42fb5de77f9ddd903fc7127d40685ec4bbe37d3d Mon Sep 17 00:00:00 2001 From: rexwang8 Date: Tue, 16 Jul 2024 13:06:31 -0400 Subject: [PATCH 3/6] refactor: fix funct names and ptrs --- gpt_bpe.go | 7 +-- resources/resolver.go | 102 ++++++++++++++++++++++++++---------------- 2 files changed, 68 insertions(+), 41 deletions(-) diff --git a/gpt_bpe.go b/gpt_bpe.go index 927639c..6ddece0 100644 --- a/gpt_bpe.go +++ b/gpt_bpe.go @@ -338,9 +338,10 @@ func NewEncoder(vocabId string) (*GPTEncoder, error) { } if specialConfig.EncloseEosBos { - eosBosBool := true - hfConfig.AddBosToken = &eosBosBool - hfConfig.AddEosToken = &eosBosBool + bosBool := true + eosBool := true + hfConfig.AddBosToken = &bosBool + hfConfig.AddEosToken = &eosBool } // Add in default pad token if not already set diff --git a/resources/resolver.go b/resources/resolver.go index 988d5c9..667aabf 100644 --- a/resources/resolver.go +++ b/resources/resolver.go @@ -693,6 +693,39 @@ type SpecialConfig struct { SplitRegex *string `json:"split_regex"` } +// NewHFConfig creates a new HFConfig object with default values. 
+func NewHFConfig() *HFConfig { + defaultModelId := "" + defaultModelType := "gpt2" + defaultEosTokenId := uint16(0) + defaultBosTokenId := uint16(0) + defaultPadTokenId := uint16(0) + defaultBosTokenStr := "<|startoftext|>" + defaultEosTokenStr := "<|endoftext|>" + defaultPadTokenStr := "" + defaultVocabSize := uint16(50257) + defaultNewlinemode := "prefix" + defaultTokenizerClass := "GPT2BPETokenizer" + defaultAddBosToken := false + defaultAddEosToken := false + HFConfig := &HFConfig{ + ModelId: &defaultModelId, + ModelType: &defaultModelType, + EosTokenId: &defaultEosTokenId, + BosTokenId: &defaultBosTokenId, + PadTokenId: &defaultPadTokenId, + BosTokenStr: &defaultBosTokenStr, + EosTokenStr: &defaultEosTokenStr, + PadTokenStr: &defaultPadTokenStr, + VocabSize: &defaultVocabSize, + Newlinemode: &defaultNewlinemode, + TokenizerClass: &defaultTokenizerClass, + AddBosToken: &defaultAddBosToken, + AddEosToken: &defaultAddEosToken, + } + return HFConfig +} + // Processor stores config to process one step of the pipeline type Processor struct { ProcessorType string @@ -709,10 +742,10 @@ func (p *Processor) Process(input interface{}) (interface{}, error) { } } -// ResolveResourcesFromRemoteOrLocal +// LoadExternalResources // Resolves a given vocabulary id, and returns the corresponding HuggingFace // configuration, and the resources for the tokenizer. 
-func ResolveResourcesFromRemoteOrLocal(vocabId string, token string) (resources *Resources, err error) { +func LoadExternalResources(vocabId string, token string) (resources *Resources, err error) { dir, dirErr := ioutil.TempDir("", "resources") if dirErr != nil { return nil, dirErr @@ -759,7 +792,7 @@ func ResolveHFFromResources(resources *Resources, hfConfig *HFConfig) (*HFConfig // Sometimes TokenIDs are not properly resolved, so we need to check if hfConfig != nil { - if hfConfig.EosTokenId == nil || hfConfig.BosTokenId == nil || hfConfig.PadTokenId == nil { + if *hfConfig.EosTokenId == 0 || *hfConfig.BosTokenId == 0 || *hfConfig.PadTokenId == 0 { hfConfig, err = resolveTokenIds(resources, hfConfig) if err != nil { return nil, err @@ -894,36 +927,37 @@ func resolveConfigAndTokenizerConfig(resources *Resources, hfConfig *HFConfig) ( // If not, try to unmarshal to the tokenizerSpecials // that llama 2 has, else try mistral format if config != nil || tokenizerConfig != nil { - hasReadForEOSBOS := false + hasReadForEosBos := false hasReadForVocabSize := false // Read config.json if config != nil { + configMap := config.(map[string]interface{}) // Using interfaces, first check if bos_token is in string format - if bosToken, ok := config.(map[string]interface{})["bos_token"].(string); ok { + if bosToken, ok := configMap["bos_token"].(string); ok { hfConfig.BosTokenStr = &bosToken - if eosToken, ok := config.(map[string]interface{})["eos_token"].(string); ok { + if eosToken, ok := configMap["eos_token"].(string); ok { hfConfig.EosTokenStr = &eosToken } - if padToken, ok := config.(map[string]interface{})["pad_token"].(string); ok { + if padToken, ok := configMap["pad_token"].(string); ok { hfConfig.PadTokenStr = &padToken } - hasReadForEOSBOS = true + hasReadForEosBos = true } // Read for EOS BOS token ID - if eosTokenId, ok := config.(map[string]interface{})["eos_token_id"].(float64); ok { + if eosTokenId, ok := configMap["eos_token_id"].(float64); ok { 
eosTokenIdInt := uint16(eosTokenId) hfConfig.EosTokenId = &eosTokenIdInt } - if bosTokenId, ok := config.(map[string]interface{})["bos_token_id"].(float64); ok { + if bosTokenId, ok := configMap["bos_token_id"].(float64); ok { bosTokenIdInt := uint16(bosTokenId) hfConfig.BosTokenId = &bosTokenIdInt } // Read for vocab size if !hasReadForVocabSize { - if vocabSize, ok := config.(map[string]interface{})["vocab_size"].(float64); ok { + if vocabSize, ok := configMap["vocab_size"].(float64); ok { vocabSizeInt := uint16(vocabSize) hfConfig.VocabSize = &vocabSizeInt hasReadForVocabSize = true @@ -933,56 +967,56 @@ func resolveConfigAndTokenizerConfig(resources *Resources, hfConfig *HFConfig) ( // Read tokenizer_config.json if tokenizerConfig != nil { - if !hasReadForEOSBOS { + tokenizerConfigMap := tokenizerConfig.(map[string]interface{}) + if !hasReadForEosBos { // Using interfaces, first check if bos_token is in string format - if bosToken, ok := tokenizerConfig.(map[string]interface{})["bos_token"].(string); ok { + if bosToken, ok := tokenizerConfigMap["bos_token"].(string); ok { hfConfig.BosTokenStr = &bosToken - if eosToken, ok := tokenizerConfig.(map[string]interface{})["eos_token"].(string); ok { + if eosToken, ok := tokenizerConfigMap["eos_token"].(string); ok { hfConfig.EosTokenStr = &eosToken } - if padToken, ok := tokenizerConfig.(map[string]interface{})["pad_token"].(string); ok { + if padToken, ok := tokenizerConfigMap["pad_token"].(string); ok { hfConfig.PadTokenStr = &padToken } - hasReadForEOSBOS = true + hasReadForEosBos = true } } // If not, assume llama2 format and try to unmarshal - if !hasReadForEOSBOS { - cfg := tokenizerConfig.(map[string]interface{}) - if bosToken, ok := cfg["bos_token"].(map[string]interface{}); ok { + if !hasReadForEosBos { + if bosToken, ok := tokenizerConfigMap["bos_token"].(map[string]interface{}); ok { if content, ok := bosToken["content"].(string); ok { hfConfig.BosTokenStr = &content } } - if eosToken, ok := 
cfg["eos_token"].(map[string]interface{}); ok { + if eosToken, ok := tokenizerConfigMap["eos_token"].(map[string]interface{}); ok { if content, ok := eosToken["content"].(string); ok { hfConfig.EosTokenStr = &content } } - if padToken, ok := cfg["pad_token"].(string); ok { + if padToken, ok := tokenizerConfigMap["pad_token"].(string); ok { hfConfig.PadTokenStr = &padToken } } // If that doesn't work, assume mistral format - if !hasReadForEOSBOS { - if bosToken, ok := tokenizerConfig.(map[string]interface{})["bos_token"].(string); ok { + if !hasReadForEosBos { + if bosToken, ok := tokenizerConfigMap["bos_token"].(string); ok { hfConfig.BosTokenStr = &bosToken } - if eosToken, ok := tokenizerConfig.(map[string]interface{})["eos_token"].(string); ok { + if eosToken, ok := tokenizerConfigMap["eos_token"].(string); ok { hfConfig.EosTokenStr = &eosToken } - if padToken, ok := tokenizerConfig.(map[string]interface{})["pad_token"].(string); ok { + if padToken, ok := tokenizerConfigMap["pad_token"].(string); ok { hfConfig.PadTokenStr = &padToken } } // Read for enclose eos bos - if encloseEos, ok := tokenizerConfig.(map[string]interface{})["add_bos_token"].(bool); ok { + if encloseEos, ok := tokenizerConfigMap["add_bos_token"].(bool); ok { hfConfig.AddBosToken = &encloseEos } - if encloseBos, ok := tokenizerConfig.(map[string]interface{})["add_eos_token"].(bool); ok { + if encloseBos, ok := tokenizerConfigMap["add_eos_token"].(bool); ok { hfConfig.AddEosToken = &encloseBos } } @@ -1072,7 +1106,7 @@ func ResolveResourcesList(vocabId string, token string) (*Resources, error) { } // Local resources - resources, err := ResolveResourcesFromRemoteOrLocal(vocabId, token) + resources, err := LoadExternalResources(vocabId, token) if err != nil { return nil, err } @@ -1089,16 +1123,8 @@ func ResolveVocabId(vocabId string, token string) (*HFConfig, *Resources, error) return nil, nil, err } - endOfText := "<|endoftext|>" - bosText := "<|startoftext|>" - eosbosDefault := false - hf := 
&HFConfig{ - ModelId: &vocabId, - BosTokenStr: &bosText, - EosTokenStr: &endOfText, - AddBosToken: &eosbosDefault, - AddEosToken: &eosbosDefault, - } + hf := NewHFConfig() + hf.ModelId = &vocabId hf, err = ResolveHFFromResources(rsc, hf) if err != nil { return nil, nil, err From ae29969de51cefad02641795dcb4b5b759c0fe5e Mon Sep 17 00:00:00 2001 From: rexwang8 Date: Thu, 18 Jul 2024 11:09:50 -0400 Subject: [PATCH 4/6] refactor: reduce nesting in the resolveTokenIds function --- resources/resolver.go | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/resources/resolver.go b/resources/resolver.go index 667aabf..1f5d5cb 100644 --- a/resources/resolver.go +++ b/resources/resolver.go @@ -823,33 +823,30 @@ func resolveTokenIds(resources *Resources, hfConfig *HFConfig) (*HFConfig, error } } + // If vocab is nil, return an error + if vocab == nil { + return nil, errors.New("vocab file not found") + } + // Get the token ids for eos, bos, and pad tokens - if vocab != nil { - if vocabMap, ok := vocab.(map[string]interface{}); ok { - if hfConfig.EosTokenStr != nil { - if eosToken, ok := vocabMap[*(hfConfig.EosTokenStr)]; ok { - if eosTokenInt, ok := eosToken.(float64); ok { - hfConfig.EosTokenId = new(uint16) - *hfConfig.EosTokenId = uint16(eosTokenInt) - } - } + if vocabMap, ok := vocab.(map[string]interface{}); ok { + if hfConfig.EosTokenStr != nil { + if eosTokenInt, ok := vocabMap[*(hfConfig.EosTokenStr)].(float64); ok { + hfConfig.EosTokenId = new(uint16) + *hfConfig.EosTokenId = uint16(eosTokenInt) } - if hfConfig.BosTokenStr != nil { - if bosToken, ok := vocabMap[*(hfConfig.BosTokenStr)]; ok { - if bosTokenInt, ok := bosToken.(float64); ok { - hfConfig.BosTokenId = new(uint16) - *hfConfig.BosTokenId = uint16(bosTokenInt) - } - } + } + if hfConfig.BosTokenStr != nil { + if bosTokenInt, ok := vocabMap[*(hfConfig.BosTokenStr)].(float64); ok { + hfConfig.BosTokenId = new(uint16) + *hfConfig.BosTokenId = 
uint16(bosTokenInt) } + } - if hfConfig.PadTokenStr != nil { - if padToken, ok := vocabMap[*(hfConfig.PadTokenStr)]; ok { - if padTokenInt, ok := padToken.(float64); ok { - hfConfig.PadTokenId = new(uint16) - *hfConfig.PadTokenId = uint16(padTokenInt) - } - } + if hfConfig.PadTokenStr != nil { + if padTokenInt, ok := vocabMap[*(hfConfig.PadTokenStr)].(float64); ok { + hfConfig.PadTokenId = new(uint16) + *hfConfig.PadTokenId = uint16(padTokenInt) } } } From dc33bf95ea5ae46cf545b67fd0d01e2928301931 Mon Sep 17 00:00:00 2001 From: rexwang8 Date: Thu, 18 Jul 2024 11:14:35 -0400 Subject: [PATCH 5/6] refactor: add another guard statement --- resources/resolver.go | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/resources/resolver.go b/resources/resolver.go index 1f5d5cb..ffe4336 100644 --- a/resources/resolver.go +++ b/resources/resolver.go @@ -881,12 +881,14 @@ func resolveVocabSize(resources *Resources, hfConfig *HFConfig) (*HFConfig, erro // Get length of vocab var vocabLen *uint16 - if vocab != nil { - if vocabMap, ok := vocab.(map[string]interface{}); ok { - vocabLen = new(uint16) - *vocabLen = uint16(len(vocabMap)) - } + if vocab == nil { + return nil, errors.New("vocab file not found") } + if vocabMap, ok := vocab.(map[string]interface{}); ok { + vocabLen = new(uint16) + *vocabLen = uint16(len(vocabMap)) + } + hfConfig.VocabSize = vocabLen return hfConfig, nil } From d260b08fd21230ce75c827ece4381c0820a55029 Mon Sep 17 00:00:00 2001 From: rexwang8 Date: Thu, 18 Jul 2024 11:16:11 -0400 Subject: [PATCH 6/6] docs: changed function description for encoder --- resources/resolver.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/resources/resolver.go b/resources/resolver.go index ffe4336..a2f23be 100644 --- a/resources/resolver.go +++ b/resources/resolver.go @@ -805,7 +805,9 @@ func ResolveHFFromResources(resources *Resources, hfConfig *HFConfig) (*HFConfig // resolveTokenIds // Resolve token ids for eos, bos, 
and pad tokens from resources. func resolveTokenIds(resources *Resources, hfConfig *HFConfig) (*HFConfig, error) { - // Use interfaces to unmarshal the vocab file from resources + // Vocab is stored in either the vocab.json or encoder.json file + // We want to unmarshal the vocab file into an interface to work with + // We attempt to unmarshal under the vocab.json key first, then encoder.json if it fails var vocab interface{} if _, err := resources.GetFile("vocab.json"); err == nil { if err := json.Unmarshal(*((*resources)["vocab.json"]).Data, &vocab); err != nil { @@ -859,10 +861,10 @@ func resolveTokenIds(resources *Resources, hfConfig *HFConfig) (*HFConfig, error // Used to be able to resolve both embedded and local resources. // Continuation of ResolveHFFromResources. func resolveVocabSize(resources *Resources, hfConfig *HFConfig) (*HFConfig, error) { - // Use interfaces to unmarshal the vocab file var vocab interface{} - // If exists, unmarshal vocab.json, else - // use GetFile to get the file, then unmarshal it + // Vocab is stored in either the vocab.json or encoder.json file + // We want to unmarshal the vocab file into an interface to work with + // We attempt to unmarshal under the vocab.json key first, then encoder.json if it fails if _, err := resources.GetFile("vocab.json"); err == nil { if err := json.Unmarshal(*((*resources)["vocab.json"]).Data, &vocab); err != nil { fmt.Errorf("Error unmarshalling vocab.json: %s", err)