diff --git a/.github/workflows/TestCommands.yml b/.github/workflows/TestCommands.yml
new file mode 100644
index 0000000..8da5edf
--- /dev/null
+++ b/.github/workflows/TestCommands.yml
@@ -0,0 +1,29 @@
+name: Test Commands
+
+on:
+  push:
+    paths-ignore:
+      - "README.md"
+      - "LICENSE"
+  workflow_dispatch:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version-file: "go.mod"
+
+      - name: Build
+        run: go build -v ./
+
+      - name: Test dataset_tokenizer
+        uses: robherley/go-test-action@v0
+        with:
+          testArguments: ./
+          moduleDirectory: ./cmd/dataset_tokenizer
diff --git a/.github/workflows/TestGPT_BPE.yml b/.github/workflows/TestGPT_BPE.yml
new file mode 100644
index 0000000..c84ae27
--- /dev/null
+++ b/.github/workflows/TestGPT_BPE.yml
@@ -0,0 +1,29 @@
+name: Test gpt_bpe_test.go
+
+on:
+  push:
+    paths-ignore:
+      - "README.md"
+      - "LICENSE"
+      - "cmd/**"
+  workflow_dispatch:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version-file: "go.mod"
+
+      - name: Build
+        run: go build -v ./
+
+      - name: Test gpt_bpe
+        uses: robherley/go-test-action@v0
+        with:
+          testArguments: ./ -timeout 20m
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index dd2eff3..50f1506 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -1,13 +1,16 @@
 name: Publish
 
 on:
-  push:
-    branches: ["main"]
+  workflow_run:
+    workflows: ['Test Commands']
+    types: ['completed']
+    branches: ['main']
 
 jobs:
   publish:
     name: Publish
     runs-on: ubuntu-latest
+    if: ${{ github.event.workflow_run.conclusion == 'success' }}
     steps:
       - uses: actions/setup-go@v2
         with:
diff --git a/cmd/dataset_tokenizer/dataset_tokenizer_test.go b/cmd/dataset_tokenizer/dataset_tokenizer_test.go
index 99e78cf..9c1b40f 100644
--- a/cmd/dataset_tokenizer/dataset_tokenizer_test.go
+++ b/cmd/dataset_tokenizer/dataset_tokenizer_test.go
@@ -3,7 +3,6 @@ package main
 import (
 	"bufio"
 	"bytes"
-	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
@@ -19,6 +18,7 @@ import (
 	"github.com/aws/aws-sdk-go/service/s3"
 	"github.com/stretchr/testify/assert"
 	"github.com/wbrown/gpt_bpe"
+	"github.com/wbrown/gpt_bpe/types"
 )
 
 type SanitizerTest struct {
 
 type S3MockClient struct {
 
 type SanitizerTests []SanitizerTest
 
+// Test data for sanitizing text.
 var sanitizerTests = SanitizerTests{
 	{"\\n handling",
 		"\nfoobar\\n\n",
 	{"\\n handling",
 		"\nfoobar\\n\n",
 
 const corpusPath = "../../resources/frankenstein.txt"
 
-func TokensFromBin(bin *[]byte) *gpt_bpe.Tokens {
-	tokens := make(gpt_bpe.Tokens, 0)
-	buf := bytes.NewReader(*bin)
-	for {
-		var token gpt_bpe.Token
-		if err := binary.Read(buf, binary.LittleEndian, &token); err != nil {
-			break
-		}
-		tokens = append(tokens, token)
-	}
-	return &tokens
-}
-
 // DecodeBuffer
 // Decode Tokens from a byte array into a string.
 func DecodeBuffer(encoded *[]byte) (text string) {
 	// First convert our bytearray into a uint32 `Token` array.
-	tokens := TokensFromBin(encoded)
+	tokens := types.TokensFromBin(encoded)
 	// Decode our tokens into a string.
 	var enc *gpt_bpe.GPTEncoder
 	encoderString := "gpt2"
@@ -735,3 +723,199 @@ func TestListObjectsRecursively(t *testing.T) {
 
 	wg.Wait() // Wait for all goroutines to finish
 }
+
+func TestUInt16WithNoEnforce(t *testing.T) {
+	// Test that, with Uint32 enforcement disabled,
+	// a Uint16 tokenizer works as intended with no padding.
+
+	textsTokenizer := NewTextsTokenizer()
+	textsTokenizer.ContextSize = 2048
+	textsTokenizer.TokenizerId = "gpt2"
+	textsTokenizer.EndOfText = ""
+
+	// Test data
+	testString := "The quick brown fox jumps over the lazy dog."
+	expectedTokens := types.Tokens{464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13, 50256}
+	// Generate temp directory and test file
+	tempDir := os.TempDir()
+	testFile := tempDir + "/test.txt"
+	f, err := os.Create(testFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+	// Write test string to file
+	_, err = f.WriteString(testString)
+	if err != nil {
+		log.Fatal(err)
+	}
+	f.Close()
+	defer os.Remove(testFile)
+
+	reorderPaths := ""
+	sampling := 100
+	outputFile := "base.chunk"
+	defer os.Remove(outputFile)
+
+	enc, tokErr := textsTokenizer.InitTokenizer()
+	if tokErr != nil {
+		log.Fatal(tokErr)
+	}
+
+	if texts, err := ReadTexts(
+		testFile, false,
+		reorderPaths,
+		1,
+	); err != nil {
+		log.Fatal(err)
+	} else {
+		begin := time.Now()
+		contexts, tokErr := textsTokenizer.TokenizeTexts(
+			texts, "./test", enc,
+		)
+		if tokErr != nil {
+			log.Fatal(tokErr)
+		}
+
+		total, writeErr := WriteContexts(
+			outputFile,
+			contexts,
+			enc,
+			sampling,
+			false,
+			false,
+			false,
+		)
+		if writeErr != nil {
+			log.Fatal(writeErr)
+		}
+		duration := time.Since(begin).Seconds()
+		log.Printf(
+			"%d tokens in %0.2fs, %0.2f tokens/s", total,
+			duration, float64(total)/duration,
+		)
+	}
+	// Read the encoded tokens from the output file
+	binaryData, err := os.ReadFile(outputFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Convert to Tokens array
+	tokens := types.TokensFromBin(&binaryData)
+
+	if len(*tokens) != len(expectedTokens) {
+		t.Fatalf(
+			"Expected %d tokens, but got %d", len(expectedTokens),
+			len(*tokens),
+		)
+	}
+	for i, token := range *tokens {
+		if token != expectedTokens[i] {
+			t.Fatalf(
+				"Expected token %d, but got %d", expectedTokens[i],
+				token,
+			)
+		}
+	}
+
+	// Verify the encoded tokens
+	assert.Equal(t, &expectedTokens, tokens)
+}
+
+func TestUInt16WithEnforce(t *testing.T) {
+	// Test that, with Uint32 enforcement enabled,
+	// a Uint16 tokenizer works as intended, padding each token to 32 bits,
+	// i.e. X, 0, Y, 0, Z, 0
+
+	textsTokenizer := NewTextsTokenizer()
+	textsTokenizer.ContextSize = 2048
+	textsTokenizer.TokenizerId = "gpt2"
+	textsTokenizer.EndOfText = ""
+
+	// Test data
+	testString := "The quick brown fox jumps over the lazy dog."
+	expectedTokens := types.Tokens{464, 0, 2068, 0, 7586, 0, 21831, 0, 18045, 0, 625, 0, 262, 0, 16931, 0, 3290, 0, 13, 0, 50256, 0}
+	// Generate temp directory and test file
+	tempDir := os.TempDir()
+	testFile := tempDir + "/test.txt"
+	f, err := os.Create(testFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+	// Write test string to file
+	_, err = f.WriteString(testString)
+	if err != nil {
+		log.Fatal(err)
+	}
+	f.Close()
+	defer os.Remove(testFile)
+
+	reorderPaths := ""
+	sampling := 100
+	outputFile := "base.chunk"
+	defer os.Remove(outputFile)
+
+	enc, tokErr := textsTokenizer.InitTokenizer()
+	if tokErr != nil {
+		log.Fatal(tokErr)
+	}
+
+	if texts, err := ReadTexts(
+		testFile, false,
+		reorderPaths,
+		1,
+	); err != nil {
+		log.Fatal(err)
+	} else {
+		begin := time.Now()
+		contexts, tokErr := textsTokenizer.TokenizeTexts(
+			texts, "./test", enc,
+		)
+		if tokErr != nil {
+			log.Fatal(tokErr)
+		}
+
+		total, writeErr := WriteContexts(
+			outputFile,
+			contexts,
+			enc,
+			sampling,
+			false,
+			true,
+			false,
+		)
+		if writeErr != nil {
+			log.Fatal(writeErr)
+		}
+		duration := time.Since(begin).Seconds()
+		log.Printf(
+			"%d tokens in %0.2fs, %0.2f tokens/s", total,
+			duration, float64(total)/duration,
+		)
+	}
+	// Read the encoded tokens from the output file
+	binaryData, err := os.ReadFile(outputFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Convert to Tokens array
+	tokens := types.TokensFromBin(&binaryData)
+
+	if len(*tokens) != len(expectedTokens) {
+		t.Fatalf(
+			"Expected %d tokens, but got %d", len(expectedTokens),
+			len(*tokens),
+		)
+	}
+	for i, token := range *tokens {
+		if token != expectedTokens[i] {
+			t.Fatalf(
+				"Expected token %d, but got %d", expectedTokens[i],
+				token,
+			)
+		}
+	}
+	// Verify the encoded tokens
+	assert.Equal(t, &expectedTokens, tokens)
+}
diff --git a/gpt_bpe_test.go b/gpt_bpe_test.go
index ac63556..2ebe742 100644
--- a/gpt_bpe_test.go
+++ b/gpt_bpe_test.go
@@ -564,6 +564,18 @@ func TestGPTEncoder_Encode(t *testing.T) {
 	}
 }
 
+func TestGPTEncode(t *testing.T) {
+	// Check that the GPTEncoder encodes a known string to the expected tokens.
+	strin := "The quick brown fox jumps over the lazy dog."
+	expected := Tokens{464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13}
+	encoded := gpt2Encoder.Encode(&strin)
+	fmt.Printf("Encoded tokens: ")
+	for _, token := range *encoded {
+		fmt.Printf("%v, ", token)
+	}
+	assert.Equal(t, expected, *encoded)
+}
+
 func TestGPTEncoder_StreamingEncode(t *testing.T) {
 	// This test is to check if the GPTEncoder is able to encode the tokens correctly
 	start := time.Now()
@@ -1407,7 +1419,7 @@ func TestLlama3RemoteDownloadTokenizer(t *testing.T) {
 func TestMistralRemoteDownloadTokenizer(t *testing.T) {
 	// Tests the ability to download a tokenizer from a remote model
 	// and use it to encode and decode strings
-	modelId := "Open-Orca/Mistral-7B-OpenOrca"
+	modelId := "openaccess-ai-collective/tiny-mistral"
 	//destPath := "./TestMistralRemoteDownloadTokenizer"
 	//defer os.RemoveAll(destPath)
 	encoderMistral, err := NewEncoder(modelId)
@@ -1496,7 +1508,7 @@ func TestModelDownloadLlama(t *testing.T) {
 	// the vocab.json and merges.txt files are stored in the
 	// tokenizer.json file. We want to check if we are able to
 	// download the model and extract the vocab.json and merges.txt
-	modelId := "georgesung/llama2_7b_chat_uncensored"
+	modelId := "Maykeye/TinyLLama-v0"
 	destPath := "./TestModelDownloadLlama"
 	err := downloadModel(modelId, destPath)
 	if err != nil {
@@ -1508,7 +1520,7 @@
 	// Check that the model files are there
 	// We want to check for the presence of the following files:
 	// config.json, pytorch_model.bin,
-	// tokenizer.json, vocab.json
+	// tokenizer.model, vocab.json
 
 	// Check for pytorch_model.bin
 	singleModelPattern := regexp.MustCompile(`pytorch_model\.bin$`)
@@ -1543,7 +1555,7 @@
 	}
 
 	// Check for additional metadata files
-	metaFiles := []string{"tokenizer.json", "vocab.json", "config.json"}
+	metaFiles := []string{"tokenizer.model", "vocab.json", "config.json"}
 	for _, metaFile := range metaFiles {
 		metaPath := destPath + "/" + metaFile
 		assertFileExists(t, metaPath)
 	}
 }
 
 func TestModelDownloadMistral(t *testing.T) {
 	// Download a downstream mistral model due to mistral being gated
-	modelId := "Open-Orca/Mistral-7B-OpenOrca"
+	modelId := "openaccess-ai-collective/tiny-mistral"
 	destPath := "./TestModelDownloadMistral"
 	err := downloadModel(modelId, destPath)
 	if err != nil {
@@ -1567,10 +1579,10 @@
 	// Check that the model files are there
 	// We want to check for the presence of the following files:
 	// config.json, pytorch_model.bin,
-	// tokenizer.json, vocab.json
+	// tokenizer.model
 
 	// Check for additional metadata files
-	metaFiles := []string{"tokenizer.json", "vocab.json, config.json", "pytorch_model-00001-of-00002.bin"}
+	metaFiles := []string{"tokenizer.model", "config.json", "pytorch_model.bin"}
 	for _, metaFile := range metaFiles {
 		metaPath := destPath + "/" + metaFile
 		assertFileExists(t, metaPath)
 	}
@@ -1599,7 +1611,7 @@ func TestModelDownloadFairseq(t *testing.T) {
 	// vocab, config. merges, pytorch_model
 
 	// Check for additional metadata files
-	metaFiles := []string{"tokenizer.json", "vocab.json, config.json", "pytorch_model.bin", "merges.txt"}
+	metaFiles := []string{"vocab.json", "config.json", "pytorch_model.bin", "merges.txt"}
 	for _, metaFile := range metaFiles {
 		metaPath := destPath + "/" + metaFile
 		assertFileExists(t, metaPath)
diff --git a/js/js.go b/js/js.go
index 990c238..c224565 100644
--- a/js/js.go
+++ b/js/js.go
@@ -3,9 +3,11 @@ package main
 //go:generate gopherjs build --minify
 import (
+	"log"
+
 	"github.com/gopherjs/gopherjs/js"
 	"github.com/wbrown/gpt_bpe"
-	"log"
+	"github.com/wbrown/gpt_bpe/types"
 )
 
 var encoder gpt_bpe.GPTEncoder
@@ -15,7 +17,7 @@ func Tokenize(text string) gpt_bpe.Tokens {
 }
 
 func Decode(arr []byte) string {
-	tokens := gpt_bpe.TokensFromBin(&arr)
+	tokens := types.TokensFromBin(&arr)
 	return encoder.Decode(tokens)
 }
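
Usage note: this diff consumes types.TokensFromBin from the new github.com/wbrown/gpt_bpe/types package, but the package's own source is not part of the diff. The sketch below illustrates how callers such as DecodeBuffer and js.go's Decode use the relocated helper. It assumes types.TokensFromBin preserves the behavior of the TokensFromBin implementation removed from dataset_tokenizer_test.go above (read fixed-width little-endian tokens until the buffer is exhausted); the uint32 token width is likewise an assumption carried over from DecodeBuffer's "uint32 `Token` array" comment.

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"

	"github.com/wbrown/gpt_bpe/types"
)

func main() {
	// Serialize two token ids the same way the removed helper read them
	// back: fixed-width little-endian values, packed back to back. The
	// uint32 width is an assumption, as noted above.
	var buf bytes.Buffer
	for _, id := range []uint32{464, 2068} { // "The quick" under gpt2
		if err := binary.Write(&buf, binary.LittleEndian, id); err != nil {
			panic(err)
		}
	}
	bin := buf.Bytes()

	// Decode the byte slice back into a *types.Tokens, as DecodeBuffer
	// and js.go's Decode now do.
	tokens := types.TokensFromBin(&bin)
	fmt.Println(*tokens) // expected output: [464 2068]
}

Centralizing this framing in the types package lets the dataset_tokenizer tests and the GopherJS shim share one decoder instead of each re-implementing the binary read loop.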