Merge pull request wbrown#56 from wbrown/rwang.githubActions2.10182024

Run tests on push - Action

wbrown authored Nov 4, 2024
2 parents f1784b0 + 2dba8d0 commit fb3c65c

Showing 6 changed files with 286 additions and 27 deletions.
.github/workflows/TestCommands.yml (29 additions, 0 deletions)
@@ -0,0 +1,29 @@
name: Test Commands

on:
  push:
    paths-ignore:
      - "README.md"
      - "LICENSE"
  workflow_dispatch:

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: "go.mod"

      - name: Build
        run: go build -v ./

      - name: Test dataset_tokenizer
        uses: robherley/go-test-action@v0
        with:
          testArguments: ./
          moduleDirectory: ./cmd/dataset_tokenizer
.github/workflows/TestGPT_BPE.yml (29 additions, 0 deletions)
@@ -0,0 +1,29 @@
name: Test gpt_bpe_test.go

on:
  push:
    paths-ignore:
      - "README.md"
      - "LICENSE"
      - "cmd/**"
  workflow_dispatch:

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: "go.mod"

      - name: Build
        run: go build -v ./

      - name: Test gpt_bpe
        uses: robherley/go-test-action@v0
        with:
          testArguments: ./ -timeout 20m
.github/workflows/main.yml (5 additions, 2 deletions)
@@ -1,13 +1,16 @@
 name: Publish
 
 on:
-  push:
-    branches: ["main"]
+  workflow_run:
+    workflows: ['Test Commands']
+    types: ['completed']
+    branches: ['main']
 
 jobs:
   publish:
     name: Publish
     runs-on: ubuntu-latest
+    if: ${{ github.event.workflow_run.conclusion == 'success' }}
     steps:
       - uses: actions/setup-go@v2
         with:
(remaining unchanged lines collapsed)
cmd/dataset_tokenizer/dataset_tokenizer_test.go (199 additions, 15 deletions)
@@ -3,7 +3,6 @@ package main
 import (
 	"bufio"
 	"bytes"
-	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
@@ -19,6 +18,7 @@ import (
 	"github.com/aws/aws-sdk-go/service/s3"
 	"github.com/stretchr/testify/assert"
 	"github.com/wbrown/gpt_bpe"
+	"github.com/wbrown/gpt_bpe/types"
 )
 
 type SanitizerTest struct {
@@ -39,6 +39,7 @@ type S3MockClient struct {
 
 type SanitizerTests []SanitizerTest
 
+// Test data for sanitizing text.
var sanitizerTests = SanitizerTests{
 	{"\\n handling",
 		"\nfoobar\\n\n",
@@ -65,24 +66,11 @@
 
 const corpusPath = "../../resources/frankenstein.txt"
 
-func TokensFromBin(bin *[]byte) *gpt_bpe.Tokens {
-	tokens := make(gpt_bpe.Tokens, 0)
-	buf := bytes.NewReader(*bin)
-	for {
-		var token gpt_bpe.Token
-		if err := binary.Read(buf, binary.LittleEndian, &token); err != nil {
-			break
-		}
-		tokens = append(tokens, token)
-	}
-	return &tokens
-}
-
 // DecodeBuffer
 // Decode Tokens from a byte array into a string.
 func DecodeBuffer(encoded *[]byte) (text string) {
 	// First convert our bytearray into a uint32 `Token` array.
-	tokens := TokensFromBin(encoded)
+	tokens := types.TokensFromBin(encoded)
 	// Decode our tokens into a string.
 	var enc *gpt_bpe.GPTEncoder
 	encoderString := "gpt2"
@@ -735,3 +723,199 @@ func TestListObjectsRecursively(t *testing.T) {
 
 	wg.Wait() // Wait for all goroutines to finish
 }
+
+func TestUInt16WithNoEnforce(t *testing.T) {
+	// Test that, with Uint32 enforce disabled,
+	// using a Uint16 tokenizer works as intended with no padding.
+
+	textsTokenizer := NewTextsTokenizer()
+	textsTokenizer.ContextSize = 2048
+	textsTokenizer.TokenizerId = "gpt2"
+	textsTokenizer.EndOfText = ""
+
+	// Test data
+	testString := "The quick brown fox jumps over the lazy dog."
+	expectedTokens := types.Tokens{464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13, 50256}
+	// Generate temp directory and test file
+	tempDir := os.TempDir()
+	testFile := tempDir + "/test.txt"
+	f, err := os.Create(testFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+	// Write test string to file
+	_, err = f.WriteString(testString)
+	if err != nil {
+		log.Fatal(err)
+	}
+	f.Close()
+	defer os.Remove(testFile)
+
+	reorderPaths := ""
+	sampling := 100
+	outputFile := "base.chunk"
+	defer os.Remove(outputFile)
+
+	enc, tokErr := textsTokenizer.InitTokenizer()
+	if tokErr != nil {
+		log.Fatal(tokErr)
+	}
+
+	if texts, err := ReadTexts(
+		testFile, false,
+		reorderPaths,
+		1,
+	); err != nil {
+		log.Fatal(err)
+	} else {
+		begin := time.Now()
+		contexts, tokErr := textsTokenizer.TokenizeTexts(
+			texts, "./test", enc,
+		)
+		if tokErr != nil {
+			log.Fatal(tokErr)
+		}
+
+		total, writeErr := WriteContexts(
+			outputFile,
+			contexts,
+			enc,
+			sampling,
+			false,
+			false,
+			false,
+		)
+		if writeErr != nil {
+			log.Fatal(writeErr)
+		}
+		duration := time.Since(begin).Seconds()
+		log.Printf(
+			"%d tokens in %0.2fs, %0.2f tokens/s", total,
+			duration, float64(total)/duration,
+		)
+	}
+	// Read the encoded tokens from the output file
+	binaryData, err := os.ReadFile(outputFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Convert to Tokens array
+	tokens := types.TokensFromBin(&binaryData)
+
+	if len(*tokens) != len(expectedTokens) {
+		t.Fatalf(
+			"Expected %d tokens, but got %d", len(expectedTokens),
+			len(*tokens),
+		)
+	}
+	for i, token := range *tokens {
+		if token != expectedTokens[i] {
+			t.Fatalf(
+				"Expected token %d, but got %d", expectedTokens[i],
+				token,
+			)
+		}
+	}
+
+	// Verify the encoded tokens
+	assert.Equal(t, &expectedTokens, tokens)
+}
+
+func TestUInt16WithEnforce(t *testing.T) {
+	// Test that, with Uint32 enforce enabled,
+	// using a Uint16 tokenizer works as intended with padding,
+	// i.e. X, 0, Y, 0, Z, 0
+
+	textsTokenizer := NewTextsTokenizer()
+	textsTokenizer.ContextSize = 2048
+	textsTokenizer.TokenizerId = "gpt2"
+	textsTokenizer.EndOfText = ""
+
+	// Test data
+	testString := "The quick brown fox jumps over the lazy dog."
+	expectedTokens := types.Tokens{464, 0, 2068, 0, 7586, 0, 21831, 0, 18045, 0, 625, 0, 262, 0, 16931, 0, 3290, 0, 13, 0, 50256, 0}
+	// Generate temp directory and test file
+	tempDir := os.TempDir()
+	testFile := tempDir + "/test.txt"
+	f, err := os.Create(testFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+	// Write test string to file
+	_, err = f.WriteString(testString)
+	if err != nil {
+		log.Fatal(err)
+	}
+	f.Close()
+	defer os.Remove(testFile)
+
+	reorderPaths := ""
+	sampling := 100
+	outputFile := "base.chunk"
+	defer os.Remove(outputFile)
+
+	enc, tokErr := textsTokenizer.InitTokenizer()
+	if tokErr != nil {
+		log.Fatal(tokErr)
+	}
+
+	if texts, err := ReadTexts(
+		testFile, false,
+		reorderPaths,
+		1,
+	); err != nil {
+		log.Fatal(err)
+	} else {
+		begin := time.Now()
+		contexts, tokErr := textsTokenizer.TokenizeTexts(
+			texts, "./test", enc,
+		)
+		if tokErr != nil {
+			log.Fatal(tokErr)
+		}
+
+		total, writeErr := WriteContexts(
+			outputFile,
+			contexts,
+			enc,
+			sampling,
+			false,
+			true,
+			false,
+		)
+		if writeErr != nil {
+			log.Fatal(writeErr)
+		}
+		duration := time.Since(begin).Seconds()
+		log.Printf(
+			"%d tokens in %0.2fs, %0.2f tokens/s", total,
+			duration, float64(total)/duration,
+		)
+	}
+	// Read the encoded tokens from the output file
+	binaryData, err := os.ReadFile(outputFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Convert to Tokens array
+	tokens := types.TokensFromBin(&binaryData)
+
+	if len(*tokens) != len(expectedTokens) {
+		t.Fatalf(
+			"Expected %d tokens, but got %d", len(expectedTokens),
+			len(*tokens),
+		)
+	}
+	for i, token := range *tokens {
+		if token != expectedTokens[i] {
+			t.Fatalf(
+				"Expected token %d, but got %d", expectedTokens[i],
+				token,
+			)
+		}
+	}
+	// Verify the encoded tokens
+	assert.Equal(t, &expectedTokens, tokens)
+}

(2 more changed files not shown: 24 additions and 10 deletions)
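
Note on the refactor above: the TokensFromBin helper deleted from dataset_tokenizer_test.go is not gone; the tests now call types.TokensFromBin via the new github.com/wbrown/gpt_bpe/types import. Based on the deleted code, the relocated helper presumably looks like the sketch below (the Token width and type declarations here are assumptions inferred from the tests, not verbatim repo code):

package types

import (
	"bytes"
	"encoding/binary"
)

// Token width assumed from the tests above: reading 16-bit values is
// what makes the enforce test observe interleaved zero pads.
type (
	Token  uint16
	Tokens []Token
)

// TokensFromBin decodes little-endian fixed-width tokens from a byte
// slice until the reader is exhausted, mirroring the removed helper.
func TokensFromBin(bin *[]byte) *Tokens {
	tokens := make(Tokens, 0)
	buf := bytes.NewReader(*bin)
	for {
		var token Token
		if err := binary.Read(buf, binary.LittleEndian, &token); err != nil {
			break
		}
		tokens = append(tokens, token)
	}
	return &tokens
}

Centralizing the decoder in the types package lets DecodeBuffer and the new UInt16 tests share one binary-token reader instead of each defining its own.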
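
The interleaved zeros in TestUInt16WithEnforce's expectedTokens fall directly out of the byte layout: if WriteContexts stores each 16-bit GPT-2 token as a 4-byte little-endian value when Uint32 enforcement is enabled (an assumption read off the test's expectations, not off the WriteContexts source), then decoding the same bytes two at a time yields every token followed by a zero pad. A minimal standalone sketch:

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

func main() {
	// Write the first two token IDs from the test ("The" = 464,
	// " quick" = 2068) as uint32 little-endian values.
	var buf bytes.Buffer
	for _, tok := range []uint32{464, 2068} {
		binary.Write(&buf, binary.LittleEndian, tok)
	}
	// 464 encodes as 0xD0 0x01 0x00 0x00, so re-reading the bytes as
	// uint16 values splits each uint32 into the original token plus a
	// zero pad, printing 464, 0, 2068, 0.
	r := bytes.NewReader(buf.Bytes())
	for {
		var t uint16
		if err := binary.Read(r, binary.LittleEndian, &t); err != nil {
			break
		}
		fmt.Println(t)
	}
}

This is also why the two new tests differ only in the sixth argument to WriteContexts: false leaves plain uint16 tokens in the output file, while true pads each one to 32 bits.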
