*: replace line-by-line text loaders with chunk-by-chunk text loaders.
Loaders now yield token sequences of length blockSize
JulienVig committed Nov 12, 2024
1 parent 128cedc commit f7f96dc
Showing 21 changed files with 547 additions and 194 deletions.
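For orientation, here is a minimal, hypothetical sketch of how the new chunk-by-chunk loader is meant to be consumed, mirroring the updated CLI scripts in this commit (the WikiText path, loadText, processing, and the map/batch pattern all come from the diff below; the wrapper function example() and the printed batch count are illustrative only):

import { AutoTokenizer } from "@xenova/transformers";
import { loadText } from "@epfml/discojs-node";
import { List } from "immutable";

async function example(): Promise<void> {
  // Illustrative values; the CLI takes these from the task configuration.
  const blockSize = 16;
  const batchSize = 8;
  const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2");

  // Each element yielded is a List of blockSize + 1 token ids: the first
  // blockSize tokens are the input, the extra token is the label of the
  // last input position.
  const dataset = loadText(
    "../datasets/wikitext/wiki.train.tokens",
    tokenizer, blockSize, batchSize,
  );

  // Same consumption pattern as the updated CLI scripts: split off the
  // label of the last token, then batch.
  const batched = dataset
    .map((tokens) => [tokens.pop(), tokens.last()] as [List<number>, number])
    .batch(batchSize);
  console.log("batches:", await batched.size());
}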
12 changes: 5 additions & 7 deletions cli/src/benchmark_gpt.ts
@@ -76,11 +76,12 @@ async function main(args: Required<CLIArguments>): Promise<void> {
// to make sure the dataset is batched and tokenized correctly
task.trainingInformation.batchSize = batchSize
task.trainingInformation.maxSequenceLength = contextLength
const dataset = loadText('../datasets/wikitext/wiki.train.tokens')
const dataset = loadText(
'../datasets/wikitext/wiki.train.tokens',
tokenizer, config.blockSize, batchSize
)

const maxLength = task.trainingInformation.maxSequenceLength ?? (tokenizer.model_max_length as number) + 1
const preprocessedDataset = dataset
.map((line) => processing.tokenizeAndLeftPad(line, tokenizer, maxLength))
.map((tokens) => [tokens.pop(), tokens.last()] as [List<number>, number])
.batch(batchSize);

@@ -111,10 +112,7 @@ async function main(args: Required<CLIArguments>): Promise<void> {
const iterations = 10
console.log("Generating", maxNewTokens, "new tokens")

let tokens = List(
(tokenizer(prompt, { return_tensor: false }) as { input_ids: number[] })
.input_ids,
);
let tokens = processing.tokenize(tokenizer, prompt);

let inferenceTime = 0
for (let i = 0; i < iterations; i++) {
47 changes: 29 additions & 18 deletions cli/src/train_gpt.ts
@@ -1,38 +1,49 @@
import * as tf from "@tensorflow/tfjs-node"
import { AutoTokenizer } from "@xenova/transformers";
import { models, processing } from "@epfml/discojs";
import { loadText } from '@epfml/discojs-node'
import { List } from "immutable";

async function main(): Promise<void> {
const data = "Lorem ipsum dolor sit amet, consectetur adipis"
const datasetSource = new tf.data.FileDataSource(Buffer.from(data))
const textDataset = new tf.data.TextLineDataset(datasetSource)

async function main(): Promise<void> {

const config: models.GPTConfig = {
modelType: 'gpt-nano',
lr: 0.01,
maxIter: 50,
maxIter: 10,
evaluateEvery:50,
maxEvalBatches: 10,
blockSize: 16,
vocabSize: 50257,
debug: false
}


const batchSize = 8
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2')
const tokenDataset = textDataset.map((text: string) => {
const tokens = processing.tokenizeAndLeftPad(text, tokenizer, config.blockSize + 1)
const ys = tf.oneHot(tokens.slice(1), tokenizer.model.vocab.length)
const xs = tf.tensor(tokens.slice(0, config.blockSize), undefined, 'int32')
return {xs, ys}
}).repeat().batch(16) as tf.data.Dataset<{ xs: tf.Tensor2D, ys: tf.Tensor3D }>
const dataset = loadText(
'../datasets/wikitext/wiki.train.tokens',
tokenizer, config.blockSize, batchSize
)
const tokenDataset = dataset
.map((tokens) => [tokens.pop(), tokens.last()] as [List<number>, number])
.batch(batchSize);

const model = new models.GPT(config)

for await (const logs of model.train(tokenDataset, undefined)) {
console.log(logs)
for (let i = 0; i < 6; i++) {
console.log(`Epoch ${i}`)
for await (const logs of model.train(tokenDataset, undefined)) {
console.log(logs)
}
}

const generation = await model.generate("Lorem", tokenizer, { maxNewTokens: 10, doSample: false, topk: 5, temperature:0.1 })
let tokens = processing.tokenize(tokenizer, "First");

const maxNewTokens = 10
for (let n = 0; n < maxNewTokens; n++) {
const next: number = (await model.predict(List.of(tokens),
{ doSample: false, topk: 5, temperature: 0.1 }))
.first();
tokens = tokens.push(next)
}
const generation = tokenizer.decode(tokens.toArray(), { skip_special_tokens: true })
console.log(generation)
}

80 changes: 77 additions & 3 deletions discojs-node/src/loaders.spec.ts
@@ -2,6 +2,9 @@ import * as fs from "node:fs/promises";
import { withFile } from "tmp-promise";
import { describe, it } from "mocha";
import { expect } from "chai";
import { Dataset, processing, Text } from "@epfml/discojs";
import { AutoTokenizer } from "@xenova/transformers";
import { List } from "immutable";

import {
loadCSV,
@@ -50,13 +53,84 @@ describe("image directory parser", () => {
});

describe("text parser", () => {

it("parses basic file", async () => {
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2')
const text = ["a", "b", "c"].join("\n")
await withFile(async ({ path }) => {
await fs.writeFile(path, ["a", "b", "c"].join("\n"));
await fs.writeFile(path, text);
// set block size to 4 to get 1 sequence of 4 tokens + 1 label token
const parsed = loadText(path, tokenizer, 4, 1);
const expectedTokens = processing.tokenize(tokenizer, text)
// should return 2 sequences: one with 4 tokens + 1 label token
// and the other with some padding and the label token
const sequences = await arrayFromAsync(parsed)
expect(sequences.length).to.equal(2);
expect(sequences[0]).to.deep.equal(expectedTokens);
});
});

async function checkEachSequence(parsed: Dataset<Text>,
expectedTokens: number[], blockSize: number) {
// ceiling because the remaining tokens in the last chunk are padded instead of dropped
// expect the number of sequences to be the total number of tokens divided by blockSize
expect(await parsed.size()).to.equal(Math.ceil(expectedTokens.length / blockSize));

let i = 0
// exclude the last sequence because it has been padded
let sequences = List(await arrayFromAsync(parsed))
// we expect the last sequence to have blockSize + 1 tokens via padding
expect(sequences.last()?.size).to.equal(blockSize + 1)
sequences = sequences.pop()
for await (const tokens of sequences) {
// each sequence has length blockSize + 1 (for the label)
expect(tokens.toArray()).to.deep.equal(expectedTokens.slice(i, i + blockSize + 1));
// but the window should move by blockSize only
i += blockSize
}
}

it("yields the correct block size", async () => {
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2')
const text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
const expectedTokens = processing.tokenize(tokenizer, text).toArray()

await withFile(async ({ path }) => {
await fs.writeFile(path, text);

// set block size to 4 to get 1 sequence of 4 tokens + 1 label token
// so we expect 5 tokens per read
const blockSize = 4
const parsed = loadText(path, tokenizer, blockSize, 1);
await checkEachSequence(parsed, expectedTokens, blockSize)
})
});

const parsed = loadText(path);
it("reads multiple chunks", async () => {
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2')
const text = [
"Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
"Donec sed risus maximus, ultricies ex sed, dictum elit.",
"Curabitur faucibus egestas enim et auctor. Quisque vel dignissim turpis.",
"Curabitur justo tellus, elementum sit amet erat eget, auctor ornare nisi.",
"Nunc tortor odio, ultrices id leo vitae, euismod congue ex. Curabitur arcu leo,",
"sagittis quis felis nec, imperdiet aliquet tellus.Integer a mollis nulla.",
"Quisque pulvinar lectus eget nisi pharetra, non molestie magna ullamcorper.",
"Sed porttitor diam non blandit molestie.Duis tristique arcu ut efficitur efficitur.",
"Fusce et ullamcorper tortor.Pellentesque a accumsan lacus, nec mollis risus.",
"Nunc quis eros a orci ultricies cursus. Maecenas sodales ipsum a magna ",
"malesuada efficitur.Maecenas at sapien blandit, egestas nisi eu, mollis elit."
].join(" ")

const expectedTokens = processing.tokenize(tokenizer, text).toArray()
await withFile(async ({ path }) => {
await fs.writeFile(path, text);

expect(await parsed.size()).to.equal(3);
// set block size to 4 to get 1 sequence of 4 tokens + 1 label token
// so we expect 5 tokens per read
const blockSize = 4
const parsed = loadText(path, tokenizer, blockSize, 1, 1); // set the min chunk size allowed to 1 bit
await checkEachSequence(parsed, expectedTokens, blockSize)
});
});
});
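The spec's checkEachSequence helper asserts the loader's windowing invariant: windows of blockSize + 1 tokens that advance by blockSize, so consecutive windows overlap by exactly one token and the number of sequences is ceil(totalTokens / blockSize). A small illustrative sketch of that arithmetic (windowCount is a hypothetical helper, not part of the commit):

// Windows of blockSize + 1 tokens, shifted by blockSize, last window padded.
function windowCount(totalTokens: number, blockSize: number): number {
  return Math.ceil(totalTokens / blockSize);
}

// e.g. 10 tokens with blockSize 4 -> windows [0..4], [4..8], [8..9 padded] = 3
console.log(windowCount(10, 4)); // 3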
116 changes: 108 additions & 8 deletions discojs-node/src/loaders/text.ts
@@ -1,14 +1,114 @@
import * as fs from "node:fs/promises";
import * as readline from "node:readline/promises";
import createDebug from "debug";
import { createReadStream } from 'node:fs';
import { PreTrainedTokenizer } from '@xenova/transformers';
import { Dataset, Text, processing } from "@epfml/discojs";

import { Dataset, Text } from "@epfml/discojs";
const debug = createDebug("discojs-node:loaders:text");

export function load(path: string): Dataset<Text> {
/**
* Returns a Dataset that streams and tokenizes text to yield tokenized sequences
* one at a time.
* The sequences returned are going to be split into input and label sequences of size `blockSize`
* The label sequences are the input sequences shifted by one token.
* Since the last token of the input sequence needs a label,
* we include one more token (`blockSize` + 1 total) in the sequences returned.
* Thus, each sequence yielded has size `blockSize` + 1, where the last token
* is included only to be the label of the last input token:
* xs = tokens[0:blockSize]
* ys = tokens[1:blockSize+1]
*
* Because the (`blockSize` + 1)-th token is only used as label and not as input,
* the next sequence will be shifted by `blockSize` (and not `blockSize + 1`)
* In other words, the dataset yields sequences of size `blockSize` + 1
* with an overlap of 1 token between each sequence.
*
* @param path path to the text file to read
* @param tokenizer the tokenizer to use, should match the model that will be trained
* @param blockSize the context length, the maximum number of tokens of input sequences
* @param batchSize defaults to 1, the number of input sequences (of `blockSize` tokens) in each batch.
* The batch size is only used to configure the chunk size of the file stream such that each chunk is
* big enough to contain at least one batch.
* @param minChunkSize defaults to 16KiB, the minimum size of each chunk in bits
* @returns a dataset of tokenized input and label sequences
*/
export function load(path: string, tokenizer: PreTrainedTokenizer,
blockSize: number, batchSize: number = 1, minChunkSize = 16384): Dataset<Text> {
return new Dataset(async function* () {
const input = (await fs.open(path)).createReadStream({ encoding: "utf8" });
if (batchSize < 1 || !Number.isInteger(batchSize) ||
blockSize < 1 || !Number.isInteger(blockSize) ||
minChunkSize < 1 || !Number.isInteger(minChunkSize))
throw new Error("batchSize, blockSize and minChunkSize must be positive integers");
const sequenceLength = blockSize + 1 // + 1 for the blockSize'nth token's label
// we want each chunk to be at least as big as the block size (each chunk corresponds to a block)
// (or even bigger than batch size * block size so that each chunk corresponds to a batch)
const chunkTokenSize = batchSize * (sequenceLength)
// We read 8 * 8 = 64 bits (8 bytes) per expected token to ensure we have enough tokens
// For reference, the GPT-2 tokenizer encodes 3 to 4 bytes per token on average
const chunkBitSize = Math.max(minChunkSize, chunkTokenSize * 8 * 8);
debug("Setting the chunk size to %o bits", chunkBitSize)
// Create a stream to read the text file chunk by chunk
const stream = createReadStream(path, {
encoding: "utf8",
highWaterMark: chunkBitSize
});

// `readline` is a bit overkill but seems standard
// https://nodejs.org/api/readline.html#example-read-file-stream-line-by-line
yield* readline.createInterface({ input, crlfDelay: Infinity });
// iterate over the chunks
let endOfPreviousChunk = ""
let alreadyAppliedPadding = false
for await (const chunk of stream) {
if (typeof chunk !== 'string') throw new Error('Expected file stream to yield string')
debug("Reading chunk of size %o", chunk.length)
// tokenize the whole chunk at once
// Concatenate with potential leftovers from the previous chunk
let tokens = processing.tokenize(tokenizer, endOfPreviousChunk + chunk)
if (tokens.size < sequenceLength) {
// throw if we need to apply padding more than once
// We can pad if the whole text is smaller than block size or
// if the very last chunk is smaller than block size
if (alreadyAppliedPadding)
throw new Error(`the chunk (${tokens.size} tokens) is too small ` +
`to get a sequence of length blockSize (${sequenceLength} tokens). ` +
`Either the text file or the chunk size (${chunkBitSize} bits) is too small.`);
// otherwise we pad, as we expect the very last chunk to potentially be smaller than the block size
debug("chunk smaller than block size, padding to blockSize")
yield processing.tokenize(tokenizer, endOfPreviousChunk + chunk, {
padding: true, max_length: sequenceLength
})
alreadyAppliedPadding = true
continue
}
debug("batch per chunk: %o", tokens.size / (batchSize * blockSize))
// yield one block of tokens at a time
while (tokens.size >= sequenceLength) {
yield tokens.take(sequenceLength);
tokens = tokens.slice(blockSize); // only shift by blockSize rather than sequenceLength
}
// keep the last tokens for the next chunk
// if this was the last one the remaining tokens are discarded
if (tokens.size > 0) {
// We actually need to decode the tokens to get the leftover text
// instead of simply keeping the remaining tokens.
// this is because the tokens may be different once prepended to the next chunk
// e.g. if the remaining text is ". A" and the next chunk starts with "nother"
// the tokenization will be different than if we simply concatenate the remaining tokens
endOfPreviousChunk = tokenizer.decode(
tokens.toArray(),
{ skip_special_tokens: true }
)
debug("End of chunk, remaining text: '%s'", endOfPreviousChunk)
} else {
// Note that the difference between tokenizing and then concatenating
// vs concatenating and then tokenizing can happen if there is no
// remaining text. We consider this difference negligible
endOfPreviousChunk = "";
}
}
if (endOfPreviousChunk.length === 0) return

// flush the remaining text after the last chunk
yield processing.tokenize(tokenizer, endOfPreviousChunk, {
padding: true, max_length: sequenceLength
})
});
}
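To connect the loader's output back to training, here is a minimal sketch of splitting one yielded sequence of blockSize + 1 tokens into the input/label pair described in the doc comment above (ys is xs shifted by one token); toInputAndLabel is a hypothetical helper, not part of this commit:

import { List } from "immutable";

function toInputAndLabel(tokens: List<number>, blockSize: number): { xs: List<number>; ys: List<number> } {
  const xs = tokens.slice(0, blockSize);     // tokens[0:blockSize]
  const ys = tokens.slice(1, blockSize + 1); // tokens[1:blockSize+1]
  return { xs, ys };
}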