*: replace line-by-line text loaders by chunk-by-chunk text loaders.
Loaders now yield token sequences of length blockSize
Showing 21 changed files with 547 additions and 194 deletions.
@@ -1,14 +1,114 @@
-import * as fs from "node:fs/promises";
-import * as readline from "node:readline/promises";
+import createDebug from "debug";
+import { createReadStream } from 'node:fs';
+import { PreTrainedTokenizer } from '@xenova/transformers';
+import { Dataset, Text, processing } from "@epfml/discojs";

-import { Dataset, Text } from "@epfml/discojs";
+const debug = createDebug("discojs-node:loaders:text");

-export function load(path: string): Dataset<Text> {
+/**
+ * Returns a Dataset that streams and tokenizes text to yield tokenized sequences
+ * one at a time.
+ * Each returned sequence is meant to be split into an input and a label sequence
+ * of size `blockSize`, where the label sequence is the input sequence shifted by one token.
+ * Since the last token of the input sequence needs a label,
+ * we include one more token (`blockSize` + 1 in total) in each returned sequence.
+ * Thus, each sequence yielded has size `blockSize` + 1, where the last token
+ * is included only to be the label of the last input token:
+ *   xs = tokens[0:blockSize]
+ *   ys = tokens[1:blockSize+1]
+ *
+ * Because the (`blockSize` + 1)-th token is only used as a label and not as an input,
+ * the next sequence is shifted by `blockSize` (and not `blockSize` + 1).
+ * In other words, the dataset yields sequences of size `blockSize` + 1
+ * with an overlap of 1 token between consecutive sequences.
+ *
+ * @param path path to the text file to read
+ * @param tokenizer the tokenizer to use, should match the model that will be trained
+ * @param blockSize the context length, the maximum number of tokens in an input sequence
+ * @param batchSize defaults to 1, the number of input sequences (of `blockSize` tokens) in each batch.
+ *   The batch size is only used to configure the chunk size of the file stream such that each chunk
+ *   is big enough to contain at least one batch.
+ * @param minChunkSize defaults to 16 KiB, the minimum size of each chunk in bytes
+ * @returns a dataset of tokenized input and label sequences
+ */
+export function load(path: string, tokenizer: PreTrainedTokenizer,
+  blockSize: number, batchSize: number = 1, minChunkSize = 16384): Dataset<Text> {
  return new Dataset(async function* () {
-    const input = (await fs.open(path)).createReadStream({ encoding: "utf8" });
+    if (batchSize < 1 || !Number.isInteger(batchSize) ||
+      blockSize < 1 || !Number.isInteger(blockSize) ||
+      minChunkSize < 1 || !Number.isInteger(minChunkSize))
+      throw new Error("batchSize, blockSize and minChunkSize must be positive integers");
+    const sequenceLength = blockSize + 1 // + 1 so that the last input token also has a label
+    // we want each chunk to be at least as big as the block size (each chunk corresponds to a block)
+    // (or even bigger than batch size * block size so that each chunk corresponds to a batch)
+    const chunkTokenSize = batchSize * (sequenceLength)
+    // We read 8 * 8 = 64 bytes per expected token to ensure we have enough tokens
+    // For reference, the GPT-2 tokenizer encodes 3 to 4 bytes per token on average
+    const chunkBitSize = Math.max(minChunkSize, chunkTokenSize * 8 * 8);
+    debug("Setting the chunk size to %o bytes", chunkBitSize)
+    // Create a stream to read the text file chunk by chunk
+    const stream = createReadStream(path, {
+      encoding: "utf8",
+      highWaterMark: chunkBitSize
+    });

-    // `readline` is a bit overkill but seems standard
-    // https://nodejs.org/api/readline.html#example-read-file-stream-line-by-line
-    yield* readline.createInterface({ input, crlfDelay: Infinity });
+    // iterate over the chunks
+    let endOfPreviousChunk = ""
+    let alreadyAppliedPadding = false
+    for await (const chunk of stream) {
+      if (typeof chunk !== 'string') throw new Error('Expected file stream to yield string')
+      debug("Reading chunk of size %o", chunk.length)
+      // tokenize the whole chunk at once
+      // Concatenate with potential leftovers from the previous chunk
+      let tokens = processing.tokenize(tokenizer, endOfPreviousChunk + chunk)
+      if (tokens.size < sequenceLength) {
+        // throw if we would need to apply padding more than once
+        // We can pad if the whole text is smaller than the block size or
+        // if the very last chunk is smaller than the block size
+        if (alreadyAppliedPadding)
+          throw new Error(`the chunk (${tokens.size} tokens) is too small ` +
+            `to get a sequence of length blockSize (${sequenceLength} tokens). ` +
+            `Either the text file or the chunk size (${chunkBitSize} bytes) is too small.`);
+        // otherwise pad up to the sequence length and yield,
+        // as we expect the very last chunk to be potentially smaller than the block size
+        debug("chunk smaller than block size, padding to blockSize")
+        yield processing.tokenize(tokenizer, endOfPreviousChunk + chunk, {
+          padding: true, max_length: sequenceLength
+        })
+        alreadyAppliedPadding = true
+        continue
+      }
debug("batch per chunk: %o", tokens.size / (batchSize * blockSize)) | ||
// yield one block of tokens at a time | ||
while (tokens.size >= sequenceLength) { | ||
yield tokens.take(sequenceLength); | ||
tokens = tokens.slice(blockSize); // only shift by blockSize rather than sequenceLength | ||
} | ||
+      // keep the leftover text for the next chunk
+      // (if this was the last chunk, the leftover is flushed after the loop)
+      if (tokens.size > 0) {
+        // We actually need to decode the tokens to get the leftover text
+        // instead of simply keeping the remaining tokens.
+        // This is because the tokens may be different once prepended to the next chunk
+        // e.g. if the remaining text is ". A" and the next chunk starts with "nother",
+        // the tokenization will be different than if we simply concatenated the remaining tokens
+        endOfPreviousChunk = tokenizer.decode(
+          tokens.toArray(),
+          { skip_special_tokens: true }
+        )
+        debug("End of chunk, remaining text: '%s'", endOfPreviousChunk)
+      } else {
+        // Note that a difference between tokenizing-then-concatenating and
+        // concatenating-then-tokenizing can also arise when there is no
+        // remaining text. We consider this difference negligible
+        endOfPreviousChunk = "";
+      }
+    }
+    if (endOfPreviousChunk.length === 0) return
+
+    // flush the remaining text after the last chunk
+    yield processing.tokenize(tokenizer, endOfPreviousChunk, {
+      padding: true, max_length: sequenceLength
+    })
  });
}
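
The JSDoc above describes how each yielded sequence of `blockSize` + 1 tokens splits into inputs and labels, with consecutive sequences overlapping by one token. Below is a minimal sketch of a consumer doing that split; the model id, the file path, and iterating the `Dataset` with `for await` are assumptions for illustration, not part of the commit.

import { AutoTokenizer } from "@xenova/transformers";

const blockSize = 4;                                                  // illustrative value
const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2"); // assumed model id

// `load` is the loader added in the diff above; the file path is a placeholder.
const dataset = load("data/sample.txt", tokenizer, blockSize);

for await (const sequence of dataset) {        // assumes the Dataset is async-iterable
  const tokens = sequence.toArray();           // blockSize + 1 token ids
  const xs = tokens.slice(0, blockSize);       // inputs: tokens[0 .. blockSize - 1]
  const ys = tokens.slice(1, blockSize + 1);   // labels: tokens[1 .. blockSize]
  // e.g. if the sequence is [t0, t1, t2, t3, t4] then xs = [t0..t3] and ys = [t1..t4];
  // the next yielded sequence starts at t4, overlapping this one by exactly one token.
}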
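
To make the chunk sizing concrete, here is the arithmetic with illustrative values (blockSize = 128, batchSize = 4, the default minChunkSize); the numbers are assumptions for illustration only.

const blockSize = 128, batchSize = 4, minChunkSize = 16384; // illustrative values
const sequenceLength = blockSize + 1;                       // 129 tokens per sequence
const chunkTokenSize = batchSize * sequenceLength;          // 516 tokens per chunk
// mirrors `chunkBitSize` in the diff: 64 bytes of text budgeted per expected token
const chunkSize = Math.max(minChunkSize, chunkTokenSize * 8 * 8); // max(16384, 33024) = 33024
// so each chunk read from the file is roughly 32 KiB of UTF-8 text, a generous margin over
// the ~2 KiB that 516 tokens typically occupy at 3 to 4 bytes per token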
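
The leftover handling decodes trailing tokens back to text rather than carrying the token ids over, because tokenizing across a chunk boundary can differ from tokenizing each side separately. A small sketch of that effect with a GPT-2 style tokenizer; the model id and the example strings are assumptions.

import { AutoTokenizer } from "@xenova/transformers";

const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2"); // assumed model id

// Tokenizing the text as a single string around the boundary...
const joined = tokenizer.encode(". Another chunk");
// ...can differ from tokenizing the two halves separately and concatenating the ids,
// since a token spanning the boundary (e.g. " Another") can never be produced that way.
const split = [...tokenizer.encode(". A"), ...tokenizer.encode("nother chunk")];

console.log(joined, split); // the id sequences (and often their lengths) differ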