discojs/src/models/gpt: add seed, loss is now identical between runs
JulienVig committed Oct 16, 2024
1 parent 91e500d commit 75a5269
Showing 2 changed files with 28 additions and 14 deletions.
2 changes: 2 additions & 0 deletions discojs/src/models/gpt/config.ts
@@ -26,6 +26,7 @@ export interface GPTConfig {
nLayer?: number
nHead?: number
nEmbd?: number
+ seed?: number,
}
// for a benchmark of performance, see https://github.com/epfml/disco/pull/659
export const DefaultGPTConfig: Required<GPTConfig> = {
@@ -47,6 +48,7 @@ export const DefaultGPTConfig: Required<GPTConfig> = {
nLayer: 3,
nHead: 3,
nEmbd: 48,
+ seed: 42,
}
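
For context, a minimal usage sketch of the new field (not part of the commit; spreading DefaultGPTConfig into a user config is an assumed pattern):

// Sketch: override the new `seed` field while keeping the documented defaults.
// Any fixed value works; two runs with the same seed start from identical weights.
const reproducibleConfig: Required<GPTConfig> = {
  ...DefaultGPTConfig,
  seed: 1234,
}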

export type ModelSize = {
40 changes: 26 additions & 14 deletions discojs/src/models/gpt/layers.ts
@@ -61,7 +61,7 @@ tf.serialization.registerClass(LogLayer)

type CausalSelfAttentionConfig =
ConstructorParameters<typeof tf.layers.Layer>[0]
- & Record<'blockSize' | 'nHead' | 'nEmbd' | 'dropout' | 'nLayer', number>
+ & Record<'blockSize' | 'nHead' | 'nEmbd' | 'dropout' | 'nLayer' | 'seed', number>

class CausalSelfAttention extends tf.layers.Layer {
static readonly className = 'CausalSelfAttention'
@@ -70,6 +70,7 @@ class CausalSelfAttention extends tf.layers.Layer {
private readonly nEmbd: number
private readonly nLayer: number
private readonly dropout: number
+ private readonly seed: number
private readonly mask: tf.Tensor2D
cAttnKernel?: tf.LayerVariable
cAttnBias?: tf.LayerVariable
@@ -85,6 +86,7 @@ class CausalSelfAttention extends tf.layers.Layer {
this.nHead = config.nHead
this.nLayer = config.nLayer
this.dropout = config.dropout
+ this.seed = config.seed

// the mask is a lower triangular matrix filled with 1s
// calling bandPart zeroes out the upper triangular part of the all-ones matrix
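
To illustrate the comment above, a standalone sketch of the masking call (the 3x3 size is an assumed toy value; the real code uses blockSize):

// tf.linalg.bandPart(ones, -1, 0) keeps all sub-diagonals (-1 means all)
// and zero super-diagonals, so only the lower triangle survives.
const mask = tf.linalg.bandPart(tf.ones([3, 3]), -1, 0)
// mask.print() =>
// [[1, 0, 0],
//  [1, 1, 0],
//  [1, 1, 1]]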
@@ -98,7 +100,7 @@ class CausalSelfAttention extends tf.layers.Layer {
'c_attn.weight',
[this.nEmbd, 3 * this.nEmbd],
'float32',
- tf.initializers.randomNormal({ mean:0, stddev:0.02 }) // use same init as GPT2
+ tf.initializers.randomNormal({ mean:0, stddev:0.02, seed: this.seed }) // use same init as GPT2
)
this.cAttnBias = this.addWeight(
'c_attn.bias',
@@ -116,7 +118,9 @@ class CausalSelfAttention extends tf.layers.Layer {
// Sources:
// https://github.com/karpathy/build-nanogpt/blob/6104ab1b53920f6e2159749676073ff7d815c1fa/train_gpt2.py#L103
// https://youtu.be/l8pRSuU81PU?si=5GcKfi_kPgLgvtg2&t=4640
- tf.initializers.randomNormal({ mean:0, stddev: 0.02 * Math.sqrt(2 * this.nLayer) })
+ tf.initializers.randomNormal({
+   mean: 0, stddev: 0.02 * Math.sqrt(2 * this.nLayer), seed: this.seed
+ })
)
this.cProjBias = this.addWeight(
'c_proj.bias',
@@ -194,14 +198,14 @@ class CausalSelfAttention extends tf.layers.Layer {
// (attention weights of past tokens). The probability distribution ensures
// that the attention weights of past tokens for a particular token sum to one
att = tf.softmax(att, -1)
- att = kwargs.training === true ? tf.dropout(att, this.dropout) : att
+ att = kwargs.training === true ? tf.dropout(att, this.dropout, undefined, this.seed) : att

// This is where the (attention-)weighted sum of past values is performed
let y = tf.matMul(att, v) // (B, nHead, T, T) x (B, nHead, T, hs) -> (B, nHead, T, hs)
y = tf.transpose(y, [0, 2, 1, 3]) // (B, T, nHead, hs)
y = tf.reshape(y, [B, T, C]) // (B, T, C = nHead * hs)
y = dense(y, this.cProjKernel, this.cProjBias) // output projection (B, T, C)
- y = kwargs.training === true ? tf.dropout(y, this.dropout) : y
+ y = kwargs.training === true ? tf.dropout(y, this.dropout, undefined, this.seed) : y
return y
})
}
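
The two seeded tf.dropout calls above remove the last source of run-to-run noise inside the attention block; a standalone sketch with assumed values:

// With a fixed seed, tf.dropout drops the same units on every call.
const activations = tf.ones([4])
const d1 = tf.dropout(activations, 0.5, undefined, 42)
const d2 = tf.dropout(activations, 0.5, undefined, 42)
// d1 and d2 are element-wise equal, so repeated runs see the same masks.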
@@ -251,7 +255,7 @@ class GELU extends tf.layers.Layer {
tf.serialization.registerClass(GELU)

type MLPConfig = ConstructorParameters<typeof tf.layers.Layer>[0] &
- Required<ModelSize> & Record<'blockSize' | 'residDrop' | 'nLayer', number>
+ Required<ModelSize> & Record<'blockSize' | 'residDrop' | 'nLayer' | 'seed', number>

function MLP(config: MLPConfig): tf.LayersModel {
return tf.sequential({ layers: [
@@ -260,7 +264,9 @@ function MLP(config: MLPConfig): tf.LayersModel {
units: 4 * config.nEmbd,
inputDim: config.nEmbd,
inputShape: [config.blockSize, config.nEmbd],
- kernelInitializer: tf.initializers.randomNormal({ mean: 0, stddev: 0.02 }),
+ kernelInitializer: tf.initializers.randomNormal({
+   mean: 0, stddev: 0.02, seed: config.seed
+ }),
}),
new GELU(),
tf.layers.dense({
@@ -269,12 +275,13 @@ function MLP(config: MLPConfig): tf.LayersModel {
inputDim: 4 * config.nEmbd,
inputShape: [config.blockSize, 4 * config.nEmbd],
kernelInitializer: tf.initializers.randomNormal({
- mean: 0, stddev: 0.02 * Math.sqrt(2 * config.nLayer)
+ mean: 0, stddev: 0.02 * Math.sqrt(2 * config.nLayer), seed: config.seed
}),
}),
tf.layers.dropout({
name: config.name + '.mlp.drop',
- rate: config.residDrop
+ rate: config.residDrop,
+ seed: config.seed
}),
]})
}
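
For illustration, a hypothetical construction of this block (the concrete field values are assumptions, and ModelSize is taken to carry nLayer, nHead and nEmbd):

// Sketch: two MLP blocks built from the same seeded config start from
// identical kernels, since every initializer above receives config.seed.
const mlpConfig: MLPConfig = {
  name: 'block0', nLayer: 3, nHead: 3, nEmbd: 48,
  blockSize: 128, residDrop: 0.2, seed: 42,
}
const mlp = MLP(mlpConfig)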
@@ -354,15 +361,16 @@ class LMEmbedding extends tf.layers.Layer {
static readonly className = 'LMEmbedding'
embeddings?: tf.LayerVariable

- constructor(private readonly vocabSize: number, private readonly nEmbd: number) {
+ constructor(private readonly vocabSize: number,
+   private readonly nEmbd: number, private readonly seed: number) {
super({})
}
override build(): void {
this.embeddings = this.addWeight(
'wte', // use same name as GPT2
[this.vocabSize, this.nEmbd],
'float32',
- tf.initializers.randomNormal({ mean:0, stddev:0.02 })
+ tf.initializers.randomNormal({ mean:0, stddev:0.02, seed: this.seed })
)
}
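
A small sketch (not from the commit) of why seeding these initializers is enough: a seeded randomNormal initializer is deterministic, so a rebuilt layer reproduces the same starting weights.

// Two initializers constructed with the same seed yield equal tensors.
const initA = tf.initializers.randomNormal({ mean: 0, stddev: 0.02, seed: 42 })
const initB = tf.initializers.randomNormal({ mean: 0, stddev: 0.02, seed: 42 })
const wA = initA.apply([2, 2]) as tf.Tensor
const wB = initB.apply([2, 2]) as tf.Tensor
// wA and wB are element-wise equal, which pins the model's starting loss.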

@@ -431,7 +439,7 @@ export function GPTArchitecture(config: Required<GPTConfig>): tf.LayersModel {
const inputs = tf.input({ shape: [null] })

// token embedding
- const wte = new LMEmbedding(config.vocabSize, config.nEmbd)
+ const wte = new LMEmbedding(config.vocabSize, config.nEmbd, config.seed)
let tokEmb = wte.apply(inputs) as tf.SymbolicTensor // (batch_size, input length T, nEmbd)

if (config.debug) {
@@ -443,7 +451,9 @@ export function GPTArchitecture(config: Required<GPTConfig>): tf.LayersModel {
name: config.name + '.wpe',
inputDim: config.blockSize,
outputDim: config.nEmbd,
- embeddingsInitializer: tf.initializers.randomNormal({ mean: 0, stddev: 0.02 }),
+ embeddingsInitializer: tf.initializers.randomNormal({
+   mean: 0, stddev: 0.02, seed: config.seed
+ }),
}).apply(range) as tf.SymbolicTensor

if (config.debug) {
@@ -453,7 +463,9 @@ export function GPTArchitecture(config: Required<GPTConfig>): tf.LayersModel {
// token and positional embeddings are added together
let x = tf.layers.add().apply([tokEmb, posEmb])
// dropout
- x = tf.layers.dropout({name: 'drop', rate: config.embdDrop}).apply(x)
+ x = tf.layers.dropout({
+   name: 'drop', rate: config.embdDrop, seed: config.seed
+ }).apply(x)
if (config.debug) {
x = new LogLayer({ name: 'drop_log' }).apply(x)
}
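
Putting it together, a closing sketch (not part of the diff) of the property the commit message claims: with a fixed seed in the config, two freshly built models start from identical weights, so their losses match across runs given the same data order.

// Identical seeds lead to identical initial weights and an identical first-step loss.
const modelA = GPTArchitecture({ ...DefaultGPTConfig, seed: 7 })
const modelB = GPTArchitecture({ ...DefaultGPTConfig, seed: 7 })
// modelA.getWeights() and modelB.getWeights() are element-wise equal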
