
Commit

(m)t5 lora parallel
calpt committed Dec 21, 2024
1 parent 18f9053 commit 4e40256
Showing 2 changed files with 24 additions and 6 deletions.
15 changes: 12 additions & 3 deletions src/adapters/models/mt5/modeling_mt5.py
@@ -77,7 +77,12 @@ def forward(
         is_cross_attention = key_value_states is not None
 
         query_states = self.q(hidden_states)
-        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+        # >>> START AH Changes <<<
+        # adapt bsz for lora parallel
+        query_states = query_states.view(query_states.shape[0], -1, self.n_heads, self.key_value_proj_dim).transpose(
+            1, 2
+        )
+        # >>> END AH Changes <<<
 
         if past_key_value is not None:
             is_updated = past_key_value.is_updated.get(self.layer_idx)
@@ -95,8 +100,12 @@ def forward(
         else:
             key_states = self.k(current_states)
             value_states = self.v(current_states)
-            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
-            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+            key_states = key_states.view(key_states.shape[0], -1, self.n_heads, self.key_value_proj_dim).transpose(
+                1, 2
+            )
+            value_states = value_states.view(
+                value_states.shape[0], -1, self.n_heads, self.key_value_proj_dim
+            ).transpose(1, 2)
 
             if past_key_value is not None:
                 # save all key/value_states to cache to be re-used for fast auto-regressive generation
15 changes: 12 additions & 3 deletions src/adapters/models/t5/modeling_t5.py
@@ -77,7 +77,12 @@ def forward(
         is_cross_attention = key_value_states is not None
 
         query_states = self.q(hidden_states)
-        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+        # >>> START AH Changes <<<
+        # adapt bsz for lora parallel
+        query_states = query_states.view(query_states.shape[0], -1, self.n_heads, self.key_value_proj_dim).transpose(
+            1, 2
+        )
+        # >>> END AH Changes <<<
 
         if past_key_value is not None:
             is_updated = past_key_value.is_updated.get(self.layer_idx)
@@ -95,8 +100,12 @@ def forward(
         else:
             key_states = self.k(current_states)
             value_states = self.v(current_states)
-            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
-            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+            key_states = key_states.view(key_states.shape[0], -1, self.n_heads, self.key_value_proj_dim).transpose(
+                1, 2
+            )
+            value_states = value_states.view(
+                value_states.shape[0], -1, self.n_heads, self.key_value_proj_dim
+            ).transpose(1, 2)
 
             if past_key_value is not None:
                 # save all key/value_states to cache to be re-used for fast auto-regressive generation
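Both files apply the same fix: T5's attention captures batch_size from hidden_states near the top of forward, but when LoRA adapters are composed with Parallel, the wrapped q/k/v projections replicate their input along the batch axis, so the projected tensors' first dimension no longer matches the captured batch_size. Reshaping with the stale value does not raise an error, because the element count still divides evenly; it silently folds the parallel copies into the sequence axis. The sketch below illustrates the shape arithmetic with toy dimensions; q_with_parallel and all sizes are hypothetical stand-ins, not the library's API.

import torch

batch_size, seq_len = 2, 5
n_heads, key_value_proj_dim = 2, 4
inner_dim = n_heads * key_value_proj_dim  # 8
n_parallel = 3  # e.g. three adapters composed via Parallel(...)

hidden_states = torch.randn(batch_size, seq_len, inner_dim)

def q_with_parallel(x):
    # Hypothetical stand-in for self.q(...) while a Parallel composition is active:
    # the LoRA-wrapped projection repeats the batch dimension once per parallel channel.
    return x.repeat(n_parallel, 1, 1)

query_states = q_with_parallel(hidden_states)
print(query_states.shape)  # torch.Size([6, 5, 8]) -- first dim is n_parallel * batch_size

# Old reshape: the stale batch_size still divides the element count, so no error is
# raised, but the parallel copies leak into the sequence axis (length 15 instead of 5).
old = query_states.view(batch_size, -1, n_heads, key_value_proj_dim).transpose(1, 2)
print(old.shape)  # torch.Size([2, 2, 15, 4])

# New reshape: take the batch dimension from the tensor itself, as the commit does.
new = query_states.view(query_states.shape[0], -1, n_heads, key_value_proj_dim).transpose(1, 2)
print(new.shape)  # torch.Size([6, 2, 5, 4])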
