Pull request #442: Make repetition penalty faster
Base branch: habana_main
Changes from all commits: 15e5d79, 0004cc5, 6bb5cb9, ec38d67, 5016cc4
vllm/utils.py

```diff
@@ -6,6 +6,7 @@
 import gc
 import inspect
 import ipaddress
+import math
 import os
 import socket
 import subprocess
```
```diff
@@ -752,9 +753,6 @@
     elif current_platform.is_neuron():
         print_warning_once("Pin memory is not supported on Neuron.")
         return False
-    elif current_platform.is_hpu():
-        print_warning_once("Pin memory is not supported on HPU.")
-        return False
     elif current_platform.is_cpu() or current_platform.is_openvino():
         return False
     return True
```
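For context, deleting the HPU branch means the pin-memory availability check now falls through to `return True` on Gaudi. Below is a minimal sketch of the resulting decision logic, using plain strings instead of vLLM's `current_platform` object (the function name and string-based dispatch are illustrative, not vLLM's API):

```python
def is_pin_memory_available_sketch(platform: str) -> bool:
    # "neuron", "cpu" and "openvino" cannot use pinned host memory;
    # every other platform (cuda, rocm, and now hpu) is assumed to
    # support it after this change.
    if platform == "neuron":
        return False
    if platform in ("cpu", "openvino"):
        return False
    return True


assert is_pin_memory_available_sketch("hpu") is True
assert is_pin_memory_available_sketch("cpu") is False
```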
```diff
@@ -812,6 +810,29 @@
     return padded_x


+def make_ndarray_with_pad_align(
+    x: List[List[T]],
+    pad: T,
+    dtype: npt.DTypeLike,
+    *,
+    max_len_align: Optional[int] = None,
+) -> npt.NDArray:
+    """
+    Make a padded array from 2D inputs.
+
+    The padding is applied to the end of each inner list until it reaches
+    `max_len`, rounded up to a multiple of `max_len_align`.
+    """
+    # Unlike for most functions, map is faster than a genexpr over `len`
+    max_len = max(map(len, x), default=0)
+    max_len_aligned = math.ceil(max_len / max_len_align) * max_len_align
+
+    padded_x = np.full((len(x), max_len_aligned), pad, dtype=dtype)
+
+    for ind, blocktb in enumerate(x):
+        assert len(blocktb) <= max_len_aligned
+        padded_x[ind, :len(blocktb)] = blocktb
+
+    return padded_x


 def make_tensor_with_pad(
     x: List[List[T]],
```

CI note: mypy on Python 3.9 through 3.12 flags the `math.ceil(max_len / max_len_align)` line (vllm/utils.py:828), because `max_len_align` is typed `Optional[int]` and may still be `None` when the arithmetic runs.
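To make the new helper's behavior concrete, here is a self-contained sketch of the same aligned-padding idea. It makes `max_len_align` a required `int` (which also avoids the `Optional[int]` arithmetic that mypy flags above); the function name and the fixed `int64` dtype are illustrative choices, not the PR's exact code:

```python
import math
from typing import List

import numpy as np


def ndarray_with_pad_align_sketch(x: List[List[int]], pad: int,
                                  max_len_align: int) -> np.ndarray:
    """Pad each row to the longest row length, rounded up to the next
    multiple of `max_len_align`."""
    max_len = max(map(len, x), default=0)
    max_len_aligned = math.ceil(max_len / max_len_align) * max_len_align
    padded = np.full((len(x), max_len_aligned), pad, dtype=np.int64)
    for i, row in enumerate(x):
        padded[i, :len(row)] = row
    return padded


# Rows of length 3 and 5 both pad out to 8 columns: 5 rounds up to the
# next multiple of 4.
out = ndarray_with_pad_align_sketch([[1, 2, 3], [4, 5, 6, 7, 8]],
                                    pad=0, max_len_align=4)
assert out.shape == (2, 8)
assert list(out[0]) == [1, 2, 3, 0, 0, 0, 0, 0]
```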
```diff
@@ -833,10 +854,37 @@

     tensor = torch.from_numpy(padded_x).to(device)
     if pin_memory:
-        tensor = tensor.pin_memory()
+        if not current_platform.is_hpu():
+            tensor = tensor.pin_memory()
+        else:
+            tensor = tensor.pin_memory("hpu")

     return tensor


+def make_tensor_with_pad_align(
+    x: List[List[T]],
+    pad: T,
+    dtype: torch.dtype,
+    *,
+    max_len_align: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    pin_memory: bool = False,
+) -> torch.Tensor:
+    """
+    Make a padded tensor from 2D inputs.
+
+    The padding is applied to the end of each inner list until it reaches
+    `max_len`, rounded up to a multiple of `max_len_align`.
+    """
+    np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype]
+    padded_x = make_ndarray_with_pad_align(x, pad, np_dtype,
+                                           max_len_align=max_len_align)
+
+    tensor = torch.from_numpy(padded_x).to(device)
+    if pin_memory:
+        tensor = tensor.pin_memory("hpu")
+
+    return tensor


 def async_tensor_h2d(
     data: list,
```

Review thread on the `if pin_memory:` check in `make_tensor_with_pad_align`:

Reviewer: The `if` is not needed, since this method is only called for HPU.

Author: Hi Michal, I removed the device-check logic here and kept the `pin_memory` check. This way, the method behaves exactly the same as the un-aligned version.
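The point of the aligned variant is shape stability: padding every batch up to a multiple of a fixed alignment means the repetition-penalty kernel sees far fewer distinct tensor shapes, which matters on HPU, where new shapes can trigger graph recompilation. A hypothetical call-site sketch follows; the function name, the alignment value of 32, and the pad id are assumptions for illustration, not taken from the PR:

```python
import torch


def pad_token_lists(token_lists, pad_id=0, align=32):
    # Pad every sequence to the batch maximum, rounded up to a multiple
    # of `align`, so batches with similar lengths share one shape.
    max_len = max((len(t) for t in token_lists), default=0)
    aligned = -(-max_len // align) * align  # ceiling division
    out = torch.full((len(token_lists), aligned), pad_id, dtype=torch.long)
    for i, toks in enumerate(token_lists):
        out[i, :len(toks)] = torch.tensor(toks, dtype=torch.long)
    return out


prompts = [[11, 42, 7], [5] * 40]
padded = pad_token_lists(prompts)
assert padded.shape == (2, 64)  # 40 rounds up to 64 with align=32
```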
```diff
@@ -845,7 +893,13 @@
     pin_memory: bool,
 ) -> torch.Tensor:
     """Asynchronously create a tensor and copy it from host to device."""
-    t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu")
+    t = torch.tensor(data, dtype=dtype, device="cpu")
+    if pin_memory:
+        if not current_platform.is_hpu():
+            t = t.pin_memory()  # pin_memory() is not in-place; keep the result
+        else:
+            t = t.pin_memory(device="hpu")

     return t.to(device=target_device, non_blocking=True)
```
Review thread on the HPU pin-memory branch in `make_tensor_with_pad`:

Reviewer: This can be removed, as it won't be called now.

Author: Hi Michal, `make_tensor_with_pad` is still called from several other places; it is only replaced by `make_tensor_with_pad_align` in the repetition-penalty path. So I think we still need the check here.
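Finally, a generic sketch of the host-to-device pattern `async_tensor_h2d` relies on: stage the tensor on CPU, pin it if possible, then issue a non-blocking copy. This version targets CUDA as the commonly documented case (the PR routes HPU through `tensor.pin_memory("hpu")` instead), and it keeps the return value of `pin_memory()`, which is not an in-place operation:

```python
import torch


def async_h2d_sketch(data, dtype, target_device, pin_memory):
    # Stage on host first; pinned (page-locked) memory lets the copy
    # engine transfer asynchronously with respect to the host.
    t = torch.tensor(data, dtype=dtype, device="cpu")
    if pin_memory and torch.cuda.is_available():
        t = t.pin_memory()  # returns a new pinned tensor
    return t.to(device=target_device, non_blocking=True)


device = "cuda" if torch.cuda.is_available() else "cpu"
out = async_h2d_sketch([1, 2, 3], torch.long, device, pin_memory=True)
```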