Optimize awq kernel in pytorch engine (#2965)
* first

* update threshold

* sem relaxed

* remove check

* Update daily_ete_test.yml

* Update daily_ete_test_v100.yml

* Update benchmark.yml

---------

Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
grimoire and zhulinJulia24 authored Dec 31, 2024
1 parent 9cdf3cf commit bec8f24
Showing 7 changed files with 129 additions and 315 deletions.
.github/workflows/benchmark.yml (2 changes: 0 additions & 2 deletions)
@@ -106,8 +106,6 @@ jobs:
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
 python3 -m pip install /root/packages/flash_attn-*.whl
-python3 -m pip install -e /root/packages/AutoAWQ_kernels
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt
 - name: Install lmdeploy
.github/workflows/daily_ete_test.yml (12 changes: 0 additions & 12 deletions)
@@ -156,8 +156,6 @@ jobs:
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
 python3 -m pip install /root/packages/flash_attn-*.whl
-python3 -m pip install -e /root/packages/AutoAWQ_kernels
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -251,8 +249,6 @@ jobs:
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
 python3 -m pip install /root/packages/flash_attn-*.whl
-python3 -m pip install -e /root/packages/AutoAWQ_kernels
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -361,8 +357,6 @@ jobs:
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
 python3 -m pip install /root/packages/flash_attn-*.whl
-python3 -m pip install -e /root/packages/AutoAWQ_kernels
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -454,8 +448,6 @@ jobs:
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
 python3 -m pip install /root/packages/flash_attn-*.whl
-python3 -m pip install -e /root/packages/AutoAWQ_kernels
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -516,8 +508,6 @@ jobs:
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
 python3 -m pip install /root/packages/flash_attn-*.whl
-python3 -m pip install -e /root/packages/AutoAWQ_kernels
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -580,8 +570,6 @@ jobs:
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
 python3 -m pip install /root/packages/flash_attn-*.whl
-python3 -m pip install -e /root/packages/AutoAWQ_kernels
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
.github/workflows/daily_ete_test_v100.yml (6 changes: 0 additions & 6 deletions)
@@ -158,7 +158,6 @@ jobs:
 run: |
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -244,7 +243,6 @@ jobs:
 run: |
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -343,7 +341,6 @@ jobs:
 run: |
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -434,7 +431,6 @@ jobs:
 run: |
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -494,7 +490,6 @@ jobs:
 run: |
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -555,7 +550,6 @@ jobs:
 run: |
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
lmdeploy/pytorch/backends/cuda/awq_modules.py (11 changes: 1 addition & 10 deletions)
@@ -18,23 +18,14 @@ def wq_gemm_forward(
     out_features=0,
 ):
     """wq gemm forward."""
-    from awq.modules.linear.gemm import awq_ext
-
     from lmdeploy.pytorch.kernels.cuda.awq_kernels import awq_linear
     out_shape = x.shape[:-1] + (out_features, )
     input_dtype = x.dtype
     if input_dtype != torch.float16:
         x = x.half()

-    FP16_MATMUL_HEURISTIC_CONDITION = x.size(0) * x.size(1) >= 64
-
     x = x.flatten(0, -2)
-    if FP16_MATMUL_HEURISTIC_CONDITION:
-        out = awq_linear(x, qweight, scales, qzeros)
-    else:
-        if not x.is_contiguous():
-            x = x.contiguous()
-        out = awq_ext.gemm_forward_cuda(x, qweight, scales, qzeros, 8)
+    out = awq_linear(x, qweight, scales, qzeros)

     out = out + bias if bias is not None else out
     out = out.reshape(out_shape)
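For context, the new forward path calls the Triton-based `awq_linear` kernel for every token count instead of switching between `awq_ext.gemm_forward_cuda` and the Triton kernel on the old `FP16_MATMUL_HEURISTIC_CONDITION`. The sketch below is a hypothetical shape-level smoke test of that call, not part of this commit: the tensor sizes and the packing layout (standard AutoAWQ GEMM packing, eight 4-bit values per int32) are assumptions, and it requires a CUDA device with lmdeploy installed.

```python
# Hypothetical smoke test for the Triton awq_linear path (not from this commit).
# Assumed AutoAWQ GEMM packing:
#   qweight: (in_features, out_features // 8)              int32
#   qzeros : (in_features // group_size, out_features // 8) int32
#   scales : (in_features // group_size, out_features)      float16
import torch

from lmdeploy.pytorch.kernels.cuda.awq_kernels import awq_linear

in_features, out_features, group_size = 4096, 4096, 128

x = torch.randn(8, in_features, dtype=torch.float16, device='cuda')
qweight = torch.randint(0, 2**31 - 1, (in_features, out_features // 8),
                        dtype=torch.int32, device='cuda')
qzeros = torch.randint(0, 2**31 - 1,
                       (in_features // group_size, out_features // 8),
                       dtype=torch.int32, device='cuda')
scales = torch.randn(in_features // group_size, out_features,
                     dtype=torch.float16, device='cuda') * 0.01

# Same call as the simplified wq_gemm_forward: dequantize + GEMM in one kernel.
out = awq_linear(x, qweight, scales, qzeros)
print(out.shape)  # expected: torch.Size([8, 4096])
```

With random packed weights the numerical output is meaningless; the point is only that the single fused-kernel path accepts the same arguments the removed `awq_ext` branch did.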
lmdeploy/pytorch/backends/cuda/op_backend.py (14 changes: 2 additions & 12 deletions)
@@ -51,18 +51,8 @@ def get_layer_impl_builder(cls, layer_type: OpType):
             from .activation import TritonSiluAndMulBuilder
             return TritonSiluAndMulBuilder
         elif layer_type == OpType.LinearW4A16:
-            try:
-                from awq.modules.linear.gemm import awq_ext  # noqa: F401
-                AWQ_INSTALLED = True
-            except Exception:
-                AWQ_INSTALLED = False
-            if AWQ_INSTALLED:
-                from .awq_modules import AwqLinearW4A16Builder
-                return AwqLinearW4A16Builder
-            else:
-                logger.debug(
-                    f'Op {layer_type} fallback to default implementation.')
-                return super().get_layer_impl_builder(layer_type)
+            from .awq_modules import AwqLinearW4A16Builder
+            return AwqLinearW4A16Builder
         elif layer_type == OpType.FusedMoE:
             from .moe import TritonFusedMoEBuilder
             return TritonFusedMoEBuilder
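Because the backend no longer probes for `awq_ext`, requesting the W4A16 builder always resolves to the Triton-backed implementation with no fallback branch. A minimal illustration follows; the class name `CudaOpsBackend` and the `lmdeploy.pytorch.backends.base` location of `OpType` are assumptions about the surrounding code base, not something shown in this diff.

```python
# Illustrative only; the imported names below are assumed, not confirmed by this commit.
from lmdeploy.pytorch.backends.base import OpType                      # assumed module
from lmdeploy.pytorch.backends.cuda.op_backend import CudaOpsBackend   # assumed class name

# After this change the W4A16 branch returns the Triton-backed builder
# unconditionally; there is no awq_ext import probe or default fallback left.
builder = CudaOpsBackend.get_layer_impl_builder(OpType.LinearW4A16)
print(builder.__name__)  # AwqLinearW4A16Builder
```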
lmdeploy/pytorch/check_env/model.py (30 changes: 0 additions & 30 deletions)
@@ -72,33 +72,6 @@ def check_dtype(self, config):
                 'Please send issue to LMDeploy with error logs.')
             self.log_and_exit(e, 'Model', message=message)

-    def check_awq(self, config):
-        """check awq."""
-        logger = self.get_logger()
-        device_type = self.device_type
-        if device_type != 'cuda':
-            return
-
-        quantization_config = getattr(config, 'quantization_config', dict())
-        quant_method = quantization_config.get('quant_method', None)
-        if quant_method != 'awq':
-            return
-        try:
-            import awq  # noqa
-        except Exception as e:
-            self.log_and_exit(e, 'autoawq', logger)
-
-        try:
-            import awq_ext  # noqa
-        except Exception as e:
-            logger.debug('Exception:', exc_info=1)
-            self.log_and_exit(
-                e,
-                'awq_ext',
-                message='Failed to import `awq_ext`. '
-                'Try reinstall it from source: '
-                'https://github.com/casper-hansen/AutoAWQ_kernels')
-
     def check(self):
         """check."""
         import transformers
@@ -112,6 +85,3 @@ def check(self):

         # dtype check
         self.check_dtype(config)
-
-        # awq
-        self.check_awq(config)