Optimize awq kernel in pytorch engine (#2965)
* first

* update threshold

* sem relaxed

* remove check

* Update daily_ete_test.yml

* Update daily_ete_test_v100.yml

* Update benchmark.yml

---------

Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
grimoire and zhulinJulia24 authored Dec 31, 2024
1 parent 9cdf3cf commit bec8f24
Showing 7 changed files with 129 additions and 315 deletions.
.github/workflows/benchmark.yml (2 changes: 0 additions & 2 deletions)
@@ -106,8 +106,6 @@ jobs:
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
 python3 -m pip install /root/packages/flash_attn-*.whl
-python3 -m pip install -e /root/packages/AutoAWQ_kernels
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt
 - name: Install lmdeploy
.github/workflows/daily_ete_test.yml (12 changes: 0 additions & 12 deletions)
@@ -156,8 +156,6 @@ jobs:
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
 python3 -m pip install /root/packages/flash_attn-*.whl
-python3 -m pip install -e /root/packages/AutoAWQ_kernels
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -251,8 +249,6 @@ jobs:
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
 python3 -m pip install /root/packages/flash_attn-*.whl
-python3 -m pip install -e /root/packages/AutoAWQ_kernels
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -361,8 +357,6 @@ jobs:
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
 python3 -m pip install /root/packages/flash_attn-*.whl
-python3 -m pip install -e /root/packages/AutoAWQ_kernels
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -454,8 +448,6 @@ jobs:
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
 python3 -m pip install /root/packages/flash_attn-*.whl
-python3 -m pip install -e /root/packages/AutoAWQ_kernels
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -516,8 +508,6 @@ jobs:
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
 python3 -m pip install /root/packages/flash_attn-*.whl
-python3 -m pip install -e /root/packages/AutoAWQ_kernels
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -580,8 +570,6 @@ jobs:
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
 python3 -m pip install /root/packages/flash_attn-*.whl
-python3 -m pip install -e /root/packages/AutoAWQ_kernels
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
.github/workflows/daily_ete_test_v100.yml (6 changes: 0 additions & 6 deletions)
@@ -158,7 +158,6 @@ jobs:
 run: |
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -244,7 +243,6 @@ jobs:
 run: |
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -343,7 +341,6 @@ jobs:
 run: |
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -434,7 +431,6 @@ jobs:
 run: |
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -494,7 +490,6 @@ jobs:
 run: |
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
@@ -555,7 +550,6 @@ jobs:
 run: |
 # manually install flash attn
 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
-python3 -m pip install /root/packages/autoawq-*.whl --no-deps
 python3 -m pip install /root/packages/xformers-*.whl --no-deps
 python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
 - name: Install lmdeploy
lmdeploy/pytorch/backends/cuda/awq_modules.py (11 changes: 1 addition & 10 deletions)
@@ -18,23 +18,14 @@ def wq_gemm_forward(
     out_features=0,
 ):
     """wq gemm forward."""
-    from awq.modules.linear.gemm import awq_ext
-
     from lmdeploy.pytorch.kernels.cuda.awq_kernels import awq_linear
     out_shape = x.shape[:-1] + (out_features, )
     input_dtype = x.dtype
     if input_dtype != torch.float16:
         x = x.half()

-    FP16_MATMUL_HEURISTIC_CONDITION = x.size(0) * x.size(1) >= 64
-
     x = x.flatten(0, -2)
-    if FP16_MATMUL_HEURISTIC_CONDITION:
-        out = awq_linear(x, qweight, scales, qzeros)
-    else:
-        if not x.is_contiguous():
-            x = x.contiguous()
-        out = awq_ext.gemm_forward_cuda(x, qweight, scales, qzeros, 8)
+    out = awq_linear(x, qweight, scales, qzeros)

     out = out + bias if bias is not None else out
     out = out.reshape(out_shape)
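For context, the new forward path calls the Triton-based `awq_linear` kernel for every token count instead of switching between `awq_ext.gemm_forward_cuda` and the Triton kernel on the old `FP16_MATMUL_HEURISTIC_CONDITION`. The sketch below is a hypothetical shape-level smoke test of that call, not part of this commit: the tensor sizes and the packing layout (standard AutoAWQ GEMM packing, eight 4-bit values per int32) are assumptions, and it requires a CUDA device with lmdeploy installed.

```python
# Hypothetical smoke test for the Triton awq_linear path (not from this commit).
# Assumed AutoAWQ GEMM packing:
#   qweight: (in_features, out_features // 8)              int32
#   qzeros : (in_features // group_size, out_features // 8) int32
#   scales : (in_features // group_size, out_features)      float16
import torch

from lmdeploy.pytorch.kernels.cuda.awq_kernels import awq_linear

in_features, out_features, group_size = 4096, 4096, 128

x = torch.randn(8, in_features, dtype=torch.float16, device='cuda')
qweight = torch.randint(0, 2**31 - 1, (in_features, out_features // 8),
                        dtype=torch.int32, device='cuda')
qzeros = torch.randint(0, 2**31 - 1,
                       (in_features // group_size, out_features // 8),
                       dtype=torch.int32, device='cuda')
scales = torch.randn(in_features // group_size, out_features,
                     dtype=torch.float16, device='cuda') * 0.01

# Same call as the simplified wq_gemm_forward: dequantize + GEMM in one kernel.
out = awq_linear(x, qweight, scales, qzeros)
print(out.shape)  # expected: torch.Size([8, 4096])
```

With random packed weights the numerical output is meaningless; the point is only that the single fused-kernel path accepts the same arguments the removed `awq_ext` branch did.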
lmdeploy/pytorch/backends/cuda/op_backend.py (14 changes: 2 additions & 12 deletions)
@@ -51,18 +51,8 @@ def get_layer_impl_builder(cls, layer_type: OpType):
             from .activation import TritonSiluAndMulBuilder
             return TritonSiluAndMulBuilder
         elif layer_type == OpType.LinearW4A16:
-            try:
-                from awq.modules.linear.gemm import awq_ext  # noqa: F401
-                AWQ_INSTALLED = True
-            except Exception:
-                AWQ_INSTALLED = False
-            if AWQ_INSTALLED:
-                from .awq_modules import AwqLinearW4A16Builder
-                return AwqLinearW4A16Builder
-            else:
-                logger.debug(
-                    f'Op {layer_type} fallback to default implementation.')
-                return super().get_layer_impl_builder(layer_type)
+            from .awq_modules import AwqLinearW4A16Builder
+            return AwqLinearW4A16Builder
         elif layer_type == OpType.FusedMoE:
             from .moe import TritonFusedMoEBuilder
             return TritonFusedMoEBuilder
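Because the backend no longer probes for `awq_ext`, requesting the W4A16 builder always resolves to the Triton-backed implementation with no fallback branch. A minimal illustration follows; the class name `CudaOpsBackend` and the `lmdeploy.pytorch.backends.base` location of `OpType` are assumptions about the surrounding code base, not something shown in this diff.

```python
# Illustrative only; the imported names below are assumed, not confirmed by this commit.
from lmdeploy.pytorch.backends.base import OpType                      # assumed module
from lmdeploy.pytorch.backends.cuda.op_backend import CudaOpsBackend   # assumed class name

# After this change the W4A16 branch returns the Triton-backed builder
# unconditionally; there is no awq_ext import probe or default fallback left.
builder = CudaOpsBackend.get_layer_impl_builder(OpType.LinearW4A16)
print(builder.__name__)  # AwqLinearW4A16Builder
```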
lmdeploy/pytorch/check_env/model.py (30 changes: 0 additions & 30 deletions)
@@ -72,33 +72,6 @@ def check_dtype(self, config):
                 'Please send issue to LMDeploy with error logs.')
             self.log_and_exit(e, 'Model', message=message)

-    def check_awq(self, config):
-        """check awq."""
-        logger = self.get_logger()
-        device_type = self.device_type
-        if device_type != 'cuda':
-            return
-
-        quantization_config = getattr(config, 'quantization_config', dict())
-        quant_method = quantization_config.get('quant_method', None)
-        if quant_method != 'awq':
-            return
-        try:
-            import awq  # noqa
-        except Exception as e:
-            self.log_and_exit(e, 'autoawq', logger)
-
-        try:
-            import awq_ext  # noqa
-        except Exception as e:
-            logger.debug('Exception:', exc_info=1)
-            self.log_and_exit(
-                e,
-                'awq_ext',
-                message='Failed to import `awq_ext`. '
-                'Try reinstall it from source: '
-                'https://github.com/casper-hansen/AutoAWQ_kernels')
-
     def check(self):
         """check."""
         import transformers
@@ -112,6 +85,3 @@ def check(self):

         # dtype check
         self.check_dtype(config)
-
-        # awq
-        self.check_awq(config)