diff --git a/CMakeLists.txt b/CMakeLists.txt
index c32b39513..53b5c7eee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -268,4 +268,4 @@
 if(APHRODITE_GPU_LANG STREQUAL "CUDA" OR APHRODITE_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling moe extension.")
   add_dependencies(default _moe_C)
-endif()
\ No newline at end of file
+endif()
diff --git a/amdpatch.sh b/amdpatch.sh
new file mode 100755
index 000000000..c330e26df
--- /dev/null
+++ b/amdpatch.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+ROCM_PATH=$(hipconfig --rocmpath)
+
+sudo patch $ROCM_PATH/lib/llvm/lib/clang/18/include/__clang_hip_cmath.h ./patches/amd.patch
\ No newline at end of file
diff --git a/aphrodite/quantization/gptq_marlin.py b/aphrodite/quantization/gptq_marlin.py
index ad7685ef1..a277d421a 100644
--- a/aphrodite/quantization/gptq_marlin.py
+++ b/aphrodite/quantization/gptq_marlin.py
@@ -19,6 +19,7 @@
     marlin_repeat_scales_on_all_ranks, marlin_sort_g_idx, replace_tensor,
     verify_marlin_supported, verify_marlin_supports_shape)
 from aphrodite.scalar_type import scalar_types
+from aphrodite.common.utils import is_hip
 
 
 class GPTQMarlinConfig(QuantizationConfig):
@@ -93,6 +94,9 @@ def override_quantization_method(cls, hf_quant_cfg,
         is_valid_user_quant = (user_quant is None or user_quant == "marlin"
                                or user_quant == "gptq_marlin")
 
+        if is_hip():
+            return None
+
         if can_convert and is_valid_user_quant:
             msg = ("The model is convertible to {} during runtime."
                    " Using {} kernel.".format(cls.get_name(), cls.get_name()))
@@ -105,6 +109,7 @@
                     " so forcing gptq. Use quantization=gptq_marlin for"
                     " faster inference")
             return None
+
 
     def get_quant_method(self, layer: torch.nn.Module,
                          prefix: str) -> Optional["GPTQMarlinLinearMethod"]:
diff --git a/docs/pages/installation/installation-rocm.md b/docs/pages/installation/installation-rocm.md
index d88fb85ef..e0e3366ac 100644
--- a/docs/pages/installation/installation-rocm.md
+++ b/docs/pages/installation/installation-rocm.md
@@ -72,6 +72,8 @@ Finally, build Aphrodite:
 
 git clone https://github.com/PygmalionAI/aphrodite-engine.git
 cd aphrodite-engine
+chmod +x ./amdpatch.sh
+./amdpatch.sh
 pip install -U -r requirements-rocm.txt
 python setup.py develop # pip install -e . won't work for now
 ```
diff --git a/kernels/ops.h b/kernels/ops.h
index 08be7ea8f..5e5deb983 100644
--- a/kernels/ops.h
+++ b/kernels/ops.h
@@ -62,29 +62,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           torch::Tensor expert_ids,
                           torch::Tensor num_tokens_post_pad);
 
-std::vector<torch::Tensor> selective_scan_fwd(
-    const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A,
-    const torch::Tensor& B, const torch::Tensor& C,
-    const c10::optional<torch::Tensor>& D_,
-    const c10::optional<torch::Tensor>& z_,
-    const c10::optional<torch::Tensor>& delta_bias_, bool delta_softplus,
-    const c10::optional<torch::Tensor>& index_,
-    const c10::optional<torch::Tensor>& x);
-
-at::Tensor causal_conv1d_update(const at::Tensor& x,
-                                const at::Tensor& conv_state,
-                                const at::Tensor& weight,
-                                const c10::optional<at::Tensor>& bias_,
-                                bool silu_activation);
-
-at::Tensor causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
-                             const c10::optional<at::Tensor>& bias_,
-                             const c10::optional<at::Tensor>& seq_idx_,
-                             const c10::optional<at::Tensor>& seq_pos_idx_,
-                             const c10::optional<at::Tensor>& initial_states_,
-                             const c10::optional<at::Tensor>& final_states_out_,
-                             bool silu_activation);
-
 #ifndef USE_ROCM
 using fptr_t = int64_t;
 fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
@@ -105,4 +82,24 @@ std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
     fptr_t _fa);
 void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
                             const std::vector<std::vector<int64_t>>& offsets);
+std::vector<torch::Tensor> selective_scan_fwd(
+    const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A,
+    const torch::Tensor& B, const torch::Tensor& C,
+    const c10::optional<torch::Tensor>& D_,
+    const c10::optional<torch::Tensor>& z_,
+    const c10::optional<torch::Tensor>& delta_bias_, bool delta_softplus,
+    const c10::optional<torch::Tensor>& index_,
+    const c10::optional<torch::Tensor>& x);
+at::Tensor causal_conv1d_update(const at::Tensor& x,
+                                const at::Tensor& conv_state,
+                                const at::Tensor& weight,
+                                const c10::optional<at::Tensor>& bias_,
+                                bool silu_activation);
+at::Tensor causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
+                             const c10::optional<at::Tensor>& bias_,
+                             const c10::optional<at::Tensor>& seq_idx_,
+                             const c10::optional<at::Tensor>& seq_pos_idx_,
+                             const c10::optional<at::Tensor>& initial_states_,
+                             const c10::optional<at::Tensor>& final_states_out_,
+                             bool silu_activation);
 #endif
diff --git a/kernels/torch_bindings.cpp b/kernels/torch_bindings.cpp
index f2b39e317..d166a4d72 100644
--- a/kernels/torch_bindings.cpp
+++ b/kernels/torch_bindings.cpp
@@ -271,7 +271,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "()");
   ops.impl("dynamic_scaled_int8_quant", torch::kCUDA,
            &dynamic_scaled_int8_quant);
-
+#ifndef USE_ROCM
   // Mamba kernels
   ops.def(
       "selective_scan_fwd(Tensor! u, Tensor! delta,"
@@ -298,6 +298,7 @@
       "Tensor? final_states_out_,"
      "bool silu_activation) -> Tensor");
   ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
+#endif
 }
 
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
diff --git a/patches/amd.patch b/patches/amd.patch
new file mode 100644
index 000000000..b7a9fb0da
--- /dev/null
+++ b/patches/amd.patch
@@ -0,0 +1,17 @@
+diff --git a/clang/lib/Headers/__clang_hip_cmath.h b/clang/lib/Headers/__clang_hip_cmath.h
+index 071c64c7af8d5b..e04fc7824b1771 100644
+--- a/clang/lib/Headers/__clang_hip_cmath.h
++++ b/clang/lib/Headers/__clang_hip_cmath.h
+@@ -397,7 +397,12 @@ template <class _Tp> struct __numeric_type {
+   // No support for long double, use double instead.
+   static double __test(long double);
+ 
+-  typedef decltype(__test(declval<_Tp>())) type;
++  template <typename _U>
++  static auto __test_impl(int) -> decltype(__test(declval<_U>()));
++
++  template <typename _U> static void __test_impl(...);
++
++  typedef decltype(__test_impl<_Tp>(0)) type;
+   static const bool value = !is_same<type, void>::value;
+ };
\ No newline at end of file
diff --git a/requirements-rocm.txt b/requirements-rocm.txt
index 960e4a0c7..24cce0564 100644
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -5,5 +5,5 @@
 awscli
 boto3
 botocore
-ray == 2.10.0
+ray >= 2.10.0
 peft
\ No newline at end of file
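
A note on what `patches/amd.patch` changes, since the hunk is dense. In the stock ROCm header, `__numeric_type<_Tp>::type` is computed as `decltype(__test(declval<_Tp>()))` directly inside a typedef, so any failure of that call — no viable `__test` overload, or an ambiguous one (e.g. a type that converts equally well to more than one arithmetic type) — is a hard compile error at instantiation. The patch moves the expression into the deduced return type of a `__test_impl(int)` template, where the same failure becomes a substitution failure: overload resolution falls back to the variadic `__test_impl(...)`, and `type` degrades to `void`, which the existing `value = !is_same<type, void>::value` member already anticipates. The sketch below reproduces the pattern in isolation; `NumericTypeDemo`, `test`, `test_impl`, and `Ambiguous` are illustrative stand-ins for this note (the real header uses the reserved names `__numeric_type`, `__test`, `__test_impl` and a much larger overload set), not the patched code itself.

```cpp
// Minimal sketch of the SFINAE pattern introduced by patches/amd.patch.
#include <type_traits>
#include <utility>

// A type for which an unqualified test(...) call is ambiguous:
// it converts equally well to float and to double.
struct Ambiguous {
  operator float() const;
  operator double() const;
};

template <class Tp> struct NumericTypeDemo {
  static float test(float);
  static double test(double);

  // Pre-patch shape: decltype(test(std::declval<Tp>())) sits directly in a
  // typedef, so a failed or ambiguous call is a hard error at instantiation.
  //
  // Post-patch shape: the same expression lives in the deduced return type
  // of test_impl(int). There, the failure is a substitution failure in the
  // immediate context, the variadic overload is chosen instead, and `type`
  // becomes void rather than breaking the build.
  template <typename U>
  static auto test_impl(int) -> decltype(test(std::declval<U>()));
  template <typename U> static void test_impl(...);

  typedef decltype(test_impl<Tp>(0)) type;
  static const bool value = !std::is_same<type, void>::value;
};

static_assert(std::is_same<NumericTypeDemo<float>::type, float>::value,
              "a viable overload resolves normally");
static_assert(!NumericTypeDemo<Ambiguous>::value,
              "an ambiguous conversion degrades to void, not an error");

int main() { return 0; }
```

This is the standard detection-idiom trick: an expression that must be probed without hard errors has to sit in the immediate context of template argument deduction, which is exactly what the patched header arranges.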