[model] add support for mixtral moe model #128

Open · wants to merge 14 commits into base: main
Binary file added gongfeng-copilot-vscode-latest (2).vsix
Binary file not shown.
103 changes: 51 additions & 52 deletions src/kernels/CMakeLists.txt
@@ -1,77 +1,76 @@
include(cc_library)

cc_library(
  NAME
    kernels
  HDRS
    reduce_kernel_utils.cuh
    activation_kernels.h
    layernorm_kernels.h
    pos_embedding_kernels.h
    kv_cache_kernels.h
    sampling/sampling_kernels.h
  SRCS
    activation_kernels.cu
    layernorm_kernels.cu
    pos_embedding_kernels.cu
    kv_cache_kernels.cu
    sampling/penalty_kernels.cu
    sampling/softmax_kernels.cu
    sampling/topk_kernels.cu
    sampling/topp_kernels.cu
  DEPS
    glog::glog
    torch
  DEFINES
    __CUDA_NO_HALF_OPERATORS__
)

cc_library(
  NAME
    gptq.kernels
  HDRS
  SRCS
    gptq/gptq_kernel.cu
  DEPS
    torch
)

cc_library(
  NAME
    awq.kernels
  HDRS
  SRCS
    awq/gemm_cuda_gen.cu
  DEPS
    torch
)

cc_library(
  NAME
    exllama.kernels
  SRCS
    exllama/exllama_ext.cpp
    exllama/cuda_buffers.cu
    exllama/cuda_func/column_remap.cu
    exllama/cuda_func/q4_matmul.cu
    exllama/cuda_func/q4_matrix.cu
  DEPS
    torch
  LINKOPTS
    cublas
)

cc_library(
  NAME
    exllamav2.kernels
  SRCS
    exllamav2/ext.cpp
    exllamav2/cuda/q_matrix.cu
    exllamav2/cuda/q_gemm.cu
  DEPS
    torch
)

add_subdirectory(flash_attn)
add_subdirectory(flash_infer)

15 changes: 10 additions & 5 deletions src/kernels/activation_kernels.cu
@@ -63,10 +63,15 @@ struct SiluActivation {
template <template <typename T> class Activation, typename T>
__global__ void activation_kernel(T* __restrict__ out,
const T* __restrict__ input,
                                  int n,        // number of columns of the tensor
                                  int stride) { // number of rows of the tensor
  const uint32_t src_base_idx =
      blockIdx.x * stride;  // TODO: each block handles one row of the tensor,
                            // so shouldn't src_base_idx = blockIdx.x * n?
  const uint32_t dst_base_idx =
      blockIdx.x * n;  // TODO: why is it done this way? It seems complicated.
                       // What is the rationale? Why not simply set
                       // grid = input.size(0), dim = input.size(1), and have
                       // each thread apply the activation in place?
for (uint32_t i = threadIdx.x; i < n; i += blockDim.x) {
const T x = __ldg(&input[src_base_idx + i]);
out[dst_base_idx + i] = Activation<T>::apply(x);
@@ -108,7 +113,7 @@ template <template <typename T> class Activation>
void launch_activation_and_mul(torch::Tensor& out, torch::Tensor input) {
const int n = static_cast<int>(input.size(1)) / 2;
dim3 grid(input.size(0));
  dim3 block(std::min(n, 1024));  // TODO: why 1024?
DISPATCH_FLOATING_TYPES(
input.scalar_type(), "activation_and_mul_kernel", ([&] {
activation_and_mul_kernel<Activation, scalar_t>
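A note on the stride TODOs above: one plausible answer (an inference, not stated in this diff) is that `stride` is the pitch of an input row in memory, which can exceed the logical width `n` — for example when the input is a non-contiguous view, or in the activation-and-mul path where each input row packs 2*n elements (the gate and up halves) while each output row holds only n. A minimal sketch of that second case, with hypothetical names:

// Sketch only (hypothetical kernel, not part of this PR): when the input row
// is [gate | up] with 2*n elements and the output row has n elements, the two
// base offsets cannot both be blockIdx.x * n.
template <typename T>
__global__ void activation_and_mul_sketch(T* __restrict__ out,
                                          const T* __restrict__ in,
                                          int n) {  // output columns
  const int stride = 2 * n;                       // input row pitch
  const uint32_t src_base = blockIdx.x * stride;  // start of this input row
  const uint32_t dst_base = blockIdx.x * n;       // start of this output row
  for (uint32_t i = threadIdx.x; i < n; i += blockDim.x) {
    const T gate = in[src_base + i];    // first half of the row
    const T up = in[src_base + n + i];  // second half of the row
    out[dst_base + i] = gate * up;      // activation itself omitted
  }
}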
57 changes: 57 additions & 0 deletions src/kernels/fused_moe_kernels.cu
@@ -0,0 +1,57 @@
#include <ATen/cuda/CUDAContext.h>
#include <glog/logging.h>
#include <torch/torch.h>

#include <iostream>
#include <unordered_map>

#include "fused_moe_kernels.h"
namespace llm::kernel {

template <typename T>
__global__ void fused_moe_kernel() {}

torch::Tensor apply_fused_moe(torch::Tensor hidden_states,
torch::Tensor w13,
torch::Tensor w2,
torch::Tensor topk_weight,
torch::Tensor topk_ids,
bool inplace) {
// Check Constraints
// match the number of hidden_size
CHECK(hidden_states.sizes()[1] == w13.sizes()[2]);
// match topk shape
CHECK(topk_weight.sizes() == topk_ids.sizes());

auto M = hidden_states.sizes()[0]; // num_tokens
auto E = w13.sizes()[0]; // w13 [n_experts,2*intermediate_size,hidden_size]
auto N = w13.sizes()[1];
// load kernel config(Now we use the default config)
std::unordered_map<std::string, int> configs;
if (M <= E) {
configs["BLOCK_SIZE_M"] = 16;
configs["BLOCK_SIZE_N"] = 32;
configs["BLOCK_SIZE_K"] = 64;
configs["GROUP_SIZE_M"] = 1;
} else {
configs["BLOCK_SIZE_M"] = 64;
configs["BLOCK_SIZE_N"] = 64;
configs["BLOCK_SIZE_K"] = 32;
configs["GROUP_SIZE_M"] = 8;
}
  // Create intermediate caches
  auto options = hidden_states.options();
  auto intermediate_cache1 =
      torch::empty({M, topk_ids.sizes()[1], N}, options);
  auto intermediate_cache2 =
      torch::empty({M, topk_ids.sizes()[1], N / 2}, options);
  auto intermediate_cache3 =
      torch::empty({M, topk_ids.sizes()[1], w2.sizes()[1]}, options);
// moe_align_block_size

return torch::Tensor();
}
} // namespace llm::kernel
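Two notes on this stub. First, the BLOCK_SIZE_*/GROUP_SIZE_M values match the default tiling configs of vLLM's Triton fused-MoE kernel, which this code appears to follow. Second, while the fused kernel itself is still empty, an unfused reference of what apply_fused_moe should compute may help review. The sketch below is my own, uses only standard libtorch calls, and assumes the usual Mixtral semantics: route each token to its top-k experts, apply w13, silu-and-mul, then w2 per expert, and sum the expert outputs weighted by topk_weight.

#include <torch/torch.h>

// Unfused reference sketch (assumes standard Mixtral MoE semantics).
// hidden_states: [M, hidden_size]
// w13:           [n_experts, 2 * intermediate_size, hidden_size]
// w2:            [n_experts, hidden_size, intermediate_size]
// topk_weight / topk_ids: [M, top_k]
torch::Tensor moe_reference(const torch::Tensor& hidden_states,
                            const torch::Tensor& w13,
                            const torch::Tensor& w2,
                            const torch::Tensor& topk_weight,
                            const torch::Tensor& topk_ids) {
  auto out = torch::zeros_like(hidden_states);
  for (int64_t e = 0; e < w13.size(0); ++e) {
    auto hits = (topk_ids == e).nonzero();  // [m, 2]: (token, slot) pairs
    if (hits.size(0) == 0) continue;
    auto token_idx = hits.select(1, 0);
    auto slot_idx = hits.select(1, 1);
    auto x = hidden_states.index_select(0, token_idx);  // [m, hidden]
    auto gate_up = torch::matmul(x, w13[e].t());        // [m, 2 * inter]
    auto halves = gate_up.chunk(2, /*dim=*/1);
    auto act = torch::silu(halves[0]) * halves[1];      // silu-and-mul
    auto y = torch::matmul(act, w2[e].t());             // [m, hidden]
    auto w = topk_weight.index({token_idx, slot_idx}).unsqueeze(1);
    out.index_add_(0, token_idx, (y * w).to(out.scalar_type()));
  }
  return out;
}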
13 changes: 13 additions & 0 deletions src/kernels/fused_moe_kernels.h
@@ -0,0 +1,13 @@
#pragma once

#include <torch/torch.h>

namespace llm::kernel {
// quantization is not supported for now
torch::Tensor apply_fused_moe(torch::Tensor hidden_states,
torch::Tensor w13,
torch::Tensor w2,
torch::Tensor topk_weight,
torch::Tensor topk_ids,
bool inplace);
} // namespace llm::kernel
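A hypothetical call site, to pin down the expected shapes (my reading of the CHECKs in the .cu file; the PR itself does not document them):

// Shapes (inferred, not documented in the PR):
//   hidden_states: [num_tokens, hidden_size]
//   w13:           [n_experts, 2 * intermediate_size, hidden_size]
//   w2:            [n_experts, hidden_size, intermediate_size]
//   topk_weight:   [num_tokens, top_k]  (same shape as topk_ids)
auto output = llm::kernel::apply_fused_moe(
    hidden_states, w13, w2, topk_weight, topk_ids, /*inplace=*/false);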
11 changes: 8 additions & 3 deletions src/kernels/sampling/softmax_kernels.cu
@@ -28,7 +28,8 @@ __global__ void softmax_kernel(T* logits, int vocab_size) {
}

// get max value in the thread block and save it to shared memory
  max_val = block_reduce_max<float>(
      max_val);  // TODO: inside this function the block is subdivided into
                 // warps, which are then reduced with warp-level APIs. Is
                 // there a reference manual for these warp APIs?
if (tid == 0) {
s_max_val = max_val;
}
@@ -63,8 +64,12 @@ void invoke_softmax(torch::Tensor& logits) {

// each thread block handles one batch
dim3 grid(batch_size);
  dim3 block(std::min(
      vocab_size,
      1024));  // TODO: one thread block handles one prompt in the batch, so
               // shouldn't this be set to max(vocab_size, 1024)? Is there any
               // trick to choosing the hyperparameter 1024?
DISPATCH_FLOATING_TYPES(logits.scalar_type(), "softmax_kernel", [&] {
softmax_kernel<scalar_t>
<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
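On the two TODOs in this file: 1024 is the CUDA hardware limit on threads per block, so the launch uses min(vocab_size, 1024) and the loop inside the kernel strides over the rest of the vocabulary; max(vocab_size, 1024) would be an invalid launch for large vocabularies. The warp primitives used by block_reduce_max (__shfl_xor_sync and friends) are documented in the CUDA C++ Programming Guide under "Warp Shuffle Functions". A minimal sketch of such a reduction, for illustration only (the actual implementation lives in reduce_kernel_utils.cuh and may differ):

#include <cfloat>  // FLT_MAX

// Illustrative block-wide max reduction built from warp shuffles.
// Assumes blockDim.x <= 1024, i.e. at most 32 warps per block.
__inline__ __device__ float warp_reduce_max_sketch(float val) {
  for (int offset = 16; offset > 0; offset /= 2) {
    val = fmaxf(val, __shfl_xor_sync(0xffffffff, val, offset));
  }
  return val;  // every lane now holds the warp max
}

__inline__ __device__ float block_reduce_max_sketch(float val) {
  __shared__ float shared[32];        // one slot per warp
  const int lane = threadIdx.x % 32;
  const int warp = threadIdx.x / 32;
  val = warp_reduce_max_sketch(val);  // reduce within each warp
  if (lane == 0) shared[warp] = val;  // warp leaders publish their max
  __syncthreads();
  // the first warp reduces the per-warp maxima
  const int num_warps = (blockDim.x + 31) / 32;
  val = (lane < num_warps) ? shared[lane] : -FLT_MAX;
  if (warp == 0) val = warp_reduce_max_sketch(val);
  return val;  // valid in warp 0
}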
108 changes: 55 additions & 53 deletions src/layers/CMakeLists.txt
@@ -3,76 +3,78 @@ include(cc_test)

cc_library(
  NAME
    linear
  HDRS
    linear.h
    qkv_linear.h
    linear_impl.h
  SRCS
    linear.cpp
    qkv_linear.cpp
    linear_impl.cpp
  DEPS
    :state_dict
    :model_parallel
    :quantization
    :kernels
    glog::glog
    gflags::gflags
    torch
)

cc_library(
  NAME
    pos_embedding
  HDRS
    pos_embedding.h
  SRCS
    pos_embedding.cpp
  DEPS
    :state_dict
    :memory
    :kernels
    glog::glog
    gflags::gflags
    torch
)

cc_library(
  NAME
    layers
  HDRS
    normalization.h
    embedding.h
    activation.h
    fused_moe.h
  SRCS
    activation.cpp
    fused_moe.cpp
  DEPS
    :state_dict
    :memory
    :linear
    :pos_embedding
    :attention
    :kernels
    :flash_attn.kernels
    glog::glog
    gflags::gflags
    torch
)

cc_test(
  NAME
    layers_test
  SRCS
    activation_test.cpp
    layers_test.cpp
    pos_embedding_test.cpp
    normalization_test.cpp
  DEPS
    :layers
    :state_dict
    absl::random_random
    GTest::gtest_main
)

add_subdirectory(attention)