[model] add support for mixtral moe model #128

Open · wants to merge 14 commits into base: main
Binary file added gongfeng-copilot-vscode-latest (2).vsix
Binary file not shown.
103 changes: 51 additions & 52 deletions src/kernels/CMakeLists.txt
@@ -1,77 +1,76 @@
include(cc_library)

cc_library(
  NAME
    kernels
  HDRS
    reduce_kernel_utils.cuh
    activation_kernels.h
    layernorm_kernels.h
    pos_embedding_kernels.h
    kv_cache_kernels.h
    sampling/sampling_kernels.h
  SRCS
    activation_kernels.cu
    layernorm_kernels.cu
    pos_embedding_kernels.cu
    kv_cache_kernels.cu
    sampling/penalty_kernels.cu
    sampling/softmax_kernels.cu
    sampling/topk_kernels.cu
    sampling/topp_kernels.cu
  DEPS
    glog::glog
    torch
  DEFINES
    __CUDA_NO_HALF_OPERATORS__
)

cc_library(
  NAME
    gptq.kernels
  HDRS
  SRCS
    gptq/gptq_kernel.cu
  DEPS
    torch
)

cc_library(
  NAME
    awq.kernels
  HDRS
  SRCS
    awq/gemm_cuda_gen.cu
  DEPS
    torch
)

cc_library(
  NAME
    exllama.kernels
  SRCS
    exllama/exllama_ext.cpp
    exllama/cuda_buffers.cu
    exllama/cuda_func/column_remap.cu
    exllama/cuda_func/q4_matmul.cu
    exllama/cuda_func/q4_matrix.cu
  DEPS
    torch
  LINKOPTS
    cublas
)

cc_library(
  NAME
    exllamav2.kernels
  SRCS
    exllamav2/ext.cpp
    exllamav2/cuda/q_matrix.cu
    exllamav2/cuda/q_gemm.cu
  DEPS
    torch
)

add_subdirectory(flash_attn)
add_subdirectory(flash_infer)

15 changes: 10 additions & 5 deletions src/kernels/activation_kernels.cu
@@ -63,10 +63,15 @@ struct SiluActivation {
template <template <typename T> class Activation, typename T>
__global__ void activation_kernel(T* __restrict__ out,
const T* __restrict__ input,
                                  int n,        // number of columns of the tensor
                                  int stride) { // number of rows of the tensor
  const uint32_t src_base_idx =
      blockIdx.x * stride;  // TODO: each block handles one row of the tensor,
                            // so shouldn't src_base_idx = blockIdx.x * n?
  const uint32_t dst_base_idx =
      blockIdx.x * n;  // TODO: why is it done this way? It seems complicated.
                       // What is the rationale? Why not simply set
                       // grid = input.size(0), dim = input.size(1), and have
                       // each thread apply the activation in place?
for (uint32_t i = threadIdx.x; i < n; i += blockDim.x) {
const T x = __ldg(&input[src_base_idx + i]);
out[dst_base_idx + i] = Activation<T>::apply(x);
@@ -108,7 +113,7 @@ template <template <typename T> class Activation>
void launch_activation_and_mul(torch::Tensor& out, torch::Tensor input) {
const int n = static_cast<int>(input.size(1)) / 2;
dim3 grid(input.size(0));
  dim3 block(std::min(n, 1024));  // TODO: why 1024?
DISPATCH_FLOATING_TYPES(
input.scalar_type(), "activation_and_mul_kernel", ([&] {
activation_and_mul_kernel<Activation, scalar_t>
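A note on the stride TODOs above: one plausible answer (an inference, not stated in this diff) is that `stride` is the pitch of an input row in memory, which can exceed the logical width `n` — for example when the input is a non-contiguous view, or in the activation-and-mul path where each input row packs 2*n elements (the gate and up halves) while each output row holds only n. A minimal sketch of that second case, with hypothetical names:

// Sketch only (hypothetical kernel, not part of this PR): when the input row
// is [gate | up] with 2*n elements and the output row has n elements, the two
// base offsets cannot both be blockIdx.x * n.
template <typename T>
__global__ void activation_and_mul_sketch(T* __restrict__ out,
                                          const T* __restrict__ in,
                                          int n) {  // output columns
  const int stride = 2 * n;                       // input row pitch
  const uint32_t src_base = blockIdx.x * stride;  // start of this input row
  const uint32_t dst_base = blockIdx.x * n;       // start of this output row
  for (uint32_t i = threadIdx.x; i < n; i += blockDim.x) {
    const T gate = in[src_base + i];    // first half of the row
    const T up = in[src_base + n + i];  // second half of the row
    out[dst_base + i] = gate * up;      // activation itself omitted
  }
}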
57 changes: 57 additions & 0 deletions src/kernels/fused_moe_kernels.cu
@@ -0,0 +1,57 @@
#include <ATen/cuda/CUDAContext.h>
#include <glog/logging.h>
#include <torch/torch.h>

#include <iostream>
#include <unordered_map>

#include "fused_moe_kernels.h"
namespace llm::kernel {

template <typename T>
__global__ void fused_moe_kernel() {}

torch::Tensor apply_fused_moe(torch::Tensor hidden_states,
torch::Tensor w13,
torch::Tensor w2,
torch::Tensor topk_weight,
torch::Tensor topk_ids,
bool inplace) {
// Check Constraints
// match the number of hidden_size
CHECK(hidden_states.sizes()[1] == w13.sizes()[2]);
// match topk shape
CHECK(topk_weight.sizes() == topk_ids.sizes());

auto M = hidden_states.sizes()[0]; // num_tokens
auto E = w13.sizes()[0]; // w13 [n_experts,2*intermediate_size,hidden_size]
auto N = w13.sizes()[1];
// load kernel config(Now we use the default config)
std::unordered_map<std::string, int> configs;
if (M <= E) {
configs["BLOCK_SIZE_M"] = 16;
configs["BLOCK_SIZE_N"] = 32;
configs["BLOCK_SIZE_K"] = 64;
configs["GROUP_SIZE_M"] = 1;
} else {
configs["BLOCK_SIZE_M"] = 64;
configs["BLOCK_SIZE_N"] = 64;
configs["BLOCK_SIZE_K"] = 32;
configs["GROUP_SIZE_M"] = 8;
}
  // Create intermediate caches
  auto options = hidden_states.options();
  auto intermediate_cache1 =
      torch::empty({M, topk_ids.sizes()[1], N}, options);
  auto intermediate_cache2 =
      torch::empty({M, topk_ids.sizes()[1], N / 2}, options);
  auto intermediate_cache3 =
      torch::empty({M, topk_ids.sizes()[1], w2.sizes()[1]}, options);
// moe_align_block_size

return torch::Tensor();
}
} // namespace llm::kernel
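Two notes on this stub. First, the BLOCK_SIZE_*/GROUP_SIZE_M values match the default tiling configs of vLLM's Triton fused-MoE kernel, which this code appears to follow. Second, while the fused kernel itself is still empty, an unfused reference of what apply_fused_moe should compute may help review. The sketch below is my own, uses only standard libtorch calls, and assumes the usual Mixtral semantics: route each token to its top-k experts, apply w13, silu-and-mul, then w2 per expert, and sum the expert outputs weighted by topk_weight.

#include <torch/torch.h>

// Unfused reference sketch (assumes standard Mixtral MoE semantics).
// hidden_states: [M, hidden_size]
// w13:           [n_experts, 2 * intermediate_size, hidden_size]
// w2:            [n_experts, hidden_size, intermediate_size]
// topk_weight / topk_ids: [M, top_k]
torch::Tensor moe_reference(const torch::Tensor& hidden_states,
                            const torch::Tensor& w13,
                            const torch::Tensor& w2,
                            const torch::Tensor& topk_weight,
                            const torch::Tensor& topk_ids) {
  auto out = torch::zeros_like(hidden_states);
  for (int64_t e = 0; e < w13.size(0); ++e) {
    auto hits = (topk_ids == e).nonzero();  // [m, 2]: (token, slot) pairs
    if (hits.size(0) == 0) continue;
    auto token_idx = hits.select(1, 0);
    auto slot_idx = hits.select(1, 1);
    auto x = hidden_states.index_select(0, token_idx);  // [m, hidden]
    auto gate_up = torch::matmul(x, w13[e].t());        // [m, 2 * inter]
    auto halves = gate_up.chunk(2, /*dim=*/1);
    auto act = torch::silu(halves[0]) * halves[1];      // silu-and-mul
    auto y = torch::matmul(act, w2[e].t());             // [m, hidden]
    auto w = topk_weight.index({token_idx, slot_idx}).unsqueeze(1);
    out.index_add_(0, token_idx, (y * w).to(out.scalar_type()));
  }
  return out;
}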
13 changes: 13 additions & 0 deletions src/kernels/fused_moe_kernels.h
@@ -0,0 +1,13 @@
#pragma once

#include <torch/torch.h>

namespace llm::kernel {
// quantization is not supported for now
torch::Tensor apply_fused_moe(torch::Tensor hidden_states,
torch::Tensor w13,
torch::Tensor w2,
torch::Tensor topk_weight,
torch::Tensor topk_ids,
bool inplace);
} // namespace llm::kernel
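A hypothetical call site, to pin down the expected shapes (my reading of the CHECKs in the .cu file; the PR itself does not document them):

// Shapes (inferred, not documented in the PR):
//   hidden_states: [num_tokens, hidden_size]
//   w13:           [n_experts, 2 * intermediate_size, hidden_size]
//   w2:            [n_experts, hidden_size, intermediate_size]
//   topk_weight:   [num_tokens, top_k]  (same shape as topk_ids)
auto output = llm::kernel::apply_fused_moe(
    hidden_states, w13, w2, topk_weight, topk_ids, /*inplace=*/false);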
11 changes: 8 additions & 3 deletions src/kernels/sampling/softmax_kernels.cu
@@ -28,7 +28,8 @@ __global__ void softmax_kernel(T* logits, int vocab_size) {
}

// get max value in the thread block and save it to shared memory
  max_val = block_reduce_max<float>(
      max_val);  // TODO: inside this function the block is subdivided into
                 // warps, which are then reduced with warp-level APIs. Is
                 // there a reference manual for these warp APIs?
if (tid == 0) {
s_max_val = max_val;
}
@@ -63,8 +64,12 @@ void invoke_softmax(torch::Tensor& logits) {

// each thread block handles one batch
dim3 grid(batch_size);
  dim3 block(std::min(
      vocab_size,
      1024));  // TODO: one thread block handles one prompt in the batch, so
               // shouldn't this be set to max(vocab_size, 1024)? Is there any
               // trick to choosing the hyperparameter 1024?
DISPATCH_FLOATING_TYPES(logits.scalar_type(), "softmax_kernel", [&] {
softmax_kernel<scalar_t>
<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
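On the two TODOs in this file: 1024 is the CUDA hardware limit on threads per block, so the launch uses min(vocab_size, 1024) and the loop inside the kernel strides over the rest of the vocabulary; max(vocab_size, 1024) would be an invalid launch for large vocabularies. The warp primitives used by block_reduce_max (__shfl_xor_sync and friends) are documented in the CUDA C++ Programming Guide under "Warp Shuffle Functions". A minimal sketch of such a reduction, for illustration only (the actual implementation lives in reduce_kernel_utils.cuh and may differ):

#include <cfloat>  // FLT_MAX

// Illustrative block-wide max reduction built from warp shuffles.
// Assumes blockDim.x <= 1024, i.e. at most 32 warps per block.
__inline__ __device__ float warp_reduce_max_sketch(float val) {
  for (int offset = 16; offset > 0; offset /= 2) {
    val = fmaxf(val, __shfl_xor_sync(0xffffffff, val, offset));
  }
  return val;  // every lane now holds the warp max
}

__inline__ __device__ float block_reduce_max_sketch(float val) {
  __shared__ float shared[32];        // one slot per warp
  const int lane = threadIdx.x % 32;
  const int warp = threadIdx.x / 32;
  val = warp_reduce_max_sketch(val);  // reduce within each warp
  if (lane == 0) shared[warp] = val;  // warp leaders publish their max
  __syncthreads();
  // the first warp reduces the per-warp maxima
  const int num_warps = (blockDim.x + 31) / 32;
  val = (lane < num_warps) ? shared[lane] : -FLT_MAX;
  if (warp == 0) val = warp_reduce_max_sketch(val);
  return val;  // valid in warp 0
}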
108 changes: 55 additions & 53 deletions src/layers/CMakeLists.txt
@@ -3,76 +3,78 @@ include(cc_test)

cc_library(
  NAME
    linear
  HDRS
    linear.h
    qkv_linear.h
    linear_impl.h
  SRCS
    linear.cpp
    qkv_linear.cpp
    linear_impl.cpp
  DEPS
    :state_dict
    :model_parallel
    :quantization
    :kernels
    glog::glog
    gflags::gflags
    torch
)

cc_library(
  NAME
    pos_embedding
  HDRS
    pos_embedding.h
  SRCS
    pos_embedding.cpp
  DEPS
    :state_dict
    :memory
    :kernels
    glog::glog
    gflags::gflags
    torch
)

cc_library(
  NAME
    layers
  HDRS
    normalization.h
    embedding.h
    activation.h
    fused_moe.h
  SRCS
    activation.cpp
    fused_moe.cpp
  DEPS
    :state_dict
    :memory
    :linear
    :pos_embedding
    :attention
    :kernels
    :flash_attn.kernels
    glog::glog
    gflags::gflags
    torch
)

cc_test(
  NAME
    layers_test
  SRCS
    activation_test.cpp
    layers_test.cpp
    pos_embedding_test.cpp
    normalization_test.cpp
  DEPS
    :layers
    :state_dict
    absl::random_random
    GTest::gtest_main
)

add_subdirectory(attention)