diff --git a/gongfeng-copilot-vscode-latest (2).vsix b/gongfeng-copilot-vscode-latest (2).vsix
new file mode 100644
index 00000000..35217faa
Binary files /dev/null and b/gongfeng-copilot-vscode-latest (2).vsix differ
diff --git a/src/kernels/CMakeLists.txt b/src/kernels/CMakeLists.txt
index 397123cb..cffd6bd6 100644
--- a/src/kernels/CMakeLists.txt
+++ b/src/kernels/CMakeLists.txt
@@ -1,77 +1,76 @@
 include(cc_library)
 
 cc_library(
-  NAME 
-    kernels
-  HDRS 
-    reduce_kernel_utils.cuh
-    activation_kernels.h
-    layernorm_kernels.h
-    pos_embedding_kernels.h
-    kv_cache_kernels.h
-    sampling/sampling_kernels.h
-  SRCS 
-    activation_kernels.cu
-    layernorm_kernels.cu
-    pos_embedding_kernels.cu
-    kv_cache_kernels.cu
-    sampling/penalty_kernels.cu
-    sampling/softmax_kernels.cu
-    sampling/topk_kernels.cu
-    sampling/topp_kernels.cu
+  NAME
+  kernels
+  HDRS
+  reduce_kernel_utils.cuh
+  activation_kernels.h
+  layernorm_kernels.h
+  pos_embedding_kernels.h
+  kv_cache_kernels.h
+  sampling/sampling_kernels.h
+  SRCS
+  activation_kernels.cu
+  layernorm_kernels.cu
+  pos_embedding_kernels.cu
+  kv_cache_kernels.cu
+  sampling/penalty_kernels.cu
+  sampling/softmax_kernels.cu
+  sampling/topk_kernels.cu
+  sampling/topp_kernels.cu
   DEPS
-    glog::glog
-    torch
+  glog::glog
+  torch
   DEFINES
-    __CUDA_NO_HALF_OPERATORS__
+  __CUDA_NO_HALF_OPERATORS__
 )
 
 cc_library(
-  NAME 
-    gptq.kernels
-  HDRS 
-  SRCS 
-    gptq/gptq_kernel.cu
+  NAME
+  gptq.kernels
+  HDRS
+  SRCS
+  gptq/gptq_kernel.cu
   DEPS
-    torch
+  torch
 )
 
 cc_library(
-  NAME 
-    awq.kernels
-  HDRS 
-  SRCS 
-    awq/gemm_cuda_gen.cu
+  NAME
+  awq.kernels
+  HDRS
+  SRCS
+  awq/gemm_cuda_gen.cu
   DEPS
-    torch
+  torch
 )
 
 cc_library(
-  NAME 
-    exllama.kernels
-  SRCS 
-    exllama/exllama_ext.cpp
-    exllama/cuda_buffers.cu
-    exllama/cuda_func/column_remap.cu
-    exllama/cuda_func/q4_matmul.cu
-    exllama/cuda_func/q4_matrix.cu
+  NAME
+  exllama.kernels
+  SRCS
+  exllama/exllama_ext.cpp
+  exllama/cuda_buffers.cu
+  exllama/cuda_func/column_remap.cu
+  exllama/cuda_func/q4_matmul.cu
+  exllama/cuda_func/q4_matrix.cu
   DEPS
-    torch
-  LINKOPTS 
-    cublas
+  torch
+  LINKOPTS
+  cublas
 )
 
 cc_library(
-  NAME 
-    exllamav2.kernels
-  SRCS 
-    exllamav2/ext.cpp
-    exllamav2/cuda/q_matrix.cu
-    exllamav2/cuda/q_gemm.cu
+  NAME
+  exllamav2.kernels
+  SRCS
+  exllamav2/ext.cpp
+  exllamav2/cuda/q_matrix.cu
+  exllamav2/cuda/q_gemm.cu
   DEPS
-    torch
+  torch
 )
 
 add_subdirectory(flash_attn)
 add_subdirectory(flash_infer)
-
diff --git a/src/kernels/activation_kernels.cu b/src/kernels/activation_kernels.cu
index cf274b31..803af67f 100644
--- a/src/kernels/activation_kernels.cu
+++ b/src/kernels/activation_kernels.cu
@@ -63,10 +63,15 @@ struct SiluActivation {
 template <template <typename T> class Activation, typename T>
 __global__ void activation_kernel(T* __restrict__ out,
                                   const T* __restrict__ input,
-                                  int n,
-                                  int stride) {
-  const uint32_t src_base_idx = blockIdx.x * stride;
-  const uint32_t dst_base_idx = blockIdx.x * n;
+                                  int n,         // tensor的列数
+                                  int stride) {  // tensor的行数
+  const uint32_t src_base_idx =
+      blockIdx.x *
+      stride;  // TODO:
+               // 一个block处理张量的一行,所以应该src_base_idx=blockIdx.x*n吧？
+  const uint32_t dst_base_idx =
+      blockIdx.x *
+      n;  // TODO:为什么做？感觉很复杂,出于什么考虑，直接grid=input.size(0),dim=input.size(1),然后每个线程就地activation
   for (uint32_t i = threadIdx.x; i < n; i += blockDim.x) {
     const T x = __ldg(&input[src_base_idx + i]);
     out[dst_base_idx + i] = Activation<T>::apply(x);
@@ -108,7 +113,7 @@ template <template <typename T> class Activation>
 void launch_activation_and_mul(torch::Tensor& out, torch::Tensor input) {
   const int n = static_cast<int>(input.size(1)) / 2;
   dim3 grid(input.size(0));
-  dim3 block(std::min(n, 1024));
+  dim3 block(std::min(n, 1024));  // TODO:why 1024?
   DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "activation_and_mul_kernel", ([&] {
         activation_and_mul_kernel<Activation, scalar_t>
diff --git a/src/kernels/fused_moe_kernels.cu b/src/kernels/fused_moe_kernels.cu
new file mode 100644
index 00000000..2aa249c8
--- /dev/null
+++ b/src/kernels/fused_moe_kernels.cu
@@ -0,0 +1,57 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <glog/logging.h>
+#include <torch/torch.h>
+
+#include <iostream>
+#include <unordered_map>
+
+#include "fused_moe_kernels.h"
+namespace llm::kernel {
+
+template <typename T>  // T is not used yet
+__global__ void fused_moe_kernel() {}  // placeholder kernel with an empty body
+
+torch::Tensor apply_fused_moe(torch::Tensor hidden_states,
+                              torch::Tensor w13,
+                              torch::Tensor w2,
+                              torch::Tensor topk_weight,
+                              torch::Tensor topk_ids,
+                              bool inplace) {
+  // Shape contract: hidden_states [num_tokens, hidden_size],
+  // w13 [n_experts, 2 * intermediate_size, hidden_size], w2 rank 3,
+  // topk_weight/topk_ids [num_tokens, topk]. Validate ranks before indexing.
+  CHECK_EQ(hidden_states.dim(), 2);
+  CHECK_EQ(w13.dim(), 3);
+  CHECK_EQ(w2.dim(), 3);
+  CHECK_EQ(topk_ids.dim(), 2);
+  CHECK_EQ(hidden_states.size(1), w13.size(2)) << "hidden_size mismatch";
+  CHECK(topk_weight.sizes() == topk_ids.sizes()) << "topk shape mismatch";
+
+  const int64_t M = hidden_states.size(0);  // num_tokens
+  const int64_t E = w13.size(0);            // n_experts
+  const int64_t N = w13.size(1);            // 2 * intermediate_size
+  const int64_t topk = topk_ids.size(1);
+  // load kernel config (for now only the built-in defaults are used)
+  std::unordered_map<std::string, int> configs;
+  if (M <= E) {
+    configs["BLOCK_SIZE_M"] = 16;
+    configs["BLOCK_SIZE_N"] = 32;
+    configs["BLOCK_SIZE_K"] = 64;
+    configs["GROUP_SIZE_M"] = 1;
+  } else {
+    configs["BLOCK_SIZE_M"] = 64;
+    configs["BLOCK_SIZE_N"] = 64;
+    configs["BLOCK_SIZE_K"] = 32;
+    configs["GROUP_SIZE_M"] = 8;
+  }
+  // Scratch buffers. The shape must be a braced list: `(a, b, c)` is the comma
+  // operator and evaluates to `c` alone, not to a three-element shape.
+  const auto options = hidden_states.options();  // same dtype and device
+  auto intermediate_cache1 = torch::empty({M, topk, N}, options);
+  auto intermediate_cache2 = torch::empty({M, topk, N / 2}, options);
+  auto intermediate_cache3 = torch::empty({M, topk, w2.size(1)}, options);
+  // TODO: moe_align_block_size and the kernel launch are not implemented yet.
+
+  return torch::Tensor();
+}
+}  // namespace llm::kernel
\ No newline at end of file
diff --git a/src/kernels/fused_moe_kernels.h b/src/kernels/fused_moe_kernels.h
new file mode 100644
index 00000000..df34414d
--- /dev/null
+++ b/src/kernels/fused_moe_kernels.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <torch/torch.h>
+
+namespace llm::kernel {
+// Entry point of the fused MoE op. Quantized weights are not handled yet.
+torch::Tensor apply_fused_moe(torch::Tensor hidden_states,
+                              torch::Tensor w13,  // stacked w1/w3 weights
+                              torch::Tensor w2,
+                              torch::Tensor topk_weight,
+                              torch::Tensor topk_ids,  // same shape as topk_weight
+                              bool inplace);
+}  // namespace llm::kernel
diff --git a/src/kernels/sampling/softmax_kernels.cu b/src/kernels/sampling/softmax_kernels.cu
index 74a784bf..7d48e9c5 100644
--- a/src/kernels/sampling/softmax_kernels.cu
+++ b/src/kernels/sampling/softmax_kernels.cu
@@ -28,7 +28,8 @@ __global__ void softmax_kernel(T* logits, int vocab_size) {
   }
 
   // get max value in the thread block and save it to shared memory
-  max_val = block_reduce_max<float>(max_val);
+  max_val = block_reduce_max<float>(
+      max_val);  // TODO:这个function里面就是把block又划分为warp,然后使用warp的相关的api,这个warp相关的api是有什么手册吗？
   if (tid == 0) {
     s_max_val = max_val;
   }
@@ -63,8 +64,12 @@ void invoke_softmax(torch::Tensor& logits) {
 
   // each thread block handles one batch
   dim3 grid(batch_size);
-  dim3 block(std::min(vocab_size, 1024));
-
+  dim3 block(std::min(
+      vocab_size,
+      1024));  // TODO:
+               // 一个线程处理batch中的一个prompt,那么不应该设置为max(vocab_size,1024)?
+               // 1024这个超参数设定有什么技巧吗？
+  //
   DISPATCH_FLOATING_TYPES(logits.scalar_type(), "softmax_kernel", [&] {
     softmax_kernel<scalar_t>
         <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
diff --git a/src/layers/CMakeLists.txt b/src/layers/CMakeLists.txt
index 682b7d77..78d8836a 100644
--- a/src/layers/CMakeLists.txt
+++ b/src/layers/CMakeLists.txt
@@ -3,76 +3,78 @@ include(cc_test)
 
 cc_library(
   NAME
-    linear 
+  linear
   HDRS
-    linear.h
-    qkv_linear.h
-    linear_impl.h
+  linear.h
+  qkv_linear.h
+  linear_impl.h
   SRCS
-    linear.cpp
-    qkv_linear.cpp
-    linear_impl.cpp
+  linear.cpp
+  qkv_linear.cpp
+  linear_impl.cpp
   DEPS
-    :state_dict
-    :model_parallel
-    :quantization
-    :kernels
-    glog::glog
-    gflags::gflags
-    torch
+  :state_dict
+  :model_parallel
+  :quantization
+  :kernels
+  glog::glog
+  gflags::gflags
+  torch
 )
 
 cc_library(
-  NAME 
-    pos_embedding
-  HDRS 
-    pos_embedding.h
-  SRCS 
-    pos_embedding.cpp
+  NAME
+  pos_embedding
+  HDRS
+  pos_embedding.h
+  SRCS
+  pos_embedding.cpp
   DEPS
-    :state_dict
-    :memory
-    :kernels
-    glog::glog
-    gflags::gflags
-    torch
+  :state_dict
+  :memory
+  :kernels
+  glog::glog
+  gflags::gflags
+  torch
 )
 
 cc_library(
-  NAME 
-    layers
-  HDRS 
-    normalization.h
-    embedding.h
-    activation.h
-  SRCS 
-    activation.cpp
+  NAME
+  layers
+  HDRS
+  normalization.h
+  embedding.h
+  activation.h
+  fused_moe.h
+  SRCS
+  activation.cpp
+  fused_moe.cpp
   DEPS
-    :state_dict
-    :memory
-    :linear
-    :pos_embedding
-    :attention
-    :kernels
-    :flash_attn.kernels
-    glog::glog
-    gflags::gflags
-    torch
+  :state_dict
+  :memory
+  :linear
+  :pos_embedding
+  :attention
+  :kernels
+  :flash_attn.kernels
+  glog::glog
+  gflags::gflags
+  torch
 )
 
 cc_test(
   NAME
-    layers_test
+  layers_test
   SRCS
-    activation_test.cpp
-    layers_test.cpp
-    pos_embedding_test.cpp
-    normalization_test.cpp
+  activation_test.cpp
+  layers_test.cpp
+  pos_embedding_test.cpp
+  normalization_test.cpp
   DEPS
-    :layers
-    :state_dict
-    absl::random_random
-    GTest::gtest_main
+  :layers
+  :state_dict
+  absl::random_random
+  GTest::gtest_main
 )
 
 add_subdirectory(attention)
\ No newline at end of file
diff --git a/src/layers/fused_moe.cpp b/src/layers/fused_moe.cpp
new file mode 100644
index 00000000..3a8f4aaa
--- /dev/null
+++ b/src/layers/fused_moe.cpp
@@ -0,0 +1,93 @@
+#include "fused_moe.h"
+
+#include <torch/torch.h>
+
+#include "kernels/fused_moe_kernels.h"
+#include "models/model_args.h"
+#include "models/parameters.h"
+namespace llm {
+FusedMoeLayerImpl::FusedMoeLayerImpl(bool renormalize,
+                                     bool inplace,
+                                     const ModelArgs& args,
+                                     const QuantArgs& quant_args,
+                                     const ParallelArgs& parallel_args,
+                                     const torch::TensorOptions& options)
+    : parallel_args_(parallel_args),
+      renormalize_(renormalize),
+      inplace_(inplace) {
+  topk_ = args.n_experts_per_tok();
+  num_total_experts_ = args.n_local_experts();
+  intermediate_size_ = args.intermediate_size();
+  hidden_size_ = args.hidden_size();
+  // Parameter names must be unique; a second "weight" makes libtorch throw.
+  w13_ = register_parameter(
+      "w13",
+      torch::empty({num_total_experts_, 2 * intermediate_size_, hidden_size_},
+                   options),
+      /*requires_grad=*/false);
+  w2_ = register_parameter(
+      "w2",
+      torch::empty({num_total_experts_, hidden_size_, intermediate_size_},
+                   options),
+      /*requires_grad=*/false);
+}
+
+torch::Tensor FusedMoeLayerImpl::forward(
+    torch::Tensor hidden_states,  // [n_tokens, hidden_size]
+    torch::Tensor gating_output   // [n_tokens, n_experts]
+) {
+  // Step 1 - routing: softmax over the expert logits (computed in float32),
+  // then keep the topk_ largest weights per token.
+  DCHECK_EQ(hidden_states.sizes()[0], gating_output.sizes()[0]);
+  const auto probs = gating_output.softmax(/*dim=*/-1, torch::kFloat32);
+  auto selected = probs.topk(topk_, /*dim=*/-1);
+  auto expert_weights = std::get<0>(selected);  // [n_tokens, topk]
+  auto expert_ids = std::get<1>(selected);      // [n_tokens, topk]
+  if (renormalize_) {
+    expert_weights = expert_weights / expert_weights.sum(-1, true);
+  }
+
+  // Step 2 - experts: the kernel entry point is handed the activations and
+  // both stacked weight tensors, all of which are expected to be contiguous.
+  DCHECK_EQ(hidden_states.is_contiguous(), true);
+  DCHECK_EQ(w13_.is_contiguous(), true);
+  DCHECK_EQ(w2_.is_contiguous(), true);
+  return kernel::apply_fused_moe(
+      hidden_states, w13_, w2_, expert_weights, expert_ids, inplace_);
+}
+
+void FusedMoeLayerImpl::load_state_dict(const StateDict& state_dict) {
+  // Expected key prefix: model.layers.<n>.block_sparse_moe.experts.
+  // Weights missing from this state_dict (e.g. stored in another checkpoint
+  // file) are skipped instead of being copied from an undefined tensor.
+  // TODO: track loading per expert; the flags only record "seen at least once".
+  const auto rank = parallel_args_.rank();
+  const auto world_size = parallel_args_.world_size();
+  const int64_t shard_size = w13_.size(1) / 2;  // rows of w1 (and w3) in w13_
+  // Loads one expert tensor sharded along `dim`; returns false if it is absent.
+  const auto load = [&](int i, const char* key, int dim, torch::Tensor dst) {
+    const auto src = state_dict.select(std::to_string(i) + key)
+                         .get_sharded_tensor("weight", dim, rank, world_size);
+    if (!src.defined()) {
+      return false;
+    }
+    CHECK_EQ(dst.sizes(), src.sizes()) << "size mismatch for " << i << key;
+    dst.copy_(src);
+    return true;
+  };
+  for (int i = 0; i < num_total_experts_; i++) {
+    // w13_[i] / w2_[i] select exactly expert i (slice(0, i) would span i..end)
+    auto w13 = w13_[i];
+    if (load(i, ".w2.", /*dim=*/1, w2_[i])) {
+      is_loaded_w2_ = true;
+    }
+    if (load(i, ".w1.", /*dim=*/0, w13.slice(0, 0, shard_size))) {
+      is_loaded_w13_ = true;
+    }
+    if (load(i, ".w3.", /*dim=*/0, w13.slice(0, shard_size, 2 * shard_size))) {
+      is_loaded_w13_ = true;
+    }
+  }
+}
+
+}  // namespace llm
\ No newline at end of file
diff --git a/src/layers/fused_moe.h b/src/layers/fused_moe.h
new file mode 100644
index 00000000..acc20d19
--- /dev/null
+++ b/src/layers/fused_moe.h
@@ -0,0 +1,52 @@
+#pragma once
+#include <torch/torch.h>
+
+#include "model_loader/state_dict.h"
+#include "model_parallel/parallel_args.h"
+#include "models/model_args.h"
+#include "quantization/quant_args.h"
+namespace llm {
+
+// ======= A Mixture of Experts (MoE) layer using two sets of weights, w1 and
+// w2, and a top-k gating mechanism ======== Main parameters are below:
+// - w1 (torch.Tensor): The first set of expert weights.
+//       [n_expert,2*intermediate_size,hidden_size]
+// - w2 (torch.Tensor): The second set of expert weights.
+//       [n_expert,hidden_size,intermediate_size]
+// - topk (int): The number of top-k experts to select.
+// - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
+// - inplace (bool): If True, perform the operation in-place.
+class FusedMoeLayerImpl : public torch::nn::Module {
+ public:
+  FusedMoeLayerImpl(bool renormalize,
+                    bool inplace,
+                    const ModelArgs& args,
+                    const QuantArgs& quant_args,
+                    const ParallelArgs& parallel_args,
+                    const torch::TensorOptions& options);
+  torch::Tensor forward(torch::Tensor hidden_states, torch::Tensor gating_out);
+  void load_state_dict(const StateDict& state_dict);
+  void verify_loaded_weights(const std::string& prefix = "") const {
+    CHECK(is_loaded_w13_) << "weight is not loaded for " << prefix + "weight13";
+    CHECK(is_loaded_w2_) << "weight is not loaded for " << prefix + "weight2";
+  }
+
+ private:
+  ParallelArgs parallel_args_;
+  // routing configuration; sizes are zero until the constructor fills them in
+  int topk_ = 0;
+  int num_total_experts_ = 0;
+  int intermediate_size_ = 0;
+  int hidden_size_ = 0;
+  bool renormalize_ = false;  // rescale the kept top-k weights to sum to 1
+  bool inplace_ = false;      // forwarded unchanged to the kernel entry point
+  // expert weights, registered as parameters by the constructor
+  torch::Tensor w13_{nullptr};  // [n_experts, 2 * intermediate_size, hidden_size]
+  torch::Tensor w2_{nullptr};   // [n_experts, hidden_size, intermediate_size]
+
+  // set by load_state_dict, checked by verify_loaded_weights
+  bool is_loaded_w13_ = false;
+  bool is_loaded_w2_ = false;
+};
+TORCH_MODULE(FusedMoeLayer);
+}  // namespace llm
\ No newline at end of file
diff --git a/src/layers/linear_impl.cpp b/src/layers/linear_impl.cpp
index ce273d6b..5eea1e91 100644
--- a/src/layers/linear_impl.cpp
+++ b/src/layers/linear_impl.cpp
@@ -257,4 +257,44 @@ void RowParallelLinearImpl::load_state_dict(const StateDict& state_dict) {
   }
 }
 
+ReplicatedLinearImpl::ReplicatedLinearImpl(int64_t in_features,
+                                           int64_t out_features,
+                                           bool bias,
+                                           bool skip_bias_add,
+                                           const QuantArgs& quant_args,
+                                           const torch::TensorOptions& options)
+    : skip_bias_add(skip_bias_add) {
+  auto weight = torch::empty({out_features, in_features}, options);
+  weight_ = register_parameter("weight", weight);  // [out, in]
+  if (bias) {  // the bias parameter only exists when requested
+    bias_ = register_parameter("bias", torch::empty({out_features}, options));
+  }
+}
+
+torch::Tensor ReplicatedLinearImpl::forward(torch::Tensor input,
+                                            torch::Tensor& output_bias) {
+  namespace F = torch::nn::functional;
+  // with skip_bias_add the bias is handed back to the caller, not applied
+  output_bias = skip_bias_add ? bias_ : torch::Tensor();
+  return F::linear(input, weight_, skip_bias_add ? torch::Tensor() : bias_);
+}
+
+// load weight/bias from the checkpoint; tensors absent from it are skipped
+void ReplicatedLinearImpl::load_state_dict(const StateDict& state_dict) {
+  const auto weight = state_dict.get_tensor("weight");
+  if (weight_.defined() && weight.defined()) {
+    CHECK_EQ(weight_.sizes(), weight.sizes())
+        << "weight size mismatch for " << name();
+    weight_.copy_(weight);
+    weight_is_loaded_ = true;
+  }
+  const auto bias = state_dict.get_tensor("bias");
+  if (bias_.defined() && bias.defined()) {
+    CHECK_EQ(bias.sizes(), bias_.sizes())
+        << "bias size mismatch for " << name();
+    bias_.copy_(bias);
+    bias_is_loaded_ = true;
+  }
+}
+
 }  // namespace llm
diff --git a/src/layers/linear_impl.h b/src/layers/linear_impl.h
index 425819b1..0134311f 100644
--- a/src/layers/linear_impl.h
+++ b/src/layers/linear_impl.h
@@ -135,4 +135,37 @@ class RowParallelLinearImpl : public ParallelLinearImpl {
   // parallel args
   ParallelArgs parallel_args_;
 };
+
+// Linear layer that loads its full, unsharded weight (used by the MoE model)
+class ReplicatedLinearImpl : public torch::nn::Module {
+ public:
+  ReplicatedLinearImpl(int64_t in_features,
+                       int64_t out_features,
+                       bool bias,           // whether a bias parameter is created
+                       bool skip_bias_add,  // return the bias instead of adding
+                       const QuantArgs& quant_args,
+                       const torch::TensorOptions& options);
+  torch::Tensor forward(torch::Tensor input, torch::Tensor& output_bias);
+  void load_state_dict(const StateDict& state_dict);
+
+  void verify_loaded_weights(const std::string& prefix = "") const {
+    CHECK(weight_is_loaded_)
+        << "weight is not loaded for " << prefix + "weight";
+    CHECK(!bias_.defined() || bias_is_loaded_)
+        << "bias is not loaded for " << prefix + "bias";
+  };
+
+ private:
+  // parameter members, must be registered
+  // we allocate the transpose since linear performs XA^T.
+  // A^T: [out_features, in_features]
+  torch::Tensor weight_{nullptr};
+  torch::Tensor bias_{nullptr};  // stays undefined when bias is disabled
+
+  bool skip_bias_add = false;
+  // set by load_state_dict once the matching tensor has been copied
+  bool weight_is_loaded_ = false;
+  bool bias_is_loaded_ = false;
+};
+TORCH_MODULE(ReplicatedLinear);
 }  // namespace llm
diff --git a/src/models/huggingface/mixtral.h b/src/models/huggingface/mixtral.h
new file mode 100644
index 00000000..549e2e91
--- /dev/null
+++ b/src/models/huggingface/mixtral.h
@@ -0,0 +1,497 @@
+#pragma once
+#include <torch/nn/functional/embedding.h>
+#include <torch/nn/options/activation.h>
+#include <torch/torch.h>
+
+#include "chat_template/coded_chat_template.h"
+#include "layers/activation.h"
+#include "layers/attention/attention.h"
+#include "layers/attention/handler.h"
+#include "layers/embedding.h"
+#include "layers/fused_moe.h"
+#include "layers/linear.h"
+#include "layers/linear_impl.h"
+#include "layers/normalization.h"
+#include "memory/kv_cache.h"
+#include "models/model_args.h"
+#include "models/model_registry.h"
+#include "models/parameters.h"
+
+namespace llm::hf {
+
+class MixtralBlockExpertImpl : public torch::nn::Module {
+ public:
+  // Builds the three bias-free projections of one expert (w1, w3:
+  // hidden_size -> intermediate_size; w2: intermediate_size -> hidden_size)
+  // and looks up the activation named by args.hidden_act().
+  MixtralBlockExpertImpl(const ModelArgs& args,
+                         const QuantArgs& quant_args,
+                         const ParallelArgs& parallel_args,
+                         const torch::TensorOptions& options) {
+    const auto inner_dim = args.intermediate_size();
+    const auto model_dim = args.hidden_size();
+    // all three layers share every option except their in/out feature counts
+    const auto make_linear = [&](auto in_features, auto out_features) {
+      return ReplicatedLinear(in_features,
+                              out_features,
+                              /*bias*/ false,
+                              /*skip_bias_add*/ false,
+                              quant_args,
+                              options);
+    };
+
+    w1_ = register_module("w1", make_linear(model_dim, inner_dim));
+    w2_ = register_module("w2", make_linear(inner_dim, model_dim));
+    w3_ = register_module("w3", make_linear(model_dim, inner_dim));
+    act_fn_ = Activation::get_act_func(args.hidden_act(), options.device());
+  }
+
+  // Computes w2(act_fn(w1(x)) * w3(x)) for the given rows of hidden states.
+  torch::Tensor forward(torch::Tensor hidden_states) {
+    // the linear layers report their bias through an out-parameter; it is
+    // not used here
+    torch::Tensor out_bias;
+    const auto gated = act_fn_(w1_(hidden_states, out_bias));
+    const auto projected = w3_(hidden_states, out_bias);
+    return w2_(gated * projected, out_bias);
+  }
+
+  // Loads each projection from its own sub-dictionary ("w1.", "w2.", "w3.").
+  void load_state_dict(const StateDict& state_dict) {
+    w1_->load_state_dict(state_dict.select("w1."));
+    w2_->load_state_dict(state_dict.select("w2."));
+    w3_->load_state_dict(state_dict.select("w3."));
+  }
+
+  // Asks every projection to check that its weights were loaded.
+  void verify_loaded_weights(const std::string& prefix) const {
+    w1_->verify_loaded_weights(prefix + "w1.");
+    w2_->verify_loaded_weights(prefix + "w2.");
+    w3_->verify_loaded_weights(prefix + "w3.");
+  }
+
+ private:
+  // submodules, registered in the constructor
+  ReplicatedLinear w1_{nullptr};
+  ReplicatedLinear w2_{nullptr};
+  ReplicatedLinear w3_{nullptr};
+  ActFunc act_fn_{nullptr};
+};
+TORCH_MODULE(MixtralBlockExpert);
+class MixtralMoEImpl : public torch::nn::Module {
+ public:
+  MixtralMoEImpl(const ModelArgs& args,
+                 const QuantArgs& quant_args,
+                 const ParallelArgs& parallel_args,
+                 const torch::TensorOptions& options) {
+    args_ = args;
+    gate_ = register_module("gate",
+                            ReplicatedLinear(args_.hidden_size(),
+                                             args_.n_local_experts(),
+                                             /*bias*/ false,
+                                             /*skip_bias_add*/ false,
+                                             quant_args,
+                                             options));
+    layers_.reserve(args_.n_local_experts());
+    experts_ = register_module("experts", torch::nn::ModuleList());
+    // Build one expert per local expert (not per selected expert): forward,
+    // load_state_dict and verify_loaded_weights index layers_ by expert id.
+    for (int i = 0; i < args_.n_local_experts(); i++) {
+      auto expert =
+          MixtralBlockExpert(args, quant_args, parallel_args, options);
+      layers_.push_back(expert);
+      experts_->push_back(expert);
+    }
+  }
+  // hidden_states: [n_tokens, hidden_size]; the result has the same shape.
+  torch::Tensor forward(torch::Tensor hidden_states) {
+    const int64_t hidden_size = hidden_states.size(-1);
+    hidden_states = hidden_states.view({-1, hidden_size});
+
+    // Routing: softmax in float32, keep the top-k experts and renormalize.
+    torch::Tensor out_bias;
+    auto router_logits = gate_(hidden_states, out_bias);
+    auto routing_weights = torch::softmax(router_logits, 1, torch::kFloat32);
+    auto [topk_weights, topk_indices] =
+        torch::topk(routing_weights, args_.n_experts_per_tok(), -1);
+    topk_weights = topk_weights / topk_weights.sum(-1, true);
+    // we cast back to the input dtype
+    topk_weights = topk_weights.to(hidden_states.dtype());
+
+    auto final_hidden_states = torch::zeros_like(hidden_states);
+    // One-hot encode the selected experts into a mask that tells, for every
+    // expert, which (top-k slot, token) pairs were routed to it.
+    auto expert_mask =
+        torch::nn::functional::one_hot(topk_indices, args_.n_local_experts())
+            .permute({2, 1, 0});  // [n_experts,n_topk,n_tokens]
+
+    // Run every expert on the tokens routed to it and accumulate the
+    // weighted outputs.
+    for (int i = 0; i < args_.n_local_experts(); i++) {
+      std::vector<torch::Tensor> v = torch::where(expert_mask[i]);
+      auto idx = v[0];    // top-k slot of each routed pair
+      auto top_x = v[1];  // token index of each routed pair
+      if (top_x.numel() == 0) {
+        continue;  // no token selected this expert
+      }
+      auto current_state = hidden_states.index_select(0, top_x);
+      // topk_weights[top_x, idx] is 1-D; the trailing axis makes it scale
+      // each output row instead of broadcasting against hidden_size.
+      auto current_hidden_states =
+          layers_[i](current_state) *
+          topk_weights.index({top_x, idx}).unsqueeze(-1);
+      final_hidden_states.index_add_(
+          0, top_x, current_hidden_states.to(hidden_states.dtype()));
+    }
+    return final_hidden_states.view({-1, hidden_size});
+  }
+  // Loads the gate and every expert from its "experts.<i>." sub-dictionary.
+  void load_state_dict(const StateDict& state_dict) {
+    gate_->load_state_dict(state_dict.select("gate."));
+    for (int i = 0; i < args_.n_local_experts(); i++) {
+      layers_[i]->load_state_dict(
+          state_dict.select("experts." + std::to_string(i) + "."));
+    }
+  }
+
+  void verify_loaded_weights(const std::string& prefix) const {
+    gate_->verify_loaded_weights(prefix + "gate.");
+    for (int i = 0; i < args_.n_local_experts(); i++) {
+      layers_[i]->verify_loaded_weights(prefix + "experts." +
+                                        std::to_string(i) + ".");
+    }
+  }
+
+ private:
+  ModelArgs args_;
+
+  ReplicatedLinear gate_{nullptr};
+
+  // Starts empty: `{nullptr}` would seed the vector with one null expert.
+  std::vector<MixtralBlockExpert> layers_;
+  torch::nn::ModuleList experts_{nullptr};
+};
+TORCH_MODULE(MixtralMoE);
+
+class MixtralAttentionImpl : public torch::nn::Module {
+ public:
+  // Builds the fused qkv projection, the output projection and attention.
+  MixtralAttentionImpl(const ModelArgs& args,
+                       const QuantArgs& quant_args,
+                       const ParallelArgs& parallel_args,
+                       const torch::TensorOptions& options,
+                       AttentionHandler* handler) {
+    const int32_t world_size = parallel_args.world_size();
+    const int64_t hidden_size = args.hidden_size();
+    const int64_t head_dim = args.head_dim();
+    const int64_t n_heads = args.n_heads();
+    // when no kv head count is configured, fall back to the query head count
+    const int64_t n_kv_heads = args.n_kv_heads().value_or(n_heads);
+    const int64_t n_local_heads = n_heads / world_size;
+    const int64_t n_local_kv_heads = n_kv_heads / world_size;
+
+    // widths of the q, k and v slices in the fused projection output
+    const int64_t q_width = n_local_heads * head_dim;
+    const int64_t kv_width = n_local_kv_heads * head_dim;
+    qkv_sizes_ = {q_width, kv_width, kv_width};
+
+    auto qkv_proj = ColumnParallelLinear(hidden_size,
+                                         (n_heads + 2 * n_kv_heads) * head_dim,
+                                         /*bias=*/false,
+                                         /*gather_output=*/false,
+                                         quant_args,
+                                         parallel_args,
+                                         options);
+    qkv_proj_ = register_module("qkv_proj", qkv_proj);
+
+    auto o_proj = RowParallelLinear(n_heads * head_dim,
+                                    hidden_size,
+                                    /*bias=*/false,
+                                    /*input_is_parallelized=*/true,
+                                    quant_args,
+                                    parallel_args,
+                                    options);
+    o_proj_ = register_module("o_proj", o_proj);
+
+    auto atten = Attention(n_local_heads, n_local_kv_heads, head_dim, handler);
+    atten_ = register_module("atten", atten);
+  }
+
+  // Projects x to q/k/v, runs attention and applies the output projection.
+  torch::Tensor forward(torch::Tensor x,
+                        torch::Tensor positions,
+                        KVCache& kv_cache,
+                        const InputParameters& input_params) {
+    // (num_tokens, dim) x (dim, n_local_heads * head_dim)
+    // => (num_tokens, n_local_heads * head_dim)
+    auto qkv = qkv_proj_(x).split(/*split_size=*/qkv_sizes_, /*dim=*/-1);
+    DCHECK_EQ(qkv.size(), 3);
+    auto query = qkv[0];
+    auto key = qkv[1];
+    auto value = qkv[2];
+
+    // attention output: (num_tokens, n_local_heads * head_dim)
+    auto attn_out =
+        atten_(query, key, value, positions, kv_cache, input_params);
+    return o_proj_(attn_out);
+  }
+
+  // q_proj/k_proj/v_proj are handed together to the fused projection
+  void load_state_dict(const StateDict& state_dict) {
+    qkv_proj_->load_state_dict(state_dict, {"q_proj.", "k_proj.", "v_proj."});
+    o_proj_->load_state_dict(state_dict.select("o_proj."));
+  }
+
+  void verify_loaded_weights(const std::string& prefix) const {
+    qkv_proj_->verify_loaded_weights(prefix + "[q_proj,k_proj,v_proj].");
+    o_proj_->verify_loaded_weights(prefix + "o_proj.");
+  }
+
+ private:
+  // registered submodules
+  ColumnParallelLinear qkv_proj_{nullptr};
+  RowParallelLinear o_proj_{nullptr};
+  Attention atten_{nullptr};
+
+  // split sizes for q, k, v
+  std::vector<int64_t> qkv_sizes_;
+};
+TORCH_MODULE(MixtralAttention);
+
+class MixtralDecoderLayerImpl : public torch::nn::Module {
+ public:
+  MixtralDecoderLayerImpl(const ModelArgs& args,
+                          const QuantArgs& quant_args,
+                          const ParallelArgs& parallel_args,
+                          const torch::TensorOptions& options,
+                          AttentionHandler* handler) {
+    // submodules are registered in the order: attention, MoE, the two norms
+    auto attention =
+        MixtralAttention(args, quant_args, parallel_args, options, handler);
+    self_attn_ = register_module("self_attn", attention);
+
+    auto moe = MixtralMoE(args, quant_args, parallel_args, options);
+    moe_ = register_module("moe", moe);
+
+    auto pre_norm =
+        RMSNormResidual(args.hidden_size(), args.rms_norm_eps(), options);
+    input_layernorm_ = register_module("input_layernorm", pre_norm);
+
+    auto post_norm =
+        RMSNormResidual(args.hidden_size(), args.rms_norm_eps(), options);
+    post_attention_layernorm_ =
+        register_module("post_attention_layernorm", post_norm);
+  }
+
+  // Runs norm -> attention -> norm -> MoE; `residual` is passed to both
+  // norm layers.
+  torch::Tensor forward(torch::Tensor x,
+                        torch::Tensor positions,
+                        KVCache& kv_cache,
+                        const InputParameters& input_params,
+                        torch::Tensor& residual) {
+    auto normed = input_layernorm_(x, residual);
+    auto attn_out = self_attn_(normed, positions, kv_cache, input_params);
+    // normalize again, then route through the MoE block
+    auto moe_in = post_attention_layernorm_(attn_out, residual);
+    return moe_(moe_in);
+  }
+
+  // The MoE weights live under the "block_sparse_moe." checkpoint prefix.
+  void load_state_dict(const StateDict& state_dict) {
+    self_attn_->load_state_dict(state_dict.select("self_attn."));
+    input_layernorm_->load_state_dict(state_dict.select("input_layernorm."));
+    post_attention_layernorm_->load_state_dict(
+        state_dict.select("post_attention_layernorm."));
+    moe_->load_state_dict(state_dict.select("block_sparse_moe."));
+  }
+
+  // Delegates the loaded-weights check to every submodule.
+  void verify_loaded_weights(const std::string& prefix) const {
+    self_attn_->verify_loaded_weights(prefix + "self_attn.");
+    input_layernorm_->verify_loaded_weights(prefix + "input_layernorm.");
+    post_attention_layernorm_->verify_loaded_weights(
+        prefix + "post_attention_layernorm.");
+    moe_->verify_loaded_weights(prefix + "block_sparse_moe.");
+  }
+
+ private:
+  // registered submodules
+  MixtralAttention self_attn_{nullptr};
+  MixtralMoE moe_{nullptr};
+
+  RMSNormResidual input_layernorm_{nullptr};
+  RMSNormResidual post_attention_layernorm_{nullptr};
+};
+TORCH_MODULE(MixtralDecoderLayer);
+
+// Mixtral backbone: token embedding, the decoder layer stack and final norm.
+class MixtralModelImpl : public torch::nn::Module {
+ public:
+  MixtralModelImpl(const ModelArgs& args,
+                   const QuantArgs& quant_args,
+                   const ParallelArgs& parallel_args,
+                   const torch::TensorOptions& options) {
+    // TODO: If we have implemented the lora, the vocab_size should be
+    // processed.
+    embed_tokens_ = register_module(
+        "embed_tokens",
+        ParallelEmbedding(
+            args.vocab_size(), args.hidden_size(), parallel_args, options));
+
+    handler_ = AttentionHandler::create_handler_with_rope(
+        args, /*interleaved=*/false, options);
+
+    blocks_ = register_module("layers", torch::nn::ModuleList());
+    layers_.reserve(args.n_layers());
+    for (int32_t i = 0; i < args.n_layers(); i++) {
+      auto block = MixtralDecoderLayer(
+          args, quant_args, parallel_args, options, handler_.get());
+      layers_.push_back(block);
+      blocks_->push_back(block);
+    }
+
+    norm_ = register_module(
+        "norm",
+        RMSNormResidual(args.hidden_size(), args.rms_norm_eps(), options));
+  }
+
+  // Embeds `tokens`, runs layer i with `kv_caches[i]` and applies the final
+  // norm, so `kv_caches` needs at least one entry per decoder layer.
+  torch::Tensor forward(torch::Tensor tokens,
+                        torch::Tensor positions,
+                        std::vector<KVCache>& kv_caches,
+                        const InputParameters& input_params) {
+    auto h = embed_tokens_(tokens);
+
+    // default-constructed, then handed by reference to each layer and norm_
+    torch::Tensor residual;
+    for (size_t i = 0; i < layers_.size(); i++) {
+      h = layers_[i](h, positions, kv_caches[i], input_params, residual);
+    }
+    return norm_(h, residual);
+  }
+
+  // Loads the embedding, layer i from "layers.<i>." and the final norm.
+  void load_state_dict(const StateDict& state_dict) {
+    embed_tokens_->load_state_dict(state_dict.select("embed_tokens.weight"));
+
+    for (size_t i = 0; i < layers_.size(); i++) {
+      layers_[i]->load_state_dict(
+          state_dict.select("layers." + std::to_string(i) + "."));
+    }
+    norm_->load_state_dict(state_dict.select("norm.weight"));
+  }
+
+  // Verifies every submodule, using the same names as load_state_dict().
+  void verify_loaded_weights(const std::string& prefix) const {
+    embed_tokens_->verify_loaded_weights(prefix + "embed_tokens.weight");
+    for (size_t i = 0; i < layers_.size(); i++) {
+      layers_[i]->verify_loaded_weights(prefix + "layers." + std::to_string(i) +
+                                        ".");
+    }
+    norm_->verify_loaded_weights(prefix + "norm.weight");
+  }
+
+ private:
+  // parameter members, must be registered
+  // embedding module
+  ParallelEmbedding embed_tokens_{nullptr};
+  RMSNormResidual norm_{nullptr};
+
+  // attention handler, owned here and passed to every layer as a raw pointer
+  std::unique_ptr<AttentionHandler> handler_{nullptr};
+
+  torch::nn::ModuleList blocks_{nullptr};
+  // hold same data but different type as blocks_ to avoid type cast.
+  // Must start empty: a `{nullptr}` initializer would list-initialize the
+  // vector with one null holder placed in front of the real layers.
+  std::vector<MixtralDecoderLayer> layers_;
+};
+TORCH_MODULE(MixtralModel);
+
+// Causal-LM wrapper: the Mixtral backbone plus an lm_head projection.
+class MixtralForCausalLMImpl : public torch::nn::Module {
+ public:
+  MixtralForCausalLMImpl(const ModelArgs& args,
+                         const QuantArgs& quant_args,
+                         const ParallelArgs& parallel_args,
+                         const torch::TensorOptions& options) {
+    model_ = register_module(
+        "model", MixtralModel(args, quant_args, parallel_args, options));
+    ColumnParallelLinear head(args.hidden_size(),
+                              args.vocab_size(),
+                              /*bias=*/false,
+                              /*gather_output=*/true,
+                              parallel_args,
+                              options);
+    lm_head_ = register_module("lm_head", head);
+  }
+
+  // Returns the backbone output for `tokens`; lm_head_ is applied by logits().
+  torch::Tensor forward(const torch::Tensor& tokens,
+                        const torch::Tensor& positions,
+                        std::vector<KVCache>& kv_caches,
+                        const InputParameters& input_params) {
+    return model_(tokens, positions, kv_caches, input_params);
+  }
+
+  // Applies lm_head_; a defined `selected_idxes` first index_selects dim 0.
+  torch::Tensor logits(const torch::Tensor& hidden_states,
+                       const torch::Tensor& selected_idxes) {
+    torch::Tensor picked =
+        selected_idxes.defined()
+            ? hidden_states.index_select(/*dim=*/0, selected_idxes)
+            : hidden_states;
+    return lm_head_(picked);
+  }
+
+  // Loads the backbone from "model." and the head from "lm_head.".
+  void load_state_dict(const StateDict& state_dict) {
+    model_->load_state_dict(state_dict.select("model."));
+    lm_head_->load_state_dict(state_dict.select("lm_head."));
+  }
+
+  void verify_loaded_weights() const {
+    model_->verify_loaded_weights("model.");
+    lm_head_->verify_loaded_weights("lm_head.");
+  }
+
+ private:
+  MixtralModel model_{nullptr};  // backbone
+  ColumnParallelLinear lm_head_{nullptr};  // hidden_size -> vocab_size
+};
+TORCH_MODULE(MixtralForCausalLM);
+
+// register the model to make it available
+REGISTER_CAUSAL_MODEL(mixtral, MixtralForCausalLM);
+
+REGISTER_MODEL_ARGS(mixtral, [&] {
+  // example config from huggingface
+  // https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/blob/main/config.json
+  LOAD_ARG_OR(model_type, "model_type", "mixtral");
+  LOAD_ARG_OR(bos_token_id, "bos_token_id", 1);
+  LOAD_ARG_OR(eos_token_id, "eos_token_id", 2);
+  LOAD_ARG_OR(hidden_size, "hidden_size", 4096);
+  LOAD_ARG_OR(intermediate_size, "intermediate_size", 14336);
+  LOAD_ARG_OR(max_position_embeddings, "max_position_embeddings", 4096 * 32);
+  LOAD_ARG_OR(n_heads, "num_attention_heads", 32);
+  LOAD_ARG_OR(n_experts_per_tok, "num_experts_per_tok", 2);
+  LOAD_ARG_OR(n_layers, "num_hidden_layers", 32);
+  LOAD_ARG_OR(n_kv_heads, "num_key_value_heads", 8);
+  LOAD_ARG_OR(n_local_experts, "num_local_experts", 8);
+  LOAD_ARG_OR(out_router_logits, "output_router_logits", false);
+  LOAD_ARG_OR(rms_norm_eps, "rms_norm_eps", 1e-5);
+  LOAD_ARG_OR(rope_theta, "rope_theta", 1e6);
+  LOAD_ARG_OR(router_aux_loss_coef, "router_aux_loss_coef", 0.001);
+  LOAD_ARG_OR(dtype, "torch_dtype", "bfloat16");
+  LOAD_ARG_OR(vocab_size, "vocab_size", 32000);
+  // the Mixtral config names this key "hidden_act", not "hidden_activation"
+  LOAD_ARG_OR(hidden_act, "hidden_act", "silu");
+  // the function passed for head_dim computes hidden_size / n_heads
+  LOAD_ARG_OR_FUNC(head_dim, "head_dim", [&] {
+    return args->hidden_size() / args->n_heads();
+  });
+});
+
+}  // namespace llm::hf
\ No newline at end of file
diff --git a/src/models/model_args.h b/src/models/model_args.h
index ddbd45e5..3b873093 100644
--- a/src/models/model_args.h
+++ b/src/models/model_args.h
@@ -103,6 +103,16 @@ struct ModelArgs {
 
   // Stop token ids for decoding.
   DEFINE_ARG(std::unordered_set<int32_t>, stop_token_ids);
+
+  // The number of experts selected per token (top-k) in MoE
+  DEFINE_ARG(int, n_experts_per_tok);
+
+  // The number of experts in MoE
+  DEFINE_ARG(int64_t, n_local_experts);
+
+  DEFINE_ARG(bool, out_router_logits);
+
+  DEFINE_ARG(float, router_aux_loss_coef);
 };
 
 inline std::ostream& operator<<(std::ostream& os, const ModelArgs& args) {
@@ -134,6 +144,10 @@ inline std::ostream& operator<<(std::ostream& os, const ModelArgs& args) {
   os << ", linear_bias: " << args.linear_bias();
   os << ", qkv_bias: " << args.qkv_bias();
   os << ", residual_post_layernorm: " << args.residual_post_layernorm();
+  os << ", n_experts_per_tok: " << args.n_experts_per_tok();
+  os << ", n_local_experts: " << args.n_local_experts();
+  os << ", out_router_logits: " << args.out_router_logits();
+  os << ", router_aux_loss_coef: " << args.router_aux_loss_coef();
   os << "]";
   return os;
 }
diff --git a/src/models/model_registry.cpp b/src/models/model_registry.cpp
index 5f0b10e0..f95efc49 100644
--- a/src/models/model_registry.cpp
+++ b/src/models/model_registry.cpp
@@ -14,6 +14,7 @@
 #include "huggingface/internlm.h"  // IWYU pragma: keep
 #include "huggingface/llama.h"     // IWYU pragma: keep
 #include "huggingface/mistral.h"   // IWYU pragma: keep
+#include "huggingface/mixtral.h"   // IWYU pragma: keep
 #include "huggingface/mpt.h"       // IWYU pragma: keep
 #include "huggingface/phi.h"       // IWYU pragma: keep
 #include "huggingface/qwen.h"      // IWYU pragma: keep
diff --git a/src/server/simple.cpp b/src/server/simple.cpp
index 860441ad..f79a1339 100644
--- a/src/server/simple.cpp
+++ b/src/server/simple.cpp
@@ -21,7 +21,7 @@ namespace py = pybind11;
 static constexpr int64_t GB = int64_t(1024) * 1024 * 1024;
 
 DEFINE_string(model_name_or_path,
-              "THUDM/chatglm3-6b",
+              "mistralai/Mixtral-8x7B-v0.1",
               "hf model name or path to the model file.");
 
 DEFINE_string(draft_model_name_or_path,
@@ -33,7 +33,7 @@ DEFINE_string(model_allow_patterns,
               "Allow patterns for model files.");
 
 DEFINE_string(device,
-              "cuda",
+              "cuda:0,cuda:1,cuda:2,cuda:3",
               "Device to run the model on, e.g. cpu, cuda:0, cuda:0,cuda:1, or "
               "auto to use all available gpus.");
 
@@ -120,6 +120,7 @@ std::unique_ptr<Engine> create_engine(const std::string& model_path,
 }
 
 int main(int argc, char* argv[]) {
+  std::cout << "=======test========" << std::flush;
   // initialize glog and gflags
   google::InitGoogleLogging(argv[0]);
   gflags::ParseCommandLineFlags(&argc, &argv, true);