diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 046358bcca708d..15d6d124cc748a 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -434,67 +434,6 @@ Expected<std::string> getTargetTripleAndFeatures(hsa_agent_t Agent) {
   return Target;
 }
 
-/// Compute the occupancy with the constraint on the number of SGPRs
-/// Follow the logic on the backend
-/// Ref:
-/// llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp:getOccupancyWithNumSGPRs
-unsigned getOccupancyWithNumSGPRs(unsigned SGPRCount) {
-
-  if (SGPRCount <= llvm::omp::amdgpu_arch::SGPRCountOccupancy10) {
-    return 10;
-  } else if (SGPRCount <= llvm::omp::amdgpu_arch::SGPRCountOccupancy9) {
-    return 9;
-  } else if (SGPRCount <= llvm::omp::amdgpu_arch::SGPRCountOccupancy8) {
-    return 8;
-  }
-
-  return 7;
-}
-
-/// Compute the occupancy with the constraint on LDS
-/// Follow the logic on the backend
-/// Ref:
-/// llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp:getOccupancyWithLocalMemSize
-unsigned getOccupancyWithLDS(uint32_t GroupSegmentSize, unsigned MaxWavesPerEU,
-                             uint32_t MaxFlatWorkgroupSize) {
-
-  unsigned MaxWorkgroupNum =
-      llvm::omp::amdgpu_arch::LocalMemorySize / GroupSegmentSize;
-
-  // workgroup size
-  unsigned ThreadsPerWorkgroup = MaxFlatWorkgroupSize;
-  unsigned WavesPerWorkgroup =
-      divideCeil(ThreadsPerWorkgroup, llvm::omp::amdgpu_arch::WaveFrontSize64);
-
-  unsigned MaxWavesPerCU = MaxWavesPerEU * llvm::omp::amdgpu_arch::SIMDPerCU;
-
-  // if a workgroup has just one wavefront, the max # of workgroup per CU is
-  // 40 if a workgroup has more than one wavefront, the max # of workgroup per
-  // CU is 16 https://github.com/ROCm/ROCm/issues/746#issuecomment-474656922
-  if (WavesPerWorkgroup <= 1) {
-
-    MaxWorkgroupNum = std::min(MaxWorkgroupNum, MaxWavesPerCU);
-  } else {
-    MaxWorkgroupNum =
-        std::min(MaxWorkgroupNum, MaxWavesPerCU / WavesPerWorkgroup);
-    MaxWorkgroupNum =
-        std::min(MaxWorkgroupNum, llvm::omp::amdgpu_arch::MaxWorkgroupNumPerCU);
-  }
-
-  // per SIMD
-  unsigned WaveNumByLDS = divideCeil(WavesPerWorkgroup * MaxWorkgroupNum,
-                                     llvm::omp::amdgpu_arch::SIMDPerCU);
-  WaveNumByLDS = std::min(WaveNumByLDS, MaxWavesPerEU);
-
-  return WaveNumByLDS;
-}
-
-// forward declaration
-unsigned computeOccupancy(
-    GenericDeviceTy &Device,
-    std::optional<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfo,
-    uint32_t NumThreads, uint64_t NumBlocks);
-
 } // namespace hsa_utils
 
 /// Utility class representing generic resource references to AMDGPU resources.
@@ -1288,6 +1227,111 @@ struct AMDGPUKernelTy : public GenericKernelTy {
     return std::min(PreferredNumBlocks,
                     (uint64_t)GenericDevice.getBlockLimit());
   }
+
+  /// Map the number of SGPRs used by a kernel to the occupancy it allows.
+  /// Mirrors the backend logic, see
+  /// llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp:getOccupancyWithNumSGPRs
+  /// \p SGPRCount is tested against SGPRCountOccupancy10, SGPRCountOccupancy9
+  /// and SGPRCountOccupancy8 in that order; 7 is returned if all are exceeded.
+  unsigned getOccupancyWithNumSGPRs(unsigned SGPRCount) const {
+    if (SGPRCount <= llvm::omp::amdgpu_arch::SGPRCountOccupancy10)
+      return 10;
+    if (SGPRCount <= llvm::omp::amdgpu_arch::SGPRCountOccupancy9)
+      return 9;
+    if (SGPRCount <= llvm::omp::amdgpu_arch::SGPRCountOccupancy8)
+      return 8;
+
+    return 7;
+  }
+
+  /// Compute the occupancy permitted by the kernel's LDS (local memory) usage.
+  /// Follows the logic on the backend. Ref:
+  /// llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp:getOccupancyWithLocalMemSize
+  /// \param GroupSegmentSize LDS size requested by the kernel; a zero value is
+  ///        treated as one so that the division below stays well defined.
+  /// \param MaxWavesPerEU upper bound applied to the returned value.
+  /// \param MaxFlatWorkgroupSize threads per workgroup assumed for the kernel.
+  /// \returns the LDS-limited number of waves per SIMD, at most MaxWavesPerEU.
+  unsigned getOccupancyWithLDS(uint32_t GroupSegmentSize,
+                               unsigned MaxWavesPerEU,
+                               uint32_t MaxFlatWorkgroupSize) const {
+    // Workgroups that fit into LDS; never divide by a zero-sized request.
+    unsigned MaxWorkgroupNum = llvm::omp::amdgpu_arch::LocalMemorySize /
+                               (GroupSegmentSize ? GroupSegmentSize : 1u);
+
+    unsigned WavesPerWorkgroup = divideCeil(
+        MaxFlatWorkgroupSize, llvm::omp::amdgpu_arch::WaveFrontSize64);
+    unsigned MaxWavesPerCU = MaxWavesPerEU * llvm::omp::amdgpu_arch::SIMDPerCU;
+
+    // A workgroup made of a single wavefront may have up to 40 workgroups
+    // per CU; with more than one wavefront the limit is 16 workgroups per CU.
+    // See https://github.com/ROCm/ROCm/issues/746#issuecomment-474656922
+    if (WavesPerWorkgroup <= 1) {
+      MaxWorkgroupNum = std::min(MaxWorkgroupNum, MaxWavesPerCU);
+    } else {
+      MaxWorkgroupNum =
+          std::min(MaxWorkgroupNum, MaxWavesPerCU / WavesPerWorkgroup);
+      MaxWorkgroupNum = std::min(MaxWorkgroupNum,
+                                 llvm::omp::amdgpu_arch::MaxWorkgroupNumPerCU);
+    }
+
+    // Convert the per-CU wave count into a per-SIMD count and clamp it.
+    unsigned WaveNumByLDS = divideCeil(WavesPerWorkgroup * MaxWorkgroupNum,
+                                       llvm::omp::amdgpu_arch::SIMDPerCU);
+    WaveNumByLDS = std::min(WaveNumByLDS, MaxWavesPerEU);
+    return WaveNumByLDS;
+  }
+
+  /// Compute the max kernel occupancy for AMD GPU.
+  /// The result is the minimum of the limits imposed by SGPR usage, VGPR
+  /// usage, LDS usage and the device's maximum number of waves per EU. It is
+  /// stored in MaxOccupancy before being returned. If no kernel metadata is
+  /// available, MaxOccupancy is returned unchanged.
+  unsigned computeMaxOccupancy(GenericDeviceTy &Device) const override {
+    // Without metadata there is nothing to base the estimate on; do not
+    // dereference an empty KernelInfo.
+    if (!KernelInfo)
+      return MaxOccupancy;
+
+    uint32_t GroupSegmentSize = (*KernelInfo).GroupSegmentList;
+    uint32_t SGPRCount = (*KernelInfo).SGPRCount;
+    uint32_t VGPRCount = (*KernelInfo).VGPRCount;
+    uint32_t MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
+
+    // Default number of waves per EU; gfx90a devices use MaxWavesPerEU8.
+    unsigned MaxWavesPerEU = llvm::omp::amdgpu_arch::MaxWavesPerEU10;
+    if (Device.hasGfx90aDevice())
+      MaxWavesPerEU = llvm::omp::amdgpu_arch::MaxWavesPerEU8;
+
+    unsigned Occupancy = INT_MAX;
+    // Constraint on SGPR
+    if (SGPRCount)
+      Occupancy = getOccupancyWithNumSGPRs(SGPRCount);
+
+    Occupancy = std::min(Occupancy, MaxWavesPerEU);
+
+    // Constraint on VGPR
+    // Follow the logic on the backend
+    // Ref:
+    // llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp:getNumWavesPerEUWithNumVGPRs
+    if (VGPRCount) {
+      unsigned WaveNumByVGPR =
+          llvm::omp::amdgpu_arch::VGPRNumPerThread / VGPRCount;
+      Occupancy = std::min(Occupancy, WaveNumByVGPR);
+    }
+
+    // Constraint on LDS. With no LDS in use nothing more is needed: Occupancy
+    // has already been clamped to MaxWavesPerEU above.
+    if (GroupSegmentSize) {
+      unsigned WaveNumByLDS = getOccupancyWithLDS(
+          GroupSegmentSize, MaxWavesPerEU, MaxFlatWorkgroupSize);
+      Occupancy = std::min(Occupancy, WaveNumByLDS);
+    }
+
+    // Cache the value before return
+    MaxOccupancy = Occupancy;
+    return Occupancy;
+  }
 };
 
 /// Class representing an HSA signal. Signals are used to define dependencies
@@ -4980,10 +5024,6 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
   auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
   // auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
 
-  // kernel occupancy
-  auto Occupancy =
-      hsa_utils::computeOccupancy(GenericDevice, KernelInfo, NumThreads, NumBlocks);
-
   // This line should print exactly as the one in the old plugin.
   fprintf(stderr,
           "DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4luX%4d) "
@@ -4994,7 +5034,7 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
           KernelArgs.NumArgs, NumBlocks, NumThreads, 0, 0, GroupSegmentSize,
           SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount,
           KernelArgs.Tripcount, NeedsHostServices, isMultiDeviceKernel(),
-          MultiDeviceLB, MultiDeviceUB, Occupancy, getName());
+          MultiDeviceLB, MultiDeviceUB, MaxOccupancy, getName());
 }
 
 Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
@@ -5221,63 +5261,6 @@ void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
   FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
 }
 
-namespace hsa_utils {
-// TODO: improve the computation logic
-//        with more corner cases
-// split namespace utils for solving the dependency
-/// Compute kernel occupancy
-unsigned computeOccupancy(
-    GenericDeviceTy &Device,
-    std::optional<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfo,
-    uint32_t NumThreads, uint64_t NumBlocks) {
-  uint32_t GroupSegmentSize = (*KernelInfo).GroupSegmentList;
-  uint32_t SGPRCount = (*KernelInfo).SGPRCount;
-  uint32_t VGPRCount = (*KernelInfo).VGPRCount;
-  uint32_t MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
-
-  // device info
-  AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
-  unsigned MaxWavesPerEU = llvm::omp::amdgpu_arch::MaxWavesPerEU10;
-
-  // get GPU info
-  bool IsEquippedWithGFX90A = AMDGPUDevice.hasGfx90aDevice();
-  if (IsEquippedWithGFX90A) {
-    MaxWavesPerEU = llvm::omp::amdgpu_arch::MaxWavesPerEU8;
-  }
-
-  unsigned Occupancy = INT_MAX;
-
-  // contraint on SGPR
-  if (SGPRCount) {
-    Occupancy = hsa_utils::getOccupancyWithNumSGPRs(SGPRCount);
-  }
-
-  Occupancy = std::min(Occupancy, MaxWavesPerEU);
-
-  // constraint on VGPR
-  // follow the logic on the backend
-  // ref:
-  // llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp:getNumWavesPerEUWithNumVGPRs
-  if (VGPRCount) {
-    unsigned WaveNumByVGPR =
-        llvm::omp::amdgpu_arch::VGPRNumPerThread / VGPRCount;
-    Occupancy = std::min(Occupancy, WaveNumByVGPR);
-  }
-
-  // constraint on LDS
-  if (GroupSegmentSize) {
-    unsigned WaveNumByLDS = hsa_utils::getOccupancyWithLDS(
-        GroupSegmentSize, MaxWavesPerEU, MaxFlatWorkgroupSize);
-    Occupancy = std::min(Occupancy, WaveNumByLDS);
-  } else {
-    // if 0 LDS required by the kernel
-    Occupancy = std::min(Occupancy, MaxWavesPerEU);
-  }
-
-  return Occupancy;
-}
-} // namespace hsa_utils
-
 } // namespace plugin
 } // namespace target
 } // namespace omp
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index b7c83d558f91ee..63e8b6fabd63fa 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -310,6 +310,15 @@ struct GenericKernelTy {
   /// Check if kernel is a multi-device kernel.
   bool isMultiDeviceKernel() const { return IsMultiDeviceKernel; }
 
+  /// Compute kernel occupancy.
+  /// Computes the max (upper-bound) occupancy for a launched kernel based on
+  /// the given hardware resources, e.g. the number of registers, size of the
+  /// local memory, etc. This default only returns the stored MaxOccupancy.
+  virtual unsigned computeMaxOccupancy(GenericDeviceTy &Device) const {
+    // This function should be overridden in the derived class.
+    return MaxOccupancy;
+  }
+
 protected:
   /// Get the execution mode name of the kernel.
   const char *getExecutionModeName() const {
@@ -415,6 +424,7 @@ struct GenericKernelTy {
   bool isXTeamReductionsMode() const {
     return ExecutionMode == OMP_TGT_EXEC_MODE_XTEAM_RED;
   }
+
   /// The kernel environment, including execution flags.
   KernelEnvironmentTy KernelEnvironment;
 
@@ -423,6 +433,10 @@ struct GenericKernelTy {
 
   /// If the kernel is a bare kernel.
   bool IsBareKernel = false;
+
+  /// Upper-bound for the launched kernel occupancy.
+  /// -1 (stored as the largest unsigned value) indicates an invalid result.
+  mutable unsigned MaxOccupancy = -1;
 };
 
 /// Information about an allocation, when it has been allocated, and when/if it
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index bbdbf5420cd666..6c00b27f8d44fc 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -714,6 +714,9 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
                     Args, Ptrs, *KernelLaunchEnvOrErr);
   }
 
+  // Get max occupancy for this kernel
+  computeMaxOccupancy(GenericDevice);
+
   uint32_t NumThreads = getNumThreads(GenericDevice, KernelArgs.ThreadLimit);
 
   std::pair<bool, uint32_t> AdjustInfo = adjustNumThreadsForLowTripCount(