From 651afb49fa7807073d493571913d3b999d742cc4 Mon Sep 17 00:00:00 2001 From: Ben Howe <141149032+bmhowe23@users.noreply.github.com> Date: Mon, 15 Jul 2024 17:49:40 -0500 Subject: [PATCH] NVQC Optimizations for VQE (C++ and Python) (#1901) This set of changes optimizes VQE performance when running simulations on NVQC. The changes work for both C++ and Python applications. Details: allow cudaq::vqe() to be invoked with variadic arguments similar to how we invoke cudaq::sample() and cudaq::observe(). More specifically, that means that we allow the user to pass the concrete, non-variational arguments (the arguments not subject to VQE optimization) directly into the cudaq::vqe() call. Users must switch to this calling convention to achieve notable speedups. --- .../examples/cpp/algorithms/qaoa_maxcut.cpp | 6 +- .../sphinx/examples/cpp/algorithms/vqe_h2.cpp | 8 +- include/cudaq/Optimizer/Builder/Factory.h | 5 +- include/cudaq/Optimizer/Transforms/Passes.h | 4 +- include/cudaq/Optimizer/Transforms/Passes.td | 2 + lib/Optimizer/Builder/Factory.cpp | 9 +- .../Transforms/GenKernelExecution.cpp | 21 ++- lib/Optimizer/Transforms/QuakeSynthesizer.cpp | 54 ++++--- python/runtime/cudaq/algorithms/py_vqe.cpp | 121 +++++++++++++- .../cudaq/platform/py_alt_launch_kernel.cpp | 65 ++++++-- python/runtime/utils/PyRemoteSimulatorQPU.cpp | 49 ++++-- python/tests/remote/test_remote_code_exec.py | 25 +++ python/utils/OpaqueArguments.h | 12 +- runtime/common/BaseRemoteSimulatorQPU.h | 86 +++++++--- runtime/common/BaseRestRemoteClient.h | 108 ++++++++++--- runtime/common/KernelWrapper.h | 31 ++++ runtime/common/RemoteKernelExecutor.h | 18 ++- runtime/cudaq/algorithms/gradient.h | 57 ++++++- .../algorithms/gradients/central_difference.h | 6 + .../algorithms/gradients/forward_difference.h | 6 + .../algorithms/gradients/parameter_shift.h | 6 + runtime/cudaq/algorithms/vqe.h | 146 ++++++++++++++--- .../rest_server/helpers/RestRemoteServer.cpp | 94 ++++++++++- runtime/cudaq/platform/qpu.h | 10 ++ runtime/cudaq/platform/quantum_platform.cpp | 21 +++ runtime/cudaq/platform/quantum_platform.h | 11 ++ targettests/Remote-Sim/vqe_h2.cpp | 149 ++++++++++++++++++ 27 files changed, 977 insertions(+), 153 deletions(-) create mode 100644 targettests/Remote-Sim/vqe_h2.cpp diff --git a/docs/sphinx/examples/cpp/algorithms/qaoa_maxcut.cpp b/docs/sphinx/examples/cpp/algorithms/qaoa_maxcut.cpp index 63d55f51d3f..0c14b578c60 100644 --- a/docs/sphinx/examples/cpp/algorithms/qaoa_maxcut.cpp +++ b/docs/sphinx/examples/cpp/algorithms/qaoa_maxcut.cpp @@ -72,10 +72,8 @@ int main() { -M_PI / 8.0, M_PI / 8.0, n_params, std::mt19937::default_seed); // Call the optimizer - auto [opt_val, opt_params] = cudaq::vqe( - ansatz{}, Hp, optimizer, n_params, [&](std::vector params) { - return std::make_tuple(params, n_qubits, n_layers); - }); + auto [opt_val, opt_params] = + cudaq::vqe(ansatz{}, Hp, optimizer, n_params, n_qubits, n_layers); // Print the optimized value and the parameters printf("Optimal value = %.16lf\n", opt_val); diff --git a/docs/sphinx/examples/cpp/algorithms/vqe_h2.cpp b/docs/sphinx/examples/cpp/algorithms/vqe_h2.cpp index c38df10d15a..50c65da2d6a 100644 --- a/docs/sphinx/examples/cpp/algorithms/vqe_h2.cpp +++ b/docs/sphinx/examples/cpp/algorithms/vqe_h2.cpp @@ -104,18 +104,14 @@ int main() { so4_fabric ansatz; - auto argMapper = [&](std::vector x) { - return std::make_tuple(x, n_qubits, n_layers); - }; - // Run VQE. 
cudaq::optimizers::lbfgs optimizer; optimizer.initial_parameters = init_params; optimizer.max_eval = 20; optimizer.max_line_search_trials = 10; - cudaq::gradients::central_difference gradient(ansatz, argMapper); + cudaq::gradients::central_difference gradient; auto [opt_val, opt_params] = - cudaq::vqe(ansatz, gradient, H, optimizer, n_params, argMapper); + cudaq::vqe(ansatz, gradient, H, optimizer, n_params, n_qubits, n_layers); printf("Optimal value = %.16lf\n", opt_val); } diff --git a/include/cudaq/Optimizer/Builder/Factory.h b/include/cudaq/Optimizer/Builder/Factory.h index 4771f828c52..42c2d306b83 100644 --- a/include/cudaq/Optimizer/Builder/Factory.h +++ b/include/cudaq/Optimizer/Builder/Factory.h @@ -91,7 +91,10 @@ mlir::Type genArgumentBufferType(mlir::Type ty); /// ``` /// where the values of the vector argument are pass-by-value and appended to /// the end of the struct as a sequence of \i n double values. -cudaq::cc::StructType buildInvokeStructType(mlir::FunctionType funcTy); +/// +/// The leading `startingArgIdx + 1` parameters are omitted from the struct. +cudaq::cc::StructType buildInvokeStructType(mlir::FunctionType funcTy, + std::size_t startingArgIdx = 0); /// Return the LLVM-IR dialect type: `[length x i8]`. inline mlir::Type getStringType(mlir::MLIRContext *ctx, std::size_t length) { diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h index 996b6e56a70..4b15be99155 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.h +++ b/include/cudaq/Optimizer/Transforms/Passes.h @@ -41,7 +41,9 @@ std::unique_ptr createObserveAnsatzPass(std::vector &); std::unique_ptr createQuakeAddMetadata(); std::unique_ptr createQuakeAddDeallocs(); std::unique_ptr createQuakeSynthesizer(); -std::unique_ptr createQuakeSynthesizer(std::string_view, void *); +std::unique_ptr +createQuakeSynthesizer(std::string_view, const void *, + std::size_t startingArgIdx = 0); std::unique_ptr createRaiseToAffinePass(); std::unique_ptr createUnwindLoweringPass(); diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td index 1016351f7a2..e1a2d4f8b30 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.td +++ b/include/cudaq/Optimizer/Transforms/Passes.td @@ -240,6 +240,8 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> { let options = [ Option<"outputFilename", "output-filename", "std::string", /*default=*/"\"-\"", "Name of output file.">, + Option<"startingArgIdx", "starting-arg-idx", "std::size_t", /*default=*/"0", + "The starting argument index for the argsCreator.">, ]; } diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp index d22cfc097ce..df618fbc095 100644 --- a/lib/Optimizer/Builder/Factory.cpp +++ b/lib/Optimizer/Builder/Factory.cpp @@ -62,11 +62,14 @@ Type factory::genArgumentBufferType(Type ty) { return genBufferType(ty); } -cudaq::cc::StructType factory::buildInvokeStructType(FunctionType funcTy) { +cudaq::cc::StructType +factory::buildInvokeStructType(FunctionType funcTy, + std::size_t startingArgIdx) { auto *ctx = funcTy.getContext(); SmallVector eleTys; - for (auto inTy : funcTy.getInputs()) - eleTys.push_back(genBufferType(inTy)); + for (auto inTy : llvm::enumerate(funcTy.getInputs())) + if (inTy.index() >= startingArgIdx) + eleTys.push_back(genBufferType(inTy.value())); for (auto outTy : funcTy.getResults()) eleTys.push_back(genBufferType(outTy)); return cudaq::cc::StructType::get(ctx, eleTys); diff --git 
a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index be0326a4a49..c3ce43382bc 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -352,7 +352,7 @@ class GenerateKernelExecution builder.setInsertionPointToStart(entry); // Get the original function args - auto kernelArgTypes = devKernelTy.getInputs(); + auto kernelArgTypes = devKernelTy.getInputs().drop_front(startingArgIdx); // Init the struct Value stVal = builder.create(loc, msgStructTy); @@ -1531,8 +1531,23 @@ class GenerateKernelExecution funcTy, funcOp); // Generate the argsCreator function used by synthesis. - auto argsCreatorFunc = genKernelArgsCreatorFunction( - loc, builder, funcTy, structTy, classNameStr, hostFuncTy, hasThisPtr); + mlir::func::FuncOp argsCreatorFunc; + if (startingArgIdx == 0) { + argsCreatorFunc = + genKernelArgsCreatorFunction(loc, builder, funcTy, structTy, + classNameStr, hostFuncTy, hasThisPtr); + } else { + // We are operating in a very special case where we want the argsCreator + // function to ignore the first `startingArgIdx` arguments. In this + // situation, the argsCreator function will not be compatible with the + // other helper functions created in this pass, so it is assumed that + // the caller is OK with that. + auto structTy_argsCreator = + cudaq::opt::factory::buildInvokeStructType(funcTy, startingArgIdx); + argsCreatorFunc = genKernelArgsCreatorFunction( + loc, builder, funcTy, structTy_argsCreator, classNameStr, + hostFuncTy, hasThisPtr); + } // Generate a new mangled function on the host side to call the // callback function. diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp index 79548fd5fff..731b436edab 100644 --- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp +++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp @@ -55,14 +55,14 @@ class state; /// BlockArgument with it. template void synthesizeRuntimeArgument( - OpBuilder &builder, BlockArgument argument, void *args, std::size_t offset, - std::size_t typeSize, + OpBuilder &builder, BlockArgument argument, const void *args, + std::size_t offset, std::size_t typeSize, std::function &&opGenerator) { // Create an instance of the concrete type ConcreteType concrete; // Copy the void* struct member into that concrete instance - std::memcpy(&concrete, ((char *)args) + offset, typeSize); + std::memcpy(&concrete, ((const char *)args) + offset, typeSize); // Generate the MLIR Value (arith constant for example) auto runtimeArg = opGenerator(builder, &concrete); @@ -387,18 +387,26 @@ class QuakeSynthesizer std::string kernelName; // The raw pointer to the runtime arguments. - void *args; + const void *args; + + // The starting argument index to synthesize. Typically 0 but may be >0 for + // partial synthesis. If >0, it is assumed that the first argument(s) are NOT + // in `args`. 
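+ // Illustrative example (not from the original code): for a kernel with
+ // signature (std::vector<double>, int), a startingArgIdx of 1 means `args`
+ // holds only the serialized int; the leading std::vector<double> argument
+ // remains a block argument and is not synthesized away.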
+ std::size_t startingArgIdx = 0; public: QuakeSynthesizer() = default; - QuakeSynthesizer(std::string_view kernel, void *a) + QuakeSynthesizer(std::string_view kernel, const void *a) : kernelName(kernel), args(a) {} + QuakeSynthesizer(std::string_view kernel, const void *a, std::size_t s) + : kernelName(kernel), args(a), startingArgIdx(s) {} mlir::ModuleOp getModule() { return getOperation(); } std::pair> getTargetLayout(FunctionType funcTy) { - auto bufferTy = cudaq::opt::factory::buildInvokeStructType(funcTy); + auto bufferTy = + cudaq::opt::factory::buildInvokeStructType(funcTy, startingArgIdx); StringRef dataLayoutSpec = ""; if (auto attr = getModule()->getAttr(cudaq::opt::factory::targetDataLayoutAttrName)) @@ -449,10 +457,10 @@ class QuakeSynthesizer // Keep track of the stdVec sizes. std::vector> stdVecInfo; - for (auto iter : llvm::enumerate(arguments)) { - auto argNum = iter.index(); - auto argument = iter.value(); - std::size_t offset = structLayout.second[argNum]; + for (std::size_t argNum = startingArgIdx, end = arguments.size(); + argNum < end; argNum++) { + auto argument = arguments[argNum]; + std::size_t offset = structLayout.second[argNum - startingArgIdx]; // Get the argument type auto type = argument.getType(); @@ -560,9 +568,10 @@ class QuakeSynthesizer signalPassFailure(); return; } - char *ptrToSizeInBuffer = static_cast(args) + offset; + const char *ptrToSizeInBuffer = + static_cast(args) + offset; auto sizeFromBuffer = - *reinterpret_cast(ptrToSizeInBuffer); + *reinterpret_cast(ptrToSizeInBuffer); auto bytesInType = [&eleTy]() -> unsigned { if (isa(eleTy)) return 16 /*bytes: sizeof(ptr) + sizeof(i64)*/; @@ -589,8 +598,10 @@ class QuakeSynthesizer // TODO: for now we can ignore empty struct types. continue; } - char *ptrToSizeInBuffer = static_cast(args) + offset; - auto rawSize = *reinterpret_cast(ptrToSizeInBuffer); + const char *ptrToSizeInBuffer = + static_cast(args) + offset; + auto rawSize = + *reinterpret_cast(ptrToSizeInBuffer); stdVecInfo.emplace_back(argNum, Type{}, rawSize); continue; } @@ -604,7 +615,7 @@ class QuakeSynthesizer // the block arg with the actual vector element data. First get the pointer // to the start of the buffer's appendix. auto structSize = structLayout.first; - char *bufferAppendix = static_cast(args) + structSize; + const char *bufferAppendix = static_cast(args) + structSize; for (auto [idx, eleTy, vecLength] : stdVecInfo) { if (!eleTy) { // FIXME: Skip struct values. @@ -614,7 +625,7 @@ class QuakeSynthesizer continue; } auto doVector = [&](T) { - auto *ptr = reinterpret_cast(bufferAppendix); + auto *ptr = reinterpret_cast(bufferAppendix); std::vector v(ptr, ptr + vecLength); if (failed(synthesizeVectorArgument(builder, module, counter, arguments[idx], v))) @@ -667,7 +678,8 @@ class QuakeSynthesizer // of sizes that are encoded starting at bufferAppendix. // At the end of the block of sizes, the C-strings will be encoded. auto numberSpans = vecLength; - auto *spanSizes = reinterpret_cast(bufferAppendix); + auto *spanSizes = + reinterpret_cast(bufferAppendix); bufferAppendix += vecLength * sizeof(std::uint64_t); // These strings are reified in the following way: // - Create an array numberSpans in length and where each element @@ -726,7 +738,8 @@ class QuakeSynthesizer // Remove the old arguments. 
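+ // Only arguments at index >= startingArgIdx were synthesized above, so only
+ // those are erased below; any leading (unsynthesized) arguments remain in
+ // place.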
auto numArgs = funcOp.getNumArguments(); BitVector argsToErase(numArgs); - for (std::size_t argIndex = 0; argIndex < numArgs; ++argIndex) { + for (std::size_t argIndex = startingArgIdx; argIndex < numArgs; + ++argIndex) { argsToErase.set(argIndex); if (!funcOp.getBody().front().getArgument(argIndex).getUses().empty()) { funcOp.emitError("argument(s) still in use after synthesis."); @@ -745,6 +758,7 @@ std::unique_ptr<mlir::Pass> cudaq::opt::createQuakeSynthesizer() { } std::unique_ptr<mlir::Pass> -cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, void *a) { - return std::make_unique<QuakeSynthesizer>(kernelName, a); +cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, const void *a, + std::size_t startingArgIdx) { + return std::make_unique<QuakeSynthesizer>(kernelName, a, startingArgIdx); } diff --git a/python/runtime/cudaq/algorithms/py_vqe.cpp b/python/runtime/cudaq/algorithms/py_vqe.cpp index c88328f54b7..61b676a4fc5 100644 --- a/python/runtime/cudaq/algorithms/py_vqe.cpp +++ b/python/runtime/cudaq/algorithms/py_vqe.cpp @@ -9,6 +9,7 @@ #include #include +#include "common/ArgumentWrapper.h" #include "common/JsonConvert.h" #include "common/SerializedCodeExecutionContext.h" #include "cudaq/Optimizer/Dialect/CC/CCTypes.h" @@ -26,6 +27,11 @@ namespace cudaq { void pyAltLaunchKernel(const std::string &, MlirModule, OpaqueArguments &, const std::vector<std::string> &); +void *pyGetKernelArgs(const std::string &name, MlirModule module, + cudaq::OpaqueArguments &runtimeArgs, + const std::vector<std::string> &names, + std::size_t startingArgIdx); + /// @brief Return the quantum kernel `FuncOp` from the given `ModuleOp`. mlir::func::FuncOp getKernelFuncOp(mlir::ModuleOp &mod, const std::string &kernelName) { @@ -57,14 +63,21 @@ bool isArgumentStdVec(MlirModule &module, const std::string &kernelName, return isa<cudaq::cc::StdvecType>(kernel.getArgument(argIdx).getType()); } -/// @brief Run `cudaq::observe` on the provided kernel and spin operator. -observe_result pyObserve(py::object &kernel, spin_op &spin_operator, - py::args args, const int shots, - bool argMapperProvided = false) { +/// @brief Return the kernel name and MLIR module for a kernel. +static inline std::pair<std::string, MlirModule> +getKernelNameAndModule(py::object &kernel) { if (py::hasattr(kernel, "compile")) kernel.attr("compile")(); auto kernelName = kernel.attr("name").cast<std::string>(); auto kernelMod = kernel.attr("module").cast<MlirModule>(); + return std::make_pair(kernelName, kernelMod); +} + +/// @brief Run `cudaq::observe` on the provided kernel and spin operator. +observe_result pyObserve(py::object &kernel, spin_op &spin_operator, + py::args args, const int shots, + bool argMapperProvided = false) { + auto [kernelName, kernelMod] = getKernelNameAndModule(kernel); auto &platform = cudaq::get_platform(); args = simplifiedValidateInputArguments(args); auto *argData = toOpaqueArgs(args, kernelMod, kernelName); @@ -98,6 +111,69 @@ observe_result pyObserve(py::object &kernel, spin_op &spin_operator, .value(); } +/// @brief Return whether or not \p kernel is compatible with the remote VQE +/// implementation that requires the variational parameters to be the first +/// argument in the kernel. 
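+/// For example (illustrative), a kernel declared as
+/// `def kernel(angles: list[float], num_qubits: int)` is compatible, whereas
+/// `def kernel(num_qubits: int, angles: list[float])` is not.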
+static bool firstArgIsCompatibleWithRemoteVQE(py::object &kernel) { + auto [kernelName, kernelMod] = getKernelNameAndModule(kernel); + auto kernelFunc = getKernelFuncOp(kernelMod, kernelName); + if (kernelFunc.getNumArguments() < 1) + return false; + auto firstKernelArgTy = kernelFunc.getArgument(0).getType(); + if (auto stdVec = dyn_cast<cudaq::cc::StdvecType>(firstKernelArgTy)) { + auto eleTy = stdVec.getElementType(); + return isa<mlir::FloatType>(eleTy); + } else { + return false; + } +} + +/// @brief Perform VQE on a remote platform using the C++ capabilities. This +/// function is used for many of the pyVQE variants below, so some of the +/// parameters may be nullptr. +static optimization_result +pyVQE_remote_cpp(cudaq::quantum_platform &platform, py::object &kernel, + spin_op &hamiltonian, cudaq::optimizer &optimizer, + cudaq::gradient *gradient, py::function *argumentMapper, + const int n_params, const int shots) { + auto [kernelName, kernelMod] = getKernelNameAndModule(kernel); + auto ctx = std::make_unique<cudaq::ExecutionContext>("observe", /*shots=*/0); + ctx->kernelName = kernelName; + ctx->spin = &hamiltonian; + platform.set_exec_ctx(ctx.get()); + + constexpr std::size_t startingArgIdx = 1; + cudaq::OpaqueArguments args; + void *kernelArgs = nullptr; + if (argumentMapper) { + std::vector<double> myArg(n_params); + py::list py_list = py::cast(myArg); + py::tuple result = (*argumentMapper)(py_list); + py::args runtimeArgs = result; + + // Serialize arguments (all concrete parameters except for the first one) + // into kernelArgs buffer space. + auto kernelFunc = getKernelFuncOp(kernelMod, kernelName); + cudaq::packArgs( + args, runtimeArgs, kernelFunc, + [](OpaqueArguments &, py::object &) { return false; }, startingArgIdx); + } + kernelArgs = pyGetKernelArgs(kernelName, kernelMod, args, /*names=*/{}, + startingArgIdx); + + // Need to form cudaq::ArgWrapper and pass that into launchVQE. + std::vector<std::string> names; + auto *wrapper = new cudaq::ArgWrapper{unwrap(kernelMod), names, kernelArgs}; + + platform.launchVQE(kernelName, wrapper, gradient, hamiltonian, optimizer, + n_params, shots); + platform.reset_exec_ctx(); + delete wrapper; + if (kernelArgs) + std::free(kernelArgs); + return ctx->optResult.value_or(optimization_result{}); +} + /// @brief Perform VQE on a remote platform. This function is used for many of /// the pyVQE variants below, so some of the parameters may be nullptr. static optimization_result @@ -185,11 +261,28 @@ pyVQE_remote(cudaq::quantum_platform &platform, py::object &kernel, return result; } +/// @brief Throw an exception instructing the user how to achieve optimal +/// performance. +static void throwPerformanceError() { + throw std::runtime_error( + "ERROR: Achieving optimal VQE performance on this platform requires the " + "first parameter in the kernel to be the variational parameters (list " + "of floats). Please update your VQE kernel to have list[float] as its " + "first parameter.\n"); +} + /// @brief Run `cudaq.vqe()` without a gradient strategy. 
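+/// On platforms that support remote VQE (see `supports_remote_vqe()`), the
+/// entire optimization loop is offloaded to the server in a single request;
+/// otherwise the optimizer drives repeated `observe` calls from the client.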
optimization_result pyVQE(py::object &kernel, spin_op &hamiltonian, cudaq::optimizer &optimizer, const int n_params, const int shots = -1) { auto &platform = cudaq::get_platform(); + if (platform.supports_remote_vqe()) { + if (firstArgIsCompatibleWithRemoteVQE(kernel)) + return pyVQE_remote_cpp(platform, kernel, hamiltonian, optimizer, + /*gradient=*/nullptr, /*argumentMapper=*/nullptr, + n_params, shots); + throwPerformanceError(); + } if (platform.supports_remote_serialized_code()) return pyVQE_remote(platform, kernel, hamiltonian, optimizer, /*gradient=*/nullptr, /*argumentMapper=*/nullptr, @@ -209,6 +302,13 @@ optimization_result pyVQE(py::object &kernel, spin_op &hamiltonian, cudaq::optimizer &optimizer, const int n_params, py::function &argumentMapper, const int shots = -1) { auto &platform = cudaq::get_platform(); + if (platform.supports_remote_vqe()) { + if (firstArgIsCompatibleWithRemoteVQE(kernel)) + return pyVQE_remote_cpp(platform, kernel, hamiltonian, optimizer, + /*gradient=*/nullptr, &argumentMapper, n_params, + shots); + throwPerformanceError(); + } if (platform.supports_remote_serialized_code()) return pyVQE_remote(platform, kernel, hamiltonian, optimizer, /*gradient=*/nullptr, &argumentMapper, n_params, shots); @@ -235,6 +335,13 @@ optimization_result pyVQE(py::object &kernel, cudaq::gradient &gradient, // to allow for the calculation of the gradient vector with the // provided gradient strategy. auto &platform = cudaq::get_platform(); + if (platform.supports_remote_vqe()) { + if (firstArgIsCompatibleWithRemoteVQE(kernel)) + return pyVQE_remote_cpp(platform, kernel, hamiltonian, optimizer, + &gradient, + /*argumentMapper=*/nullptr, n_params, shots); + throwPerformanceError(); + } if (platform.supports_remote_serialized_code()) return pyVQE_remote(platform, kernel, hamiltonian, optimizer, &gradient, /*argumentMapper=*/nullptr, n_params, shots); @@ -267,6 +374,12 @@ optimization_result pyVQE(py::object &kernel, cudaq::gradient &gradient, // to allow for the calculation of the gradient vector with the // provided gradient strategy. auto &platform = cudaq::get_platform(); + if (platform.supports_remote_vqe()) { + if (firstArgIsCompatibleWithRemoteVQE(kernel)) + return pyVQE_remote_cpp(platform, kernel, hamiltonian, optimizer, + &gradient, &argumentMapper, n_params, shots); + throwPerformanceError(); + } if (platform.supports_remote_serialized_code()) return pyVQE_remote(platform, kernel, hamiltonian, optimizer, &gradient, &argumentMapper, n_params, shots); diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index 6639e9b240c..392b1fa8fab 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -66,12 +66,19 @@ static std::unique_ptr cudaqStateStorage = std::tuple jitAndCreateArgs(const std::string &name, MlirModule module, cudaq::OpaqueArguments &runtimeArgs, - const std::vector<std::string> &names, Type returnType) { + const std::vector<std::string> &names, Type returnType, + std::size_t startingArgIdx = 0) { ScopedTraceWithContext(cudaq::TIMING_JIT, "jitAndCreateArgs", name); auto mod = unwrap(module); auto cloned = mod.clone(); auto context = cloned.getContext(); + // Do not cache the JIT if we are running with startingArgIdx > 0 because a) + // we won't be executing right after JIT-ing, and b) we might get called + // later with startingArgIdx == 0, and we need that JIT to be performed and + // cached. 
+ const bool allowCache = startingArgIdx == 0; + // Have we JIT compiled this before? auto hash = llvm::hash_code{0}; mod.walk([&hash](Operation *op) { @@ -80,7 +87,7 @@ jitAndCreateArgs(const std::string &name, MlirModule module, auto hashKey = static_cast(hash); ExecutionEngine *jit = nullptr; - if (jitCache->hasJITEngine(hashKey)) { + if (allowCache && jitCache->hasJITEngine(hashKey)) { jit = jitCache->getJITEngine(hashKey); } else { ScopedTraceWithContext(cudaq::TIMING_JIT, @@ -90,7 +97,8 @@ jitAndCreateArgs(const std::string &name, MlirModule module, pm.addNestedPass( cudaq::opt::createPySynthCallableBlockArgs(names)); pm.addPass(cudaq::opt::createGenerateDeviceCodeLoader(/*genAsQuake=*/true)); - pm.addPass(cudaq::opt::createGenerateKernelExecution()); + pm.addPass(cudaq::opt::createGenerateKernelExecution( + {.startingArgIdx = startingArgIdx})); pm.addPass(cudaq::opt::createLambdaLiftingPass()); cudaq::opt::addPipelineConvertToQIR(pm); @@ -137,7 +145,8 @@ jitAndCreateArgs(const std::string &name, MlirModule module, auto uniqueJit = std::move(jitOrError.get()); jit = uniqueJit.release(); - jitCache->cache(hashKey, jit); + if (allowCache) + jitCache->cache(hashKey, jit); } // We need to append the return type to the OpaqueArguments here @@ -227,9 +236,14 @@ jitAndCreateArgs(const std::string &name, MlirModule module, std::tuple pyAltLaunchKernelBase(const std::string &name, MlirModule module, Type returnType, cudaq::OpaqueArguments &runtimeArgs, - const std::vector &names) { - auto [jit, rawArgs, size, returnOffset] = - jitAndCreateArgs(name, module, runtimeArgs, names, returnType); + const std::vector &names, + std::size_t startingArgIdx = 0) { + // Do not allow kernel execution if we are running with startingArgIdx > 0. + // This is used in remote VQE execution. + const bool launch = startingArgIdx == 0; + + auto [jit, rawArgs, size, returnOffset] = jitAndCreateArgs( + name, module, runtimeArgs, names, returnType, startingArgIdx); auto mod = unwrap(module); auto thunkName = name + ".thunk"; @@ -308,16 +322,18 @@ pyAltLaunchKernelBase(const std::string &name, MlirModule module, auto kernelReg = reinterpret_cast(*regFuncPtr); kernelReg(); - auto &platform = cudaq::get_platform(); - if (platform.is_remote() || platform.is_emulated()) { - auto *wrapper = new cudaq::ArgWrapper{mod, names, rawArgs}; - cudaq::altLaunchKernel(name.c_str(), thunk, - reinterpret_cast(wrapper), size, - (uint64_t)returnOffset); - delete wrapper; - } else - cudaq::altLaunchKernel(name.c_str(), thunk, rawArgs, size, - (uint64_t)returnOffset); + if (launch) { + auto &platform = cudaq::get_platform(); + if (platform.is_remote() || platform.is_emulated()) { + auto *wrapper = new cudaq::ArgWrapper{mod, names, rawArgs}; + cudaq::altLaunchKernel(name.c_str(), thunk, + reinterpret_cast(wrapper), size, + (uint64_t)returnOffset); + delete wrapper; + } else + cudaq::altLaunchKernel(name.c_str(), thunk, rawArgs, size, + (uint64_t)returnOffset); + } return std::make_tuple(rawArgs, size, returnOffset); } @@ -388,6 +404,21 @@ void pyAltLaunchKernel(const std::string &name, MlirModule module, std::free(rawArgs); } +/// @brief Serialize \p runtimeArgs into a flat buffer starting at +/// \p startingArgIdx (0-based). This does not execute the kernel. This is +/// useful for VQE applications when you want to serialize the constant +/// parameters that are not being optimized. The caller is responsible for +/// executing `std::free()` on the return value. 
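+/// Example (illustrative): for a kernel with signature (list[float], int) and
+/// a `startingArgIdx` of 1, only the trailing `int` is packed into the
+/// returned buffer; the variational list[float] is bound later (e.g.,
+/// server-side on each VQE iteration).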
+void *pyGetKernelArgs(const std::string &name, MlirModule module, + cudaq::OpaqueArguments &runtimeArgs, + const std::vector &names, + std::size_t startingArgIdx) { + auto noneType = mlir::NoneType::get(unwrap(module).getContext()); + auto [rawArgs, size, returnOffset] = pyAltLaunchKernelBase( + name, module, noneType, runtimeArgs, names, startingArgIdx); + return rawArgs; +} + inline unsigned int byteSize(mlir::Type ty) { if (isa(ty)) { auto eleTy = cast(ty).getElementType(); diff --git a/python/runtime/utils/PyRemoteSimulatorQPU.cpp b/python/runtime/utils/PyRemoteSimulatorQPU.cpp index f59c296a1d0..b39f56f54ff 100644 --- a/python/runtime/utils/PyRemoteSimulatorQPU.cpp +++ b/python/runtime/utils/PyRemoteSimulatorQPU.cpp @@ -22,6 +22,35 @@ class PyRemoteSimulatorQPU : public cudaq::BaseRemoteSimulatorQPU { virtual bool isEmulated() override { return true; } + void launchVQE(const std::string &name, const void *kernelArgs, + cudaq::gradient *gradient, cudaq::spin_op H, + cudaq::optimizer &optimizer, const int n_params, + const std::size_t shots) override { + cudaq::ExecutionContext *executionContextPtr = + getExecutionContextForMyThread(); + + auto *wrapper = reinterpret_cast(kernelArgs); + auto m_module = wrapper->mod; + auto *mlirContext = m_module->getContext(); + + if (executionContextPtr && executionContextPtr->name == "tracer") + return; + + auto ctx = std::make_unique("observe", shots); + ctx->kernelName = name; + ctx->spin = &H; + if (shots > 0) + ctx->shots = shots; + + std::string errorMsg; + const bool requestOkay = m_client->sendRequest( + *mlirContext, *executionContextPtr, /*serializedCodeContext=*/nullptr, + gradient, &optimizer, n_params, m_simName, name, /*kernelFunc=*/nullptr, + wrapper->rawArgs, /*argSize=*/0, &errorMsg); + if (!requestOkay) + throw std::runtime_error("Failed to launch VQE. Error: " + errorMsg); + } + void launchKernel(const std::string &name, void (*kernelFunc)(void *), void *args, std::uint64_t voidStarSize, std::uint64_t resultOffset) override { @@ -35,13 +64,8 @@ class PyRemoteSimulatorQPU : public cudaq::BaseRemoteSimulatorQPU { auto *mlirContext = m_module->getContext(); cudaq::ExecutionContext *executionContextPtr = - [&]() -> cudaq::ExecutionContext * { - std::scoped_lock lock(m_contextMutex); - const auto iter = m_contexts.find(std::this_thread::get_id()); - if (iter == m_contexts.end()) - return nullptr; - return iter->second; - }(); + getExecutionContextForMyThread(); + // Default context for a 'fire-and-ignore' kernel launch; i.e., no context // was set before launching the kernel. Use a static variable per thread to // set up a single-shot execution context for this case. @@ -52,6 +76,7 @@ class PyRemoteSimulatorQPU : public cudaq::BaseRemoteSimulatorQPU { std::string errorMsg; const bool requestOkay = m_client->sendRequest( *mlirContext, executionContext, /*serializedCodeContext=*/nullptr, + /*vqe_gradient=*/nullptr, /*vqe_optimizer=*/nullptr, /*vqe_n_params=*/0, m_simName, name, kernelFunc, wrapper->rawArgs, voidStarSize, &errorMsg); if (!requestOkay) throw std::runtime_error("Failed to launch kernel. 
Error: " + errorMsg); @@ -84,13 +109,8 @@ class PyNvcfSimulatorQPU : public cudaq::BaseNvcfSimulatorQPU { auto *mlirContext = m_module->getContext(); cudaq::ExecutionContext *executionContextPtr = - [&]() -> cudaq::ExecutionContext * { - std::scoped_lock lock(m_contextMutex); - const auto iter = m_contexts.find(std::this_thread::get_id()); - if (iter == m_contexts.end()) - return nullptr; - return iter->second; - }(); + getExecutionContextForMyThread(); + // Default context for a 'fire-and-ignore' kernel launch; i.e., no context // was set before launching the kernel. Use a static variable per thread to // set up a single-shot execution context for this case. @@ -101,6 +121,7 @@ class PyNvcfSimulatorQPU : public cudaq::BaseNvcfSimulatorQPU { std::string errorMsg; const bool requestOkay = m_client->sendRequest( *mlirContext, executionContext, /*serializedCodeContext=*/nullptr, + /*vqe_gradient=*/nullptr, /*vqe_optimizer=*/nullptr, /*vqe_n_params=*/0, m_simName, name, kernelFunc, wrapper->rawArgs, voidStarSize, &errorMsg); if (!requestOkay) throw std::runtime_error("Failed to launch kernel. Error: " + errorMsg); diff --git a/python/tests/remote/test_remote_code_exec.py b/python/tests/remote/test_remote_code_exec.py index 33af45ddb95..960f22a8dfc 100644 --- a/python/tests/remote/test_remote_code_exec.py +++ b/python/tests/remote/test_remote_code_exec.py @@ -297,6 +297,31 @@ def kernel(angles: list[float], num_qubits: int): assert assert_close(parameter[0], 0.5840908448487905, 1e-3) +@skipIfPythonLessThan39 +def test_vqe_perf_warning(): + hamiltonian = 5.907 - 2.1433 * spin.x(0) * spin.x(1) - 2.1433 * spin.y( + 0) * spin.y(1) + .21829 * spin.z(0) - 6.125 * spin.z(1) + + @cudaq.kernel + def kernel(num_qubits: int, angles: list[float]): + qvector = cudaq.qvector(num_qubits) + x(qvector[0]) + ry(angles[0], qvector[1]) + x.ctrl(qvector[1], qvector[0]) + + optimizer = cudaq.optimizers.Adam() + grad = cudaq.gradients.CentralDifference() + + num_qubits = 2 + with pytest.raises(RuntimeError) as error: + energy, parameter = cudaq.vqe(kernel=kernel, + gradient_strategy=grad, + spin_operator=hamiltonian, + optimizer=optimizer, + argument_mapper=lambda x: (num_qubits, x), + parameter_count=1) + + # This is a helper function used by parameterized tests below. 
@pytest.mark.skip def test_complex_vqe_named_lambda(optimizer, gradient): diff --git a/python/utils/OpaqueArguments.h b/python/utils/OpaqueArguments.h index f7b1f4bc988..661f5efb5d1 100644 --- a/python/utils/OpaqueArguments.h +++ b/python/utils/OpaqueArguments.h @@ -185,18 +185,18 @@ inline std::string mlirTypeToString(mlir::Type ty) { return msg; } -inline void -packArgs(OpaqueArguments &argData, py::args args, - mlir::func::FuncOp kernelFuncOp, - const std::function - &backupHandler) { +inline void packArgs(OpaqueArguments &argData, py::args args, + mlir::func::FuncOp kernelFuncOp, + const std::function &backupHandler, + std::size_t startingArgIdx = 0) { if (kernelFuncOp.getNumArguments() != args.size()) throw std::runtime_error("Invalid runtime arguments - kernel expected " + std::to_string(kernelFuncOp.getNumArguments()) + " but was provided " + std::to_string(args.size()) + " arguments."); - for (std::size_t i = 0; i < args.size(); i++) { + for (std::size_t i = startingArgIdx; i < args.size(); i++) { py::object arg = args[i]; auto kernelArgTy = kernelFuncOp.getArgument(i).getType(); llvm::TypeSwitch(kernelArgTy) diff --git a/runtime/common/BaseRemoteSimulatorQPU.h b/runtime/common/BaseRemoteSimulatorQPU.h index 79cb796103e..fd73b89c7c8 100644 --- a/runtime/common/BaseRemoteSimulatorQPU.h +++ b/runtime/common/BaseRemoteSimulatorQPU.h @@ -14,15 +14,16 @@ #include "common/RuntimeMLIR.h" #include "common/SerializedCodeExecutionContext.h" #include "cudaq.h" +#include "cudaq/algorithms/gradient.h" +#include "cudaq/algorithms/optimizer.h" #include "cudaq/platform/qpu.h" #include "cudaq/platform/quantum_platform.h" #include namespace cudaq { -// TODO - Remove this once the public NVQC deployment supports this capability. -static inline bool serializedCodeExecOverride() { - if (auto envVal = std::getenv("CUDAQ_SER_CODE_EXEC")) { +static inline bool getEnvVarBool(const char *envVarName) { + if (auto envVal = std::getenv(envVarName)) { std::string tmp(envVal); std::transform(tmp.begin(), tmp.end(), tmp.begin(), [](unsigned char c) { return std::tolower(c); }); @@ -32,6 +33,16 @@ static inline bool serializedCodeExecOverride() { return false; } +// TODO - Remove this once the public NVQC deployment supports this capability. +static inline bool remoteVQEExecOverride() { + return getEnvVarBool("CUDAQ_REMOTE_VQE"); +} + +// TODO - Remove this once the public NVQC deployment supports this capability. +static inline bool serializedCodeExecOverride() { + return getEnvVarBool("CUDAQ_SER_CODE_EXEC"); +} + // Remote QPU: delegating the execution to a remotely-hosted server, which can // reinstate the execution context and JIT-invoke the kernel. class BaseRemoteSimulatorQPU : public cudaq::QPU { @@ -42,6 +53,16 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { std::unique_ptr m_mlirContext; std::unique_ptr m_client; + /// @brief Return a pointer to the execution context for this thread. It will + /// return `nullptr` if it was not found in `m_contexts`. + cudaq::ExecutionContext *getExecutionContextForMyThread() { + std::scoped_lock lock(m_contextMutex); + const auto iter = m_contexts.find(std::this_thread::get_id()); + if (iter == m_contexts.end()) + return nullptr; + return iter->second; + } + public: BaseRemoteSimulatorQPU() : QPU(), @@ -57,6 +78,10 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { // Conditional feedback is handled by the server side. 
virtual bool supportsConditionalFeedback() override { return true; } + // VQE is executed fully on the server without the need to go back and forth + // in between observe calls + virtual bool supportsRemoteVQE() override { return true; } + // Remote serializable code is executed fully on the server without the need // to go back and forth in between observe calls (see // launchSerializedCodeExecution). @@ -80,6 +105,31 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { execution_queue->enqueue(task); } + void launchVQE(const std::string &name, const void *kernelArgs, + cudaq::gradient *gradient, cudaq::spin_op H, + cudaq::optimizer &optimizer, const int n_params, + const std::size_t shots) override { + cudaq::ExecutionContext *executionContextPtr = + getExecutionContextForMyThread(); + + if (executionContextPtr && executionContextPtr->name == "tracer") + return; + + auto ctx = std::make_unique("observe", shots); + ctx->kernelName = name; + ctx->spin = &H; + if (shots > 0) + ctx->shots = shots; + + std::string errorMsg; + const bool requestOkay = m_client->sendRequest( + *m_mlirContext, *executionContextPtr, /*serializedCodeContext=*/nullptr, + gradient, &optimizer, n_params, m_simName, name, /*kernelFunc=*/nullptr, + kernelArgs, /*argSize=*/0, &errorMsg); + if (!requestOkay) + throw std::runtime_error("Failed to launch VQE. Error: " + errorMsg); + } + void launchKernel(const std::string &name, void (*kernelFunc)(void *), void *args, std::uint64_t voidStarSize, std::uint64_t resultOffset) override { @@ -89,13 +139,7 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { name, qpu_id, m_simName); cudaq::ExecutionContext *executionContextPtr = - [&]() -> cudaq::ExecutionContext * { - std::scoped_lock lock(m_contextMutex); - const auto iter = m_contexts.find(std::this_thread::get_id()); - if (iter == m_contexts.end()) - return nullptr; - return iter->second; - }(); + getExecutionContextForMyThread(); if (executionContextPtr && executionContextPtr->name == "tracer") { return; @@ -109,10 +153,10 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { cudaq::ExecutionContext &executionContext = executionContextPtr ? *executionContextPtr : defaultContext; std::string errorMsg; - const bool requestOkay = - m_client->sendRequest(*m_mlirContext, executionContext, - /*serializedCodeContext=*/nullptr, m_simName, - name, kernelFunc, args, voidStarSize, &errorMsg); + const bool requestOkay = m_client->sendRequest( + *m_mlirContext, executionContext, /*serializedCodeContext=*/nullptr, + /*vqe_gradient=*/nullptr, /*vqe_optimizer=*/nullptr, /*vqe_n_params=*/0, + m_simName, name, kernelFunc, args, voidStarSize, &errorMsg); if (!requestOkay) throw std::runtime_error("Failed to launch kernel. 
Error: " + errorMsg); } @@ -127,13 +171,7 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { name, qpu_id, m_simName); cudaq::ExecutionContext *executionContextPtr = - [&]() -> cudaq::ExecutionContext * { - std::scoped_lock lock(m_contextMutex); - const auto iter = m_contexts.find(std::this_thread::get_id()); - if (iter == m_contexts.end()) - return nullptr; - return iter->second; - }(); + getExecutionContextForMyThread(); if (executionContextPtr && executionContextPtr->name == "tracer") { return; @@ -150,6 +188,7 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { std::string errorMsg; const bool requestOkay = m_client->sendRequest( *m_mlirContext, executionContext, &serializeCodeExecutionObject, + /*vqe_gradient=*/nullptr, /*vqe_optimizer=*/nullptr, /*vqe_n_params=*/0, m_simName, name, /*kernelFunc=*/nullptr, /*args=*/nullptr, /*voidStarSize=*/0, &errorMsg); if (!requestOkay) @@ -240,6 +279,11 @@ class BaseNvcfSimulatorQPU : public cudaq::BaseRemoteSimulatorQPU { m_client->setConfig(clientConfigs); } + // VQE is executed fully on the server without the need to go back and forth + // in between observe calls (see launchVQE). + // TODO - set this to true when NVQC supports this. + virtual bool supportsRemoteVQE() override { return remoteVQEExecOverride(); } + // Remote serializable code is executed fully on the server without the need // to go back and forth in between observe calls (see // launchSerializedCodeExecution). diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h index fb95c38f7a2..ada308c7799 100644 --- a/runtime/common/BaseRestRemoteClient.h +++ b/runtime/common/BaseRestRemoteClient.h @@ -111,8 +111,10 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient { std::string constructKernelPayload(mlir::MLIRContext &mlirContext, const std::string &name, - void (*kernelFunc)(void *), void *args, - std::uint64_t voidStarSize) { + void (*kernelFunc)(void *), + const void *args, + std::uint64_t voidStarSize, + std::size_t startingArgIdx) { if (cudaq::__internal__::isLibraryMode(name)) { // Library mode: retrieve the embedded bitcode in the executable. 
const auto path = llvm::sys::fs::getMainExecutable(nullptr, nullptr); @@ -174,7 +176,8 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient { if (args) { cudaq::info("Run Quake Synth.\n"); mlir::PassManager pm(&mlirContext); - pm.addPass(cudaq::opt::createQuakeSynthesizer(name, args)); + pm.addPass( + cudaq::opt::createQuakeSynthesizer(name, args, startingArgIdx)); pm.addPass(mlir::createCanonicalizerPass()); if (failed(pm.run(moduleOp))) throw std::runtime_error("Could not successfully apply quake-synth."); @@ -210,12 +213,53 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient { return llvm::encodeBase64(mlirCode); } } + cudaq::RestRequest constructVQEJobRequest( + mlir::MLIRContext &mlirContext, cudaq::ExecutionContext &io_context, + const std::string &backendSimName, const std::string &kernelName, + const void *kernelArgs, cudaq::gradient *gradient, + cudaq::optimizer &optimizer, const int n_params) { + cudaq::RestRequest request(io_context, version()); + + request.opt = RestRequestOptFields(); + request.opt->optimizer_n_params = n_params; + request.opt->optimizer_type = get_optimizer_type(optimizer); + request.opt->optimizer_ptr = &optimizer; + request.opt->gradient_ptr = gradient; + if (gradient) + request.opt->gradient_type = get_gradient_type(*gradient); + + request.entryPoint = kernelName; + request.passes = serverPasses; + request.format = cudaq::CodeFormat::MLIR; + request.code = + constructKernelPayload(mlirContext, kernelName, /*kernelFunc=*/nullptr, + /*kernelArgs=*/kernelArgs, + /*argsSize=*/0, /*startingArgIdx=*/1); + request.simulator = backendSimName; + // Remote server seed + // Note: unlike local executions whereby a static instance of the simulator + // is seeded once when `cudaq::set_random_seed` is called, thus not being + // re-seeded between executions. For remote executions, we use the runtime + // level seed value to seed a random number generator to seed the server. + // i.e., consecutive remote executions on the server from the same client + // session (where `cudaq::set_random_seed` is called), get new random seeds + // for each execution. The sequence is still deterministic based on the + // runtime-level seed value. 
+ request.seed = [&]() { + std::uniform_int_distribution seedGen( + std::numeric_limits::min(), + std::numeric_limits::max()); + return seedGen(randEngine); + }(); + return request; + } cudaq::RestRequest constructJobRequest( mlir::MLIRContext &mlirContext, cudaq::ExecutionContext &io_context, cudaq::SerializedCodeExecutionContext *serializedCodeContext, const std::string &backendSimName, const std::string &kernelName, - void (*kernelFunc)(void *), void *kernelArgs, std::uint64_t argsSize) { + void (*kernelFunc)(void *), const void *kernelArgs, + std::uint64_t argsSize) { cudaq::RestRequest request(io_context, version()); if (serializedCodeContext) @@ -257,19 +301,22 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient { cudaq::IRPayLoad stateIrPayload1, stateIrPayload2; stateIrPayload1.entryPoint = kernelName1; - stateIrPayload1.ir = constructKernelPayload(mlirContext, kernelName1, - nullptr, args1, argsSize1); + stateIrPayload1.ir = + constructKernelPayload(mlirContext, kernelName1, nullptr, args1, + argsSize1, /*startingArgIdx=*/0); stateIrPayload2.entryPoint = kernelName2; - stateIrPayload2.ir = constructKernelPayload(mlirContext, kernelName2, - nullptr, args2, argsSize2); + stateIrPayload2.ir = + constructKernelPayload(mlirContext, kernelName2, nullptr, args2, + argsSize2, /*startingArgIdx=*/0); // First kernel of the overlap calculation request.code = stateIrPayload1.ir; request.entryPoint = stateIrPayload1.entryPoint; // Second kernel of the overlap calculation request.overlapKernel = stateIrPayload2; } else if (serializedCodeContext == nullptr) { - request.code = constructKernelPayload(mlirContext, kernelName, kernelFunc, - kernelArgs, argsSize); + request.code = + constructKernelPayload(mlirContext, kernelName, kernelFunc, + kernelArgs, argsSize, /*startingArgIdx=*/0); } request.simulator = backendSimName; // Remote server seed @@ -294,17 +341,26 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient { sendRequest(mlir::MLIRContext &mlirContext, cudaq::ExecutionContext &io_context, cudaq::SerializedCodeExecutionContext *serializedCodeContext, - const std::string &backendSimName, const std::string &kernelName, - void (*kernelFunc)(void *), void *kernelArgs, - std::uint64_t argsSize, std::string *optionalErrorMsg) override { + cudaq::gradient *vqe_gradient, cudaq::optimizer *vqe_optimizer, + const int vqe_n_params, const std::string &backendSimName, + const std::string &kernelName, void (*kernelFunc)(void *), + const void *kernelArgs, std::uint64_t argsSize, + std::string *optionalErrorMsg) override { if (isDisallowed(io_context.name)) throw std::runtime_error( io_context.name + " operation is not supported with cudaq target remote-mqpu!"); - cudaq::RestRequest request = constructJobRequest( - mlirContext, io_context, serializedCodeContext, backendSimName, - kernelName, kernelFunc, kernelArgs, argsSize); + cudaq::RestRequest request = [&]() { + if (vqe_n_params > 0) + return constructVQEJobRequest(mlirContext, io_context, backendSimName, + kernelName, kernelArgs, vqe_gradient, + *vqe_optimizer, vqe_n_params); + return constructJobRequest(mlirContext, io_context, serializedCodeContext, + backendSimName, kernelName, kernelFunc, + kernelArgs, argsSize); + }(); + if (request.code.empty() && (serializedCodeContext == nullptr || serializedCodeContext->source_code.empty())) { if (optionalErrorMsg) @@ -705,9 +761,11 @@ class BaseNvcfRuntimeClient : public cudaq::BaseRemoteRestRuntimeClient { sendRequest(mlir::MLIRContext &mlirContext, 
cudaq::ExecutionContext &io_context, cudaq::SerializedCodeExecutionContext *serializedCodeContext, - const std::string &backendSimName, const std::string &kernelName, - void (*kernelFunc)(void *), void *kernelArgs, - std::uint64_t argsSize, std::string *optionalErrorMsg) override { + cudaq::gradient *vqe_gradient, cudaq::optimizer *vqe_optimizer, + const int vqe_n_params, const std::string &backendSimName, + const std::string &kernelName, void (*kernelFunc)(void *), + const void *kernelArgs, std::uint64_t argsSize, + std::string *optionalErrorMsg) override { if (isDisallowed(io_context.name)) throw std::runtime_error( io_context.name + @@ -735,9 +793,15 @@ class BaseNvcfRuntimeClient : public cudaq::BaseRemoteRestRuntimeClient { } } // Construct the base `cudaq-qpud` request payload. - cudaq::RestRequest request = constructJobRequest( - mlirContext, io_context, serializedCodeContext, backendSimName, - kernelName, kernelFunc, kernelArgs, argsSize); + cudaq::RestRequest request = [&]() { + if (vqe_n_params > 0) + return constructVQEJobRequest(mlirContext, io_context, backendSimName, + kernelName, kernelArgs, vqe_gradient, + *vqe_optimizer, vqe_n_params); + return constructJobRequest(mlirContext, io_context, serializedCodeContext, + backendSimName, kernelName, kernelFunc, + kernelArgs, argsSize); + }(); if (request.code.empty() && (serializedCodeContext == nullptr || serializedCodeContext->source_code.empty())) { diff --git a/runtime/common/KernelWrapper.h b/runtime/common/KernelWrapper.h index 29b44d16e3c..e0af46e71ee 100644 --- a/runtime/common/KernelWrapper.h +++ b/runtime/common/KernelWrapper.h @@ -422,6 +422,8 @@ class WrapperFunctionHandlerHelper { public: using ArgTuple = std::tuple...>; using ArgIndices = std::make_index_sequence::value>; + using ArgIndicesPlus1 = + std::make_index_sequence<1 + std::tuple_size::value>; template static void invoke(CallableT &&func, const char *argData, @@ -436,6 +438,22 @@ class WrapperFunctionHandlerHelper { ArgIndices{}); } + // Specialization when the 1st std::vector argument has been excluded + // from the serialized args, but now you want to call it. + template + static void invoke(CallableT &&func, const std::vector &vec_parms, + const char *argData, std::size_t argSize) { + ArgTuple argsTuple; + // Deserialize buffer to args tuple + if (!deserialize(argData, argSize, argsTuple, ArgIndices{})) + throw std::runtime_error( + "Failed to deserialize arguments for wrapper function call"); + // Call the wrapped function with args tuple + auto newArgsTuple = std::tuple_cat(std::make_tuple(vec_parms), argsTuple); + WrapperFunctionHandlerCaller::call(std::forward(func), + newArgsTuple, ArgIndicesPlus1{}); + } + private: // Helper to deserialize a flat args buffer into typed args tuple. template @@ -476,6 +494,19 @@ void invokeCallableWithSerializedArgs(const char *argData, std::size_t argSize, InvokeArgTs...>::invoke(std::forward(func), argData, argSize); } +// Invoke a typed callable (functions) with a std::vec + serialized +// `args`. +template +void invokeCallableWithSerializedArgs_vec(const std::vector &vec_parms, + const char *argData, + std::size_t argSize, + CallableT &&func) { + WrapperFunctionHandlerHelper< + std::remove_reference_t, + InvokeArgTs...>::invoke(std::forward(func), vec_parms, argData, + argSize); +} + // Wrapper for quantum kernel invocation, i.e., `kernel(args...)`. // In library mode, if the remote platform is used, we redirect it to the // platform's `launchKernel` instead of invoking it. 
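The `_vec` specialization above exists so that a remote server can re-bind the variational vector on every optimizer iteration while the remaining, already-serialized arguments stay fixed. A minimal, self-contained sketch of the underlying technique (plain C++17; names are illustrative, not code from the patch):

#include <tuple>
#include <utility>
#include <vector>

// Mimics the approach in WrapperFunctionHandlerHelper::invoke: take the fixed
// (formerly serialized) arguments as a tuple, prepend the variational vector
// with std::tuple_cat, and apply the kernel to the combined tuple.
template <typename Callable, typename... Args>
void invoke_with_vec(const std::vector<double> &x,
                     std::tuple<Args...> fixedArgs, Callable &&kernel) {
  auto all = std::tuple_cat(std::make_tuple(x), std::move(fixedArgs));
  std::apply(std::forward<Callable>(kernel), std::move(all));
}

// Usage: invoke_with_vec({0.59}, std::make_tuple(2), ansatz) calls
// ansatz(std::vector<double>{0.59}, 2).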
diff --git a/runtime/common/RemoteKernelExecutor.h b/runtime/common/RemoteKernelExecutor.h index 8647a2a5ff8..969d76b92b3 100644 --- a/runtime/common/RemoteKernelExecutor.h +++ b/runtime/common/RemoteKernelExecutor.h @@ -24,6 +24,8 @@ class MLIRContext; } namespace cudaq { class ExecutionContext; +class gradient; +class optimizer; class SerializedCodeExecutionContext; /// Base interface encapsulating a CUDA-Q runtime server capable of @@ -49,7 +51,14 @@ class RemoteRuntimeServer std::string_view ir, std::string_view kernelName, void *kernelArgs, std::uint64_t argsSize, std::size_t seed) = 0; - + // Handle incoming VQE requests + virtual void handleVQERequest(std::size_t reqId, + cudaq::ExecutionContext &io_context, + const std::string &backendSimName, + std::string_view ir, cudaq::gradient *gradient, + cudaq::optimizer &optimizer, const int n_params, + std::string_view kernelName, + std::size_t seed) = 0; // Destructor virtual ~RemoteRuntimeServer() = default; }; @@ -79,9 +88,10 @@ class RemoteRuntimeClient virtual bool sendRequest(mlir::MLIRContext &mlirContext, ExecutionContext &io_context, SerializedCodeExecutionContext *serializedCodeContext, - const std::string &backendSimName, const std::string &kernelName, - void (*kernelFunc)(void *), void *kernelArgs, - std::uint64_t argsSize, + cudaq::gradient *vqe_gradient, cudaq::optimizer *vqe_optimizer, + const int vqe_n_params, const std::string &backendSimName, + const std::string &kernelName, void (*kernelFunc)(void *), + const void *kernelArgs, std::uint64_t argsSize, std::string *optionalErrorMsg = nullptr) = 0; // Destructor virtual ~RemoteRuntimeClient() = default; diff --git a/runtime/cudaq/algorithms/gradient.h b/runtime/cudaq/algorithms/gradient.h index e094d7bb286..fe848fde561 100644 --- a/runtime/cudaq/algorithms/gradient.h +++ b/runtime/cudaq/algorithms/gradient.h @@ -37,12 +37,22 @@ class gradient { /// The parameterized ansatz, a quantum kernel expression std::function)> ansatz_functor; + // As an alternative to an ArgsMapper, we can have serialized arguments + // (excluding the initial std::vector variational parameters). + std::vector serializedArgs; + // Given the parameters x and the spin_op h, compute the // expected value with respect to the ansatz. double getExpectedValue(std::vector &x, spin_op h) { return cudaq::observe(ansatz_functor, h, x); } + // Copy constructor. Derived classes should implement the clone() method. + gradient(const gradient &o) { + ansatz_functor = o.ansatz_functor; + serializedArgs = o.serializedArgs; + } + public: /// Constructor, takes the quantum kernel with prescribed signature gradient(std::function)> &&kernel) @@ -66,14 +76,48 @@ class gradient { }; } + /// Take the quantum kernel and concrete arguments for all arguments except + /// the first std::vector argument, which is used for the variational + /// parameters for the gradient. Serialize and save those arguments into this + /// object. (Useful for NVQC.) + template + void setArgs(QuantumKernel &kernel, Args &&...args) { + static_assert( + std::is_invocable_v, Args...>, + "Kernel must be invocable with std::vector and Args..."); + // Serialize all the parameters except for the first std::vector + // parameter. The serialized ones will be saved and used later during each + // ansatz_functor invocation. 
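+ // Illustrative usage (identifiers hypothetical):
+ //   cudaq::gradients::central_difference grad;
+ //   grad.setArgs(kernel, n_qubits, n_layers);
+ // After this, the gradient can be evaluated without an ArgMapper because
+ // the non-variational arguments are already bound.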
+ serializedArgs = serializeArgs(std::forward(args)...); + ansatz_functor = [&](std::vector x) { + cudaq::invokeCallableWithSerializedArgs_vec...>( + x, serializedArgs.data(), serializedArgs.size(), + std::forward(kernel)); + }; + } + + /// Set the kernel after the gradient has been constructed. Use of this + /// function requires that the kernel ONLY accept the variational parameters. + /// It cannot have any non-variational parameters. + template + void setKernel(QuantumKernel &kernel) { + static_assert(std::is_invocable_v>, + "Kernel must be invocable with std::vector"); + ansatz_functor = kernel; + } + /// Constructor, takes a callable that must have the /// prescribed call signature (void(std::vector)) - template + template >>> gradient(KernelT &kernel) { - if (kernel.getNumParams() != 1) - throw std::invalid_argument( - "Callable kernel from cudaq::make_kernel must " - "have 1 std::vector argument. Provide an ArgMapper if not."); + if constexpr (has_name::value) + if (kernel.getNumParams() != 1) + throw std::invalid_argument( + "Callable kernel from cudaq::make_kernel must " + "have 1 std::vector argument. Provide an ArgMapper if " + "not."); ansatz_functor = [&](std::vector x) { return cudaq::invokeKernel(std::forward(kernel), x); }; @@ -107,6 +151,9 @@ class gradient { const std::function)> &func, double funcAtX) = 0; + /// Clone the object. Must be implemented by derived classes. + virtual std::unique_ptr clone() = 0; + virtual ~gradient() = default; }; } // namespace cudaq diff --git a/runtime/cudaq/algorithms/gradients/central_difference.h b/runtime/cudaq/algorithms/gradients/central_difference.h index 08bb72ea926..79840f4a98a 100644 --- a/runtime/cudaq/algorithms/gradients/central_difference.h +++ b/runtime/cudaq/algorithms/gradients/central_difference.h @@ -17,6 +17,12 @@ class central_difference : public gradient { using gradient::gradient; double step = 1e-4; + virtual std::unique_ptr clone() override { + auto newGrad = std::make_unique(*this); + newGrad->step = this->step; + return newGrad; + } + void compute(const std::vector &x, std::vector &dx, const spin_op &h, double exp_h) override { auto tmpX = x; diff --git a/runtime/cudaq/algorithms/gradients/forward_difference.h b/runtime/cudaq/algorithms/gradients/forward_difference.h index 3663ae2cda1..777a1843d64 100644 --- a/runtime/cudaq/algorithms/gradients/forward_difference.h +++ b/runtime/cudaq/algorithms/gradients/forward_difference.h @@ -19,6 +19,12 @@ class forward_difference : public gradient { using gradient::gradient; double step = 1e-4; + virtual std::unique_ptr clone() override { + auto newGrad = std::make_unique(*this); + newGrad->step = this->step; + return newGrad; + } + /// @brief Compute the `forward_difference` gradient void compute(const std::vector &x, std::vector &dx, const spin_op &h, double funcAtX) override { diff --git a/runtime/cudaq/algorithms/gradients/parameter_shift.h b/runtime/cudaq/algorithms/gradients/parameter_shift.h index cb5963aed9d..18b131620f4 100644 --- a/runtime/cudaq/algorithms/gradients/parameter_shift.h +++ b/runtime/cudaq/algorithms/gradients/parameter_shift.h @@ -16,6 +16,12 @@ class parameter_shift : public gradient { using gradient::gradient; double shiftScalar = 0.5; + virtual std::unique_ptr clone() override { + auto newGrad = std::make_unique(*this); + newGrad->shiftScalar = this->shiftScalar; + return newGrad; + } + void compute(const std::vector &x, std::vector &dx, const spin_op &h, double exp_h) override { auto tmpX = x; diff --git a/runtime/cudaq/algorithms/vqe.h 
b/runtime/cudaq/algorithms/vqe.h index f97d4574817..47dfc3696ed 100644 --- a/runtime/cudaq/algorithms/vqe.h +++ b/runtime/cudaq/algorithms/vqe.h @@ -7,12 +7,48 @@ ******************************************************************************/ #pragma once +#include "cudaq/gradients.h" #include "gradient.h" #include "observe.h" #include "optimizer.h" +#include namespace cudaq { +namespace __internal__ { +/// \brief This is an internal helper function to reduce duplicated code in the +/// user-facing `vqe()` functions below. Users should not directly call this +/// function. +template , Args...>>> +static inline optimization_result +remote_vqe(cudaq::quantum_platform &platform, QuantumKernel &&kernel, + cudaq::spin_op &H, cudaq::optimizer &optimizer, + cudaq::gradient *gradient, const int n_params, + const std::size_t shots, Args &&...args) { + auto ctx = std::make_unique("observe", shots); + ctx->kernelName = cudaq::getKernelName(kernel); + ctx->spin = &H; + platform.set_exec_ctx(ctx.get()); + auto serializedArgsBuffer = serializeArgs(args...); + platform.launchVQE(ctx->kernelName, serializedArgsBuffer.data(), gradient, H, + optimizer, n_params, shots); + platform.reset_exec_ctx(); + return ctx->optResult.value_or(optimization_result{}); +} + +static inline void print_arg_mapper_warning() { + printf( + "WARNING: Usage of ArgMapper type on this platform will result in " + "suboptimal performance. Consider updating your code to update your " + "kernel to use this signature (std::function, " + "arg1, arg2, ...)>) and pass concrete arguments to cudaq::vqe() for " + "the non-variational arguments.\n"); +} + +} // namespace __internal__ + /// /// \brief Compute the minimal eigenvalue of \p H with VQE. /// @@ -24,6 +60,8 @@ namespace cudaq { /// gradients. /// \param n_params The number of variational parameters in the ansatz quantum /// kernel callable. +/// \param args Non-variational arguments to \p kernel that will be passed to +/// \p kernel on each invocation during VQE. /// \returns The optimal value and corresponding parameters as a /// cudaq::optimization_result (std::tuple>) /// @@ -52,22 +90,31 @@ namespace cudaq { /// auto [val, params] = cudaq::vqe(ansatz{}, H, optimizer, 1); /// \endcode /// -template +template , Args...>>> optimization_result vqe(QuantumKernel &&kernel, cudaq::spin_op H, - cudaq::optimizer &optimizer, const int n_params) { + cudaq::optimizer &optimizer, const int n_params, + Args &&...args) { static_assert( - std::is_invocable_v>, + std::is_invocable_v, Args...>, "Invalid parameterized quantum kernel expression. Must have " - "void(std::vector) signature, or provide " + "void(std::vector, ) signature, or provide " "std::tuple(std::vector) ArgMapper function object."); if (optimizer.requiresGradients()) { throw std::invalid_argument("Provided cudaq::optimizer requires gradients. " "Please provide a cudaq::gradient instance."); } + auto &platform = cudaq::get_platform(); + if (platform.supports_remote_vqe()) + return __internal__::remote_vqe(platform, kernel, H, optimizer, + /*gradient=*/nullptr, n_params, /*shots=*/0, + args...); + return optimizer.optimize(n_params, [&](const std::vector &x, std::vector &grad_vec) { - double e = cudaq::observe(kernel, H, x); + double e = cudaq::observe(kernel, H, x, args...); return e; }); } @@ -84,6 +131,8 @@ optimization_result vqe(QuantumKernel &&kernel, cudaq::spin_op H, /// gradients. /// \param n_params The number of variational parameters in the ansatz quantum /// kernel callable. 
@@ -84,6 +131,8 @@ optimization_result vqe(QuantumKernel &&kernel, cudaq::spin_op H,
 /// gradients.
 /// \param n_params The number of variational parameters in the ansatz quantum
 /// kernel callable.
+/// \param args Non-variational arguments to \p kernel that will be passed to
+/// \p kernel on each invocation during VQE.
 /// \returns The optimal value and corresponding parameters as a
 /// cudaq::optimization_result (std::tuple<double, std::vector<double>>)
 ///
@@ -112,23 +161,32 @@ optimization_result vqe(QuantumKernel &&kernel, cudaq::spin_op H,
 ///   auto [val, params] = cudaq::vqe(/*shots*/ 100, ansatz{}, H, optimizer, 1);
 /// \endcode
 ///
-template <typename QuantumKernel>
+template <typename QuantumKernel, typename... Args,
+          typename = std::enable_if_t<std::is_invocable_v<
+              QuantumKernel, std::vector<double>, Args...>>>
 optimization_result vqe(std::size_t shots, QuantumKernel &&kernel,
                         cudaq::spin_op H, cudaq::optimizer &optimizer,
-                        const int n_params) {
+                        const int n_params, Args &&...args) {
   static_assert(
-      std::is_invocable_v<QuantumKernel, std::vector<double>>,
+      std::is_invocable_v<QuantumKernel, std::vector<double>, Args...>,
       "Invalid parameterized quantum kernel expression. Must have "
-      "void(std::vector<double>) signature, or provide "
+      "void(std::vector<double>, <Args...>) signature, or provide "
      "std::tuple(std::vector<double>) ArgMapper function object.");
   if (optimizer.requiresGradients()) {
    throw std::invalid_argument("Provided cudaq::optimizer requires gradients. "
                                "Please provide a cudaq::gradient instance.");
  }
+  auto &platform = cudaq::get_platform();
+  if (platform.supports_remote_vqe())
+    return __internal__::remote_vqe(platform, kernel, H, optimizer,
+                                    /*gradient=*/nullptr, n_params, shots,
+                                    args...);
+
   return optimizer.optimize(n_params, [&](const std::vector<double> &x,
                                           std::vector<double> &grad_vec) {
-    double e = cudaq::observe(shots, kernel, H, x);
+    observe_options options{static_cast<int>(shots), cudaq::noise_model{}};
+    double e = cudaq::observe(options, kernel, H, x, args...);
     return e;
   });
 }
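The shots-based overload now funnels the shot count through `observe_options`
so the trailing arguments can ride along. Reusing the names from the sketch
above (illustrative only):

  // Same ansatz and Hamiltonian as the previous sketch, now with a finite
  // shot budget; n_qubits is still forwarded verbatim to the kernel.
  auto [opt_val, opt_params] = cudaq::vqe(/*shots=*/1000, ansatz{}, h,
                                          optimizer, /*n_params=*/1,
                                          /*n_qubits=*/2);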
@@ -147,6 +205,8 @@ optimization_result vqe(std::size_t shots, QuantumKernel &&kernel,
 /// the minimal eigenvalue of \p H.
 /// \param n_params The number of variational parameters in the ansatz quantum
 /// kernel callable.
+/// \param args Non-variational arguments to \p kernel that will be passed to
+/// \p kernel on each invocation during VQE.
 /// \returns The optimal value and corresponding parameters as a
 /// cudaq::optimization_result (std::tuple<double, std::vector<double>>)
 ///
@@ -176,22 +236,45 @@ optimization_result vqe(std::size_t shots, QuantumKernel &&kernel,
 ///       cudaq::vqe(ansatz, gradient, H, optimizer, 1);
 /// \endcode
 ///
-template <typename QuantumKernel>
+template <typename QuantumKernel, typename... Args,
+          typename = std::enable_if_t<std::is_invocable_v<
+              QuantumKernel, std::vector<double>, Args...>>>
 optimization_result vqe(QuantumKernel &&kernel, cudaq::gradient &gradient,
                         cudaq::spin_op H, cudaq::optimizer &optimizer,
-                        const int n_params) {
+                        const int n_params, Args &&...args) {
   static_assert(
-      std::is_invocable_v<QuantumKernel, std::vector<double>>,
+      std::is_invocable_v<QuantumKernel, std::vector<double>> ||
+          std::is_invocable_v<QuantumKernel, std::vector<double>, Args...>,
       "Invalid parameterized quantum kernel expression. Must have "
-      "void(std::vector<double>) signature, or provide "
+      "void(std::vector<double>, <Args...>) signature, or provide "
      "std::tuple(std::vector<double>) ArgMapper function object.");
+
+  auto &platform = cudaq::get_platform();
+  if (platform.supports_remote_vqe())
+    return __internal__::remote_vqe(platform, kernel, H, optimizer, &gradient,
+                                    n_params,
+                                    /*shots=*/0, args...);
+
   auto requires_grad = optimizer.requiresGradients();
-  return optimizer.optimize(n_params, [&](const std::vector<double> &x,
-                                          std::vector<double> &grad_vec) {
-    double e = cudaq::observe(kernel, H, x);
+  // If there are additional arguments, we need to clone the gradient and
+  // provide it the concrete arguments.
+  // Note: the strange initialization of newGrad is to avoid a C++17 compiler
+  // error that happens because the `swap` is ambiguous between the unique_ptr
+  // and the qubit swap.
+  std::unique_ptr<cudaq::gradient> newGrad = [&]() {
     if (requires_grad) {
-      gradient.compute(x, grad_vec, H, e);
+      auto newGrad_ = gradient.clone();
+      if constexpr (sizeof...(args) > 0)
+        newGrad_->setArgs(kernel, args...);
+      return newGrad_;
     }
+    return std::unique_ptr<cudaq::gradient>();
+  }();
+  return optimizer.optimize(n_params, [&](const std::vector<double> &x,
+                                          std::vector<double> &grad_vec) {
+    double e = cudaq::observe(kernel, H, x, args...);
+    if (requires_grad)
+      newGrad->compute(x, grad_vec, H, e);
     return e;
   });
 }
@@ -248,7 +331,11 @@ optimization_result vqe(QuantumKernel &&kernel, cudaq::gradient &gradient,
 ///       });
 /// \endcode
 ///
-template <typename QuantumKernel, typename ArgMapper>
+template <typename QuantumKernel, typename ArgMapper,
+          typename = std::enable_if_t<
+              std::is_invocable_v<ArgMapper, std::vector<double>> ||
+              std::is_invocable_v<ArgMapper, std::vector<double> &> ||
+              std::is_invocable_v<ArgMapper, const std::vector<double> &>>>
 optimization_result vqe(QuantumKernel &&kernel, cudaq::spin_op H,
                         cudaq::optimizer &optimizer, const int n_params,
                         ArgMapper &&argsMapper) {
@@ -258,6 +345,9 @@ optimization_result vqe(QuantumKernel &&kernel, cudaq::spin_op H,
     "Please provide a cudaq::gradient instance. Make sure the gradient is "
     "aware of the ArgMapper.");
   }
+  if (cudaq::get_platform().supports_remote_vqe())
+    __internal__::print_arg_mapper_warning();
+
   return optimizer.optimize(n_params, [&](const std::vector<double> &x,
                                           std::vector<double> &grad_vec) {
     auto args = argsMapper(x);
@@ -323,7 +413,11 @@ optimization_result vqe(QuantumKernel &&kernel, cudaq::spin_op H,
 ///       });
 /// \endcode
 ///
-template <typename QuantumKernel, typename ArgMapper>
+template <typename QuantumKernel, typename ArgMapper,
+          typename = std::enable_if_t<
+              std::is_invocable_v<ArgMapper, std::vector<double>> ||
+              std::is_invocable_v<ArgMapper, std::vector<double> &> ||
+              std::is_invocable_v<ArgMapper, const std::vector<double> &>>>
 optimization_result vqe(std::size_t shots, QuantumKernel &&kernel,
                         cudaq::spin_op H, cudaq::optimizer &optimizer,
                         const int n_params, ArgMapper &&argsMapper) {
@@ -333,6 +427,9 @@ optimization_result vqe(std::size_t shots, QuantumKernel &&kernel,
     "Please provide a cudaq::gradient instance. Make sure the gradient is "
     "aware of the ArgMapper.");
   }
+  if (cudaq::get_platform().supports_remote_vqe())
+    __internal__::print_arg_mapper_warning();
+
   return optimizer.optimize(n_params, [&](const std::vector<double> &x,
                                           std::vector<double> &grad_vec) {
     auto args = argsMapper(x);
@@ -373,11 +470,18 @@ optimization_result vqe(std::size_t shots, QuantumKernel &&kernel,
 /// \p H. This function will use the custom ArgMapper to map input variational
 /// parameters to a tuple for use in evaluating the kernel function.
 ///
-template <typename QuantumKernel, typename ArgMapper>
+template <typename QuantumKernel, typename ArgMapper,
+          typename = std::enable_if_t<
+              std::is_invocable_v<ArgMapper, std::vector<double>> ||
+              std::is_invocable_v<ArgMapper, std::vector<double> &> ||
+              std::is_invocable_v<ArgMapper, const std::vector<double> &>>>
 optimization_result vqe(QuantumKernel &&kernel, cudaq::gradient &gradient,
                         cudaq::spin_op H, cudaq::optimizer &optimizer,
                         const int n_params, ArgMapper &&argsMapper) {
   bool requiresGrad = optimizer.requiresGradients();
+  if (cudaq::get_platform().supports_remote_vqe())
+    __internal__::print_arg_mapper_warning();
+
   return optimizer.optimize(n_params, [&](const std::vector<double> &x,
                                           std::vector<double> &grad_vec) {
     auto args = argsMapper(x);
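The ArgMapper overloads remain for backward compatibility, but on platforms
with remote VQE support they fall back to per-iteration launches, hence the
warning. A before/after sketch of the migration the warning asks for (names
are illustrative):

// Before: an ArgMapper closes over the concrete arguments.
auto argMapper = [&](std::vector<double> x) {
  return std::make_tuple(x, n_qubits, n_layers);
};
auto [v1, p1] = cudaq::vqe(ansatz, H, optimizer, n_params, argMapper);

// After: pass the concrete arguments directly; on NVQC the whole
// optimization loop can then run server-side.
auto [v2, p2] = cudaq::vqe(ansatz, H, optimizer, n_params, n_qubits, n_layers);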
diff --git a/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp b/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp
index c8ad3046f0c..c6ec69ff701 100644
--- a/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp
+++ b/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp
@@ -259,6 +259,80 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
   // Stop the server.
   virtual void stop() override { m_server->stop(); }

+  virtual void handleVQERequest(std::size_t reqId,
+                                cudaq::ExecutionContext &io_context,
+                                const std::string &backendSimName,
+                                std::string_view ir, cudaq::gradient *gradient,
+                                cudaq::optimizer &optimizer, const int n_params,
+                                std::string_view kernelName,
+                                std::size_t seed) override {
+    cudaq::optimization_result result;
+
+    // If we're changing the backend, load the new simulator library from file.
+    if (m_simHandle.name != backendSimName) {
+      if (m_simHandle.libHandle)
+        dlclose(m_simHandle.libHandle);
+
+      m_simHandle =
+          SimulatorHandle(backendSimName, loadNvqirSimLib(backendSimName));
+    }
+
+    if (seed != 0)
+      cudaq::set_random_seed(seed);
+    simulationStart = std::chrono::high_resolution_clock::now();
+
+    auto &requestInfo = m_codeTransform[reqId];
+    if (requestInfo.format == cudaq::CodeFormat::LLVM) {
+      throw std::runtime_error("CodeFormat::LLVM is not supported with VQE. "
+                               "Use CodeFormat::MLIR instead.");
+    } else {
+      llvm::SourceMgr sourceMgr;
+      sourceMgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBufferCopy(ir),
+                                   llvm::SMLoc());
+      auto module = parseSourceFile<mlir::ModuleOp>(sourceMgr, m_mlirContext.get());
+      if (!module)
+        throw std::runtime_error("Failed to parse the input MLIR code");
+      auto engine = jitMlirCode(*module, requestInfo.passes);
+      const std::string entryPointFunc =
+          std::string(cudaq::runtime::cudaqGenPrefixName) +
+          std::string(kernelName);
+      auto fnPtr =
+          getValueOrThrow(engine->lookup(entryPointFunc),
+                          "Failed to look up entry-point function symbol");
+      if (!fnPtr)
+        throw std::runtime_error("Failed to get entry function");
+
+      // quake-to-qir translates cc.stdvec<f64> to !llvm.struct<(ptr<f64>,
+      // i64)>, so we need to provide the inputs in this format. Make a lambda
+      // to convert between the two formats.
+      struct stdvec_struct {
+        const double *ptr;
+        std::size_t size;
+      };
+      auto fn = reinterpret_cast<void (*)(stdvec_struct)>(fnPtr);
+      auto fnWrapper = [fn](const std::vector<double> &x) {
+        fn({x.data(), x.size()});
+      };
+
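The `stdvec_struct` bridge above is the whole ABI story: the JIT'ed entry
point consumes a (pointer, size) pair where the C++ signature had
`std::vector<double>`. A standalone illustration of the same adapter pattern,
with `entry` standing in for the looked-up JIT symbol (plain C++, no MLIR
involved):

#include <cstddef>
#include <cstdio>
#include <vector>

// The JIT'ed entry point sees a (ptr, size) struct, not a std::vector.
struct stdvec_struct {
  const double *ptr;
  std::size_t size;
};

// Stand-in for the entry-point symbol resolved from the JIT engine.
void entry(stdvec_struct v) {
  for (std::size_t i = 0; i < v.size; i++)
    printf("theta[%zu] = %f\n", i, v.ptr[i]);
}

int main() {
  auto fn = &entry;
  // Same shape as fnWrapper above: adapt vector-based callers to the ABI.
  auto fnWrapper = [fn](const std::vector<double> &x) {
    fn({x.data(), x.size()});
  };
  fnWrapper({0.1, 0.2, 0.3});
}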
+      // Construct the gradient object.
+      if (gradient)
+        gradient->setKernel(fnWrapper);
+
+      bool requiresGrad = optimizer.requiresGradients();
+      auto theSpin = **io_context.spin;
+
+      result = optimizer.optimize(n_params, [&](const std::vector<double> &x,
+                                                std::vector<double> &grad_vec) {
+        double e = cudaq::observe(fnWrapper, theSpin, x);
+        if (requiresGrad)
+          gradient->compute(x, grad_vec, theSpin, e);
+        return e;
+      });
+    }
+    simulationEnd = std::chrono::high_resolution_clock::now();
+    io_context.optResult = result;
+  }
+
   virtual void handleRequest(std::size_t reqId,
                              cudaq::ExecutionContext &io_context,
                              const std::string &backendSimName,
@@ -595,7 +669,25 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
     m_codeTransform[reqId] = CodeTransformInfo(request.format, request.passes);
     json resultJson;
-    if (request.executionContext.name == "state-overlap") {
+    std::vector<char> decodedCodeIr;
+    auto errorCode = llvm::decodeBase64(request.code, decodedCodeIr);
+    if (errorCode) {
+      LLVMConsumeError(llvm::wrap(std::move(errorCode)));
+      throw std::runtime_error("Failed to decode input IR");
+    }
+    std::string_view codeStr(decodedCodeIr.data(), decodedCodeIr.size());
+
+    if (request.opt.has_value() && request.opt->optimizer) {
+      if (!request.opt->optimizer_n_params.has_value())
+        throw std::runtime_error(
+            "Cannot run optimizer without providing optimizer_n_params");
+
+      handleVQERequest(
+          reqId, request.executionContext, request.simulator, codeStr,
+          request.opt->gradient.get(), *request.opt->optimizer,
+          *request.opt->optimizer_n_params, request.entryPoint, request.seed);
+      resultJson["executionContext"] = request.executionContext;
+    } else if (request.executionContext.name == "state-overlap") {
       if (!request.overlapKernel.has_value())
         throw std::runtime_error("Missing overlap kernel data.");
       std::vector<char> decodedCodeIr1, decodedCodeIr2;
diff --git a/runtime/cudaq/platform/qpu.h b/runtime/cudaq/platform/qpu.h
index 274d9577737..fefc83d8091 100644
--- a/runtime/cudaq/platform/qpu.h
+++ b/runtime/cudaq/platform/qpu.h
@@ -19,6 +19,8 @@
 #include <optional>

 namespace cudaq {
+class gradient;
+class optimizer;
 class SerializedCodeExecutionContext;

 /// Expose the function that will return the current ExecutionManager
@@ -136,6 +138,9 @@ class QPU : public registry::RegisteredType<QPU> {
   /// @brief Return whether this QPU has conditional feedback support
   virtual bool supportsConditionalFeedback() { return false; }

+  /// @brief Return whether this QPU has remote VQE execution support
+  virtual bool supportsRemoteVQE() { return false; }
+
   /// @brief Return whether this QPU has support for remote serialized code
   /// execution
   virtual bool supportsRemoteSerializedCode() { return false; }
@@ -160,6 +165,11 @@ class QPU : public registry::RegisteredType<QPU> {
   virtual void resetExecutionContext() = 0;
   virtual void setTargetBackend(const std::string &backend) {}

+  virtual void launchVQE(const std::string &name, const void *kernelArgs,
+                         cudaq::gradient *gradient, cudaq::spin_op H,
+                         cudaq::optimizer &optimizer, const int n_params,
+                         const std::size_t shots) {}
+
   /// Launch the kernel with given name (to extract its Quake representation).
   /// The raw function pointer is also provided, as are the runtime arguments,
   /// as a struct-packed void pointer and its corresponding size.
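Client code can observe the same capability flag the new `vqe()` overloads
consult. A tiny probe; the printed values are an expectation based on this
patch, not a guarantee for every target:

#include <cudaq.h>
#include <cstdio>

int main() {
  auto &platform = cudaq::get_platform();
  // Expected to print 1 under --target remote-mqpu (whose QPU overrides
  // supportsRemoteVQE) and 0 for purely local simulation targets.
  printf("remote VQE support: %d\n",
         static_cast<int>(platform.supports_remote_vqe(/*qpuId=*/0)));
}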
diff --git a/runtime/cudaq/platform/quantum_platform.cpp b/runtime/cudaq/platform/quantum_platform.cpp
index 722434442fb..c676bc04dfb 100644
--- a/runtime/cudaq/platform/quantum_platform.cpp
+++ b/runtime/cudaq/platform/quantum_platform.cpp
@@ -126,6 +126,27 @@ bool quantum_platform::supports_conditional_feedback(
   return platformQPUs[qpu_id]->supportsConditionalFeedback();
 }

+bool quantum_platform::supports_remote_vqe(const std::size_t qpu_id) const {
+  return platformQPUs[qpu_id]->supportsRemoteVQE();
+}
+
+void quantum_platform::launchVQE(const std::string kernelName,
+                                 const void *kernelArgs,
+                                 cudaq::gradient *gradient, cudaq::spin_op H,
+                                 cudaq::optimizer &optimizer,
+                                 const int n_params, const std::size_t shots) {
+  std::size_t qpu_id = 0;
+
+  auto tid = std::hash<std::thread::id>{}(std::this_thread::get_id());
+  auto iter = threadToQpuId.find(tid);
+  if (iter != threadToQpuId.end())
+    qpu_id = iter->second;
+
+  auto &qpu = platformQPUs[qpu_id];
+  qpu->launchVQE(kernelName, kernelArgs, gradient, H, optimizer, n_params,
+                 shots);
+}
+
 bool quantum_platform::supports_remote_serialized_code(
     const std::size_t qpu_id) const {
   return platformQPUs[qpu_id]->supportsRemoteSerializedCode();
diff --git a/runtime/cudaq/platform/quantum_platform.h b/runtime/cudaq/platform/quantum_platform.h
index b7641d1af84..f5985aafa0b 100644
--- a/runtime/cudaq/platform/quantum_platform.h
+++ b/runtime/cudaq/platform/quantum_platform.h
@@ -24,6 +24,8 @@
 namespace cudaq {

 class QPU;
+class gradient;
+class optimizer;
 class SerializedCodeExecutionContext;

 /// Typedefs for defining the connectivity structure of a QPU
@@ -114,6 +116,9 @@ class quantum_platform {
   /// @brief Return true if QPU is locally emulating a remote QPU
   bool is_emulated(const std::size_t qpuId = 0) const;

+  /// @brief Return whether the QPU has support for fully remote VQE execution
+  bool supports_remote_vqe(const std::size_t qpuId = 0) const;
+
   /// @brief Set the noise model for future invocations of
   /// quantum kernels.
   void set_noise(const noise_model *model);
@@ -132,6 +137,12 @@ class quantum_platform {
   /// @brief Enqueue a general task that runs on the specified QPU
   void enqueueAsyncTask(const std::size_t qpu_id, std::function<void()> &f);

+  /// @brief Launch a VQE operation on the platform.
+  void launchVQE(const std::string kernelName, const void *kernelArgs,
+                 cudaq::gradient *gradient, cudaq::spin_op H,
+                 cudaq::optimizer &optimizer, const int n_params,
+                 const std::size_t shots);
+
   // This method is the hook for the kernel rewrites to invoke
   // quantum kernels.
   void launchKernel(std::string kernelName, void (*kernelFunc)(void *),
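`quantum_platform::launchVQE` routes to a QPU using the platform's
thread-to-QPU map: registered worker threads get their own QPU, and any
unregistered thread falls back to QPU 0. A standalone mock of that lookup
logic (the map contents are invented for the demo):

#include <cstdio>
#include <functional>
#include <thread>
#include <unordered_map>

int main() {
  std::unordered_map<std::size_t, std::size_t> threadToQpuId; // tid -> QPU
  auto self = std::hash<std::thread::id>{}(std::this_thread::get_id());
  auto qpuFor = [&](std::size_t tid) {
    auto iter = threadToQpuId.find(tid);
    return iter != threadToQpuId.end() ? iter->second : std::size_t{0};
  };
  printf("before registration -> QPU %zu\n", qpuFor(self)); // falls back to 0
  threadToQpuId[self] = 1; // as async worker threads would be registered
  printf("after registration  -> QPU %zu\n", qpuFor(self)); // now 1
}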
diff --git a/targettests/Remote-Sim/vqe_h2.cpp b/targettests/Remote-Sim/vqe_h2.cpp
new file mode 100644
index 00000000000..e1764a121a0
--- /dev/null
+++ b/targettests/Remote-Sim/vqe_h2.cpp
@@ -0,0 +1,149 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.   *
+ ******************************************************************************/
+
+// REQUIRES: remote-sim
+
+// clang-format off
+// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu --remote-mqpu-auto-launch 1 %s -o %t && %t
+// clang-format on
+
+#include <cudaq.h>
+#include <cudaq/algorithm.h>
+#include <cudaq/gradients.h>
+#include <cudaq/optimizers.h>
+#include <cassert>
+
+// Here we build up a CUDA-Q kernel with N layers, each containing an
+// arrangement of random SO(4) rotations. The algorithm leverages the CUDA-Q
+// VQE support to compute the ground state of the Hydrogen molecule.
+
+// The SO4 entangler, written as a free-function CUDA-Q kernel since it is a
+// pure-device quantum kernel.
+__qpu__ void so4(cudaq::qubit &q, cudaq::qubit &r,
+                 const std::vector<double> &thetas) {
+  ry(thetas[0], q);
+  ry(thetas[1], r);
+
+  h(r);
+  cx(q, r);
+  h(r);
+
+  ry(thetas[2], q);
+  ry(thetas[3], r);
+
+  h(r);
+  cx(q, r);
+  h(r);
+
+  ry(thetas[4], q);
+  ry(thetas[5], r);
+
+  h(r);
+  cx(q, r);
+  h(r);
+}
+
+// The SO4 fabric CUDA-Q kernel. The number of qubits and entangling layers
+// are passed in as concrete (non-variational) arguments.
+struct so4_fabric {
+  void operator()(std::vector<double> params, int n_qubits,
+                  int n_layers) __qpu__ {
+    cudaq::qvector q(n_qubits);
+
+    x(q[0]);
+    x(q[2]);
+
+    const int block_size = 2;
+    int counter = 0;
+    for (int i = 0; i < n_layers; i++) {
+      // first layer of so4 blocks (even)
+      for (int k = 0; k < n_qubits; k += 2) {
+        auto subq = q.slice(k, block_size);
+        auto so4_params = cudaq::slice_vector(params, counter, 6);
+        so4(subq[0], subq[1], so4_params);
+        counter += 6;
+      }
+
+      // second layer of so4 blocks (odd)
+      for (int k = 1; k + block_size < n_qubits; k += 2) {
+        auto subq = q.slice(k, block_size);
+        auto so4_params = cudaq::slice_vector(params, counter, 6);
+        so4(subq[0], subq[1], so4_params);
+        counter += 6;
+      }
+    }
+  }
+};
+
+int main() {
+  // Build the H2 spin op from raw data (inlined here rather than read from a
+  // file).
+  std::vector<double> h2_data{0, 0, 0, 0, -0.10647701149499994, 0.0,
+                              1, 1, 1, 1, 0.0454063328691, 0.0,
+                              1, 1, 3, 3, 0.0454063328691, 0.0,
+                              3, 3, 1, 1, 0.0454063328691, 0.0,
+                              3, 3, 3, 3, 0.0454063328691, 0.0,
+                              2, 0, 0, 0, 0.170280101353, 0.0,
+                              2, 2, 0, 0, 0.120200490713, 0.0,
+                              2, 0, 2, 0, 0.168335986252, 0.0,
+                              2, 0, 0, 2, 0.165606823582, 0.0,
+                              0, 2, 0, 0, -0.22004130022499996, 0.0,
+                              0, 2, 2, 0, 0.165606823582, 0.0,
+                              0, 2, 0, 2, 0.174072892497, 0.0,
+                              0, 0, 2, 0, 0.17028010135300004, 0.0,
+                              0, 0, 2, 2, 0.120200490713, 0.0,
+                              0, 0, 0, 2, -0.22004130022499999, 0.0,
+                              15};
+  cudaq::spin_op H(h2_data, /*nQubits*/ 4);
+
+  // For this 4-qubit Hamiltonian: 3 SO(4) blocks per layer, 6 parameters per
+  // block, so 18 parameters per layer and 36 in total for 2 layers.
+  int n_layers = 2, n_qubits = H.num_qubits(), block_size = 2, p_counter = 0;
+  int n_blocks_per_layer = 2 * (n_qubits / block_size) - 1;
+  int n_params = n_layers * 6 * n_blocks_per_layer;
+  printf("%d qubit Hamiltonian -> %d parameters\n", n_qubits, n_params);
+
+  // Define the initial parameters and ansatz.
+  auto init_params =
+      cudaq::random_vector(-1, 1, n_params, std::mt19937::default_seed);
+
+  so4_fabric ansatz;
+
+  // Run VQE with lbfgs + central_difference
+  {
+    cudaq::optimizers::lbfgs optimizer;
+    optimizer.initial_parameters = init_params;
+    optimizer.max_eval = 20;
+    optimizer.max_line_search_trials = 10;
+    cudaq::gradients::central_difference gradient;
+    auto [opt_val, opt_params] = cudaq::vqe(ansatz, gradient, H, optimizer,
+                                            n_params, n_qubits, n_layers);
+    printf("Optimal value = %.16lf\n", opt_val);
+    assert(std::abs(opt_val - -1.1164613629294273) < 1e-3);
+  }
+  // Run VQE with cobyla
+  {
+    cudaq::optimizers::cobyla optimizer;
+    optimizer.initial_parameters = init_params;
+    optimizer.max_eval = 100;
+    auto [opt_val, opt_params] =
+        cudaq::vqe(ansatz, H, optimizer, n_params, n_qubits, n_layers);
+    printf("Optimal value = %.16lf\n", opt_val);
+    assert(std::abs(opt_val - -1.0769400650758392) < 1e-3);
+  }
+  // Run VQE with cobyla and a fixed number of shots
+  {
+    cudaq::optimizers::cobyla optimizer;
+    optimizer.initial_parameters = init_params;
+    optimizer.max_eval = 100;
+    auto [opt_val, opt_params] = cudaq::vqe(
+        /*shots=*/1000, ansatz, H, optimizer, n_params, n_qubits, n_layers);
+    printf("Optimal value = %.16lf\n", opt_val);
+    assert(std::abs(opt_val - -1.0769400650758392) < 1e-3);
+  }
+}