diff --git a/.github/workflows/nvqc_regression_tests.yml b/.github/workflows/nvqc_regression_tests.yml
index 5057ae5710..afa4f006bf 100644
--- a/.github/workflows/nvqc_regression_tests.yml
+++ b/.github/workflows/nvqc_regression_tests.yml
@@ -127,7 +127,8 @@ jobs:
             # pauli_word: https://github.com/NVIDIA/cuda-quantum/issues/1957
             # nested_vectors: related to vector of pauli_words (https://github.com/NVIDIA/cuda-quantum/issues/1957)
             # custom_operation: https://github.com/NVIDIA/cuda-quantum/issues/1985
-            if [[ "$filename" != *"unsupport_args"* ]] && [[ "$filename" != *"state_overlap"* ]] && [[ "$filename" != *"compile_errors"* ]] && [[ "$filename" != *"nested_vectors"* ]] && [[ "$filename" != *"pauli_word"* ]] && [[ "$filename" != *"custom_operation"* ]]; then
+            # return_values: only supported in 0.8 NVQC service.
+            if [[ "$filename" != *"unsupport_args"* ]] && [[ "$filename" != *"state_overlap"* ]] && [[ "$filename" != *"compile_errors"* ]] && [[ "$filename" != *"nested_vectors"* ]] && [[ "$filename" != *"pauli_word"* ]] && [[ "$filename" != *"custom_operation"* ]] && [[ "$filename" != *"return_values"* ]]; then
               echo "$filename"
               nvqc_config=""
               # Look for a --remote-mqpu-auto-launch to determine the number of QPUs
diff --git a/docs/sphinx/using/backends/platform.rst b/docs/sphinx/using/backends/platform.rst
index fd6709dd95..d30d04c2f5 100644
--- a/docs/sphinx/using/backends/platform.rst
+++ b/docs/sphinx/using/backends/platform.rst
@@ -267,4 +267,7 @@ language constructs within quantum kernels may not yet be fully supported.
    * - Single-level nested `std::vector` of supported `std::vector` types
      - `std::vector<std::vector<int>>`, `std::vector<cudaq::pauli_word>`, etc. 
      - Number of top-level elements (as a 64-bit integer) followed sizes in bytes of element vectors (as a contiguous array of 64-bit integers) then serialized data of the inner vectors.
-     
\ No newline at end of file
+     
+For CUDA-Q kernels that return a value, the remote platform supports returning the simple
+data types `bool`, integral types (e.g., `int` or `std::size_t`), and floating-point types
+(`float` or `double`) when MLIR-based compilation is enabled (:code:`--enable-mlir`).
diff --git a/runtime/common/BaseRemoteSimulatorQPU.h b/runtime/common/BaseRemoteSimulatorQPU.h
index 1fbed52564..0c897cd4b3 100644
--- a/runtime/common/BaseRemoteSimulatorQPU.h
+++ b/runtime/common/BaseRemoteSimulatorQPU.h
@@ -126,8 +126,17 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU {
     // set up a single-shot execution context for this case.
     static thread_local cudaq::ExecutionContext defaultContext("sample",
                                                                /*shots=*/1);
+    // This is a kernel invocation outside the CUDA-Q APIs (sample/observe).
+    const bool isDirectInvocation = !executionContextPtr;
     cudaq::ExecutionContext &executionContext =
         executionContextPtr ? *executionContextPtr : defaultContext;
+
+    // Populate the conditional feedback metadata if this is a direct
+    // invocation (not otherwise populated by cudaq::sample)
+    if (isDirectInvocation)
+      executionContext.hasConditionalsOnMeasureResults =
+          cudaq::kernelHasConditionalFeedback(name);
+
     std::string errorMsg;
     const bool requestOkay = m_client->sendRequest(
         *m_mlirContext, executionContext, /*serializedCodeContext=*/nullptr,
@@ -135,6 +144,30 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU {
         m_simName, name, kernelFunc, args, voidStarSize, &errorMsg);
     if (!requestOkay)
       throw std::runtime_error("Failed to launch kernel. Error: " + errorMsg);
+    if (isDirectInvocation &&
+        !executionContext.invocationResultBuffer.empty()) {
+      if (executionContext.invocationResultBuffer.size() + resultOffset >
+          voidStarSize)
+        throw std::runtime_error(
+            "Unexpected result: return type size of " +
+            std::to_string(executionContext.invocationResultBuffer.size()) +
+            " bytes overflows the argument buffer.");
+      // Currently, we only support result buffer serialization on LittleEndian
+      // CPUs (x86, ARM, PPC64LE).
+      // Note: NVQC service will always be using LE. If
+      // the client (e.g., compiled from source) is built for big-endian, we
+      // will throw an error if result buffer data is returned.
+      if (llvm::sys::IsBigEndianHost)
+        throw std::runtime_error(
+            "Serializing the result buffer from a remote kernel invocation is "
+            "not supported for BigEndian CPU architectures.");
+
+      char *resultBuf = reinterpret_cast<char *>(args) + resultOffset;
+      // Copy the result data to the args buffer.
+      std::memcpy(resultBuf, executionContext.invocationResultBuffer.data(),
+                  executionContext.invocationResultBuffer.size());
+      executionContext.invocationResultBuffer.clear();
+    }
   }
 
   void
diff --git a/runtime/common/ExecutionContext.h b/runtime/common/ExecutionContext.h
index d7f763dd62..70c0827e49 100644
--- a/runtime/common/ExecutionContext.h
+++ b/runtime/common/ExecutionContext.h
@@ -100,6 +100,11 @@ class ExecutionContext {
   /// register after execution. Empty means no reordering.
   std::vector<std::size_t> reorderIdx;
 
+  /// @brief A buffer containing the return value of a kernel invocation.
+  /// Note: this is only needed for invocations that cannot return a
+  /// `sample_result`.
+  std::vector<char> invocationResultBuffer;
+
   /// @brief The Constructor, takes the name of the context
   /// @param n The name of the context
   ExecutionContext(const std::string n) : name(n) {}
diff --git a/runtime/common/JsonConvert.h b/runtime/common/JsonConvert.h
index 0c770dd258..67da14cea0 100644
--- a/runtime/common/JsonConvert.h
+++ b/runtime/common/JsonConvert.h
@@ -153,6 +153,9 @@ inline void to_json(json &j, const ExecutionContext &context) {
 
   if (context.amplitudeMaps.has_value())
     j["amplitudeMaps"] = context.amplitudeMaps.value();
+
+  if (!context.invocationResultBuffer.empty())
+    j["invocationResultBuffer"] = context.invocationResultBuffer;
 }
 
 inline void from_json(const json &j, ExecutionContext &context) {
@@ -214,6 +217,9 @@ inline void from_json(const json &j, ExecutionContext &context) {
 
   if (j.contains("amplitudeMaps"))
     context.amplitudeMaps = j["amplitudeMaps"];
+
+  if (j.contains("invocationResultBuffer"))
+    context.invocationResultBuffer = j["invocationResultBuffer"];
 }
 
 // Enum data to denote the payload format.
diff --git a/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp b/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp
index f54f07299a..b532ba171c 100644
--- a/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp
+++ b/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp
@@ -420,7 +420,7 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
           io_context.hasConditionalsOnMeasureResults) {
         // Need to run simulation shot-by-shot
         cudaq::sample_result counts;
-        invokeMlirKernel(m_mlirContext, ir, requestInfo.passes,
+        invokeMlirKernel(io_context, m_mlirContext, ir, requestInfo.passes,
                          std::string(kernelName), io_context.shots,
                          [&](std::size_t i) {
                            // Reset the context and get the single
@@ -436,7 +436,7 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
         io_context.result = counts;
         platform.set_exec_ctx(&io_context);
       } else {
-        invokeMlirKernel(m_mlirContext, ir, requestInfo.passes,
+        invokeMlirKernel(io_context, m_mlirContext, ir, requestInfo.passes,
                          std::string(kernelName));
       }
     }
@@ -537,7 +537,8 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
   }
 
   void
-  invokeMlirKernel(std::unique_ptr<MLIRContext> &contextPtr,
+  invokeMlirKernel(cudaq::ExecutionContext &io_context,
+                   std::unique_ptr<MLIRContext> &contextPtr,
                    std::string_view irString,
                    const std::vector<std::string> &passes,
                    const std::string &entryPointFn, std::size_t numTimes = 1,
@@ -549,21 +550,56 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
     if (!module)
       throw std::runtime_error("Failed to parse the input MLIR code");
     auto engine = jitMlirCode(*module, passes);
+    llvm::SmallVector<void *> returnArg;
     const std::string entryPointFunc =
         std::string(cudaq::runtime::cudaqGenPrefixName) + entryPointFn;
-    auto fnPtr =
-        getValueOrThrow(engine->lookup(entryPointFunc),
-                        "Failed to look up entry-point function symbol");
-    if (!fnPtr)
-      throw std::runtime_error("Failed to get entry function");
+    if (auto funcOp = module->lookupSymbol<LLVM::LLVMFuncOp>(entryPointFunc)) {
+      auto funcTy = funcOp.getFunctionType();
+      auto returnTy = funcTy.getReturnType();
+      // These are the returned types that we support.
+      if (returnTy.isF32()) {
+        io_context.invocationResultBuffer.resize(sizeof(float));
+        returnArg.push_back(io_context.invocationResultBuffer.data());
+      } else if (returnTy.isF64()) {
+        io_context.invocationResultBuffer.resize(sizeof(double));
+        returnArg.push_back(io_context.invocationResultBuffer.data());
+      } else if (returnTy.isInteger(1)) {
+        static_assert(sizeof(bool) == sizeof(char),
+                      "Incompatible boolean data type. CUDA-Q kernels expect "
+                      "sizeof(bool) == sizeof(char).");
+        io_context.invocationResultBuffer.resize(sizeof(bool));
+        returnArg.push_back(io_context.invocationResultBuffer.data());
+      } else if (returnTy.isIntOrIndex()) {
+        io_context.invocationResultBuffer.resize(
+            (returnTy.getIntOrFloatBitWidth() + 7) / 8);
+        returnArg.push_back(io_context.invocationResultBuffer.data());
+      }
+    }
 
-    auto fn = reinterpret_cast<void (*)()>(fnPtr);
-    simulationStart = std::chrono::high_resolution_clock::now();
-    for (std::size_t i = 0; i < numTimes; ++i) {
-      // Invoke the kernel
-      fn();
-      if (postExecCallback) {
-        postExecCallback(i);
+    // Note: currently, we only return data from kernel on single-shot
+    // execution. Once we enable arbitrary sample return type, we can run this
+    // in a loop and return a vector of return type.
+    if (numTimes == 1 && !returnArg.empty()) {
+      simulationStart = std::chrono::high_resolution_clock::now();
+      llvm::Error error = engine->invokePacked(entryPointFunc, returnArg);
+      if (error)
+        throw std::runtime_error("JIT invocation failed");
+      if (postExecCallback)
+        postExecCallback(0);
+    } else {
+      auto fnPtr =
+          getValueOrThrow(engine->lookup(entryPointFunc),
+                          "Failed to look up entry-point function symbol");
+      if (!fnPtr)
+        throw std::runtime_error("Failed to get entry function");
+
+      auto fn = reinterpret_cast<void (*)()>(fnPtr);
+      simulationStart = std::chrono::high_resolution_clock::now();
+      for (std::size_t i = 0; i < numTimes; ++i) {
+        // Invoke the kernel
+        fn();
+        if (postExecCallback)
+          postExecCallback(i);
       }
     }
   }
diff --git a/targettests/Remote-Sim/return_values.cpp b/targettests/Remote-Sim/return_values.cpp
new file mode 100644
index 0000000000..a78e8a2c30
--- /dev/null
+++ b/targettests/Remote-Sim/return_values.cpp
@@ -0,0 +1,101 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// REQUIRES: remote-sim
+// REQUIRES: c++20
+
+// clang-format off
+// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu --remote-mqpu-auto-launch 1 %s -o %t && %t
+// clang-format on
+
+#include <cudaq.h>
+#include <iostream>
+
+struct rwpe { // Kernel that returns a double built from measurement feedback.
+  double operator()(const int n_iter, double mu, double sigma) __qpu__ {
+    int iteration = 0;
+    // NOTE(review): presumably random-walk phase estimation -- confirm.
+    // Allocate the two qubits used by the kernel.
+    cudaq::qvector q(2);
+
+    // Alias them: `aux` is q[0] (measured every round), `target` is q[1].
+    auto &aux = q.front();
+    auto &target = q.back();
+    // Apply x to q[1], i.e. the qubit aliased as `target`, before the loop.
+    x(q[1]);
+    // Run n_iter rounds: apply gates, measure `aux`, update mu and sigma.
+    while (iteration < n_iter) {
+      h(aux);
+      rz(1.0 - (mu / sigma), aux);
+      rz(.25 / sigma, target);
+      x<cudaq::ctrl>(aux, target);
+      rz(-.25 / sigma, target);
+      x<cudaq::ctrl>(aux, target);
+      h(aux);
+      if (mz(aux)) {
+        x(aux);
+        mu += sigma * .6065;
+      } else {
+        mu -= sigma * .6065;
+      }
+      // sigma is scaled by the same factor whichever branch was taken.
+      sigma *= .7951;
+      iteration += 1;
+    }
+    // The value handed back to the caller is twice the final mu.
+    return 2. * mu;
+  }
+};
+
+struct returnTrue { // Kernel returning the measurement of a qubit after x.
+  bool operator()() __qpu__ {
+    cudaq::qubit q;
+    x(q); // main() asserts that the measurement returned below is true.
+    return mz(q);
+  }
+};
+
+struct returnFalse { // Kernel returning the AND of two qubit measurements.
+  bool operator()() __qpu__ {
+    cudaq::qubit q, r;
+    x(q); // Only q gets a gate; r is measured with no gate applied to it.
+    return mz(q) && mz(r);
+  }
+};
+
+struct returnInt { // Kernel returning how many of `iters` rounds measured 1.
+  int operator()(int iters) __qpu__ {
+    cudaq::qubit q;
+    int count = 0; // number of rounds in which mz(q) was true
+    for (int i = 0; i < iters; ++i) {
+      h(q);
+      if (mz(q)) { // tally the 1 outcome, then apply x before the next round
+        count++;
+        x(q);
+      }
+    }
+    return count;
+  }
+};
+
+int main() {
+  // Floating-point return value: the kernel result must be close to 0.49.
+  const double phase = rwpe{}(/*n_iter=*/24, /*mu=*/0.7951, /*sigma=*/0.6065);
+  assert(std::abs(phase - 0.49) < 0.05);
+
+  // Boolean return values, one kernel per expected outcome.
+  assert(returnTrue{}());
+  assert(!returnFalse{}());
+
+  // Integer return value: seed the RNG, then count 1s over 1000 rounds.
+  cudaq::set_random_seed(123);
+  const int tally = returnInt{}(1000);
+  std::cout << "One count = " << tally << "\n";
+  // Roughly half of the rounds are expected to measure 1.
+  assert(tally > 100);
+}