diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index 4b15be9915..03af81ab35 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -12,6 +12,7 @@
 // These transforms can generally be thought of as "optimizations" or "rewrites"
 // on the IR.
 
+#include "SimulationData.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassRegistry.h"
@@ -43,7 +44,9 @@ std::unique_ptr<mlir::Pass> createQuakeAddDeallocs();
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer();
 std::unique_ptr<mlir::Pass>
 createQuakeSynthesizer(std::string_view, const void *,
-                       std::size_t startingArgIdx = 0);
+                       std::size_t startingArgIdx = 0,
+                       SimulationStateData::getDataFunc *getData = nullptr,
+                       bool sameAddressSpace = false);
 std::unique_ptr<mlir::Pass> createRaiseToAffinePass();
 std::unique_ptr<mlir::Pass> createUnwindLoweringPass();
 
diff --git a/include/cudaq/Optimizer/Transforms/SimulationData.h b/include/cudaq/Optimizer/Transforms/SimulationData.h
new file mode 100644
index 0000000000..d0c8b3b5b4
--- /dev/null
+++ b/include/cudaq/Optimizer/Transforms/SimulationData.h
@@ -0,0 +1,51 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+#include <complex>
+#include <iostream>
+#include <numbers>
+#include <vector>
+
+// cudaq::state is defined in the runtime. The compiler will never need to know
+// about its implementation and there should not be a circular build/library
+// dependence because of it. Simply forward declare it, as it is notional.
+namespace cudaq {
+class state;
+}
+
+/// Owns a type-erased host array of state amplitudes. `data` must come from
+/// `new std::complex<float>[size]` or `new std::complex<double>[size]`.
+class SimulationStateData {
+public:
+  typedef SimulationStateData(getDataFunc)(cudaq::state *);
+
+  SimulationStateData(void *data, std::size_t size, std::size_t elementSize)
+      : data(data), size(size), elementSize(elementSize) {}
+
+  // Sole owner of `data`: no copies (double free); a move empties its source.
+  SimulationStateData(const SimulationStateData &) = delete;
+  SimulationStateData &operator=(const SimulationStateData &) = delete;
+  SimulationStateData(SimulationStateData &&other) noexcept
+      : data(other.data), size(other.size), elementSize(other.elementSize) {
+    other.data = nullptr;
+  }
+
+  // `delete[]` must name the element type that `new[]` allocated.
+  ~SimulationStateData() {
+    if (elementSize == sizeof(std::complex<float>))
+      delete[] static_cast<std::complex<float> *>(data);
+    else if (elementSize == sizeof(std::complex<double>))
+      delete[] static_cast<std::complex<double> *>(data);
+  }
+
+  void *data;
+  std::size_t size;
+  std::size_t elementSize;
+};
diff --git a/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp b/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
index 8d4be784db..ed78fe7bd4 100644
--- a/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
+++ b/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
@@ -43,7 +43,9 @@ struct VerifyNVQIRCallOpsPass
           cudaq::opt::NVQIRInvokeRotationWithControlBits,
           cudaq::opt::NVQIRInvokeWithControlRegisterOrBits,
           cudaq::opt::NVQIRPackSingleQubitInArray,
-          cudaq::opt::NVQIRReleasePackedQubitArray};
+          cudaq::opt::NVQIRReleasePackedQubitArray,
+          cudaq::getNumQubitsFromCudaqState,
+      };
       // It must be either NVQIR extension functions or in the allowed list.
       return std::find(NVQIR_FUNCS.begin(), NVQIR_FUNCS.end(), functionName) !=
                  NVQIR_FUNCS.end() ||
@@ -71,8 +73,8 @@ struct VerifyNVQIRCallOpsPass
         passFailed = true;
         return WalkResult::interrupt();
       } else if (!isa<LLVM::AddressOfOp, LLVM::AllocaOp, LLVM::BitcastOp,
-                      LLVM::ExtractValueOp, LLVM::GEPOp, LLVM::LoadOp,
-                      LLVM::StoreOp>(op)) {
+                      LLVM::ExtractValueOp, LLVM::GEPOp, LLVM::IntToPtrOp,
+                      LLVM::LoadOp, LLVM::StoreOp>(op)) {
         // No pointers allowed except for the above operations.
         for (auto oper : op->getOperands()) {
           if (isa<LLVM::LLVMPointerType>(oper.getType())) {
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index 54aeeee024..a32eb6d737 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -15,6 +15,7 @@
 #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
 #include "cudaq/Optimizer/Transforms/Passes.h"
+#include "cudaq/Optimizer/Transforms/SimulationData.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -115,6 +116,102 @@ static bool hasInitStateUse(BlockArgument argument) {
   return false;
 }
 
+/// Copy the type-erased amplitudes held by `stateData` into a typed vector.
+/// `T` must have the element size recorded in `stateData`.
+template <typename T>
+std::vector<T> stateDataToVector(SimulationStateData &stateData) {
+  assert(sizeof(T) == stateData.elementSize &&
+         "incorrect element size in simulation data");
+
+  // `data` is read as `size` consecutive `T`s; the range constructor copies
+  // them with one allocation instead of growing through repeated `push_back`.
+  auto *first = static_cast<T *>(stateData.data);
+  auto *last = first + stateData.size;
+  return std::vector<T>(first, last);
+}
+
+/// Emit `vec` into `module` through `IRBuilder::genVectorOfConstants` under a
+/// fresh `__nvqpp_rodata_init_state.<N>` symbol and return its address. `vec`
+/// is borrowed, not copied: it can be an entire state vector.
+template <typename T>
+Value createGlobalArray(OpBuilder &builder, ModuleOp module, unsigned &counter,
+                        BlockArgument argument, Type arrTy,
+                        const std::vector<T> &vec) {
+  // Restore the caller's insertion point when this function returns.
+  OpBuilder::InsertionGuard guard(builder);
+  auto argLoc = argument.getLoc();
+  std::string symbol = "__nvqpp_rodata_init_state." + std::to_string(counter++);
+  cudaq::IRBuilder irBuilder(builder);
+  irBuilder.genVectorOfConstants(argLoc, module, symbol, vec);
+  builder.setInsertionPointToStart(argument.getOwner());
+  return builder.create<cudaq::cc::AddressOfOp>(
+      argLoc, cudaq::cc::PointerType::get(arrTy), symbol);
+}
+
+/// Synthesize a `cudaq::state*` argument from the amplitudes in `vec`: calls
+/// to `getNumQubitsFromCudaqState` become a constant, other uses read a global.
+template <typename T>
+LogicalResult synthesizeStateArgument(OpBuilder &builder, ModuleOp module,
+                                      unsigned &counter, BlockArgument argument,
+                                      Type eleTy, const std::vector<T> &vec) {
+  auto *ctx = builder.getContext();
+  auto argLoc = argument.getLoc();
+  auto arrTy = cudaq::cc::ArrayType::get(ctx, eleTy, vec.size());
+
+  // floor(log2(size)) in integer math; 0 for an empty vector (log2(0) is -inf).
+  std::int64_t qubitCount = 0;
+  for (std::size_t n = vec.size(); n > 1; n >>= 1)
+    ++qubitCount;
+  builder.setInsertionPointToStart(argument.getOwner());
+  auto toErase = std::vector<mlir::Operation *>();
+  for (auto *argUser : argument.getUsers()) {
+    // Replace calls to the runtime function that reads the number of qubits
+    // with the log of the length, which is a synthesized constant.
+    if (auto numOfQubitsOp = dyn_cast<func::CallOp>(argUser)) {
+      if (auto calleeAttr = numOfQubitsOp.getCalleeAttr()) {
+        if (calleeAttr.getValue().str() == cudaq::getNumQubitsFromCudaqState) {
+          Value numOfQubits = builder.create<arith::ConstantIntOp>(
+              argLoc, qubitCount, builder.getI64Type());
+          numOfQubitsOp.replaceAllUsesWith(ValueRange{numOfQubits});
+          toErase.push_back(numOfQubitsOp);
+        } else {
+          argUser->emitError("Unexpected call on state argument");
+          return failure();
+        }
+      }
+    }
+  }
+
+  OpBuilder::InsertionGuard guard(builder);
+  auto buffer =
+      createGlobalArray(builder, module, counter, argument, arrTy, vec);
+  auto ptrArrEleTy =
+      cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy));
+  Value memArr = builder.create<cudaq::cc::CastOp>(argLoc, ptrArrEleTy, buffer);
+  argument.replaceAllUsesWith(memArr);
+  for (auto &op : toErase)
+    op->erase();
+  return success();
+}
+
+/// Synthesize the state argument at the precision `stateData` was stored in.
+static LogicalResult synthesizeStateArgument(OpBuilder &builder,
+                                             ModuleOp module, unsigned &counter,
+                                             BlockArgument argument,
+                                             SimulationStateData &stateData) {
+  if (stateData.elementSize == sizeof(std::complex<float>)) {
+    auto buf = stateDataToVector<std::complex<float>>(stateData);
+    return synthesizeStateArgument(builder, module, counter, argument,
+                                   ComplexType::get(builder.getF32Type()), buf);
+  } else if (stateData.elementSize == sizeof(std::complex<double>)) {
+    auto buf = stateDataToVector<std::complex<double>>(stateData);
+    return synthesizeStateArgument(builder, module, counter, argument,
+                                   ComplexType::get(builder.getF64Type()), buf);
+  }
+  module.emitError("unexpected element size in simulation state data");
+  return failure();
+}
+
 template <typename ELETY, typename T, typename ATTR, typename MAKER>
 LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
@@ -122,9 +219,11 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          ATTR arrayAttr, MAKER makeElementValue) {
   auto *ctx = builder.getContext();
   auto argTy = argument.getType();
+
   assert(isa<cudaq::cc::StdvecType>(argTy));
   auto strTy = cast<cudaq::cc::StdvecType>(argTy);
   auto eleTy = cast<ELETY>(strTy.getElementType());
+
   builder.setInsertionPointToStart(argument.getOwner());
   auto argLoc = argument.getLoc();
   auto conArray = builder.create<cudaq::cc::ConstantArrayOp>(
@@ -141,16 +240,8 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
     OpBuilder::InsertionGuard guard(builder);
     Value buffer;
     if (hasInitStateUse(argument)) {
-      // Stick global at end of Module.
-      std::string symbol =
-          "__nvqpp_rodata_init_state." + std::to_string(counter++);
-
-      cudaq::IRBuilder irBuilder(builder);
-      irBuilder.genVectorOfConstants(argLoc, module, symbol, vec);
-
-      builder.setInsertionPointToStart(argument.getOwner());
-      buffer = builder.create<cudaq::cc::AddressOfOp>(
-          argLoc, cudaq::cc::PointerType::get(arrTy), symbol);
+      buffer =
+          createGlobalArray(builder, module, counter, argument, arrTy, vec);
     } else {
       builder.setInsertionPointAfter(conArray);
       buffer = builder.create<cudaq::cc::AllocaOp>(argLoc, arrTy);
@@ -395,12 +486,18 @@ class QuakeSynthesizer
   // in `args`.
   std::size_t startingArgIdx = 0;
 
+  // Function to read the state data, if any.
+  SimulationStateData::getDataFunc *getStateData = nullptr;
+
+  // Is the simulation running in the same address space as synthesis?
+  bool sameAddressSpace = false;
+
 public:
   QuakeSynthesizer() = default;
-  QuakeSynthesizer(std::string_view kernel, const void *a)
-      : kernelName(kernel), args(a) {}
-  QuakeSynthesizer(std::string_view kernel, const void *a, std::size_t s)
-      : kernelName(kernel), args(a), startingArgIdx(s) {}
+  QuakeSynthesizer(std::string_view kernel, const void *a, std::size_t s,
+                   SimulationStateData::getDataFunc *getData, bool sameSpace)
+      : kernelName(kernel), args(a), startingArgIdx(s), getStateData(getData),
+        sameAddressSpace(sameSpace) {}
 
   mlir::ModuleOp getModule() { return getOperation(); }
 
@@ -540,19 +637,37 @@ class QuakeSynthesizer
 
       if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(type)) {
         if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
-          // Special case of a `cudaq::state*` which must be in the same address
-          // space. This references a container to a set of simulation
-          // amplitudes.
-          synthesizeRuntimeArgument<cudaq::state *>(
-              builder, argument, args, offset, sizeof(void *),
-              [=](OpBuilder &builder, cudaq::state **concrete) {
-                Value rawPtr = builder.create<arith::ConstantIntOp>(
-                    loc, reinterpret_cast<std::intptr_t>(*concrete),
-                    sizeof(void *) * 8);
-                auto stateTy = cudaq::cc::StateType::get(builder.getContext());
-                return builder.create<cudaq::cc::CastOp>(
-                    loc, cudaq::cc::PointerType::get(stateTy), rawPtr);
-              });
+          if (sameAddressSpace) {
+            // Special case of a `cudaq::state*` which must be in the same
+            // address space. This references a container to a set of simulation
+            // amplitudes.
+            synthesizeRuntimeArgument<cudaq::state *>(
+                builder, argument, args, offset, sizeof(void *),
+                [=](OpBuilder &builder, cudaq::state **concrete) {
+                  Value rawPtr = builder.create<arith::ConstantIntOp>(
+                      loc, reinterpret_cast<std::intptr_t>(*concrete),
+                      sizeof(void *) * 8);
+                  auto stateTy =
+                      cudaq::cc::StateType::get(builder.getContext());
+                  return builder.create<cudaq::cc::CastOp>(
+                      loc, cudaq::cc::PointerType::get(stateTy), rawPtr);
+                });
+          } else if (getStateData != nullptr) {
+            // Special case of running on a simulator in a different address
+            // space, when we know how to convert state to data.
+            cudaq::state *concrete;
+            std::memcpy(&concrete, ((const char *)args) + offset,
+                        sizeof(cudaq::state *));
+            auto stateData = getStateData(concrete);
+            if (failed(synthesizeStateArgument(builder, module, counter,
+                                               argument, stateData)))
+              module.emitError("Failed to synthesize state*");
+          } else {
+            // All other cases are not yet supported (i.e. quantum hardware).
+            funcOp.emitOpError("synthesis: unsupported argument type on "
+                               "quantum devices: state*");
+            signalPassFailure();
+          }
           continue;
         }
         // N.B. Other pointers will not be materialized and may be in a
@@ -761,8 +876,9 @@ std::unique_ptr<mlir::Pass> cudaq::opt::createQuakeSynthesizer() {
   return std::make_unique<QuakeSynthesizer>();
 }
 
-std::unique_ptr<mlir::Pass>
-cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, const void *a,
-                                   std::size_t startingArgIdx) {
-  return std::make_unique<QuakeSynthesizer>(kernelName, a, startingArgIdx);
+std::unique_ptr<mlir::Pass> cudaq::opt::createQuakeSynthesizer(
+    std::string_view kernelName, const void *a, std::size_t startingArgIdx,
+    SimulationStateData::getDataFunc *getData, bool sameAddressSpace) {
+  return std::make_unique<QuakeSynthesizer>(kernelName, a, startingArgIdx,
+                                            getData, sameAddressSpace);
 }
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 8496199d15..353e36bcd4 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -511,7 +511,8 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
       getEnvBool("CUDAQ_MLIR_PRINT_EACH_PASS", false);
 
   PassManager pm(context);
-  pm.addPass(cudaq::opt::createQuakeSynthesizer(name, rawArgs));
+  pm.addPass(
+      cudaq::opt::createQuakeSynthesizer(name, rawArgs, 0, nullptr, true));
   pm.addPass(createCanonicalizerPass());
 
   // Run state preparation for quantum devices only.
diff --git a/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py b/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
index 0e21a5bf88..9680a8f3bc 100644
--- a/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
+++ b/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
@@ -166,7 +166,6 @@ def kernel(vec: List[complex]):
 
 
 def test_arbitrary_unitary_synthesis():
-    import numpy as np
     cudaq.register_operation("custom_h",
                              1. / np.sqrt(2.) * np.array([1, 1, 1, -1]))
 
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index b2bb752962..2ea79e3f91 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -23,6 +23,7 @@
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
 #include "cudaq/Optimizer/Transforms/Passes.h"
+#include "cudaq/Optimizer/Transforms/SimulationData.h"
 #include "cudaq/Support/Plugin.h"
 #include "cudaq/Support/TargetConfig.h"
 #include "cudaq/platform/qpu.h"
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index e224d827e8..79ef3a5043 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -92,6 +92,26 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
                        });
   }
 
+  /// Copy the amplitudes of `s` into a newly allocated host array of the
+  /// state's own precision; the returned object takes ownership of the array.
+  static SimulationStateData readSimulationStateData(cudaq::state *s) {
+    auto copyToHost = [s](auto tag) {
+      using Element = decltype(tag); // `tag` only selects the element type
+      const std::size_t numElements = s->get_tensor().get_num_elements();
+      auto *hostData = new Element[numElements];
+      try {
+        s->to_host(hostData, numElements);
+      } catch (...) {
+        delete[] hostData; // nothing owns the array yet: do not leak it
+        throw;
+      }
+      return SimulationStateData(hostData, numElements, sizeof(Element));
+    };
+    if (s->get_precision() == SimulationState::precision::fp32)
+      return copyToHost(std::complex<float>{});
+    return copyToHost(std::complex<double>{});
+  }
+
 public:
   virtual void setConfig(
       const std::unordered_map<std::string, std::string> &configs) override {
@@ -172,8 +192,11 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
       if (args) {
         cudaq::info("Run Quake Synth.\n");
         mlir::PassManager pm(&mlirContext);
-        pm.addPass(
-            cudaq::opt::createQuakeSynthesizer(name, args, startingArgIdx));
+        // For efficiency, we don't run state prep to convert states to gates on
+        // remote simulators, instead we synthesize states as vectors.
+        // Pass the data reader function to the synthesizer for this purpose.
+        pm.addPass(cudaq::opt::createQuakeSynthesizer(
+            name, args, startingArgIdx, readSimulationStateData));
         pm.addPass(mlir::createCanonicalizerPass());
         if (failed(pm.run(moduleOp)))
           throw std::runtime_error("Could not successfully apply quake-synth.");
diff --git a/targettests/Remote-Sim/state_init.cpp b/targettests/Remote-Sim/state_init.cpp
new file mode 100644
index 0000000000..735cb16f43
--- /dev/null
+++ b/targettests/Remote-Sim/state_init.cpp
@@ -0,0 +1,54 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// REQUIRES: remote-sim
+
+// clang-format off
+// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu %s -o %t && %t | FileCheck %s
+// clang-format on
+
+#include <cudaq.h>
+#include <iostream>
+
+// Kernel under test: builds a `cudaq::qvector` from the state passed by pointer.
+__qpu__ void test_complex_array_param(cudaq::state* inState) {
+  cudaq::qvector q1(inState);
+}
+
+// Print each bitstring of `result` on its own line, sorted so the output
+// order does not depend on how `result` iterates.
+void printCounts(cudaq::sample_result& result) {
+  std::vector<std::string> sortedBits;
+  for (auto &&[bitString, count] : result)
+    sortedBits.push_back(bitString);
+  std::sort(sortedBits.begin(), sortedBits.end());
+
+  for (const auto &bitString : sortedBits)
+    std::cout << bitString << '\n';
+}
+
+// Sample the kernel once for each of two 8-amplitude states passed by pointer.
+int main() {
+  std::vector<cudaq::complex> lowAmps{M_SQRT1_2, M_SQRT1_2, 0., 0.,  0., 0.,  0., 0.};
+  std::vector<cudaq::complex> highAmps{0., 0.,  0., 0., 0., 0., M_SQRT1_2, M_SQRT1_2};
+  auto lowState = cudaq::state::from_data(lowAmps);
+  auto highState = cudaq::state::from_data(highAmps);
+
+  // Passing state data as argument (kernel mode)
+  auto lowCounts = cudaq::sample(test_complex_array_param, &lowState);
+  printCounts(lowCounts);
+
+  auto highCounts = cudaq::sample(test_complex_array_param, &highState);
+  printCounts(highCounts);
+}
+
+// CHECK: 000
+// CHECK: 100
+
+// CHECK: 011
+// CHECK: 111
diff --git a/targettests/Remote-Sim/state_init_vector.cpp b/targettests/Remote-Sim/state_init_vector.cpp
new file mode 100644
index 0000000000..b8d6bdb3bb
--- /dev/null
+++ b/targettests/Remote-Sim/state_init_vector.cpp
@@ -0,0 +1,213 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// REQUIRES: remote-sim
+
+// clang-format off
+// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu %s -o %t && %t | FileCheck %s
+// clang-format on
+
+#include <cudaq.h>
+#include <iostream>
+
+__qpu__ void test_complex_constant_array() {
+   cudaq::qvector v(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+}
+
+__qpu__ void test_complex_constant_array2() {
+   cudaq::qvector v1(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+   cudaq::qvector v2(std::vector<cudaq::complex>({ 0., 0., M_SQRT1_2, M_SQRT1_2}));
+}
+
+__qpu__ void test_complex_constant_array3() {
+   cudaq::qvector v({
+    cudaq::complex(M_SQRT1_2),
+    cudaq::complex(M_SQRT1_2),
+    cudaq::complex(0.0),
+    cudaq::complex(0.0)
+  });
+}
+
+__qpu__ void test_complex_array_param(std::vector<cudaq::complex> inState) {
+  cudaq::qvector q1 = inState;
+}
+
+__qpu__ void test_real_constant_array() {
+  cudaq::qvector v({ M_SQRT1_2, M_SQRT1_2, 0., 0.});
+}
+
+__qpu__ void test_real_array_param(std::vector<cudaq::real> inState) {
+  cudaq::qvector q1 = inState;
+}
+
+__qpu__ void test_double_array_param(std::vector<double> inState) {
+  cudaq::qvector q = inState;
+}
+
+__qpu__ void test_float_array_param(std::vector<float> inState) {
+  cudaq::qvector q = inState;
+}
+
+// Print each bitstring of `result` on its own line, sorted so the output
+// order does not depend on how `result` iterates.
+void printCounts(cudaq::sample_result& result) {
+  std::vector<std::string> sortedBits;
+  for (auto &&[bitString, count] : result)
+    sortedBits.push_back(bitString);
+  std::sort(sortedBits.begin(), sortedBits.end());
+
+  for (const auto &bitString : sortedBits)
+    std::cout << bitString << '\n';
+}
+
+int main() {
+    {
+      auto counts = cudaq::sample(test_complex_constant_array);
+      printCounts(counts);
+    }
+
+// CHECK: 00
+// CHECK: 10
+
+    {
+      auto counts = cudaq::sample(test_complex_constant_array2);
+      printCounts(counts);
+    }
+
+// CHECK: 0001
+// CHECK: 0011
+// CHECK: 1001
+// CHECK: 1011
+
+    {
+      auto counts = cudaq::sample(test_complex_constant_array3);
+      printCounts(counts);
+    }
+
+// CHECK: 00
+// CHECK: 10
+
+    {
+      auto counts = cudaq::sample(test_real_constant_array);
+      printCounts(counts);
+    }
+
+// CHECK: 00
+// CHECK: 10
+
+    {
+      std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+      std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+      {
+          // Passing state data as argument (kernel mode)
+          auto counts = cudaq::sample(test_complex_array_param, vec);
+          printCounts(counts);
+
+          counts = cudaq::sample(test_complex_array_param, vec1);
+          printCounts(counts);
+      }
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 01
+// CHECK: 11
+
+      {
+          // Passing state data as argument (builder mode)
+          auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
+          auto qubits = kernel.qalloc(v);
+
+          auto counts = cudaq::sample(kernel, vec);
+          printCounts(counts);
+
+          counts = cudaq::sample(kernel, vec1);
+          printCounts(counts);
+      }
+    }
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 01
+// CHECK: 11
+
+    {
+      std::vector<cudaq::real> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+      std::vector<cudaq::real> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+      {
+          // Passing state data as argument (kernel mode)
+          auto counts = cudaq::sample(test_real_array_param, vec);
+          printCounts(counts);
+
+          counts = cudaq::sample(test_real_array_param, vec1);
+          printCounts(counts);
+      }
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 01
+// CHECK: 11
+
+      {
+          // Passing state data as argument (builder mode)
+          auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::real>>();
+          auto qubits = kernel.qalloc(v);
+
+          auto counts = cudaq::sample(kernel, vec);
+          printCounts(counts);
+
+          counts = cudaq::sample(kernel, vec1);
+          printCounts(counts);
+      }
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 01
+// CHECK: 11
+    }
+
+    {
+      std::vector<double> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+      std::vector<double> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+
+      // Passing state data as argument (kernel mode)
+      auto counts = cudaq::sample(test_double_array_param, vec);
+      printCounts(counts);
+
+      counts = cudaq::sample(test_double_array_param, vec1);
+      printCounts(counts);
+    }
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 01
+// CHECK: 11
+
+    {
+      std::vector<float> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+      std::vector<float> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+
+      // Passing state data as argument (kernel mode)
+      auto counts = cudaq::sample(test_float_array_param, vec);
+      printCounts(counts);
+
+      counts = cudaq::sample(test_float_array_param, vec1);
+      printCounts(counts);
+    }
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 01
+// CHECK: 11
+}
+
diff --git a/unittests/Optimizer/QuakeSynthTester.cpp b/unittests/Optimizer/QuakeSynthTester.cpp
index c7e6bce0c5..a74a1c82bb 100644
--- a/unittests/Optimizer/QuakeSynthTester.cpp
+++ b/unittests/Optimizer/QuakeSynthTester.cpp
@@ -54,7 +54,8 @@ LogicalResult runQuakeSynth(std::string_view kernelName, void *rawArgs,
   PassManager pm(module->getContext());
   module->getContext()->disableMultithreading();
   pm.enableIRPrinting();
-  pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, rawArgs));
+  pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, rawArgs, 0, nullptr,
+                                                true));
   pm.addPass(createCanonicalizerPass());
   pm.addPass(cudaq::opt::createExpandMeasurementsPass());
   pm.addNestedPass<func::FuncOp>(cudaq::opt::createClassicalMemToReg());
@@ -328,6 +329,66 @@ TEST(QuakeSynthTests, checkVectorOfInt) {
   EXPECT_EQ(countz.size(), 1);
 }
 
+TEST(QuakeSynthTests, checkStatePointerLocalSim) {
+  auto [kernel, thetas] = cudaq::make_kernel<cudaq::state *>();
+  auto theta = thetas[0];
+  auto phi = thetas[1];
+  auto q = kernel.qalloc(3);
+  kernel.x(q[0]);
+  kernel.ry(theta, q[1]);
+  kernel.ry(phi, q[2]);
+  kernel.x<cudaq::ctrl>(q[2], q[0]);
+  kernel.x<cudaq::ctrl>(q[0], q[1]);
+  kernel.ry(-theta, q[1]);
+  kernel.x<cudaq::ctrl>(q[0], q[1]);
+  kernel.x<cudaq::ctrl>(q[1], q[0]);
+  // NOTE(review): argument is a `cudaq::state *` but is indexed as angles.
+  std::cout << kernel.to_quake() << '\n';
+
+  // Set the proper name for the kernel
+  auto properName = cudaq::runtime::cudaqGenPrefixName + kernel.name();
+
+  using namespace cudaq::spin;
+  cudaq::spin_op h = 5.907 - 2.1433 * x(0) * x(1) - 2.1433 * y(0) * y(1) +
+                     .21829 * z(0) - 6.125 * z(1);
+  cudaq::spin_op h3 = h + 9.625 - 9.625 * z(2) - 3.913119 * x(1) * x(2) -
+                      3.913119 * y(1) * y(2);
+  // Reference energy: observe the kernel directly with the state argument.
+  cudaq::state state = cudaq::state::from_data(
+      std::vector<std::complex<double>>({.3591, .2569}));
+  double energy = cudaq::observe(kernel, h3, &state);
+  EXPECT_NEAR(energy, -2.045375, 1e-3);
+
+  // Map the kernel_builder to_quake output to MLIR
+  auto context = cudaq::initializeMLIR();
+  auto module = parseSourceString<ModuleOp>(kernel.to_quake(), context.get());
+  // NOTE(review): raw args come from a vector<double>, not a state* — confirm.
+  // Create a struct defining the runtime args for the kernel
+  auto [args, offset] =
+      cudaq::mapToRawArgs(kernel.name(), std::vector<double>{.3591, .2569});
+
+  // Run quake-synth
+  EXPECT_TRUE(succeeded(runQuakeSynth(kernel.name(), args, module)));
+
+  // Get the function, make sure that it has no arguments
+  auto func = module->lookupSymbol<func::FuncOp>(properName);
+  EXPECT_TRUE(func);
+  EXPECT_TRUE(func.getArguments().empty());
+  // Dump the synthesized function for inspection.
+  func.dump();
+
+  // Lower to LLVM and create the JIT execution engine
+  EXPECT_TRUE(succeeded(lowerToLLVMDialect(*module)));
+  auto jitOrError = ExecutionEngine::create(*module);
+  EXPECT_TRUE(!!jitOrError);
+  std::unique_ptr<ExecutionEngine> jit = std::move(jitOrError.get());
+
+  // Observe this new kernel processed with quake synth
+  energy = observeJitCode(jit.get(), h3, kernel.name());
+  // Should see the same thing as before.
+  EXPECT_NEAR(energy, -2.045375, 1e-3);
+}
+
 TEST(QuakeSynthTests, checkCallable) {
   auto [ansatz, thetas] = cudaq::make_kernel<std::vector<double>>();
   auto q = ansatz.qalloc(2);