diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index 996b6e56a7..422032326c 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -40,6 +40,8 @@ std::unique_ptr<mlir::Pass> createLowerToCFGPass();
 std::unique_ptr<mlir::Pass> createObserveAnsatzPass(std::vector<bool> &);
 std::unique_ptr<mlir::Pass> createQuakeAddMetadata();
 std::unique_ptr<mlir::Pass> createQuakeAddDeallocs();
+std::unique_ptr<mlir::Pass> createStatePreparation();
+std::unique_ptr<mlir::Pass> createStatePreparation(std::string_view, void *);
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer();
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, void *);
 std::unique_ptr<mlir::Pass> createRaiseToAffinePass();
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 8d2f0c1821..e5e15a8776 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -512,6 +512,17 @@ def PruneCtrlRelations : Pass<"pruned-ctrl-form", "mlir::func::FuncOp"> {
   }];
 }
 
+def PrepareState : Pass<"state-prep", "mlir::ModuleOp"> {
+  let summary =
+    "Convert state vector data into gates";
+  let description = [{
+    Convert quake representation that includes qubit initialization
+    from data into qubit initialization using gates.
+  }];
+
+  let constructor = "cudaq::opt::createStatePreparation()";
+}
+
 def QuakeSynthesize : Pass<"quake-synth", "mlir::ModuleOp"> {
   let summary =
     "Synthesize concrete quantum program from Quake code plus runtime values.";
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index 7600efe276..173cec4538 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -42,6 +42,8 @@ add_cudaq_library(OptTransforms
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
   RegToMem.cpp
+  StateDecomposer.cpp
+  StatePreparation.cpp
   PySynthCallableBlockArgs.cpp
 
   DEPENDS
diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp
index 625e9727c6..0244adb3d3 100644
--- a/lib/Optimizer/Transforms/GenKernelExecution.cpp
+++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp
@@ -1220,7 +1220,6 @@ class GenerateKernelExecution
         }
         continue;
       }
-
       stVal = builder.create<cudaq::cc::InsertValueOp>(loc, stVal.getType(),
                                                        stVal, arg, idx);
     }
diff --git a/lib/Optimizer/Transforms/StateDecomposer.cpp b/lib/Optimizer/Transforms/StateDecomposer.cpp
new file mode 100644
index 0000000000..3105fad707
--- /dev/null
+++ b/lib/Optimizer/Transforms/StateDecomposer.cpp
@@ -0,0 +1,128 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "StateDecomposer.h"
+
+namespace cudaq::details {
+
+std::vector<std::size_t> grayCode(std::size_t numBits) {
+  std::vector<std::size_t> result(1ULL << numBits);
+  for (std::size_t i = 0; i < (1ULL << numBits); ++i)
+    result[i] = ((i >> 1) ^ i);
+  return result;
+}
+
+std::vector<std::size_t> getControlIndices(std::size_t numBits) {
+  auto code = grayCode(numBits);
+  std::vector<std::size_t> indices;
+  for (auto i = 0u; i < code.size(); ++i) {
+    // The position of the control in the lth CNOT gate is set to match
+    // the position where the lth and (l + 1)th bit strings g[l] and g[l+1] of
+    // the binary reflected Gray code differ.
+    auto position = std::log2(code[i] ^ code[(i + 1) % code.size()]);
+    // N.B: In CUDA Quantum we write the least significant bit (LSb) on the left
+    //
+    //  lsb -v
+    //       001
+    //         ^- msb
+    //
+    // Meaning that the bitstring 001 represents the number four instead of one.
+    // The above position calculation uses the 'normal' convention of writing
+    // numbers with the LSb on the left.
+    //
+    // Now, what we need to find out is the position of the 1 in the bitstring.
+    // If we take LSb as being position 0, then for the normal convention its
+    // position will be 0. Using CUDA Quantum convention it will be 2. Hence,
+    // we need to convert the position we find using:
+    //
+    // numBits - position - 1
+    //
+    // The extra -1 is to account for indices starting at 0. Using the above
+    // examples:
+    //
+    // bitstring: 001
+    // numBits: 3
+    // position: 0
+    //
+    // We have the converted position: 2, which is what we need.
+    indices.emplace_back(numBits - position - 1);
+  }
+  return indices;
+}
+
+std::vector<double> convertAngles(const std::span<double> alphas) {
+  // Implements Eq. (3) from https://arxiv.org/pdf/quant-ph/0407010.pdf
+  //
+  // N.B: The paper does fails to explicitly define what is the dot operator in
+  // the exponent of -1. Ref. 3 solves the mystery: its the bitwise inner
+  // product.
+  auto bitwiseInnerProduct = [](std::size_t a, std::size_t b) {
+    auto product = a & b;
+    auto sumOfProducts = 0;
+    while (product) {
+      sumOfProducts += product & 0b1 ? 1 : 0;
+      product = product >> 1;
+    }
+    return sumOfProducts;
+  };
+  std::vector<double> thetas(alphas.size(), 0);
+  for (std::size_t i = 0u; i < alphas.size(); ++i) {
+    for (std::size_t j = 0u; j < alphas.size(); ++j)
+      thetas[i] +=
+          bitwiseInnerProduct(j, ((i >> 1) ^ i)) & 0b1 ? -alphas[j] : alphas[j];
+    thetas[i] /= alphas.size();
+  }
+  return thetas;
+}
+
+std::vector<double> getAlphaZ(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k) {
+  // Implements Eq. (5) from https://arxiv.org/pdf/quant-ph/0407010.pdf
+  std::vector<double> angles;
+  double divisor = static_cast<double>(1ULL << (k - 1));
+  for (std::size_t j = 1; j <= (1ULL << (numQubits - k)); ++j) {
+    double angle = 0.0;
+    for (std::size_t l = 1; l <= (1ULL << (k - 1)); ++l)
+      // N.B: There is an extra '-1' on these indices computations to account
+      // for the fact that our indices start at 0.
+      angle += data[(2 * j - 1) * (1 << (k - 1)) + l - 1] -
+               data[(2 * j - 2) * (1 << (k - 1)) + l - 1];
+    angles.push_back(angle / divisor);
+  }
+  return angles;
+}
+
+std::vector<double> getAlphaY(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k) {
+  // Implements Eq. (8) from https://arxiv.org/pdf/quant-ph/0407010.pdf
+  // N.B: There is an extra '-1' on these indices computations to account for
+  // the fact that our indices start at 0.
+  std::vector<double> angles;
+  for (std::size_t j = 1; j <= (1ULL << (numQubits - k)); ++j) {
+    double numerator = 0;
+    for (std::size_t l = 1; l <= (1ULL << (k - 1)); ++l) {
+      numerator +=
+          std::pow(std::abs(data[(2 * j - 1) * (1 << (k - 1)) + l - 1]), 2);
+    }
+
+    double denominator = 0;
+    for (std::size_t l = 1; l <= (1ULL << k); ++l) {
+      denominator += std::pow(std::abs(data[(j - 1) * (1 << k) + l - 1]), 2);
+    }
+
+    if (denominator == 0.0) {
+      assert(numerator == 0.0 &&
+             "If the denominator is zero, the numerator must also be zero.");
+      angles.push_back(0.0);
+      continue;
+    }
+    angles.push_back(2.0 * std::asin(std::sqrt(numerator / denominator)));
+  }
+  return angles;
+}
+} // namespace cudaq::details
\ No newline at end of file
diff --git a/lib/Optimizer/Transforms/StateDecomposer.h b/lib/Optimizer/Transforms/StateDecomposer.h
new file mode 100644
index 0000000000..a698ac83c2
--- /dev/null
+++ b/lib/Optimizer/Transforms/StateDecomposer.h
@@ -0,0 +1,177 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "PassDetails.h"
+#include "cudaq/Optimizer/Builder/Runtime.h"
+#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Target/LLVMIR/TypeToLLVM.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include <span>
+
+#include <iostream>
+
+namespace cudaq::details {
+
+/// @brief Converts angles of a uniformly controlled rotation to angles of
+/// non-controlled rotations.
+std::vector<double> convertAngles(const std::span<double> alphas);
+
+/// @brief Return the control indices dictated by the gray code implementation.
+///
+/// Here, numBits is the number of controls.
+std::vector<std::size_t> getControlIndices(std::size_t numBits);
+
+/// @brief Return angles required to implement a uniformly controlled z-rotation
+/// on the `kth` qubit.
+std::vector<double> getAlphaZ(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k);
+
+/// @brief Return angles required to implement a uniformly controlled y-rotation
+/// on the `kth` qubit.
+std::vector<double> getAlphaY(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k);
+} // namespace cudaq::details
+
+class StateGateBuilder {
+public:
+  StateGateBuilder(mlir::OpBuilder &b, mlir::Location &l, mlir::Value &q)
+      : builder(b), loc(l), qubits(q) {}
+
+  template <typename Op>
+  void applyRotationOp(double theta, std::size_t target) {
+    auto qubit = createQubitRef(target);
+    auto thetaValue = createAngleValue(theta);
+    builder.create<Op>(loc, thetaValue, mlir::ValueRange{}, qubit);
+  };
+
+  void applyX(std::size_t control, std::size_t target) {
+    auto qubitC = createQubitRef(control);
+    auto qubitT = createQubitRef(target);
+    builder.create<quake::XOp>(loc, qubitC, qubitT);
+  };
+
+private:
+  mlir::Value createQubitRef(std::size_t index) {
+    if (qubitRefs.contains(index)) {
+      return qubitRefs[index];
+    }
+
+    auto indexValue = builder.create<mlir::arith::ConstantIntOp>(
+        loc, index, builder.getIntegerType(64));
+    auto ref = builder.create<quake::ExtractRefOp>(loc, qubits, indexValue);
+    qubitRefs[index] = ref;
+    return ref;
+  }
+
+  mlir::Value createAngleValue(double angle) {
+    return builder.create<mlir::arith::ConstantFloatOp>(
+        loc, llvm::APFloat{angle}, builder.getF64Type());
+  }
+
+  mlir::OpBuilder &builder;
+  mlir::Location &loc;
+  mlir::Value &qubits;
+
+  std::unordered_map<std::size_t, mlir::Value> qubitRefs =
+      std::unordered_map<std::size_t, mlir::Value>();
+};
+
+class StateDecomposer {
+public:
+  StateDecomposer(StateGateBuilder &b, std::span<std::complex<double>> a)
+      : builder(b), amplitudes(a), numQubits(log2(a.size())) {}
+
+  /// @brief Decompose the input state vector data to a set of controlled
+  /// operations and rotations. This function takes as input a `OpBuilder`
+  /// and appends the operations of the decomposition to its internal
+  /// representation. This implementation follows the algorithm defined in
+  /// `https://arxiv.org/pdf/quant-ph/0407010.pdf`.
+  void decompose() {
+
+    // Decompose the state into phases and magnitudes.
+    bool needsPhaseEqualization = false;
+    std::vector<double> phases;
+    std::vector<double> magnitudes;
+    for (const auto &a : amplitudes) {
+      phases.push_back(std::arg(a));
+      magnitudes.push_back(std::abs(a));
+      // FIXME: remove magic number.
+      needsPhaseEqualization |= std::abs(phases.back()) > 1e-10;
+    }
+
+    // N.B: The algorithm, as described in the paper, creates a circuit that
+    // begins with a target state and brings it to the all zero state. Hence,
+    // this implementation do the two steps described in Section III in reverse
+    // order.
+
+    // Apply uniformly controlled y-rotations, the construction in Eq. (4).
+    for (std::size_t j = 1; j <= numQubits; ++j) {
+      auto k = numQubits - j + 1;
+      auto numControls = j - 1;
+      auto target = j - 1;
+      auto alphaYk = cudaq::details::getAlphaY(magnitudes, numQubits, k);
+      applyRotation<quake::RyOp>(alphaYk, numControls, target);
+    }
+
+    if (!needsPhaseEqualization)
+      return;
+
+    // Apply uniformly controlled z-rotations, the construction in Eq. (4).
+    for (std::size_t j = 1; j <= numQubits; ++j) {
+      auto k = numQubits - j + 1;
+      auto numControls = j - 1;
+      auto target = j - 1;
+      auto alphaZk = cudaq::details::getAlphaZ(phases, numQubits, k);
+      if (alphaZk.empty())
+        continue;
+      applyRotation<quake::RzOp>(alphaZk, numControls, target);
+    }
+  }
+
+private:
+  /// @brief Apply a uniformly controlled rotation on the target qubit.
+  template <typename Op>
+  void applyRotation(const std::span<double> alphas, std::size_t numControls,
+                     std::size_t target) {
+
+    // In our model the index 1 (i.e. |01>) in quantum state data
+    // corresponds to qubits[0]=1 and qubits[1] = 0.
+    // Revert the order of qubits as the state preparation algorithm
+    // we use assumes the opposite.
+    auto qubitIndex = [&](std::size_t i) { return numQubits - i - 1; };
+
+    auto thetas = cudaq::details::convertAngles(alphas);
+    if (numControls == 0) {
+      builder.applyRotationOp<Op>(thetas[0], qubitIndex(target));
+      return;
+    }
+
+    auto controlIndices = cudaq::details::getControlIndices(numControls);
+    assert(thetas.size() == controlIndices.size());
+    for (auto [i, c] : llvm::enumerate(controlIndices)) {
+      builder.applyRotationOp<Op>(thetas[i], qubitIndex(target));
+      builder.applyX(qubitIndex(c), qubitIndex(target));
+    }
+  }
+
+  StateGateBuilder &builder;
+  std::span<std::complex<double>> amplitudes;
+  std::size_t numQubits;
+};
diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
new file mode 100644
index 0000000000..785e70b3f8
--- /dev/null
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -0,0 +1,370 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "PassDetails.h"
+#include "StateDecomposer.h"
+#include "cudaq/Optimizer/Builder/Runtime.h"
+#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Target/LLVMIR/TypeToLLVM.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include <span>
+
+#define DEBUG_TYPE "state-preparation"
+
+using namespace mlir;
+
+/// Replace a qubit initialization from vectors with quantum gates.
+/// For example:
+///
+///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
+///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
+///     %1 = math.cttz %0 : i64
+///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) ->
+///     !cc.ptr<complex<f32>> %3 = quake.alloca !quake.veq<?>[%1 : i64] %4 =
+///     quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) ->
+///     !quake.veq<?> return
+///   }
+///
+/// On a call that passes std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0.,
+/// M_SQRT1_2} as arg0:
+///
+///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
+///     %0 = quake.alloca !quake.veq<2>
+///     %c0_i64 = arith.constant 0 : i64
+///     %1 = quake.extract_ref %0[%c0_i64] : (!quake.veq<2>, i64) -> !quake.ref
+///     %cst = arith.constant 1.5707963267948968 : f64
+///     quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+///     %c1_i64 = arith.constant 1 : i64
+///     %2 = quake.extract_ref %0[%c1_i64] : (!quake.veq<2>, i64) -> !quake.ref
+///     %cst_0 = arith.constant 1.5707963267948966 : f64
+///     quake.ry (%cst_0) %2 : (f64, !quake.ref) -> ()
+///     quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
+///     %cst_1 = arith.constant -1.5707963267948966 : f64
+///     quake.ry (%cst_1) %2 : (f64, !quake.ref) -> ()
+///     quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
+///     return
+///   }
+///
+/// Note: the following synthesis and const prop passes will replace
+/// the argument by a constant and propagate the values and vector size
+/// through other instructions.
+
+namespace {
+
+template <typename T>
+concept IntegralType =
+    std::is_same<T, bool>::value || std::is_same<T, std::int8_t>::value ||
+    std::is_same<T, std::int16_t>::value ||
+    std::is_same<T, std::int32_t>::value ||
+    std::is_same<T, std::int64_t>::value;
+
+template <typename T>
+concept FloatingType = std::is_same<T, float>::value;
+
+template <typename T>
+concept DoubleType = std::is_same<T, double>::value;
+
+template <typename T>
+concept ComplexDataType = FloatingType<T> || DoubleType<T> || IntegralType<T>;
+
+/// Input was complex<float> but we prefer
+/// complex<double>. Make a copy, extending the values.
+template <FloatingType From>
+std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
+                                                   std::uint64_t size) {
+  auto convertData = std::vector<std::complex<double>>(size);
+  for (std::size_t i = 0; i < size; ++i)
+    convertData[i] = std::complex<double>{static_cast<double>(data[i].real()),
+                                          static_cast<double>(data[i].imag())};
+  return convertData;
+}
+
+template <DoubleType From>
+std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
+                                                   std::uint64_t size) {
+  return std::vector<std::complex<From>>(data, data + size);
+}
+
+/// Input was float/double but we prefer complex<double>.
+/// Make a copy, extending or truncating the values.
+template <ComplexDataType From>
+std::vector<std::complex<double>> convertToComplex(From *data,
+                                                   std::uint64_t size) {
+  auto convertData = std::vector<std::complex<double>>(size);
+  for (std::size_t i = 0; i < size; ++i)
+    convertData[i] = std::complex<double>{static_cast<double>(data[i]),
+                                          static_cast<double>(0.0)};
+  return convertData;
+}
+
+LogicalResult
+prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
+                               unsigned &counter, BlockArgument argument,
+                               std::vector<std::complex<double>> &vec) {
+  auto *ctx = builder.getContext();
+  auto argLoc = argument.getLoc();
+
+  auto toErase = std::vector<mlir::Operation *>();
+
+  for (auto *argUser : argument.getUsers()) {
+    // Handle the `StdvecSize` and `quake.alloca` use case:
+    // - Replace a `vec.size()` with the vector length.
+    // - Replace the number of qubits calculation with the vector length
+    // logarithm.
+    // - Replace `quake.alloca` with a constant size qvector allocation.
+    if (auto stdvecSizeOp = dyn_cast<cudaq::cc::StdvecSizeOp>(argUser)) {
+      builder.setInsertionPointAfter(stdvecSizeOp);
+      Value length = builder.create<arith::ConstantIntOp>(
+          argLoc, vec.size(), stdvecSizeOp.getType());
+
+      Value numQubits = builder.create<arith::ConstantIntOp>(
+          argLoc, log2(vec.size()), stdvecSizeOp.getType());
+
+      for (auto *sizeUser : argUser->getUsers()) {
+        if (auto countZeroesOp =
+                dyn_cast<mlir::math::CountTrailingZerosOp>(sizeUser)) {
+          for (auto *numQubitsUser : sizeUser->getUsers()) {
+            if (auto quakeAllocaOp = dyn_cast<quake::AllocaOp>(numQubitsUser)) {
+              builder.setInsertionPointAfter(quakeAllocaOp);
+              auto veqTy = quake::VeqType::get(ctx, log2(vec.size()));
+              Value newAlloc = builder.create<quake::AllocaOp>(argLoc, veqTy);
+              quakeAllocaOp.replaceAllUsesWith(newAlloc);
+              toErase.push_back(quakeAllocaOp);
+            }
+          }
+          countZeroesOp.replaceAllUsesWith(numQubits);
+          toErase.push_back(countZeroesOp);
+        }
+      }
+
+      stdvecSizeOp.replaceAllUsesWith(length);
+      toErase.push_back(stdvecSizeOp);
+      continue;
+    }
+
+    // Handle the `StdvecDataOp` and `quake.init_state` use case:
+    // - Replace a `quake.init_state` with gates preparing the state.
+    if (auto stdvecDataOp = dyn_cast<cudaq::cc::StdvecDataOp>(argUser)) {
+      for (auto *dataUser : stdvecDataOp->getUsers()) {
+        if (auto initOp = dyn_cast<quake::InitializeStateOp>(dataUser)) {
+          builder.setInsertionPointAfter(initOp);
+          // Find the qvector alloc instruction
+          auto qubits = initOp.getOperand(0);
+
+          // Prepare state from vector data.
+          auto gateBuilder = StateGateBuilder(builder, argLoc, qubits);
+          auto decomposer = StateDecomposer(gateBuilder, vec);
+          decomposer.decompose();
+
+          initOp.replaceAllUsesWith(qubits);
+          toErase.push_back(initOp);
+        }
+      }
+    }
+  }
+
+  for (auto &op : toErase) {
+    op->erase();
+  }
+
+  return success();
+}
+
+class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
+protected:
+  // The name of the kernel to be synthesized
+  std::string kernelName;
+
+  // The raw pointer to the runtime arguments.
+  void *args;
+
+public:
+  StatePreparation() = default;
+  StatePreparation(std::string_view kernel, void *a)
+      : kernelName(kernel), args(a) {}
+
+  mlir::ModuleOp getModule() { return getOperation(); }
+
+  std::pair<std::size_t, std::vector<std::size_t>>
+  getTargetLayout(FunctionType funcTy) {
+    auto bufferTy = cudaq::opt::factory::buildInvokeStructType(funcTy);
+    StringRef dataLayoutSpec = "";
+    if (auto attr =
+            getModule()->getAttr(cudaq::opt::factory::targetDataLayoutAttrName))
+      dataLayoutSpec = cast<StringAttr>(attr);
+    auto dataLayout = llvm::DataLayout(dataLayoutSpec);
+    // Convert bufferTy to llvm.
+    llvm::LLVMContext context;
+    LLVMTypeConverter converter(funcTy.getContext());
+    cudaq::opt::initializeTypeConversions(converter);
+    auto llvmDialectTy = converter.convertType(bufferTy);
+    LLVM::TypeToLLVMIRTranslator translator(context);
+    auto *llvmStructTy =
+        cast<llvm::StructType>(translator.translateType(llvmDialectTy));
+    auto *layout = dataLayout.getStructLayout(llvmStructTy);
+    auto strSize = layout->getSizeInBytes();
+    std::vector<std::size_t> fieldOffsets;
+    for (std::size_t i = 0, I = bufferTy.getMembers().size(); i != I; ++i)
+      fieldOffsets.emplace_back(layout->getElementOffset(i));
+    return {strSize, fieldOffsets};
+  }
+
+  void runOnOperation() override final {
+    auto module = getModule();
+    unsigned counter = 0;
+
+    if (args == nullptr || kernelName.empty()) {
+      module.emitOpError(
+          "State preparation requires a kernel and the values of the "
+          "arguments passed when it is called.");
+      signalPassFailure();
+      return;
+    }
+
+    auto kernelNameInQuake = cudaq::runtime::cudaqGenPrefixName + kernelName;
+    // Get the function we care about (the one with kernelName)
+    auto funcOp = module.lookupSymbol<func::FuncOp>(kernelNameInQuake);
+    if (!funcOp) {
+      module.emitOpError("The kernel '" + kernelName +
+                         "' was not found in the module.");
+      signalPassFailure();
+      return;
+    }
+
+    // Create the builder.
+    auto builder = OpBuilder::atBlockBegin(&funcOp.getBody().front());
+    auto arguments = funcOp.getArguments();
+    auto structLayout = getTargetLayout(funcOp.getFunctionType());
+    // Keep track of the stdVec sizes.
+    std::vector<std::tuple<std::size_t, Type, std::uint64_t>> stdVecInfo;
+
+    for (auto iter : llvm::enumerate(arguments)) {
+      auto argNum = iter.index();
+      auto argument = iter.value();
+      std::size_t offset = structLayout.second[argNum];
+
+      // Get the argument type
+      auto type = argument.getType();
+
+      if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(type)) {
+        if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
+          funcOp.emitOpError(
+              "State preparation from cudaq::state is not supported.");
+          return;
+        }
+      }
+
+      // If std::vector<arithmetic> type, add it to the list of vector info.
+      // These will be processed when we reach the buffer's appendix.
+      if (auto vecTy = dyn_cast<cudaq::cc::StdvecType>(type)) {
+        auto eleTy = vecTy.getElementType();
+        if (!isa<IntegerType, FloatType, ComplexType>(eleTy)) {
+          continue;
+        }
+        char *ptrToSizeInBuffer = static_cast<char *>(args) + offset;
+        auto sizeFromBuffer =
+            *reinterpret_cast<std::uint64_t *>(ptrToSizeInBuffer);
+        unsigned bytesInType = [&eleTy]() {
+          if (auto complexTy = dyn_cast<ComplexType>(eleTy))
+            return 2 * cudaq::opt::convertBitsToBytes(
+                           complexTy.getElementType().getIntOrFloatBitWidth());
+          return cudaq::opt::convertBitsToBytes(eleTy.getIntOrFloatBitWidth());
+        }();
+        assert(bytesInType > 0 && "element must have a size");
+        auto vectorSize = sizeFromBuffer / bytesInType;
+        stdVecInfo.emplace_back(argNum, eleTy, vectorSize);
+        continue;
+      }
+    }
+
+    // For any `std::vector` arguments, we now know the sizes so let's replace
+    // the block arg with the actual vector element data. First get the pointer
+    // to the start of the buffer's appendix.
+    auto structSize = structLayout.first;
+    char *bufferAppendix = static_cast<char *>(args) + structSize;
+    for (auto [idx, eleTy, vecLength] : stdVecInfo) {
+      if (!eleTy) {
+        bufferAppendix += vecLength;
+        continue;
+      }
+      auto doVector = [&]<typename T>(T) {
+        auto *ptr = reinterpret_cast<T *>(bufferAppendix);
+        auto v = convertToComplex(ptr, vecLength);
+        if (failed(prepareStateFromVectorArgument(builder, module, counter,
+                                                  arguments[idx], v)))
+          funcOp.emitOpError("state preparation failed for vector<T>");
+        bufferAppendix += vecLength * sizeof(T);
+      };
+      if (auto ty = dyn_cast<IntegerType>(eleTy)) {
+        switch (ty.getIntOrFloatBitWidth()) {
+        case 1:
+          doVector(false);
+          break;
+        case 8:
+          doVector(std::int8_t{});
+          break;
+        case 16:
+          doVector(std::int16_t{});
+          break;
+        case 32:
+          doVector(std::int32_t{});
+          break;
+        case 64:
+          doVector(std::int64_t{});
+          break;
+        default:
+          bufferAppendix += vecLength * cudaq::opt::convertBitsToBytes(
+                                            ty.getIntOrFloatBitWidth());
+          funcOp.emitOpError(
+              "state preparation failed for vector<integral-type>.");
+          break;
+        }
+        continue;
+      }
+      if (eleTy == builder.getF32Type()) {
+        doVector(float{});
+        continue;
+      }
+      if (eleTy == builder.getF64Type()) {
+        doVector(double{});
+        continue;
+      }
+      if (eleTy == ComplexType::get(builder.getF32Type())) {
+        doVector(std::complex<float>{});
+        continue;
+      }
+      if (eleTy == ComplexType::get(builder.getF64Type())) {
+        doVector(std::complex<double>{});
+        continue;
+      }
+    }
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> cudaq::opt::createStatePreparation() {
+  return std::make_unique<StatePreparation>();
+}
+
+std::unique_ptr<mlir::Pass>
+cudaq::opt::createStatePreparation(std::string_view kernelName, void *a) {
+  return std::make_unique<StatePreparation>(kernelName, a);
+}
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 476ee187bb..f2da95a8f3 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -475,6 +475,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
 
   PassManager pm(context);
   pm.addPass(createCanonicalizerPass());
+  pm.addPass(cudaq::opt::createStatePreparation(name, rawArgs));
   pm.addPass(cudaq::opt::createQuakeSynthesizer(name, rawArgs));
   pm.addPass(cudaq::opt::createExpandMeasurementsPass());
   pm.addNestedPass<func::FuncOp>(cudaq::opt::createClassicalMemToReg());
diff --git a/python/tests/backends/test_Quantinuum_kernel.py b/python/tests/backends/test_Quantinuum_kernel.py
index de072335bf..fc11224f5e 100644
--- a/python/tests/backends/test_Quantinuum_kernel.py
+++ b/python/tests/backends/test_Quantinuum_kernel.py
@@ -7,6 +7,7 @@
 # ============================================================================ #
 
 import cudaq, pytest, os, time
+import numpy as np
 from cudaq import spin
 from multiprocessing import Process
 try:
@@ -169,6 +170,20 @@ def kernel():
     result = cudaq.sample(kernel)
 
 
+def test_quantinuum_state_preparation():
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        qubits = cudaq.qvector(vec)
+
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '00' in counts
+    assert '10' in counts
+    assert not '01' in counts
+    assert not '11' in counts
+
+
 # leave for gdb debugging
 if __name__ == "__main__":
     loc = os.path.abspath(__file__)
diff --git a/python/tests/kernel/test_kernel_qvector_init.py b/python/tests/kernel/test_kernel_qvector_init.py
index ddaeb6cc4d..28260dcb4d 100644
--- a/python/tests/kernel/test_kernel_qvector_init.py
+++ b/python/tests/kernel/test_kernel_qvector_init.py
@@ -5,11 +5,18 @@
 # This source code and the accompanying materials are made available under     #
 # the terms of the Apache License 2.0 which accompanies this distribution.     #
 # ============================================================================ #
+
+import os, sys
 import pytest
 
 import cudaq
 import numpy as np
 
+## [PYTHON_VERSION_FIX]
+skipIfPythonLessThan39 = pytest.mark.skipif(
+    sys.version_info < (3, 9),
+    reason="built-in collection types such as `list` not supported")
+
 skipIfNvidiaFP64NotInstalled = pytest.mark.skipif(
     not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia-fp64')),
     reason='Could not find nvidia-fp64 in installation')
@@ -18,30 +25,34 @@
     not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia')),
     reason='Could not find nvidia in installation')
 
+# state preparation and synthesis
 
-# float
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_float_params_f64():
 
+@skipIfPythonLessThan39
+def test_kernel_state_preparation():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
-    f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
+    c = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
 
     @cudaq.kernel
-    def kernel(vec: list[float]):
+    def kernel(vec: list[complex]):
         q = cudaq.qvector(vec)
 
-    counts = cudaq.sample(kernel, f)
-    print(counts)
-    assert '11' in counts
+    synthesized = cudaq.synthesize(kernel, c)
+    assert 'quake.init_state' in kernel.__str__()
+    assert not 'quake.init_state' in synthesized.__str__()
+
+    counts = cudaq.sample(synthesized)
     assert '00' in counts
+    assert '10' in counts
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_float_params_f32():
+# float
+
+
+@skipIfPythonLessThan39
+def test_kernel_float_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
 
@@ -156,33 +167,9 @@ def kernel():
 # complex
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_complex_params_rotate_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    c = [0. + 0j, 0., 0., 1.]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-        x(q.front())
-        y(q.back())
-        h(q)
-        mz(q)
-
-    counts = cudaq.sample(kernel, c)
-    print(f'rotate: {counts}')
-    assert '11' in counts
-    assert '00' in counts
-    assert '01' in counts
-    assert '10' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_complex_params_rotate_f32():
+@skipIfPythonLessThan39
+def test_kernel_complex_params_rotate():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     c = [0. + 0j, 0., 0., 1.]
 
@@ -202,27 +189,9 @@ def kernel(vec: list[complex]):
     assert '10' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_complex_params_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_complex_params_f32():
+@skipIfPythonLessThan39
+def test_kernel_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -337,10 +306,9 @@ def kernel():
 # np arrays
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_dtype_complex_params_f64():
+@skipIfPythonLessThan39
+def test_kernel_dtype_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -354,10 +322,9 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_dtype_complex128_params_f64():
+@skipIfPythonLessThan39
+def test_kernel_dtype_complex128_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -425,27 +392,9 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_amplitudes_complex_params_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    c = cudaq.amplitudes([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_amplitudes_complex_params_f32():
+@skipIfPythonLessThan39
+def test_kernel_amplitudes_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     c = cudaq.amplitudes([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
 
@@ -459,10 +408,9 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_amplitudes_complex_from_capture_f64():
+@skipIfPythonLessThan39
+def test_kernel_amplitudes_complex_from_capture():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
 
@@ -476,23 +424,6 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_amplitudes_complex_from_capture_f32():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia')
-
-    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(cudaq.amplitudes(vec))
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_simulation_dtype_np_array_from_capture_f64():
     cudaq.reset_target()
@@ -568,40 +499,9 @@ def kernel():
 # test errors
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_error_invalid_array_size_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    @cudaq.kernel
-    def kernel():
-        qubits = cudaq.qvector(np.array([1., 0., 0.], dtype=complex))
-
-    with pytest.raises(RuntimeError) as e:
-        counts = cudaq.sample(kernel)
-    assert 'Invalid input state size for qvector init (not a power of 2)' in repr(
-        e)
-
-
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_error_invalid_list_size_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    @cudaq.kernel
-    def kernel():
-        qubits = cudaq.qvector([1., 0., 0.])
-
-    with pytest.raises(RuntimeError) as e:
-        counts = cudaq.sample(kernel)
-    assert 'Invalid input state size for qvector init (not a power of 2)' in repr(
-        e)
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_error_invalid_array_size_f32():
+@skipIfPythonLessThan39
+def test_kernel_error_invalid_array_size():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     @cudaq.kernel
     def kernel():
@@ -613,10 +513,9 @@ def kernel():
         e)
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_error_invalid_list_size_f32():
+@skipIfPythonLessThan39
+def test_kernel_error_invalid_list_size():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     @cudaq.kernel
     def kernel():
@@ -629,6 +528,7 @@ def kernel():
 
 
 def test_kernel_qvector_init_from_param_int():
+    cudaq.reset_target()
 
     @cudaq.kernel
     def kernel(n: int):
@@ -643,6 +543,8 @@ def kernel(n: int):
 
 
 def test_kernel_qvector_init_from_capture_int():
+    cudaq.reset_target()
+
     n = 2
 
     @cudaq.kernel
@@ -658,6 +560,7 @@ def kernel():
 
 
 def test_kernel_qvector_init_from_int():
+    cudaq.reset_target()
 
     @cudaq.kernel
     def kernel():
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index aa36a0c62d..08f41e60ec 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -401,6 +401,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     if (updatedArgs) {
       cudaq::info("Run Quake Synth.\n");
       mlir::PassManager pm(&context);
+      pm.addPass(cudaq::opt::createStatePreparation(kernelName, updatedArgs));
       pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
       if (disableMLIRthreading || enablePrintMLIREachPass)
         moduleOp.getContext()->disableMultithreading();
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index 520c20410f..0b9e8d313e 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -153,6 +153,7 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
       if (args) {
         cudaq::info("Run Quake Synth.\n");
         mlir::PassManager pm(&mlirContext);
+        pm.addPass(cudaq::opt::createStatePreparation(name, args));
         pm.addPass(cudaq::opt::createQuakeSynthesizer(name, args));
         if (failed(pm.run(moduleOp)))
           throw std::runtime_error("Could not successfully apply quake-synth.");
diff --git a/targettests/execution/from_state_complex.cpp b/targettests/execution/from_state_complex.cpp
new file mode 100644
index 0000000000..5ca8813393
--- /dev/null
+++ b/targettests/execution/from_state_complex.cpp
@@ -0,0 +1,27 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
+
+#include <cudaq.h>
+
+__qpu__ void test(std::vector<cudaq::complex> inState) {
+  cudaq::qvector q = inState;
+}
+
+// CHECK: size 2
+
+int main() {
+  std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
+  auto counts = cudaq::sample(test, vec);
+  counts.dump();
+
+  printf("size %zu\n", counts.size());
+  return !(counts.size() == 2);
+}
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
new file mode 100644
index 0000000000..35a628c06a
--- /dev/null
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+
+#include <cudaq.h>
+#include <iostream>
+
+__qpu__ void test(std::vector<cudaq::complex> inState) {
+  cudaq::qvector q1 = inState;
+}
+
+void printCounts(cudaq::sample_result& result) {
+  std::vector<std::string> values{};
+  for (auto &&[bits, counts] : result) {
+    values.push_back(bits);
+  }
+
+  std::sort(values.begin(), values.end());
+  for (auto &&bits : values) {
+    std::cout << bits << '\n';
+  }
+}
+
+int main() {
+    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+    {
+        // Passing state data as argument (kernel mode)
+        auto counts = cudaq::sample(test, vec);
+        printCounts(counts);
+
+        counts = cudaq::sample(test, vec1);
+        printCounts(counts);
+    }
+
+    {
+        // Passing state data as argument (builder mode)
+        auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
+        auto qubits = kernel.qalloc(v);
+    
+        auto counts = cudaq::sample(kernel, vec);
+        printCounts(counts);
+    }
+}
+
+// CHECK: 00
+// CHECK: 10
+// CHECK: 01
+// CHECK: 11
+// CHECK: 00
+// CHECK: 10