From a6d5f4e7ec30a7c6e3391c8647f41e966ef7f1ab Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Mon, 17 Jun 2024 10:08:03 -0700
Subject: [PATCH 1/9] Add a pass for state preparation from vectors

---
 include/cudaq/Optimizer/Transforms/Passes.h   |   2 +
 include/cudaq/Optimizer/Transforms/Passes.td  |  11 ++
 lib/Optimizer/Transforms/CMakeLists.txt       |   1 +
 .../Transforms/GenKernelExecution.cpp         |  34 +++-
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp |  34 ++++
 lib/Optimizer/Transforms/StatePreparation.cpp | 119 +++++++++++++
 program.py                                    |  35 ++++
 .../cudaq/platform/py_alt_launch_kernel.cpp   |   1 +
 runtime/common/BaseRemoteRESTQPU.h            |   1 +
 runtime/common/BaseRestRemoteClient.h         |   1 +
 targettests/execution/from_state.cpp          |  30 ++++
 targettests/execution/from_state_complex.cpp  |  27 +++
 targettests/execution/program.cpp             | 167 ++++++++++++++++++
 13 files changed, 457 insertions(+), 6 deletions(-)
 create mode 100644 lib/Optimizer/Transforms/StatePreparation.cpp
 create mode 100644 program.py
 create mode 100644 targettests/execution/from_state.cpp
 create mode 100644 targettests/execution/from_state_complex.cpp
 create mode 100644 targettests/execution/program.cpp
diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index 996b6e56a7..422032326c 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -40,6 +40,8 @@ std::unique_ptr<mlir::Pass> createLowerToCFGPass();
 std::unique_ptr<mlir::Pass> createObserveAnsatzPass(std::vector<bool> &);
 std::unique_ptr<mlir::Pass> createQuakeAddMetadata();
 std::unique_ptr<mlir::Pass> createQuakeAddDeallocs();
+std::unique_ptr<mlir::Pass> createStatePreparation();
+std::unique_ptr<mlir::Pass> createStatePreparation(std::string_view, void *);
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer();
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, void *);
 std::unique_ptr<mlir::Pass> createRaiseToAffinePass();
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 8d2f0c1821..e5e15a8776 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -512,6 +512,17 @@ def PruneCtrlRelations : Pass<"pruned-ctrl-form", "mlir::func::FuncOp"> {
   }];
 }
 
+def PrepareState : Pass<"state-prep", "mlir::ModuleOp"> {
+  let summary =
+    "Convert state vector data into gates";
+  let description = [{
+    Convert quake representation that includes qubit initialization
+    from data into qubit initialization using gates.
+  }];
+
+  let constructor = "cudaq::opt::createStatePreparation()";
+}
+
 def QuakeSynthesize : Pass<"quake-synth", "mlir::ModuleOp"> {
   let summary =
     "Synthesize concrete quantum program from Quake code plus runtime values.";
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index 7600efe276..6a51057bd3 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -39,6 +39,7 @@ add_cudaq_library(OptTransforms
   ObserveAnsatz.cpp
   PruneCtrlRelations.cpp
   QuakeAddMetadata.cpp
+  StatePreparation.cpp
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
   RegToMem.cpp
diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp
index c16a4af7dd..68ef5b21b7 100644
--- a/lib/Optimizer/Transforms/GenKernelExecution.cpp
+++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp
@@ -434,8 +434,18 @@ class GenerateKernelExecution
         hasTrailingData = true;
         continue;
       }
-      if (isa<cudaq::cc::PointerType>(currEleTy) &&
-          !isStatePointerType(currEleTy)) {
+      //if (isa<cudaq::cc::PointerType>(currEleTy) &&
+      //    !isStatePointerType(currEleTy)) {
+      if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(currEleTy)) {
+        if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
+          // Special case: if the argument is a `cudaq::state*`, then just pass
+          // the pointer. We can do that in this case because the synthesis step
+          // (which will receive the argument data) is assumed to run in the
+          // same memory space.
+          argPtr = builder.create<cudaq::cc::CastOp>(loc, currEleTy, argPtr);
+          stVal = builder.create<cudaq::cc::InsertValueOp>(loc, stVal.getType(),
+                                                           stVal, argPtr, idx);
+        }
         continue;
       }
 
@@ -941,8 +951,8 @@ class GenerateKernelExecution
         cudaq::cc::numberOfHiddenArgs(hasThisPointer, hiddenSRet);
     if (count > 0 && args.size() >= count &&
         std::all_of(args.begin(), args.begin() + count, [](auto i) {
-          return isa<cudaq::cc::PointerType>(i.getType()) &&
-                 !isStatePointerType(i.getType());
+          return isa<cudaq::cc::PointerType>(i.getType());// &&
+                // !isStatePointerType(i.getType());
         }))
       return args.drop_front(count);
     return args;
@@ -1208,9 +1218,21 @@ class GenerateKernelExecution
         hasTrailingData = true;
         continue;
       }
-      if (isa<cudaq::cc::PointerType>(inTy) && !isStatePointerType(inTy))
+      //if (isa<cudaq::cc::PointerType>(inTy) && !isStatePointerType(inTy))
+      //  continue;
+      if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(inTy)) {
+        if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
+          // Special case: if the argument is a `cudaq::state*`, then just pass
+          // the pointer. We can do that in this case because the synthesis step
+          // (which will receive the argument data) is assumed to run in the
+          // same memory space.
+          Value argPtr = builder.create<cudaq::cc::CastOp>(loc, inTy, arg);
+          stVal = builder.create<cudaq::cc::InsertValueOp>(loc, stVal.getType(),
+                                                           stVal, argPtr, idx);
+        }
         continue;
-
+      }
+      
       stVal = builder.create<cudaq::cc::InsertValueOp>(loc, stVal.getType(),
                                                        stVal, arg, idx);
     }
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index f371a8b9cd..dbb2b00cc8 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -23,10 +23,19 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
 
+#include <iostream>
+
 #define DEBUG_TYPE "quake-synthesizer"
 
 using namespace mlir;
 
+// cudaq::state is defined in the runtime. The compiler will never need to know
+// about its implementation and there should not be a circular build/library
+// dependence because of it. Simply forward declare it, as it is notional.
+namespace cudaq {
+class state;
+}
+
 /// Replace a BlockArgument of a specific type with a concrete instantiation of
 /// that type, and add the generation of that constant as an MLIR Op to the
 /// beginning of the function. For example
@@ -366,7 +375,9 @@ class QuakeSynthesizer
   }
 
   void runOnOperation() override final {
+    std::cout << "Module before synthesis " << std::endl;
     auto module = getModule();
+    module.dump();
     if (args == nullptr || kernelName.empty()) {
       module.emitOpError("Synthesis requires a kernel and the values of the "
                          "arguments passed when it is called.");
@@ -472,6 +483,27 @@ class QuakeSynthesizer
         continue;
       }
 
+      if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(type)) {
+        if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
+          // Special case of a `cudaq::state*` which must be in the same address
+          // space. This references a container to a set of simulation
+          // amplitudes.
+          synthesizeRuntimeArgument<cudaq::state *>(
+              builder, argument, args, offset, sizeof(void *),
+              [=](OpBuilder &builder, cudaq::state **concrete) {
+                Value rawPtr = builder.create<arith::ConstantIntOp>(
+                    loc, reinterpret_cast<std::intptr_t>(*concrete),
+                    sizeof(void *) * 8);
+                auto stateTy = cudaq::cc::StateType::get(builder.getContext());
+                return builder.create<cudaq::cc::CastOp>(
+                    loc, cudaq::cc::PointerType::get(stateTy), rawPtr);
+              });
+          continue;
+        }
+        // N.B. Other pointers will not be materialized and may be in a
+        // different address space.
+      }
+
       // If std::vector<arithmetic> type, add it to the list of vector info.
       // These will be processed when we reach the buffer's appendix.
       if (auto vecTy = dyn_cast<cudaq::cc::StdvecType>(type)) {
@@ -601,6 +633,8 @@ class QuakeSynthesizer
       }
     }
     funcOp.eraseArguments(argsToErase);
+    std::cout << "Module after synthesis " << std::endl; 
+    module.dump();
   }
 };
 
diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
new file mode 100644
index 0000000000..d7868b46ef
--- /dev/null
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -0,0 +1,119 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "PassDetails.h"
+#include "cudaq/Optimizer/Builder/Runtime.h"
+#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Target/LLVMIR/TypeToLLVM.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/RegionUtils.h"
+
+#include <iostream>
+
+#define DEBUG_TYPE "state-preparation"
+
+using namespace mlir;
+
+/// Replace a qubit initialization from vectors with quantum gates.
+/// For example:
+///
+///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
+///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
+///     %1 = math.cttz %0 : i64
+///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) -> !cc.ptr<complex<f32>>
+///     %3 = quake.alloca !quake.veq<?>[%1 : i64]
+///     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
+///     return
+///   }
+///
+/// when called with std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0., M_SQRT1_2} as arg0,
+/// will be updated to:
+///
+///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
+///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
+///     %c4_i64 = arith.constant 4 : i64
+///     %3 = math.cttz %c4_i64 : i64
+///     %5 = quake.alloca !quake.veq<?>[%3 : i64]
+///     %6 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
+///     quake.h %6 : (!quake.ref) -> ()
+///     %7 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
+///     %8 = quake.extract_ref %5[1] : (!quake.veq<?>) -> !quake.ref
+///     quake.x [%7] %8 : (!quake.ref, !quake.ref) -> ()
+///   }
+///
+/// Note: we rely on the later synthesis and const prop stages to replace
+/// the argument by a constant and propagate the values and vector size
+/// through those and other instructions.
+
+namespace {
+/// Module pass that will replace data-based qubit initialization with gates.
+/// It currently only validates its inputs and locates the target kernel.
+class StatePreparation
+    : public cudaq::opt::PrepareStateBase<StatePreparation> {
+protected:
+  // The name of the kernel to be processed.
+  std::string kernelName;
+  // The raw pointer to the runtime arguments (null when none were given).
+  void *args = nullptr;
+
+public:
+  StatePreparation() = default;
+  StatePreparation(std::string_view kernel, void *a)
+      : kernelName(kernel), args(a) {}
+
+  mlir::ModuleOp getModule() { return getOperation(); }
+
+  void runOnOperation() override final {
+    auto module = getModule();
+    // Dump through the LLVM debug stream (see DEBUG_TYPE) instead of stdout.
+    LLVM_DEBUG(llvm::dbgs() << "Module before state prep\n"; module.dump());
+    if (args == nullptr || kernelName.empty()) {
+      module.emitOpError("State preparation requires a kernel and the values "
+                         "of the arguments passed when it is called.");
+      signalPassFailure();
+      return;
+    }
+
+    auto kernelNameInQuake = cudaq::runtime::cudaqGenPrefixName + kernelName;
+    // Get the function we care about (the one with kernelName)
+    auto funcOp = module.lookupSymbol<func::FuncOp>(kernelNameInQuake);
+    if (!funcOp) {
+      module.emitOpError("The kernel '" + kernelName +
+                         "' was not found in the module.");
+      signalPassFailure();
+      return;
+    }
+
+    // TODO: rewrite the quake.init_state ops in `funcOp` into gates.
+
+    LLVM_DEBUG(llvm::dbgs() << "Module after state prep\n"; module.dump());
+  }
+};
+
+} // namespace
+
+/// Default factory (used by the pass registration); no kernel or arguments.
+std::unique_ptr<mlir::Pass> cudaq::opt::createStatePreparation() {
+  return std::make_unique<StatePreparation>();
+}
+
+/// Factory binding the pass to a kernel name and its runtime arguments.
+std::unique_ptr<mlir::Pass>
+cudaq::opt::createStatePreparation(std::string_view kernelName, void *a) {
+  return std::make_unique<StatePreparation>(kernelName, a);
+}
diff --git a/program.py b/program.py
new file mode 100644
index 0000000000..e282d8cd5d
--- /dev/null
+++ b/program.py
@@ -0,0 +1,35 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+
+# Scratch script: build a cudaq.State from amplitude data, define a kernel
+# that initializes a qvector from a state argument, and print it and its QIR.
+import cudaq
+import numpy as np
+
+
+cudaq.reset_target()
+
+cudaq.set_target('nvidia')
+#cudaq.set_target('nvidia-mqpu')
+# cudaq.set_target('density-matrix-cpu')
+
+# Amplitudes of (|00> + |11>) / sqrt(2).
+c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
+             dtype=np.complex128)
+state = cudaq.State.from_data(c)
+
+@cudaq.kernel(verbose=True)
+def kernel(vec: cudaq.State):
+    q = cudaq.qvector(vec)
+
+print(kernel)
+print(cudaq.to_qir(kernel))
+
+#print(cudaq.get_target())
+#counts = cudaq.sample(kernel, state)
+#print(counts)
\ No newline at end of file
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 51f86ce15f..ff0c0ce477 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -474,6 +474,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
 
   PassManager pm(context);
   pm.addPass(createCanonicalizerPass());
+  pm.addPass(cudaq::opt::createStatePreparation(name, rawArgs));
   pm.addPass(cudaq::opt::createQuakeSynthesizer(name, rawArgs));
   pm.addPass(cudaq::opt::createExpandMeasurementsPass());
   pm.addNestedPass<func::FuncOp>(cudaq::opt::createClassicalMemToReg());
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index aa36a0c62d..08f41e60ec 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -401,6 +401,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     if (updatedArgs) {
       cudaq::info("Run Quake Synth.\n");
       mlir::PassManager pm(&context);
+      pm.addPass(cudaq::opt::createStatePreparation(kernelName, updatedArgs));
       pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
       if (disableMLIRthreading || enablePrintMLIREachPass)
         moduleOp.getContext()->disableMultithreading();
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index 17c235a76b..9325d0345d 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -153,6 +153,7 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
       if (args) {
         cudaq::info("Run Quake Synth.\n");
         mlir::PassManager pm(&mlirContext);
+        pm.addPass(cudaq::opt::createStatePreparation(name, args));
         pm.addPass(cudaq::opt::createQuakeSynthesizer(name, args));
         if (failed(pm.run(moduleOp)))
           throw std::runtime_error("Could not successfully apply quake-synth.");
diff --git a/targettests/execution/from_state.cpp b/targettests/execution/from_state.cpp
new file mode 100644
index 0000000000..55438848cb
--- /dev/null
+++ b/targettests/execution/from_state.cpp
@@ -0,0 +1,30 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
+
+#include <cudaq.h>
+#include "cudaq/builder/kernels.h"
+#include <iostream>
+
+__qpu__ void test(cudaq::state *inState) {
+  cudaq::qvector q(inState);
+}
+
+// CHECK: size 2
+
+int main() {
+  std::vector<std::complex<float>> vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
+  auto state = cudaq::state::from_data(vec);
+  auto counts = cudaq::sample(test, &state);
+  counts.dump();
+
+  printf("size %zu\n", counts.size());
+  return !(counts.size() == 2);
+}
diff --git a/targettests/execution/from_state_complex.cpp b/targettests/execution/from_state_complex.cpp
new file mode 100644
index 0000000000..5ca8813393
--- /dev/null
+++ b/targettests/execution/from_state_complex.cpp
@@ -0,0 +1,27 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
+
+#include <cudaq.h>
+
+__qpu__ void test(std::vector<cudaq::complex> inState) {
+  cudaq::qvector q = inState;
+}
+
+// CHECK: size 2
+
+int main() {
+  std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
+  auto counts = cudaq::sample(test, vec);
+  counts.dump();
+
+  printf("size %zu\n", counts.size());
+  return !(counts.size() == 2);
+}
diff --git a/targettests/execution/program.cpp b/targettests/execution/program.cpp
new file mode 100644
index 0000000000..b6a12ebb57
--- /dev/null
+++ b/targettests/execution/program.cpp
@@ -0,0 +1,167 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
+
+#include <cudaq.h>
+#include "cudaq/builder/kernels.h"
+#include <iostream>
+
+__qpu__ void test1(std::vector<cudaq::complex> inState) {
+    cudaq::qvector q1 = inState;
+    h(q1[0]);
+    cx(q1[0], q1[1]);
+
+}
+
+//  __qpu__ void test2(cudaq::state *inState) {
+//    cudaq::qvector q2(inState);
+//    cudaq::x(q2);
+// }
+
+// __qpu__ void test3() {
+//   auto q3 = cudaq::qvector({M_SQRT1_2, 0., 0., M_SQRT1_2});
+// }
+
+// error: /workspaces/cuda-quantum/lib/Frontend/nvqpp/ConvertExpr.cpp:1938: not yet implemented: unknown function, get_state, in cudaq namespace
+// __qpu__ void test4() {
+//   cudaq::qvector q(cudaq::get_state(test3));
+// }
+
+// error: /workspaces/cuda-quantum/lib/Frontend/nvqpp/ConvertExpr.cpp:392: not yet implemented: argument type conversion
+// __qpu__ void test5(cudaq::state *inState) {
+//   test2(inState);
+// }
+
+
+
+int main() {
+    std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
+
+    {
+        // Passing state data as argument (vector<complex>)
+
+        // Before synthesis:
+
+        // func.func @__nvqpp__mlirgen__function_test1._Z5test1St6vectorISt7complexIfESaIS1_EE(%arg0: !cc.stdvec<complex<f32>>) attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+        //     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
+        //     %1 = math.cttz %0 : i64
+        //     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) -> !cc.ptr<complex<f32>>
+        //     %3 = quake.alloca !quake.veq<?>[%1 : i64]
+        //     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
+        //     return
+        // }
+
+        // After synthesis
+
+        // func.func @__nvqpp__mlirgen__function_test1._Z5test1St6vectorISt7complexIfESaIS1_EE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+        //     %0 = cc.const_array [0.707106769 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.707106769 : f32, 0.000000e+00 : f32] : !cc.array<complex<f32> x 4>
+        //     %1 = cc.alloca !cc.array<complex<f32> x 4>
+        //     cc.store %0, %1 : !cc.ptr<!cc.array<complex<f32> x 4>>
+        //     %2 = cc.cast %1 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+        //     %c4_i64 = arith.constant 4 : i64
+        //     %3 = math.cttz %c4_i64 : i64                        // (TODO: replace by a const)
+        //     %4 = quake.alloca !quake.veq<?>[%3 : i64]
+        //     %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?> // TODO: replace by gates
+        //     return
+        // }
+
+        // TODO: in StatePreparation pass
+        // input - vector<double>, qubits
+        // output - MLIR replacing alloca+state_init instructions with gates on qubits
+
+        // %3 = math.cttz %c4_i64 : i64
+        // %4 = quake.alloca !quake.veq<?>[%3 : i64]
+        // %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
+
+        // => (something like)
+
+        // create a function that does the following and call it on qubits
+        // %6 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
+        // quake.ry (%cst) %6 : (f64, !quake.ref) -> ()
+        // ...
+
+        // TODO: Run state preparation pass before synthesis 
+
+        std::cout << "test1(vec): "  << "\n";
+        auto counts = cudaq::sample(test1, vec);
+        counts.dump();
+    }
+
+    // {
+    //     // Passing state ptr as argument - no support for from_data
+
+    //     // "func.func"() ({
+    //     // ^bb0(%arg0: !cc.ptr<!cc.state>):
+    //     //     %0 = "func.call"(%arg0) {callee = @__nvqpp_cudaq_state_numberOfQubits} : (!cc.ptr<!cc.state>) -> i64
+    //     //     %1 = "quake.alloca"(%0) : (i64) -> !quake.veq<?>
+    //     //     %2 = "quake.init_state"(%1, %arg0) : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+    //     //     "func.return"() : () -> ()
+    //     // }) {"cudaq-entrypoint", "cudaq-kernel", function_type = (!cc.ptr<!cc.state>) -> (), no_this, sym_name = "__nvqpp__mlirgen__function_test2._Z5test2PN5cudaq5stateE"} : () -> ()
+        
+    //     std::cout << "test2(state): "  << "\n";
+    //     auto state = cudaq::state::from_data(vec);
+
+    //     // 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
+    //     //auto counts = cudaq::sample(test2, &state);
+    //     //counts.dump();
+    // }
+
+    // {
+    //     // Passing a state from another kernel as argument
+
+    //     // "func.func"() ({
+    //     // ^bb0(%arg0: !cc.ptr<!cc.state>):
+    //     //     %0 = "func.call"(%arg0) {callee = @__nvqpp_cudaq_state_numberOfQubits} : (!cc.ptr<!cc.state>) -> i64
+    //     //     %1 = "quake.alloca"(%0) : (i64) -> !quake.veq<?>
+    //     //     %2 = "quake.init_state"(%1, %arg0) : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+    //     //     "func.return"() : () -> ()
+    //     // }) {"cudaq-entrypoint", "cudaq-kernel", function_type = (!cc.ptr<!cc.state>) -> (), no_this, sym_name = "__nvqpp__mlirgen__function_test2._Z5test2PN5cudaq5stateE"} : () -> ()
+        
+    //     std::cout << "test2(test3): "  << "\n";
+    //     auto state = cudaq::get_state(test3);
+
+    //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
+    //     auto counts = cudaq::sample(test2, &state);
+    //     counts.dump();
+    // }
+
+    // {
+    //     // Passing a state to another kernel as argument
+    //     std::cout << "test4(state): "  << "\n";
+    //     //auto state = cudaq::state::from_data(vec);
+    //     //auto counts = cudaq::sample(test4, &state);
+    // }
+
+    // {
+    //     // Creating a kernel from state and passing its state to another kernel
+
+    //     // "func.func"() ({
+    //     // ^bb0(%arg0: !cc.ptr<!cc.state>):
+    //     //     %0 = "func.call"(%arg0) {callee = @__nvqpp_cudaq_state_numberOfQubits} : (!cc.ptr<!cc.state>) -> i64
+    //     //     %1 = "quake.alloca"(%0) : (i64) -> !quake.veq<?>
+    //     //     %2 = "quake.init_state"(%1, %arg0) : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+    //     //     "func.return"() : () -> ()
+    //     // }) {"cudaq-entrypoint", "cudaq-kernel", function_type = (!cc.ptr<!cc.state>) -> (), no_this, sym_name = "__nvqpp__mlirgen__function_test2._Z5test2PN5cudaq5stateE"} : () -> ()
+        
+    //     std::cout << "test2(kernel): "  << "\n";
+    //     std::vector<std::complex<double>> vec{.70710678, 0., 0., 0.70710678};
+    //     auto kernel = cudaq::make_kernel();
+    //     auto qubits = kernel.qalloc(2);
+
+    //     cudaq::from_state(kernel, qubits, vec);
+
+    //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
+    //     //auto state = cudaq::get_state(kernel);
+    //     //auto counts = cudaq::sample(test2, &state);
+
+    //     //counts.dump();
+    // }
+
+}
\ No newline at end of file

From 93dd8d7f4ba31cc3869fd7fbaa399631c1cdaa97 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 25 Jun 2024 11:23:37 -0700
Subject: [PATCH 2/9] Implement state preparation

---
 lib/Optimizer/Transforms/CMakeLists.txt       |   1 +
 lib/Optimizer/Transforms/StateDecomposer.cpp  | 128 ++++++++++++++
 lib/Optimizer/Transforms/StateDecomposer.h    | 163 ++++++++++++++++++
 lib/Optimizer/Transforms/StatePreparation.cpp | 151 ++++++++++------
 targettests/execution/program.cpp             | 118 +++----------
 5 files changed, 415 insertions(+), 146 deletions(-)
 create mode 100644 lib/Optimizer/Transforms/StateDecomposer.cpp
 create mode 100644 lib/Optimizer/Transforms/StateDecomposer.h

diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index 6a51057bd3..b0a13571ec 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -39,6 +39,7 @@ add_cudaq_library(OptTransforms
   ObserveAnsatz.cpp
   PruneCtrlRelations.cpp
   QuakeAddMetadata.cpp
+  StateDecomposer.cpp
   StatePreparation.cpp
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
diff --git a/lib/Optimizer/Transforms/StateDecomposer.cpp b/lib/Optimizer/Transforms/StateDecomposer.cpp
new file mode 100644
index 0000000000..3105fad707
--- /dev/null
+++ b/lib/Optimizer/Transforms/StateDecomposer.cpp
@@ -0,0 +1,128 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "StateDecomposer.h"
+
+namespace cudaq::details {
+
+std::vector<std::size_t> grayCode(std::size_t numBits) {
+  std::vector<std::size_t> table(1ULL << numBits); // one entry per value
+  for (std::size_t n = 0; n < (1ULL << numBits); ++n)
+    table[n] = n ^ (n >> 1); // binary reflected Gray code of n
+  return table;
+}
+
+/// @brief Return the control indices dictated by the binary reflected Gray
+/// code over `numBits` controls: entry l is the control of the lth CNOT gate.
+///
+/// One index is produced per Gray code word (2^numBits in total, the last one
+/// wrapping around to the first word); `numBits == 0` yields an empty result.
+std::vector<std::size_t> getControlIndices(std::size_t numBits) {
+  std::vector<std::size_t> indices;
+  if (numBits == 0)
+    return indices;
+  const auto code = grayCode(numBits);
+  indices.reserve(code.size());
+  for (std::size_t i = 0; i < code.size(); ++i) {
+    // The position of the control in the lth CNOT gate is set to match
+    // the position where the lth and (l + 1)th bit strings g[l] and g[l+1] of
+    // the binary reflected Gray code differ. The two words differ in a single
+    // bit, whose position is found with integer shifts (no floating point).
+    std::size_t position = 0;
+    for (auto diff = code[i] ^ code[(i + 1) % code.size()]; diff > 1;
+         diff >>= 1)
+      ++position;
+    // N.B: In CUDA Quantum we write the least significant bit (LSb) on the
+    // left:
+    //
+    //  lsb -v
+    //       001
+    //         ^- msb
+    //
+    // Meaning that the bitstring 001 represents the number four instead of
+    // one. `position` above uses the 'normal' convention, which writes the LSb
+    // on the right and numbers it 0, so it is converted with
+    // `numBits - position - 1`; the extra -1 accounts for indices starting at
+    // 0. For example, with bitstring 001, numBits 3 and position 0 the
+    // converted position is 2, which is what we need.
+    indices.emplace_back(numBits - position - 1);
+  }
+  return indices;
+}
+
+std::vector<double> convertAngles(const std::span<double> alphas) {
+  // Implements Eq. (3) from https://arxiv.org/pdf/quant-ph/0407010.pdf
+  //
+  // theta[i] is the mean of the alphas, each one weighted by +1 or -1. The
+  // paper leaves the dot operator in the exponent of -1 undefined; its Ref. 3
+  // clarifies that it is the bitwise inner product, i.e. the parity of the
+  // bits shared by `j` and the Gray code of `i`.
+  const auto hasOddOverlap = [](std::size_t lhs, std::size_t rhs) {
+    bool odd = false;
+    for (auto common = lhs & rhs; common != 0; common >>= 1)
+      odd ^= (common & 0b1) != 0;
+    return odd;
+  };
+  const std::size_t count = alphas.size();
+  std::vector<double> thetas(count, 0);
+  for (std::size_t i = 0u; i < count; ++i) {
+    const std::size_t gray = (i >> 1) ^ i;
+    double signedSum = 0;
+    for (std::size_t j = 0u; j < count; ++j)
+      signedSum += hasOddOverlap(j, gray) ? -alphas[j] : alphas[j];
+    thetas[i] = signedSum / count;
+  }
+  return thetas;
+}
+
+std::vector<double> getAlphaZ(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k) {
+  // Implements Eq. (5) from https://arxiv.org/pdf/quant-ph/0407010.pdf
+  // Angle j is mean(upper half) - mean(lower half) of the j-th group of 2^k
+  // entries. Sizes are 64-bit values so the offsets never pass through `int`.
+  const std::size_t half = 1ULL << (k - 1);
+  const std::size_t groups = 1ULL << (numQubits - k);
+  std::vector<double> angles;
+  for (std::size_t j = 0; j < groups; ++j) {
+    double angle = 0.0;
+    for (std::size_t l = 0; l < half; ++l)
+      angle += data[(2 * j + 1) * half + l] - data[2 * j * half + l];
+    angles.push_back(angle / static_cast<double>(half));
+  }
+  return angles;
+}
+
+std::vector<double> getAlphaY(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k) {
+  // Implements Eq. (8) from https://arxiv.org/pdf/quant-ph/0407010.pdf
+  // Per group of 2^k entries: 2 * asin(sqrt(upperHalfNorm2 / groupNorm2)).
+  // Sizes are 64-bit values so that the offsets never pass through `int`.
+  const std::size_t half = 1ULL << (k - 1);
+  const std::size_t group = 1ULL << k;
+  const std::size_t numGroups = 1ULL << (numQubits - k);
+  const auto squaredNorm = [&data](std::size_t first, std::size_t count) {
+    double sum = 0;
+    for (std::size_t l = 0; l < count; ++l)
+      sum += std::pow(std::abs(data[first + l]), 2);
+    return sum;
+  };
+  std::vector<double> angles;
+  for (std::size_t j = 0; j < numGroups; ++j) {
+    const double numerator = squaredNorm(j * group + half, half);
+    const double denominator = squaredNorm(j * group, group);
+    if (denominator == 0.0) {
+      assert(numerator == 0.0 &&
+             "If the denominator is zero, the numerator must also be zero.");
+      angles.push_back(0.0);
+      continue;
+    }
+    angles.push_back(2.0 * std::asin(std::sqrt(numerator / denominator)));
+  }
+  return angles;
+}
+} // namespace cudaq::details
\ No newline at end of file
diff --git a/lib/Optimizer/Transforms/StateDecomposer.h b/lib/Optimizer/Transforms/StateDecomposer.h
new file mode 100644
index 0000000000..bac6909708
--- /dev/null
+++ b/lib/Optimizer/Transforms/StateDecomposer.h
@@ -0,0 +1,163 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "PassDetails.h"
+#include "cudaq/Optimizer/Builder/Runtime.h"
+#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Target/LLVMIR/TypeToLLVM.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include <span>
+
+#include <iostream>
+
+namespace cudaq::details {
+
+  /// @brief Converts angles of a uniformly controlled rotation to angles of
+  /// non-controlled rotations.
+  std::vector<double> convertAngles(const std::span<double> alphas);
+
+  /// @brief Return the control indices dictated by the gray code implementation.
+  ///
+  /// Here, numBits is the number of controls.
+  std::vector<std::size_t> getControlIndices(std::size_t numBits);
+
+   /// @brief Return angles required to implement a uniformly controlled z-rotation
+  /// on the `kth` qubit.
+  std::vector<double> getAlphaZ(const std::span<double> data,
+                                std::size_t numQubits, std::size_t k);
+
+  /// @brief Return angles required to implement a uniformly controlled y-rotation
+  /// on the `kth` qubit.
+  std::vector<double> getAlphaY(const std::span<double> data,
+                                std::size_t numQubits, std::size_t k);
+} // namespace cudaq::details
+
+class StateGateBuilder {
+public:
+  StateGateBuilder(mlir::OpBuilder& b, mlir::Location& l, mlir::Value& q): builder(b), loc(l), qubits(q) {}
+
+  template<typename Op>
+  void applyRotationOp(double theta, std::size_t target) {
+    auto qubit = createQubitRef(target);
+    auto thetaValue = createAngleValue(theta);
+    builder.create<Op>(loc, thetaValue, mlir::ValueRange{}, qubit);
+  };
+
+  void applyX(std::size_t control, std::size_t target) {
+    auto qubitC = createQubitRef(control);
+    auto qubitT = createQubitRef(target);
+    builder.create<quake::XOp>(loc, qubitC, qubitT);
+  };
+
+private:
+  mlir::Value createQubitRef(std::size_t index) {
+    if (qubitRefs.contains(index)) {
+      return qubitRefs[index];
+    }
+
+    auto indexValue = builder.create<mlir::arith::ConstantIntOp>(loc, index, builder.getIntegerType(64));
+    auto ref = builder.create<quake::ExtractRefOp>(loc, qubits, indexValue);
+    qubitRefs[index] = ref;
+    return ref;
+  }
+
+  mlir::Value createAngleValue(double angle) {
+    return builder.create<mlir::arith::ConstantFloatOp>(loc, llvm::APFloat{angle}, builder.getF64Type());
+  }
+
+  mlir::OpBuilder& builder;
+  mlir::Location& loc;
+  mlir::Value& qubits;
+
+  std::unordered_map<std::size_t, mlir::Value> qubitRefs = std::unordered_map<std::size_t, mlir::Value>();
+};
+
+class StateDecomposer {
+public:
+  StateDecomposer(StateGateBuilder& b, std::vector<std::complex<double>>& a): builder(b), amplitudes(a), numQubits(log2(a.size())) {}
+
+  /// @brief Decompose the input state vector data to a set of controlled
+  /// operations and rotations. This function takes as input a `OpBuilder`
+  /// and appends the operations of the decomposition to its internal
+  /// representation. This implementation follows the algorithm defined in
+  /// `https://arxiv.org/pdf/quant-ph/0407010.pdf`.
+  void decompose() {
+
+    // Decompose the state into phases and magnitudes.
+    bool needsPhaseEqualization = false;
+    std::vector<double> phases;
+    std::vector<double> magnitudes;
+    for (const auto &a : amplitudes) {
+      phases.push_back(std::arg(a));
+      magnitudes.push_back(std::abs(a));
+      // FIXME: remove magic number.
+      needsPhaseEqualization |= std::abs(phases.back()) > 1e-10;
+    }
+
+    // N.B: The algorithm, as described in the paper, creates a circuit that
+    // begins with a target state and brings it to the all zero state. Hence, this
+    // implementation do the two steps described in Section III in reverse order.
+
+    // Apply uniformly controlled y-rotations, the construction in Eq. (4).
+    for (std::size_t j = 1; j <= numQubits; ++j) {
+      auto k = numQubits - j + 1;
+      auto numControls = j - 1;
+      auto target = j - 1;
+      auto alphaYk = cudaq::details::getAlphaY(magnitudes, numQubits, k);
+      applyRotation<quake::RyOp>(alphaYk, numControls, target);
+    }
+
+    if (!needsPhaseEqualization)
+      return;
+
+    // Apply uniformly controlled z-rotations, the construction in Eq. (4).
+    for (std::size_t j = 1; j <= numQubits; ++j) {
+      auto k = numQubits - j + 1;
+      auto numControls = j - 1;
+      auto target = j - 1;
+      auto alphaZk = cudaq::details::getAlphaZ(phases, numQubits, k);
+      if (alphaZk.empty())
+        continue;
+      applyRotation<quake::RzOp>(alphaZk, numControls, target);
+    }
+  }
+
+private:
+  /// @brief Apply a uniformly controlled rotation on the target qubit.
+  template <typename Op>
+  void applyRotation(const std::span<double> alphas, std::size_t numControls, std::size_t target) {
+    auto thetas = cudaq::details::convertAngles(alphas);
+    if (numControls == 0) {
+      builder.applyRotationOp<Op>(thetas[0], target);
+      return;
+    }
+
+    auto controlIndices = cudaq::details::getControlIndices(numControls);
+    assert(thetas.size() == controlIndices.size());
+    for (auto [i, c] : llvm::enumerate(controlIndices)) {
+      builder.applyRotationOp<Op>(thetas[i], target);
+      builder.applyX(c, target);
+    }
+  }
+
+  StateGateBuilder& builder;
+  std::span<std::complex<double>> amplitudes;
+  std::size_t numQubits;
+};
diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
index ce46efecc0..86bb911a3a 100644
--- a/lib/Optimizer/Transforms/StatePreparation.cpp
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -18,10 +18,13 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Complex/IR/Complex.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Target/LLVMIR/TypeToLLVM.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
+#include <span>
+#include "StateDecomposer.h"
 
 #include <iostream>
 
@@ -35,33 +38,44 @@ using namespace mlir;
 ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
 ///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
 ///     %1 = math.cttz %0 : i64
-///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) ->
-///     !cc.ptr<complex<f32>> %3 = quake.alloca !quake.veq<?>[%1 : i64] %4 =
-///     quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) ->
-///     !quake.veq<?> return
+///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) -> !cc.ptr<complex<f32>> 
+///     %3 = quake.alloca !quake.veq<?>[%1 : i64]
+///     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
+///     return
 ///   }
 ///
-/// on call that passes std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0.,
-/// M_SQRT1_2} as arg0 will be updated to:
+/// On a call that passes std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0., M_SQRT1_2} as arg0:
 ///
 ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
-///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
-///     %c4_i64 = arith.constant 4 : i64
-///     %3 = math.cttz %c4_i64 : i64
-///     %5 = quake.alloca !quake.veq<?>[%3 : i64]
-///     %6 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
-///     quake.h %6 : (!quake.ref) -> ()
-///     %7 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
-///     %8 = quake.extract_ref %5[1] : (!quake.veq<?>) -> !quake.ref
-///     quake.x [%7] %8 : (!quake.ref, !quake.ref) -> ()
+///     %0 = quake.alloca !quake.veq<2>
+///     %c0_i64 = arith.constant 0 : i64
+///     %1 = quake.extract_ref %0[%c0_i64] : (!quake.veq<2>, i64) -> !quake.ref
+///     %cst = arith.constant 1.5707963267948968 : f64
+///     quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+///     %c1_i64 = arith.constant 1 : i64
+///     %2 = quake.extract_ref %0[%c1_i64] : (!quake.veq<2>, i64) -> !quake.ref
+///     %cst_0 = arith.constant 1.5707963267948966 : f64
+///     quake.ry (%cst_0) %2 : (f64, !quake.ref) -> ()
+///     quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
+///     %cst_1 = arith.constant -1.5707963267948966 : f64
+///     quake.ry (%cst_1) %2 : (f64, !quake.ref) -> ()
+///     quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
+///     return
 ///   }
 ///
-/// Note: we rely on the later synthesis and const prop stages to replace
+/// Note: the following synthesis and const prop passes will replace
 /// the argument by a constant and propagate the values and vector size
-/// through those and other instructions.
+/// through other instructions.
 
 namespace {
 
+template <typename T>
+concept IntegralType = std::is_same<T, bool>::value 
+    || std::is_same<T, std::int8_t>::value
+    || std::is_same<T, std::int16_t>::value
+    || std::is_same<T, std::int32_t>::value
+    || std::is_same<T, std::int64_t>::value;
+
 template <typename T>
 concept FloatingType = std::is_same<T, float>::value;
 
@@ -69,12 +83,11 @@ template <typename T>
 concept DoubleType = std::is_same<T, double>::value;
 
 template <typename T>
-concept ComplexDataType = FloatingType<T> || DoubleType<T>;
+concept ComplexDataType = FloatingType<T> || DoubleType<T> || IntegralType<T>;
 
 /// Input was complex<float>/complex<double> but we prefer
 /// complex<double>/complex<float>. Make a copy, extending or truncating the
 /// values.
-/// TODO: dont convert if not needed
 template <FloatingType From>
 std::vector<std::complex<double>> convertToComplex(std::complex<From> *data, std::uint64_t size) {
   auto convertData = std::vector<std::complex<double>>(size);
@@ -86,7 +99,7 @@ std::vector<std::complex<double>> convertToComplex(std::complex<From> *data, std
 
 template <DoubleType From>
 std::vector<std::complex<double>> convertToComplex(std::complex<From> *data, std::uint64_t size) {
-    return std::vector<std::complex<double>>(data, size);
+    return std::vector<std::complex<From>>(data, data+size);
 }
 
 /// Input was float/double but we prefer complex<float>/complex<double>.
@@ -104,7 +117,7 @@ LogicalResult
 prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
                                unsigned &counter, BlockArgument argument,
                                std::vector<std::complex<double>> &vec) {
-  // auto *ctx = builder.getContext();
+  auto *ctx = builder.getContext();
   // builder.setInsertionPointToStart(argument.getOwner());
   auto argLoc = argument.getLoc();
 
@@ -132,30 +145,67 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
   ///     %8 = quake.extract_ref %5[1] : (!quake.veq<?>) -> !quake.ref
   ///     quake.x [%7] %8 : (!quake.ref, !quake.ref) -> ()
 
+  auto toErase = std::vector<mlir::Operation*>();
+
   for (auto *argUser : argument.getUsers()) {
+    // Handle the `StdvecSize` and `quake.alloca` use case:
+    // - Replace a `vec.size()` with the vector length.
+    // - Replace the number of qubits calculation with the vector length logarithm.
+    // - Replace `quake.alloca` with a constant size qvector allocation.
+    if (auto stdvecSizeOp = dyn_cast<cudaq::cc::StdvecSizeOp>(argUser)) {
+      builder.setInsertionPointAfter(stdvecSizeOp);
+      Value length = builder.create<arith::ConstantIntOp>(
+          argLoc, vec.size(), stdvecSizeOp.getType());
+
+      Value numQubits = builder.create<arith::ConstantIntOp>(
+          argLoc, log2(vec.size()), stdvecSizeOp.getType());
+
+      for (auto *sizeUser: argUser->getUsers()) {
+        if (auto countZeroesOp = dyn_cast<mlir::math::CountTrailingZerosOp>(sizeUser)) {
+          for (auto *numQubitsUser: sizeUser->getUsers()) {
+            if (auto quakeAllocaOp = dyn_cast<quake::AllocaOp>(numQubitsUser)) {
+              builder.setInsertionPointAfter(quakeAllocaOp);
+              auto veqTy = quake::VeqType::get(ctx, log2(vec.size()));
+              Value newAlloc = builder.create<quake::AllocaOp>(argLoc, veqTy);
+              quakeAllocaOp.replaceAllUsesWith(newAlloc);
+              toErase.push_back(quakeAllocaOp);
+            }
+          }
+          countZeroesOp.replaceAllUsesWith(numQubits);
+          toErase.push_back(countZeroesOp);
+        }
+      }
+      
+      stdvecSizeOp.replaceAllUsesWith(length);
+      toErase.push_back(stdvecSizeOp);
+      continue;
+    }
+
+    // Handle the `StdvecDataOp` and `quake.init_state` use case:
+    // - Replace a `quake.init_state` with gates preparing the state.
     if (auto stdvecDataOp = dyn_cast<cudaq::cc::StdvecDataOp>(argUser)) {
       for (auto *dataUser : stdvecDataOp->getUsers()) {
         if (auto initOp = dyn_cast<quake::InitializeStateOp>(dataUser)) {
           builder.setInsertionPointAfter(initOp);
           // Find the qvector alloc instruction
-          auto qvector = initOp.getOperand(0);
-
-          // Replace!
-          auto zero = builder.create<arith::ConstantIntOp>(
-              argLoc, 0, builder.getIntegerType(64));
-          auto one = builder.create<arith::ConstantIntOp>(
-              argLoc, 1, builder.getIntegerType(64));
-          Value q0 = builder.create<quake::ExtractRefOp>(argLoc, qvector, zero);
-          Value q1 = builder.create<quake::ExtractRefOp>(argLoc, qvector, one);
-          /*auto hval =*/ builder.create<quake::HOp>(argLoc, q0);
-          /*auto xval =*/ builder.create<quake::XOp>(argLoc, q0, q1);
-
-          initOp.replaceAllUsesWith(qvector);
+          auto qubits = initOp.getOperand(0);
+
+          // Prepare state from vector data.
+          auto gateBuilder = StateGateBuilder(builder, argLoc, qubits);
+          auto decomposer = StateDecomposer(gateBuilder, vec);
+          decomposer.decompose();
+
+          initOp.replaceAllUsesWith(qubits);
+          toErase.push_back(initOp);
         }
       }
     }
   }
 
+  for (auto& op: toErase) {
+    op->erase();
+  }
+
   return success();
 }
 
@@ -294,20 +344,20 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
       };
       if (auto ty = dyn_cast<IntegerType>(eleTy)) {
         switch (ty.getIntOrFloatBitWidth()) {
-        // case 1:
-        //   doVector(false);
-        //   break;
-        // case 8:
-        //   doVector(std::int8_t{});
-        //   break;
-        // case 16:
-        //   doVector(std::int16_t{});
-        //   break;
-        // case 32:
-        //   doVector(std::int32_t{});
-        //   break;
-        // case 64:
-        //   doVector(std::int64_t{});
+        case 1:
+          doVector(false);
+          break;
+        case 8:
+          doVector(std::int8_t{});
+          break;
+        case 16:
+          doVector(std::int16_t{});
+          break;
+        case 32:
+          doVector(std::int32_t{});
+          break;
+        case 64:
+          doVector(std::int64_t{});
           break;
         default:
           bufferAppendix += vecLength * cudaq::opt::convertBitsToBytes(
@@ -334,10 +384,9 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
         doVector(std::complex<double>{});
         continue;
       }
-
-      std::cout << "Module after state preparation " << std::endl;
-      module.dump();
     }
+    std::cout << "Module after state preparation " << std::endl;
+    module.dump();
   }
 };
 
diff --git a/targettests/execution/program.cpp b/targettests/execution/program.cpp
index 055084455c..be4855e3de 100644
--- a/targettests/execution/program.cpp
+++ b/targettests/execution/program.cpp
@@ -15,9 +15,6 @@
 
 __qpu__ void test1(std::vector<cudaq::complex> inState) {
     cudaq::qvector q1 = inState;
-    // Should synthesize to
-    // h(q1[0]);
-    // cx(q1[0], q1[1]);
 }
 
 //  __qpu__ void test2(cudaq::state *inState) {
@@ -29,10 +26,6 @@ __qpu__ void test1(std::vector<cudaq::complex> inState) {
 //   auto q3 = cudaq::qvector({M_SQRT1_2, 0., 0., M_SQRT1_2});
 // }
 
-// error: /workspaces/cuda-quantum/lib/Frontend/nvqpp/ConvertExpr.cpp:1938: not yet implemented: unknown function, get_state, in cudaq namespace
-// __qpu__ void test4() {
-//   cudaq::qvector q(cudaq::get_state(test3));
-// }
 
 // error: /workspaces/cuda-quantum/lib/Frontend/nvqpp/ConvertExpr.cpp:392: not yet implemented: argument type conversion
 // __qpu__ void test5(cudaq::state *inState) {
@@ -42,53 +35,9 @@ __qpu__ void test1(std::vector<cudaq::complex> inState) {
 
 
 int main() {
-    std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
-
+    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
     {
         // Passing state data as argument (vector<complex>)
-
-        // Before synthesis:
-
-        // func.func @__nvqpp__mlirgen__function_test1._Z5test1St6vectorISt7complexIfESaIS1_EE(%arg0: !cc.stdvec<complex<f32>>) attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-        //     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
-        //     %1 = math.cttz %0 : i64
-        //     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) -> !cc.ptr<complex<f32>>
-        //     %3 = quake.alloca !quake.veq<?>[%1 : i64]
-        //     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
-        //     return
-        // }
-
-        // After synthesis
-
-        // func.func @__nvqpp__mlirgen__function_test1._Z5test1St6vectorISt7complexIfESaIS1_EE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-        //     %0 = cc.const_array [0.707106769 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.707106769 : f32, 0.000000e+00 : f32] : !cc.array<complex<f32> x 4>
-        //     %1 = cc.alloca !cc.array<complex<f32> x 4>
-        //     cc.store %0, %1 : !cc.ptr<!cc.array<complex<f32> x 4>>
-        //     %2 = cc.cast %1 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-        //     %c4_i64 = arith.constant 4 : i64
-        //     %3 = math.cttz %c4_i64 : i64                        // (TODO: replace by a const)
-        //     %4 = quake.alloca !quake.veq<?>[%3 : i64]
-        //     %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?> // TODO: replace by gates
-        //     return
-        // }
-
-        // TODO: in StatePreparation pass
-        // input - vector<double>, qubits
-        // output - MLIR replacing alloca+state_init instructions with gates on qubits
-
-        // %3 = math.cttz %c4_i64 : i64
-        // %4 = quake.alloca !quake.veq<?>[%3 : i64]
-        // %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
-
-        // => (something like)
-
-        // create a function that does the following and call it on qubits
-        // %6 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
-        // quake.ry (%cst) %6 : (f64, !quake.ref) -> ()
-        // ...
-
-        // TODO: Run state preparation pass before synthesis 
-
         std::cout << "test1(vec): "  << "\n";
         auto counts = cudaq::sample(test1, vec);
         counts.dump();
@@ -96,37 +45,21 @@ int main() {
 
     // {
     //     // Passing state ptr as argument - no support for from_data
-
-    //     // "func.func"() ({
-    //     // ^bb0(%arg0: !cc.ptr<!cc.state>):
-    //     //     %0 = "func.call"(%arg0) {callee = @__nvqpp_cudaq_state_numberOfQubits} : (!cc.ptr<!cc.state>) -> i64
-    //     //     %1 = "quake.alloca"(%0) : (i64) -> !quake.veq<?>
-    //     //     %2 = "quake.init_state"(%1, %arg0) : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
-    //     //     "func.return"() : () -> ()
-    //     // }) {"cudaq-entrypoint", "cudaq-kernel", function_type = (!cc.ptr<!cc.state>) -> (), no_this, sym_name = "__nvqpp__mlirgen__function_test2._Z5test2PN5cudaq5stateE"} : () -> ()
-        
+    //
     //     std::cout << "test2(state): "  << "\n";
     //     auto state = cudaq::state::from_data(vec);
-
+    //
     //     // 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
-    //     //auto counts = cudaq::sample(test2, &state);
-    //     //counts.dump();
+    //     auto counts = cudaq::sample(test2, &state);
+    //     counts.dump();
     // }
 
     // {
     //     // Passing a state from another kernel as argument
-
-    //     // "func.func"() ({
-    //     // ^bb0(%arg0: !cc.ptr<!cc.state>):
-    //     //     %0 = "func.call"(%arg0) {callee = @__nvqpp_cudaq_state_numberOfQubits} : (!cc.ptr<!cc.state>) -> i64
-    //     //     %1 = "quake.alloca"(%0) : (i64) -> !quake.veq<?>
-    //     //     %2 = "quake.init_state"(%1, %arg0) : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
-    //     //     "func.return"() : () -> ()
-    //     // }) {"cudaq-entrypoint", "cudaq-kernel", function_type = (!cc.ptr<!cc.state>) -> (), no_this, sym_name = "__nvqpp__mlirgen__function_test2._Z5test2PN5cudaq5stateE"} : () -> ()
-        
+    //
     //     std::cout << "test2(test3): "  << "\n";
     //     auto state = cudaq::get_state(test3);
-
+    //
     //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
     //     auto counts = cudaq::sample(test2, &state);
     //     counts.dump();
@@ -134,34 +67,29 @@ int main() {
 
     // {
     //     // Passing a state to another kernel as argument
+    //
     //     std::cout << "test4(state): "  << "\n";
-    //     //auto state = cudaq::state::from_data(vec);
-    //     //auto counts = cudaq::sample(test4, &state);
+    //     
+    //     auto state = cudaq::state::from_data(vec);
+    //     auto counts = cudaq::sample(test4, &state);
     // }
 
     // {
-    //     // Creating a kernel from state and passing its state to another kernel
-
-    //     // "func.func"() ({
-    //     // ^bb0(%arg0: !cc.ptr<!cc.state>):
-    //     //     %0 = "func.call"(%arg0) {callee = @__nvqpp_cudaq_state_numberOfQubits} : (!cc.ptr<!cc.state>) -> i64
-    //     //     %1 = "quake.alloca"(%0) : (i64) -> !quake.veq<?>
-    //     //     %2 = "quake.init_state"(%1, %arg0) : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
-    //     //     "func.return"() : () -> ()
-    //     // }) {"cudaq-entrypoint", "cudaq-kernel", function_type = (!cc.ptr<!cc.state>) -> (), no_this, sym_name = "__nvqpp__mlirgen__function_test2._Z5test2PN5cudaq5stateE"} : () -> ()
-        
-    //     std::cout << "test2(kernel): "  << "\n";
-    //     std::vector<std::complex<double>> vec{.70710678, 0., 0., 0.70710678};
-    //     auto kernel = cudaq::make_kernel();
-    //     auto qubits = kernel.qalloc(2);
-
-    //     cudaq::from_state(kernel, qubits, vec);
-
+    //     // Creating a kernel from state and passing its state to another kernel - is it deprecated?
+    //
+        std::cout << "test2(kernel): "  << "\n";
+        std::vector<std::complex<double>> vec{.70710678, 0., 0., 0.70710678};
+        auto kernel = cudaq::make_kernel();
+        auto qubits = kernel.qalloc(2);
+    
+        cudaq::from_state(kernel, qubits, vec);
+        auto counts = cudaq::sample(kernel);
+    //
     //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
     //     //auto state = cudaq::get_state(kernel);
     //     //auto counts = cudaq::sample(test2, &state);
-
-    //     //counts.dump();
+    //
+         counts.dump();
     // }
 
 }
\ No newline at end of file

From 1cd5cbe8ee8a196aa7bc364b77b03d1060ee2b58 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 25 Jun 2024 15:29:40 -0700
Subject: [PATCH 3/9] Cleanup

---
 lib/Optimizer/Transforms/CMakeLists.txt       |   4 +-
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp |   5 -
 lib/Optimizer/Transforms/StateDecomposer.h    |  69 ++++----
 lib/Optimizer/Transforms/StatePreparation.cpp | 108 ++++--------
 program.py                                    |  35 ----
 .../tests/kernel/test_kernel_qvector_init.py  | 162 ++----------------
 targettests/execution/from_state.cpp          |  30 ----
 targettests/execution/program.cpp             |  95 ----------
 .../execution/state_preparation_vector.cpp    |  57 ++++++
 9 files changed, 148 insertions(+), 417 deletions(-)
 delete mode 100644 program.py
 delete mode 100644 targettests/execution/from_state.cpp
 delete mode 100644 targettests/execution/program.cpp
 create mode 100644 targettests/execution/state_preparation_vector.cpp

diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index b0a13571ec..173cec4538 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -39,11 +39,11 @@ add_cudaq_library(OptTransforms
   ObserveAnsatz.cpp
   PruneCtrlRelations.cpp
   QuakeAddMetadata.cpp
-  StateDecomposer.cpp
-  StatePreparation.cpp
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
   RegToMem.cpp
+  StateDecomposer.cpp
+  StatePreparation.cpp
   PySynthCallableBlockArgs.cpp
 
   DEPENDS
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index 0fa859f175..cc9279c79c 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -23,8 +23,6 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
 
-#include <iostream>
-
 #define DEBUG_TYPE "quake-synthesizer"
 
 using namespace mlir;
@@ -419,9 +417,7 @@ class QuakeSynthesizer
   }
 
   void runOnOperation() override final {
-    std::cout << "Module before synthesis " << std::endl;
     auto module = getModule();
-    // module.dump();
     unsigned counter = 0;
 
     if (args == nullptr || kernelName.empty()) {
@@ -680,7 +676,6 @@ class QuakeSynthesizer
       }
     }
     funcOp.eraseArguments(argsToErase);
-    // std::cout << "Module after synthesis " << std::endl;
     module.dump();
   }
 };
diff --git a/lib/Optimizer/Transforms/StateDecomposer.h b/lib/Optimizer/Transforms/StateDecomposer.h
index bac6909708..2d17edb768 100644
--- a/lib/Optimizer/Transforms/StateDecomposer.h
+++ b/lib/Optimizer/Transforms/StateDecomposer.h
@@ -29,31 +29,32 @@
 
 namespace cudaq::details {
 
-  /// @brief Converts angles of a uniformly controlled rotation to angles of
-  /// non-controlled rotations.
-  std::vector<double> convertAngles(const std::span<double> alphas);
-
-  /// @brief Return the control indices dictated by the gray code implementation.
-  ///
-  /// Here, numBits is the number of controls.
-  std::vector<std::size_t> getControlIndices(std::size_t numBits);
-
-   /// @brief Return angles required to implement a uniformly controlled z-rotation
-  /// on the `kth` qubit.
-  std::vector<double> getAlphaZ(const std::span<double> data,
-                                std::size_t numQubits, std::size_t k);
-
-  /// @brief Return angles required to implement a uniformly controlled y-rotation
-  /// on the `kth` qubit.
-  std::vector<double> getAlphaY(const std::span<double> data,
-                                std::size_t numQubits, std::size_t k);
+/// @brief Converts angles of a uniformly controlled rotation to angles of
+/// non-controlled rotations.
+std::vector<double> convertAngles(const std::span<double> alphas);
+
+/// @brief Return the control indices dictated by the gray code implementation.
+///
+/// Here, numBits is the number of controls.
+std::vector<std::size_t> getControlIndices(std::size_t numBits);
+
+/// @brief Return angles required to implement a uniformly controlled z-rotation
+/// on the `kth` qubit.
+std::vector<double> getAlphaZ(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k);
+
+/// @brief Return angles required to implement a uniformly controlled y-rotation
+/// on the `kth` qubit.
+std::vector<double> getAlphaY(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k);
 } // namespace cudaq::details
 
 class StateGateBuilder {
 public:
-  StateGateBuilder(mlir::OpBuilder& b, mlir::Location& l, mlir::Value& q): builder(b), loc(l), qubits(q) {}
+  StateGateBuilder(mlir::OpBuilder &b, mlir::Location &l, mlir::Value &q)
+      : builder(b), loc(l), qubits(q) {}
 
-  template<typename Op>
+  template <typename Op>
   void applyRotationOp(double theta, std::size_t target) {
     auto qubit = createQubitRef(target);
     auto thetaValue = createAngleValue(theta);
@@ -72,26 +73,30 @@ class StateGateBuilder {
       return qubitRefs[index];
     }
 
-    auto indexValue = builder.create<mlir::arith::ConstantIntOp>(loc, index, builder.getIntegerType(64));
+    auto indexValue = builder.create<mlir::arith::ConstantIntOp>(
+        loc, index, builder.getIntegerType(64));
     auto ref = builder.create<quake::ExtractRefOp>(loc, qubits, indexValue);
     qubitRefs[index] = ref;
     return ref;
   }
 
   mlir::Value createAngleValue(double angle) {
-    return builder.create<mlir::arith::ConstantFloatOp>(loc, llvm::APFloat{angle}, builder.getF64Type());
+    return builder.create<mlir::arith::ConstantFloatOp>(
+        loc, llvm::APFloat{angle}, builder.getF64Type());
   }
 
-  mlir::OpBuilder& builder;
-  mlir::Location& loc;
-  mlir::Value& qubits;
+  mlir::OpBuilder &builder;
+  mlir::Location &loc;
+  mlir::Value &qubits;
 
-  std::unordered_map<std::size_t, mlir::Value> qubitRefs = std::unordered_map<std::size_t, mlir::Value>();
+  std::unordered_map<std::size_t, mlir::Value> qubitRefs =
+      std::unordered_map<std::size_t, mlir::Value>();
 };
 
 class StateDecomposer {
 public:
-  StateDecomposer(StateGateBuilder& b, std::vector<std::complex<double>>& a): builder(b), amplitudes(a), numQubits(log2(a.size())) {}
+  StateDecomposer(StateGateBuilder &b, std::vector<std::complex<double>> &a)
+      : builder(b), amplitudes(a), numQubits(log2(a.size())) {}
 
   /// @brief Decompose the input state vector data to a set of controlled
   /// operations and rotations. This function takes as input a `OpBuilder`
@@ -112,8 +117,9 @@ class StateDecomposer {
     }
 
     // N.B: The algorithm, as described in the paper, creates a circuit that
-    // begins with a target state and brings it to the all zero state. Hence, this
-    // implementation do the two steps described in Section III in reverse order.
+    // begins with a target state and brings it to the all zero state. Hence,
+    // this implementation does the two steps described in Section III in
+    // reverse order.
 
     // Apply uniformly controlled y-rotations, the construction in Eq. (4).
     for (std::size_t j = 1; j <= numQubits; ++j) {
@@ -142,7 +148,8 @@ class StateDecomposer {
 private:
   /// @brief Apply a uniformly controlled rotation on the target qubit.
   template <typename Op>
-  void applyRotation(const std::span<double> alphas, std::size_t numControls, std::size_t target) {
+  void applyRotation(const std::span<double> alphas, std::size_t numControls,
+                     std::size_t target) {
     auto thetas = cudaq::details::convertAngles(alphas);
     if (numControls == 0) {
       builder.applyRotationOp<Op>(thetas[0], target);
@@ -157,7 +164,7 @@ class StateDecomposer {
     }
   }
 
-  StateGateBuilder& builder;
+  StateGateBuilder &builder;
   std::span<std::complex<double>> amplitudes;
   std::size_t numQubits;
 };
diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
index 86bb911a3a..785e70b3f8 100644
--- a/lib/Optimizer/Transforms/StatePreparation.cpp
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -7,6 +7,7 @@
  ******************************************************************************/
 
 #include "PassDetails.h"
+#include "StateDecomposer.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
 #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
@@ -24,9 +25,6 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include <span>
-#include "StateDecomposer.h"
-
-#include <iostream>
 
 #define DEBUG_TYPE "state-preparation"
 
@@ -38,13 +36,14 @@ using namespace mlir;
 ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
 ///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
 ///     %1 = math.cttz %0 : i64
-///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) -> !cc.ptr<complex<f32>> 
-///     %3 = quake.alloca !quake.veq<?>[%1 : i64]
-///     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
-///     return
+///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) ->
+///     !cc.ptr<complex<f32>> %3 = quake.alloca !quake.veq<?>[%1 : i64] %4 =
+///     quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) ->
+///     !quake.veq<?> return
 ///   }
 ///
-/// On a call that passes std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0., M_SQRT1_2} as arg0:
+/// On a call that passes std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0.,
+/// M_SQRT1_2} as arg0:
 ///
 ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
 ///     %0 = quake.alloca !quake.veq<2>
@@ -70,11 +69,11 @@ using namespace mlir;
 namespace {
 
 template <typename T>
-concept IntegralType = std::is_same<T, bool>::value 
-    || std::is_same<T, std::int8_t>::value
-    || std::is_same<T, std::int16_t>::value
-    || std::is_same<T, std::int32_t>::value
-    || std::is_same<T, std::int64_t>::value;
+concept IntegralType =
+    std::is_same<T, bool>::value || std::is_same<T, std::int8_t>::value ||
+    std::is_same<T, std::int16_t>::value ||
+    std::is_same<T, std::int32_t>::value ||
+    std::is_same<T, std::int64_t>::value;
 
 template <typename T>
 concept FloatingType = std::is_same<T, float>::value;
@@ -85,31 +84,33 @@ concept DoubleType = std::is_same<T, double>::value;
 template <typename T>
 concept ComplexDataType = FloatingType<T> || DoubleType<T> || IntegralType<T>;
 
-/// Input was complex<float>/complex<double> but we prefer
-/// complex<double>/complex<float>. Make a copy, extending or truncating the
-/// values.
+/// Input was complex<float> but we prefer
+/// complex<double>. Make a copy, extending the values.
 template <FloatingType From>
-std::vector<std::complex<double>> convertToComplex(std::complex<From> *data, std::uint64_t size) {
+std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
+                                                   std::uint64_t size) {
   auto convertData = std::vector<std::complex<double>>(size);
   for (std::size_t i = 0; i < size; ++i)
     convertData[i] = std::complex<double>{static_cast<double>(data[i].real()),
-                                      static_cast<double>(data[i].imag())};
+                                          static_cast<double>(data[i].imag())};
   return convertData;
 }
 
 template <DoubleType From>
-std::vector<std::complex<double>> convertToComplex(std::complex<From> *data, std::uint64_t size) {
-    return std::vector<std::complex<From>>(data, data+size);
+std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
+                                                   std::uint64_t size) {
+  return std::vector<std::complex<From>>(data, data + size);
 }
 
-/// Input was float/double but we prefer complex<float>/complex<double>.
+/// Input was real (integer/float/double) but we prefer complex<double>.
 /// Make a copy, extending or truncating the values.
 template <ComplexDataType From>
-std::vector<std::complex<double>> convertToComplex(From *data, std::uint64_t size) {
+std::vector<std::complex<double>> convertToComplex(From *data,
+                                                   std::uint64_t size) {
   auto convertData = std::vector<std::complex<double>>(size);
   for (std::size_t i = 0; i < size; ++i)
-    convertData[i] =
-        std::complex<double>{static_cast<double>(data[i]), static_cast<double>(0.0)};
+    convertData[i] = std::complex<double>{static_cast<double>(data[i]),
+                                          static_cast<double>(0.0)};
   return convertData;
 }
 
@@ -118,39 +119,15 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
                                unsigned &counter, BlockArgument argument,
                                std::vector<std::complex<double>> &vec) {
   auto *ctx = builder.getContext();
-  // builder.setInsertionPointToStart(argument.getOwner());
   auto argLoc = argument.getLoc();
 
-  // TODO: look at quake.init_state instructions from vector data and track them
-  // to the argument vector, then replace the instruction by gates preparing the
-  // state (or a call to a kernel with gates)
-
-  ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
-  ///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
-  ///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) ->
-  ///     !cc.ptr<complex<f32>>
-  ///
-  ///     %3 = quake.alloca !quake.veq<?>[%1 : i64]
-  ///     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>)
-  ///     -> !quake.veq<?> return
-  ///   }
-
-  /// =>
-
-  ///     ...
-  ///     %5 = quake.alloca !quake.veq<?>[%3 : i64]
-  ///     %6 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
-  ///     quake.h %6 : (!quake.ref) -> ()
-  ///     %7 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
-  ///     %8 = quake.extract_ref %5[1] : (!quake.veq<?>) -> !quake.ref
-  ///     quake.x [%7] %8 : (!quake.ref, !quake.ref) -> ()
-
-  auto toErase = std::vector<mlir::Operation*>();
+  auto toErase = std::vector<mlir::Operation *>();
 
   for (auto *argUser : argument.getUsers()) {
     // Handle the `StdvecSize` and `quake.alloca` use case:
     // - Replace a `vec.size()` with the vector length.
-    // - Replace the number of qubits calculation with the vector length logarithm.
+    // - Replace the number of qubits calculation with the vector length
+    //   logarithm.
     // - Replace `quake.alloca` with a constant size qvector allocation.
     if (auto stdvecSizeOp = dyn_cast<cudaq::cc::StdvecSizeOp>(argUser)) {
       builder.setInsertionPointAfter(stdvecSizeOp);
@@ -160,9 +137,10 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
       Value numQubits = builder.create<arith::ConstantIntOp>(
           argLoc, log2(vec.size()), stdvecSizeOp.getType());
 
-      for (auto *sizeUser: argUser->getUsers()) {
-        if (auto countZeroesOp = dyn_cast<mlir::math::CountTrailingZerosOp>(sizeUser)) {
-          for (auto *numQubitsUser: sizeUser->getUsers()) {
+      for (auto *sizeUser : argUser->getUsers()) {
+        if (auto countZeroesOp =
+                dyn_cast<mlir::math::CountTrailingZerosOp>(sizeUser)) {
+          for (auto *numQubitsUser : sizeUser->getUsers()) {
             if (auto quakeAllocaOp = dyn_cast<quake::AllocaOp>(numQubitsUser)) {
               builder.setInsertionPointAfter(quakeAllocaOp);
               auto veqTy = quake::VeqType::get(ctx, log2(vec.size()));
@@ -175,7 +153,7 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
           toErase.push_back(countZeroesOp);
         }
       }
-      
+
       stdvecSizeOp.replaceAllUsesWith(length);
       toErase.push_back(stdvecSizeOp);
       continue;
@@ -202,7 +180,7 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
     }
   }
 
-  for (auto& op: toErase) {
+  for (auto &op : toErase) {
     op->erase();
   }
 
@@ -249,9 +227,7 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
   }
 
   void runOnOperation() override final {
-    std::cout << "Module before state prep " << std::endl;
     auto module = getModule();
-    module.dump();
     unsigned counter = 0;
 
     if (args == nullptr || kernelName.empty()) {
@@ -286,13 +262,12 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
 
       // Get the argument type
       auto type = argument.getType();
-      // auto loc = argument.getLoc();
 
       if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(type)) {
         if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
-          std::cout << "State pointer found, TODO: call a kernel that created "
-                       "the state"
-                    << std::endl;
+          funcOp.emitOpError(
+              "State preparation from cudaq::state is not supported.");
+          return;
         }
       }
 
@@ -301,9 +276,7 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
       if (auto vecTy = dyn_cast<cudaq::cc::StdvecType>(type)) {
         auto eleTy = vecTy.getElementType();
         if (!isa<IntegerType, FloatType, ComplexType>(eleTy)) {
-          funcOp.emitOpError("synthesis: unsupported argument type");
-          signalPassFailure();
-          return;
+          continue;
         }
         char *ptrToSizeInBuffer = static_cast<char *>(args) + offset;
         auto sizeFromBuffer =
@@ -328,10 +301,7 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
     char *bufferAppendix = static_cast<char *>(args) + structSize;
     for (auto [idx, eleTy, vecLength] : stdVecInfo) {
       if (!eleTy) {
-        // FIXME: Skip struct values.
         bufferAppendix += vecLength;
-        funcOp.emitOpError(
-            "argument to kernel may be a struct and was not synthesized");
         continue;
       }
       auto doVector = [&]<typename T>(T) {
@@ -385,8 +355,6 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
         continue;
       }
     }
-    std::cout << "Module after state preparation " << std::endl;
-    module.dump();
   }
 };
 
diff --git a/program.py b/program.py
deleted file mode 100644
index e282d8cd5d..0000000000
--- a/program.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-import numpy as np
-import cudaq
-
-import cudaq
-import numpy as np
-
-cudaq.reset_target()
-
-cudaq.set_target('nvidia')
-#cudaq.set_target('nvidia-mqpu')
-# cudaq.set_target('density-matrix-cpu')
-
-
-c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
-                dtype=np.complex128)
-state = cudaq.State.from_data(c)
-
-@cudaq.kernel(verbose=True)
-def kernel(vec: cudaq.State):
-    q = cudaq.qvector(vec)
-
-print(kernel)
-print(cudaq.to_qir(kernel))
-
-#print(cudaq.get_target())
-#counts = cudaq.sample(kernel, state)
-#print(counts)
\ No newline at end of file
diff --git a/python/tests/kernel/test_kernel_qvector_init.py b/python/tests/kernel/test_kernel_qvector_init.py
index ddaeb6cc4d..f998a82dd1 100644
--- a/python/tests/kernel/test_kernel_qvector_init.py
+++ b/python/tests/kernel/test_kernel_qvector_init.py
@@ -20,28 +20,8 @@
 
 
 # float
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_float_params_f64():
-
+def test_kernel_float_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel(vec: list[float]):
-        q = cudaq.qvector(vec)
-
-    counts = cudaq.sample(kernel, f)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_float_params_f32():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
 
@@ -156,10 +136,8 @@ def kernel():
 # complex
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_complex_params_rotate_f64():
+def test_kernel_complex_params_rotate():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [0. + 0j, 0., 0., 1.]
 
@@ -179,50 +157,8 @@ def kernel(vec: list[complex]):
     assert '10' in counts
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_complex_params_rotate_f32():
+def test_kernel_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
-
-    c = [0. + 0j, 0., 0., 1.]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-        x(q.front())
-        y(q.back())
-        h(q)
-        mz(q)
-
-    counts = cudaq.sample(kernel, c)
-    print(f'rotate: {counts}')
-    assert '11' in counts
-    assert '00' in counts
-    assert '01' in counts
-    assert '10' in counts
-
-
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_complex_params_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_complex_params_f32():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -337,10 +273,8 @@ def kernel():
 # np arrays
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_dtype_complex_params_f64():
+def test_kernel_dtype_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -354,10 +288,8 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_dtype_complex128_params_f64():
+def test_kernel_dtype_complex128_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -425,10 +357,8 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_amplitudes_complex_params_f64():
+def test_kernel_amplitudes_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = cudaq.amplitudes([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
 
@@ -442,27 +372,8 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_amplitudes_complex_params_f32():
+def test_kernel_amplitudes_complex_from_capture():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
-
-    c = cudaq.amplitudes([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_amplitudes_complex_from_capture_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
 
@@ -476,23 +387,6 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_amplitudes_complex_from_capture_f32():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia')
-
-    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(cudaq.amplitudes(vec))
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_simulation_dtype_np_array_from_capture_f64():
     cudaq.reset_target()
@@ -568,40 +462,8 @@ def kernel():
 # test errors
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_error_invalid_array_size_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    @cudaq.kernel
-    def kernel():
-        qubits = cudaq.qvector(np.array([1., 0., 0.], dtype=complex))
-
-    with pytest.raises(RuntimeError) as e:
-        counts = cudaq.sample(kernel)
-    assert 'Invalid input state size for qvector init (not a power of 2)' in repr(
-        e)
-
-
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_error_invalid_list_size_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    @cudaq.kernel
-    def kernel():
-        qubits = cudaq.qvector([1., 0., 0.])
-
-    with pytest.raises(RuntimeError) as e:
-        counts = cudaq.sample(kernel)
-    assert 'Invalid input state size for qvector init (not a power of 2)' in repr(
-        e)
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_error_invalid_array_size_f32():
+def test_kernel_error_invalid_array_size_():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     @cudaq.kernel
     def kernel():
@@ -613,10 +475,8 @@ def kernel():
         e)
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_error_invalid_list_size_f32():
+def test_kernel_error_invalid_list_size():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     @cudaq.kernel
     def kernel():
@@ -629,6 +489,7 @@ def kernel():
 
 
 def test_kernel_qvector_init_from_param_int():
+    cudaq.reset_target()
 
     @cudaq.kernel
     def kernel(n: int):
@@ -643,6 +504,8 @@ def kernel(n: int):
 
 
 def test_kernel_qvector_init_from_capture_int():
+    cudaq.reset_target()
+
     n = 2
 
     @cudaq.kernel
@@ -658,6 +521,7 @@ def kernel():
 
 
 def test_kernel_qvector_init_from_int():
+    cudaq.reset_target()
 
     @cudaq.kernel
     def kernel():
diff --git a/targettests/execution/from_state.cpp b/targettests/execution/from_state.cpp
deleted file mode 100644
index 55438848cb..0000000000
--- a/targettests/execution/from_state.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
-
-#include <cudaq.h>
-#include "cudaq/builder/kernels.h"
-#include <iostream>
-
-__qpu__ void test(cudaq::state *inState) {
-  cudaq::qvector q(inState);
-}
-
-// CHECK: size 2
-
-int main() {
-  std::vector<std::complex<float>> vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
-  auto state = cudaq::state::from_data(vec);
-  auto counts = cudaq::sample(test, &state);
-  counts.dump();
-
-  printf("size %zu\n", counts.size());
-  return !(counts.size() == 2);
-}
diff --git a/targettests/execution/program.cpp b/targettests/execution/program.cpp
deleted file mode 100644
index be4855e3de..0000000000
--- a/targettests/execution/program.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
-
-#include <cudaq.h>
-#include "cudaq/builder/kernels.h"
-#include <iostream>
-
-__qpu__ void test1(std::vector<cudaq::complex> inState) {
-    cudaq::qvector q1 = inState;
-}
-
-//  __qpu__ void test2(cudaq::state *inState) {
-//    cudaq::qvector q2(inState);
-//    cudaq::x(q2);
-// }
-
-// __qpu__ void test3() {
-//   auto q3 = cudaq::qvector({M_SQRT1_2, 0., 0., M_SQRT1_2});
-// }
-
-
-// error: /workspaces/cuda-quantum/lib/Frontend/nvqpp/ConvertExpr.cpp:392: not yet implemented: argument type conversion
-// __qpu__ void test5(cudaq::state *inState) {
-//   test2(inState);
-// }
-
-
-
-int main() {
-    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-    {
-        // Passing state data as argument (vector<complex>)
-        std::cout << "test1(vec): "  << "\n";
-        auto counts = cudaq::sample(test1, vec);
-        counts.dump();
-    }
-
-    // {
-    //     // Passing state ptr as argument - no support for from_data
-    //
-    //     std::cout << "test2(state): "  << "\n";
-    //     auto state = cudaq::state::from_data(vec);
-    //
-    //     // 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
-    //     auto counts = cudaq::sample(test2, &state);
-    //     counts.dump();
-    // }
-
-    // {
-    //     // Passing a state from another kernel as argument
-    //
-    //     std::cout << "test2(test3): "  << "\n";
-    //     auto state = cudaq::get_state(test3);
-    //
-    //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
-    //     auto counts = cudaq::sample(test2, &state);
-    //     counts.dump();
-    // }
-
-    // {
-    //     // Passing a state to another kernel as argument
-    //
-    //     std::cout << "test4(state): "  << "\n";
-    //     
-    //     auto state = cudaq::state::from_data(vec);
-    //     auto counts = cudaq::sample(test4, &state);
-    // }
-
-    // {
-    //     // Creating a kernel from state and passing its state to another kernel - is it deprecated?
-    //
-        std::cout << "test2(kernel): "  << "\n";
-        std::vector<std::complex<double>> vec{.70710678, 0., 0., 0.70710678};
-        auto kernel = cudaq::make_kernel();
-        auto qubits = kernel.qalloc(2);
-    
-        cudaq::from_state(kernel, qubits, vec);
-        auto counts = cudaq::sample(kernel);
-    //
-    //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
-    //     //auto state = cudaq::get_state(kernel);
-    //     //auto counts = cudaq::sample(test2, &state);
-    //
-         counts.dump();
-    // }
-
-}
\ No newline at end of file
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
new file mode 100644
index 0000000000..dbe9b15d86
--- /dev/null
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+
+#include <cudaq.h>
+#include "cudaq/builder/kernels.h"
+#include <iostream>
+
+__qpu__ void test(std::vector<cudaq::complex> inState) {
+  cudaq::qvector q1 = inState;
+}
+
+__qpu__ void test2() {
+  cudaq::qvector q1({M_SQRT1_2, M_SQRT1_2, 0., 0.});
+}
+
+void printCounts(cudaq::sample_result& result) {
+  for (auto &&[bits, counts] : result) {
+    std::cout << bits << '\n';
+  }
+}
+
+int main() {
+    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    {
+        // Passing state data as argument (kernel mode)
+        auto counts = cudaq::sample(test, vec);
+        printCounts(counts);
+    }
+    
+    {
+        // Using state data inside kernel (kernel mode) - not implemented yet.
+        // auto counts = cudaq::sample(test2);
+        // printCounts(counts);
+    }
+
+    {
+       // Passing state data as argument (builder mode)
+        auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
+        auto qubits = kernel.qalloc(v);
+    
+        auto counts = cudaq::sample(kernel, vec);
+        printCounts(counts);
+    }
+}
+
+// CHECK: 01
+// CHECK: 00
+
+// CHECK: 01
+// CHECK: 00
\ No newline at end of file

From 0a04d33ce4c7b734348784df2d14d3958827a592 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 25 Jun 2024 15:29:40 -0700
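Subject: [PATCH 4/9] Cleanup: drop debug output and module dumps, remove scratch files, reformat, add state_preparation_vector test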

---
 lib/Optimizer/Transforms/CMakeLists.txt       |   4 +-
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp |   6 -
 lib/Optimizer/Transforms/StateDecomposer.h    |  69 ++++----
 lib/Optimizer/Transforms/StatePreparation.cpp | 108 ++++--------
 program.py                                    |  35 ----
 .../tests/kernel/test_kernel_qvector_init.py  | 162 ++----------------
 targettests/execution/from_state.cpp          |  30 ----
 targettests/execution/program.cpp             |  95 ----------
 .../execution/state_preparation_vector.cpp    |  57 ++++++
 9 files changed, 148 insertions(+), 418 deletions(-)
 delete mode 100644 program.py
 delete mode 100644 targettests/execution/from_state.cpp
 delete mode 100644 targettests/execution/program.cpp
 create mode 100644 targettests/execution/state_preparation_vector.cpp

diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index b0a13571ec..173cec4538 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -39,11 +39,11 @@ add_cudaq_library(OptTransforms
   ObserveAnsatz.cpp
   PruneCtrlRelations.cpp
   QuakeAddMetadata.cpp
-  StateDecomposer.cpp
-  StatePreparation.cpp
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
   RegToMem.cpp
+  StateDecomposer.cpp
+  StatePreparation.cpp
   PySynthCallableBlockArgs.cpp
 
   DEPENDS
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index 0fa859f175..7d83c152dd 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -23,8 +23,6 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
 
-#include <iostream>
-
 #define DEBUG_TYPE "quake-synthesizer"
 
 using namespace mlir;
@@ -419,9 +417,7 @@ class QuakeSynthesizer
   }
 
   void runOnOperation() override final {
-    std::cout << "Module before synthesis " << std::endl;
     auto module = getModule();
-    // module.dump();
     unsigned counter = 0;
 
     if (args == nullptr || kernelName.empty()) {
@@ -680,8 +676,6 @@ class QuakeSynthesizer
       }
     }
     funcOp.eraseArguments(argsToErase);
-    // std::cout << "Module after synthesis " << std::endl;
-    module.dump();
   }
 };
 
diff --git a/lib/Optimizer/Transforms/StateDecomposer.h b/lib/Optimizer/Transforms/StateDecomposer.h
index bac6909708..2d17edb768 100644
--- a/lib/Optimizer/Transforms/StateDecomposer.h
+++ b/lib/Optimizer/Transforms/StateDecomposer.h
@@ -29,31 +29,32 @@
 
 namespace cudaq::details {
 
-  /// @brief Converts angles of a uniformly controlled rotation to angles of
-  /// non-controlled rotations.
-  std::vector<double> convertAngles(const std::span<double> alphas);
-
-  /// @brief Return the control indices dictated by the gray code implementation.
-  ///
-  /// Here, numBits is the number of controls.
-  std::vector<std::size_t> getControlIndices(std::size_t numBits);
-
-   /// @brief Return angles required to implement a uniformly controlled z-rotation
-  /// on the `kth` qubit.
-  std::vector<double> getAlphaZ(const std::span<double> data,
-                                std::size_t numQubits, std::size_t k);
-
-  /// @brief Return angles required to implement a uniformly controlled y-rotation
-  /// on the `kth` qubit.
-  std::vector<double> getAlphaY(const std::span<double> data,
-                                std::size_t numQubits, std::size_t k);
+/// @brief Converts angles of a uniformly controlled rotation to angles of
+/// non-controlled rotations.
+std::vector<double> convertAngles(const std::span<double> alphas);
+
+/// @brief Return the control indices dictated by the gray code implementation.
+///
+/// Here, numBits is the number of controls.
+std::vector<std::size_t> getControlIndices(std::size_t numBits);
+
+/// @brief Return angles required to implement a uniformly controlled z-rotation
+/// on the `kth` qubit.
+std::vector<double> getAlphaZ(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k);
+
+/// @brief Return angles required to implement a uniformly controlled y-rotation
+/// on the `kth` qubit.
+std::vector<double> getAlphaY(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k);
 } // namespace cudaq::details
 
 class StateGateBuilder {
 public:
-  StateGateBuilder(mlir::OpBuilder& b, mlir::Location& l, mlir::Value& q): builder(b), loc(l), qubits(q) {}
+  StateGateBuilder(mlir::OpBuilder &b, mlir::Location &l, mlir::Value &q)
+      : builder(b), loc(l), qubits(q) {}
 
-  template<typename Op>
+  template <typename Op>
   void applyRotationOp(double theta, std::size_t target) {
     auto qubit = createQubitRef(target);
     auto thetaValue = createAngleValue(theta);
@@ -72,26 +73,30 @@ class StateGateBuilder {
       return qubitRefs[index];
     }
 
-    auto indexValue = builder.create<mlir::arith::ConstantIntOp>(loc, index, builder.getIntegerType(64));
+    auto indexValue = builder.create<mlir::arith::ConstantIntOp>(
+        loc, index, builder.getIntegerType(64));
     auto ref = builder.create<quake::ExtractRefOp>(loc, qubits, indexValue);
     qubitRefs[index] = ref;
     return ref;
   }
 
   mlir::Value createAngleValue(double angle) {
-    return builder.create<mlir::arith::ConstantFloatOp>(loc, llvm::APFloat{angle}, builder.getF64Type());
+    return builder.create<mlir::arith::ConstantFloatOp>(
+        loc, llvm::APFloat{angle}, builder.getF64Type());
   }
 
-  mlir::OpBuilder& builder;
-  mlir::Location& loc;
-  mlir::Value& qubits;
+  mlir::OpBuilder &builder;
+  mlir::Location &loc;
+  mlir::Value &qubits;
 
-  std::unordered_map<std::size_t, mlir::Value> qubitRefs = std::unordered_map<std::size_t, mlir::Value>();
+  std::unordered_map<std::size_t, mlir::Value> qubitRefs =
+      std::unordered_map<std::size_t, mlir::Value>();
 };
 
 class StateDecomposer {
 public:
-  StateDecomposer(StateGateBuilder& b, std::vector<std::complex<double>>& a): builder(b), amplitudes(a), numQubits(log2(a.size())) {}
+  StateDecomposer(StateGateBuilder &b, std::vector<std::complex<double>> &a)
+      : builder(b), amplitudes(a), numQubits(log2(a.size())) {}
 
   /// @brief Decompose the input state vector data to a set of controlled
   /// operations and rotations. This function takes as input a `OpBuilder`
@@ -112,8 +117,9 @@ class StateDecomposer {
     }
 
     // N.B: The algorithm, as described in the paper, creates a circuit that
-    // begins with a target state and brings it to the all zero state. Hence, this
-    // implementation do the two steps described in Section III in reverse order.
+    // begins with a target state and brings it to the all zero state. Hence,
+    // this implementation does the two steps described in Section III in
+    // reverse order.
 
     // Apply uniformly controlled y-rotations, the construction in Eq. (4).
     for (std::size_t j = 1; j <= numQubits; ++j) {
@@ -142,7 +148,8 @@ class StateDecomposer {
 private:
   /// @brief Apply a uniformly controlled rotation on the target qubit.
   template <typename Op>
-  void applyRotation(const std::span<double> alphas, std::size_t numControls, std::size_t target) {
+  void applyRotation(const std::span<double> alphas, std::size_t numControls,
+                     std::size_t target) {
     auto thetas = cudaq::details::convertAngles(alphas);
     if (numControls == 0) {
       builder.applyRotationOp<Op>(thetas[0], target);
@@ -157,7 +164,7 @@ class StateDecomposer {
     }
   }
 
-  StateGateBuilder& builder;
+  StateGateBuilder &builder;
   std::span<std::complex<double>> amplitudes;
   std::size_t numQubits;
 };
diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
index 86bb911a3a..785e70b3f8 100644
--- a/lib/Optimizer/Transforms/StatePreparation.cpp
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -7,6 +7,7 @@
  ******************************************************************************/
 
 #include "PassDetails.h"
+#include "StateDecomposer.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
 #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
@@ -24,9 +25,6 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include <span>
-#include "StateDecomposer.h"
-
-#include <iostream>
 
 #define DEBUG_TYPE "state-preparation"
 
@@ -38,13 +36,14 @@ using namespace mlir;
 ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
 ///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
 ///     %1 = math.cttz %0 : i64
-///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) -> !cc.ptr<complex<f32>> 
-///     %3 = quake.alloca !quake.veq<?>[%1 : i64]
-///     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
-///     return
+///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) -> !cc.ptr<complex<f32>>
+///     %3 = quake.alloca !quake.veq<?>[%1 : i64]
+///     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
+///     return
 ///   }
 ///
-/// On a call that passes std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0., M_SQRT1_2} as arg0:
+/// On a call that passes std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0.,
+/// M_SQRT1_2} as arg0:
 ///
 ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
 ///     %0 = quake.alloca !quake.veq<2>
@@ -70,11 +69,11 @@ using namespace mlir;
 namespace {
 
 template <typename T>
-concept IntegralType = std::is_same<T, bool>::value 
-    || std::is_same<T, std::int8_t>::value
-    || std::is_same<T, std::int16_t>::value
-    || std::is_same<T, std::int32_t>::value
-    || std::is_same<T, std::int64_t>::value;
+concept IntegralType =
+    std::is_same<T, bool>::value || std::is_same<T, std::int8_t>::value ||
+    std::is_same<T, std::int16_t>::value ||
+    std::is_same<T, std::int32_t>::value ||
+    std::is_same<T, std::int64_t>::value;
 
 template <typename T>
 concept FloatingType = std::is_same<T, float>::value;
@@ -85,31 +84,33 @@ concept DoubleType = std::is_same<T, double>::value;
 template <typename T>
 concept ComplexDataType = FloatingType<T> || DoubleType<T> || IntegralType<T>;
 
-/// Input was complex<float>/complex<double> but we prefer
-/// complex<double>/complex<float>. Make a copy, extending or truncating the
-/// values.
+/// Input was complex<float> but we prefer
+/// complex<double>. Make a copy, extending the values.
 template <FloatingType From>
-std::vector<std::complex<double>> convertToComplex(std::complex<From> *data, std::uint64_t size) {
+std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
+                                                   std::uint64_t size) {
   auto convertData = std::vector<std::complex<double>>(size);
   for (std::size_t i = 0; i < size; ++i)
     convertData[i] = std::complex<double>{static_cast<double>(data[i].real()),
-                                      static_cast<double>(data[i].imag())};
+                                          static_cast<double>(data[i].imag())};
   return convertData;
 }
 
 template <DoubleType From>
-std::vector<std::complex<double>> convertToComplex(std::complex<From> *data, std::uint64_t size) {
-    return std::vector<std::complex<From>>(data, data+size);
+std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
+                                                   std::uint64_t size) {
+  return std::vector<std::complex<From>>(data, data + size);
 }
 
-/// Input was float/double but we prefer complex<float>/complex<double>.
+/// Input was float/double but we prefer complex<double>.
 /// Make a copy, extending or truncating the values.
 template <ComplexDataType From>
-std::vector<std::complex<double>> convertToComplex(From *data, std::uint64_t size) {
+std::vector<std::complex<double>> convertToComplex(From *data,
+                                                   std::uint64_t size) {
   auto convertData = std::vector<std::complex<double>>(size);
   for (std::size_t i = 0; i < size; ++i)
-    convertData[i] =
-        std::complex<double>{static_cast<double>(data[i]), static_cast<double>(0.0)};
+    convertData[i] = std::complex<double>{static_cast<double>(data[i]),
+                                          static_cast<double>(0.0)};
   return convertData;
 }
 
@@ -118,39 +119,15 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
                                unsigned &counter, BlockArgument argument,
                                std::vector<std::complex<double>> &vec) {
   auto *ctx = builder.getContext();
-  // builder.setInsertionPointToStart(argument.getOwner());
   auto argLoc = argument.getLoc();
 
-  // TODO: look at quake.init_state instructions from vector data and track them
-  // to the argument vector, then replace the instruction by gates preparing the
-  // state (or a call to a kernel with gates)
-
-  ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
-  ///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
-  ///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) ->
-  ///     !cc.ptr<complex<f32>>
-  ///
-  ///     %3 = quake.alloca !quake.veq<?>[%1 : i64]
-  ///     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>)
-  ///     -> !quake.veq<?> return
-  ///   }
-
-  /// =>
-
-  ///     ...
-  ///     %5 = quake.alloca !quake.veq<?>[%3 : i64]
-  ///     %6 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
-  ///     quake.h %6 : (!quake.ref) -> ()
-  ///     %7 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
-  ///     %8 = quake.extract_ref %5[1] : (!quake.veq<?>) -> !quake.ref
-  ///     quake.x [%7] %8 : (!quake.ref, !quake.ref) -> ()
-
-  auto toErase = std::vector<mlir::Operation*>();
+  auto toErase = std::vector<mlir::Operation *>();
 
   for (auto *argUser : argument.getUsers()) {
     // Handle the `StdvecSize` and `quake.alloca` use case:
     // - Replace a `vec.size()` with the vector length.
-    // - Replace the number of qubits calculation with the vector length logarithm.
+    // - Replace the number of qubits calculation with the vector length
+    // logarithm.
     // - Replace `quake.alloca` with a constant size qvector allocation.
     if (auto stdvecSizeOp = dyn_cast<cudaq::cc::StdvecSizeOp>(argUser)) {
       builder.setInsertionPointAfter(stdvecSizeOp);
@@ -160,9 +137,10 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
       Value numQubits = builder.create<arith::ConstantIntOp>(
           argLoc, log2(vec.size()), stdvecSizeOp.getType());
 
-      for (auto *sizeUser: argUser->getUsers()) {
-        if (auto countZeroesOp = dyn_cast<mlir::math::CountTrailingZerosOp>(sizeUser)) {
-          for (auto *numQubitsUser: sizeUser->getUsers()) {
+      for (auto *sizeUser : argUser->getUsers()) {
+        if (auto countZeroesOp =
+                dyn_cast<mlir::math::CountTrailingZerosOp>(sizeUser)) {
+          for (auto *numQubitsUser : sizeUser->getUsers()) {
             if (auto quakeAllocaOp = dyn_cast<quake::AllocaOp>(numQubitsUser)) {
               builder.setInsertionPointAfter(quakeAllocaOp);
               auto veqTy = quake::VeqType::get(ctx, log2(vec.size()));
@@ -175,7 +153,7 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
           toErase.push_back(countZeroesOp);
         }
       }
-      
+
       stdvecSizeOp.replaceAllUsesWith(length);
       toErase.push_back(stdvecSizeOp);
       continue;
@@ -202,7 +180,7 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
     }
   }
 
-  for (auto& op: toErase) {
+  for (auto &op : toErase) {
     op->erase();
   }
 
@@ -249,9 +227,7 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
   }
 
   void runOnOperation() override final {
-    std::cout << "Module before state prep " << std::endl;
     auto module = getModule();
-    module.dump();
     unsigned counter = 0;
 
     if (args == nullptr || kernelName.empty()) {
@@ -286,13 +262,12 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
 
       // Get the argument type
       auto type = argument.getType();
-      // auto loc = argument.getLoc();
 
       if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(type)) {
         if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
-          std::cout << "State pointer found, TODO: call a kernel that created "
-                       "the state"
-                    << std::endl;
+          funcOp.emitOpError(
+              "State preparation from cudaq::state is not supported.");
+          return;
         }
       }
 
@@ -301,9 +276,7 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
       if (auto vecTy = dyn_cast<cudaq::cc::StdvecType>(type)) {
         auto eleTy = vecTy.getElementType();
         if (!isa<IntegerType, FloatType, ComplexType>(eleTy)) {
-          funcOp.emitOpError("synthesis: unsupported argument type");
-          signalPassFailure();
-          return;
+          continue;
         }
         char *ptrToSizeInBuffer = static_cast<char *>(args) + offset;
         auto sizeFromBuffer =
@@ -328,10 +301,7 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
     char *bufferAppendix = static_cast<char *>(args) + structSize;
     for (auto [idx, eleTy, vecLength] : stdVecInfo) {
       if (!eleTy) {
-        // FIXME: Skip struct values.
         bufferAppendix += vecLength;
-        funcOp.emitOpError(
-            "argument to kernel may be a struct and was not synthesized");
         continue;
       }
       auto doVector = [&]<typename T>(T) {
@@ -385,8 +355,6 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
         continue;
       }
     }
-    std::cout << "Module after state preparation " << std::endl;
-    module.dump();
   }
 };
 
diff --git a/program.py b/program.py
deleted file mode 100644
index e282d8cd5d..0000000000
--- a/program.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-import numpy as np
-import cudaq
-
-import cudaq
-import numpy as np
-
-cudaq.reset_target()
-
-cudaq.set_target('nvidia')
-#cudaq.set_target('nvidia-mqpu')
-# cudaq.set_target('density-matrix-cpu')
-
-
-c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
-                dtype=np.complex128)
-state = cudaq.State.from_data(c)
-
-@cudaq.kernel(verbose=True)
-def kernel(vec: cudaq.State):
-    q = cudaq.qvector(vec)
-
-print(kernel)
-print(cudaq.to_qir(kernel))
-
-#print(cudaq.get_target())
-#counts = cudaq.sample(kernel, state)
-#print(counts)
\ No newline at end of file
diff --git a/python/tests/kernel/test_kernel_qvector_init.py b/python/tests/kernel/test_kernel_qvector_init.py
index ddaeb6cc4d..f998a82dd1 100644
--- a/python/tests/kernel/test_kernel_qvector_init.py
+++ b/python/tests/kernel/test_kernel_qvector_init.py
@@ -20,28 +20,8 @@
 
 
 # float
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_float_params_f64():
-
+def test_kernel_float_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel(vec: list[float]):
-        q = cudaq.qvector(vec)
-
-    counts = cudaq.sample(kernel, f)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_float_params_f32():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
 
@@ -156,10 +136,8 @@ def kernel():
 # complex
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_complex_params_rotate_f64():
+def test_kernel_complex_params_rotate():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [0. + 0j, 0., 0., 1.]
 
@@ -179,50 +157,8 @@ def kernel(vec: list[complex]):
     assert '10' in counts
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_complex_params_rotate_f32():
+def test_kernel_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
-
-    c = [0. + 0j, 0., 0., 1.]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-        x(q.front())
-        y(q.back())
-        h(q)
-        mz(q)
-
-    counts = cudaq.sample(kernel, c)
-    print(f'rotate: {counts}')
-    assert '11' in counts
-    assert '00' in counts
-    assert '01' in counts
-    assert '10' in counts
-
-
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_complex_params_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_complex_params_f32():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -337,10 +273,8 @@ def kernel():
 # np arrays
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_dtype_complex_params_f64():
+def test_kernel_dtype_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -354,10 +288,8 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_dtype_complex128_params_f64():
+def test_kernel_dtype_complex128_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -425,10 +357,8 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_amplitudes_complex_params_f64():
+def test_kernel_amplitudes_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = cudaq.amplitudes([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
 
@@ -442,27 +372,8 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_amplitudes_complex_params_f32():
+def test_kernel_amplitudes_complex_from_capture():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
-
-    c = cudaq.amplitudes([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_amplitudes_complex_from_capture_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
 
@@ -476,23 +387,6 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_amplitudes_complex_from_capture_f32():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia')
-
-    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(cudaq.amplitudes(vec))
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_simulation_dtype_np_array_from_capture_f64():
     cudaq.reset_target()
@@ -568,40 +462,8 @@ def kernel():
 # test errors
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_error_invalid_array_size_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    @cudaq.kernel
-    def kernel():
-        qubits = cudaq.qvector(np.array([1., 0., 0.], dtype=complex))
-
-    with pytest.raises(RuntimeError) as e:
-        counts = cudaq.sample(kernel)
-    assert 'Invalid input state size for qvector init (not a power of 2)' in repr(
-        e)
-
-
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_error_invalid_list_size_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    @cudaq.kernel
-    def kernel():
-        qubits = cudaq.qvector([1., 0., 0.])
-
-    with pytest.raises(RuntimeError) as e:
-        counts = cudaq.sample(kernel)
-    assert 'Invalid input state size for qvector init (not a power of 2)' in repr(
-        e)
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_error_invalid_array_size_f32():
+def test_kernel_error_invalid_array_size_():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     @cudaq.kernel
     def kernel():
@@ -613,10 +475,8 @@ def kernel():
         e)
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_error_invalid_list_size_f32():
+def test_kernel_error_invalid_list_size():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     @cudaq.kernel
     def kernel():
@@ -629,6 +489,7 @@ def kernel():
 
 
 def test_kernel_qvector_init_from_param_int():
+    cudaq.reset_target()
 
     @cudaq.kernel
     def kernel(n: int):
@@ -643,6 +504,8 @@ def kernel(n: int):
 
 
 def test_kernel_qvector_init_from_capture_int():
+    cudaq.reset_target()
+
     n = 2
 
     @cudaq.kernel
@@ -658,6 +521,7 @@ def kernel():
 
 
 def test_kernel_qvector_init_from_int():
+    cudaq.reset_target()
 
     @cudaq.kernel
     def kernel():
diff --git a/targettests/execution/from_state.cpp b/targettests/execution/from_state.cpp
deleted file mode 100644
index 55438848cb..0000000000
--- a/targettests/execution/from_state.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
-
-#include <cudaq.h>
-#include "cudaq/builder/kernels.h"
-#include <iostream>
-
-__qpu__ void test(cudaq::state *inState) {
-  cudaq::qvector q(inState);
-}
-
-// CHECK: size 2
-
-int main() {
-  std::vector<std::complex<float>> vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
-  auto state = cudaq::state::from_data(vec);
-  auto counts = cudaq::sample(test, &state);
-  counts.dump();
-
-  printf("size %zu\n", counts.size());
-  return !(counts.size() == 2);
-}
diff --git a/targettests/execution/program.cpp b/targettests/execution/program.cpp
deleted file mode 100644
index be4855e3de..0000000000
--- a/targettests/execution/program.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
-
-#include <cudaq.h>
-#include "cudaq/builder/kernels.h"
-#include <iostream>
-
-__qpu__ void test1(std::vector<cudaq::complex> inState) {
-    cudaq::qvector q1 = inState;
-}
-
-//  __qpu__ void test2(cudaq::state *inState) {
-//    cudaq::qvector q2(inState);
-//    cudaq::x(q2);
-// }
-
-// __qpu__ void test3() {
-//   auto q3 = cudaq::qvector({M_SQRT1_2, 0., 0., M_SQRT1_2});
-// }
-
-
-// error: /workspaces/cuda-quantum/lib/Frontend/nvqpp/ConvertExpr.cpp:392: not yet implemented: argument type conversion
-// __qpu__ void test5(cudaq::state *inState) {
-//   test2(inState);
-// }
-
-
-
-int main() {
-    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-    {
-        // Passing state data as argument (vector<complex>)
-        std::cout << "test1(vec): "  << "\n";
-        auto counts = cudaq::sample(test1, vec);
-        counts.dump();
-    }
-
-    // {
-    //     // Passing state ptr as argument - no support for from_data
-    //
-    //     std::cout << "test2(state): "  << "\n";
-    //     auto state = cudaq::state::from_data(vec);
-    //
-    //     // 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
-    //     auto counts = cudaq::sample(test2, &state);
-    //     counts.dump();
-    // }
-
-    // {
-    //     // Passing a state from another kernel as argument
-    //
-    //     std::cout << "test2(test3): "  << "\n";
-    //     auto state = cudaq::get_state(test3);
-    //
-    //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
-    //     auto counts = cudaq::sample(test2, &state);
-    //     counts.dump();
-    // }
-
-    // {
-    //     // Passing a state to another kernel as argument
-    //
-    //     std::cout << "test4(state): "  << "\n";
-    //     
-    //     auto state = cudaq::state::from_data(vec);
-    //     auto counts = cudaq::sample(test4, &state);
-    // }
-
-    // {
-    //     // Creating a kernel from state and passing its state to another kernel - is it deprecated?
-    //
-        std::cout << "test2(kernel): "  << "\n";
-        std::vector<std::complex<double>> vec{.70710678, 0., 0., 0.70710678};
-        auto kernel = cudaq::make_kernel();
-        auto qubits = kernel.qalloc(2);
-    
-        cudaq::from_state(kernel, qubits, vec);
-        auto counts = cudaq::sample(kernel);
-    //
-    //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
-    //     //auto state = cudaq::get_state(kernel);
-    //     //auto counts = cudaq::sample(test2, &state);
-    //
-         counts.dump();
-    // }
-
-}
\ No newline at end of file
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
new file mode 100644
index 0000000000..dbe9b15d86
--- /dev/null
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+
+#include <cudaq.h>
+#include "cudaq/builder/kernels.h"
+#include <iostream>
+
+__qpu__ void test(std::vector<cudaq::complex> inState) {
+  cudaq::qvector q1 = inState;
+}
+
+__qpu__ void test2() {
+  cudaq::qvector q1({M_SQRT1_2, M_SQRT1_2, 0., 0.});
+}
+
+void printCounts(cudaq::sample_result& result) {
+  for (auto &&[bits, counts] : result) {
+    std::cout << bits << '\n';
+  }
+}
+
+int main() {
+    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    {
+        // Passing state data as argument (kernel mode)
+        auto counts = cudaq::sample(test, vec);
+        printCounts(counts);
+    }
+    
+    {
+        // Using state data inside kernel (kernel mode) - not implemented yet.
+        // auto counts = cudaq::sample(test2);
+        // printCounts(counts);
+    }
+
+    {
+       // Passing state data as argument (builder mode)
+        auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
+        auto qubits = kernel.qalloc(v);
+    
+        auto counts = cudaq::sample(kernel, vec);
+        printCounts(counts);
+    }
+}
+
+// CHECK: 01
+// CHECK: 00
+
+// CHECK: 01
+// CHECK: 00
\ No newline at end of file

From 3660e278407719c7aa7ba82f93f08261dc936635 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 26 Jun 2024 09:51:42 -0700
Subject: [PATCH 5/9] Updated test

---
 .../execution/state_preparation_vector.cpp    | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index dbe9b15d86..d415072ce7 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -16,10 +16,6 @@ __qpu__ void test(std::vector<cudaq::complex> inState) {
   cudaq::qvector q1 = inState;
 }
 
-__qpu__ void test2() {
-  cudaq::qvector q1({M_SQRT1_2, M_SQRT1_2, 0., 0.});
-}
-
 void printCounts(cudaq::sample_result& result) {
   for (auto &&[bits, counts] : result) {
     std::cout << bits << '\n';
@@ -28,20 +24,18 @@ void printCounts(cudaq::sample_result& result) {
 
 int main() {
     std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
     {
         // Passing state data as argument (kernel mode)
         auto counts = cudaq::sample(test, vec);
         printCounts(counts);
-    }
-    
-    {
-        // Using state data inside kernel (kernel mode) - not implemented yet.
-        // auto counts = cudaq::sample(test2);
-        // printCounts(counts);
+
+        counts = cudaq::sample(test, vec1);
+        printCounts(counts);
     }
 
     {
-       // Passing state data as argument (builder mode)
+        // Passing state data as argument (builder mode)
         auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
         auto qubits = kernel.qalloc(v);
     
@@ -53,5 +47,8 @@ int main() {
 // CHECK: 01
 // CHECK: 00
 
+// CHECK: 10
+// CHECK: 10
+
 // CHECK: 01
 // CHECK: 00
\ No newline at end of file

From 8cbc1f6905babbfe1e123840d9d1b6e1a00747fa Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 26 Jun 2024 12:37:52 -0700
Subject: [PATCH 6/9] Fix test failures

---
 .../tests/kernel/test_kernel_qvector_init.py  | 21 +++++++++++++++++--
 .../execution/state_preparation_vector.cpp    | 16 ++++++++------
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/python/tests/kernel/test_kernel_qvector_init.py b/python/tests/kernel/test_kernel_qvector_init.py
index f998a82dd1..6f2fd07152 100644
--- a/python/tests/kernel/test_kernel_qvector_init.py
+++ b/python/tests/kernel/test_kernel_qvector_init.py
@@ -5,11 +5,18 @@
 # This source code and the accompanying materials are made available under     #
 # the terms of the Apache License 2.0 which accompanies this distribution.     #
 # ============================================================================ #
+
+import os, sys
 import pytest
 
 import cudaq
 import numpy as np
 
+## [PYTHON_VERSION_FIX]
+skipIfPythonLessThan39 = pytest.mark.skipif(
+    sys.version_info < (3, 9),
+    reason="built-in collection types such as `list` not supported")
+
 skipIfNvidiaFP64NotInstalled = pytest.mark.skipif(
     not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia-fp64')),
     reason='Could not find nvidia-fp64 in installation')
@@ -18,8 +25,10 @@
     not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia')),
     reason='Could not find nvidia in installation')
 
-
 # float
+
+
+@skipIfPythonLessThan39
 def test_kernel_float_params():
     cudaq.reset_target()
 
@@ -136,6 +145,7 @@ def kernel():
 # complex
 
 
+@skipIfPythonLessThan39
 def test_kernel_complex_params_rotate():
     cudaq.reset_target()
 
@@ -157,6 +167,7 @@ def kernel(vec: list[complex]):
     assert '10' in counts
 
 
+@skipIfPythonLessThan39
 def test_kernel_complex_params():
     cudaq.reset_target()
 
@@ -273,6 +284,7 @@ def kernel():
 # np arrays
 
 
+@skipIfPythonLessThan39
 def test_kernel_dtype_complex_params():
     cudaq.reset_target()
 
@@ -288,6 +300,7 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
+@skipIfPythonLessThan39
 def test_kernel_dtype_complex128_params():
     cudaq.reset_target()
 
@@ -357,6 +370,7 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
+@skipIfPythonLessThan39
 def test_kernel_amplitudes_complex_params():
     cudaq.reset_target()
 
@@ -372,6 +386,7 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
+@skipIfPythonLessThan39
 def test_kernel_amplitudes_complex_from_capture():
     cudaq.reset_target()
 
@@ -462,7 +477,8 @@ def kernel():
 # test errors
 
 
-def test_kernel_error_invalid_array_size_():
+@skipIfPythonLessThan39
+def test_kernel_error_invalid_array_size():
     cudaq.reset_target()
 
     @cudaq.kernel
@@ -475,6 +491,7 @@ def kernel():
         e)
 
 
+@skipIfPythonLessThan39
 def test_kernel_error_invalid_list_size():
     cudaq.reset_target()
 
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index d415072ce7..ef4ea69b92 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -17,7 +17,13 @@ __qpu__ void test(std::vector<cudaq::complex> inState) {
 }
 
 void printCounts(cudaq::sample_result& result) {
+  std::vector<std::string> values{};
   for (auto &&[bits, counts] : result) {
+    values.push_back(bits);
+  }
+
+  std::sort(values.begin(), values.end());
+  for (auto &&bits : values) {
     std::cout << bits << '\n';
   }
 }
@@ -44,11 +50,9 @@ int main() {
     }
 }
 
-// CHECK: 01
 // CHECK: 00
-
-// CHECK: 10
-// CHECK: 10
-
 // CHECK: 01
-// CHECK: 00
\ No newline at end of file
+// CHECK: 10
+// CHECK: 11
+// CHECK: 00
+// CHECK: 01
\ No newline at end of file

From 6d4433d5cf40835dfb42c67f180062b8aac7d601 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 27 Jun 2024 09:46:13 -0700
Subject: [PATCH 7/9] Reverse the order of qubits in state prep

---
 lib/Optimizer/Transforms/StateDecomposer.h    | 13 ++++++++---
 .../tests/backends/test_Quantinuum_kernel.py  | 15 +++++++++++++
 .../tests/kernel/test_kernel_qvector_init.py  | 22 +++++++++++++++++++
 .../execution/state_preparation_vector.cpp    |  4 ++--
 4 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/lib/Optimizer/Transforms/StateDecomposer.h b/lib/Optimizer/Transforms/StateDecomposer.h
index 2d17edb768..b433089258 100644
--- a/lib/Optimizer/Transforms/StateDecomposer.h
+++ b/lib/Optimizer/Transforms/StateDecomposer.h
@@ -150,17 +150,24 @@ class StateDecomposer {
   template <typename Op>
   void applyRotation(const std::span<double> alphas, std::size_t numControls,
                      std::size_t target) {
+
+    // In our model the index 1 (i.e. |01>) in quantum state data
+    // corresponds to qubits[0] = 1 and qubits[1] = 0.
+    // Reverse the order of qubits, as the state preparation algorithm
+    // we use assumes the opposite.
+    auto qubitIndex = [&](std::size_t i) { return numQubits - i - 1; };
+
     auto thetas = cudaq::details::convertAngles(alphas);
     if (numControls == 0) {
-      builder.applyRotationOp<Op>(thetas[0], target);
+      builder.applyRotationOp<Op>(thetas[0], qubitIndex(target));
       return;
     }
 
     auto controlIndices = cudaq::details::getControlIndices(numControls);
     assert(thetas.size() == controlIndices.size());
     for (auto [i, c] : llvm::enumerate(controlIndices)) {
-      builder.applyRotationOp<Op>(thetas[i], target);
-      builder.applyX(c, target);
+      builder.applyRotationOp<Op>(thetas[i], qubitIndex(target));
+      builder.applyX(qubitIndex(c), qubitIndex(target));
     }
   }
 
diff --git a/python/tests/backends/test_Quantinuum_kernel.py b/python/tests/backends/test_Quantinuum_kernel.py
index de072335bf..b0ca043060 100644
--- a/python/tests/backends/test_Quantinuum_kernel.py
+++ b/python/tests/backends/test_Quantinuum_kernel.py
@@ -7,6 +7,7 @@
 # ============================================================================ #
 
 import cudaq, pytest, os, time
+import numpy as np
 from cudaq import spin
 from multiprocessing import Process
 try:
@@ -169,6 +170,20 @@ def kernel():
     result = cudaq.sample(kernel)
 
 
+def test_quantinuum_state_preparation():
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        qubits = cudaq.qvector(vec)
+
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '11' in counts
+    assert '10' in counts
+    assert not '01' in counts
+    assert not '11' in counts
+
+
 # leave for gdb debugging
 if __name__ == "__main__":
     loc = os.path.abspath(__file__)
diff --git a/python/tests/kernel/test_kernel_qvector_init.py b/python/tests/kernel/test_kernel_qvector_init.py
index 6f2fd07152..28260dcb4d 100644
--- a/python/tests/kernel/test_kernel_qvector_init.py
+++ b/python/tests/kernel/test_kernel_qvector_init.py
@@ -25,6 +25,28 @@
     not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia')),
     reason='Could not find nvidia in installation')
 
+# state preparation and synthesis
+
+
+@skipIfPythonLessThan39
+def test_kernel_state_preparation():
+    cudaq.reset_target()
+
+    c = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(vec)
+
+    synthesized = cudaq.synthesize(kernel, c)
+    assert 'quake.init_state' in kernel.__str__()
+    assert not 'quake.init_state' in synthesized.__str__()
+
+    counts = cudaq.sample(synthesized)
+    assert '00' in counts
+    assert '10' in counts
+
+
 # float
 
 
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index ef4ea69b92..fccf6d872c 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -51,8 +51,8 @@ int main() {
 }
 
 // CHECK: 00
-// CHECK: 01
 // CHECK: 10
+// CHECK: 01
 // CHECK: 11
 // CHECK: 00
-// CHECK: 01
\ No newline at end of file
+// CHECK: 10

From 46f247728cf2ca22cda3bbf417007c63db1a1bed Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 27 Jun 2024 10:20:09 -0700
Subject: [PATCH 8/9] Fixed failing tests

---
 python/tests/backends/test_Quantinuum_kernel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tests/backends/test_Quantinuum_kernel.py b/python/tests/backends/test_Quantinuum_kernel.py
index b0ca043060..fc11224f5e 100644
--- a/python/tests/backends/test_Quantinuum_kernel.py
+++ b/python/tests/backends/test_Quantinuum_kernel.py
@@ -178,7 +178,7 @@ def kernel(vec: list[complex]):
 
     state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
     counts = cudaq.sample(kernel, state)
-    assert '11' in counts
+    assert '00' in counts
     assert '10' in counts
     assert not '01' in counts
     assert not '11' in counts

From fb0994f8cb2c8459d715a27c208cef19c58542cb Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 27 Jun 2024 11:27:27 -0700
Subject: [PATCH 9/9] Fix test failure

---
 lib/Optimizer/Transforms/StateDecomposer.h         | 2 +-
 targettests/execution/state_preparation_vector.cpp | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/Optimizer/Transforms/StateDecomposer.h b/lib/Optimizer/Transforms/StateDecomposer.h
index b433089258..a698ac83c2 100644
--- a/lib/Optimizer/Transforms/StateDecomposer.h
+++ b/lib/Optimizer/Transforms/StateDecomposer.h
@@ -95,7 +95,7 @@ class StateGateBuilder {
 
 class StateDecomposer {
 public:
-  StateDecomposer(StateGateBuilder &b, std::vector<std::complex<double>> &a)
+  StateDecomposer(StateGateBuilder &b, std::span<std::complex<double>> a)
       : builder(b), amplitudes(a), numQubits(log2(a.size())) {}
 
   /// @brief Decompose the input state vector data to a set of controlled
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index fccf6d872c..35a628c06a 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -9,7 +9,6 @@
 // RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
 
 #include <cudaq.h>
-#include "cudaq/builder/kernels.h"
 #include <iostream>
 
 __qpu__ void test(std::vector<cudaq::complex> inState) {