From a6d5f4e7ec30a7c6e3391c8647f41e966ef7f1ab Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Mon, 17 Jun 2024 10:08:03 -0700
Subject: [PATCH 01/50] Add a pass for state preparation from vectors

---
 include/cudaq/Optimizer/Transforms/Passes.h   |   2 +
 include/cudaq/Optimizer/Transforms/Passes.td  |  11 ++
 lib/Optimizer/Transforms/CMakeLists.txt       |   1 +
 .../Transforms/GenKernelExecution.cpp         |  34 +++-
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp |  34 ++++
 lib/Optimizer/Transforms/StatePreparation.cpp | 119 +++++++++++++
 program.py                                    |  35 ++++
 .../cudaq/platform/py_alt_launch_kernel.cpp   |   1 +
 runtime/common/BaseRemoteRESTQPU.h            |   1 +
 runtime/common/BaseRestRemoteClient.h         |   1 +
 targettests/execution/from_state.cpp          |  30 ++++
 targettests/execution/from_state_complex.cpp  |  27 +++
 targettests/execution/program.cpp             | 167 ++++++++++++++++++
 13 files changed, 457 insertions(+), 6 deletions(-)
 create mode 100644 lib/Optimizer/Transforms/StatePreparation.cpp
 create mode 100644 program.py
 create mode 100644 targettests/execution/from_state.cpp
 create mode 100644 targettests/execution/from_state_complex.cpp
 create mode 100644 targettests/execution/program.cpp
diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index 996b6e56a7..422032326c 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -40,6 +40,8 @@ std::unique_ptr<mlir::Pass> createLowerToCFGPass();
 std::unique_ptr<mlir::Pass> createObserveAnsatzPass(std::vector<bool> &);
 std::unique_ptr<mlir::Pass> createQuakeAddMetadata();
 std::unique_ptr<mlir::Pass> createQuakeAddDeallocs();
+std::unique_ptr<mlir::Pass> createStatePreparation();
+std::unique_ptr<mlir::Pass> createStatePreparation(std::string_view, void *);
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer();
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, void *);
 std::unique_ptr<mlir::Pass> createRaiseToAffinePass();
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 8d2f0c1821..e5e15a8776 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -512,6 +512,17 @@ def PruneCtrlRelations : Pass<"pruned-ctrl-form", "mlir::func::FuncOp"> {
   }];
 }
 
+// The def name must stay `StatePreparation`: the pass implementation derives
+// from the generated `StatePreparationBase`.
+def StatePreparation : Pass<"state-prep", "mlir::ModuleOp"> {
+  let summary = "Convert state vector data into gates";
+  let description = [{
+    Convert quake representation that includes qubit initialization
+    from data into qubit initialization using gates.
+  }];
+  let constructor = "cudaq::opt::createStatePreparation()";
+}
+
 def QuakeSynthesize : Pass<"quake-synth", "mlir::ModuleOp"> {
   let summary =
     "Synthesize concrete quantum program from Quake code plus runtime values.";
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index 7600efe276..6a51057bd3 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -39,6 +39,7 @@ add_cudaq_library(OptTransforms
   ObserveAnsatz.cpp
   PruneCtrlRelations.cpp
   QuakeAddMetadata.cpp
+  StatePreparation.cpp
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
   RegToMem.cpp
diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp
index c16a4af7dd..68ef5b21b7 100644
--- a/lib/Optimizer/Transforms/GenKernelExecution.cpp
+++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp
@@ -434,8 +434,18 @@ class GenerateKernelExecution
         hasTrailingData = true;
         continue;
       }
-      if (isa<cudaq::cc::PointerType>(currEleTy) &&
-          !isStatePointerType(currEleTy)) {
+      //if (isa<cudaq::cc::PointerType>(currEleTy) &&
+      //    !isStatePointerType(currEleTy)) {
+      if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(currEleTy)) {
+        if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
+          // Special case: if the argument is a `cudaq::state*`, then just pass
+          // the pointer. We can do that in this case because the synthesis step
+          // (which will receive the argument data) is assumed to run in the
+          // same memory space.
+          argPtr = builder.create<cudaq::cc::CastOp>(loc, currEleTy, argPtr);
+          stVal = builder.create<cudaq::cc::InsertValueOp>(loc, stVal.getType(),
+                                                           stVal, argPtr, idx);
+        }
         continue;
       }
 
@@ -941,8 +951,8 @@ class GenerateKernelExecution
         cudaq::cc::numberOfHiddenArgs(hasThisPointer, hiddenSRet);
     if (count > 0 && args.size() >= count &&
         std::all_of(args.begin(), args.begin() + count, [](auto i) {
-          return isa<cudaq::cc::PointerType>(i.getType()) &&
-                 !isStatePointerType(i.getType());
+          return isa<cudaq::cc::PointerType>(i.getType());// &&
+                // !isStatePointerType(i.getType());
         }))
       return args.drop_front(count);
     return args;
@@ -1208,9 +1218,21 @@ class GenerateKernelExecution
         hasTrailingData = true;
         continue;
       }
-      if (isa<cudaq::cc::PointerType>(inTy) && !isStatePointerType(inTy))
+      //if (isa<cudaq::cc::PointerType>(inTy) && !isStatePointerType(inTy))
+      //  continue;
+      if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(inTy)) {
+        if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
+          // Special case: if the argument is a `cudaq::state*`, then just pass
+          // the pointer. We can do that in this case because the synthesis step
+          // (which will receive the argument data) is assumed to run in the
+          // same memory space.
+          Value argPtr = builder.create<cudaq::cc::CastOp>(loc, inTy, arg);
+          stVal = builder.create<cudaq::cc::InsertValueOp>(loc, stVal.getType(),
+                                                           stVal, argPtr, idx);
+        }
         continue;
-
+      }
+      
       stVal = builder.create<cudaq::cc::InsertValueOp>(loc, stVal.getType(),
                                                        stVal, arg, idx);
     }
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index f371a8b9cd..dbb2b00cc8 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -23,10 +23,19 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
 
+#include <iostream>
+
 #define DEBUG_TYPE "quake-synthesizer"
 
 using namespace mlir;
 
+// cudaq::state is defined in the runtime. The compiler will never need to know
+// about its implementation and there should not be a circular build/library
+// dependence because of it. Simply forward declare it, as it is notional.
+namespace cudaq {
+class state;
+}
+
 /// Replace a BlockArgument of a specific type with a concrete instantiation of
 /// that type, and add the generation of that constant as an MLIR Op to the
 /// beginning of the function. For example
@@ -366,7 +375,9 @@ class QuakeSynthesizer
   }
 
   void runOnOperation() override final {
+    std::cout << "Module before synthesis " << std::endl;
     auto module = getModule();
+    module.dump();
     if (args == nullptr || kernelName.empty()) {
       module.emitOpError("Synthesis requires a kernel and the values of the "
                          "arguments passed when it is called.");
@@ -472,6 +483,27 @@ class QuakeSynthesizer
         continue;
       }
 
+      if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(type)) {
+        if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
+          // Special case of a `cudaq::state*` which must be in the same address
+          // space. This references a container to a set of simulation
+          // amplitudes.
+          synthesizeRuntimeArgument<cudaq::state *>(
+              builder, argument, args, offset, sizeof(void *),
+              [=](OpBuilder &builder, cudaq::state **concrete) {
+                Value rawPtr = builder.create<arith::ConstantIntOp>(
+                    loc, reinterpret_cast<std::intptr_t>(*concrete),
+                    sizeof(void *) * 8);
+                auto stateTy = cudaq::cc::StateType::get(builder.getContext());
+                return builder.create<cudaq::cc::CastOp>(
+                    loc, cudaq::cc::PointerType::get(stateTy), rawPtr);
+              });
+          continue;
+        }
+        // N.B. Other pointers will not be materialized and may be in a
+        // different address space.
+      }
+
       // If std::vector<arithmetic> type, add it to the list of vector info.
       // These will be processed when we reach the buffer's appendix.
       if (auto vecTy = dyn_cast<cudaq::cc::StdvecType>(type)) {
@@ -601,6 +633,8 @@ class QuakeSynthesizer
       }
     }
     funcOp.eraseArguments(argsToErase);
+    std::cout << "Module after synthesis " << std::endl; 
+    module.dump();
   }
 };
 
diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
new file mode 100644
index 0000000000..d7868b46ef
--- /dev/null
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -0,0 +1,119 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "PassDetails.h"
+#include "cudaq/Optimizer/Builder/Runtime.h"
+#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Target/LLVMIR/TypeToLLVM.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/RegionUtils.h"
+
+#include <iostream>
+
+#define DEBUG_TYPE "state-preparation"
+
+using namespace mlir;
+
+/// Replace a qubit initialization from vectors with quantum gates.
+/// For example:
+///
+///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
+///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
+///     %1 = math.cttz %0 : i64
+///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) -> !cc.ptr<complex<f32>>
+///     %3 = quake.alloca !quake.veq<?>[%1 : i64]
+///     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
+///     return
+///   }
+///
+/// When called with `std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0.,
+/// M_SQRT1_2}` passed as %arg0, the kernel is rewritten to:
+///
+///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
+///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
+///     %c4_i64 = arith.constant 4 : i64
+///     %3 = math.cttz %c4_i64 : i64
+///     %5 = quake.alloca !quake.veq<?>[%3 : i64]
+///     %6 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
+///     quake.h %6 : (!quake.ref) -> ()
+///     %7 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
+///     %8 = quake.extract_ref %5[1] : (!quake.veq<?>) -> !quake.ref
+///     quake.x [%7] %8 : (!quake.ref, !quake.ref) -> ()
+///   }
+///
+/// Note: we rely on the later synthesis and const prop stages to replace
+/// the argument by a constant and propagate the values and vector size
+/// through those and other instructions.
+
+namespace {
+/// Module pass meant to replace qubit initialization from state data with
+/// gates; for now it only validates its inputs and locates the kernel.
+class StatePreparation
+    : public cudaq::opt::StatePreparationBase<StatePreparation> {
+protected:
+  // The name of the kernel whose state initialization is to be prepared.
+  std::string kernelName;
+
+  // The raw pointer to the runtime arguments. Defaults to null so that a
+  // default-constructed pass is rejected in runOnOperation, not left unset.
+  void *args = nullptr;
+
+public:
+  StatePreparation() = default;
+  StatePreparation(std::string_view kernel, void *a)
+      : kernelName(kernel), args(a) {}
+
+  mlir::ModuleOp getModule() { return getOperation(); }
+
+  void runOnOperation() override final {
+    auto module = getModule();
+    LLVM_DEBUG(llvm::dbgs() << "Module before state prep\n"; module.dump());
+    if (args == nullptr || kernelName.empty()) {
+      module.emitOpError("State preparation requires a kernel and the values "
+                         "of the arguments passed when it is called.");
+      signalPassFailure();
+      return;
+    }
+
+    auto kernelNameInQuake = cudaq::runtime::cudaqGenPrefixName + kernelName;
+    // Get the function we care about (the one with kernelName)
+    auto funcOp = module.lookupSymbol<func::FuncOp>(kernelNameInQuake);
+    if (!funcOp) {
+      module.emitOpError("The kernel '" + kernelName +
+                         "' was not found in the module.");
+      signalPassFailure();
+      return;
+    }
+
+    // Create the builder; it stays unused until the rewrite is implemented.
+    auto builder = OpBuilder::atBlockBegin(&funcOp.getBody().front());
+
+    LLVM_DEBUG(llvm::dbgs() << "Module after state prep\n"; module.dump());
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> cudaq::opt::createStatePreparation() {
+  return std::make_unique<StatePreparation>();
+}
+
+std::unique_ptr<mlir::Pass>
+cudaq::opt::createStatePreparation(std::string_view kernelName, void *a) {
+  return std::make_unique<StatePreparation>(kernelName, a);
+}
diff --git a/program.py b/program.py
new file mode 100644
index 0000000000..e282d8cd5d
--- /dev/null
+++ b/program.py
@@ -0,0 +1,35 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+
+import numpy as np
+import cudaq
+
+import cudaq
+import numpy as np
+
+cudaq.reset_target()
+
+cudaq.set_target('nvidia')
+#cudaq.set_target('nvidia-mqpu')
+# cudaq.set_target('density-matrix-cpu')
+
+
+c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
+                dtype=np.complex128)
+state = cudaq.State.from_data(c)
+
+@cudaq.kernel(verbose=True)
+def kernel(vec: cudaq.State):
+    q = cudaq.qvector(vec)
+
+print(kernel)
+print(cudaq.to_qir(kernel))
+
+#print(cudaq.get_target())
+#counts = cudaq.sample(kernel, state)
+#print(counts)
\ No newline at end of file
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 51f86ce15f..ff0c0ce477 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -474,6 +474,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
 
   PassManager pm(context);
   pm.addPass(createCanonicalizerPass());
+  pm.addPass(cudaq::opt::createStatePreparation(name, rawArgs));
   pm.addPass(cudaq::opt::createQuakeSynthesizer(name, rawArgs));
   pm.addPass(cudaq::opt::createExpandMeasurementsPass());
   pm.addNestedPass<func::FuncOp>(cudaq::opt::createClassicalMemToReg());
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index aa36a0c62d..08f41e60ec 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -401,6 +401,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     if (updatedArgs) {
       cudaq::info("Run Quake Synth.\n");
       mlir::PassManager pm(&context);
+      pm.addPass(cudaq::opt::createStatePreparation(kernelName, updatedArgs));
       pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
       if (disableMLIRthreading || enablePrintMLIREachPass)
         moduleOp.getContext()->disableMultithreading();
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index 17c235a76b..9325d0345d 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -153,6 +153,7 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
       if (args) {
         cudaq::info("Run Quake Synth.\n");
         mlir::PassManager pm(&mlirContext);
+        pm.addPass(cudaq::opt::createStatePreparation(name, args));
         pm.addPass(cudaq::opt::createQuakeSynthesizer(name, args));
         if (failed(pm.run(moduleOp)))
           throw std::runtime_error("Could not successfully apply quake-synth.");
diff --git a/targettests/execution/from_state.cpp b/targettests/execution/from_state.cpp
new file mode 100644
index 0000000000..55438848cb
--- /dev/null
+++ b/targettests/execution/from_state.cpp
@@ -0,0 +1,30 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
+
+#include <cudaq.h>
+#include "cudaq/builder/kernels.h"
+#include <iostream>
+
+__qpu__ void test(cudaq::state *inState) {
+  cudaq::qvector q(inState);
+}
+
+// CHECK: size 2
+
+int main() {
+  std::vector<std::complex<float>> vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
+  auto state = cudaq::state::from_data(vec);
+  auto counts = cudaq::sample(test, &state);
+  counts.dump();
+
+  printf("size %zu\n", counts.size());
+  return !(counts.size() == 2);
+}
diff --git a/targettests/execution/from_state_complex.cpp b/targettests/execution/from_state_complex.cpp
new file mode 100644
index 0000000000..5ca8813393
--- /dev/null
+++ b/targettests/execution/from_state_complex.cpp
@@ -0,0 +1,27 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
+
+#include <cudaq.h>
+
+__qpu__ void test(std::vector<cudaq::complex> inState) {
+  cudaq::qvector q = inState;
+}
+
+// CHECK: size 2
+
+int main() {
+  std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
+  auto counts = cudaq::sample(test, vec);
+  counts.dump();
+
+  printf("size %zu\n", counts.size());
+  return !(counts.size() == 2);
+}
diff --git a/targettests/execution/program.cpp b/targettests/execution/program.cpp
new file mode 100644
index 0000000000..b6a12ebb57
--- /dev/null
+++ b/targettests/execution/program.cpp
@@ -0,0 +1,167 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
+
+#include <cudaq.h>
+#include "cudaq/builder/kernels.h"
+#include <iostream>
+
+__qpu__ void test1(std::vector<cudaq::complex> inState) {
+    cudaq::qvector q1 = inState;
+    h(q1[0]);
+    cx(q1[0], q1[1]);
+
+}
+
+//  __qpu__ void test2(cudaq::state *inState) {
+//    cudaq::qvector q2(inState);
+//    cudaq::x(q2);
+// }
+
+// __qpu__ void test3() {
+//   auto q3 = cudaq::qvector({M_SQRT1_2, 0., 0., M_SQRT1_2});
+// }
+
+// error: /workspaces/cuda-quantum/lib/Frontend/nvqpp/ConvertExpr.cpp:1938: not yet implemented: unknown function, get_state, in cudaq namespace
+// __qpu__ void test4() {
+//   cudaq::qvector q(cudaq::get_state(test3));
+// }
+
+// error: /workspaces/cuda-quantum/lib/Frontend/nvqpp/ConvertExpr.cpp:392: not yet implemented: argument type conversion
+// __qpu__ void test5(cudaq::state *inState) {
+//   test2(inState);
+// }
+
+
+
+int main() {
+    std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
+
+    {
+        // Passing state data as argument (vector<complex>)
+
+        // Before synthesis:
+
+        // func.func @__nvqpp__mlirgen__function_test1._Z5test1St6vectorISt7complexIfESaIS1_EE(%arg0: !cc.stdvec<complex<f32>>) attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+        //     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
+        //     %1 = math.cttz %0 : i64
+        //     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) -> !cc.ptr<complex<f32>>
+        //     %3 = quake.alloca !quake.veq<?>[%1 : i64]
+        //     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
+        //     return
+        // }
+
+        // After synthesis
+
+        // func.func @__nvqpp__mlirgen__function_test1._Z5test1St6vectorISt7complexIfESaIS1_EE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+        //     %0 = cc.const_array [0.707106769 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.707106769 : f32, 0.000000e+00 : f32] : !cc.array<complex<f32> x 4>
+        //     %1 = cc.alloca !cc.array<complex<f32> x 4>
+        //     cc.store %0, %1 : !cc.ptr<!cc.array<complex<f32> x 4>>
+        //     %2 = cc.cast %1 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+        //     %c4_i64 = arith.constant 4 : i64
+        //     %3 = math.cttz %c4_i64 : i64                        // (TODO: replace by a const)
+        //     %4 = quake.alloca !quake.veq<?>[%3 : i64]
+        //     %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?> // TODO: replace by gates
+        //     return
+        // }
+
+        // TODO: in StatePreparation pass
+        // input - vector<double>, qubits
+        // output - MLIR replacing alloca+state_init instructions with gates on qubits
+
+        // %3 = math.cttz %c4_i64 : i64
+        // %4 = quake.alloca !quake.veq<?>[%3 : i64]
+        // %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
+
+        // => (something like)
+
+        // create a function that does the following and call it on qubits
+        // %6 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
+        // quake.ry (%cst) %6 : (f64, !quake.ref) -> ()
+        // ...
+
+        // TODO: Run state preparation pass before synthesis 
+
+        std::cout << "test1(vec): "  << "\n";
+        auto counts = cudaq::sample(test1, vec);
+        counts.dump();
+    }
+
+    // {
+    //     // Passing state ptr as argument - no support for from_data
+
+    //     // "func.func"() ({
+    //     // ^bb0(%arg0: !cc.ptr<!cc.state>):
+    //     //     %0 = "func.call"(%arg0) {callee = @__nvqpp_cudaq_state_numberOfQubits} : (!cc.ptr<!cc.state>) -> i64
+    //     //     %1 = "quake.alloca"(%0) : (i64) -> !quake.veq<?>
+    //     //     %2 = "quake.init_state"(%1, %arg0) : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+    //     //     "func.return"() : () -> ()
+    //     // }) {"cudaq-entrypoint", "cudaq-kernel", function_type = (!cc.ptr<!cc.state>) -> (), no_this, sym_name = "__nvqpp__mlirgen__function_test2._Z5test2PN5cudaq5stateE"} : () -> ()
+        
+    //     std::cout << "test2(state): "  << "\n";
+    //     auto state = cudaq::state::from_data(vec);
+
+    //     // 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
+    //     //auto counts = cudaq::sample(test2, &state);
+    //     //counts.dump();
+    // }
+
+    // {
+    //     // Passing a state from another kernel as argument
+
+    //     // "func.func"() ({
+    //     // ^bb0(%arg0: !cc.ptr<!cc.state>):
+    //     //     %0 = "func.call"(%arg0) {callee = @__nvqpp_cudaq_state_numberOfQubits} : (!cc.ptr<!cc.state>) -> i64
+    //     //     %1 = "quake.alloca"(%0) : (i64) -> !quake.veq<?>
+    //     //     %2 = "quake.init_state"(%1, %arg0) : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+    //     //     "func.return"() : () -> ()
+    //     // }) {"cudaq-entrypoint", "cudaq-kernel", function_type = (!cc.ptr<!cc.state>) -> (), no_this, sym_name = "__nvqpp__mlirgen__function_test2._Z5test2PN5cudaq5stateE"} : () -> ()
+        
+    //     std::cout << "test2(test3): "  << "\n";
+    //     auto state = cudaq::get_state(test3);
+
+    //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
+    //     auto counts = cudaq::sample(test2, &state);
+    //     counts.dump();
+    // }
+
+    // {
+    //     // Passing a state to another kernel as argument
+    //     std::cout << "test4(state): "  << "\n";
+    //     //auto state = cudaq::state::from_data(vec);
+    //     //auto counts = cudaq::sample(test4, &state);
+    // }
+
+    // {
+    //     // Creating a kernel from state and passing its state to another kernel
+
+    //     // "func.func"() ({
+    //     // ^bb0(%arg0: !cc.ptr<!cc.state>):
+    //     //     %0 = "func.call"(%arg0) {callee = @__nvqpp_cudaq_state_numberOfQubits} : (!cc.ptr<!cc.state>) -> i64
+    //     //     %1 = "quake.alloca"(%0) : (i64) -> !quake.veq<?>
+    //     //     %2 = "quake.init_state"(%1, %arg0) : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+    //     //     "func.return"() : () -> ()
+    //     // }) {"cudaq-entrypoint", "cudaq-kernel", function_type = (!cc.ptr<!cc.state>) -> (), no_this, sym_name = "__nvqpp__mlirgen__function_test2._Z5test2PN5cudaq5stateE"} : () -> ()
+        
+    //     std::cout << "test2(kernel): "  << "\n";
+    //     std::vector<std::complex<double>> vec{.70710678, 0., 0., 0.70710678};
+    //     auto kernel = cudaq::make_kernel();
+    //     auto qubits = kernel.qalloc(2);
+
+    //     cudaq::from_state(kernel, qubits, vec);
+
+    //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
+    //     //auto state = cudaq::get_state(kernel);
+    //     //auto counts = cudaq::sample(test2, &state);
+
+    //     //counts.dump();
+    // }
+
+}
\ No newline at end of file

From 93dd8d7f4ba31cc3869fd7fbaa399631c1cdaa97 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 25 Jun 2024 11:23:37 -0700
Subject: [PATCH 02/50] Implement state preparation

---
 lib/Optimizer/Transforms/CMakeLists.txt       |   1 +
 lib/Optimizer/Transforms/StateDecomposer.cpp  | 128 ++++++++++++++
 lib/Optimizer/Transforms/StateDecomposer.h    | 163 ++++++++++++++++++
 lib/Optimizer/Transforms/StatePreparation.cpp | 151 ++++++++++------
 targettests/execution/program.cpp             | 118 +++----------
 5 files changed, 415 insertions(+), 146 deletions(-)
 create mode 100644 lib/Optimizer/Transforms/StateDecomposer.cpp
 create mode 100644 lib/Optimizer/Transforms/StateDecomposer.h

diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index 6a51057bd3..b0a13571ec 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -39,6 +39,7 @@ add_cudaq_library(OptTransforms
   ObserveAnsatz.cpp
   PruneCtrlRelations.cpp
   QuakeAddMetadata.cpp
+  StateDecomposer.cpp
   StatePreparation.cpp
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
diff --git a/lib/Optimizer/Transforms/StateDecomposer.cpp b/lib/Optimizer/Transforms/StateDecomposer.cpp
new file mode 100644
index 0000000000..3105fad707
--- /dev/null
+++ b/lib/Optimizer/Transforms/StateDecomposer.cpp
@@ -0,0 +1,128 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "StateDecomposer.h"
+
+namespace cudaq::details {
+
+std::vector<std::size_t> grayCode(std::size_t numBits) {
+  std::vector<std::size_t> result(1ULL << numBits); // 2^numBits code words
+  for (std::size_t i = 0; i < (1ULL << numBits); ++i)
+    result[i] = ((i >> 1) ^ i); // i-th binary-reflected Gray code word
+  return result;
+}
+
+std::vector<std::size_t> getControlIndices(std::size_t numBits) {
+  auto code = grayCode(numBits);
+  std::vector<std::size_t> indices;
+  for (auto i = 0u; i < code.size(); ++i) {
+    // The control of the l-th CNOT gate sits at the bit position where the
+    // l-th and (l + 1)-th words of the binary reflected Gray code differ; the
+    // `% code.size()` wraps around so the last word is paired with the first.
+    auto position = std::log2(code[i] ^ code[(i + 1) % code.size()]);
+    // N.B: In CUDA Quantum we write the least significant bit (LSb) on the left
+    //
+    //  lsb -v
+    //       001
+    //         ^- msb
+    //
+    // Meaning that the bitstring 001 represents the number four instead of one.
+    // The above position calculation uses the 'normal' convention of writing
+    // numbers with the LSb on the right.
+    //
+    // Now, what we need to find out is the position of the 1 in the bitstring.
+    // If we take LSb as being position 0, then for the normal convention its
+    // position will be 0. Using CUDA Quantum convention it will be 2. Hence,
+    // we need to convert the position we find using:
+    //
+    // numBits - position - 1
+    //
+    // The extra -1 is to account for indices starting at 0. Using the above
+    // example:
+    //
+    // bitstring: 001
+    // numBits: 3
+    // position: 0
+    //
+    // We have the converted position: 2, which is what we need.
+    indices.emplace_back(numBits - position - 1);
+  }
+  return indices;
+}
+
+std::vector<double> convertAngles(const std::span<double> alphas) {
+  // Implements Eq. (3) from https://arxiv.org/pdf/quant-ph/0407010.pdf:
+  // theta[i] is the mean of the alphas, where alpha[j] is negated when the
+  // bitwise inner product of j and the i-th Gray code word is odd.
+  // N.B: The paper fails to explicitly define the dot operator in the exponent
+  // of -1. Ref. 3 solves the mystery: it's the bitwise inner product.
+  auto bitwiseInnerProduct = [](std::size_t a, std::size_t b) {
+    auto product = a & b; // bits set in both operands
+    auto sumOfProducts = 0;
+    while (product) {
+      sumOfProducts += product & 0b1 ? 1 : 0;
+      product = product >> 1;
+    }
+    return sumOfProducts; // number of bits set in a & b
+  };
+  std::vector<double> thetas(alphas.size(), 0);
+  for (std::size_t i = 0u; i < alphas.size(); ++i) {
+    for (std::size_t j = 0u; j < alphas.size(); ++j)
+      thetas[i] +=
+          bitwiseInnerProduct(j, ((i >> 1) ^ i)) & 0b1 ? -alphas[j] : alphas[j];
+    thetas[i] /= alphas.size();
+  }
+  return thetas;
+}
+
+std::vector<double> getAlphaZ(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k) {
+  // Implements Eq. (5) from https://arxiv.org/pdf/quant-ph/0407010.pdf
+  std::vector<double> angles; // one angle per block j
+  double divisor = static_cast<double>(1ULL << (k - 1)); // 2^(k-1)
+  for (std::size_t j = 1; j <= (1ULL << (numQubits - k)); ++j) {
+    double angle = 0.0;
+    for (std::size_t l = 1; l <= (1ULL << (k - 1)); ++l)
+      // Upper-half entry minus the matching lower-half entry of the j-th
+      // block of 2^k values; the extra '-1' maps 1-based l to 0-based indices.
+      angle += data[(2 * j - 1) * (1 << (k - 1)) + l - 1] -
+               data[(2 * j - 2) * (1 << (k - 1)) + l - 1];
+    angles.push_back(angle / divisor);
+  }
+  return angles;
+}
+
+std::vector<double> getAlphaY(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k) {
+  // Implements Eq. (8) from https://arxiv.org/pdf/quant-ph/0407010.pdf: for
+  // each block j of 2^k entries, angle = 2 * asin(sqrt(upper-half norm^2 /
+  // whole-block norm^2)). The '-1' in the indices maps to 0-based indexing.
+  std::vector<double> angles;
+  for (std::size_t j = 1; j <= (1ULL << (numQubits - k)); ++j) {
+    double numerator = 0; // squared norm of the upper half of block j
+    for (std::size_t l = 1; l <= (1ULL << (k - 1)); ++l) {
+      numerator +=
+          std::pow(std::abs(data[(2 * j - 1) * (1 << (k - 1)) + l - 1]), 2);
+    }
+
+    double denominator = 0; // squared norm of the whole block j
+    for (std::size_t l = 1; l <= (1ULL << k); ++l) {
+      denominator += std::pow(std::abs(data[(j - 1) * (1 << k) + l - 1]), 2);
+    }
+
+    if (denominator == 0.0) { // all-zero block: use a zero angle
+      assert(numerator == 0.0 &&
+             "If the denominator is zero, the numerator must also be zero.");
+      angles.push_back(0.0);
+      continue;
+    }
+    angles.push_back(2.0 * std::asin(std::sqrt(numerator / denominator)));
+  }
+  return angles;
+}
+} // namespace cudaq::details
\ No newline at end of file
diff --git a/lib/Optimizer/Transforms/StateDecomposer.h b/lib/Optimizer/Transforms/StateDecomposer.h
new file mode 100644
index 0000000000..bac6909708
--- /dev/null
+++ b/lib/Optimizer/Transforms/StateDecomposer.h
@@ -0,0 +1,163 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "PassDetails.h"
+#include "cudaq/Optimizer/Builder/Runtime.h"
+#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Target/LLVMIR/TypeToLLVM.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include <span>
+
+#include <iostream>
+
+namespace cudaq::details {
+
+  /// @brief Converts angles of a uniformly controlled rotation to angles of
+  /// non-controlled rotations.
+  std::vector<double> convertAngles(const std::span<double> alphas);
+
+  /// @brief Return the control indices dictated by the gray code implementation.
+  ///
+  /// Here, numBits is the number of controls.
+  std::vector<std::size_t> getControlIndices(std::size_t numBits);
+
+   /// @brief Return angles required to implement a uniformly controlled z-rotation
+  /// on the `kth` qubit.
+  std::vector<double> getAlphaZ(const std::span<double> data,
+                                std::size_t numQubits, std::size_t k);
+
+  /// @brief Return angles required to implement a uniformly controlled y-rotation
+  /// on the `kth` qubit.
+  std::vector<double> getAlphaY(const std::span<double> data,
+                                std::size_t numQubits, std::size_t k);
+} // namespace cudaq::details
+
+class StateGateBuilder {
+public:
+  StateGateBuilder(mlir::OpBuilder& b, mlir::Location& l, mlir::Value& q): builder(b), loc(l), qubits(q) {}
+
+  template<typename Op>
+  void applyRotationOp(double theta, std::size_t target) {
+    auto qubit = createQubitRef(target);
+    auto thetaValue = createAngleValue(theta);
+    builder.create<Op>(loc, thetaValue, mlir::ValueRange{}, qubit);
+  };
+
+  void applyX(std::size_t control, std::size_t target) {
+    auto qubitC = createQubitRef(control);
+    auto qubitT = createQubitRef(target);
+    builder.create<quake::XOp>(loc, qubitC, qubitT);
+  };
+
+private:
+  mlir::Value createQubitRef(std::size_t index) {
+    if (qubitRefs.contains(index)) {
+      return qubitRefs[index];
+    }
+
+    auto indexValue = builder.create<mlir::arith::ConstantIntOp>(loc, index, builder.getIntegerType(64));
+    auto ref = builder.create<quake::ExtractRefOp>(loc, qubits, indexValue);
+    qubitRefs[index] = ref;
+    return ref;
+  }
+
+  mlir::Value createAngleValue(double angle) {
+    return builder.create<mlir::arith::ConstantFloatOp>(loc, llvm::APFloat{angle}, builder.getF64Type());
+  }
+
+  mlir::OpBuilder& builder;
+  mlir::Location& loc;
+  mlir::Value& qubits;
+
+  std::unordered_map<std::size_t, mlir::Value> qubitRefs = std::unordered_map<std::size_t, mlir::Value>();
+};
+
+class StateDecomposer {
+public:
+  StateDecomposer(StateGateBuilder& b, std::vector<std::complex<double>>& a): builder(b), amplitudes(a), numQubits(log2(a.size())) {}
+
+  /// @brief Decompose the input state vector data to a set of controlled
+  /// operations and rotations. This function takes as input a `OpBuilder`
+  /// and appends the operations of the decomposition to its internal
+  /// representation. This implementation follows the algorithm defined in
+  /// `https://arxiv.org/pdf/quant-ph/0407010.pdf`.
+  void decompose() {
+
+    // Decompose the state into phases and magnitudes.
+    bool needsPhaseEqualization = false;
+    std::vector<double> phases;
+    std::vector<double> magnitudes;
+    for (const auto &a : amplitudes) {
+      phases.push_back(std::arg(a));
+      magnitudes.push_back(std::abs(a));
+      // FIXME: remove magic number.
+      needsPhaseEqualization |= std::abs(phases.back()) > 1e-10;
+    }
+
+    // N.B: The algorithm, as described in the paper, creates a circuit that
+    // begins with a target state and brings it to the all zero state. Hence, this
+    // implementation do the two steps described in Section III in reverse order.
+
+    // Apply uniformly controlled y-rotations, the construction in Eq. (4).
+    for (std::size_t j = 1; j <= numQubits; ++j) {
+      auto k = numQubits - j + 1;
+      auto numControls = j - 1;
+      auto target = j - 1;
+      auto alphaYk = cudaq::details::getAlphaY(magnitudes, numQubits, k);
+      applyRotation<quake::RyOp>(alphaYk, numControls, target);
+    }
+
+    if (!needsPhaseEqualization)
+      return;
+
+    // Apply uniformly controlled z-rotations, the construction in Eq. (4).
+    for (std::size_t j = 1; j <= numQubits; ++j) {
+      auto k = numQubits - j + 1;
+      auto numControls = j - 1;
+      auto target = j - 1;
+      auto alphaZk = cudaq::details::getAlphaZ(phases, numQubits, k);
+      if (alphaZk.empty())
+        continue;
+      applyRotation<quake::RzOp>(alphaZk, numControls, target);
+    }
+  }
+
+private:
+  /// @brief Apply a uniformly controlled rotation on the target qubit.
+  template <typename Op>
+  void applyRotation(const std::span<double> alphas, std::size_t numControls, std::size_t target) {
+    auto thetas = cudaq::details::convertAngles(alphas);
+    if (numControls == 0) {
+      builder.applyRotationOp<Op>(thetas[0], target);
+      return;
+    }
+
+    auto controlIndices = cudaq::details::getControlIndices(numControls);
+    assert(thetas.size() == controlIndices.size());
+    for (auto [i, c] : llvm::enumerate(controlIndices)) {
+      builder.applyRotationOp<Op>(thetas[i], target);
+      builder.applyX(c, target);
+    }
+  }
+
+  StateGateBuilder& builder;
+  std::span<std::complex<double>> amplitudes;
+  std::size_t numQubits;
+};
diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
index ce46efecc0..86bb911a3a 100644
--- a/lib/Optimizer/Transforms/StatePreparation.cpp
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -18,10 +18,13 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Complex/IR/Complex.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Target/LLVMIR/TypeToLLVM.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
+#include <span>
+#include "StateDecomposer.h"
 
 #include <iostream>
 
@@ -35,33 +38,44 @@ using namespace mlir;
 ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
 ///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
 ///     %1 = math.cttz %0 : i64
-///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) ->
-///     !cc.ptr<complex<f32>> %3 = quake.alloca !quake.veq<?>[%1 : i64] %4 =
-///     quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) ->
-///     !quake.veq<?> return
+///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) -> !cc.ptr<complex<f32>> 
+///     %3 = quake.alloca !quake.veq<?>[%1 : i64]
+///     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
+///     return
 ///   }
 ///
-/// on call that passes std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0.,
-/// M_SQRT1_2} as arg0 will be updated to:
+/// On a call that passes std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0., M_SQRT1_2} as arg0:
 ///
 ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
-///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
-///     %c4_i64 = arith.constant 4 : i64
-///     %3 = math.cttz %c4_i64 : i64
-///     %5 = quake.alloca !quake.veq<?>[%3 : i64]
-///     %6 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
-///     quake.h %6 : (!quake.ref) -> ()
-///     %7 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
-///     %8 = quake.extract_ref %5[1] : (!quake.veq<?>) -> !quake.ref
-///     quake.x [%7] %8 : (!quake.ref, !quake.ref) -> ()
+///     %0 = quake.alloca !quake.veq<2>
+///     %c0_i64 = arith.constant 0 : i64
+///     %1 = quake.extract_ref %0[%c0_i64] : (!quake.veq<2>, i64) -> !quake.ref
+///     %cst = arith.constant 1.5707963267948968 : f64
+///     quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+///     %c1_i64 = arith.constant 1 : i64
+///     %2 = quake.extract_ref %0[%c1_i64] : (!quake.veq<2>, i64) -> !quake.ref
+///     %cst_0 = arith.constant 1.5707963267948966 : f64
+///     quake.ry (%cst_0) %2 : (f64, !quake.ref) -> ()
+///     quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
+///     %cst_1 = arith.constant -1.5707963267948966 : f64
+///     quake.ry (%cst_1) %2 : (f64, !quake.ref) -> ()
+///     quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
+///     return
 ///   }
 ///
-/// Note: we rely on the later synthesis and const prop stages to replace
+/// Note: the following synthesis and const prop passes will replace
 /// the argument by a constant and propagate the values and vector size
-/// through those and other instructions.
+/// through other instructions.
 
 namespace {
 
+template <typename T>
+concept IntegralType = std::is_same<T, bool>::value 
+    || std::is_same<T, std::int8_t>::value
+    || std::is_same<T, std::int16_t>::value
+    || std::is_same<T, std::int32_t>::value
+    || std::is_same<T, std::int64_t>::value;
+
 template <typename T>
 concept FloatingType = std::is_same<T, float>::value;
 
@@ -69,12 +83,11 @@ template <typename T>
 concept DoubleType = std::is_same<T, double>::value;
 
 template <typename T>
-concept ComplexDataType = FloatingType<T> || DoubleType<T>;
+concept ComplexDataType = FloatingType<T> || DoubleType<T> || IntegralType<T>;
 
 /// Input was complex<float>/complex<double> but we prefer
 /// complex<double>/complex<float>. Make a copy, extending or truncating the
 /// values.
-/// TODO: dont convert if not needed
 template <FloatingType From>
 std::vector<std::complex<double>> convertToComplex(std::complex<From> *data, std::uint64_t size) {
   auto convertData = std::vector<std::complex<double>>(size);
@@ -86,7 +99,7 @@ std::vector<std::complex<double>> convertToComplex(std::complex<From> *data, std
 
 template <DoubleType From>
 std::vector<std::complex<double>> convertToComplex(std::complex<From> *data, std::uint64_t size) {
-    return std::vector<std::complex<double>>(data, size);
+    return std::vector<std::complex<From>>(data, data+size);
 }
 
 /// Input was float/double but we prefer complex<float>/complex<double>.
@@ -104,7 +117,7 @@ LogicalResult
 prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
                                unsigned &counter, BlockArgument argument,
                                std::vector<std::complex<double>> &vec) {
-  // auto *ctx = builder.getContext();
+  auto *ctx = builder.getContext();
   // builder.setInsertionPointToStart(argument.getOwner());
   auto argLoc = argument.getLoc();
 
@@ -132,30 +145,67 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
   ///     %8 = quake.extract_ref %5[1] : (!quake.veq<?>) -> !quake.ref
   ///     quake.x [%7] %8 : (!quake.ref, !quake.ref) -> ()
 
+  auto toErase = std::vector<mlir::Operation*>();
+
   for (auto *argUser : argument.getUsers()) {
+    // Handle the `StdvecSize` and `quake.alloca` use case:
+    // - Replace a `vec.size()` with the vector length.
+    // - Replace the number of qubits calculation with the vector length logarithm.
+    // - Replace `quake.alloca` with a constant size qvector allocation.
+    if (auto stdvecSizeOp = dyn_cast<cudaq::cc::StdvecSizeOp>(argUser)) {
+      builder.setInsertionPointAfter(stdvecSizeOp);
+      Value length = builder.create<arith::ConstantIntOp>(
+          argLoc, vec.size(), stdvecSizeOp.getType());
+
+      Value numQubits = builder.create<arith::ConstantIntOp>(
+          argLoc, log2(vec.size()), stdvecSizeOp.getType());
+
+      for (auto *sizeUser: argUser->getUsers()) {
+        if (auto countZeroesOp = dyn_cast<mlir::math::CountTrailingZerosOp>(sizeUser)) {
+          for (auto *numQubitsUser: sizeUser->getUsers()) {
+            if (auto quakeAllocaOp = dyn_cast<quake::AllocaOp>(numQubitsUser)) {
+              builder.setInsertionPointAfter(quakeAllocaOp);
+              auto veqTy = quake::VeqType::get(ctx, log2(vec.size()));
+              Value newAlloc = builder.create<quake::AllocaOp>(argLoc, veqTy);
+              quakeAllocaOp.replaceAllUsesWith(newAlloc);
+              toErase.push_back(quakeAllocaOp);
+            }
+          }
+          countZeroesOp.replaceAllUsesWith(numQubits);
+          toErase.push_back(countZeroesOp);
+        }
+      }
+      
+      stdvecSizeOp.replaceAllUsesWith(length);
+      toErase.push_back(stdvecSizeOp);
+      continue;
+    }
+
+    // Handle the `StdvecDataOp` and `quake.init_state` use case:
+    // - Replace a `quake.init_state` with gates preparing the state.
     if (auto stdvecDataOp = dyn_cast<cudaq::cc::StdvecDataOp>(argUser)) {
       for (auto *dataUser : stdvecDataOp->getUsers()) {
         if (auto initOp = dyn_cast<quake::InitializeStateOp>(dataUser)) {
           builder.setInsertionPointAfter(initOp);
           // Find the qvector alloc instruction
-          auto qvector = initOp.getOperand(0);
-
-          // Replace!
-          auto zero = builder.create<arith::ConstantIntOp>(
-              argLoc, 0, builder.getIntegerType(64));
-          auto one = builder.create<arith::ConstantIntOp>(
-              argLoc, 1, builder.getIntegerType(64));
-          Value q0 = builder.create<quake::ExtractRefOp>(argLoc, qvector, zero);
-          Value q1 = builder.create<quake::ExtractRefOp>(argLoc, qvector, one);
-          /*auto hval =*/ builder.create<quake::HOp>(argLoc, q0);
-          /*auto xval =*/ builder.create<quake::XOp>(argLoc, q0, q1);
-
-          initOp.replaceAllUsesWith(qvector);
+          auto qubits = initOp.getOperand(0);
+
+          // Prepare state from vector data.
+          auto gateBuilder = StateGateBuilder(builder, argLoc, qubits);
+          auto decomposer = StateDecomposer(gateBuilder, vec);
+          decomposer.decompose();
+
+          initOp.replaceAllUsesWith(qubits);
+          toErase.push_back(initOp);
         }
       }
     }
   }
 
+  for (auto& op: toErase) {
+    op->erase();
+  }
+
   return success();
 }
 
@@ -294,20 +344,20 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
       };
       if (auto ty = dyn_cast<IntegerType>(eleTy)) {
         switch (ty.getIntOrFloatBitWidth()) {
-        // case 1:
-        //   doVector(false);
-        //   break;
-        // case 8:
-        //   doVector(std::int8_t{});
-        //   break;
-        // case 16:
-        //   doVector(std::int16_t{});
-        //   break;
-        // case 32:
-        //   doVector(std::int32_t{});
-        //   break;
-        // case 64:
-        //   doVector(std::int64_t{});
+        case 1:
+          doVector(false);
+          break;
+        case 8:
+          doVector(std::int8_t{});
+          break;
+        case 16:
+          doVector(std::int16_t{});
+          break;
+        case 32:
+          doVector(std::int32_t{});
+          break;
+        case 64:
+          doVector(std::int64_t{});
           break;
         default:
           bufferAppendix += vecLength * cudaq::opt::convertBitsToBytes(
@@ -334,10 +384,9 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
         doVector(std::complex<double>{});
         continue;
       }
-
-      std::cout << "Module after state preparation " << std::endl;
-      module.dump();
     }
+    std::cout << "Module after state preparation " << std::endl;
+    module.dump();
   }
 };
 
diff --git a/targettests/execution/program.cpp b/targettests/execution/program.cpp
index 055084455c..be4855e3de 100644
--- a/targettests/execution/program.cpp
+++ b/targettests/execution/program.cpp
@@ -15,9 +15,6 @@
 
 __qpu__ void test1(std::vector<cudaq::complex> inState) {
     cudaq::qvector q1 = inState;
-    // Should synthesize to
-    // h(q1[0]);
-    // cx(q1[0], q1[1]);
 }
 
 //  __qpu__ void test2(cudaq::state *inState) {
@@ -29,10 +26,6 @@ __qpu__ void test1(std::vector<cudaq::complex> inState) {
 //   auto q3 = cudaq::qvector({M_SQRT1_2, 0., 0., M_SQRT1_2});
 // }
 
-// error: /workspaces/cuda-quantum/lib/Frontend/nvqpp/ConvertExpr.cpp:1938: not yet implemented: unknown function, get_state, in cudaq namespace
-// __qpu__ void test4() {
-//   cudaq::qvector q(cudaq::get_state(test3));
-// }
 
 // error: /workspaces/cuda-quantum/lib/Frontend/nvqpp/ConvertExpr.cpp:392: not yet implemented: argument type conversion
 // __qpu__ void test5(cudaq::state *inState) {
@@ -42,53 +35,9 @@ __qpu__ void test1(std::vector<cudaq::complex> inState) {
 
 
 int main() {
-    std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
-
+    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
     {
         // Passing state data as argument (vector<complex>)
-
-        // Before synthesis:
-
-        // func.func @__nvqpp__mlirgen__function_test1._Z5test1St6vectorISt7complexIfESaIS1_EE(%arg0: !cc.stdvec<complex<f32>>) attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-        //     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
-        //     %1 = math.cttz %0 : i64
-        //     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) -> !cc.ptr<complex<f32>>
-        //     %3 = quake.alloca !quake.veq<?>[%1 : i64]
-        //     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
-        //     return
-        // }
-
-        // After synthesis
-
-        // func.func @__nvqpp__mlirgen__function_test1._Z5test1St6vectorISt7complexIfESaIS1_EE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-        //     %0 = cc.const_array [0.707106769 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.707106769 : f32, 0.000000e+00 : f32] : !cc.array<complex<f32> x 4>
-        //     %1 = cc.alloca !cc.array<complex<f32> x 4>
-        //     cc.store %0, %1 : !cc.ptr<!cc.array<complex<f32> x 4>>
-        //     %2 = cc.cast %1 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-        //     %c4_i64 = arith.constant 4 : i64
-        //     %3 = math.cttz %c4_i64 : i64                        // (TODO: replace by a const)
-        //     %4 = quake.alloca !quake.veq<?>[%3 : i64]
-        //     %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?> // TODO: replace by gates
-        //     return
-        // }
-
-        // TODO: in StatePreparation pass
-        // input - vector<double>, qubits
-        // output - MLIR replacing alloca+state_init instructions with gates on qubits
-
-        // %3 = math.cttz %c4_i64 : i64
-        // %4 = quake.alloca !quake.veq<?>[%3 : i64]
-        // %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
-
-        // => (something like)
-
-        // create a function that does the following and call it on qubits
-        // %6 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
-        // quake.ry (%cst) %6 : (f64, !quake.ref) -> ()
-        // ...
-
-        // TODO: Run state preparation pass before synthesis 
-
         std::cout << "test1(vec): "  << "\n";
         auto counts = cudaq::sample(test1, vec);
         counts.dump();
@@ -96,37 +45,21 @@ int main() {
 
     // {
     //     // Passing state ptr as argument - no support for from_data
-
-    //     // "func.func"() ({
-    //     // ^bb0(%arg0: !cc.ptr<!cc.state>):
-    //     //     %0 = "func.call"(%arg0) {callee = @__nvqpp_cudaq_state_numberOfQubits} : (!cc.ptr<!cc.state>) -> i64
-    //     //     %1 = "quake.alloca"(%0) : (i64) -> !quake.veq<?>
-    //     //     %2 = "quake.init_state"(%1, %arg0) : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
-    //     //     "func.return"() : () -> ()
-    //     // }) {"cudaq-entrypoint", "cudaq-kernel", function_type = (!cc.ptr<!cc.state>) -> (), no_this, sym_name = "__nvqpp__mlirgen__function_test2._Z5test2PN5cudaq5stateE"} : () -> ()
-        
+    //
     //     std::cout << "test2(state): "  << "\n";
     //     auto state = cudaq::state::from_data(vec);
-
+    //
     //     // 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
-    //     //auto counts = cudaq::sample(test2, &state);
-    //     //counts.dump();
+    //     auto counts = cudaq::sample(test2, &state);
+    //     counts.dump();
     // }
 
     // {
     //     // Passing a state from another kernel as argument
-
-    //     // "func.func"() ({
-    //     // ^bb0(%arg0: !cc.ptr<!cc.state>):
-    //     //     %0 = "func.call"(%arg0) {callee = @__nvqpp_cudaq_state_numberOfQubits} : (!cc.ptr<!cc.state>) -> i64
-    //     //     %1 = "quake.alloca"(%0) : (i64) -> !quake.veq<?>
-    //     //     %2 = "quake.init_state"(%1, %arg0) : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
-    //     //     "func.return"() : () -> ()
-    //     // }) {"cudaq-entrypoint", "cudaq-kernel", function_type = (!cc.ptr<!cc.state>) -> (), no_this, sym_name = "__nvqpp__mlirgen__function_test2._Z5test2PN5cudaq5stateE"} : () -> ()
-        
+    //
     //     std::cout << "test2(test3): "  << "\n";
     //     auto state = cudaq::get_state(test3);
-
+    //
     //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
     //     auto counts = cudaq::sample(test2, &state);
     //     counts.dump();
@@ -134,34 +67,29 @@ int main() {
 
     // {
     //     // Passing a state to another kernel as argument
+    //
     //     std::cout << "test4(state): "  << "\n";
-    //     //auto state = cudaq::state::from_data(vec);
-    //     //auto counts = cudaq::sample(test4, &state);
+    //     
+    //     auto state = cudaq::state::from_data(vec);
+    //     auto counts = cudaq::sample(test4, &state);
     // }
 
     // {
-    //     // Creating a kernel from state and passing its state to another kernel
-
-    //     // "func.func"() ({
-    //     // ^bb0(%arg0: !cc.ptr<!cc.state>):
-    //     //     %0 = "func.call"(%arg0) {callee = @__nvqpp_cudaq_state_numberOfQubits} : (!cc.ptr<!cc.state>) -> i64
-    //     //     %1 = "quake.alloca"(%0) : (i64) -> !quake.veq<?>
-    //     //     %2 = "quake.init_state"(%1, %arg0) : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
-    //     //     "func.return"() : () -> ()
-    //     // }) {"cudaq-entrypoint", "cudaq-kernel", function_type = (!cc.ptr<!cc.state>) -> (), no_this, sym_name = "__nvqpp__mlirgen__function_test2._Z5test2PN5cudaq5stateE"} : () -> ()
-        
-    //     std::cout << "test2(kernel): "  << "\n";
-    //     std::vector<std::complex<double>> vec{.70710678, 0., 0., 0.70710678};
-    //     auto kernel = cudaq::make_kernel();
-    //     auto qubits = kernel.qalloc(2);
-
-    //     cudaq::from_state(kernel, qubits, vec);
-
+    //     // Creating a kernel from state and passing its state to another kernel - is it deprecated?
+    //
+        std::cout << "test2(kernel): "  << "\n";
+        std::vector<std::complex<double>> vec{.70710678, 0., 0., 0.70710678};
+        auto kernel = cudaq::make_kernel();
+        auto qubits = kernel.qalloc(2);
+    
+        cudaq::from_state(kernel, qubits, vec);
+        auto counts = cudaq::sample(kernel);
+    //
     //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
     //     //auto state = cudaq::get_state(kernel);
     //     //auto counts = cudaq::sample(test2, &state);
-
-    //     //counts.dump();
+    //
+         counts.dump();
     // }
 
 }
\ No newline at end of file

From 1cd5cbe8ee8a196aa7bc364b77b03d1060ee2b58 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 25 Jun 2024 15:29:40 -0700
Subject: [PATCH 03/50] Cleanup

---
 lib/Optimizer/Transforms/CMakeLists.txt       |   4 +-
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp |   5 -
 lib/Optimizer/Transforms/StateDecomposer.h    |  69 ++++----
 lib/Optimizer/Transforms/StatePreparation.cpp | 108 ++++--------
 program.py                                    |  35 ----
 .../tests/kernel/test_kernel_qvector_init.py  | 162 ++----------------
 targettests/execution/from_state.cpp          |  30 ----
 targettests/execution/program.cpp             |  95 ----------
 .../execution/state_preparation_vector.cpp    |  57 ++++++
 9 files changed, 148 insertions(+), 417 deletions(-)
 delete mode 100644 program.py
 delete mode 100644 targettests/execution/from_state.cpp
 delete mode 100644 targettests/execution/program.cpp
 create mode 100644 targettests/execution/state_preparation_vector.cpp

diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index b0a13571ec..173cec4538 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -39,11 +39,11 @@ add_cudaq_library(OptTransforms
   ObserveAnsatz.cpp
   PruneCtrlRelations.cpp
   QuakeAddMetadata.cpp
-  StateDecomposer.cpp
-  StatePreparation.cpp
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
   RegToMem.cpp
+  StateDecomposer.cpp
+  StatePreparation.cpp
   PySynthCallableBlockArgs.cpp
 
   DEPENDS
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index 0fa859f175..cc9279c79c 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -23,8 +23,6 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
 
-#include <iostream>
-
 #define DEBUG_TYPE "quake-synthesizer"
 
 using namespace mlir;
@@ -419,9 +417,7 @@ class QuakeSynthesizer
   }
 
   void runOnOperation() override final {
-    std::cout << "Module before synthesis " << std::endl;
     auto module = getModule();
-    // module.dump();
     unsigned counter = 0;
 
     if (args == nullptr || kernelName.empty()) {
@@ -680,7 +676,6 @@ class QuakeSynthesizer
       }
     }
     funcOp.eraseArguments(argsToErase);
-    // std::cout << "Module after synthesis " << std::endl;
     module.dump();
   }
 };
diff --git a/lib/Optimizer/Transforms/StateDecomposer.h b/lib/Optimizer/Transforms/StateDecomposer.h
index bac6909708..2d17edb768 100644
--- a/lib/Optimizer/Transforms/StateDecomposer.h
+++ b/lib/Optimizer/Transforms/StateDecomposer.h
@@ -29,31 +29,32 @@
 
 namespace cudaq::details {
 
-  /// @brief Converts angles of a uniformly controlled rotation to angles of
-  /// non-controlled rotations.
-  std::vector<double> convertAngles(const std::span<double> alphas);
-
-  /// @brief Return the control indices dictated by the gray code implementation.
-  ///
-  /// Here, numBits is the number of controls.
-  std::vector<std::size_t> getControlIndices(std::size_t numBits);
-
-   /// @brief Return angles required to implement a uniformly controlled z-rotation
-  /// on the `kth` qubit.
-  std::vector<double> getAlphaZ(const std::span<double> data,
-                                std::size_t numQubits, std::size_t k);
-
-  /// @brief Return angles required to implement a uniformly controlled y-rotation
-  /// on the `kth` qubit.
-  std::vector<double> getAlphaY(const std::span<double> data,
-                                std::size_t numQubits, std::size_t k);
+/// @brief Converts angles of a uniformly controlled rotation to angles of
+/// non-controlled rotations.
+std::vector<double> convertAngles(const std::span<double> alphas);
+
+/// @brief Return the control indices dictated by the gray code implementation.
+///
+/// Here, numBits is the number of controls.
+std::vector<std::size_t> getControlIndices(std::size_t numBits);
+
+/// @brief Return angles required to implement a uniformly controlled z-rotation
+/// on the `kth` qubit.
+std::vector<double> getAlphaZ(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k);
+
+/// @brief Return angles required to implement a uniformly controlled y-rotation
+/// on the `kth` qubit.
+std::vector<double> getAlphaY(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k);
 } // namespace cudaq::details
 
 class StateGateBuilder {
 public:
-  StateGateBuilder(mlir::OpBuilder& b, mlir::Location& l, mlir::Value& q): builder(b), loc(l), qubits(q) {}
+  StateGateBuilder(mlir::OpBuilder &b, mlir::Location &l, mlir::Value &q)
+      : builder(b), loc(l), qubits(q) {}
 
-  template<typename Op>
+  template <typename Op>
   void applyRotationOp(double theta, std::size_t target) {
     auto qubit = createQubitRef(target);
     auto thetaValue = createAngleValue(theta);
@@ -72,26 +73,30 @@ class StateGateBuilder {
       return qubitRefs[index];
     }
 
-    auto indexValue = builder.create<mlir::arith::ConstantIntOp>(loc, index, builder.getIntegerType(64));
+    auto indexValue = builder.create<mlir::arith::ConstantIntOp>(
+        loc, index, builder.getIntegerType(64));
     auto ref = builder.create<quake::ExtractRefOp>(loc, qubits, indexValue);
     qubitRefs[index] = ref;
     return ref;
   }
 
   mlir::Value createAngleValue(double angle) {
-    return builder.create<mlir::arith::ConstantFloatOp>(loc, llvm::APFloat{angle}, builder.getF64Type());
+    return builder.create<mlir::arith::ConstantFloatOp>(
+        loc, llvm::APFloat{angle}, builder.getF64Type());
   }
 
-  mlir::OpBuilder& builder;
-  mlir::Location& loc;
-  mlir::Value& qubits;
+  mlir::OpBuilder &builder;
+  mlir::Location &loc;
+  mlir::Value &qubits;
 
-  std::unordered_map<std::size_t, mlir::Value> qubitRefs = std::unordered_map<std::size_t, mlir::Value>();
+  std::unordered_map<std::size_t, mlir::Value> qubitRefs =
+      std::unordered_map<std::size_t, mlir::Value>();
 };
 
 class StateDecomposer {
 public:
-  StateDecomposer(StateGateBuilder& b, std::vector<std::complex<double>>& a): builder(b), amplitudes(a), numQubits(log2(a.size())) {}
+  StateDecomposer(StateGateBuilder &b, std::vector<std::complex<double>> &a)
+      : builder(b), amplitudes(a), numQubits(log2(a.size())) {}
 
   /// @brief Decompose the input state vector data to a set of controlled
   /// operations and rotations. This function takes as input a `OpBuilder`
@@ -112,8 +117,9 @@ class StateDecomposer {
     }
 
     // N.B: The algorithm, as described in the paper, creates a circuit that
-    // begins with a target state and brings it to the all zero state. Hence, this
-    // implementation do the two steps described in Section III in reverse order.
+    // begins with a target state and brings it to the all zero state. Hence,
+    // this implementation does the two steps described in Section III in
+    // reverse order.
 
     // Apply uniformly controlled y-rotations, the construction in Eq. (4).
     for (std::size_t j = 1; j <= numQubits; ++j) {
@@ -142,7 +148,8 @@ class StateDecomposer {
 private:
   /// @brief Apply a uniformly controlled rotation on the target qubit.
   template <typename Op>
-  void applyRotation(const std::span<double> alphas, std::size_t numControls, std::size_t target) {
+  void applyRotation(const std::span<double> alphas, std::size_t numControls,
+                     std::size_t target) {
     auto thetas = cudaq::details::convertAngles(alphas);
     if (numControls == 0) {
       builder.applyRotationOp<Op>(thetas[0], target);
@@ -157,7 +164,7 @@ class StateDecomposer {
     }
   }
 
-  StateGateBuilder& builder;
+  StateGateBuilder &builder;
   std::span<std::complex<double>> amplitudes;
   std::size_t numQubits;
 };
diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
index 86bb911a3a..785e70b3f8 100644
--- a/lib/Optimizer/Transforms/StatePreparation.cpp
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -7,6 +7,7 @@
  ******************************************************************************/
 
 #include "PassDetails.h"
+#include "StateDecomposer.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
 #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
@@ -24,9 +25,6 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include <span>
-#include "StateDecomposer.h"
-
-#include <iostream>
 
 #define DEBUG_TYPE "state-preparation"
 
@@ -38,13 +36,14 @@ using namespace mlir;
 ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
 ///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
 ///     %1 = math.cttz %0 : i64
-///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) -> !cc.ptr<complex<f32>> 
-///     %3 = quake.alloca !quake.veq<?>[%1 : i64]
-///     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
-///     return
+///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) ->
+///     !cc.ptr<complex<f32>> %3 = quake.alloca !quake.veq<?>[%1 : i64] %4 =
+///     quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) ->
+///     !quake.veq<?> return
 ///   }
 ///
-/// On a call that passes std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0., M_SQRT1_2} as arg0:
+/// On a call that passes std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0.,
+/// M_SQRT1_2} as arg0:
 ///
 ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
 ///     %0 = quake.alloca !quake.veq<2>
@@ -70,11 +69,11 @@ using namespace mlir;
 namespace {
 
 template <typename T>
-concept IntegralType = std::is_same<T, bool>::value 
-    || std::is_same<T, std::int8_t>::value
-    || std::is_same<T, std::int16_t>::value
-    || std::is_same<T, std::int32_t>::value
-    || std::is_same<T, std::int64_t>::value;
+concept IntegralType =
+    std::is_same<T, bool>::value || std::is_same<T, std::int8_t>::value ||
+    std::is_same<T, std::int16_t>::value ||
+    std::is_same<T, std::int32_t>::value ||
+    std::is_same<T, std::int64_t>::value;
 
 template <typename T>
 concept FloatingType = std::is_same<T, float>::value;
@@ -85,31 +84,33 @@ concept DoubleType = std::is_same<T, double>::value;
 template <typename T>
 concept ComplexDataType = FloatingType<T> || DoubleType<T> || IntegralType<T>;
 
-/// Input was complex<float>/complex<double> but we prefer
-/// complex<double>/complex<float>. Make a copy, extending or truncating the
-/// values.
+/// Input was complex<float> but we prefer
+/// complex<double>. Make a copy, extending the values.
 template <FloatingType From>
-std::vector<std::complex<double>> convertToComplex(std::complex<From> *data, std::uint64_t size) {
+std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
+                                                   std::uint64_t size) {
   auto convertData = std::vector<std::complex<double>>(size);
   for (std::size_t i = 0; i < size; ++i)
     convertData[i] = std::complex<double>{static_cast<double>(data[i].real()),
-                                      static_cast<double>(data[i].imag())};
+                                          static_cast<double>(data[i].imag())};
   return convertData;
 }
 
 template <DoubleType From>
-std::vector<std::complex<double>> convertToComplex(std::complex<From> *data, std::uint64_t size) {
-    return std::vector<std::complex<From>>(data, data+size);
+std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
+                                                   std::uint64_t size) {
+  return std::vector<std::complex<From>>(data, data + size);
 }
 
-/// Input was float/double but we prefer complex<float>/complex<double>.
+/// Input was integral/float/double but we prefer complex<double>.
 /// Make a copy, extending or truncating the values.
 template <ComplexDataType From>
-std::vector<std::complex<double>> convertToComplex(From *data, std::uint64_t size) {
+std::vector<std::complex<double>> convertToComplex(From *data,
+                                                   std::uint64_t size) {
   auto convertData = std::vector<std::complex<double>>(size);
   for (std::size_t i = 0; i < size; ++i)
-    convertData[i] =
-        std::complex<double>{static_cast<double>(data[i]), static_cast<double>(0.0)};
+    convertData[i] = std::complex<double>{static_cast<double>(data[i]),
+                                          static_cast<double>(0.0)};
   return convertData;
 }
 
@@ -118,39 +119,15 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
                                unsigned &counter, BlockArgument argument,
                                std::vector<std::complex<double>> &vec) {
   auto *ctx = builder.getContext();
-  // builder.setInsertionPointToStart(argument.getOwner());
   auto argLoc = argument.getLoc();
 
-  // TODO: look at quake.init_state instructions from vector data and track them
-  // to the argument vector, then replace the instruction by gates preparing the
-  // state (or a call to a kernel with gates)
-
-  ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
-  ///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
-  ///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) ->
-  ///     !cc.ptr<complex<f32>>
-  ///
-  ///     %3 = quake.alloca !quake.veq<?>[%1 : i64]
-  ///     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>)
-  ///     -> !quake.veq<?> return
-  ///   }
-
-  /// =>
-
-  ///     ...
-  ///     %5 = quake.alloca !quake.veq<?>[%3 : i64]
-  ///     %6 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
-  ///     quake.h %6 : (!quake.ref) -> ()
-  ///     %7 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
-  ///     %8 = quake.extract_ref %5[1] : (!quake.veq<?>) -> !quake.ref
-  ///     quake.x [%7] %8 : (!quake.ref, !quake.ref) -> ()
-
-  auto toErase = std::vector<mlir::Operation*>();
+  auto toErase = std::vector<mlir::Operation *>();
 
   for (auto *argUser : argument.getUsers()) {
     // Handle the `StdvecSize` and `quake.alloca` use case:
     // - Replace a `vec.size()` with the vector length.
-    // - Replace the number of qubits calculation with the vector length logarithm.
+    // - Replace the number of qubits calculation with the vector length
+    //   logarithm.
     // - Replace `quake.alloca` with a constant size qvector allocation.
     if (auto stdvecSizeOp = dyn_cast<cudaq::cc::StdvecSizeOp>(argUser)) {
       builder.setInsertionPointAfter(stdvecSizeOp);
@@ -160,9 +137,10 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
       Value numQubits = builder.create<arith::ConstantIntOp>(
           argLoc, log2(vec.size()), stdvecSizeOp.getType());
 
-      for (auto *sizeUser: argUser->getUsers()) {
-        if (auto countZeroesOp = dyn_cast<mlir::math::CountTrailingZerosOp>(sizeUser)) {
-          for (auto *numQubitsUser: sizeUser->getUsers()) {
+      for (auto *sizeUser : argUser->getUsers()) {
+        if (auto countZeroesOp =
+                dyn_cast<mlir::math::CountTrailingZerosOp>(sizeUser)) {
+          for (auto *numQubitsUser : sizeUser->getUsers()) {
             if (auto quakeAllocaOp = dyn_cast<quake::AllocaOp>(numQubitsUser)) {
               builder.setInsertionPointAfter(quakeAllocaOp);
               auto veqTy = quake::VeqType::get(ctx, log2(vec.size()));
@@ -175,7 +153,7 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
           toErase.push_back(countZeroesOp);
         }
       }
-      
+
       stdvecSizeOp.replaceAllUsesWith(length);
       toErase.push_back(stdvecSizeOp);
       continue;
@@ -202,7 +180,7 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
     }
   }
 
-  for (auto& op: toErase) {
+  for (auto &op : toErase) {
     op->erase();
   }
 
@@ -249,9 +227,7 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
   }
 
   void runOnOperation() override final {
-    std::cout << "Module before state prep " << std::endl;
     auto module = getModule();
-    module.dump();
     unsigned counter = 0;
 
     if (args == nullptr || kernelName.empty()) {
@@ -286,13 +262,12 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
 
       // Get the argument type
       auto type = argument.getType();
-      // auto loc = argument.getLoc();
 
       if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(type)) {
         if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
-          std::cout << "State pointer found, TODO: call a kernel that created "
-                       "the state"
-                    << std::endl;
+          funcOp.emitOpError(
+              "State preparation from cudaq::state is not supported.");
+          return;
         }
       }
 
@@ -301,9 +276,7 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
       if (auto vecTy = dyn_cast<cudaq::cc::StdvecType>(type)) {
         auto eleTy = vecTy.getElementType();
         if (!isa<IntegerType, FloatType, ComplexType>(eleTy)) {
-          funcOp.emitOpError("synthesis: unsupported argument type");
-          signalPassFailure();
-          return;
+          continue;
         }
         char *ptrToSizeInBuffer = static_cast<char *>(args) + offset;
         auto sizeFromBuffer =
@@ -328,10 +301,7 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
     char *bufferAppendix = static_cast<char *>(args) + structSize;
     for (auto [idx, eleTy, vecLength] : stdVecInfo) {
       if (!eleTy) {
-        // FIXME: Skip struct values.
         bufferAppendix += vecLength;
-        funcOp.emitOpError(
-            "argument to kernel may be a struct and was not synthesized");
         continue;
       }
       auto doVector = [&]<typename T>(T) {
@@ -385,8 +355,6 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
         continue;
       }
     }
-    std::cout << "Module after state preparation " << std::endl;
-    module.dump();
   }
 };
 
diff --git a/program.py b/program.py
deleted file mode 100644
index e282d8cd5d..0000000000
--- a/program.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-import numpy as np
-import cudaq
-
-import cudaq
-import numpy as np
-
-cudaq.reset_target()
-
-cudaq.set_target('nvidia')
-#cudaq.set_target('nvidia-mqpu')
-# cudaq.set_target('density-matrix-cpu')
-
-
-c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
-                dtype=np.complex128)
-state = cudaq.State.from_data(c)
-
-@cudaq.kernel(verbose=True)
-def kernel(vec: cudaq.State):
-    q = cudaq.qvector(vec)
-
-print(kernel)
-print(cudaq.to_qir(kernel))
-
-#print(cudaq.get_target())
-#counts = cudaq.sample(kernel, state)
-#print(counts)
\ No newline at end of file
diff --git a/python/tests/kernel/test_kernel_qvector_init.py b/python/tests/kernel/test_kernel_qvector_init.py
index ddaeb6cc4d..f998a82dd1 100644
--- a/python/tests/kernel/test_kernel_qvector_init.py
+++ b/python/tests/kernel/test_kernel_qvector_init.py
@@ -20,28 +20,8 @@
 
 
 # float
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_float_params_f64():
-
+def test_kernel_float_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel(vec: list[float]):
-        q = cudaq.qvector(vec)
-
-    counts = cudaq.sample(kernel, f)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_float_params_f32():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
 
@@ -156,10 +136,8 @@ def kernel():
 # complex
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_complex_params_rotate_f64():
+def test_kernel_complex_params_rotate():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [0. + 0j, 0., 0., 1.]
 
@@ -179,50 +157,8 @@ def kernel(vec: list[complex]):
     assert '10' in counts
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_complex_params_rotate_f32():
+def test_kernel_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
-
-    c = [0. + 0j, 0., 0., 1.]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-        x(q.front())
-        y(q.back())
-        h(q)
-        mz(q)
-
-    counts = cudaq.sample(kernel, c)
-    print(f'rotate: {counts}')
-    assert '11' in counts
-    assert '00' in counts
-    assert '01' in counts
-    assert '10' in counts
-
-
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_complex_params_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_complex_params_f32():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -337,10 +273,8 @@ def kernel():
 # np arrays
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_dtype_complex_params_f64():
+def test_kernel_dtype_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -354,10 +288,8 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_dtype_complex128_params_f64():
+def test_kernel_dtype_complex128_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -425,10 +357,8 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_amplitudes_complex_params_f64():
+def test_kernel_amplitudes_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = cudaq.amplitudes([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
 
@@ -442,27 +372,8 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_amplitudes_complex_params_f32():
+def test_kernel_amplitudes_complex_from_capture():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
-
-    c = cudaq.amplitudes([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_amplitudes_complex_from_capture_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
 
@@ -476,23 +387,6 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_amplitudes_complex_from_capture_f32():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia')
-
-    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(cudaq.amplitudes(vec))
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_simulation_dtype_np_array_from_capture_f64():
     cudaq.reset_target()
@@ -568,40 +462,8 @@ def kernel():
 # test errors
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_error_invalid_array_size_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    @cudaq.kernel
-    def kernel():
-        qubits = cudaq.qvector(np.array([1., 0., 0.], dtype=complex))
-
-    with pytest.raises(RuntimeError) as e:
-        counts = cudaq.sample(kernel)
-    assert 'Invalid input state size for qvector init (not a power of 2)' in repr(
-        e)
-
-
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_error_invalid_list_size_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    @cudaq.kernel
-    def kernel():
-        qubits = cudaq.qvector([1., 0., 0.])
-
-    with pytest.raises(RuntimeError) as e:
-        counts = cudaq.sample(kernel)
-    assert 'Invalid input state size for qvector init (not a power of 2)' in repr(
-        e)
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_error_invalid_array_size_f32():
+def test_kernel_error_invalid_array_size_():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     @cudaq.kernel
     def kernel():
@@ -613,10 +475,8 @@ def kernel():
         e)
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_error_invalid_list_size_f32():
+def test_kernel_error_invalid_list_size():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     @cudaq.kernel
     def kernel():
@@ -629,6 +489,7 @@ def kernel():
 
 
 def test_kernel_qvector_init_from_param_int():
+    cudaq.reset_target()
 
     @cudaq.kernel
     def kernel(n: int):
@@ -643,6 +504,8 @@ def kernel(n: int):
 
 
 def test_kernel_qvector_init_from_capture_int():
+    cudaq.reset_target()
+
     n = 2
 
     @cudaq.kernel
@@ -658,6 +521,7 @@ def kernel():
 
 
 def test_kernel_qvector_init_from_int():
+    cudaq.reset_target()
 
     @cudaq.kernel
     def kernel():
diff --git a/targettests/execution/from_state.cpp b/targettests/execution/from_state.cpp
deleted file mode 100644
index 55438848cb..0000000000
--- a/targettests/execution/from_state.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
-
-#include <cudaq.h>
-#include "cudaq/builder/kernels.h"
-#include <iostream>
-
-__qpu__ void test(cudaq::state *inState) {
-  cudaq::qvector q(inState);
-}
-
-// CHECK: size 2
-
-int main() {
-  std::vector<std::complex<float>> vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
-  auto state = cudaq::state::from_data(vec);
-  auto counts = cudaq::sample(test, &state);
-  counts.dump();
-
-  printf("size %zu\n", counts.size());
-  return !(counts.size() == 2);
-}
diff --git a/targettests/execution/program.cpp b/targettests/execution/program.cpp
deleted file mode 100644
index be4855e3de..0000000000
--- a/targettests/execution/program.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
-
-#include <cudaq.h>
-#include "cudaq/builder/kernels.h"
-#include <iostream>
-
-__qpu__ void test1(std::vector<cudaq::complex> inState) {
-    cudaq::qvector q1 = inState;
-}
-
-//  __qpu__ void test2(cudaq::state *inState) {
-//    cudaq::qvector q2(inState);
-//    cudaq::x(q2);
-// }
-
-// __qpu__ void test3() {
-//   auto q3 = cudaq::qvector({M_SQRT1_2, 0., 0., M_SQRT1_2});
-// }
-
-
-// error: /workspaces/cuda-quantum/lib/Frontend/nvqpp/ConvertExpr.cpp:392: not yet implemented: argument type conversion
-// __qpu__ void test5(cudaq::state *inState) {
-//   test2(inState);
-// }
-
-
-
-int main() {
-    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-    {
-        // Passing state data as argument (vector<complex>)
-        std::cout << "test1(vec): "  << "\n";
-        auto counts = cudaq::sample(test1, vec);
-        counts.dump();
-    }
-
-    // {
-    //     // Passing state ptr as argument - no support for from_data
-    //
-    //     std::cout << "test2(state): "  << "\n";
-    //     auto state = cudaq::state::from_data(vec);
-    //
-    //     // 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
-    //     auto counts = cudaq::sample(test2, &state);
-    //     counts.dump();
-    // }
-
-    // {
-    //     // Passing a state from another kernel as argument
-    //
-    //     std::cout << "test2(test3): "  << "\n";
-    //     auto state = cudaq::get_state(test3);
-    //
-    //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
-    //     auto counts = cudaq::sample(test2, &state);
-    //     counts.dump();
-    // }
-
-    // {
-    //     // Passing a state to another kernel as argument
-    //
-    //     std::cout << "test4(state): "  << "\n";
-    //     
-    //     auto state = cudaq::state::from_data(vec);
-    //     auto counts = cudaq::sample(test4, &state);
-    // }
-
-    // {
-    //     // Creating a kernel from state and passing its state to another kernel - is it deprecated?
-    //
-        std::cout << "test2(kernel): "  << "\n";
-        std::vector<std::complex<double>> vec{.70710678, 0., 0., 0.70710678};
-        auto kernel = cudaq::make_kernel();
-        auto qubits = kernel.qalloc(2);
-    
-        cudaq::from_state(kernel, qubits, vec);
-        auto counts = cudaq::sample(kernel);
-    //
-    //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
-    //     //auto state = cudaq::get_state(kernel);
-    //     //auto counts = cudaq::sample(test2, &state);
-    //
-         counts.dump();
-    // }
-
-}
\ No newline at end of file
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
new file mode 100644
index 0000000000..dbe9b15d86
--- /dev/null
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+
+#include <cudaq.h>
+#include "cudaq/builder/kernels.h"
+#include <iostream>
+
+__qpu__ void test(std::vector<cudaq::complex> inState) {
+  cudaq::qvector q1 = inState;
+}
+
+__qpu__ void test2() {
+  cudaq::qvector q1({M_SQRT1_2, M_SQRT1_2, 0., 0.});
+}
+
+void printCounts(cudaq::sample_result& result) {
+  for (auto &&[bits, counts] : result) {
+    std::cout << bits << '\n';
+  }
+}
+
+int main() {
+    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    {
+        // Passing state data as argument (kernel mode)
+        auto counts = cudaq::sample(test, vec);
+        printCounts(counts);
+    }
+    
+    {
+        // Using state data inside kernel (kernel mode) - not implemented yet.
+        // auto counts = cudaq::sample(test2);
+        // printCounts(counts);
+    }
+
+    {
+       // Passing state data as argument (builder mode)
+        auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
+        auto qubits = kernel.qalloc(v);
+    
+        auto counts = cudaq::sample(kernel, vec);
+        printCounts(counts);
+    }
+}
+
+// CHECK: 01
+// CHECK: 00
+
+// CHECK: 01
+// CHECK: 00
\ No newline at end of file

From 0a04d33ce4c7b734348784df2d14d3958827a592 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 25 Jun 2024 15:29:40 -0700
Subject: [PATCH 04/50] Cleanup

---
 lib/Optimizer/Transforms/CMakeLists.txt       |   4 +-
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp |   6 -
 lib/Optimizer/Transforms/StateDecomposer.h    |  69 ++++----
 lib/Optimizer/Transforms/StatePreparation.cpp | 108 ++++--------
 program.py                                    |  35 ----
 .../tests/kernel/test_kernel_qvector_init.py  | 162 ++----------------
 targettests/execution/from_state.cpp          |  30 ----
 targettests/execution/program.cpp             |  95 ----------
 .../execution/state_preparation_vector.cpp    |  57 ++++++
 9 files changed, 148 insertions(+), 418 deletions(-)
 delete mode 100644 program.py
 delete mode 100644 targettests/execution/from_state.cpp
 delete mode 100644 targettests/execution/program.cpp
 create mode 100644 targettests/execution/state_preparation_vector.cpp

diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index b0a13571ec..173cec4538 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -39,11 +39,11 @@ add_cudaq_library(OptTransforms
   ObserveAnsatz.cpp
   PruneCtrlRelations.cpp
   QuakeAddMetadata.cpp
-  StateDecomposer.cpp
-  StatePreparation.cpp
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
   RegToMem.cpp
+  StateDecomposer.cpp
+  StatePreparation.cpp
   PySynthCallableBlockArgs.cpp
 
   DEPENDS
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index 0fa859f175..7d83c152dd 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -23,8 +23,6 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
 
-#include <iostream>
-
 #define DEBUG_TYPE "quake-synthesizer"
 
 using namespace mlir;
@@ -419,9 +417,7 @@ class QuakeSynthesizer
   }
 
   void runOnOperation() override final {
-    std::cout << "Module before synthesis " << std::endl;
     auto module = getModule();
-    // module.dump();
     unsigned counter = 0;
 
     if (args == nullptr || kernelName.empty()) {
@@ -680,8 +676,6 @@ class QuakeSynthesizer
       }
     }
     funcOp.eraseArguments(argsToErase);
-    // std::cout << "Module after synthesis " << std::endl;
-    module.dump();
   }
 };
 
diff --git a/lib/Optimizer/Transforms/StateDecomposer.h b/lib/Optimizer/Transforms/StateDecomposer.h
index bac6909708..2d17edb768 100644
--- a/lib/Optimizer/Transforms/StateDecomposer.h
+++ b/lib/Optimizer/Transforms/StateDecomposer.h
@@ -29,31 +29,32 @@
 
 namespace cudaq::details {
 
-  /// @brief Converts angles of a uniformly controlled rotation to angles of
-  /// non-controlled rotations.
-  std::vector<double> convertAngles(const std::span<double> alphas);
-
-  /// @brief Return the control indices dictated by the gray code implementation.
-  ///
-  /// Here, numBits is the number of controls.
-  std::vector<std::size_t> getControlIndices(std::size_t numBits);
-
-   /// @brief Return angles required to implement a uniformly controlled z-rotation
-  /// on the `kth` qubit.
-  std::vector<double> getAlphaZ(const std::span<double> data,
-                                std::size_t numQubits, std::size_t k);
-
-  /// @brief Return angles required to implement a uniformly controlled y-rotation
-  /// on the `kth` qubit.
-  std::vector<double> getAlphaY(const std::span<double> data,
-                                std::size_t numQubits, std::size_t k);
+/// @brief Converts angles of a uniformly controlled rotation to angles of
+/// non-controlled rotations.
+std::vector<double> convertAngles(const std::span<double> alphas);
+
+/// @brief Return the control indices dictated by the gray code implementation.
+///
+/// Here, numBits is the number of controls.
+std::vector<std::size_t> getControlIndices(std::size_t numBits);
+
+/// @brief Return angles required to implement a uniformly controlled z-rotation
+/// on the `kth` qubit.
+std::vector<double> getAlphaZ(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k);
+
+/// @brief Return angles required to implement a uniformly controlled y-rotation
+/// on the `kth` qubit.
+std::vector<double> getAlphaY(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k);
 } // namespace cudaq::details
 
 class StateGateBuilder {
 public:
-  StateGateBuilder(mlir::OpBuilder& b, mlir::Location& l, mlir::Value& q): builder(b), loc(l), qubits(q) {}
+  StateGateBuilder(mlir::OpBuilder &b, mlir::Location &l, mlir::Value &q)
+      : builder(b), loc(l), qubits(q) {}
 
-  template<typename Op>
+  template <typename Op>
   void applyRotationOp(double theta, std::size_t target) {
     auto qubit = createQubitRef(target);
     auto thetaValue = createAngleValue(theta);
@@ -72,26 +73,30 @@ class StateGateBuilder {
       return qubitRefs[index];
     }
 
-    auto indexValue = builder.create<mlir::arith::ConstantIntOp>(loc, index, builder.getIntegerType(64));
+    auto indexValue = builder.create<mlir::arith::ConstantIntOp>(
+        loc, index, builder.getIntegerType(64));
     auto ref = builder.create<quake::ExtractRefOp>(loc, qubits, indexValue);
     qubitRefs[index] = ref;
     return ref;
   }
 
   mlir::Value createAngleValue(double angle) {
-    return builder.create<mlir::arith::ConstantFloatOp>(loc, llvm::APFloat{angle}, builder.getF64Type());
+    return builder.create<mlir::arith::ConstantFloatOp>(
+        loc, llvm::APFloat{angle}, builder.getF64Type());
   }
 
-  mlir::OpBuilder& builder;
-  mlir::Location& loc;
-  mlir::Value& qubits;
+  mlir::OpBuilder &builder;
+  mlir::Location &loc;
+  mlir::Value &qubits;
 
-  std::unordered_map<std::size_t, mlir::Value> qubitRefs = std::unordered_map<std::size_t, mlir::Value>();
+  std::unordered_map<std::size_t, mlir::Value> qubitRefs =
+      std::unordered_map<std::size_t, mlir::Value>();
 };
 
 class StateDecomposer {
 public:
-  StateDecomposer(StateGateBuilder& b, std::vector<std::complex<double>>& a): builder(b), amplitudes(a), numQubits(log2(a.size())) {}
+  StateDecomposer(StateGateBuilder &b, std::vector<std::complex<double>> &a)
+      : builder(b), amplitudes(a), numQubits(log2(a.size())) {}
 
   /// @brief Decompose the input state vector data to a set of controlled
   /// operations and rotations. This function takes as input a `OpBuilder`
@@ -112,8 +117,9 @@ class StateDecomposer {
     }
 
     // N.B: The algorithm, as described in the paper, creates a circuit that
-    // begins with a target state and brings it to the all zero state. Hence, this
-    // implementation do the two steps described in Section III in reverse order.
+    // begins with a target state and brings it to the all zero state. Hence,
+    // this implementation does the two steps described in Section III in
+    // reverse order.
 
     // Apply uniformly controlled y-rotations, the construction in Eq. (4).
     for (std::size_t j = 1; j <= numQubits; ++j) {
@@ -142,7 +148,8 @@ class StateDecomposer {
 private:
   /// @brief Apply a uniformly controlled rotation on the target qubit.
   template <typename Op>
-  void applyRotation(const std::span<double> alphas, std::size_t numControls, std::size_t target) {
+  void applyRotation(const std::span<double> alphas, std::size_t numControls,
+                     std::size_t target) {
     auto thetas = cudaq::details::convertAngles(alphas);
     if (numControls == 0) {
       builder.applyRotationOp<Op>(thetas[0], target);
@@ -157,7 +164,7 @@ class StateDecomposer {
     }
   }
 
-  StateGateBuilder& builder;
+  StateGateBuilder &builder;
   std::span<std::complex<double>> amplitudes;
   std::size_t numQubits;
 };
diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
index 86bb911a3a..785e70b3f8 100644
--- a/lib/Optimizer/Transforms/StatePreparation.cpp
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -7,6 +7,7 @@
  ******************************************************************************/
 
 #include "PassDetails.h"
+#include "StateDecomposer.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
 #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
@@ -24,9 +25,6 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include <span>
-#include "StateDecomposer.h"
-
-#include <iostream>
 
 #define DEBUG_TYPE "state-preparation"
 
@@ -38,13 +36,14 @@ using namespace mlir;
 ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
 ///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
 ///     %1 = math.cttz %0 : i64
-///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) -> !cc.ptr<complex<f32>> 
-///     %3 = quake.alloca !quake.veq<?>[%1 : i64]
-///     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
-///     return
+///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) -> !cc.ptr<complex<f32>>
+///     %3 = quake.alloca !quake.veq<?>[%1 : i64]
+///     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) -> !quake.veq<?>
+///     return
 ///   }
 ///
-/// On a call that passes std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0., M_SQRT1_2} as arg0:
+/// On a call that passes std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0.,
+/// M_SQRT1_2} as arg0:
 ///
 ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
 ///     %0 = quake.alloca !quake.veq<2>
@@ -70,11 +69,11 @@ using namespace mlir;
 namespace {
 
 template <typename T>
-concept IntegralType = std::is_same<T, bool>::value 
-    || std::is_same<T, std::int8_t>::value
-    || std::is_same<T, std::int16_t>::value
-    || std::is_same<T, std::int32_t>::value
-    || std::is_same<T, std::int64_t>::value;
+concept IntegralType =
+    std::is_same<T, bool>::value || std::is_same<T, std::int8_t>::value ||
+    std::is_same<T, std::int16_t>::value ||
+    std::is_same<T, std::int32_t>::value ||
+    std::is_same<T, std::int64_t>::value;
 
 template <typename T>
 concept FloatingType = std::is_same<T, float>::value;
@@ -85,31 +84,33 @@ concept DoubleType = std::is_same<T, double>::value;
 template <typename T>
 concept ComplexDataType = FloatingType<T> || DoubleType<T> || IntegralType<T>;
 
-/// Input was complex<float>/complex<double> but we prefer
-/// complex<double>/complex<float>. Make a copy, extending or truncating the
-/// values.
+/// Input was complex<float> but we prefer
+/// complex<double>. Make a copy, extending the values.
 template <FloatingType From>
-std::vector<std::complex<double>> convertToComplex(std::complex<From> *data, std::uint64_t size) {
+std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
+                                                   std::uint64_t size) {
   auto convertData = std::vector<std::complex<double>>(size);
   for (std::size_t i = 0; i < size; ++i)
     convertData[i] = std::complex<double>{static_cast<double>(data[i].real()),
-                                      static_cast<double>(data[i].imag())};
+                                          static_cast<double>(data[i].imag())};
   return convertData;
 }
 
 template <DoubleType From>
-std::vector<std::complex<double>> convertToComplex(std::complex<From> *data, std::uint64_t size) {
-    return std::vector<std::complex<From>>(data, data+size);
+std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
+                                                   std::uint64_t size) {
+  return std::vector<std::complex<From>>(data, data + size);
 }
 
-/// Input was float/double but we prefer complex<float>/complex<double>.
+/// Input was integral/float/double but we prefer complex<double>.
 /// Make a copy, extending or truncating the values.
 template <ComplexDataType From>
-std::vector<std::complex<double>> convertToComplex(From *data, std::uint64_t size) {
+std::vector<std::complex<double>> convertToComplex(From *data,
+                                                   std::uint64_t size) {
   auto convertData = std::vector<std::complex<double>>(size);
   for (std::size_t i = 0; i < size; ++i)
-    convertData[i] =
-        std::complex<double>{static_cast<double>(data[i]), static_cast<double>(0.0)};
+    convertData[i] = std::complex<double>{static_cast<double>(data[i]),
+                                          static_cast<double>(0.0)};
   return convertData;
 }
 
@@ -118,39 +119,15 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
                                unsigned &counter, BlockArgument argument,
                                std::vector<std::complex<double>> &vec) {
   auto *ctx = builder.getContext();
-  // builder.setInsertionPointToStart(argument.getOwner());
   auto argLoc = argument.getLoc();
 
-  // TODO: look at quake.init_state instructions from vector data and track them
-  // to the argument vector, then replace the instruction by gates preparing the
-  // state (or a call to a kernel with gates)
-
-  ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
-  ///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
-  ///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) ->
-  ///     !cc.ptr<complex<f32>>
-  ///
-  ///     %3 = quake.alloca !quake.veq<?>[%1 : i64]
-  ///     %4 = quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>)
-  ///     -> !quake.veq<?> return
-  ///   }
-
-  /// =>
-
-  ///     ...
-  ///     %5 = quake.alloca !quake.veq<?>[%3 : i64]
-  ///     %6 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
-  ///     quake.h %6 : (!quake.ref) -> ()
-  ///     %7 = quake.extract_ref %5[0] : (!quake.veq<?>) -> !quake.ref
-  ///     %8 = quake.extract_ref %5[1] : (!quake.veq<?>) -> !quake.ref
-  ///     quake.x [%7] %8 : (!quake.ref, !quake.ref) -> ()
-
-  auto toErase = std::vector<mlir::Operation*>();
+  auto toErase = std::vector<mlir::Operation *>();
 
   for (auto *argUser : argument.getUsers()) {
     // Handle the `StdvecSize` and `quake.alloca` use case:
     // - Replace a `vec.size()` with the vector length.
-    // - Replace the number of qubits calculation with the vector length logarithm.
+    // - Replace the number of qubits calculation with the vector length
+    // logarithm.
     // - Replace `quake.alloca` with a constant size qvector allocation.
     if (auto stdvecSizeOp = dyn_cast<cudaq::cc::StdvecSizeOp>(argUser)) {
       builder.setInsertionPointAfter(stdvecSizeOp);
@@ -160,9 +137,10 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
       Value numQubits = builder.create<arith::ConstantIntOp>(
           argLoc, log2(vec.size()), stdvecSizeOp.getType());
 
-      for (auto *sizeUser: argUser->getUsers()) {
-        if (auto countZeroesOp = dyn_cast<mlir::math::CountTrailingZerosOp>(sizeUser)) {
-          for (auto *numQubitsUser: sizeUser->getUsers()) {
+      for (auto *sizeUser : argUser->getUsers()) {
+        if (auto countZeroesOp =
+                dyn_cast<mlir::math::CountTrailingZerosOp>(sizeUser)) {
+          for (auto *numQubitsUser : sizeUser->getUsers()) {
             if (auto quakeAllocaOp = dyn_cast<quake::AllocaOp>(numQubitsUser)) {
               builder.setInsertionPointAfter(quakeAllocaOp);
               auto veqTy = quake::VeqType::get(ctx, log2(vec.size()));
@@ -175,7 +153,7 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
           toErase.push_back(countZeroesOp);
         }
       }
-      
+
       stdvecSizeOp.replaceAllUsesWith(length);
       toErase.push_back(stdvecSizeOp);
       continue;
@@ -202,7 +180,7 @@ prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
     }
   }
 
-  for (auto& op: toErase) {
+  for (auto &op : toErase) {
     op->erase();
   }
 
@@ -249,9 +227,7 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
   }
 
   void runOnOperation() override final {
-    std::cout << "Module before state prep " << std::endl;
     auto module = getModule();
-    module.dump();
     unsigned counter = 0;
 
     if (args == nullptr || kernelName.empty()) {
@@ -286,13 +262,12 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
 
       // Get the argument type
       auto type = argument.getType();
-      // auto loc = argument.getLoc();
 
       if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(type)) {
         if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
-          std::cout << "State pointer found, TODO: call a kernel that created "
-                       "the state"
-                    << std::endl;
+          funcOp.emitOpError(
+              "State preparation from cudaq::state is not supported.");
+          return;
         }
       }
 
@@ -301,9 +276,7 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
       if (auto vecTy = dyn_cast<cudaq::cc::StdvecType>(type)) {
         auto eleTy = vecTy.getElementType();
         if (!isa<IntegerType, FloatType, ComplexType>(eleTy)) {
-          funcOp.emitOpError("synthesis: unsupported argument type");
-          signalPassFailure();
-          return;
+          continue;
         }
         char *ptrToSizeInBuffer = static_cast<char *>(args) + offset;
         auto sizeFromBuffer =
@@ -328,10 +301,7 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
     char *bufferAppendix = static_cast<char *>(args) + structSize;
     for (auto [idx, eleTy, vecLength] : stdVecInfo) {
       if (!eleTy) {
-        // FIXME: Skip struct values.
         bufferAppendix += vecLength;
-        funcOp.emitOpError(
-            "argument to kernel may be a struct and was not synthesized");
         continue;
       }
       auto doVector = [&]<typename T>(T) {
@@ -385,8 +355,6 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
         continue;
       }
     }
-    std::cout << "Module after state preparation " << std::endl;
-    module.dump();
   }
 };
 
diff --git a/program.py b/program.py
deleted file mode 100644
index e282d8cd5d..0000000000
--- a/program.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-import numpy as np
-import cudaq
-
-import cudaq
-import numpy as np
-
-cudaq.reset_target()
-
-cudaq.set_target('nvidia')
-#cudaq.set_target('nvidia-mqpu')
-# cudaq.set_target('density-matrix-cpu')
-
-
-c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
-                dtype=np.complex128)
-state = cudaq.State.from_data(c)
-
-@cudaq.kernel(verbose=True)
-def kernel(vec: cudaq.State):
-    q = cudaq.qvector(vec)
-
-print(kernel)
-print(cudaq.to_qir(kernel))
-
-#print(cudaq.get_target())
-#counts = cudaq.sample(kernel, state)
-#print(counts)
\ No newline at end of file
diff --git a/python/tests/kernel/test_kernel_qvector_init.py b/python/tests/kernel/test_kernel_qvector_init.py
index ddaeb6cc4d..f998a82dd1 100644
--- a/python/tests/kernel/test_kernel_qvector_init.py
+++ b/python/tests/kernel/test_kernel_qvector_init.py
@@ -20,28 +20,8 @@
 
 
 # float
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_float_params_f64():
-
+def test_kernel_float_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel(vec: list[float]):
-        q = cudaq.qvector(vec)
-
-    counts = cudaq.sample(kernel, f)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_float_params_f32():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
 
@@ -156,10 +136,8 @@ def kernel():
 # complex
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_complex_params_rotate_f64():
+def test_kernel_complex_params_rotate():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [0. + 0j, 0., 0., 1.]
 
@@ -179,50 +157,8 @@ def kernel(vec: list[complex]):
     assert '10' in counts
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_complex_params_rotate_f32():
+def test_kernel_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
-
-    c = [0. + 0j, 0., 0., 1.]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-        x(q.front())
-        y(q.back())
-        h(q)
-        mz(q)
-
-    counts = cudaq.sample(kernel, c)
-    print(f'rotate: {counts}')
-    assert '11' in counts
-    assert '00' in counts
-    assert '01' in counts
-    assert '10' in counts
-
-
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_complex_params_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_complex_params_f32():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -337,10 +273,8 @@ def kernel():
 # np arrays
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_dtype_complex_params_f64():
+def test_kernel_dtype_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -354,10 +288,8 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_dtype_complex128_params_f64():
+def test_kernel_dtype_complex128_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -425,10 +357,8 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_amplitudes_complex_params_f64():
+def test_kernel_amplitudes_complex_params():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = cudaq.amplitudes([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
 
@@ -442,27 +372,8 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_amplitudes_complex_params_f32():
+def test_kernel_amplitudes_complex_from_capture():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
-
-    c = cudaq.amplitudes([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(vec)
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_amplitudes_complex_from_capture_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
 
     c = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
 
@@ -476,23 +387,6 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_amplitudes_complex_from_capture_f32():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia')
-
-    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel(vec: list[complex]):
-        q = cudaq.qvector(cudaq.amplitudes(vec))
-
-    counts = cudaq.sample(kernel, c)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_simulation_dtype_np_array_from_capture_f64():
     cudaq.reset_target()
@@ -568,40 +462,8 @@ def kernel():
 # test errors
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_error_invalid_array_size_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    @cudaq.kernel
-    def kernel():
-        qubits = cudaq.qvector(np.array([1., 0., 0.], dtype=complex))
-
-    with pytest.raises(RuntimeError) as e:
-        counts = cudaq.sample(kernel)
-    assert 'Invalid input state size for qvector init (not a power of 2)' in repr(
-        e)
-
-
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_error_invalid_list_size_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    @cudaq.kernel
-    def kernel():
-        qubits = cudaq.qvector([1., 0., 0.])
-
-    with pytest.raises(RuntimeError) as e:
-        counts = cudaq.sample(kernel)
-    assert 'Invalid input state size for qvector init (not a power of 2)' in repr(
-        e)
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_error_invalid_array_size_f32():
+def test_kernel_error_invalid_array_size_():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     @cudaq.kernel
     def kernel():
@@ -613,10 +475,8 @@ def kernel():
         e)
 
 
-@skipIfNvidiaNotInstalled
-def test_kernel_error_invalid_list_size_f32():
+def test_kernel_error_invalid_list_size():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     @cudaq.kernel
     def kernel():
@@ -629,6 +489,7 @@ def kernel():
 
 
 def test_kernel_qvector_init_from_param_int():
+    cudaq.reset_target()
 
     @cudaq.kernel
     def kernel(n: int):
@@ -643,6 +504,8 @@ def kernel(n: int):
 
 
 def test_kernel_qvector_init_from_capture_int():
+    cudaq.reset_target()
+
     n = 2
 
     @cudaq.kernel
@@ -658,6 +521,7 @@ def kernel():
 
 
 def test_kernel_qvector_init_from_int():
+    cudaq.reset_target()
 
     @cudaq.kernel
     def kernel():
diff --git a/targettests/execution/from_state.cpp b/targettests/execution/from_state.cpp
deleted file mode 100644
index 55438848cb..0000000000
--- a/targettests/execution/from_state.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
-
-#include <cudaq.h>
-#include "cudaq/builder/kernels.h"
-#include <iostream>
-
-__qpu__ void test(cudaq::state *inState) {
-  cudaq::qvector q(inState);
-}
-
-// CHECK: size 2
-
-int main() {
-  std::vector<std::complex<float>> vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
-  auto state = cudaq::state::from_data(vec);
-  auto counts = cudaq::sample(test, &state);
-  counts.dump();
-
-  printf("size %zu\n", counts.size());
-  return !(counts.size() == 2);
-}
diff --git a/targettests/execution/program.cpp b/targettests/execution/program.cpp
deleted file mode 100644
index be4855e3de..0000000000
--- a/targettests/execution/program.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
-
-#include <cudaq.h>
-#include "cudaq/builder/kernels.h"
-#include <iostream>
-
-__qpu__ void test1(std::vector<cudaq::complex> inState) {
-    cudaq::qvector q1 = inState;
-}
-
-//  __qpu__ void test2(cudaq::state *inState) {
-//    cudaq::qvector q2(inState);
-//    cudaq::x(q2);
-// }
-
-// __qpu__ void test3() {
-//   auto q3 = cudaq::qvector({M_SQRT1_2, 0., 0., M_SQRT1_2});
-// }
-
-
-// error: /workspaces/cuda-quantum/lib/Frontend/nvqpp/ConvertExpr.cpp:392: not yet implemented: argument type conversion
-// __qpu__ void test5(cudaq::state *inState) {
-//   test2(inState);
-// }
-
-
-
-int main() {
-    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-    {
-        // Passing state data as argument (vector<complex>)
-        std::cout << "test1(vec): "  << "\n";
-        auto counts = cudaq::sample(test1, vec);
-        counts.dump();
-    }
-
-    // {
-    //     // Passing state ptr as argument - no support for from_data
-    //
-    //     std::cout << "test2(state): "  << "\n";
-    //     auto state = cudaq::state::from_data(vec);
-    //
-    //     // 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
-    //     auto counts = cudaq::sample(test2, &state);
-    //     counts.dump();
-    // }
-
-    // {
-    //     // Passing a state from another kernel as argument
-    //
-    //     std::cout << "test2(test3): "  << "\n";
-    //     auto state = cudaq::get_state(test3);
-    //
-    //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
-    //     auto counts = cudaq::sample(test2, &state);
-    //     counts.dump();
-    // }
-
-    // {
-    //     // Passing a state to another kernel as argument
-    //
-    //     std::cout << "test4(state): "  << "\n";
-    //     
-    //     auto state = cudaq::state::from_data(vec);
-    //     auto counts = cudaq::sample(test4, &state);
-    // }
-
-    // {
-    //     // Creating a kernel from state and passing its state to another kernel - is it deprecated?
-    //
-        std::cout << "test2(kernel): "  << "\n";
-        std::vector<std::complex<double>> vec{.70710678, 0., 0., 0.70710678};
-        auto kernel = cudaq::make_kernel();
-        auto qubits = kernel.qalloc(2);
-    
-        cudaq::from_state(kernel, qubits, vec);
-        auto counts = cudaq::sample(kernel);
-    //
-    //     // error: 'func.call' op '__nvqpp_cudaq_state_numberOfQubits' does not reference a valid function
-    //     //auto state = cudaq::get_state(kernel);
-    //     //auto counts = cudaq::sample(test2, &state);
-    //
-         counts.dump();
-    // }
-
-}
\ No newline at end of file
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
new file mode 100644
index 0000000000..dbe9b15d86
--- /dev/null
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+
+#include <cudaq.h>
+#include "cudaq/builder/kernels.h"
+#include <iostream>
+
+__qpu__ void test(std::vector<cudaq::complex> inState) {
+  cudaq::qvector q1 = inState;
+}
+
+__qpu__ void test2() {
+  cudaq::qvector q1({M_SQRT1_2, M_SQRT1_2, 0., 0.});
+}
+
+void printCounts(cudaq::sample_result& result) {
+  for (auto &&[bits, counts] : result) {
+    std::cout << bits << '\n';
+  }
+}
+
+int main() {
+    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    {
+        // Passing state data as argument (kernel mode)
+        auto counts = cudaq::sample(test, vec);
+        printCounts(counts);
+    }
+    
+    {
+        // Using state data inside kernel (kernel mode) - not implemented yet.
+        // auto counts = cudaq::sample(test2);
+        // printCounts(counts);
+    }
+
+    {
+       // Passing state data as argument (builder mode)
+        auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
+        auto qubits = kernel.qalloc(v);
+    
+        auto counts = cudaq::sample(kernel, vec);
+        printCounts(counts);
+    }
+}
+
+// CHECK: 01
+// CHECK: 00
+
+// CHECK: 01
+// CHECK: 00
\ No newline at end of file

From 3660e278407719c7aa7ba82f93f08261dc936635 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 26 Jun 2024 09:51:42 -0700
Subject: [PATCH 05/50] Updated test

---
 .../execution/state_preparation_vector.cpp    | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index dbe9b15d86..d415072ce7 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -16,10 +16,6 @@ __qpu__ void test(std::vector<cudaq::complex> inState) {
   cudaq::qvector q1 = inState;
 }
 
-__qpu__ void test2() {
-  cudaq::qvector q1({M_SQRT1_2, M_SQRT1_2, 0., 0.});
-}
-
 void printCounts(cudaq::sample_result& result) {
   for (auto &&[bits, counts] : result) {
     std::cout << bits << '\n';
@@ -28,20 +24,18 @@ void printCounts(cudaq::sample_result& result) {
 
 int main() {
     std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
     {
         // Passing state data as argument (kernel mode)
         auto counts = cudaq::sample(test, vec);
         printCounts(counts);
-    }
-    
-    {
-        // Using state data inside kernel (kernel mode) - not implemented yet.
-        // auto counts = cudaq::sample(test2);
-        // printCounts(counts);
+
+        counts = cudaq::sample(test, vec1);
+        printCounts(counts);
     }
 
     {
-       // Passing state data as argument (builder mode)
+        // Passing state data as argument (builder mode)
         auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
         auto qubits = kernel.qalloc(v);
     
@@ -53,5 +47,8 @@ int main() {
 // CHECK: 01
 // CHECK: 00
 
+// CHECK: 10
+// CHECK: 10
+
 // CHECK: 01
 // CHECK: 00
\ No newline at end of file

From 8cbc1f6905babbfe1e123840d9d1b6e1a00747fa Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 26 Jun 2024 12:37:52 -0700
Subject: [PATCH 06/50] Fix test failures

---
 .../tests/kernel/test_kernel_qvector_init.py  | 21 +++++++++++++++++--
 .../execution/state_preparation_vector.cpp    | 16 ++++++++------
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/python/tests/kernel/test_kernel_qvector_init.py b/python/tests/kernel/test_kernel_qvector_init.py
index f998a82dd1..6f2fd07152 100644
--- a/python/tests/kernel/test_kernel_qvector_init.py
+++ b/python/tests/kernel/test_kernel_qvector_init.py
@@ -5,11 +5,18 @@
 # This source code and the accompanying materials are made available under     #
 # the terms of the Apache License 2.0 which accompanies this distribution.     #
 # ============================================================================ #
+
+import os, sys
 import pytest
 
 import cudaq
 import numpy as np
 
+## [PYTHON_VERSION_FIX]
+skipIfPythonLessThan39 = pytest.mark.skipif(
+    sys.version_info < (3, 9),
+    reason="built-in collection types such as `list` not supported")
+
 skipIfNvidiaFP64NotInstalled = pytest.mark.skipif(
     not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia-fp64')),
     reason='Could not find nvidia-fp64 in installation')
@@ -18,8 +25,10 @@
     not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia')),
     reason='Could not find nvidia in installation')
 
-
 # float
+
+
+@skipIfPythonLessThan39
 def test_kernel_float_params():
     cudaq.reset_target()
 
@@ -136,6 +145,7 @@ def kernel():
 # complex
 
 
+@skipIfPythonLessThan39
 def test_kernel_complex_params_rotate():
     cudaq.reset_target()
 
@@ -157,6 +167,7 @@ def kernel(vec: list[complex]):
     assert '10' in counts
 
 
+@skipIfPythonLessThan39
 def test_kernel_complex_params():
     cudaq.reset_target()
 
@@ -273,6 +284,7 @@ def kernel():
 # np arrays
 
 
+@skipIfPythonLessThan39
 def test_kernel_dtype_complex_params():
     cudaq.reset_target()
 
@@ -288,6 +300,7 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
+@skipIfPythonLessThan39
 def test_kernel_dtype_complex128_params():
     cudaq.reset_target()
 
@@ -357,6 +370,7 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
+@skipIfPythonLessThan39
 def test_kernel_amplitudes_complex_params():
     cudaq.reset_target()
 
@@ -372,6 +386,7 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
+@skipIfPythonLessThan39
 def test_kernel_amplitudes_complex_from_capture():
     cudaq.reset_target()
 
@@ -462,7 +477,8 @@ def kernel():
 # test errors
 
 
-def test_kernel_error_invalid_array_size_():
+@skipIfPythonLessThan39
+def test_kernel_error_invalid_array_size():
     cudaq.reset_target()
 
     @cudaq.kernel
@@ -475,6 +491,7 @@ def kernel():
         e)
 
 
+@skipIfPythonLessThan39
 def test_kernel_error_invalid_list_size():
     cudaq.reset_target()
 
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index d415072ce7..ef4ea69b92 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -17,7 +17,13 @@ __qpu__ void test(std::vector<cudaq::complex> inState) {
 }
 
 void printCounts(cudaq::sample_result& result) {
+  std::vector<std::string> values{};
   for (auto &&[bits, counts] : result) {
+    values.push_back(bits);
+  }
+
+  std::sort(values.begin(), values.end());
+  for (auto &&bits : values) {
     std::cout << bits << '\n';
   }
 }
@@ -44,11 +50,9 @@ int main() {
     }
 }
 
-// CHECK: 01
 // CHECK: 00
-
-// CHECK: 10
-// CHECK: 10
-
 // CHECK: 01
-// CHECK: 00
\ No newline at end of file
+// CHECK: 10
+// CHECK: 11
+// CHECK: 00
+// CHECK: 01
\ No newline at end of file

From 6d4433d5cf40835dfb42c67f180062b8aac7d601 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 27 Jun 2024 09:46:13 -0700
Subject: [PATCH 07/50] Reverse the order of qubits in state prep

---
 lib/Optimizer/Transforms/StateDecomposer.h    | 13 ++++++++---
 .../tests/backends/test_Quantinuum_kernel.py  | 15 +++++++++++++
 .../tests/kernel/test_kernel_qvector_init.py  | 22 +++++++++++++++++++
 .../execution/state_preparation_vector.cpp    |  4 ++--
 4 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/lib/Optimizer/Transforms/StateDecomposer.h b/lib/Optimizer/Transforms/StateDecomposer.h
index 2d17edb768..b433089258 100644
--- a/lib/Optimizer/Transforms/StateDecomposer.h
+++ b/lib/Optimizer/Transforms/StateDecomposer.h
@@ -150,17 +150,24 @@ class StateDecomposer {
   template <typename Op>
   void applyRotation(const std::span<double> alphas, std::size_t numControls,
                      std::size_t target) {
+
+    // In our model the index 1 (i.e. |01>) in quantum state data
+    // corresponds to qubits[0]=1 and qubits[1] = 0.
+    // Revert the order of qubits as the state preparation algorithm
+    // we use assumes the opposite.
+    auto qubitIndex = [&](std::size_t i) { return numQubits - i - 1; };
+
     auto thetas = cudaq::details::convertAngles(alphas);
     if (numControls == 0) {
-      builder.applyRotationOp<Op>(thetas[0], target);
+      builder.applyRotationOp<Op>(thetas[0], qubitIndex(target));
       return;
     }
 
     auto controlIndices = cudaq::details::getControlIndices(numControls);
     assert(thetas.size() == controlIndices.size());
     for (auto [i, c] : llvm::enumerate(controlIndices)) {
-      builder.applyRotationOp<Op>(thetas[i], target);
-      builder.applyX(c, target);
+      builder.applyRotationOp<Op>(thetas[i], qubitIndex(target));
+      builder.applyX(qubitIndex(c), qubitIndex(target));
     }
   }
 
diff --git a/python/tests/backends/test_Quantinuum_kernel.py b/python/tests/backends/test_Quantinuum_kernel.py
index de072335bf..b0ca043060 100644
--- a/python/tests/backends/test_Quantinuum_kernel.py
+++ b/python/tests/backends/test_Quantinuum_kernel.py
@@ -7,6 +7,7 @@
 # ============================================================================ #
 
 import cudaq, pytest, os, time
+import numpy as np
 from cudaq import spin
 from multiprocessing import Process
 try:
@@ -169,6 +170,20 @@ def kernel():
     result = cudaq.sample(kernel)
 
 
+def test_quantinuum_state_preparation():
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        qubits = cudaq.qvector(vec)
+
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '11' in counts
+    assert '10' in counts
+    assert not '01' in counts
+    assert not '11' in counts
+
+
 # leave for gdb debugging
 if __name__ == "__main__":
     loc = os.path.abspath(__file__)
diff --git a/python/tests/kernel/test_kernel_qvector_init.py b/python/tests/kernel/test_kernel_qvector_init.py
index 6f2fd07152..28260dcb4d 100644
--- a/python/tests/kernel/test_kernel_qvector_init.py
+++ b/python/tests/kernel/test_kernel_qvector_init.py
@@ -25,6 +25,28 @@
     not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia')),
     reason='Could not find nvidia in installation')
 
+# state preparation and synthesis
+
+
+@skipIfPythonLessThan39
+def test_kernel_state_preparation():
+    cudaq.reset_target()
+
+    c = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(vec)
+
+    synthesized = cudaq.synthesize(kernel, c)
+    assert 'quake.init_state' in kernel.__str__()
+    assert not 'quake.init_state' in synthesized.__str__()
+
+    counts = cudaq.sample(synthesized)
+    assert '00' in counts
+    assert '10' in counts
+
+
 # float
 
 
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index ef4ea69b92..fccf6d872c 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -51,8 +51,8 @@ int main() {
 }
 
 // CHECK: 00
-// CHECK: 01
 // CHECK: 10
+// CHECK: 01
 // CHECK: 11
 // CHECK: 00
-// CHECK: 01
\ No newline at end of file
+// CHECK: 10

From 46f247728cf2ca22cda3bbf417007c63db1a1bed Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 27 Jun 2024 10:20:09 -0700
Subject: [PATCH 08/50] Fixed failing tests

---
 python/tests/backends/test_Quantinuum_kernel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tests/backends/test_Quantinuum_kernel.py b/python/tests/backends/test_Quantinuum_kernel.py
index b0ca043060..fc11224f5e 100644
--- a/python/tests/backends/test_Quantinuum_kernel.py
+++ b/python/tests/backends/test_Quantinuum_kernel.py
@@ -178,7 +178,7 @@ def kernel(vec: list[complex]):
 
     state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
     counts = cudaq.sample(kernel, state)
-    assert '11' in counts
+    assert '00' in counts
     assert '10' in counts
     assert not '01' in counts
     assert not '11' in counts

From fb0994f8cb2c8459d715a27c208cef19c58542cb Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 27 Jun 2024 11:27:27 -0700
Subject: [PATCH 09/50] Fix test failure

---
 lib/Optimizer/Transforms/StateDecomposer.h         | 2 +-
 targettests/execution/state_preparation_vector.cpp | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/Optimizer/Transforms/StateDecomposer.h b/lib/Optimizer/Transforms/StateDecomposer.h
index b433089258..a698ac83c2 100644
--- a/lib/Optimizer/Transforms/StateDecomposer.h
+++ b/lib/Optimizer/Transforms/StateDecomposer.h
@@ -95,7 +95,7 @@ class StateGateBuilder {
 
 class StateDecomposer {
 public:
-  StateDecomposer(StateGateBuilder &b, std::vector<std::complex<double>> &a)
+  StateDecomposer(StateGateBuilder &b, std::span<std::complex<double>> a)
       : builder(b), amplitudes(a), numQubits(log2(a.size())) {}
 
   /// @brief Decompose the input state vector data to a set of controlled
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index fccf6d872c..35a628c06a 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -9,7 +9,6 @@
 // RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
 
 #include <cudaq.h>
-#include "cudaq/builder/kernels.h"
 #include <iostream>
 
 __qpu__ void test(std::vector<cudaq::complex> inState) {

From 0abf40aa2ebc13a314500ac5ca7955cbe4510181 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 27 Jun 2024 13:22:55 -0700
Subject: [PATCH 10/50] Cleanup

---
 lib/Optimizer/Transforms/StateDecomposer.cpp | 2 +-
 lib/Optimizer/Transforms/StateDecomposer.h   | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/Optimizer/Transforms/StateDecomposer.cpp b/lib/Optimizer/Transforms/StateDecomposer.cpp
index 3105fad707..62ca8a9d73 100644
--- a/lib/Optimizer/Transforms/StateDecomposer.cpp
+++ b/lib/Optimizer/Transforms/StateDecomposer.cpp
@@ -125,4 +125,4 @@ std::vector<double> getAlphaY(const std::span<double> data,
   }
   return angles;
 }
-} // namespace cudaq::details
\ No newline at end of file
+} // namespace cudaq::details
diff --git a/lib/Optimizer/Transforms/StateDecomposer.h b/lib/Optimizer/Transforms/StateDecomposer.h
index a698ac83c2..a09b8a64e9 100644
--- a/lib/Optimizer/Transforms/StateDecomposer.h
+++ b/lib/Optimizer/Transforms/StateDecomposer.h
@@ -25,8 +25,6 @@
 #include "mlir/Transforms/RegionUtils.h"
 #include <span>
 
-#include <iostream>
-
 namespace cudaq::details {
 
 /// @brief Converts angles of a uniformly controlled rotation to angles of
@@ -152,7 +150,7 @@ class StateDecomposer {
                      std::size_t target) {
 
     // In our model the index 1 (i.e. |01>) in quantum state data
-    // corresponds to qubits[0]=1 and qubits[1] = 0.
+    // corresponds to qubits[0] = 1 and qubits[1] = 0.
     // Revert the order of qubits as the state preparation algorithm
     // we use assumes the opposite.
     auto qubitIndex = [&](std::size_t i) { return numQubits - i - 1; };

From b62bb526f1bcfa24f66867ca6fcf7001ea39e790 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Mon, 1 Jul 2024 14:08:00 -0700
Subject: [PATCH 11/50] Move state prep to after synthesis

---
 include/cudaq/Optimizer/Transforms/Passes.h   |   2 +
 include/cudaq/Optimizer/Transforms/Passes.td  |  11 +
 lib/Frontend/nvqpp/ConvertExpr.cpp            |  20 +-
 lib/Optimizer/CodeGen/ConvertToQIR.cpp        |  12 +
 .../Transforms/ApplyControlNegations.cpp      |   4 +
 lib/Optimizer/Transforms/BasisConversion.cpp  |  14 +-
 lib/Optimizer/Transforms/CMakeLists.txt       |   1 +
 .../Transforms/GenDeviceCodeLoader.cpp        |   7 +
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp |   2 +-
 .../Transforms/StatePreparation2.cpp          | 304 ++++++++++++++++++
 program.py                                    |  23 ++
 runtime/common/BaseRemoteRESTQPU.h            |  53 ++-
 runtime/common/RuntimeMLIRCommonImpl.h        |  10 +
 .../platform/default/rest/RemoteRESTQPU.cpp   |   2 +
 .../execution/state_preparation_vector.cpp    |  47 +--
 15 files changed, 488 insertions(+), 24 deletions(-)
 create mode 100644 lib/Optimizer/Transforms/StatePreparation2.cpp
 create mode 100644 program.py

diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index 422032326c..d0759cac85 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -42,6 +42,8 @@ std::unique_ptr<mlir::Pass> createQuakeAddMetadata();
 std::unique_ptr<mlir::Pass> createQuakeAddDeallocs();
 std::unique_ptr<mlir::Pass> createStatePreparation();
 std::unique_ptr<mlir::Pass> createStatePreparation(std::string_view, void *);
+std::unique_ptr<mlir::Pass> createStatePreparation2();
+std::unique_ptr<mlir::Pass> createStatePreparation2(std::string_view, void *);
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer();
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, void *);
 std::unique_ptr<mlir::Pass> createRaiseToAffinePass();
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index e5e15a8776..2a342e63c3 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -523,6 +523,17 @@ def PrepareState : Pass<"state-prep", "mlir::ModuleOp"> {
   let constructor = "cudaq::opt::createStatePreparation()";
 }
 
+def PrepareState2 : Pass<"state-prep2", "mlir::ModuleOp"> {
+  let summary =
+    "Convert state vector data into gates";
+  let description = [{
+    Convert quake representation that includes qubit initialization
+    from data into qubit initialization using gates.
+  }];
+
+  let constructor = "cudaq::opt::createStatePreparation2()";
+}
+
 def QuakeSynthesize : Pass<"quake-synth", "mlir::ModuleOp"> {
   let summary =
     "Synthesize concrete quantum program from Quake code plus runtime values.";
diff --git a/lib/Frontend/nvqpp/ConvertExpr.cpp b/lib/Frontend/nvqpp/ConvertExpr.cpp
index e5deb2e023..7c73faa2d5 100644
--- a/lib/Frontend/nvqpp/ConvertExpr.cpp
+++ b/lib/Frontend/nvqpp/ConvertExpr.cpp
@@ -15,6 +15,8 @@
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 
+#include <iostream>
+
 #define DEBUG_TYPE "lower-ast-expr"
 
 using namespace mlir;
@@ -2569,12 +2571,28 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
         }
       }
       return false;
-    }();
+    }(); 
     if (isVectorOfQubitRefs)
       return true;
     if (ctorName == "complex") {
       Value imag = popValue();
       Value real = popValue();
+
+      std::cout << "Real and Imag values" << std::endl;
+      real.dump();
+      imag.dump();
+      if (auto realOp = real.getDefiningOp<arith::ConstantFloatOp>()) {
+        if (auto imagOp = imag.getDefiningOp<arith::ConstantFloatOp>()) {
+          std::cout << "Creating const complex" << std::endl;
+          auto realConst = realOp.value().convertToDouble();
+          auto imagConst = imagOp.value().convertToDouble();
+          auto attr = (real.getType() == builder.getF64Type())?
+            builder.getF64ArrayAttr({realConst, imagConst}):
+            builder.getF32ArrayAttr({static_cast<float>(realConst), static_cast<float>(imagConst)});
+          return pushValue(builder.create<complex::ConstantOp>(loc, ComplexType::get(real.getType()), attr));
+        }
+      }
+      std::cout << "Creating non-const complex" << std::endl;
       return pushValue(builder.create<complex::CreateOp>(
           loc, ComplexType::get(real.getType()), real, imag));
     }
diff --git a/lib/Optimizer/CodeGen/ConvertToQIR.cpp b/lib/Optimizer/CodeGen/ConvertToQIR.cpp
index 245a887a2e..731bc0e5bc 100644
--- a/lib/Optimizer/CodeGen/ConvertToQIR.cpp
+++ b/lib/Optimizer/CodeGen/ConvertToQIR.cpp
@@ -36,6 +36,8 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
+#include <iostream>
+
 #define DEBUG_TYPE "convert-to-qir"
 
 namespace cudaq::opt {
@@ -95,6 +97,7 @@ class ConvertToQIR : public cudaq::opt::impl::ConvertToQIRBase<ConvertToQIR> {
   // buffer of constants.
   LogicalResult eraseConstantArrayOps() {
     bool ok = true;
+
     SmallVector<Operation *> cleanUps;
     getOperation().walk([&](cudaq::cc::ConstantArrayOp carr) {
       // If there is a constant array, then we expect that it is involved in
@@ -169,6 +172,9 @@ class ConvertToQIR : public cudaq::opt::impl::ConvertToQIRBase<ConvertToQIR> {
   /// ops. This step makes converting a DAG of nodes in the conversion step
   /// simpler.
   void runOnOperation() override final {
+    std::cout << "Before ConvertToQIR" << std::endl;
+    getOperation().dump();
+
     auto *context = &getContext();
     if (failed(fuseSubgraphPatterns(context, getOperation()))) {
       signalPassFailure();
@@ -204,11 +210,17 @@ class ConvertToQIR : public cudaq::opt::impl::ConvertToQIRBase<ConvertToQIR> {
     target.addLegalDialect<LLVM::LLVMDialect>();
     target.addLegalOp<ModuleOp>();
 
+
     if (failed(
             applyFullConversion(getOperation(), target, std::move(patterns)))) {
       LLVM_DEBUG(getOperation().dump());
+      std::cout << "Filed ConvertToQIR" << std::endl;
+      getOperation().dump();
       signalPassFailure();
     }
+
+    std::cout << "Succeded ConvertToQIR" << std::endl;
+    getOperation().dump();
   }
 };
 
diff --git a/lib/Optimizer/Transforms/ApplyControlNegations.cpp b/lib/Optimizer/Transforms/ApplyControlNegations.cpp
index c88f80e6a1..e10df9bd7c 100644
--- a/lib/Optimizer/Transforms/ApplyControlNegations.cpp
+++ b/lib/Optimizer/Transforms/ApplyControlNegations.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/Passes.h"
 
+#include <iostream>
 namespace cudaq::opt {
 #define GEN_PASS_DEF_APPLYCONTROLNEGATIONS
 #include "cudaq/Optimizer/Transforms/Passes.h.inc"
@@ -67,6 +68,9 @@ struct ApplyControlNegationsPass
 
   void runOnOperation() override {
     auto funcOp = getOperation();
+    std::cout << " >>>> ApplyControlNegations *** " << std::endl;
+    funcOp.dump();
+    std::cout << " <<< ApplyControlNegations *** " << std::endl;
     auto *ctx = &getContext();
     RewritePatternSet patterns(ctx);
     patterns.insert<
diff --git a/lib/Optimizer/Transforms/BasisConversion.cpp b/lib/Optimizer/Transforms/BasisConversion.cpp
index 326feb87f2..816e7c354d 100644
--- a/lib/Optimizer/Transforms/BasisConversion.cpp
+++ b/lib/Optimizer/Transforms/BasisConversion.cpp
@@ -18,6 +18,8 @@
 #include "mlir/Rewrite/FrozenRewritePatternSet.h"
 #include "mlir/Transforms/DialectConversion.h"
 
+#include <iostream>
+
 using namespace mlir;
 
 //===----------------------------------------------------------------------===//
@@ -103,6 +105,10 @@ struct BasisConversion
 
   void runOnOperation() override {
     auto module = getOperation();
+
+    std::cout << "Before BasisConversion" << std::endl;
+    getOperation().dump();
+
     if (basis.empty()) {
       module.emitError("Basis conversion requires a target basis");
       signalPassFailure();
@@ -161,8 +167,14 @@ struct BasisConversion
           return applyFullConversion(op, target, patterns);
         });
 
-    if (failed(rewriteResult))
+    if (failed(rewriteResult)) {
       signalPassFailure();
+      std::cout << "Failed BasisConversion" << std::endl;
+      getOperation().dump();
+    } else {
+      std::cout << "Succeeded BasisConversion" << std::endl;
+      getOperation().dump();
+    }
   }
 };
 
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index 173cec4538..fc547f41bd 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -44,6 +44,7 @@ add_cudaq_library(OptTransforms
   RegToMem.cpp
   StateDecomposer.cpp
   StatePreparation.cpp
+  StatePreparation2.cpp
   PySynthCallableBlockArgs.cpp
 
   DEPENDS
diff --git a/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp b/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp
index 96e3dcce70..c9dd468376 100644
--- a/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp
+++ b/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp
@@ -9,6 +9,7 @@
 #include "PassDetails.h"
 #include "cudaq/Optimizer/Builder/Factory.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
 #include "cudaq/Optimizer/Transforms/Passes.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/Support/Debug.h"
@@ -93,6 +94,7 @@ class GenerateDeviceCodeLoader
     // declarations are just thrown away when the code is JIT compiled.
     SmallVector<Operation *> declarations;
     for (auto &op : *module.getBody()) {
+      llvm::errs() << "**ADDING OP ***: " << op;
       if (auto funcOp = dyn_cast<func::FuncOp>(op)) {
         if (funcOp.empty()) {
           LLVM_DEBUG(llvm::dbgs() << "adding declaration: " << op);
@@ -103,6 +105,11 @@ class GenerateDeviceCodeLoader
           LLVM_DEBUG(llvm::dbgs() << "adding declaration: " << op);
           declarations.push_back(&op);
         }
+      }       
+      // cc.global constant @__nvqpp__rodata_init_0 (dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : tensor<4xf64>) : !cc.array<f64 x 4>
+      else if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op)) {
+        LLVM_DEBUG(llvm::dbgs() << "adding global: " << op);
+        declarations.push_back(&op);
       }
     }
 
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index 7d83c152dd..1b1dddc028 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -142,7 +142,7 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
       // Stick global at end of Module.
       builder.setInsertionPointToEnd(module.getBody());
       std::string symbol =
-          "__nvqpp_rodata_init_state." + std::to_string(counter++);
+          "__nvqpp_rodata_init_state_qs." + std::to_string(counter++);
       builder.create<cudaq::cc::GlobalOp>(argLoc, arrTy, symbol, arrayAttr,
                                           /*isConstant=*/true,
                                           /*isExternal=*/false);
diff --git a/lib/Optimizer/Transforms/StatePreparation2.cpp b/lib/Optimizer/Transforms/StatePreparation2.cpp
new file mode 100644
index 0000000000..a8047821a0
--- /dev/null
+++ b/lib/Optimizer/Transforms/StatePreparation2.cpp
@@ -0,0 +1,304 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "PassDetails.h"
+#include "StateDecomposer.h"
+#include "cudaq/Optimizer/Builder/Runtime.h"
+#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Target/LLVMIR/TypeToLLVM.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include <span>
+
+#include <iostream>
+
+#define DEBUG_TYPE "state-preparation2"
+
+using namespace mlir;
+
+/// Replace a qubit initialization from vectors with quantum gates.
+/// For example:
+///
+/// func.func @__nvqpp__mlirgen__function_test._Z4testSt6vectorISt7complexIfESaIS1_EE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+///   %0 = cc.address_of @__nvqpp_rodata_init_state.0 : !cc.ptr<!cc.array<complex<f32> x 4>>
+///   %1 = cc.cast %0 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+///   %2 = quake.alloca !quake.veq<2>
+///   %3 = quake.init_state %2, %1 : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
+///   return
+/// }
+/// 
+/// is converted to:
+///
+///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
+///     %0 = quake.alloca !quake.veq<2>
+///     %c0_i64 = arith.constant 0 : i64
+///     %1 = quake.extract_ref %0[%c0_i64] : (!quake.veq<2>, i64) -> !quake.ref
+///     %cst = arith.constant 1.5707963267948968 : f64
+///     quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+///     %c1_i64 = arith.constant 1 : i64
+///     %2 = quake.extract_ref %0[%c1_i64] : (!quake.veq<2>, i64) -> !quake.ref
+///     %cst_0 = arith.constant 1.5707963267948966 : f64
+///     quake.ry (%cst_0) %2 : (f64, !quake.ref) -> ()
+///     quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
+///     %cst_1 = arith.constant -1.5707963267948966 : f64
+///     quake.ry (%cst_1) %2 : (f64, !quake.ref) -> ()
+///     quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
+///     return
+///   }
+///
+/// Note: the following synthesis and const prop passes will replace
+/// the argument by a constant and propagate the values and vector size
+/// through other instructions.
+
+namespace {
+
+template <typename T>
+concept IntegralType =
+    std::is_same<T, bool>::value || std::is_same<T, std::int8_t>::value ||
+    std::is_same<T, std::int16_t>::value ||
+    std::is_same<T, std::int32_t>::value ||
+    std::is_same<T, std::int64_t>::value;
+
+template <typename T>
+concept FloatingType = std::is_same<T, float>::value;
+
+template <typename T>
+concept DoubleType = std::is_same<T, double>::value;
+
+template <typename T>
+concept ComplexDataType = FloatingType<T> || DoubleType<T> || IntegralType<T>;
+
+/// Input was complex<float> but we prefer
+/// complex<double>. Make a copy, extending the values.
+template <FloatingType From>
+std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
+                                                   std::uint64_t size) {
+  auto convertData = std::vector<std::complex<double>>(size);
+  for (std::size_t i = 0; i < size; ++i)
+    convertData[i] = std::complex<double>{static_cast<double>(data[i].real()),
+                                          static_cast<double>(data[i].imag())};
+  return convertData;
+}
+
+template <DoubleType From>
+std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
+                                                   std::uint64_t size) {
+  return std::vector<std::complex<From>>(data, data + size);
+}
+
+/// Input was float/double but we prefer complex<double>.
+/// Make a copy, extending or truncating the values.
+template <ComplexDataType From>
+std::vector<std::complex<double>> convertToComplex(From *data,
+                                                   std::uint64_t size) {
+  auto convertData = std::vector<std::complex<double>>(size);
+  for (std::size_t i = 0; i < size; ++i)
+    convertData[i] = std::complex<double>{static_cast<double>(data[i]),
+                                          static_cast<double>(0.0)};
+  return convertData;
+}
+
+/// Read the constant initializer of `global` as a list of complex amplitudes.
+///
+/// Only globals whose type is a `cc.array` are handled; anything else yields
+/// an empty vector. Two initializer encodings are understood:
+///   * an `ElementsAttr`, where every element becomes a purely real amplitude;
+///   * an `ArrayAttr`, where float and integer element types produce purely
+///     real amplitudes and a complex element type consumes two consecutive
+///     float attributes (real, then imaginary) per amplitude.
+///
+/// \param builder Unused; kept so existing callers keep compiling.
+/// \param global  The `cc.global` whose value is decoded.
+/// \returns The decoded amplitudes, in initializer order.
+///
+/// The function also prints the element type and the decoded values as a
+/// debugging aid.
+std::vector<std::complex<double>> readConstantArray(mlir::OpBuilder &builder, cudaq::cc::GlobalOp &global) {
+  std::vector<std::complex<double>> result{};
+
+  auto attr = global.getValue();
+  auto type = global.getType().getElementType();
+
+  if (auto arrayTy = dyn_cast<cudaq::cc::ArrayType>(type)) {
+    auto eleTy = arrayTy.getElementType();
+    std::cout << "Attribute element type:" << std::endl;
+    eleTy.dump();
+
+    if (attr.has_value()) {
+      if (auto elementsAttr = dyn_cast<mlir::ElementsAttr>(attr.value())) {
+        // NOTE(review): presumably this requires the elements to be stored as
+        // f64 and treats every element as a real amplitude - TODO confirm for
+        // f32 and complex element types.
+        auto values = elementsAttr.getValues<double>();
+        for (auto it = values.begin(); it != values.end(); ++it) {
+          result.push_back({*it, 0.0});
+        }
+      } else if (auto values = dyn_cast<mlir::ArrayAttr>(attr.value())) {
+        result.reserve(values.size());
+        for (auto it = values.begin(), end = values.end(); it != end; ++it) {
+          auto real = *it;
+
+          if (isa<FloatType>(eleTy)) {
+            result.emplace_back(
+                cast<FloatAttr>(real).getValue().convertToDouble(), 0.0);
+            continue;
+          }
+          if (isa<IntegerType>(eleTy)) {
+            result.emplace_back(
+                static_cast<double>(cast<IntegerAttr>(real).getInt()), 0.0);
+            continue;
+          }
+
+          // Complex elements are flattened as (real, imag) pairs. Make sure
+          // the imaginary part exists before stepping onto it; a dangling
+          // real part at the end of a malformed initializer is dropped rather
+          // than reading past the end of the array.
+          assert(isa<ComplexType>(eleTy));
+          if (++it == end)
+            break;
+          auto imag = *it;
+          result.emplace_back(
+              cast<FloatAttr>(real).getValue().convertToDouble(),
+              cast<FloatAttr>(imag).getValue().convertToDouble());
+        }
+      }
+    }
+  }
+
+  std::cout << "Results (" <<  result.size() << "):" << std::endl;
+  for (auto &r: result) {
+    std::cout << r << ", " << std::endl;
+  }
+  return result;
+}
+
+/// Rewrite every `quake.init_state` whose data is a constant global array
+/// into an explicit gate sequence.
+///
+/// The pattern that is matched looks like:
+///
+///   %0 = cc.address_of @global : !cc.ptr<!cc.array<T x N>>
+///   %1 = cc.cast %0 : ... -> !cc.ptr<T>            (optional)
+///   %2 = quake.alloca !quake.veq<n>
+///   %3 = quake.init_state %2, %1 : ...
+///
+/// For each match the amplitudes are read from the global, the state
+/// preparation gates are emitted right after the `init_state`, and all uses
+/// of the `init_state` result are redirected to the allocated qubits.
+/// `init_state` ops that do not match are left untouched.
+///
+/// \param builder Builder used to emit the gates; its insertion point is
+///                moved by this function.
+/// \param module  Module that is walked and in which globals are looked up.
+/// \returns Always `success()`.
+LogicalResult
+transform(OpBuilder &builder, ModuleOp module) {
+  // Ops are only recorded during the walk and erased afterwards, users before
+  // producers. Each list is kept duplicate free because several init_state
+  // ops may share the same cast, address_of or global.
+  std::vector<mlir::Operation *> initOps;
+  std::vector<mlir::Operation *> castOps;
+  std::vector<mlir::Operation *> addrOps;
+  std::vector<mlir::Operation *> globalOps;
+  auto remember = [](std::vector<mlir::Operation *> &ops,
+                     mlir::Operation *op) {
+    if (!llvm::is_contained(ops, op))
+      ops.push_back(op);
+  };
+
+  module->walk([&](quake::InitializeStateOp initOp) {
+    auto loc = initOp.getLoc();
+
+    // Find the qvector alloc. The templated getDefiningOp tolerates values
+    // without a defining op (e.g. block arguments).
+    auto qubits = initOp.getOperand(0);
+    if (!qubits.getDefiningOp<quake::AllocaOp>())
+      return;
+
+    // Find vector data, looking through an optional pointer cast.
+    auto data = initOp.getOperand(1);
+    auto castOp = data.getDefiningOp<cudaq::cc::CastOp>();
+    if (castOp)
+      data = castOp.getOperand();
+    auto addr = data.getDefiningOp<cudaq::cc::AddressOfOp>();
+    if (!addr)
+      return;
+
+    // The typed lookup yields a null op when the symbol is missing or is not
+    // a cc.global.
+    auto global =
+        module.lookupSymbol<cudaq::cc::GlobalOp>(addr.getGlobalName());
+    if (!global)
+      return;
+
+    // Read state initialization data from the global array.
+    auto vec = readConstantArray(builder, global);
+
+    // Prepare state from vector data.
+    builder.setInsertionPointAfter(initOp);
+    auto gateBuilder = StateGateBuilder(builder, loc, qubits);
+    auto decomposer = StateDecomposer(gateBuilder, vec);
+    decomposer.decompose();
+
+    initOp.replaceAllUsesWith(qubits);
+
+    // Only ops that took part in a successful rewrite become candidates for
+    // removal.
+    remember(initOps, initOp);
+    if (castOp)
+      remember(castOps, castOp);
+    remember(addrOps, addr);
+    remember(globalOps, global);
+  });
+
+  // The init_state results were replaced above, so these have no uses left.
+  for (auto *op : initOps)
+    op->erase();
+  // Casts and address_of ops may still feed code that was not rewritten.
+  for (auto *op : castOps)
+    if (op->use_empty())
+      op->erase();
+  for (auto *op : addrOps)
+    if (op->use_empty())
+      op->erase();
+  // A global is referenced by symbol, so ask the symbol table for its uses.
+  for (auto *op : globalOps)
+    if (mlir::SymbolTable::symbolKnownUseEmpty(op, module.getOperation()))
+      op->erase();
+
+  return success();
+}
+
+/// Module pass that looks up the kernel named `kernelName` and then runs
+/// `transform` over the whole module to turn state initialization from
+/// constant data into gates. The module is dumped before and after the
+/// rewrite as a debugging aid.
+class StatePreparation2 : public cudaq::opt::PrepareState2Base<StatePreparation2> {
+protected:
+  // The name of the kernel to be synthesized
+  std::string kernelName;
+
+  // The raw pointer to the runtime arguments. Not read by this pass; it is
+  // null when the pass is default constructed.
+  void *args = nullptr;
+
+public:
+  StatePreparation2() = default;
+  StatePreparation2(std::string_view kernel, void *a)
+      : kernelName(kernel), args(a) {}
+
+  mlir::ModuleOp getModule() { return getOperation(); }
+
+
+  /// Fails the pass when the kernel function cannot be found in the module
+  /// or when `transform` reports failure.
+  void runOnOperation() override final {
+    auto module = getModule();
+
+    std::cout << "Module before state prep2" << std::endl;
+    module.dump();
+
+    auto kernelNameInQuake = cudaq::runtime::cudaqGenPrefixName + kernelName;
+    // Get the function we care about (the one with kernelName)
+    auto funcOp = module.lookupSymbol<func::FuncOp>(kernelNameInQuake);
+    if (!funcOp) {
+      module.emitOpError("The kernel '" + kernelName +
+                         "' was not found in the module.");
+      signalPassFailure();
+      return;
+    }
+
+    // Create the builder.
+    auto builder = OpBuilder::atBlockBegin(&funcOp.getBody().front());
+
+    auto result = transform(builder, module);
+    if (result.failed()) {
+      // Close the quote around the kernel name so the diagnostic reads like
+      // the "not found" message above.
+      module.emitOpError("Failed to prepare state for '" + kernelName + "'.");
+      signalPassFailure();
+      return;
+    }
+
+    std::cout << "Module after state prep2" << std::endl;
+    module.dump();
+  }
+};
+
+} // namespace
+
+/// Build a `StatePreparation2` pass with no kernel name and no arguments.
+std::unique_ptr<mlir::Pass> cudaq::opt::createStatePreparation2() {
+  std::unique_ptr<mlir::Pass> pass = std::make_unique<StatePreparation2>();
+  return pass;
+}
+
+/// Build a `StatePreparation2` pass for the kernel `kernelName`, forwarding
+/// the raw argument pointer `a` to the pass.
+std::unique_ptr<mlir::Pass>
+cudaq::opt::createStatePreparation2(std::string_view kernelName, void *a) {
+  std::unique_ptr<mlir::Pass> pass =
+      std::make_unique<StatePreparation2>(kernelName, a);
+  return pass;
+}
diff --git a/program.py b/program.py
new file mode 100644
index 0000000000..92321a755a
--- /dev/null
+++ b/program.py
@@ -0,0 +1,23 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+
+import cudaq
+import numpy as np
+
+c = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]  # four amplitudes handed to the kernel below
+
+@cudaq.kernel
+def kernel(vec: list[complex]):
+    q = cudaq.qvector(vec)  # construct the qvector from the amplitude list argument
+
+synthesized = cudaq.synthesize(kernel, c)  # presumably binds `c` as the kernel's `vec` argument - confirm against cudaq docs
+print(synthesized)
+
+counts = cudaq.sample(synthesized)
+assert '00' in counts  # both bitstrings are expected to appear in the sampled results
+assert '10' in counts
\ No newline at end of file
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 08f41e60ec..c469d543ee 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -18,6 +18,7 @@
 #include "cudaq/Optimizer/CodeGen/OpenQASMEmitter.h"
 #include "cudaq/Optimizer/CodeGen/Passes.h"
 #include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
 #include "cudaq/Optimizer/Transforms/Passes.h"
 #include "cudaq/Support/Plugin.h"
@@ -378,6 +379,18 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     auto moduleOp = builder.create<mlir::ModuleOp>();
     moduleOp.push_back(func.clone());
     moduleOp->setAttrs(m_module->getAttrDictionary());
+    for (auto &op: m_module.getOps()) {
+      // Add globals referenced in the func.
+      if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op)) {
+        //for (auto *use: globalOp->getUsers()) {
+        //  auto parent = use->getParentOfType<mlir::func::FuncOp>();
+        //  std::cout << "Global " << globalOp.getName().str() << " is used in " << parent.getName().str() <<std::endl;
+        //  if (parent.getName() == func.getName()) {
+            moduleOp.push_back(globalOp.clone());
+        //  }
+        //}
+      }
+    }
 
     // Lambda to apply a specific pipeline to the given ModuleOp
     auto runPassPipeline = [&](const std::string &pipeline,
@@ -398,10 +411,13 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         throw std::runtime_error("Remote rest platform Quake lowering failed.");
     };
 
+    std::cout << "Module before synthesis" << std::endl;
+    moduleOp.dump();
+
     if (updatedArgs) {
       cudaq::info("Run Quake Synth.\n");
       mlir::PassManager pm(&context);
-      pm.addPass(cudaq::opt::createStatePreparation(kernelName, updatedArgs));
+      //pm.addPass(cudaq::opt::createStatePreparation(kernelName, updatedArgs));
       pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
       if (disableMLIRthreading || enablePrintMLIREachPass)
         moduleOp.getContext()->disableMultithreading();
@@ -411,9 +427,42 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         throw std::runtime_error("Could not successfully apply quake-synth.");
     }
 
+    std::cout << "Module after synthesis" << std::endl;
+    moduleOp.dump();
+    // runPassPipeline("canonicalize,cse", moduleOp);
+    // std::cout << "Module after synthesis and cse" << std::endl;
+    // moduleOp.dump();
+
+    // Run the config-specified pass pipeline
+    //runPassPipeline(passPipelineConfig, moduleOp);
+    //runPassPipeline("cc-loop-unroll{allow-early-exit=1},canonicalize,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping", moduleOp);
+    //if (updatedArgs) {
+      cudaq::info("Run State Prep.\n");
+      mlir::PassManager pm(&context);
+      pm.addPass(cudaq::opt::createStatePreparation2(kernelName, updatedArgs));
+      if (disableMLIRthreading || enablePrintMLIREachPass)
+        moduleOp.getContext()->disableMultithreading();
+      if (enablePrintMLIREachPass)
+        pm.enableIRPrinting();
+      if (failed(pm.run(moduleOp)))
+        throw std::runtime_error("Could not successfully apply state prep.");
+    //}
+
+    std::cout << "Module after state prep" << std::endl;
+    moduleOp.dump();
+
+    runPassPipeline("canonicalize,cse", moduleOp);
+    std::cout << "Module after state prep and cse" << std::endl;
+    moduleOp.dump();
+
     // Run the config-specified pass pipeline
+    //runPassPipeline("cc-loop-unroll{allow-early-exit=1},canonicalize,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition)", moduleOp);
+    // runPassPipeline("cc-loop-unroll{allow-early-exit=1},canonicalize,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping", moduleOp);
     runPassPipeline(passPipelineConfig, moduleOp);
 
+    std::cout << "Module after state prep and pipeline" << std::endl;
+    moduleOp.dump();
+    
     auto entryPointFunc = moduleOp.lookupSymbol<mlir::func::FuncOp>(
         std::string("__nvqpp__mlirgen__") + kernelName);
     std::vector<std::size_t> mapping_reorder_idx;
@@ -479,6 +528,8 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
       // and use that for execution
       for (auto &[name, module] : modules) {
         auto clonedModule = module.clone();
+        std::cout << "Module after everything" << std::endl;
+        clonedModule.dump();
         jitEngines.emplace_back(
             cudaq::createQIRJITEngine(clonedModule, codegenTranslation));
       }
diff --git a/runtime/common/RuntimeMLIRCommonImpl.h b/runtime/common/RuntimeMLIRCommonImpl.h
index df06f750a0..91722e1751 100644
--- a/runtime/common/RuntimeMLIRCommonImpl.h
+++ b/runtime/common/RuntimeMLIRCommonImpl.h
@@ -40,6 +40,8 @@
 #include "mlir/Target/LLVMIR/Export.h"
 #include "mlir/Tools/ParseUtilities.h"
 
+#include <iostream>
+
 namespace cudaq {
 
 bool setupTargetTriple(llvm::Module *llvmModule) {
@@ -369,6 +371,11 @@ qirProfileTranslationFunction(const char *qirProfile, mlir::Operation *op,
   mlir::PassManager pm(context);
   if (printIntermediateMLIR)
     pm.enableIRPrinting();
+
+  std::cout << "qirProfileTranslationFunction" << std::endl;
+  pm.enableIRPrinting();
+  context->disableMultithreading();
+
   std::string errMsg;
   llvm::raw_string_ostream errOs(errMsg);
   cudaq::opt::addPipelineConvertToQIR(pm, qirProfile);
@@ -575,6 +582,9 @@ mlir::ExecutionEngine *createQIRJITEngine(mlir::ModuleOp &moduleOp,
     tm.setEnabled(cudaq::isTimingTagEnabled(cudaq::TIMING_JIT_PASSES));
     auto timingScope = tm.getRootScope(); // starts the timer
     pm.enableTiming(timingScope);         // do this right before pm.run
+    std::cout << "Common IR" << std::endl;
+    context->disableMultithreading();
+    pm.enableIRPrinting();
     if (failed(pm.run(module)))
       throw std::runtime_error(
           "[createQIRJITEngine] Lowering to QIR for remote emulation failed.");
diff --git a/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp b/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp
index f8318e1dec..39602a6fba 100644
--- a/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp
+++ b/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp
@@ -36,6 +36,8 @@ class RemoteRESTQPU : public cudaq::BaseRemoteRESTQPU {
 
     // Get the quake representation of the kernel
     auto quakeCode = cudaq::get_quake_by_name(kernelName);
+    std::cout << "extractQuakeCodeAndContext" << quakeCode << std::endl;
+
     auto m_module = parseSourceString<ModuleOp>(quakeCode, &context);
     if (!m_module)
       throw std::runtime_error("module cannot be parsed");
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index 35a628c06a..ddc8e6e265 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -11,10 +11,14 @@
 #include <cudaq.h>
 #include <iostream>
 
-__qpu__ void test(std::vector<cudaq::complex> inState) {
-  cudaq::qvector q1 = inState;
+__qpu__ void f() {
+   cudaq::qvector v = {1.0, 2.0, 3.0, 4.0};
 }
 
+// __qpu__ void test(std::vector<cudaq::complex> inState) {
+//   cudaq::qvector q1 = inState;
+// }
+
 void printCounts(cudaq::sample_result& result) {
   std::vector<std::string> values{};
   for (auto &&[bits, counts] : result) {
@@ -28,25 +32,28 @@ void printCounts(cudaq::sample_result& result) {
 }
 
 int main() {
-    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-    std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
-    {
-        // Passing state data as argument (kernel mode)
-        auto counts = cudaq::sample(test, vec);
-        printCounts(counts);
-
-        counts = cudaq::sample(test, vec1);
-        printCounts(counts);
-    }
-
-    {
-        // Passing state data as argument (builder mode)
-        auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
-        auto qubits = kernel.qalloc(v);
+    auto counts = cudaq::sample(f);
+    printCounts(counts);
+
+    // std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    // std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+    // {
+    //     // Passing state data as argument (kernel mode)
+    //     auto counts = cudaq::sample(test, vec);
+    //     printCounts(counts);
+
+    //     counts = cudaq::sample(test, vec1);
+    //     printCounts(counts);
+    // }
+
+    // {
+    //     // Passing state data as argument (builder mode)
+    //     auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
+    //     auto qubits = kernel.qalloc(v);
     
-        auto counts = cudaq::sample(kernel, vec);
-        printCounts(counts);
-    }
+    //     auto counts = cudaq::sample(kernel, vec);
+    //     printCounts(counts);
+    // }
 }
 
 // CHECK: 00

From 53d2644b745a4e1e8b69912c55bd4b17ac53ebaa Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 3 Jul 2024 09:11:09 -0700
Subject: [PATCH 12/50] Read complex numbers

---
 lib/Frontend/nvqpp/ConvertExpr.cpp                 | 14 ++++++++++++++
 targettests/execution/state_preparation_vector.cpp |  9 +++++----
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/lib/Frontend/nvqpp/ConvertExpr.cpp b/lib/Frontend/nvqpp/ConvertExpr.cpp
index 7c73faa2d5..8ec06e58af 100644
--- a/lib/Frontend/nvqpp/ConvertExpr.cpp
+++ b/lib/Frontend/nvqpp/ConvertExpr.cpp
@@ -2417,6 +2417,18 @@ static Type getEleTyFromVectorCtor(Type ctorTy) {
   return ctorTy;
 }
 
+mlir::Operation* constProp(OpBuilder &builder, Location &loc, Operation* op) { // NOTE(review): the call sites added in this patch pass and assign a Value, not an Operation* - confirm the intended signature
+  if (auto &constOp = dyn_cast<arith::ConstantFloatOp>(op)) { // a float constant is handed back unchanged
+    return op;
+  }
+  if (auto &truncOp = dyn_cast<arith::TruncFOp>(op)) {
+    auto truncated = truncOp->getOperand(0); // operand of the truncation; not used below
+    auto fTy = op->getType();
+    builder.create<arith::ConstantFloatOp>(loc, cast<FloatAttr>(val).getValue(), fTy); // NOTE(review): `val` is not declared in this function, and the created op is discarded
+  }
+
+} // NOTE(review): no return statement on the paths that reach the end of the function
+
 bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
   auto loc = toLocation(x);
   auto *ctor = x->getConstructor();
@@ -2579,6 +2591,8 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
       Value real = popValue();
 
       std::cout << "Real and Imag values" << std::endl;
+      real = constProp(builder, loc, real);
+      imag = constProp(builder, loc, imag);
       real.dump();
       imag.dump();
       if (auto realOp = real.getDefiningOp<arith::ConstantFloatOp>()) {
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index ddc8e6e265..1a96b3e881 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -12,12 +12,13 @@
 #include <iostream>
 
 __qpu__ void f() {
-   cudaq::qvector v = {1.0, 2.0, 3.0, 4.0};
+  cudaq::qvector v = { static_cast<cudaq::complex>(1.0), static_cast<cudaq::complex>(2.0), static_cast<cudaq::complex>(3.0), static_cast<cudaq::complex>(4.0)};
+  // cudaq::qvector v = { 1.0, 2.0, 3.0, 4.0};
 }
 
-// __qpu__ void test(std::vector<cudaq::complex> inState) {
-//   cudaq::qvector q1 = inState;
-// }
+__qpu__ void test(std::vector<cudaq::complex> inState) {
+  cudaq::qvector q1 = inState;
+}
 
 void printCounts(cudaq::sample_result& result) {
   std::vector<std::string> values{};

From 7e4523f7c7500498eff31a45b7550f6b8ab81018 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 3 Jul 2024 09:32:06 -0700
Subject: [PATCH 13/50] Merge with main

---
 lib/Frontend/nvqpp/ConvertExpr.cpp | 103 ++++++++++++++---------------
 1 file changed, 49 insertions(+), 54 deletions(-)

diff --git a/lib/Frontend/nvqpp/ConvertExpr.cpp b/lib/Frontend/nvqpp/ConvertExpr.cpp
index 8ec06e58af..6051d0794a 100644
--- a/lib/Frontend/nvqpp/ConvertExpr.cpp
+++ b/lib/Frontend/nvqpp/ConvertExpr.cpp
@@ -15,8 +15,6 @@
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 
-#include <iostream>
-
 #define DEBUG_TYPE "lower-ast-expr"
 
 using namespace mlir;
@@ -280,8 +278,10 @@ static Value toIntegerImpl(OpBuilder &builder, Location loc, Value bitVec) {
         auto eleTy =
             cast<cudaq::cc::SpanLikeType>(bitVec.getType()).getElementType();
         auto elePtrTy = cudaq::cc::PointerType::get(eleTy);
+        auto eleArrTy =
+            cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy));
         auto vecPtr =
-            builder.create<cudaq::cc::StdvecDataOp>(loc, elePtrTy, bitVec);
+            builder.create<cudaq::cc::StdvecDataOp>(loc, eleArrTy, bitVec);
         auto eleAddr = builder.create<cudaq::cc::ComputePtrOp>(
             loc, elePtrTy, vecPtr, ValueRange{kIter});
         Value bitElement = builder.create<cudaq::cc::LoadOp>(loc, eleAddr);
@@ -700,7 +700,7 @@ bool QuakeBridgeVisitor::VisitCastExpr(clang::CastExpr *x) {
       assert(result && "integer conversion failed");
       return result;
     }
-    TODO_loc(loc, "unhandled user defined implicit conversion");
+    TODO_loc(loc, "unhandled user-defined implicit conversion");
   }
   case clang::CastKind::CK_ConstructorConversion: {
     // Enable implicit conversion of surface types, which both map to VeqType.
@@ -1109,11 +1109,16 @@ bool QuakeBridgeVisitor::VisitMemberExpr(clang::MemberExpr *x) {
   if (auto *field = dyn_cast<clang::FieldDecl>(x->getMemberDecl())) {
     auto loc = toLocation(x->getSourceRange());
     auto object = popValue(); // DeclRefExpr
+    auto eleTy = cast<cc::PointerType>(object.getType()).getElementType();
+    SmallVector<cc::ComputePtrArg> offsets;
+    if (auto arrTy = dyn_cast<cc::ArrayType>(eleTy))
+      if (arrTy.isUnknownSize())
+        offsets.push_back(0);
     std::int32_t offset = field->getFieldIndex();
+    offsets.push_back(offset);
     auto ty = popType();
     return pushValue(builder.create<cc::ComputePtrOp>(
-        loc, cc::PointerType::get(ty), object,
-        SmallVector<cc::ComputePtrArg>{0, offset}));
+        loc, cc::PointerType::get(ty), object, offsets));
   }
   return true;
 }
@@ -1214,10 +1219,11 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
           assert(isa<FunctionType>(calleeTy));
           auto negativeOneIndex = getConstantInt(builder, loc, -1, 64);
           auto eleTy = cast<cc::SpanLikeType>(svec.getType()).getElementType();
+          auto eleArrTy = cc::PointerType::get(cc::ArrayType::get(eleTy));
           auto elePtrTy = cc::PointerType::get(eleTy);
           auto *ctx = eleTy.getContext();
           auto i64Ty = mlir::IntegerType::get(ctx, 64);
-          auto vecPtr = builder.create<cc::StdvecDataOp>(loc, elePtrTy, svec);
+          auto vecPtr = builder.create<cc::StdvecDataOp>(loc, eleArrTy, svec);
           auto vecLen = builder.create<cc::StdvecSizeOp>(loc, i64Ty, svec);
           Value vecLenMinusOne =
               builder.create<arith::AddIOp>(loc, vecLen, negativeOneIndex);
@@ -1231,9 +1237,10 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
           assert(isa<FunctionType>(calleeTy));
           auto eleTy = cast<cc::SpanLikeType>(svec.getType()).getElementType();
           auto elePtrTy = cc::PointerType::get(eleTy);
+          auto eleArrTy = cc::PointerType::get(cc::ArrayType::get(eleTy));
           auto *ctx = eleTy.getContext();
           auto i64Ty = mlir::IntegerType::get(ctx, 64);
-          auto vecPtr = builder.create<cc::StdvecDataOp>(loc, elePtrTy, svec);
+          auto vecPtr = builder.create<cc::StdvecDataOp>(loc, eleArrTy, svec);
           Value vecLen = builder.create<cc::StdvecSizeOp>(loc, i64Ty, svec);
           return pushValue(builder.create<cc::ComputePtrOp>(
               loc, elePtrTy, vecPtr, ValueRange{vecLen}));
@@ -1247,7 +1254,8 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
               builder.create<arith::ConstantIntOp>(loc, -1, 64);
           auto eleTy = cast<cc::SpanLikeType>(svec.getType()).getElementType();
           auto elePtrTy = cc::PointerType::get(eleTy);
-          auto vecPtr = builder.create<cc::StdvecDataOp>(loc, elePtrTy, svec);
+          auto eleArrTy = cc::PointerType::get(cc::ArrayType::get(eleTy));
+          auto vecPtr = builder.create<cc::StdvecDataOp>(loc, eleArrTy, svec);
           return pushValue(builder.create<cc::ComputePtrOp>(
               loc, elePtrTy, vecPtr, ValueRange{negativeOneIndex}));
         }
@@ -1890,7 +1898,8 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
         offset = builder.create<arith::MulIOp>(loc, scale, args[1]);
       } else {
         ptrTy = cc::PointerType::get(eleTy);
-        vecPtr = builder.create<cc::StdvecDataOp>(loc, ptrTy, args[0]);
+        auto arrTy = cc::PointerType::get(cc::ArrayType::get(eleTy));
+        vecPtr = builder.create<cc::StdvecDataOp>(loc, arrTy, args[0]);
       }
       auto ptr = builder.create<cc::ComputePtrOp>(loc, ptrTy, vecPtr,
                                                   ArrayRef<Value>{offset});
@@ -1950,7 +1959,13 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
       auto loInt = builder.create<cc::CastOp>(loc, i64Ty, args[0]);
       auto ptrTy = cast<cc::PointerType>(args[0].getType());
       auto eleTy = ptrTy.getElementType();
-      auto arrTy = cc::ArrayType::get(eleTy);
+      auto arrTy = dyn_cast<cc::ArrayType>(eleTy);
+      if (arrTy) {
+        eleTy = arrTy.getElementType();
+        ptrTy = cc::PointerType::get(eleTy);
+      } else {
+        arrTy = cc::ArrayType::get(eleTy);
+      }
       auto eleSize = eleTy.getIntOrFloatBitWidth();
       auto adjust = getConstantInt(builder, loc, eleSize / 4, i64Ty);
       auto dist = builder.create<arith::SubIOp>(loc, hiInt, loInt);
@@ -1988,9 +2003,8 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
       if (specArgs[0].getKind() == clang::TemplateArgument::ArgKind::Integral) {
         auto ptr = builder.create<cc::ComputePtrOp>(
             loc, resultTy, args[0],
-            ArrayRef<cc::ComputePtrArg>{
-                0, static_cast<std::int32_t>(
-                       specArgs[0].getAsIntegral().getExtValue())});
+            ArrayRef<cc::ComputePtrArg>{static_cast<std::int32_t>(
+                specArgs[0].getAsIntegral().getExtValue())});
         return pushValue(builder.create<cc::LoadOp>(loc, ptr));
       }
       auto *selectTy = specArgs[0].getAsType().getTypePtr();
@@ -1999,7 +2013,7 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
       for (auto &templateArg : specArgs[1].pack_elements()) {
         if (templateArg.getAsType().getTypePtr() == selectTy) {
           auto ptr = builder.create<cc::ComputePtrOp>(
-              loc, resultTy, args[0], ArrayRef<cc::ComputePtrArg>{0, i});
+              loc, resultTy, args[0], ArrayRef<cc::ComputePtrArg>{i});
           return pushValue(builder.create<cc::LoadOp>(loc, ptr));
         }
         ++i;
@@ -2112,7 +2126,8 @@ bool QuakeBridgeVisitor::VisitCXXOperatorCallExpr(
       }
       auto eleTy = cast<cc::StdvecType>(svec.getType()).getElementType();
       auto elePtrTy = cc::PointerType::get(eleTy);
-      auto vecPtr = builder.create<cc::StdvecDataOp>(loc, elePtrTy, svec);
+      auto eleArrTy = cc::PointerType::get(cc::ArrayType::get(eleTy));
+      auto vecPtr = builder.create<cc::StdvecDataOp>(loc, eleArrTy, svec);
       auto eleAddr = builder.create<cc::ComputePtrOp>(loc, elePtrTy, vecPtr,
                                                       ValueRange{indexVar});
       return replaceTOSValue(eleAddr);
@@ -2124,8 +2139,10 @@ bool QuakeBridgeVisitor::VisitCXXOperatorCallExpr(
       auto indexVar = popValue();
       auto svec = popValue();
       assert(svec.getType().isa<cc::StdvecType>());
-      auto elePtrTy = cc::PointerType::get(builder.getI8Type());
-      auto vecPtr = builder.create<cc::StdvecDataOp>(loc, elePtrTy, svec);
+      auto i8Ty = builder.getI8Type();
+      auto elePtrTy = cc::PointerType::get(i8Ty);
+      auto eleArrTy = cc::PointerType::get(cc::ArrayType::get(i8Ty));
+      auto vecPtr = builder.create<cc::StdvecDataOp>(loc, eleArrTy, svec);
       auto eleAddr = builder.create<cc::ComputePtrOp>(loc, elePtrTy, vecPtr,
                                                       ValueRange{indexVar});
       auto i1PtrTy = cc::PointerType::get(builder.getI1Type());
@@ -2353,13 +2370,21 @@ bool QuakeBridgeVisitor::VisitInitListExpr(clang::InitListExpr *x) {
             ArrayRef<cc::ComputePtrArg>{i / structMems, i % structMems});
       } else {
         auto ptrTy = cc::PointerType::get(structTy.getMembers()[i]);
-        ptr = builder.create<cc::ComputePtrOp>(
-            loc, ptrTy, alloca, ArrayRef<cc::ComputePtrArg>{0, i});
+        ptr = builder.create<cc::ComputePtrOp>(loc, ptrTy, alloca,
+                                               ArrayRef<cc::ComputePtrArg>{i});
       }
     } else {
-      auto ptrTy = cc::PointerType::get(eleTy);
-      ptr = builder.create<cc::ComputePtrOp>(loc, ptrTy, alloca,
-                                             ArrayRef<cc::ComputePtrArg>{i});
+      if (numEles > 1) {
+        auto ptrTy = cc::PointerType::get(eleTy);
+        ptr = builder.create<cc::ComputePtrOp>(loc, ptrTy, alloca,
+                                               ArrayRef<cc::ComputePtrArg>{i});
+      } else {
+        auto arrTy = cc::PointerType::get(cc::ArrayType::get(eleTy));
+        auto cast = builder.create<cc::CastOp>(loc, arrTy, alloca);
+        auto ptrTy = cc::PointerType::get(eleTy);
+        ptr = builder.create<cc::ComputePtrOp>(loc, ptrTy, cast,
+                                               ArrayRef<cc::ComputePtrArg>{i});
+      }
     }
     assert(ptr &&
            (v.getType() ==
@@ -2417,18 +2442,6 @@ static Type getEleTyFromVectorCtor(Type ctorTy) {
   return ctorTy;
 }
 
-mlir::Operation* constProp(OpBuilder &builder, Location &loc, Operation* op) {
-  if (auto &constOp = dyn_cast<arith::ConstantFloatOp>(op)) {
-    return op;
-  }
-  if (auto &truncOp = dyn_cast<arith::TruncFOp>(op)) {
-    auto truncated = truncOp->getOperand(0);
-    auto fTy = op->getType();
-    builder.create<arith::ConstantFloatOp>(loc, cast<FloatAttr>(val).getValue(), fTy);
-  }
-
-}
-
 bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
   auto loc = toLocation(x);
   auto *ctor = x->getConstructor();
@@ -2583,30 +2596,12 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
         }
       }
       return false;
-    }(); 
+    }();
     if (isVectorOfQubitRefs)
       return true;
     if (ctorName == "complex") {
       Value imag = popValue();
       Value real = popValue();
-
-      std::cout << "Real and Imag values" << std::endl;
-      real = constProp(builder, loc, real);
-      imag = constProp(builder, loc, imag);
-      real.dump();
-      imag.dump();
-      if (auto realOp = real.getDefiningOp<arith::ConstantFloatOp>()) {
-        if (auto imagOp = imag.getDefiningOp<arith::ConstantFloatOp>()) {
-          std::cout << "Creating const complex" << std::endl;
-          auto realConst = realOp.value().convertToDouble();
-          auto imagConst = imagOp.value().convertToDouble();
-          auto attr = (real.getType() == builder.getF64Type())?
-            builder.getF64ArrayAttr({realConst, imagConst}):
-            builder.getF32ArrayAttr({static_cast<float>(realConst), static_cast<float>(imagConst)});
-          return pushValue(builder.create<complex::ConstantOp>(loc, ComplexType::get(real.getType()), attr));
-        }
-      }
-      std::cout << "Creating non-const complex" << std::endl;
       return pushValue(builder.create<complex::CreateOp>(
           loc, ComplexType::get(real.getType()), real, imag));
     }

From 3b315937a7259856e2f9b84e96cdddc4e39fd5cb Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 5 Jul 2024 09:49:26 -0700
Subject: [PATCH 14/50] Cleanup

---
 include/cudaq/Optimizer/Transforms/Passes.h   |   5 +-
 include/cudaq/Optimizer/Transforms/Passes.td  |  11 -
 lib/Optimizer/CodeGen/ConvertToQIR.cpp        |   8 -
 .../Transforms/ApplyControlNegations.cpp      |   4 -
 lib/Optimizer/Transforms/BasisConversion.cpp  |  14 +-
 lib/Optimizer/Transforms/CMakeLists.txt       |   1 -
 .../Transforms/GenDeviceCodeLoader.cpp        |   5 +-
 .../Transforms/GenKernelExecution.cpp         |   1 +
 lib/Optimizer/Transforms/LiftArrayAlloc.cpp   | 133 ++++++-
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp |   2 +-
 lib/Optimizer/Transforms/StatePreparation.cpp | 375 ++++++------------
 .../Transforms/StatePreparation2.cpp          | 304 --------------
 program.py                                    |  23 --
 python/cudaq/kernel/ast_bridge.py             |  10 +-
 .../cudaq/platform/py_alt_launch_kernel.cpp   |   5 +-
 .../tests/kernel/test_kernel_qvector_init.py  | 152 +------
 runtime/common/BaseRemoteRESTQPU.h            |  50 +--
 runtime/common/BaseRestRemoteClient.h         |  13 +-
 runtime/common/RuntimeMLIRCommonImpl.h        |   9 -
 .../platform/default/rest/RemoteRESTQPU.cpp   |   2 -
 .../execution/state_preparation_vector.cpp    | 136 +++++--
 21 files changed, 394 insertions(+), 869 deletions(-)
 delete mode 100644 lib/Optimizer/Transforms/StatePreparation2.cpp
 delete mode 100644 program.py

diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index d0759cac85..9431b3da0d 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -36,14 +36,13 @@ createApplyOpSpecializationPass(bool computeActionOpt);
 std::unique_ptr<mlir::Pass> createDelayMeasurementsPass();
 std::unique_ptr<mlir::Pass> createExpandMeasurementsPass();
 std::unique_ptr<mlir::Pass> createLambdaLiftingPass();
+std::unique_ptr<mlir::Pass> createLiftArrayAllocPass();
 std::unique_ptr<mlir::Pass> createLowerToCFGPass();
 std::unique_ptr<mlir::Pass> createObserveAnsatzPass(std::vector<bool> &);
 std::unique_ptr<mlir::Pass> createQuakeAddMetadata();
 std::unique_ptr<mlir::Pass> createQuakeAddDeallocs();
 std::unique_ptr<mlir::Pass> createStatePreparation();
-std::unique_ptr<mlir::Pass> createStatePreparation(std::string_view, void *);
-std::unique_ptr<mlir::Pass> createStatePreparation2();
-std::unique_ptr<mlir::Pass> createStatePreparation2(std::string_view, void *);
+std::unique_ptr<mlir::Pass> createStatePreparation(std::string_view);
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer();
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, void *);
 std::unique_ptr<mlir::Pass> createRaiseToAffinePass();
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 7b0be15dc2..b00bf1f1e6 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -565,17 +565,6 @@ def PrepareState : Pass<"state-prep", "mlir::ModuleOp"> {
   let constructor = "cudaq::opt::createStatePreparation()";
 }
 
-def PrepareState2 : Pass<"state-prep2", "mlir::ModuleOp"> {
-  let summary =
-    "Convert state vector data into gates";
-  let description = [{
-    Convert quake representation that includes qubit initialization
-    from data into qubit initialization using gates.
-  }];
-
-  let constructor = "cudaq::opt::createStatePreparation2()";
-}
-
 def QuakeSynthesize : Pass<"quake-synth", "mlir::ModuleOp"> {
   let summary =
     "Synthesize concrete quantum program from Quake code plus runtime values.";
diff --git a/lib/Optimizer/CodeGen/ConvertToQIR.cpp b/lib/Optimizer/CodeGen/ConvertToQIR.cpp
index 6c34b568ac..e4f370876e 100644
--- a/lib/Optimizer/CodeGen/ConvertToQIR.cpp
+++ b/lib/Optimizer/CodeGen/ConvertToQIR.cpp
@@ -37,8 +37,6 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
-#include <iostream>
-
 #define DEBUG_TYPE "convert-to-qir"
 
 namespace cudaq::opt {
@@ -74,7 +72,6 @@ class ConvertToQIR : public cudaq::opt::impl::ConvertToQIRBase<ConvertToQIR> {
   // buffer of constants.
   LogicalResult eraseConstantArrayOps() {
     bool ok = true;
-
     SmallVector<Operation *> cleanUps;
     getOperation().walk([&](cudaq::cc::ConstantArrayOp carr) {
       // If there is a constant array, then we expect that it is involved in
@@ -149,9 +146,6 @@ class ConvertToQIR : public cudaq::opt::impl::ConvertToQIRBase<ConvertToQIR> {
   /// ops. This step makes converting a DAG of nodes in the conversion step
   /// simpler.
   void runOnOperation() override final {
-    std::cout << "Before ConvertToQIR" << std::endl;
-    getOperation().dump();
-
     auto *context = &getContext();
     if (failed(fuseSubgraphPatterns(context, getOperation()))) {
       signalPassFailure();
@@ -191,8 +185,6 @@ class ConvertToQIR : public cudaq::opt::impl::ConvertToQIRBase<ConvertToQIR> {
     LLVM_DEBUG(llvm::dbgs() << "Before conversion to QIR:\n"; op.dump());
     if (failed(applyFullConversion(op, target, std::move(patterns)))) {
       LLVM_DEBUG(getOperation().dump());
-      std::cout << "Filed ConvertToQIR" << std::endl;
-      getOperation().dump();
       signalPassFailure();
     }
     LLVM_DEBUG(llvm::dbgs() << "After conversion to QIR:\n"; op.dump());
diff --git a/lib/Optimizer/Transforms/ApplyControlNegations.cpp b/lib/Optimizer/Transforms/ApplyControlNegations.cpp
index e10df9bd7c..c88f80e6a1 100644
--- a/lib/Optimizer/Transforms/ApplyControlNegations.cpp
+++ b/lib/Optimizer/Transforms/ApplyControlNegations.cpp
@@ -16,7 +16,6 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/Passes.h"
 
-#include <iostream>
 namespace cudaq::opt {
 #define GEN_PASS_DEF_APPLYCONTROLNEGATIONS
 #include "cudaq/Optimizer/Transforms/Passes.h.inc"
@@ -68,9 +67,6 @@ struct ApplyControlNegationsPass
 
   void runOnOperation() override {
     auto funcOp = getOperation();
-    std::cout << " >>>> ApplyControlNegations *** " << std::endl;
-    funcOp.dump();
-    std::cout << " <<< ApplyControlNegations *** " << std::endl;
     auto *ctx = &getContext();
     RewritePatternSet patterns(ctx);
     patterns.insert<
diff --git a/lib/Optimizer/Transforms/BasisConversion.cpp b/lib/Optimizer/Transforms/BasisConversion.cpp
index 816e7c354d..326feb87f2 100644
--- a/lib/Optimizer/Transforms/BasisConversion.cpp
+++ b/lib/Optimizer/Transforms/BasisConversion.cpp
@@ -18,8 +18,6 @@
 #include "mlir/Rewrite/FrozenRewritePatternSet.h"
 #include "mlir/Transforms/DialectConversion.h"
 
-#include <iostream>
-
 using namespace mlir;
 
 //===----------------------------------------------------------------------===//
@@ -105,10 +103,6 @@ struct BasisConversion
 
   void runOnOperation() override {
     auto module = getOperation();
-
-    std::cout << "Before BasisConversion" << std::endl;
-    getOperation().dump();
-
     if (basis.empty()) {
       module.emitError("Basis conversion requires a target basis");
       signalPassFailure();
@@ -167,14 +161,8 @@ struct BasisConversion
           return applyFullConversion(op, target, patterns);
         });
 
-    if (failed(rewriteResult)) {
+    if (failed(rewriteResult))
       signalPassFailure();
-      std::cout << "Failed BasisConversion" << std::endl;
-      getOperation().dump();
-    } else {
-      std::cout << "Succeeded BasisConversion" << std::endl;
-      getOperation().dump();
-    }
   }
 };
 
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index d3e15d1382..717e379ef4 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -45,7 +45,6 @@ add_cudaq_library(OptTransforms
   RegToMem.cpp
   StateDecomposer.cpp
   StatePreparation.cpp
-  StatePreparation2.cpp
   PySynthCallableBlockArgs.cpp
 
   DEPENDS
diff --git a/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp b/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp
index c9dd468376..74f4306654 100644
--- a/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp
+++ b/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp
@@ -94,7 +94,6 @@ class GenerateDeviceCodeLoader
     // declarations are just thrown away when the code is JIT compiled.
     SmallVector<Operation *> declarations;
     for (auto &op : *module.getBody()) {
-      llvm::errs() << "**ADDING OP ***: " << op;
       if (auto funcOp = dyn_cast<func::FuncOp>(op)) {
         if (funcOp.empty()) {
           LLVM_DEBUG(llvm::dbgs() << "adding declaration: " << op);
@@ -105,9 +104,7 @@ class GenerateDeviceCodeLoader
           LLVM_DEBUG(llvm::dbgs() << "adding declaration: " << op);
           declarations.push_back(&op);
         }
-      }       
-      // cc.global constant @__nvqpp__rodata_init_0 (dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : tensor<4xf64>) : !cc.array<f64 x 4>
-      else if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op)) {
+      } else if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op)) {
         LLVM_DEBUG(llvm::dbgs() << "adding global: " << op);
         declarations.push_back(&op);
       }
diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp
index e0877d4c3d..3c76d9e197 100644
--- a/lib/Optimizer/Transforms/GenKernelExecution.cpp
+++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp
@@ -1243,6 +1243,7 @@ class GenerateKernelExecution
         }
         continue;
       }
+
       stVal = builder.create<cudaq::cc::InsertValueOp>(loc, stVal.getType(),
                                                        stVal, arg, idx);
     }
diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
index 8093a477e6..d4c0c335ed 100644
--- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
+++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
@@ -38,11 +38,12 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
                                 PatternRewriter &rewriter) const override {
     SmallVector<Operation *> stores;
     bool toGlobal = false;
-    if (!isGoodCandidate(alloc, stores, dom, toGlobal))
+    if (!isGoodCandidate(alloc, stores, dom, toGlobal)) {
       return failure();
+    }
 
     LLVM_DEBUG(llvm::dbgs() << "Candidate was found\n");
-    auto arrTy = cast<cudaq::cc::ArrayType>(alloc.getElementType());
+    auto arrTy = cast<cudaq::cc::ArrayType>(alloc.getType().getElementType());
     SmallVector<Attribute> values;
 
     // Every element of `stores` must be a cc::StoreOp with a ConstantOp as the
@@ -65,6 +66,7 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
     Value conArr;
     Value conGlobal;
     if (toGlobal) {
+      auto ip = rewriter.saveInsertionPoint();
       static unsigned counter = 0;
       auto ptrTy = cudaq::cc::PointerType::get(arrTy);
       // Build a new name based on the kernel name.
@@ -110,6 +112,7 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
                                                /*isExternal=*/false);
         }
       }
+      rewriter.restoreInsertionPoint(ip);
       conGlobal = rewriter.create<cudaq::cc::AddressOfOp>(loc, ptrTy, name);
       conArr = rewriter.create<cudaq::cc::LoadOp>(loc, arrTy, conGlobal);
     } else {
@@ -117,6 +120,8 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
           rewriter.create<cudaq::cc::ConstantArrayOp>(loc, arrTy, valuesAttr);
     }
 
+    std::vector<mlir::Operation *> toErase;
+
     // Rewalk all the uses of alloc, u, which must be cc.cast or cc.compute_ptr.
     // For each,u, remove a store and replace a load with a cc.extract_value.
     for (auto &use : alloc->getUses()) {
@@ -128,6 +133,7 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
       for (auto &useuse : user->getUses()) {
         auto *useuser = useuse.getOwner();
         if (auto ist = dyn_cast<quake::InitializeStateOp>(useuser)) {
+          rewriter.setInsertionPointAfter(useuser);
           LLVM_DEBUG(llvm::dbgs() << "replaced init_state\n");
           assert(conGlobal && "global must be defined");
           rewriter.replaceOpWithNewOp<quake::InitializeStateOp>(
@@ -135,23 +141,31 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
           continue;
         }
         if (auto load = dyn_cast<cudaq::cc::LoadOp>(useuser)) {
+          rewriter.setInsertionPointAfter(useuser);
           LLVM_DEBUG(llvm::dbgs() << "replaced load\n");
           rewriter.replaceOpWithNewOp<cudaq::cc::ExtractValueOp>(
               load, eleTy, conArr,
               ArrayRef<cudaq::cc::ExtractValueArg>{offset});
           continue;
         }
-        if (isa<cudaq::cc::StoreOp>(useuser))
-          rewriter.eraseOp(useuser);
+        if (isa<cudaq::cc::StoreOp>(useuser)) {
+          toErase.push_back(useuser);
+          continue;
+        }
         isLive = true;
       }
       if (!isLive)
-        rewriter.eraseOp(user);
+        toErase.push_back(user);
     }
     if (toGlobal) {
+      rewriter.setInsertionPointAfter(alloc);
       rewriter.replaceOp(alloc, conGlobal);
     } else {
-      rewriter.eraseOp(alloc);
+      toErase.push_back(alloc);
+    }
+
+    for (auto *op : toErase) {
+      rewriter.eraseOp(op);
     }
     return success();
   }
@@ -182,8 +196,8 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
     if (std::distance(alloc->getUses().begin(), alloc->getUses().end()) < size)
       return false;
 
-    // Keep a scoreboard for every element in the array. Every element *must* be
-    // stored to with a constant exactly one time.
+    //  Keep a scoreboard for every element in the array. Every element *must*
+    //  be stored to with a constant exactly one time.
     scoreboard.resize(size);
     for (int i = 0; i < size; i++)
       scoreboard[i] = nullptr;
@@ -249,11 +263,18 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
               scoreboard[0] = w;
               continue;
             }
-          return false;
+          // This can be a cast that is only used by a quake.init_state.
+          continue;
+        } else {
+          if (getWriteOp(cast, 0)) {
+            LLVM_DEBUG(llvm::dbgs()
+                       << "another cast used in store: " << *op << '\n');
+            return false;
+          }
+          // This can be a cast that is only used by a quake.init_state.
+          continue;
         }
         LLVM_DEBUG(llvm::dbgs() << "unexpected cast: " << *op << '\n');
-        toGlobalUses.push_back(op);
-        toGlobal = true;
         continue;
       }
       LLVM_DEBUG(llvm::dbgs() << "unexpected use: " << *op << '\n');
@@ -321,6 +342,88 @@ class ComplexCreatePattern : public OpRewritePattern<complex::CreateOp> {
   }
 };
 
+// Constant-fold arith.truncf: when the operand is an arith.constant float,
+// replace the op with a constant built from the value narrowed to `float`.
+class FloatTruncatePattern : public OpRewritePattern<arith::TruncFOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(arith::TruncFOp truncate,
+                                PatternRewriter &rewriter) const override {
+    auto src = truncate.getOperand().getDefiningOp<arith::ConstantFloatOp>();
+    if (!src)
+      return failure();
+    // The value is read as a double and narrowed with static_cast<float>.
+    auto narrowed = static_cast<float>(src.value().convertToDouble());
+    auto resultTy = dyn_cast<FloatType>(truncate.getType());
+    rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(
+        truncate, APFloat{narrowed}, resultTy);
+    return success();
+  }
+};
+
+// Constant-fold arith.extf: when the operand is an arith.constant float,
+// replace the op with a constant built from the value widened to `double`.
+class FloatExtendPattern : public OpRewritePattern<arith::ExtFOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(arith::ExtFOp extend,
+                                PatternRewriter &rewriter) const override {
+    auto src = extend.getOperand().getDefiningOp<arith::ConstantFloatOp>();
+    if (!src)
+      return failure();
+    // The value is read as a float and widened with static_cast<double>.
+    auto widened = static_cast<double>(src.value().convertToFloat());
+    auto resultTy = dyn_cast<FloatType>(extend.getType());
+    rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(
+        extend, APFloat{widened}, resultTy);
+    return success();
+  }
+};
+
+// Fold complex.re ops if the argument is constant: the op is replaced by an
+// arith.constant holding the real part of the complex.constant operand.
+class ComplexRePattern : public OpRewritePattern<complex::ReOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(complex::ReOp re,
+                                PatternRewriter &rewriter) const override {
+    auto valCon = re.getOperand().getDefiningOp<complex::ConstantOp>();
+    if (!valCon)
+      return failure();
+    // The constant's value is the array [real, imag]. Reuse the stored
+    // APFloat as is: going through `cond ? double : float` would always
+    // yield a double, whose semantics do not match an f32 result type.
+    auto real = cast<FloatAttr>(valCon.getValue()[0]).getValue();
+    auto fTy = cast<FloatType>(re.getType());
+    rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(re, real, fTy);
+    return success();
+  }
+};
+
+// Fold complex.im ops if the argument is constant: the op is replaced by an
+// arith.constant holding the imaginary part of the complex.constant operand.
+class ComplexImPattern : public OpRewritePattern<complex::ImOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(complex::ImOp im,
+                                PatternRewriter &rewriter) const override {
+    auto valCon = im.getOperand().getDefiningOp<complex::ConstantOp>();
+    if (!valCon)
+      return failure();
+    // The constant's value is the array [real, imag]; the imaginary part is
+    // element 1 (element 0 would fold complex.im to the real part). The
+    // stored APFloat is reused so its semantics match the result type.
+    auto imag = cast<FloatAttr>(valCon.getValue()[1]).getValue();
+    auto fTy = cast<FloatType>(im.getType());
+    rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(im, imag, fTy);
+    return success();
+  }
+};
+
 class LiftArrayAllocPass
     : public cudaq::opt::impl::LiftArrayAllocBase<LiftArrayAllocPass> {
 public:
@@ -338,6 +441,10 @@ class LiftArrayAllocPass
       RewritePatternSet patterns(ctx);
       patterns.insert<AllocaPattern>(ctx, domInfo, funcName, module);
       patterns.insert<ComplexCreatePattern>(ctx);
+      patterns.insert<FloatExtendPattern>(ctx);
+      patterns.insert<FloatTruncatePattern>(ctx);
+      patterns.insert<ComplexRePattern>(ctx);
+      patterns.insert<ComplexImPattern>(ctx);
 
       LLVM_DEBUG(llvm::dbgs()
                  << "Before lifting constant array: " << func << '\n');
@@ -352,3 +459,7 @@ class LiftArrayAllocPass
   }
 };
 } // namespace
+
+std::unique_ptr<mlir::Pass> cudaq::opt::createLiftArrayAllocPass() {
+  return std::make_unique<LiftArrayAllocPass>();
+}
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index 0545a4e296..d81fdd04c8 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -142,7 +142,7 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
       // Stick global at end of Module.
       builder.setInsertionPointToEnd(module.getBody());
       std::string symbol =
-          "__nvqpp_rodata_init_state_qs." + std::to_string(counter++);
+          "__nvqpp_rodata_init_state." + std::to_string(counter++);
       builder.create<cudaq::cc::GlobalOp>(argLoc, arrTy, symbol, arrayAttr,
                                           /*isConstant=*/true,
                                           /*isExternal=*/false);
diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
index 785e70b3f8..564a121f83 100644
--- a/lib/Optimizer/Transforms/StatePreparation.cpp
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -33,17 +33,17 @@ using namespace mlir;
 /// Replace a qubit initialization from vectors with quantum gates.
 /// For example:
 ///
-///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
-///     %0 = cc.stdvec_size %arg0 : (!cc.stdvec<complex<f32>>) -> i64
-///     %1 = math.cttz %0 : i64
-///     %2 = cc.stdvec_data %arg0 : (!cc.stdvec<complex<f32>>) ->
-///     !cc.ptr<complex<f32>> %3 = quake.alloca !quake.veq<?>[%1 : i64] %4 =
-///     quake.init_state %3, %2 : (!quake.veq<?>, !cc.ptr<complex<f32>>) ->
-///     !quake.veq<?> return
-///   }
+/// func.func
+/// @__nvqpp__mlirgen__function_test._Z4testSt6vectorISt7complexIfESaIS1_EE()
+/// attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+///   %0 = cc.address_of @__nvqpp_rodata_init_state.0 : !cc.ptr<!cc.array<complex<f32> x 4>>
+///   %1 = cc.cast %0 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+///   %2 = quake.alloca !quake.veq<2>
+///   %3 = quake.init_state %2, %1 : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
+///   return
+/// }
 ///
-/// On a call that passes std::vector<cudaq::complex> vec{M_SQRT1_2, 0., 0.,
-/// M_SQRT1_2} as arg0:
+/// is converted to:
 ///
 ///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
 ///     %0 = quake.alloca !quake.veq<2>
@@ -61,127 +61,114 @@ using namespace mlir;
 ///     quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
 ///     return
 ///   }
-///
-/// Note: the following synthesis and const prop passes will replace
-/// the argument by a constant and propagate the values and vector size
-/// through other instructions.
 
 namespace {
 
-template <typename T>
-concept IntegralType =
-    std::is_same<T, bool>::value || std::is_same<T, std::int8_t>::value ||
-    std::is_same<T, std::int16_t>::value ||
-    std::is_same<T, std::int32_t>::value ||
-    std::is_same<T, std::int64_t>::value;
-
-template <typename T>
-concept FloatingType = std::is_same<T, float>::value;
-
-template <typename T>
-concept DoubleType = std::is_same<T, double>::value;
-
-template <typename T>
-concept ComplexDataType = FloatingType<T> || DoubleType<T> || IntegralType<T>;
-
-/// Input was complex<float> but we prefer
-/// complex<double>. Make a copy, extending the values.
-template <FloatingType From>
-std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
-                                                   std::uint64_t size) {
-  auto convertData = std::vector<std::complex<double>>(size);
-  for (std::size_t i = 0; i < size; ++i)
-    convertData[i] = std::complex<double>{static_cast<double>(data[i].real()),
-                                          static_cast<double>(data[i].imag())};
-  return convertData;
-}
-
-template <DoubleType From>
-std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
-                                                   std::uint64_t size) {
-  return std::vector<std::complex<From>>(data, data + size);
-}
-
-/// Input was float/double but we prefer complex<double>.
-/// Make a copy, extending or truncating the values.
-template <ComplexDataType From>
-std::vector<std::complex<double>> convertToComplex(From *data,
-                                                   std::uint64_t size) {
-  auto convertData = std::vector<std::complex<double>>(size);
-  for (std::size_t i = 0; i < size; ++i)
-    convertData[i] = std::complex<double>{static_cast<double>(data[i]),
-                                          static_cast<double>(0.0)};
-  return convertData;
-}
-
-LogicalResult
-prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module,
-                               unsigned &counter, BlockArgument argument,
-                               std::vector<std::complex<double>> &vec) {
-  auto *ctx = builder.getContext();
-  auto argLoc = argument.getLoc();
-
-  auto toErase = std::vector<mlir::Operation *>();
-
-  for (auto *argUser : argument.getUsers()) {
-    // Handle the `StdvecSize` and `quake.alloca` use case:
-    // - Replace a `vec.size()` with the vector length.
-    // - Replace the number of qubits calculation with the vector length
-    // logarithm.
-    // - Replace `quake.alloca` with a constant size qvector allocation.
-    if (auto stdvecSizeOp = dyn_cast<cudaq::cc::StdvecSizeOp>(argUser)) {
-      builder.setInsertionPointAfter(stdvecSizeOp);
-      Value length = builder.create<arith::ConstantIntOp>(
-          argLoc, vec.size(), stdvecSizeOp.getType());
-
-      Value numQubits = builder.create<arith::ConstantIntOp>(
-          argLoc, log2(vec.size()), stdvecSizeOp.getType());
-
-      for (auto *sizeUser : argUser->getUsers()) {
-        if (auto countZeroesOp =
-                dyn_cast<mlir::math::CountTrailingZerosOp>(sizeUser)) {
-          for (auto *numQubitsUser : sizeUser->getUsers()) {
-            if (auto quakeAllocaOp = dyn_cast<quake::AllocaOp>(numQubitsUser)) {
-              builder.setInsertionPointAfter(quakeAllocaOp);
-              auto veqTy = quake::VeqType::get(ctx, log2(vec.size()));
-              Value newAlloc = builder.create<quake::AllocaOp>(argLoc, veqTy);
-              quakeAllocaOp.replaceAllUsesWith(newAlloc);
-              toErase.push_back(quakeAllocaOp);
-            }
+std::vector<std::complex<double>>
+readConstantArray(mlir::OpBuilder &builder, cudaq::cc::GlobalOp &global) {
+  std::vector<std::complex<double>> result{};
+
+  auto attr = global.getValue();
+  auto type = global.getType().getElementType();
+
+  if (auto arrayTy = dyn_cast<cudaq::cc::ArrayType>(type)) {
+    auto eleTy = arrayTy.getElementType();
+
+    if (attr.has_value()) {
+      if (auto elementsAttr = dyn_cast<mlir::ElementsAttr>(attr.value())) {
+        auto eleTy = elementsAttr.getElementType();
+        if (isa<ComplexType>(eleTy)) {
+          auto values = elementsAttr.getValues<mlir::ArrayAttr>();
+          for (auto it = values.begin(); it != values.end(); ++it) {
+            auto valueAttr = *it;
+            auto real =
+                cast<FloatAttr>(valueAttr[0]).getValue().convertToDouble();
+            auto imag =
+                cast<FloatAttr>(valueAttr[1]).getValue().convertToDouble();
+            result.push_back({real, imag});
           }
-          countZeroesOp.replaceAllUsesWith(numQubits);
-          toErase.push_back(countZeroesOp);
+        } else {
+          auto values = elementsAttr.getValues<double>();
+          for (auto it = values.begin(); it != values.end(); ++it) {
+            result.push_back({*it, 0.0});
+          }
+        }
+      } else if (auto values = dyn_cast<mlir::ArrayAttr>(attr.value())) {
+        for (auto it = values.begin(); it != values.end(); ++it) {
+          auto real = *it;
+          // Complex values are stored flattened as (real, imag) pairs, so the
+          // complex case below consumes two consecutive attributes per value.
+          auto v = [&]() -> std::complex<double> {
+            if (isa<FloatType>(eleTy))
+              return {cast<FloatAttr>(real).getValue().convertToDouble(),
+                      static_cast<double>(0.0)};
+            if (isa<IntegerType>(eleTy))
+              return {static_cast<double>(cast<IntegerAttr>(real).getInt()),
+                      static_cast<double>(0.0)};
+            assert(isa<ComplexType>(eleTy));
+            it++;
+            auto imag = *it;
+            return {cast<FloatAttr>(real).getValue().convertToDouble(),
+                    cast<FloatAttr>(imag).getValue().convertToDouble()};
+          }();
+
+          result.push_back(v);
         }
       }
-
-      stdvecSizeOp.replaceAllUsesWith(length);
-      toErase.push_back(stdvecSizeOp);
-      continue;
     }
+  }
 
-    // Handle the `StdvecDataOp` and `quake.init_state` use case:
-    // - Replace a `quake.init_state` with gates preparing the state.
-    if (auto stdvecDataOp = dyn_cast<cudaq::cc::StdvecDataOp>(argUser)) {
-      for (auto *dataUser : stdvecDataOp->getUsers()) {
-        if (auto initOp = dyn_cast<quake::InitializeStateOp>(dataUser)) {
-          builder.setInsertionPointAfter(initOp);
-          // Find the qvector alloc instruction
-          auto qubits = initOp.getOperand(0);
-
-          // Prepare state from vector data.
-          auto gateBuilder = StateGateBuilder(builder, argLoc, qubits);
-          auto decomposer = StateDecomposer(gateBuilder, vec);
-          decomposer.decompose();
+  return result;
+}
 
-          initOp.replaceAllUsesWith(qubits);
-          toErase.push_back(initOp);
+/// Replace each `quake.init_state` whose data comes from a constant global
+/// array with state-preparation gates, then erase the ops that became dead.
+/// Returns failure if an op scheduled for removal still has uses.
+LogicalResult transform(OpBuilder &builder, ModuleOp module) {
+  auto toErase = std::vector<mlir::Operation *>();
+  module->walk([&](Operation *op) {
+    if (auto initOp = dyn_cast<quake::InitializeStateOp>(op)) {
+      toErase.push_back(initOp);
+      auto loc = op->getLoc();
+      builder.setInsertionPointAfter(initOp);
+      // Find the qvector alloc. The templated getDefiningOp is null-safe, so
+      // block-argument operands are skipped instead of tripping dyn_cast.
+      auto qubits = initOp.getOperand(0);
+      if (qubits.getDefiningOp<quake::AllocaOp>()) {
+        // Find vector data.
+        auto data = initOp.getOperand(1);
+        if (auto cast = data.getDefiningOp<cudaq::cc::CastOp>()) {
+          data = cast.getOperand();
+          toErase.push_back(cast);
+        }
+        if (auto addr = data.getDefiningOp<cudaq::cc::AddressOfOp>()) {
+          auto globalName = addr.getGlobalName();
+          // lookupSymbol returns null when the symbol is missing.
+          auto symbol = module.lookupSymbol(globalName);
+          if (auto global = dyn_cast_or_null<cudaq::cc::GlobalOp>(symbol)) {
+            // Read state initialization data from the global array.
+            auto vec = readConstantArray(builder, global);
+            // Prepare state from vector data.
+            auto gateBuilder = StateGateBuilder(builder, loc, qubits);
+            auto decomposer = StateDecomposer(gateBuilder, vec);
+            decomposer.decompose();
+            initOp.replaceAllUsesWith(qubits);
+            toErase.push_back(addr);
+            toErase.push_back(global);
+          }
         }
       }
     }
-  }
+  });
 
   for (auto &op : toErase) {
-    op->erase();
+    if (op->getUses().empty()) {
+      op->erase();
+    } else {
+      module.emitOpError("StatePreparation failed to remove quake.init_state "
+                         "or its dependencies.");
+      return failure();
+    }
   }
 
   return success();
@@ -192,52 +179,14 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
   // The name of the kernel to be synthesized
   std::string kernelName;
 
-  // The raw pointer to the runtime arguments.
-  void *args;
-
 public:
   StatePreparation() = default;
-  StatePreparation(std::string_view kernel, void *a)
-      : kernelName(kernel), args(a) {}
+  StatePreparation(std::string_view kernel) : kernelName(kernel) {}
 
   mlir::ModuleOp getModule() { return getOperation(); }
 
-  std::pair<std::size_t, std::vector<std::size_t>>
-  getTargetLayout(FunctionType funcTy) {
-    auto bufferTy = cudaq::opt::factory::buildInvokeStructType(funcTy);
-    StringRef dataLayoutSpec = "";
-    if (auto attr =
-            getModule()->getAttr(cudaq::opt::factory::targetDataLayoutAttrName))
-      dataLayoutSpec = cast<StringAttr>(attr);
-    auto dataLayout = llvm::DataLayout(dataLayoutSpec);
-    // Convert bufferTy to llvm.
-    llvm::LLVMContext context;
-    LLVMTypeConverter converter(funcTy.getContext());
-    cudaq::opt::initializeTypeConversions(converter);
-    auto llvmDialectTy = converter.convertType(bufferTy);
-    LLVM::TypeToLLVMIRTranslator translator(context);
-    auto *llvmStructTy =
-        cast<llvm::StructType>(translator.translateType(llvmDialectTy));
-    auto *layout = dataLayout.getStructLayout(llvmStructTy);
-    auto strSize = layout->getSizeInBytes();
-    std::vector<std::size_t> fieldOffsets;
-    for (std::size_t i = 0, I = bufferTy.getMembers().size(); i != I; ++i)
-      fieldOffsets.emplace_back(layout->getElementOffset(i));
-    return {strSize, fieldOffsets};
-  }
-
   void runOnOperation() override final {
     auto module = getModule();
-    unsigned counter = 0;
-
-    if (args == nullptr || kernelName.empty()) {
-      module.emitOpError(
-          "State preparation requires a kernel and the values of the "
-          "arguments passed when it is called.");
-      signalPassFailure();
-      return;
-    }
-
     auto kernelNameInQuake = cudaq::runtime::cudaqGenPrefixName + kernelName;
     // Get the function we care about (the one with kernelName)
     auto funcOp = module.lookupSymbol<func::FuncOp>(kernelNameInQuake);
@@ -248,112 +197,12 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
       return;
     }
 
-    // Create the builder.
     auto builder = OpBuilder::atBlockBegin(&funcOp.getBody().front());
-    auto arguments = funcOp.getArguments();
-    auto structLayout = getTargetLayout(funcOp.getFunctionType());
-    // Keep track of the stdVec sizes.
-    std::vector<std::tuple<std::size_t, Type, std::uint64_t>> stdVecInfo;
-
-    for (auto iter : llvm::enumerate(arguments)) {
-      auto argNum = iter.index();
-      auto argument = iter.value();
-      std::size_t offset = structLayout.second[argNum];
-
-      // Get the argument type
-      auto type = argument.getType();
-
-      if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(type)) {
-        if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
-          funcOp.emitOpError(
-              "State preparation from cudaq::state is not supported.");
-          return;
-        }
-      }
-
-      // If std::vector<arithmetic> type, add it to the list of vector info.
-      // These will be processed when we reach the buffer's appendix.
-      if (auto vecTy = dyn_cast<cudaq::cc::StdvecType>(type)) {
-        auto eleTy = vecTy.getElementType();
-        if (!isa<IntegerType, FloatType, ComplexType>(eleTy)) {
-          continue;
-        }
-        char *ptrToSizeInBuffer = static_cast<char *>(args) + offset;
-        auto sizeFromBuffer =
-            *reinterpret_cast<std::uint64_t *>(ptrToSizeInBuffer);
-        unsigned bytesInType = [&eleTy]() {
-          if (auto complexTy = dyn_cast<ComplexType>(eleTy))
-            return 2 * cudaq::opt::convertBitsToBytes(
-                           complexTy.getElementType().getIntOrFloatBitWidth());
-          return cudaq::opt::convertBitsToBytes(eleTy.getIntOrFloatBitWidth());
-        }();
-        assert(bytesInType > 0 && "element must have a size");
-        auto vectorSize = sizeFromBuffer / bytesInType;
-        stdVecInfo.emplace_back(argNum, eleTy, vectorSize);
-        continue;
-      }
-    }
-
-    // For any `std::vector` arguments, we now know the sizes so let's replace
-    // the block arg with the actual vector element data. First get the pointer
-    // to the start of the buffer's appendix.
-    auto structSize = structLayout.first;
-    char *bufferAppendix = static_cast<char *>(args) + structSize;
-    for (auto [idx, eleTy, vecLength] : stdVecInfo) {
-      if (!eleTy) {
-        bufferAppendix += vecLength;
-        continue;
-      }
-      auto doVector = [&]<typename T>(T) {
-        auto *ptr = reinterpret_cast<T *>(bufferAppendix);
-        auto v = convertToComplex(ptr, vecLength);
-        if (failed(prepareStateFromVectorArgument(builder, module, counter,
-                                                  arguments[idx], v)))
-          funcOp.emitOpError("state preparation failed for vector<T>");
-        bufferAppendix += vecLength * sizeof(T);
-      };
-      if (auto ty = dyn_cast<IntegerType>(eleTy)) {
-        switch (ty.getIntOrFloatBitWidth()) {
-        case 1:
-          doVector(false);
-          break;
-        case 8:
-          doVector(std::int8_t{});
-          break;
-        case 16:
-          doVector(std::int16_t{});
-          break;
-        case 32:
-          doVector(std::int32_t{});
-          break;
-        case 64:
-          doVector(std::int64_t{});
-          break;
-        default:
-          bufferAppendix += vecLength * cudaq::opt::convertBitsToBytes(
-                                            ty.getIntOrFloatBitWidth());
-          funcOp.emitOpError(
-              "state preparation failed for vector<integral-type>.");
-          break;
-        }
-        continue;
-      }
-      if (eleTy == builder.getF32Type()) {
-        doVector(float{});
-        continue;
-      }
-      if (eleTy == builder.getF64Type()) {
-        doVector(double{});
-        continue;
-      }
-      if (eleTy == ComplexType::get(builder.getF32Type())) {
-        doVector(std::complex<float>{});
-        continue;
-      }
-      if (eleTy == ComplexType::get(builder.getF64Type())) {
-        doVector(std::complex<double>{});
-        continue;
-      }
+    auto result = transform(builder, module);
+    if (result.failed()) {
+      module.emitOpError("Failed to prepare state for '" + kernelName);
+      signalPassFailure();
+      return;
     }
   }
 };
@@ -365,6 +214,6 @@ std::unique_ptr<mlir::Pass> cudaq::opt::createStatePreparation() {
 }
 
 std::unique_ptr<mlir::Pass>
-cudaq::opt::createStatePreparation(std::string_view kernelName, void *a) {
-  return std::make_unique<StatePreparation>(kernelName, a);
+cudaq::opt::createStatePreparation(std::string_view kernelName) {
+  return std::make_unique<StatePreparation>(kernelName);
 }
diff --git a/lib/Optimizer/Transforms/StatePreparation2.cpp b/lib/Optimizer/Transforms/StatePreparation2.cpp
deleted file mode 100644
index a8047821a0..0000000000
--- a/lib/Optimizer/Transforms/StatePreparation2.cpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-#include "PassDetails.h"
-#include "StateDecomposer.h"
-#include "cudaq/Optimizer/Builder/Runtime.h"
-#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
-#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
-#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
-#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
-#include "cudaq/Optimizer/Transforms/Passes.h"
-#include "llvm/Support/Debug.h"
-#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Complex/IR/Complex.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Math/IR/Math.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Target/LLVMIR/TypeToLLVM.h"
-#include "mlir/Transforms/DialectConversion.h"
-#include "mlir/Transforms/RegionUtils.h"
-#include <span>
-
-#include <iostream>
-
-#define DEBUG_TYPE "state-preparation2"
-
-using namespace mlir;
-
-/// Replace a qubit initialization from vectors with quantum gates.
-/// For example:
-///
-/// func.func @__nvqpp__mlirgen__function_test._Z4testSt6vectorISt7complexIfESaIS1_EE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-///   %0 = cc.address_of @__nvqpp_rodata_init_state.0 : !cc.ptr<!cc.array<complex<f32> x 4>>
-///   %1 = cc.cast %0 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-///   %2 = quake.alloca !quake.veq<2>
-///   %3 = quake.init_state %2, %1 : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
-///   return
-/// }
-/// 
-/// is converted to:
-///
-///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
-///     %0 = quake.alloca !quake.veq<2>
-///     %c0_i64 = arith.constant 0 : i64
-///     %1 = quake.extract_ref %0[%c0_i64] : (!quake.veq<2>, i64) -> !quake.ref
-///     %cst = arith.constant 1.5707963267948968 : f64
-///     quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
-///     %c1_i64 = arith.constant 1 : i64
-///     %2 = quake.extract_ref %0[%c1_i64] : (!quake.veq<2>, i64) -> !quake.ref
-///     %cst_0 = arith.constant 1.5707963267948966 : f64
-///     quake.ry (%cst_0) %2 : (f64, !quake.ref) -> ()
-///     quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
-///     %cst_1 = arith.constant -1.5707963267948966 : f64
-///     quake.ry (%cst_1) %2 : (f64, !quake.ref) -> ()
-///     quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
-///     return
-///   }
-///
-/// Note: the following synthesis and const prop passes will replace
-/// the argument by a constant and propagate the values and vector size
-/// through other instructions.
-
-namespace {
-
-template <typename T>
-concept IntegralType =
-    std::is_same<T, bool>::value || std::is_same<T, std::int8_t>::value ||
-    std::is_same<T, std::int16_t>::value ||
-    std::is_same<T, std::int32_t>::value ||
-    std::is_same<T, std::int64_t>::value;
-
-template <typename T>
-concept FloatingType = std::is_same<T, float>::value;
-
-template <typename T>
-concept DoubleType = std::is_same<T, double>::value;
-
-template <typename T>
-concept ComplexDataType = FloatingType<T> || DoubleType<T> || IntegralType<T>;
-
-/// Input was complex<float> but we prefer
-/// complex<double>. Make a copy, extending the values.
-template <FloatingType From>
-std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
-                                                   std::uint64_t size) {
-  auto convertData = std::vector<std::complex<double>>(size);
-  for (std::size_t i = 0; i < size; ++i)
-    convertData[i] = std::complex<double>{static_cast<double>(data[i].real()),
-                                          static_cast<double>(data[i].imag())};
-  return convertData;
-}
-
-template <DoubleType From>
-std::vector<std::complex<double>> convertToComplex(std::complex<From> *data,
-                                                   std::uint64_t size) {
-  return std::vector<std::complex<From>>(data, data + size);
-}
-
-/// Input was float/double but we prefer complex<double>.
-/// Make a copy, extending or truncating the values.
-template <ComplexDataType From>
-std::vector<std::complex<double>> convertToComplex(From *data,
-                                                   std::uint64_t size) {
-  auto convertData = std::vector<std::complex<double>>(size);
-  for (std::size_t i = 0; i < size; ++i)
-    convertData[i] = std::complex<double>{static_cast<double>(data[i]),
-                                          static_cast<double>(0.0)};
-  return convertData;
-}
-
-std::vector<std::complex<double>> readConstantArray(mlir::OpBuilder &builder, cudaq::cc::GlobalOp &global) {
-  std::vector<std::complex<double>> result{};
-
-  auto attr = global.getValue();
-  auto type = global.getType().getElementType();
-  
-  if (auto arrayTy = dyn_cast<cudaq::cc::ArrayType>(type)) {
-    auto eleTy = arrayTy.getElementType();
-    std::cout << "Attribute element type:" << std::endl;
-    eleTy.dump();
-
-    if (attr.has_value()) {
-      //  auto tensorTy = RankedTensorType::get(size, eleTy);
-      // auto f64Attr = DenseElementsAttr::get(tensorTy, values);
-      if (auto elementsAttr = dyn_cast<mlir::ElementsAttr>(attr.value())) {
-        auto values = elementsAttr.getValues<double>();
-        for (auto it = values.begin(); it != values.end(); ++it) {
-          result.push_back({*it, 0.0});
-        }
-      }
-
-      else if (auto values = dyn_cast<mlir::ArrayAttr>(attr.value())) {
-        for (auto it = values.begin(); it != values.end(); ++it) {
-          auto real = *it;
-        // for (std::size_t idx = 0; idx < numConstants; idx += isComplex ? 2 : 1) {
-          auto v = [&]() -> std::complex<double> {
-            //auto val = constantValues[idx];
-            
-            if (isa<FloatType>(eleTy))
-              return {
-                cast<FloatAttr>(real).getValue().convertToDouble(), 
-                static_cast<double>(0.0)
-              };
-            if (isa<IntegerType>(eleTy))
-              return {
-                static_cast<double>(cast<IntegerAttr>(real).getInt()), 
-                static_cast<double>(0.0)
-              };
-            assert(isa<ComplexType>(eleTy));
-            it++;
-            auto imag = *it;
-            return {
-                cast<FloatAttr>(real).getValue().convertToDouble(),
-                cast<FloatAttr>(imag).getValue().convertToDouble()
-            };
-          }();
-
-          result.push_back(v);
-        }
-      }
-    }
-  }
-
-  std::cout << "Results (" <<  result.size() << "):" << std::endl;
-  for (auto &r: result) {
-    std::cout << r << ", " << std::endl;
-  }
-  return result;
-}
-
-LogicalResult
-transform(OpBuilder &builder, ModuleOp module) {
-  //auto *ctx = builder.getContext();
-
-  auto toErase = std::vector<mlir::Operation *>();
-
-// Module after everything
-// module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.triple = "x86_64-unknown-linux-gnu", quake.mangled_name_map = {__nvqpp__mlirgen__function_test._Z4testSt6vectorISt7complexIfESaIS1_EE = "_Z4testSt6vectorISt7complexIfESaIS1_EE"}} {
-//   func.func @__nvqpp__mlirgen__function_test._Z4testSt6vectorISt7complexIfESaIS1_EE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-//     %0 = cc.address_of @__nvqpp_rodata_init_state.0 : !cc.ptr<!cc.array<complex<f32> x 4>>
-//     %1 = cc.cast %0 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-//     %2 = quake.alloca !quake.veq<2>
-//     %3 = quake.init_state %2, %1 : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
-//     return
-//   }
-//   cc.global constant @__nvqpp_rodata_init_state.0 ([0.707106769 : f32, 0.000000e+00 : f32, 0.707106769 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]) : !cc.array<complex<f32> x 4>
-// }
-
-// func.func @__nvqpp__mlirgen__function_f._Z1fv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-//   %0 = cc.address_of @__nvqpp__rodata_init_0 : !cc.ptr<!cc.array<f64 x 4>>
-//   %1 = quake.alloca !quake.veq<2>
-//   %2 = quake.init_state %1, %0 : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
-//   quake.dealloc %2 : !quake.veq<2>
-//   return
-// }
-
-  
-  module->walk([&](Operation *op) {
-    if (auto initOp = dyn_cast<quake::InitializeStateOp>(op)) {
-       toErase.push_back(initOp);
-      auto loc = op->getLoc();
-      builder.setInsertionPointAfter(initOp);
-      // Find the qvector alloc.
-      auto qubits = initOp.getOperand(0);
-      if (auto alloc = dyn_cast<quake::AllocaOp>(qubits.getDefiningOp())) {
-
-        // Find vector data.
-        auto data = initOp.getOperand(1);
-        if (auto cast = dyn_cast<cudaq::cc::CastOp>(data.getDefiningOp())) {
-          data = cast.getOperand();
-          toErase.push_back(cast);
-        }
-        if (auto addr = dyn_cast<cudaq::cc::AddressOfOp>(data.getDefiningOp())) {
-          
-          auto globalName = addr.getGlobalName();
-          auto symbol = module.lookupSymbol(globalName);
-          if (auto global = dyn_cast<cudaq::cc::GlobalOp>(symbol)) {
-            // Read state initialization data from the global array.
-            auto vec = readConstantArray(builder, global);
-            
-            // Prepare state from vector data.
-            auto gateBuilder = StateGateBuilder(builder, loc, qubits);
-            auto decomposer = StateDecomposer(gateBuilder, vec);
-            decomposer.decompose();
-
-            initOp.replaceAllUsesWith(qubits);
-            toErase.push_back(addr);
-            toErase.push_back(global);
-          }
-        }
-      }
-    }
-  });
- 
-  for (auto &op : toErase) {
-    op->erase();
-  }
-
-  return success();
-}
-
-class StatePreparation2 : public cudaq::opt::PrepareState2Base<StatePreparation2> {
-protected:
-  // The name of the kernel to be synthesized
-  std::string kernelName;
-
-  // The raw pointer to the runtime arguments.
-  void *args;
-
-public:
-  StatePreparation2() = default;
-  StatePreparation2(std::string_view kernel, void *a)
-      : kernelName(kernel), args(a) {}
-
-  mlir::ModuleOp getModule() { return getOperation(); }
-
-
-  void runOnOperation() override final {
-    auto module = getModule();
-
-    std::cout << "Module before state prep2" << std::endl;
-    module.dump();
-
-    auto kernelNameInQuake = cudaq::runtime::cudaqGenPrefixName + kernelName;
-    // Get the function we care about (the one with kernelName)
-    auto funcOp = module.lookupSymbol<func::FuncOp>(kernelNameInQuake);
-    if (!funcOp) {
-      module.emitOpError("The kernel '" + kernelName +
-                         "' was not found in the module.");
-      signalPassFailure();
-      return;
-    }
-
-    // Create the builder.
-    auto builder = OpBuilder::atBlockBegin(&funcOp.getBody().front());
-
-    auto result = transform(builder, module);
-    if (result.failed()) {
-      module.emitOpError("Failed to prepare state for '" + kernelName);
-      signalPassFailure();
-      return;
-    }
-    
-    std::cout << "Module after state prep2" << std::endl;
-    module.dump();
-  }
-};
-
-} // namespace
-
-std::unique_ptr<mlir::Pass> cudaq::opt::createStatePreparation2() {
-  return std::make_unique<StatePreparation2>();
-}
-
-std::unique_ptr<mlir::Pass>
-cudaq::opt::createStatePreparation2(std::string_view kernelName, void *a) {
-  return std::make_unique<StatePreparation2>(kernelName, a);
-}
diff --git a/program.py b/program.py
deleted file mode 100644
index 92321a755a..0000000000
--- a/program.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-import cudaq
-import numpy as np
-
-c = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
-
-@cudaq.kernel
-def kernel(vec: list[complex]):
-    q = cudaq.qvector(vec)
-
-synthesized = cudaq.synthesize(kernel, c)
-print(synthesized)
-
-counts = cudaq.sample(synthesized)
-assert '00' in counts
-assert '10' in counts
\ No newline at end of file
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index 1b1fed2a83..3b35b4dd0d 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -539,18 +539,18 @@ def __copyVectorAndCastElements(self, source, targetEleType):
         if (sourceEleType == targetEleType):
             return sourcePtr
 
-        sourceArrEleTy = cc.ArrayType.get(self.ctx, sourceEleType)
+        sourceArrTy = cc.ArrayType.get(self.ctx, sourceEleType)
         sourceElePtrTy = cc.PointerType.get(self.ctx, sourceEleType)
-        sourceArrElePtrTy = cc.PointerType.get(self.ctx, sourceArrType)
+        sourceArrPtrTy = cc.PointerType.get(self.ctx, sourceArrTy)
         sourceValue = self.ifPointerThenLoad(sourcePtr)
-        sourceDataPtr = cc.StdvecDataOp(sourceArrElePtrTy, sourceValue).result
+        sourceDataPtr = cc.StdvecDataOp(sourceArrPtrTy, sourceValue).result
         sourceSize = cc.StdvecSizeOp(self.getIntegerType(), sourceValue).result
 
         targetElePtrType = cc.PointerType.get(self.ctx, targetEleType)
         targetTy = cc.ArrayType.get(self.ctx, targetEleType)
-        targetArrElePtrTy = cc.PointerType.get(self.ctx, targetTy)
+        targetArrPtrTy = cc.PointerType.get(self.ctx, targetTy)
         targetVecTy = cc.StdvecType.get(self.ctx, targetEleType)
-        targetPtr = cc.AllocaOp(targetArrElePtr,
+        targetPtr = cc.AllocaOp(targetArrPtrTy,
                                 TypeAttr.get(targetEleType),
                                 seqSize=sourceSize).result
 
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 93bfd8a295..a937b4acc8 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -474,12 +474,11 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   registerLLVMDialectTranslation(*context);
 
   PassManager pm(context);
-  //pm.addPass(createCanonicalizerPass());
-  //pm.addPass(cudaq::opt::createStatePreparation(name, rawArgs));
   pm.addPass(cudaq::opt::createQuakeSynthesizer(name, rawArgs));
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
-  pm.addPass(cudaq::opt::createStatePreparation2(name, rawArgs));
+  pm.addPass(cudaq::opt::createLiftArrayAllocPass());
+  pm.addPass(cudaq::opt::createStatePreparation(name));
   pm.addPass(createCanonicalizerPass());
   pm.addPass(cudaq::opt::createExpandMeasurementsPass());
   pm.addNestedPass<func::FuncOp>(cudaq::opt::createClassicalMemToReg());
diff --git a/python/tests/kernel/test_kernel_qvector_init.py b/python/tests/kernel/test_kernel_qvector_init.py
index 28260dcb4d..0323d13f99 100644
--- a/python/tests/kernel/test_kernel_qvector_init.py
+++ b/python/tests/kernel/test_kernel_qvector_init.py
@@ -66,27 +66,9 @@ def kernel(vec: list[float]):
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_float_capture_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel():
-        q = cudaq.qvector(f)
-
-    counts = cudaq.sample(kernel)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_float_capture_f32():
+@skipIfPythonLessThan39
+def test_kernel_float_capture():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
 
@@ -100,27 +82,9 @@ def kernel():
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_float_np_array_from_capture_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel():
-        q = cudaq.qvector(np.array(f))
-
-    counts = cudaq.sample(kernel)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_float_np_array_from_capture_f32():
+@skipIfPythonLessThan39
+def test_kernel_float_np_array_from_capture():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
 
@@ -134,25 +98,9 @@ def kernel():
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_float_definition_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    @cudaq.kernel
-    def kernel():
-        q = cudaq.qvector([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
-
-    counts = cudaq.sample(kernel)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_float_definition_f32():
+@skipIfPythonLessThan39
+def test_kernel_float_definition():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     @cudaq.kernel
     def kernel():
@@ -205,27 +153,9 @@ def kernel(vec: list[complex]):
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_complex_capture_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel():
-        q = cudaq.qvector(c)
-
-    counts = cudaq.sample(kernel)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_complex_capture_f32():
+@skipIfPythonLessThan39
+def test_kernel_complex_capture():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -239,27 +169,10 @@ def kernel():
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_complex_np_array_from_capture_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
-
-    @cudaq.kernel
-    def kernel():
-        q = cudaq.qvector(np.array(c))
 
-    counts = cudaq.sample(kernel)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_complex_np_array_from_capture_f32():
+@skipIfPythonLessThan39
+def test_kernel_complex_np_array_from_capture():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
@@ -273,25 +186,10 @@ def kernel():
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
-def test_kernel_complex_definition_f64():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    @cudaq.kernel
-    def kernel():
-        q = cudaq.qvector([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)])
 
-    counts = cudaq.sample(kernel)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_complex_definition_f32():
+@skipIfPythonLessThan39
+def test_kernel_complex_definition():
     cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     @cudaq.kernel
     def kernel():
@@ -362,7 +260,7 @@ def kernel(vec: list[complex]):
 def test_kernel_simulation_dtype_complex_params_f64():
     cudaq.reset_target()
     cudaq.set_target('nvidia-fp64')
-
+    
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
     @cudaq.kernel
@@ -428,7 +326,7 @@ def kernel(vec: list[complex]):
 def test_kernel_simulation_dtype_np_array_from_capture_f64():
     cudaq.reset_target()
     cudaq.set_target('nvidia-fp64')
-
+    
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
     @cudaq.kernel
@@ -458,29 +356,9 @@ def kernel():
     assert '00' in counts
 
 
-@skipIfNvidiaFP64NotInstalled
+@skipIfPythonLessThan39
 def test_kernel_simulation_dtype_np_array_capture_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
-
-    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
-
-    state = np.array(c, dtype=cudaq.complex())
-
-    @cudaq.kernel
-    def kernel():
-        q = cudaq.qvector(state)
-
-    counts = cudaq.sample(kernel)
-    print(counts)
-    assert '11' in counts
-    assert '00' in counts
-
-
-@skipIfNvidiaNotInstalled
-def test_kernel_simulation_dtype_np_array_capture_f32():
-    cudaq.reset_target()
-    cudaq.set_target('nvidia')
 
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index e05a32bf13..b598da28ed 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -381,16 +381,10 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     auto moduleOp = builder.create<mlir::ModuleOp>();
     moduleOp.push_back(func.clone());
     moduleOp->setAttrs(m_module->getAttrDictionary());
-    for (auto &op: m_module.getOps()) {
-      // Add globals referenced in the func.
+
+    for (auto &op : m_module.getOps()) {
       if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op)) {
-        //for (auto *use: globalOp->getUsers()) {
-        //  auto parent = use->getParentOfType<mlir::func::FuncOp>();
-        //  std::cout << "Global " << globalOp.getName().str() << " is used in " << parent.getName().str() <<std::endl;
-        //  if (parent.getName() == func.getName()) {
-            moduleOp.push_back(globalOp.clone());
-        //  }
-        //}
+        moduleOp.push_back(globalOp.clone());
       }
     }
 
@@ -413,13 +407,9 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         throw std::runtime_error("Remote rest platform Quake lowering failed.");
     };
 
-    std::cout << "Module before synthesis" << std::endl;
-    moduleOp.dump();
-
     if (updatedArgs) {
       cudaq::info("Run Quake Synth.\n");
       mlir::PassManager pm(&context);
-      //pm.addPass(cudaq::opt::createStatePreparation(kernelName, updatedArgs));
       pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
       pm.addPass(mlir::createCanonicalizerPass());
       if (disableMLIRthreading || enablePrintMLIREachPass)
@@ -430,42 +420,24 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         throw std::runtime_error("Could not successfully apply quake-synth.");
     }
 
-    std::cout << "Module after synthesis" << std::endl;
-    moduleOp.dump();
-    // runPassPipeline("canonicalize,cse", moduleOp);
-    // std::cout << "Module after synthesis and cse" << std::endl;
-    // moduleOp.dump();
-
-    // Run the config-specified pass pipeline
-    //runPassPipeline(passPipelineConfig, moduleOp);
-    //runPassPipeline("cc-loop-unroll{allow-early-exit=1},canonicalize,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping", moduleOp);
-    //if (updatedArgs) {
+    {
       cudaq::info("Run State Prep.\n");
       mlir::PassManager pm(&context);
-      pm.addPass(cudaq::opt::createStatePreparation2(kernelName, updatedArgs));
+
+      pm.addPass(mlir::createCanonicalizerPass());
+      pm.addPass(mlir::createCSEPass());
+      pm.addPass(cudaq::opt::createLiftArrayAllocPass());
+      pm.addPass(cudaq::opt::createStatePreparation(kernelName));
       if (disableMLIRthreading || enablePrintMLIREachPass)
         moduleOp.getContext()->disableMultithreading();
       if (enablePrintMLIREachPass)
         pm.enableIRPrinting();
       if (failed(pm.run(moduleOp)))
         throw std::runtime_error("Could not successfully apply state prep.");
-    //}
-
-    std::cout << "Module after state prep" << std::endl;
-    moduleOp.dump();
-
-    runPassPipeline("canonicalize,cse", moduleOp);
-    std::cout << "Module after state prep and cse" << std::endl;
-    moduleOp.dump();
+    }
 
-    // Run the config-specified pass pipeline
-    //runPassPipeline("cc-loop-unroll{allow-early-exit=1},canonicalize,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition)", moduleOp);
-    // runPassPipeline("cc-loop-unroll{allow-early-exit=1},canonicalize,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping", moduleOp);
     runPassPipeline(passPipelineConfig, moduleOp);
 
-    std::cout << "Module after state prep and pipeline" << std::endl;
-    moduleOp.dump();
-    
     auto entryPointFunc = moduleOp.lookupSymbol<mlir::func::FuncOp>(
         std::string("__nvqpp__mlirgen__") + kernelName);
     std::vector<std::size_t> mapping_reorder_idx;
@@ -531,8 +503,6 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
       // and use that for execution
       for (auto &[name, module] : modules) {
         auto clonedModule = module.clone();
-        std::cout << "Module after everything" << std::endl;
-        clonedModule.dump();
         jitEngines.emplace_back(
             cudaq::createQIRJITEngine(clonedModule, codegenTranslation));
       }
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index 1d6f40e8a7..db1288caca 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -153,13 +153,24 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
       if (args) {
         cudaq::info("Run Quake Synth.\n");
         mlir::PassManager pm(&mlirContext);
-        pm.addPass(cudaq::opt::createStatePreparation(name, args));
         pm.addPass(cudaq::opt::createQuakeSynthesizer(name, args));
         pm.addPass(mlir::createCanonicalizerPass());
         if (failed(pm.run(moduleOp)))
           throw std::runtime_error("Could not successfully apply quake-synth.");
       }
 
+      {
+        cudaq::info("Run State Prep.\n");
+        mlir::PassManager pm(&mlirContext);
+        pm.addPass(mlir::createCanonicalizerPass());
+        pm.addPass(mlir::createCSEPass());
+        pm.addPass(cudaq::opt::createLiftArrayAllocPass());
+        pm.addPass(cudaq::opt::createStatePreparation(name));
+        pm.addPass(mlir::createCanonicalizerPass());
+        if (failed(pm.run(moduleOp)))
+          throw std::runtime_error("Could not successfully apply state-prep.");
+      }
+
       // Run client-side passes. `clientPasses` is empty right now, but the code
       // below accommodates putting passes into it.
       mlir::PassManager pm(&mlirContext);
diff --git a/runtime/common/RuntimeMLIRCommonImpl.h b/runtime/common/RuntimeMLIRCommonImpl.h
index 91722e1751..e3661744f3 100644
--- a/runtime/common/RuntimeMLIRCommonImpl.h
+++ b/runtime/common/RuntimeMLIRCommonImpl.h
@@ -40,8 +40,6 @@
 #include "mlir/Target/LLVMIR/Export.h"
 #include "mlir/Tools/ParseUtilities.h"
 
-#include <iostream>
-
 namespace cudaq {
 
 bool setupTargetTriple(llvm::Module *llvmModule) {
@@ -372,10 +370,6 @@ qirProfileTranslationFunction(const char *qirProfile, mlir::Operation *op,
   if (printIntermediateMLIR)
     pm.enableIRPrinting();
 
-  std::cout << "qirProfileTranslationFunction" << std::endl;
-  pm.enableIRPrinting();
-  context->disableMultithreading();
-
   std::string errMsg;
   llvm::raw_string_ostream errOs(errMsg);
   cudaq::opt::addPipelineConvertToQIR(pm, qirProfile);
@@ -582,9 +576,6 @@ mlir::ExecutionEngine *createQIRJITEngine(mlir::ModuleOp &moduleOp,
     tm.setEnabled(cudaq::isTimingTagEnabled(cudaq::TIMING_JIT_PASSES));
     auto timingScope = tm.getRootScope(); // starts the timer
     pm.enableTiming(timingScope);         // do this right before pm.run
-    std::cout << "Common IR" << std::endl;
-    context->disableMultithreading();
-    pm.enableIRPrinting();
     if (failed(pm.run(module)))
       throw std::runtime_error(
           "[createQIRJITEngine] Lowering to QIR for remote emulation failed.");
diff --git a/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp b/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp
index 39602a6fba..f8318e1dec 100644
--- a/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp
+++ b/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp
@@ -36,8 +36,6 @@ class RemoteRESTQPU : public cudaq::BaseRemoteRESTQPU {
 
     // Get the quake representation of the kernel
     auto quakeCode = cudaq::get_quake_by_name(kernelName);
-    std::cout << "extractQuakeCodeAndContext" << quakeCode << std::endl;
-
     auto m_module = parseSourceString<ModuleOp>(quakeCode, &context);
     if (!m_module)
       throw std::runtime_error("module cannot be parsed");
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index 1a96b3e881..886c3d92b8 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -11,12 +11,28 @@
 #include <cudaq.h>
 #include <iostream>
 
-__qpu__ void f() {
-  cudaq::qvector v = { static_cast<cudaq::complex>(1.0), static_cast<cudaq::complex>(2.0), static_cast<cudaq::complex>(3.0), static_cast<cudaq::complex>(4.0)};
-  // cudaq::qvector v = { 1.0, 2.0, 3.0, 4.0};
+__qpu__ void test_complex_constant_array() {
+   cudaq::qvector v(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
 }
 
-__qpu__ void test(std::vector<cudaq::complex> inState) {
+__qpu__ void test_complex_constant_array2() {
+   cudaq::qvector v({
+    cudaq::complex(M_SQRT1_2),
+    cudaq::complex(M_SQRT1_2),
+    cudaq::complex(0.0),
+    cudaq::complex(0.0)
+  });
+}
+
+__qpu__ void test_real_constant_array() {
+  cudaq::qvector v({ M_SQRT1_2, M_SQRT1_2, 0., 0.});
+}
+
+__qpu__ void test_complex_array_param(std::vector<cudaq::complex> inState) {
+  cudaq::qvector q1 = inState;
+}
+
+__qpu__ void test_real_array_param(std::vector<cudaq::real> inState) {
   cudaq::qvector q1 = inState;
 }
 
@@ -33,33 +49,101 @@ void printCounts(cudaq::sample_result& result) {
 }
 
 int main() {
-    auto counts = cudaq::sample(f);
-    printCounts(counts);
-
-    // std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-    // std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
-    // {
-    //     // Passing state data as argument (kernel mode)
-    //     auto counts = cudaq::sample(test, vec);
-    //     printCounts(counts);
-
-    //     counts = cudaq::sample(test, vec1);
-    //     printCounts(counts);
-    // }
-
-    // {
-    //     // Passing state data as argument (builder mode)
-    //     auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
-    //     auto qubits = kernel.qalloc(v);
-    
-    //     auto counts = cudaq::sample(kernel, vec);
-    //     printCounts(counts);
-    // }
+    {
+      auto counts = cudaq::sample(test_complex_constant_array);
+      printCounts(counts);
+    }
+
+    {
+      auto counts = cudaq::sample(test_complex_constant_array2);
+      printCounts(counts);
+    }
+
+    {
+      auto counts = cudaq::sample(test_real_constant_array);
+      printCounts(counts);
+    }
+
+    {
+      std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+      std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+      {
+          // Passing state data as argument (kernel mode)
+          auto counts = cudaq::sample(test_complex_array_param, vec);
+          printCounts(counts);
+
+          counts = cudaq::sample(test_complex_array_param, vec1);
+          printCounts(counts);
+      }
+
+      {
+          // Passing state data as argument (builder mode)
+          auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
+          auto qubits = kernel.qalloc(v);
+
+          auto counts = cudaq::sample(kernel, vec);
+          printCounts(counts);
+
+          counts = cudaq::sample(kernel, vec1);
+          printCounts(counts);
+      }
+    }
+
+    {
+      std::vector<cudaq::real> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+      std::vector<cudaq::real> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+      {
+          // Passing state data as argument (kernel mode)
+          auto counts = cudaq::sample(test_real_array_param, vec);
+          printCounts(counts);
+
+          counts = cudaq::sample(test_real_array_param, vec1);
+          printCounts(counts);
+      }
+
+      {
+          // Passing state data as argument (builder mode)
+          auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::real>>();
+          auto qubits = kernel.qalloc(v);
+
+          auto counts = cudaq::sample(kernel, vec);
+          printCounts(counts);
+
+          counts = cudaq::sample(kernel, vec1);
+          printCounts(counts);
+      }
+    }
 }
 
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 00
+// CHECK: 10
+
+
+// CHECK: 00
+// CHECK: 10
+// CHECK: 01
+// CHECK: 11
+
 // CHECK: 00
 // CHECK: 10
 // CHECK: 01
 // CHECK: 11
+
 // CHECK: 00
 // CHECK: 10
+// CHECK: 01
+// CHECK: 11
+
+// CHECK: 00
+// CHECK: 10
+// CHECK: 01
+// CHECK: 11

From c2431d5a17ffd216162ee9f0eb3ee1a3eae4be85 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 5 Jul 2024 09:51:22 -0700
Subject: [PATCH 15/50] Format

---
 python/tests/kernel/test_kernel_qvector_init.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/python/tests/kernel/test_kernel_qvector_init.py b/python/tests/kernel/test_kernel_qvector_init.py
index 0323d13f99..3edb5ca951 100644
--- a/python/tests/kernel/test_kernel_qvector_init.py
+++ b/python/tests/kernel/test_kernel_qvector_init.py
@@ -169,7 +169,6 @@ def kernel():
     assert '00' in counts
 
 
-
 @skipIfPythonLessThan39
 def test_kernel_complex_np_array_from_capture():
     cudaq.reset_target()
@@ -186,7 +185,6 @@ def kernel():
     assert '00' in counts
 
 
-
 @skipIfPythonLessThan39
 def test_kernel_complex_definition():
     cudaq.reset_target()
@@ -260,7 +258,7 @@ def kernel(vec: list[complex]):
 def test_kernel_simulation_dtype_complex_params_f64():
     cudaq.reset_target()
     cudaq.set_target('nvidia-fp64')
-    
+
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
     @cudaq.kernel
@@ -326,7 +324,7 @@ def kernel(vec: list[complex]):
 def test_kernel_simulation_dtype_np_array_from_capture_f64():
     cudaq.reset_target()
     cudaq.set_target('nvidia-fp64')
-    
+
     c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
 
     @cudaq.kernel

From 8b18c672ed9f110b2bd057d25c1bb620e3c18b7f Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 10 Jul 2024 16:37:06 -0700
Subject: [PATCH 16/50] Unified common code and added tests

---
 include/cudaq/Optimizer/Builder/Intrinsics.h  |  39 ++++-
 include/cudaq/Optimizer/Transforms/Passes.h   |   1 -
 include/cudaq/Optimizer/Transforms/Passes.td  |  57 +++++--
 lib/Optimizer/Builder/Intrinsics.cpp          |  99 +++++++++--
 lib/Optimizer/Transforms/LiftArrayAlloc.cpp   | 118 +++++++------
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp | 125 ++++++--------
 lib/Optimizer/Transforms/StatePreparation.cpp | 157 ++++++++----------
 .../cudaq/platform/py_alt_launch_kernel.cpp   |  34 +++-
 python/tests/backends/test_IQM.py             |  27 +++
 python/tests/backends/test_IonQ.py            |  27 +++
 python/tests/backends/test_OQC.py             |  27 +++
 .../test_Quantinuum_LocalEmulation_builder.py |  13 ++
 .../test_Quantinuum_LocalEmulation_kernel.py  |  15 ++
 .../tests/backends/test_Quantinuum_builder.py |  13 ++
 .../tests/kernel/test_kernel_qvector_init.py  |  19 ++-
 python/tests/remote/test_remote_code_exec.py  |  26 +++
 runtime/common/BaseRemoteRESTQPU.h            |  16 --
 runtime/common/BaseRestRemoteClient.h         |  13 +-
 .../default/rest/helpers/ionq/ionq.config     |   2 +-
 .../default/rest/helpers/iqm/iqm.config       |   2 +-
 .../default/rest/helpers/oqc/oqc.config       |   2 +-
 .../rest/helpers/quantinuum/quantinuum.config |   2 +-
 .../execution/state_preparation_vector.cpp    |  35 ++--
 test/Quake/lift_array.qke                     |  35 +++-
 test/Quake/lift_array_temp.qke                |  50 ++++++
 test/Quake/state_prep.qke                     | 114 +++++++++++++
 26 files changed, 764 insertions(+), 304 deletions(-)
 create mode 100644 test/Quake/lift_array_temp.qke
 create mode 100644 test/Quake/state_prep.qke

diff --git a/include/cudaq/Optimizer/Builder/Intrinsics.h b/include/cudaq/Optimizer/Builder/Intrinsics.h
index 4d9130504c..2413c935c7 100644
--- a/include/cudaq/Optimizer/Builder/Intrinsics.h
+++ b/include/cudaq/Optimizer/Builder/Intrinsics.h
@@ -86,13 +86,40 @@ class IRBuilder : public mlir::OpBuilder {
   }
 
   cc::GlobalOp
-  genVectorOfComplexConstant(mlir::Location loc, mlir::ModuleOp module,
-                             mlir::StringRef name,
-                             const std::vector<std::complex<double>> &values);
+  genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                       mlir::StringRef name,
+                       const std::vector<std::complex<double>> &values);
   cc::GlobalOp
-  genVectorOfComplexConstant(mlir::Location loc, mlir::ModuleOp module,
-                             mlir::StringRef name,
-                             const std::vector<std::complex<float>> &values);
+  genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                       mlir::StringRef name,
+                       const std::vector<std::complex<float>> &values);
+
+  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                                    mlir::StringRef name,
+                                    const std::vector<double> &values);
+  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                                    mlir::StringRef name,
+                                    const std::vector<float> &values);
+
+  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                                    mlir::StringRef name,
+                                    const std::vector<std::int64_t> &values);
+
+  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                                    mlir::StringRef name,
+                                    const std::vector<std::int32_t> &values);
+
+  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                                    mlir::StringRef name,
+                                    const std::vector<std::int16_t> &values);
+
+  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                                    mlir::StringRef name,
+                                    const std::vector<std::int8_t> &values);
+
+  cc::GlobalOp genVectorOfConstants(mlir::Location loc, mlir::ModuleOp module,
+                                    mlir::StringRef name,
+                                    const std::vector<bool> &values);
 
   /// Load an intrinsic into \p module. The intrinsic to load has name \p name.
   /// This will automatically load any intrinsics that \p name depends upon.
diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index 9431b3da0d..57b79cdec2 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -42,7 +42,6 @@ std::unique_ptr<mlir::Pass> createObserveAnsatzPass(std::vector<bool> &);
 std::unique_ptr<mlir::Pass> createQuakeAddMetadata();
 std::unique_ptr<mlir::Pass> createQuakeAddDeallocs();
 std::unique_ptr<mlir::Pass> createStatePreparation();
-std::unique_ptr<mlir::Pass> createStatePreparation(std::string_view);
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer();
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, void *);
 std::unique_ptr<mlir::Pass> createRaiseToAffinePass();
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 0b20662af6..1a2675d482 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -532,6 +532,52 @@ def ObserveAnsatz : Pass<"observe-ansatz", "mlir::func::FuncOp"> {
   ];
 }
 
+def PrepareState : Pass<"state-prep", "mlir::ModuleOp"> {
+  let summary =
+    "Convert state vector data into gates";
+  let description = [{
+    Convert quake representation that includes qubit initialization
+    from data into qubit initialization using gates.
+
+    For example:
+
+    ```mlir
+    module {
+      func.func @foo() attributes {
+        %0 = cc.address_of @foo.rodata_0 : !cc.ptr<!cc.array<complex<f32> x 4>>
+        %1 = quake.alloca !quake.veq<2>
+        %2 = quake.init_state %1, %0 : (!quake.veq<2>, !cc.ptr<!cc.array<complex<f32> x 4>>) -> !quake.veq<2>
+        return
+      }
+      cc.global constant @foo.rodata_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<4xcomplex<f32>>) : !cc.array<complex<f32> x 4>
+    }
+    ```
+    Will be rewritten to:
+    ```mlir
+    module {
+      func.func @foo() attributes {
+        %0 = quake.alloca !quake.veq<2>
+        %c1_i64 = arith.constant 1 : i64
+        %1 = quake.extract_ref %0[%c1_i64] : (!quake.veq<2>, i64) -> !quake.ref
+        %cst = arith.constant 0.000000e+00 : f64
+        quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+        %c0_i64 = arith.constant 0 : i64
+        %2 = quake.extract_ref %0[%c0_i64] : (!quake.veq<2>, i64) -> !quake.ref
+        %cst_0 = arith.constant 0.78539816339744839 : f64
+        quake.ry (%cst_0) %2 : (f64, !quake.ref) -> ()
+        quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
+        %cst_1 = arith.constant 0.78539816339744839 : f64
+        quake.ry (%cst_1) %2 : (f64, !quake.ref) -> ()
+        quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
+        return
+      }
+    }
+    ```
+  }];
+
+  let constructor = "cudaq::opt::createStatePreparation()";
+}
+
 def PromoteRefToVeqAlloc : Pass<"promote-qubit-allocation"> {
   let summary = "Promote single qubit allocations.";
   let description = [{
@@ -572,17 +618,6 @@ def PruneCtrlRelations : Pass<"pruned-ctrl-form", "mlir::func::FuncOp"> {
   }];
 }
 
-def PrepareState : Pass<"state-prep", "mlir::ModuleOp"> {
-  let summary =
-    "Convert state vector data into gates";
-  let description = [{
-    Convert quake representation that includes qubit initialization
-    from data into qubit initialization using gates.
-  }];
-
-  let constructor = "cudaq::opt::createStatePreparation()";
-}
-
 def QuakeSynthesize : Pass<"quake-synth", "mlir::ModuleOp"> {
   let summary =
     "Synthesize concrete quantum program from Quake code plus runtime values.";
diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index e801a123f9..2b8be438fd 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -388,39 +388,102 @@ LogicalResult IRBuilder::loadIntrinsic(ModuleOp module, StringRef intrinName) {
 }
 
 template <typename A>
-cc::GlobalOp
-buildVectorOfComplexConstant(Location loc, ModuleOp module, StringRef name,
-                             const std::vector<std::complex<A>> &values,
-                             IRBuilder &builder, Type ty) {
+static std::vector<std::int32_t> asI32(const std::vector<A> &v) {
+  std::vector<std::int32_t> result(v.size());
+  for (auto iter : llvm::enumerate(v))
+    result[iter.index()] = static_cast<std::int32_t>(iter.value());
+  return result;
+}
+
+template <typename T>
+DenseElementsAttr createArrayAttr(const std::vector<T> &values, Type eleTy) {
+  auto newValues = ArrayRef<T>(values.data(), values.size());
+  auto tensorTy = RankedTensorType::get(values.size(), eleTy);
+  return DenseElementsAttr::get(tensorTy, newValues);
+}
+
+template <typename A>
+cc::GlobalOp buildVectorOfConstantElements(Location loc, ModuleOp module,
+                                           StringRef name,
+                                           const std::vector<A> &values,
+                                           IRBuilder &builder, Type eleTy) {
   if (auto glob = module.lookupSymbol<cc::GlobalOp>(name))
     return glob;
   auto *ctx = builder.getContext();
   OpBuilder::InsertionGuard guard(builder);
   builder.setInsertionPointToEnd(module.getBody());
-  auto complexTy = ComplexType::get(ty);
-  auto globalTy = cc::ArrayType::get(ctx, complexTy, values.size());
-  SmallVector<std::complex<APFloat>> newValues;
-  for (auto c : values)
-    newValues.emplace_back(APFloat{c.real()}, APFloat{c.imag()});
-  auto tensorTy = RankedTensorType::get(values.size(), complexTy);
-  auto denseEleAttr = DenseElementsAttr::get(tensorTy, newValues);
-  return builder.create<cudaq::cc::GlobalOp>(loc, globalTy, name, denseEleAttr,
+  auto globalTy = cc::ArrayType::get(ctx, eleTy, values.size());
+
+  auto arrayAttr = createArrayAttr(values, eleTy);
+  return builder.create<cudaq::cc::GlobalOp>(loc, globalTy, name, arrayAttr,
                                              /*constant=*/true,
                                              /*external=*/false);
 }
 
-cc::GlobalOp IRBuilder::genVectorOfComplexConstant(
+cc::GlobalOp IRBuilder::genVectorOfConstants(
     Location loc, ModuleOp module, StringRef name,
     const std::vector<std::complex<double>> &values) {
-  return buildVectorOfComplexConstant(loc, module, name, values, *this,
-                                      getF64Type());
+  return buildVectorOfConstantElements(loc, module, name, values, *this,
+                                       ComplexType::get(getF64Type()));
 }
 
-cc::GlobalOp IRBuilder::genVectorOfComplexConstant(
+cc::GlobalOp IRBuilder::genVectorOfConstants(
     Location loc, ModuleOp module, StringRef name,
     const std::vector<std::complex<float>> &values) {
-  return buildVectorOfComplexConstant(loc, module, name, values, *this,
-                                      getF32Type());
+  return buildVectorOfConstantElements(loc, module, name, values, *this,
+                                       ComplexType::get(getF32Type()));
+}
+
+cc::GlobalOp
+IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
+                                const std::vector<double> &values) {
+  return buildVectorOfConstantElements(loc, module, name, values, *this,
+                                       getF64Type());
+}
+
+cc::GlobalOp IRBuilder::genVectorOfConstants(Location loc, ModuleOp module,
+                                             StringRef name,
+                                             const std::vector<float> &values) {
+  return buildVectorOfConstantElements(loc, module, name, values, *this,
+                                       getF32Type());
+}
+
+cc::GlobalOp
+IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
+                                const std::vector<std::int64_t> &values) {
+  return buildVectorOfConstantElements(loc, module, name, values, *this,
+                                       getI64Type());
+}
+
+cc::GlobalOp
+IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
+                                const std::vector<std::int32_t> &values) {
+  return buildVectorOfConstantElements(loc, module, name, values, *this,
+                                       getI32Type());
+}
+
+cc::GlobalOp
+IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
+                                const std::vector<std::int16_t> &values) {
+  auto converted = asI32(values); // widen: payload must match the i32 eleTy
+  return buildVectorOfConstantElements(loc, module, name, converted, *this,
+                                       getI32Type());
+}
+
+cc::GlobalOp
+IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
+                                const std::vector<std::int8_t> &values) {
+  auto converted = asI32(values); // widen: payload must match the i32 eleTy
+  return buildVectorOfConstantElements(loc, module, name, converted, *this,
+                                       getI32Type());
+}
+
+cc::GlobalOp IRBuilder::genVectorOfConstants(Location loc, ModuleOp module,
+                                             StringRef name,
+                                             const std::vector<bool> &values) {
+  auto converted = asI32(values);
+  return buildVectorOfConstantElements(loc, module, name, converted, *this,
+                                       getI32Type());
 }
 
 Value IRBuilder::getByteSizeOfType(Location loc, Type ty) {
diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
index d4c0c335ed..d541edcacb 100644
--- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
+++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
@@ -27,6 +27,64 @@ namespace cudaq::opt {
 
 using namespace mlir;
 
+namespace {
+template <typename A>
+std::vector<A> readConstantValues(SmallVectorImpl<Attribute> &vec, Type eleTy) {
+  std::vector<A> result;
+  for (auto a : vec) {
+    if constexpr (std::is_same_v<A, std::complex<double>>) {
+      auto v = cast<ArrayAttr>(a);
+      result.emplace_back(cast<FloatAttr>(v[0]).getValue().convertToDouble(),
+                          cast<FloatAttr>(v[1]).getValue().convertToDouble());
+    } else if constexpr (std::is_same_v<A, std::complex<float>>) {
+      auto v = cast<ArrayAttr>(a);
+      result.emplace_back(cast<FloatAttr>(v[0]).getValue().convertToFloat(),
+                          cast<FloatAttr>(v[1]).getValue().convertToFloat());
+    } else if constexpr (std::is_same_v<A, double>) {
+      auto v = cast<FloatAttr>(a);
+      result.emplace_back(v.getValue().convertToDouble());
+    } else if constexpr (std::is_same_v<A, float>) {
+      auto v = cast<FloatAttr>(a);
+      result.emplace_back(v.getValue().convertToFloat());
+    } else {
+      assert(false && "unexpected type in constant array");
+    }
+  }
+  return result;
+}
+
+void genVectorOfConstantsFromAttributes(cudaq::IRBuilder irBuilder,
+                                        Location loc, ModuleOp module,
+                                        StringRef name,
+                                        SmallVector<Attribute> &values,
+                                        Type eleTy) {
+
+  if (auto cTy = dyn_cast<ComplexType>(eleTy)) {
+    auto floatTy = cTy.getElementType();
+    if (floatTy == irBuilder.getF64Type()) {
+      auto vals = readConstantValues<std::complex<double>>(values, cTy);
+      irBuilder.genVectorOfConstants(loc, module, name, vals);
+      return;
+    } else if (floatTy == irBuilder.getF32Type()) {
+      auto vals = readConstantValues<std::complex<float>>(values, cTy);
+      irBuilder.genVectorOfConstants(loc, module, name, vals);
+      return;
+    }
+  } else if (auto floatTy = dyn_cast<FloatType>(eleTy)) {
+    if (floatTy == irBuilder.getF64Type()) {
+      auto vals = readConstantValues<double>(values, floatTy);
+      irBuilder.genVectorOfConstants(loc, module, name, vals);
+      return;
+    } else if (floatTy == irBuilder.getF32Type()) {
+      auto vals = readConstantValues<float>(values, floatTy);
+      irBuilder.genVectorOfConstants(loc, module, name, vals);
+      return;
+    }
+  }
+  assert(false && "unexpected element type in constant array");
+}
+} // namespace
+
 namespace {
 class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
 public:
@@ -66,53 +124,13 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
     Value conArr;
     Value conGlobal;
     if (toGlobal) {
-      auto ip = rewriter.saveInsertionPoint();
       static unsigned counter = 0;
       auto ptrTy = cudaq::cc::PointerType::get(arrTy);
       // Build a new name based on the kernel name.
       std::string name = funcName + ".rodata_" + std::to_string(counter++);
-      {
-        OpBuilder::InsertionGuard guard(rewriter);
-        if (auto complexTy = dyn_cast<ComplexType>(eleTy)) {
-          // Transforming complex vectors is a bit more labor intensive. Use the
-          // IRBuilder to create the object since we have to thread the needle
-          // for the LLVM-IR to be lowered to LLVM correctly.
-          auto transform = [&]<typename A>(SmallVectorImpl<Attribute> &vec)
-              -> std::vector<std::complex<A>> {
-            std::vector<std::complex<A>> result;
-            for (auto a : vec) {
-              auto v = cast<ArrayAttr>(a);
-              if constexpr (std::is_same_v<A, double>) {
-                result.emplace_back(
-                    cast<FloatAttr>(v[0]).getValue().convertToDouble(),
-                    cast<FloatAttr>(v[1]).getValue().convertToDouble());
-              } else {
-                result.emplace_back(
-                    cast<FloatAttr>(v[0]).getValue().convertToFloat(),
-                    cast<FloatAttr>(v[1]).getValue().convertToFloat());
-              }
-            }
-            return result;
-          };
-          cudaq::IRBuilder irBuilder(rewriter.getContext());
-          if (complexTy.getElementType() == rewriter.getF64Type()) {
-            std::vector<std::complex<double>> vals =
-                transform.template operator()<double>(values);
-            irBuilder.genVectorOfComplexConstant(loc, module, name, vals);
-          } else {
-            std::vector<std::complex<float>> vals =
-                transform.template operator()<float>(values);
-            irBuilder.genVectorOfComplexConstant(loc, module, name, vals);
-          }
-        } else {
-          OpBuilder::InsertionGuard guard(rewriter);
-          rewriter.setInsertionPointToEnd(module.getBody());
-          rewriter.create<cudaq::cc::GlobalOp>(loc, arrTy, name, valuesAttr,
-                                               /*isConstant=*/true,
-                                               /*isExternal=*/false);
-        }
-      }
-      rewriter.restoreInsertionPoint(ip);
+      cudaq::IRBuilder irBuilder(rewriter.getContext());
+      genVectorOfConstantsFromAttributes(irBuilder, loc, module, name, values,
+                                         eleTy);
       conGlobal = rewriter.create<cudaq::cc::AddressOfOp>(loc, ptrTy, name);
       conArr = rewriter.create<cudaq::cc::LoadOp>(loc, arrTy, conGlobal);
     } else {
@@ -150,7 +168,6 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
         }
         if (isa<cudaq::cc::StoreOp>(useuser)) {
           toErase.push_back(useuser);
-          continue;
         }
         isLive = true;
       }
@@ -165,7 +182,13 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
     }
 
     for (auto *op : toErase) {
-      rewriter.eraseOp(op);
+      if (op->getUses().empty()) {
+        rewriter.eraseOp(op);
+      } else {
+        module.emitOpError("LiftArrayAlloc failed to remove quake.init_state "
+                           "or its dependencies.");
+        return failure();
+      }
     }
     return success();
   }
@@ -263,7 +286,7 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
               scoreboard[0] = w;
               continue;
             }
-          // can be a cast only used for a quake.init_state)
+          // can be a cast only used for a quake.init_state or vector init
           continue;
         } else {
           if (getWriteOp(cast, 0)) {
@@ -271,7 +294,8 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
                        << "another cast used in store: " << *op << '\n');
             return false;
           }
-          // can be a cast only used for a quake.init_state)
+          // can be a cast only used for a quake.init_state or vector init
+          toGlobal = true;
           continue;
         }
         LLVM_DEBUG(llvm::dbgs() << "unexpected cast: " << *op << '\n');
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index d81fdd04c8..58a5f4a3f9 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -7,6 +7,7 @@
  ******************************************************************************/
 
 #include "PassDetails.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
 #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
@@ -102,22 +103,31 @@ Value makeComplexElement(OpBuilder &builder, Location argLoc,
   return builder.create<complex::ConstantOp>(argLoc, eleTy, complexVal);
 }
 
-/// returns true if and only if \p argument is used by a `quake.init_state`
-/// operation.
-static bool hasInitStateUse(BlockArgument argument) {
-  for (auto *argUser : argument.getUsers())
-    if (auto stdvecDataOp = dyn_cast<cudaq::cc::StdvecDataOp>(argUser))
-      for (auto *dataUser : stdvecDataOp->getUsers())
-        if (isa<quake::InitializeStateOp>(dataUser))
-          return true;
-  return false;
+template <typename T>
+std::tuple<Value, Value>
+createArrayInMemory(OpBuilder &builder, ModuleOp module, unsigned &counter,
+                    BlockArgument argument, std::vector<T> &vec,
+                    cudaq::cc::ArrayType arrTy) {
+  auto argLoc = argument.getLoc();
+
+  // Stick global at end of Module.
+  std::string symbol = "__nvqpp_rodata_init_state." + std::to_string(counter++);
+
+  cudaq::IRBuilder irBuilder(builder);
+  irBuilder.genVectorOfConstants(argLoc, module, symbol, vec);
+
+  builder.setInsertionPointToStart(argument.getOwner());
+  auto buffer = builder.create<cudaq::cc::AddressOfOp>(
+      argLoc, cudaq::cc::PointerType::get(arrTy), symbol);
+  auto data = builder.create<cudaq::cc::LoadOp>(argLoc, arrTy, buffer);
+  return {buffer, data};
 }
 
-template <typename ELETY, typename T, typename ATTR, typename MAKER>
+template <typename ELETY, typename T, typename MAKER>
 LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument, std::vector<T> &vec,
-                         ATTR arrayAttr, MAKER makeElementValue) {
+                         MAKER makeElementValue) {
   auto *ctx = builder.getContext();
   auto argTy = argument.getType();
   assert(isa<cudaq::cc::StdvecType>(argTy));
@@ -125,40 +135,26 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
   auto eleTy = cast<ELETY>(strTy.getElementType());
   builder.setInsertionPointToStart(argument.getOwner());
   auto argLoc = argument.getLoc();
-  auto conArray = builder.create<cudaq::cc::ConstantArrayOp>(
-      argLoc, cudaq::cc::ArrayType::get(ctx, eleTy, vec.size()), arrayAttr);
+
   auto arrTy = cudaq::cc::ArrayType::get(ctx, eleTy, vec.size());
   std::optional<Value> arrayInMemory;
+  std::optional<Value> conArray;
   auto ptrEleTy = cudaq::cc::PointerType::get(eleTy);
   bool generateNewValue = false;
 
   // Helper function that materializes the array in memory.
-  auto getArrayInMemory = [&]() -> Value {
+  auto getArrayInMemory = [&]() -> std::tuple<Value, Value> {
     if (arrayInMemory)
-      return *arrayInMemory;
+      return {*arrayInMemory, *conArray};
     OpBuilder::InsertionGuard guard(builder);
-    Value buffer;
-    if (hasInitStateUse(argument)) {
-      // Stick global at end of Module.
-      builder.setInsertionPointToEnd(module.getBody());
-      std::string symbol =
-          "__nvqpp_rodata_init_state." + std::to_string(counter++);
-      builder.create<cudaq::cc::GlobalOp>(argLoc, arrTy, symbol, arrayAttr,
-                                          /*isConstant=*/true,
-                                          /*isExternal=*/false);
-      builder.setInsertionPointAfter(conArray);
-      buffer = builder.create<cudaq::cc::AddressOfOp>(
-          argLoc, cudaq::cc::PointerType::get(arrTy), symbol);
-    } else {
-      builder.setInsertionPointAfter(conArray);
-      buffer = builder.create<cudaq::cc::AllocaOp>(argLoc, arrTy);
-      builder.create<cudaq::cc::StoreOp>(argLoc, conArray, buffer);
-    }
+    auto [buffer, data] =
+        createArrayInMemory(builder, module, counter, argument, vec, arrTy);
     auto ptrArrEleTy =
         cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy));
     Value res = builder.create<cudaq::cc::CastOp>(argLoc, ptrArrEleTy, buffer);
     arrayInMemory = res;
-    return res;
+    conArray = data;
+    return {res, data};
   };
 
   auto replaceLoads = [&](cudaq::cc::ComputePtrOp elePtrOp,
@@ -211,11 +207,11 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
           if (index == cudaq::cc::ComputePtrOp::kDynamicIndex) {
             OpBuilder::InsertionGuard guard(builder);
             builder.setInsertionPoint(elePtrOp);
+            auto [memArr, conArray] = getArrayInMemory();
             Value getEle = builder.create<cudaq::cc::ExtractValueOp>(
                 elePtrOp.getLoc(), eleTy, conArray,
                 elePtrOp.getDynamicIndices()[0]);
             if (failed(replaceLoads(elePtrOp, getEle))) {
-              Value memArr = getArrayInMemory();
               builder.setInsertionPoint(elePtrOp);
               Value newComputedPtr = builder.create<cudaq::cc::ComputePtrOp>(
                   argLoc, ptrEleTy, memArr,
@@ -228,7 +224,7 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
           Value runtimeParam =
               makeElementValue(builder, argLoc, vec[index], eleTy);
           if (failed(replaceLoads(elePtrOp, runtimeParam))) {
-            Value memArr = getArrayInMemory();
+            auto [memArr, _] = getArrayInMemory();
             OpBuilder::InsertionGuard guard(builder);
             builder.setInsertionPoint(elePtrOp);
             Value newComputedPtr = builder.create<cudaq::cc::ComputePtrOp>(
@@ -243,7 +239,7 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
       // Check if there were other uses of `vec.data()` and simply forward the
       // constant array as materialized in memory.
       if (replaceOtherUses) {
-        Value memArr = getArrayInMemory();
+        auto [memArr, _] = getArrayInMemory();
         stdvecDataOp.replaceAllUsesWith(memArr);
       }
       continue;
@@ -255,9 +251,9 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
     generateNewValue = true;
   }
   if (generateNewValue) {
-    auto memArr = getArrayInMemory();
+    auto [memArr, data] = getArrayInMemory();
     OpBuilder::InsertionGuard guard(builder);
-    builder.setInsertionPointAfter(memArr.getDefiningOp());
+    builder.setInsertionPointAfter(data.getDefiningOp());
     Value size = builder.create<arith::ConstantIntOp>(argLoc, vec.size(), 64);
     Value newVec =
         builder.create<cudaq::cc::StdvecInitOp>(argLoc, strTy, memArr, size);
@@ -274,15 +270,11 @@ std::vector<std::int32_t> asI32(const std::vector<A> &v) {
   return result;
 }
 
-// TODO: consider using DenseArrayAttr here instead. NB: such a change may alter
-// the output of the constant array op.
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument, std::vector<bool> &vec) {
-  auto arrayAttr = builder.getI32ArrayAttr(asI32(vec));
-  return synthesizeVectorArgument<IntegerType>(builder, module, counter,
-                                               argument, vec, arrayAttr,
-                                               makeIntegerElement<bool>);
+  return synthesizeVectorArgument<IntegerType>(
+      builder, module, counter, argument, vec, makeIntegerElement<bool>);
 }
 
 static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
@@ -290,10 +282,8 @@ static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
                                               unsigned &counter,
                                               BlockArgument argument,
                                               std::vector<std::int8_t> &vec) {
-  auto arrayAttr = builder.getI32ArrayAttr(asI32(vec));
-  return synthesizeVectorArgument<IntegerType>(builder, module, counter,
-                                               argument, vec, arrayAttr,
-                                               makeIntegerElement<std::int8_t>);
+  return synthesizeVectorArgument<IntegerType>(
+      builder, module, counter, argument, vec, makeIntegerElement<std::int8_t>);
 }
 
 static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
@@ -301,9 +291,8 @@ static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
                                               unsigned &counter,
                                               BlockArgument argument,
                                               std::vector<std::int16_t> &vec) {
-  auto arrayAttr = builder.getI32ArrayAttr(asI32(vec));
   return synthesizeVectorArgument<IntegerType>(
-      builder, module, counter, argument, vec, arrayAttr,
+      builder, module, counter, argument, vec,
       makeIntegerElement<std::int16_t>);
 }
 
@@ -312,9 +301,8 @@ static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
                                               unsigned &counter,
                                               BlockArgument argument,
                                               std::vector<std::int32_t> &vec) {
-  auto arrayAttr = builder.getI32ArrayAttr(vec);
   return synthesizeVectorArgument<IntegerType>(
-      builder, module, counter, argument, vec, arrayAttr,
+      builder, module, counter, argument, vec,
       makeIntegerElement<std::int32_t>);
 }
 
@@ -323,58 +311,39 @@ static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
                                               unsigned &counter,
                                               BlockArgument argument,
                                               std::vector<std::int64_t> &vec) {
-  auto arrayAttr = builder.getI64ArrayAttr(vec);
   return synthesizeVectorArgument<IntegerType>(
-      builder, module, counter, argument, vec, arrayAttr,
+      builder, module, counter, argument, vec,
       makeIntegerElement<std::int64_t>);
 }
 
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument, std::vector<float> &vec) {
-  auto arrayAttr = builder.getF32ArrayAttr(vec);
   return synthesizeVectorArgument<FloatType>(builder, module, counter, argument,
-                                             vec, arrayAttr,
-                                             makeFloatElement<float>);
+                                             vec, makeFloatElement<float>);
 }
 
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument, std::vector<double> &vec) {
-  auto arrayAttr = builder.getF64ArrayAttr(vec);
   return synthesizeVectorArgument<FloatType>(builder, module, counter, argument,
-                                             vec, arrayAttr,
-                                             makeFloatElement<double>);
+                                             vec, makeFloatElement<double>);
 }
 
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument,
                          std::vector<std::complex<float>> &vec) {
-  std::vector<float> vec2;
-  for (auto c : vec) {
-    vec2.push_back(c.real());
-    vec2.push_back(c.imag());
-  }
-  auto arrayAttr = builder.getF32ArrayAttr(vec2);
-  return synthesizeVectorArgument<ComplexType>(builder, module, counter,
-                                               argument, vec, arrayAttr,
-                                               makeComplexElement<float>);
+  return synthesizeVectorArgument<ComplexType>(
+      builder, module, counter, argument, vec, makeComplexElement<float>);
 }
 
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument,
                          std::vector<std::complex<double>> &vec) {
-  std::vector<double> vec2;
-  for (auto c : vec) {
-    vec2.push_back(c.real());
-    vec2.push_back(c.imag());
-  }
-  auto arrayAttr = builder.getF64ArrayAttr(vec2);
-  return synthesizeVectorArgument<ComplexType>(builder, module, counter,
-                                               argument, vec, arrayAttr,
-                                               makeComplexElement<double>);
+  return synthesizeVectorArgument<ComplexType>(
+      builder, module, counter, argument, vec, makeComplexElement<double>);
 }
 
 namespace {
diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
index 564a121f83..8f84623a29 100644
--- a/lib/Optimizer/Transforms/StatePreparation.cpp
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -33,34 +33,42 @@ using namespace mlir;
 /// Replace a qubit initialization from vectors with quantum gates.
 /// For example:
 ///
-/// func.func
-/// @__nvqpp__mlirgen__function_test._Z4testSt6vectorISt7complexIfESaIS1_EE()
-/// attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-///   %0 = cc.address_of @__nvqpp_rodata_init_state.0 :
-///   !cc.ptr<!cc.array<complex<f32> x 4>> %1 = cc.cast %0 :
-///   (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>> %2 =
-///   quake.alloca !quake.veq<2> %3 = quake.init_state %2, %1 : (!quake.veq<2>,
-///   !cc.ptr<complex<f32>>) -> !quake.veq<2> return
+/// Before PrepareState (state-prep):
+///
+/// module {
+///   func.func @foo() attributes {
+///     %0 = cc.address_of @foo.rodata_0 : !cc.ptr<!cc.array<complex<f32> x 4>>
+///     %1 = quake.alloca !quake.veq<2>
+///     %2 = quake.init_state %1, %0 : (!quake.veq<2>,
+///       !cc.ptr<!cc.array<complex<f32> x 4>>) -> !quake.veq<2>
+///     return
+///  }
+///  cc.global constant @foo.rodata_0 (dense<[(0.707106769,0.000000e+00),
+///      (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00),
+///      (0.000000e+00,0.000000e+00)]> : tensor<4xcomplex<f32>>) :
+///    !cc.array<complex<f32> x 4>
 /// }
 ///
-/// is converted to:
+/// After PrepareState (state-prep):
 ///
-///   func.func @foo(%arg0 : !cc.stdvec<complex<f32>>) {
+/// module {
+///   func.func @foo() attributes {
 ///     %0 = quake.alloca !quake.veq<2>
-///     %c0_i64 = arith.constant 0 : i64
-///     %1 = quake.extract_ref %0[%c0_i64] : (!quake.veq<2>, i64) -> !quake.ref
-///     %cst = arith.constant 1.5707963267948968 : f64
-///     quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
 ///     %c1_i64 = arith.constant 1 : i64
-///     %2 = quake.extract_ref %0[%c1_i64] : (!quake.veq<2>, i64) -> !quake.ref
-///     %cst_0 = arith.constant 1.5707963267948966 : f64
+///     %1 = quake.extract_ref %0[%c1_i64] : (!quake.veq<2>, i64) -> !quake.ref
+///     %cst = arith.constant 0.000000e+00 : f64
+///     quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+///     %c0_i64 = arith.constant 0 : i64
+///     %2 = quake.extract_ref %0[%c0_i64] : (!quake.veq<2>, i64) -> !quake.ref
+///     %cst_0 = arith.constant 0.78539816339744839 : f64
 ///     quake.ry (%cst_0) %2 : (f64, !quake.ref) -> ()
 ///     quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
-///     %cst_1 = arith.constant -1.5707963267948966 : f64
+///     %cst_1 = arith.constant 0.78539816339744839 : f64
 ///     quake.ry (%cst_1) %2 : (f64, !quake.ref) -> ()
 ///     quake.x [%1] %2 : (!quake.ref, !quake.ref) -> ()
 ///     return
 ///   }
+/// }
 
 namespace {
 
@@ -71,59 +79,41 @@ readConstantArray(mlir::OpBuilder &builder, cudaq::cc::GlobalOp &global) {
   auto attr = global.getValue();
   auto type = global.getType().getElementType();
 
-  if (auto arrayTy = dyn_cast<cudaq::cc::ArrayType>(type)) {
-    auto eleTy = arrayTy.getElementType();
-
-    if (attr.has_value()) {
-      if (auto elementsAttr = dyn_cast<mlir::ElementsAttr>(attr.value())) {
-        auto eleTy = elementsAttr.getElementType();
-        if (isa<ComplexType>(eleTy)) {
-          auto values = elementsAttr.getValues<mlir::ArrayAttr>();
-          for (auto it = values.begin(); it != values.end(); ++it) {
-            auto valueAttr = *it;
-            auto real =
-                cast<FloatAttr>(valueAttr[0]).getValue().convertToDouble();
-            auto imag =
-                cast<FloatAttr>(valueAttr[1]).getValue().convertToDouble();
-            result.push_back({real, imag});
-          }
-        } else {
-          auto values = elementsAttr.getValues<double>();
-          for (auto it = values.begin(); it != values.end(); ++it) {
-            result.push_back({*it, 0.0});
-          }
-        }
-      } else if (auto values = dyn_cast<mlir::ArrayAttr>(attr.value())) {
-        for (auto it = values.begin(); it != values.end(); ++it) {
-          auto real = *it;
-          // for (std::size_t idx = 0; idx < numConstants; idx += isComplex ? 2
-          // : 1) {
-          auto v = [&]() -> std::complex<double> {
-            if (isa<FloatType>(eleTy))
-              return {cast<FloatAttr>(real).getValue().convertToDouble(),
-                      static_cast<double>(0.0)};
-            if (isa<IntegerType>(eleTy))
-              return {static_cast<double>(cast<IntegerAttr>(real).getInt()),
-                      static_cast<double>(0.0)};
-            assert(isa<ComplexType>(eleTy));
-            it++;
-            auto imag = *it;
-            return {cast<FloatAttr>(real).getValue().convertToDouble(),
-                    cast<FloatAttr>(imag).getValue().convertToDouble()};
-          }();
-
-          result.push_back(v);
-        }
-      }
-    }
+  auto arrayTy = dyn_cast<cudaq::cc::ArrayType>(type);
+  assert(arrayTy);
+  assert(attr.has_value());
+
+  auto elementsAttr = dyn_cast<mlir::ElementsAttr>(attr.value());
+  assert(elementsAttr);
+  auto eleTy = elementsAttr.getElementType();
+  auto values = elementsAttr.getValues<mlir::Attribute>();
+
+  for (auto it = values.begin(); it != values.end(); ++it) {
+    auto valAttr = *it;
+
+    auto v = [&]() -> std::complex<double> {
+      if (isa<FloatType>(eleTy))
+        return {cast<FloatAttr>(valAttr).getValue().convertToDouble(),
+                static_cast<double>(0.0)};
+      if (isa<IntegerType>(eleTy))
+        return {static_cast<double>(cast<IntegerAttr>(valAttr).getInt()),
+                static_cast<double>(0.0)};
+      assert(isa<ComplexType>(eleTy));
+      auto arrayAttr = cast<mlir::ArrayAttr>(valAttr);
+      auto real = cast<FloatAttr>(arrayAttr[0]).getValue().convertToDouble();
+      auto imag = cast<FloatAttr>(arrayAttr[1]).getValue().convertToDouble();
+      return {real, imag};
+    }();
+
+    result.push_back(v);
   }
-
   return result;
 }
 
-LogicalResult transform(OpBuilder &builder, ModuleOp module) {
+LogicalResult transform(ModuleOp module, func::FuncOp funcOp) {
+  auto builder = OpBuilder::atBlockBegin(&funcOp.getBody().front());
   auto toErase = std::vector<mlir::Operation *>();
-  module->walk([&](Operation *op) {
+  funcOp->walk([&](Operation *op) {
     if (auto initOp = dyn_cast<quake::InitializeStateOp>(op)) {
       toErase.push_back(initOp);
       auto loc = op->getLoc();
@@ -176,33 +166,25 @@ LogicalResult transform(OpBuilder &builder, ModuleOp module) {
 
 class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
 protected:
-  // The name of the kernel to be synthesized
-  std::string kernelName;
-
 public:
   StatePreparation() = default;
-  StatePreparation(std::string_view kernel) : kernelName(kernel) {}
 
   mlir::ModuleOp getModule() { return getOperation(); }
 
   void runOnOperation() override final {
     auto module = getModule();
-    auto kernelNameInQuake = cudaq::runtime::cudaqGenPrefixName + kernelName;
-    // Get the function we care about (the one with kernelName)
-    auto funcOp = module.lookupSymbol<func::FuncOp>(kernelNameInQuake);
-    if (!funcOp) {
-      module.emitOpError("The kernel '" + kernelName +
-                         "' was not found in the module.");
-      signalPassFailure();
-      return;
-    }
-
-    auto builder = OpBuilder::atBlockBegin(&funcOp.getBody().front());
-    auto result = transform(builder, module);
-    if (result.failed()) {
-      module.emitOpError("Failed to prepare state for '" + kernelName);
-      signalPassFailure();
-      return;
+    for (Operation &op : *module.getBody()) {
+      auto funcOp = dyn_cast<func::FuncOp>(op);
+      if (!funcOp)
+        continue;
+      std::string kernelName = funcOp.getName().str();
+
+      auto result = transform(module, funcOp);
+      if (result.failed()) {
+        funcOp.emitOpError("Failed to prepare state for '" + kernelName);
+        signalPassFailure();
+        return;
+      }
     }
   }
 };
@@ -212,8 +194,3 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
 std::unique_ptr<mlir::Pass> cudaq::opt::createStatePreparation() {
   return std::make_unique<StatePreparation>();
 }
-
-std::unique_ptr<mlir::Pass>
-cudaq::opt::createStatePreparation(std::string_view kernelName) {
-  return std::make_unique<StatePreparation>(kernelName);
-}
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index a937b4acc8..6d238509ec 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -462,6 +462,18 @@ py::object pyAltLaunchKernelR(const std::string &name, MlirModule module,
   return returnValue;
 }
 
+/// @brief True if `envName` is 1/on/true/yes (any case); else `defaultVal`.
+bool getEnvBool(const char *envName, bool defaultVal = false) {
+  if (auto envVal = std::getenv(envName)) {
+    std::string tmp(envVal);
+    std::transform(tmp.begin(), tmp.end(), tmp.begin(),
+                   [](unsigned char c) { return std::tolower(c); });
+    if (tmp == "1" || tmp == "on" || tmp == "true" || tmp == "yes")
+      return true;
+  }
+  return defaultVal;
+}
+
 MlirModule synthesizeKernel(const std::string &name, MlirModule module,
                             cudaq::OpaqueArguments &runtimeArgs) {
   ScopedTraceWithContext(cudaq::TIMING_JIT, "synthesizeKernel", name);
@@ -473,12 +485,24 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   auto context = cloned.getContext();
   registerLLVMDialectTranslation(*context);
 
+  // Get additional debug values
+  auto disableMLIRthreading = getEnvBool("CUDAQ_MLIR_DISABLE_THREADING", false);
+  auto enablePrintMLIREachPass =
+      getEnvBool("CUDAQ_MLIR_PRINT_EACH_PASS", false);
+
   PassManager pm(context);
   pm.addPass(cudaq::opt::createQuakeSynthesizer(name, rawArgs));
   pm.addPass(createCanonicalizerPass());
-  pm.addPass(createCSEPass());
-  pm.addPass(cudaq::opt::createLiftArrayAllocPass());
-  pm.addPass(cudaq::opt::createStatePreparation(name));
+
+  // Run state preparation for quantum devices (or their emulation) only.
+  // Simulators have a direct implementation of state initialization
+  // in their runtime.
+  auto &platform = cudaq::get_platform();
+  if (!platform.is_simulator() || platform.is_emulated()) {
+    pm.addPass(createCSEPass());
+    pm.addPass(cudaq::opt::createLiftArrayAllocPass());
+    pm.addPass(cudaq::opt::createStatePreparation());
+  }
   pm.addPass(createCanonicalizerPass());
   pm.addPass(cudaq::opt::createExpandMeasurementsPass());
   pm.addNestedPass<func::FuncOp>(cudaq::opt::createClassicalMemToReg());
@@ -490,6 +514,10 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   tm.setEnabled(cudaq::isTimingTagEnabled(cudaq::TIMING_JIT_PASSES));
   auto timingScope = tm.getRootScope(); // starts the timer
   pm.enableTiming(timingScope);         // do this right before pm.run
+  if (disableMLIRthreading || enablePrintMLIREachPass)
+    context->disableMultithreading();
+  if (enablePrintMLIREachPass)
+    pm.enableIRPrinting();
   if (failed(pm.run(cloned)))
     throw std::runtime_error(
         "cudaq::builder failed to JIT compile the Quake representation.");
diff --git a/python/tests/backends/test_IQM.py b/python/tests/backends/test_IQM.py
index 200d078fcc..76bb1190a3 100644
--- a/python/tests/backends/test_IQM.py
+++ b/python/tests/backends/test_IQM.py
@@ -10,6 +10,7 @@
 import tempfile
 import time
 from multiprocessing import Process
+import numpy as np
 
 import cudaq
 from cudaq import spin
@@ -160,6 +161,32 @@ def kernel():
     result = cudaq.sample(kernel)
 
 
+def test_IQM_state_preparation():
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        qubits = cudaq.qvector(vec)
+
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '00' in counts
+    assert '10' in counts
+    assert not '01' in counts
+    assert not '11' in counts
+
+
+def test_IQM_state_preparation_builder():
+    kernel, state = cudaq.make_kernel(list[complex])
+    qubits = kernel.qalloc(state)
+
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '00' in counts
+    assert '10' in counts
+    assert not '01' in counts
+    assert not '11' in counts
+
+
 # leave for gdb debugging
 if __name__ == "__main__":
     loc = os.path.abspath(__file__)
diff --git a/python/tests/backends/test_IonQ.py b/python/tests/backends/test_IonQ.py
index be93445359..92a7ac8a5d 100644
--- a/python/tests/backends/test_IonQ.py
+++ b/python/tests/backends/test_IonQ.py
@@ -8,6 +8,7 @@
 
 import cudaq, pytest, os, time
 from cudaq import spin
+import numpy as np
 from multiprocessing import Process
 try:
     from utils.mock_qpu.ionq import startServer
@@ -156,6 +157,32 @@ def kernel():
     result = cudaq.sample(kernel)
 
 
+def test_ionq_state_preparation():
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        qubits = cudaq.qvector(vec)
+
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '00' in counts
+    assert '10' in counts
+    assert not '01' in counts
+    assert not '11' in counts
+
+
+def test_ionq_state_preparation_builder():
+    kernel, state = cudaq.make_kernel(list[complex])
+    qubits = kernel.qalloc(state)
+
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '00' in counts
+    assert '10' in counts
+    assert not '01' in counts
+    assert not '11' in counts
+
+
 # leave for gdb debugging
 if __name__ == "__main__":
     loc = os.path.abspath(__file__)
diff --git a/python/tests/backends/test_OQC.py b/python/tests/backends/test_OQC.py
index 70779e975f..0dc40e4bec 100644
--- a/python/tests/backends/test_OQC.py
+++ b/python/tests/backends/test_OQC.py
@@ -15,6 +15,7 @@
 
 import cudaq
 from cudaq import spin
+import numpy as np
 
 try:
     from utils.mock_qpu.oqc import startServer
@@ -158,6 +159,32 @@ def test_OQC_observe():
     assert assert_close(res.expectation())
 
 
+def test_OQC_state_preparation():
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        qubits = cudaq.qvector(vec)
+
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '00' in counts
+    assert '10' in counts
+    assert not '01' in counts
+    assert not '11' in counts
+
+
+def test_OQC_state_preparation_builder():
+    kernel, state = cudaq.make_kernel(list[complex])
+    qubits = kernel.qalloc(state)
+
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '00' in counts
+    assert '10' in counts
+    assert not '01' in counts
+    assert not '11' in counts
+
+
 # leave for gdb debugging
 if __name__ == "__main__":
     loc = os.path.abspath(__file__)
diff --git a/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py b/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py
index b368cefdb0..70a1d29aa9 100644
--- a/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py
+++ b/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py
@@ -8,6 +8,7 @@
 
 import cudaq, pytest, os, time
 from cudaq import spin
+import numpy as np
 from multiprocessing import Process
 
 
@@ -111,6 +112,18 @@ def test_quantinuum_exp_pauli():
     assert assert_close(res.expectation())
 
 
+def test_quantinuum_state_preparation():
+    kernel, state = cudaq.make_kernel(list[complex])
+    qubits = kernel.qalloc(state)
+
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '00' in counts
+    assert '10' in counts
+    assert not '01' in counts
+    assert not '11' in counts
+
+
 # leave for gdb debugging
 if __name__ == "__main__":
     loc = os.path.abspath(__file__)
diff --git a/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py b/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
index 3bad589111..5576f46597 100644
--- a/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
+++ b/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
@@ -8,6 +8,7 @@
 
 import cudaq, pytest, os, time
 from cudaq import spin
+import numpy as np
 from multiprocessing import Process
 
 
@@ -138,6 +139,20 @@ def kernel():
     result = cudaq.sample(kernel)
 
 
+def test_quantinuum_state_preparation():
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        qubits = cudaq.qvector(vec)
+
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '00' in counts
+    assert '10' in counts
+    assert not '01' in counts
+    assert not '11' in counts
+
+
 # leave for gdb debugging
 if __name__ == "__main__":
     loc = os.path.abspath(__file__)
diff --git a/python/tests/backends/test_Quantinuum_builder.py b/python/tests/backends/test_Quantinuum_builder.py
index 1d82c6abb0..d20cb0d499 100644
--- a/python/tests/backends/test_Quantinuum_builder.py
+++ b/python/tests/backends/test_Quantinuum_builder.py
@@ -7,6 +7,7 @@
 # ============================================================================ #
 
 import cudaq, pytest, os, time
+import numpy as np
 from cudaq import spin
 from multiprocessing import Process
 try:
@@ -145,6 +146,18 @@ def test_quantinuum_observe():
     assert assert_close(res.expectation())
 
 
+def test_quantinuum_state_preparation():
+    kernel, state = cudaq.make_kernel(list[complex])
+    qubits = kernel.qalloc(state)
+
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '00' in counts
+    assert '10' in counts
+    assert not '01' in counts
+    assert not '11' in counts
+
+
 # leave for gdb debugging
 if __name__ == "__main__":
     loc = os.path.abspath(__file__)
diff --git a/python/tests/kernel/test_kernel_qvector_init.py b/python/tests/kernel/test_kernel_qvector_init.py
index 3edb5ca951..e892be2dc2 100644
--- a/python/tests/kernel/test_kernel_qvector_init.py
+++ b/python/tests/kernel/test_kernel_qvector_init.py
@@ -29,7 +29,7 @@
 
 
 @skipIfPythonLessThan39
-def test_kernel_state_preparation():
+def test_kernel_complex_synthesize():
     cudaq.reset_target()
 
     c = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
@@ -39,9 +39,22 @@ def kernel(vec: list[complex]):
         q = cudaq.qvector(vec)
 
     synthesized = cudaq.synthesize(kernel, c)
-    assert 'quake.init_state' in kernel.__str__()
-    assert not 'quake.init_state' in synthesized.__str__()
+    counts = cudaq.sample(synthesized)
+    assert '00' in counts
+    assert '10' in counts
+
+
+@skipIfPythonLessThan39
+def test_kernel_float_synthesize():
+    cudaq.reset_target()
 
+    c = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+
+    @cudaq.kernel
+    def kernel(vec: list[float]):
+        q = cudaq.qvector(vec)
+
+    synthesized = cudaq.synthesize(kernel, c)
     counts = cudaq.sample(synthesized)
     assert '00' in counts
     assert '10' in counts
diff --git a/python/tests/remote/test_remote_code_exec.py b/python/tests/remote/test_remote_code_exec.py
index 67541fddc9..5b0f869f87 100644
--- a/python/tests/remote/test_remote_code_exec.py
+++ b/python/tests/remote/test_remote_code_exec.py
@@ -15,6 +15,7 @@
 
 import cudaq
 from cudaq import spin
+import numpy as np
 
 ## [PYTHON_VERSION_FIX]
 skipIfPythonLessThan39 = pytest.mark.skipif(
@@ -349,6 +350,31 @@ def test_complex_vqe_named_lambda_sweep_opt(optimizer):
 def test_complex_vqe_named_lambda_sweep_grad(gradient):
     test_complex_vqe_named_lambda(cudaq.optimizers.Adam(), gradient)
 
+@skipIfPythonLessThan39
+def test_state_preparation():
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        qubits = cudaq.qvector(vec)
+
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '00' in counts
+    assert '10' in counts
+    assert not '01' in counts
+    assert not '11' in counts
+
+@skipIfPythonLessThan39
+def test_state_preparation_builder():
+    kernel, state = cudaq.make_kernel(list[complex])
+    qubits = kernel.qalloc(state)
+
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '00' in counts
+    assert '10' in counts
+    assert not '01' in counts
+    assert not '11' in counts
 
 # leave for gdb debugging
 if __name__ == "__main__":
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index b598da28ed..9b8a80ba88 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -420,22 +420,6 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         throw std::runtime_error("Could not successfully apply quake-synth.");
     }
 
-    {
-      cudaq::info("Run State Prep.\n");
-      mlir::PassManager pm(&context);
-
-      pm.addPass(mlir::createCanonicalizerPass());
-      pm.addPass(mlir::createCSEPass());
-      pm.addPass(cudaq::opt::createLiftArrayAllocPass());
-      pm.addPass(cudaq::opt::createStatePreparation(kernelName));
-      if (disableMLIRthreading || enablePrintMLIREachPass)
-        moduleOp.getContext()->disableMultithreading();
-      if (enablePrintMLIREachPass)
-        pm.enableIRPrinting();
-      if (failed(pm.run(moduleOp)))
-        throw std::runtime_error("Could not successfully apply state prep.");
-    }
-
     runPassPipeline(passPipelineConfig, moduleOp);
 
     auto entryPointFunc = moduleOp.lookupSymbol<mlir::func::FuncOp>(
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index db1288caca..4c593f8085 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -159,17 +159,8 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
           throw std::runtime_error("Could not successfully apply quake-synth.");
       }
 
-      {
-        cudaq::info("Run State Prep.\n");
-        mlir::PassManager pm(&mlirContext);
-        pm.addPass(mlir::createCanonicalizerPass());
-        pm.addPass(mlir::createCSEPass());
-        pm.addPass(cudaq::opt::createLiftArrayAllocPass());
-        pm.addPass(cudaq::opt::createStatePreparation(name));
-        pm.addPass(mlir::createCanonicalizerPass());
-        if (failed(pm.run(moduleOp)))
-          throw std::runtime_error("Could not successfully apply state-prep.");
-      }
+      // Note: do not run state preparation pass here since we are always
+      // using simulators.
 
       // Run client-side passes. `clientPasses` is empty right now, but the code
       // below accommodates putting passes into it.
diff --git a/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.config b/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.config
index 0b0555a3b3..c78a2b3e1e 100644
--- a/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.config
+++ b/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.config
@@ -16,7 +16,7 @@ GEN_TARGET_BACKEND=true
 LINKLIBS="${LINKLIBS} -lcudaq-rest-qpu"
 
 # Define the lowering pipeline
-PLATFORM_LOWERING_CONFIG="expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),ionq-gate-set-mapping"
+PLATFORM_LOWERING_CONFIG="canonicalize,cse,lift-array-value,state-prep,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),ionq-gate-set-mapping"
 
 # Tell the rest-qpu that we are generating QIR.
 CODEGEN_EMISSION=qir-base
diff --git a/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.config b/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.config
index 433658ec48..2db0f2b235 100644
--- a/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.config
+++ b/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.config
@@ -18,7 +18,7 @@ LINKLIBS="${LINKLIBS} -lcudaq-rest-qpu"
 # Define the lowering pipeline, here we lower to Base QIR
 # Note: the runtime will dynamically substitute %QPU_ARCH% based on
 # qpu-architecture
-PLATFORM_LOWERING_CONFIG="expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg,qubit-mapping{device=file(%QPU_ARCH%)},delay-measurements,regtomem),iqm-gate-set-mapping"
+PLATFORM_LOWERING_CONFIG="canonicalize,cse,lift-array-value,state-prep,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg,qubit-mapping{device=file(%QPU_ARCH%)},delay-measurements,regtomem),iqm-gate-set-mapping"
 
 # Tell the rest-qpu that we are generating IQM JSON.
 CODEGEN_EMISSION=iqm
diff --git a/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.config b/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.config
index 3f157cb80d..042fb8dd8d 100644
--- a/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.config
+++ b/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.config
@@ -18,7 +18,7 @@ LINKLIBS="${LINKLIBS} -lcudaq-rest-qpu"
 # Define the lowering pipeline. Lucy has an 8-qubit ring topology, so mapping
 # uses ring(8).
 # Toshiko uses a Kagome lattice with 2-3 connectivity per qubit
-PLATFORM_LOWERING_CONFIG="expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg,qubit-mapping{device=file(%QPU_ARCH%)},regtomem)"
+PLATFORM_LOWERING_CONFIG="canonicalize,cse,lift-array-value,state-prep,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg,qubit-mapping{device=file(%QPU_ARCH%)},regtomem)"
 
 
 # Tell the rest-qpu that we are generating QIR.
diff --git a/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.config b/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.config
index 17696630be..bed7159b28 100644
--- a/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.config
+++ b/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.config
@@ -16,7 +16,7 @@ GEN_TARGET_BACKEND=true
 LINKLIBS="${LINKLIBS} -lcudaq-rest-qpu"
 
 # Define the lowering pipeline, here we lower to Adaptive QIR
-PLATFORM_LOWERING_CONFIG="expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping"
+PLATFORM_LOWERING_CONFIG="canonicalize,cse,lift-array-value,state-prep,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping"
 
 # Tell the rest-qpu that we are generating QIR.
 CODEGEN_EMISSION=qir-adaptive
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index 886c3d92b8..994390cde3 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -7,6 +7,7 @@
  ******************************************************************************/
 
 // RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --enable-mlir --target quantinuum --emulate %s -o %t && %t | FileCheck %s
 
 #include <cudaq.h>
 #include <iostream>
@@ -16,6 +17,11 @@ __qpu__ void test_complex_constant_array() {
 }
 
 __qpu__ void test_complex_constant_array2() {
+   cudaq::qvector v1(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+   cudaq::qvector v2(std::vector<cudaq::complex>({ 0., 0., M_SQRT1_2, M_SQRT1_2}));
+}
+
+__qpu__ void test_complex_constant_array3() {
    cudaq::qvector v({
     cudaq::complex(M_SQRT1_2),
     cudaq::complex(M_SQRT1_2),
@@ -24,14 +30,14 @@ __qpu__ void test_complex_constant_array2() {
   });
 }
 
-__qpu__ void test_real_constant_array() {
-  cudaq::qvector v({ M_SQRT1_2, M_SQRT1_2, 0., 0.});
-}
-
 __qpu__ void test_complex_array_param(std::vector<cudaq::complex> inState) {
   cudaq::qvector q1 = inState;
 }
 
+__qpu__ void test_real_constant_array() {
+  cudaq::qvector v({ M_SQRT1_2, M_SQRT1_2, 0., 0.});
+}
+
 __qpu__ void test_real_array_param(std::vector<cudaq::real> inState) {
   cudaq::qvector q1 = inState;
 }
@@ -49,6 +55,7 @@ void printCounts(cudaq::sample_result& result) {
 }
 
 int main() {
+
     {
       auto counts = cudaq::sample(test_complex_constant_array);
       printCounts(counts);
@@ -59,6 +66,11 @@ int main() {
       printCounts(counts);
     }
 
+    {
+      auto counts = cudaq::sample(test_complex_constant_array3);
+      printCounts(counts);
+    }
+
     {
       auto counts = cudaq::sample(test_real_constant_array);
       printCounts(counts);
@@ -118,8 +130,10 @@ int main() {
 // CHECK: 00
 // CHECK: 10
 
-// CHECK: 00
-// CHECK: 10
+// CHECK: 0001
+// CHECK: 0011
+// CHECK: 1001
+// CHECK: 1011
 
 // CHECK: 00
 // CHECK: 10
@@ -127,23 +141,20 @@ int main() {
 // CHECK: 00
 // CHECK: 10
 
-
 // CHECK: 00
 // CHECK: 10
-// CHECK: 01
-// CHECK: 11
 
-// CHECK: 00
-// CHECK: 10
 // CHECK: 01
 // CHECK: 11
 
 // CHECK: 00
 // CHECK: 10
+
 // CHECK: 01
 // CHECK: 11
 
 // CHECK: 00
 // CHECK: 10
+
 // CHECK: 01
-// CHECK: 11
+// CHECK: 11
\ No newline at end of file
diff --git a/test/Quake/lift_array.qke b/test/Quake/lift_array.qke
index b12196793d..a8b9b337b2 100644
--- a/test/Quake/lift_array.qke
+++ b/test/Quake/lift_array.qke
@@ -8,6 +8,34 @@
 
 // RUN: cudaq-opt -lift-array-value %s | FileCheck %s
 
+func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+    %cst = arith.constant 0.000000e+00 : f32
+    %cst_0 = arith.constant 0.70710678118654757 : f64
+    %0 = arith.truncf %cst_0 : f64 to f32
+    %1 = complex.create %0, %cst : complex<f32>
+    %2 = complex.create %cst, %cst : complex<f32>
+    %3 = cc.alloca !cc.array<complex<f32> x 4>
+    %4 = cc.cast %3 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+    cc.store %1, %4 : !cc.ptr<complex<f32>>
+    %5 = cc.compute_ptr %3[1] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+    cc.store %1, %5 : !cc.ptr<complex<f32>>
+    %6 = cc.compute_ptr %3[2] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+    cc.store %2, %6 : !cc.ptr<complex<f32>>
+    %7 = cc.compute_ptr %3[3] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+    cc.store %2, %7 : !cc.ptr<complex<f32>>
+    %8 = quake.alloca !quake.veq<2>
+    %9 = quake.init_state %8, %4 : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
+    return
+  }
+
+// CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = cc.address_of @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<complex<f32> x 4>>
+// CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %[[VAL_0]] : (!quake.veq<2>, !cc.ptr<!cc.array<complex<f32> x 4>>) -> !quake.veq<2>
+// CHECK:           return
+// CHECK:         }
+
+
 func.func private @__nvqpp_vectorCopyCtor(%0: !cc.ptr<i8>, %1: i64, %2: i64) -> !cc.ptr<i8>
 
 func.func @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v() -> !cc.stdvec<complex<f64>> attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
@@ -71,7 +99,6 @@ func.func @test2() -> !quake.veq<2> {
 // CHECK:           return %[[VAL_2]] : !quake.veq<2>
 // CHECK:         }
 
-// CHECK-DAG:         cc.global constant @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v.rodata_{{[0-9]+}} (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (-0.70710678118654757,0.000000e+00)]> : tensor<4xcomplex<f64>>) : !cc.array<complex<f64> x 4>
-
-// CHECK-DAG:         cc.global constant @test2.rodata_{{[0-9]+}} ([1.000000e+00, 2.000000e+00, 6.000000e+00, 9.000000e+00]) : !cc.array<f64 x 4>
-
+// CHECK-DAG:     cc.global constant @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv.rodata_{{[0-9]+}} (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<4xcomplex<f32>>) : !cc.array<complex<f32> x 4>
+// CHECK-DAG:     cc.global constant @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v.rodata_{{[0-9]+}} (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (-0.70710678118654757,0.000000e+00)]> : tensor<4xcomplex<f64>>) : !cc.array<complex<f64> x 4>
+// CHECK-DAG:     cc.global constant @test2.rodata_{{[0-9]+}} (dense<[1.000000e+00, 2.000000e+00, 6.000000e+00, 9.000000e+00]> : tensor<4xf64>) : !cc.array<f64 x 4>
diff --git a/test/Quake/lift_array_temp.qke b/test/Quake/lift_array_temp.qke
new file mode 100644
index 0000000000..b3500d9e2a
--- /dev/null
+++ b/test/Quake/lift_array_temp.qke
@@ -0,0 +1,50 @@
+// ========================================================================== //
+// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// All rights reserved.                                                       //
+//                                                                            //
+// This source code and the accompanying materials are made available under   //
+// the terms of the Apache License 2.0 which accompanies this distribution.   //
+// ========================================================================== //
+
+// RUN: cudaq-opt -lift-array-value %s | FileCheck %s
+
+
+func.func private @__nvqpp_vectorCopyCtor(%0: !cc.ptr<i8>, %1: i64, %2: i64) -> !cc.ptr<i8>
+
+func.func @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v() -> !cc.stdvec<complex<f64>> attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  %cst = arith.constant -0.70710678118654757 : f64
+  %c16_i64 = arith.constant 16 : i64
+  %c4_i64 = arith.constant 4 : i64
+  %cst_0 = arith.constant 0.70710678118654757 : f64
+  %cst_1 = arith.constant 0.000000e+00 : f64
+  %0 = complex.create %cst_0, %cst_1 : complex<f64>
+  %1 = complex.create %cst_0, %cst_1 : complex<f64>
+  %2 = complex.create %cst_0, %cst_1 : complex<f64>
+  %3 = complex.create %cst, %cst_1 : complex<f64>
+  %4 = cc.alloca !cc.array<complex<f64> x 4>
+  %5 = cc.cast %4 : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+  cc.store %0, %5 : !cc.ptr<complex<f64>>
+  %6 = cc.compute_ptr %4[1] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+  cc.store %1, %6 : !cc.ptr<complex<f64>>
+  %7 = cc.compute_ptr %4[2] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+  cc.store %2, %7 : !cc.ptr<complex<f64>>
+  %8 = cc.compute_ptr %4[3] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+  cc.store %3, %8 : !cc.ptr<complex<f64>>
+  %9 = cc.cast %4 : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<i8>
+  %10 = call @__nvqpp_vectorCopyCtor(%9, %c4_i64, %c16_i64) : (!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
+  %11 = cc.stdvec_init %10, %c4_i64 : (!cc.ptr<i8>, i64) -> !cc.stdvec<complex<f64>>
+  return %11 : !cc.stdvec<complex<f64>>
+}
+  
+// CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v() -> !cc.stdvec<complex<f64>> attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 16 : i64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 4 : i64
+// CHECK:           %[[VAL_2:.*]] = cc.address_of @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<complex<f64> x 4>>
+// CHECK:           %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<i8>
+// CHECK:           %[[VAL_4:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_3]], %[[VAL_1]], %[[VAL_0]]) : (!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
+// CHECK:           %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_1]] : (!cc.ptr<i8>, i64) -> !cc.stdvec<complex<f64>>
+// CHECK:           return %[[VAL_5]] : !cc.stdvec<complex<f64>>
+// CHECK:         }
+
+// CHECK-DAG:         cc.global constant @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v.rodata_{{[0-9]+}} (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (-0.70710678118654757,0.000000e+00)]> : tensor<4xcomplex<f64>>) : !cc.array<complex<f64> x 4>
+
diff --git a/test/Quake/state_prep.qke b/test/Quake/state_prep.qke
new file mode 100644
index 0000000000..3ba6d077bb
--- /dev/null
+++ b/test/Quake/state_prep.qke
@@ -0,0 +1,114 @@
+// ========================================================================== //
+// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// All rights reserved.                                                       //
+//                                                                            //
+// This source code and the accompanying materials are made available under   //
+// the terms of the Apache License 2.0 which accompanies this distribution.   //
+// ========================================================================== //
+
+// RUN: cudaq-opt -state-prep %s | FileCheck %s
+
+module {
+  func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+    %0 = cc.address_of @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv.rodata_0 : !cc.ptr<!cc.array<complex<f32> x 4>>
+    %1 = quake.alloca !quake.veq<2>
+    %2 = quake.init_state %1, %0 : (!quake.veq<2>, !cc.ptr<!cc.array<complex<f32> x 4>>) -> !quake.veq<2>
+    return
+  }
+  cc.global constant @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv.rodata_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<4xcomplex<f32>>) : !cc.array<complex<f32> x 4>
+
+// CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_2:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_1]]] : (!quake.veq<2>, i64) -> !quake.ref
+// CHECK:           %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK:           quake.ry (%[[VAL_3]]) %[[VAL_2]] : (f64, !quake.ref) -> ()
+// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_5:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_4]]] : (!quake.veq<2>, i64) -> !quake.ref
+// CHECK:           %[[VAL_6:.*]] = arith.constant 0.78539816339744839 : f64
+// CHECK:           quake.ry (%[[VAL_6]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           %[[VAL_7:.*]] = arith.constant 0.78539816339744839 : f64
+// CHECK:           quake.ry (%[[VAL_7]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           return
+// CHECK:         }
+
+
+ func.func @__nvqpp__mlirgen__function_test_real_constant_array._Z24test_real_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+    %0 = cc.address_of @__nvqpp__mlirgen__function_test_real_constant_array._Z24test_real_constant_arrayv.rodata_0 : !cc.ptr<!cc.array<f64 x 4>>
+    %1 = quake.alloca !quake.veq<2>
+    %2 = quake.init_state %1, %0 : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
+    return
+  }
+  cc.global constant @__nvqpp__mlirgen__function_test_real_constant_array._Z24test_real_constant_arrayv.rodata_0 (dense<[0.70710678118654757, 0.70710678118654757, 0.000000e+00, 0.000000e+00]> : tensor<4xf64>) : !cc.array<f64 x 4>
+
+// CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_real_constant_array._Z24test_real_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_2:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_1]]] : (!quake.veq<2>, i64) -> !quake.ref
+// CHECK:           %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK:           quake.ry (%[[VAL_3]]) %[[VAL_2]] : (f64, !quake.ref) -> ()
+// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_5:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_4]]] : (!quake.veq<2>, i64) -> !quake.ref
+// CHECK:           %[[VAL_6:.*]] = arith.constant 0.78539816339744839 : f64
+// CHECK:           quake.ry (%[[VAL_6]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           %[[VAL_7:.*]] = arith.constant 0.78539816339744839 : f64
+// CHECK:           quake.ry (%[[VAL_7]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           return
+// CHECK:         }
+
+  func.func @__nvqpp__mlirgen__function_test_complex_array_param._Z24test_complex_array_paramSt6vectorISt7complexIfESaIS1_EE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+    %0 = cc.address_of @__nvqpp_rodata_init_state.0 : !cc.ptr<!cc.array<complex<f32> x 4>>
+    %1 = cc.cast %0 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<!cc.array<complex<f32> x ?>>
+    %2 = quake.alloca !quake.veq<2>
+    %3 = quake.init_state %2, %1 : (!quake.veq<2>, !cc.ptr<!cc.array<complex<f32> x ?>>) -> !quake.veq<2>
+    return
+  }
+  cc.global constant @__nvqpp_rodata_init_state.0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<4xcomplex<f32>>) : !cc.array<complex<f32> x 4>
+
+// CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_complex_array_param._Z24test_complex_array_paramSt6vectorISt7complexIfESaIS1_EE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_2:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_1]]] : (!quake.veq<2>, i64) -> !quake.ref
+// CHECK:           %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK:           quake.ry (%[[VAL_3]]) %[[VAL_2]] : (f64, !quake.ref) -> ()
+// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_5:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_4]]] : (!quake.veq<2>, i64) -> !quake.ref
+// CHECK:           %[[VAL_6:.*]] = arith.constant 0.78539816339744839 : f64
+// CHECK:           quake.ry (%[[VAL_6]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           %[[VAL_7:.*]] = arith.constant 0.78539816339744839 : f64
+// CHECK:           quake.ry (%[[VAL_7]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           return
+// CHECK:         }
+
+  func.func @__nvqpp__mlirgen__function_test_real_array_param._Z21test_real_array_paramSt6vectorIfSaIfEE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+    %0 = cc.address_of @__nvqpp_rodata_init_state.1 : !cc.ptr<!cc.array<f32 x 4>>
+    %1 = cc.cast %0 : (!cc.ptr<!cc.array<f32 x 4>>) -> !cc.ptr<!cc.array<f32 x ?>>
+    %2 = quake.alloca !quake.veq<2>
+    %3 = quake.init_state %2, %1 : (!quake.veq<2>, !cc.ptr<!cc.array<f32 x ?>>) -> !quake.veq<2>
+    return
+  }
+  cc.global constant @__nvqpp_rodata_init_state.1 (dense<[0.707106769, 0.707106769, 0.000000e+00, 0.000000e+00]> : tensor<4xf32>) : !cc.array<f32 x 4>
+
+// CHECK-LABEL:     func.func @__nvqpp__mlirgen__function_test_real_array_param._Z21test_real_array_paramSt6vectorIfSaIfEE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_2:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_1]]] : (!quake.veq<2>, i64) -> !quake.ref
+// CHECK:           %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK:           quake.ry (%[[VAL_3]]) %[[VAL_2]] : (f64, !quake.ref) -> ()
+// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_5:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_4]]] : (!quake.veq<2>, i64) -> !quake.ref
+// CHECK:           %[[VAL_6:.*]] = arith.constant 0.78539816339744839 : f64
+// CHECK:           quake.ry (%[[VAL_6]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           %[[VAL_7:.*]] = arith.constant 0.78539816339744839 : f64
+// CHECK:           quake.ry (%[[VAL_7]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           return
+// CHECK:         }
+}

From 4828dbba3d08066bcbed1cc1972beef78cd3b56e Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 11 Jul 2024 09:16:16 -0700
Subject: [PATCH 17/50] Fix failing backend tests: use typing.List annotations

---
 program.py                                    | 24 +++++++++++++++++++
 python/tests/backends/test_IQM.py             | 11 ++++-----
 python/tests/backends/test_IonQ.py            |  5 ++--
 python/tests/backends/test_OQC.py             |  6 ++---
 .../test_Quantinuum_LocalEmulation_builder.py |  3 ++-
 .../test_Quantinuum_LocalEmulation_kernel.py  |  3 ++-
 .../tests/backends/test_Quantinuum_builder.py |  3 ++-
 .../tests/backends/test_Quantinuum_kernel.py  |  3 ++-
 8 files changed, 43 insertions(+), 15 deletions(-)
 create mode 100644 program.py

diff --git a/program.py b/program.py
new file mode 100644
index 0000000000..0c5b92f20e
--- /dev/null
+++ b/program.py
@@ -0,0 +1,24 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+
+import cudaq
+import numpy as np
+
+cudaq.set_target('iqm', url="http://localhost/cocos", **{"qpu-architecture": "Adonis"})
+
+@cudaq.kernel
+def kernel(vec: list[complex]):
+    qubits = cudaq.qvector(vec)
+
+state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
+counts = cudaq.sample(kernel, state)
+print(counts)
+assert '00' in counts
+assert '10' in counts
+assert not '01' in counts
+assert not '11' in counts
\ No newline at end of file
diff --git a/python/tests/backends/test_IQM.py b/python/tests/backends/test_IQM.py
index d91f8575e9..bf3746bce4 100644
--- a/python/tests/backends/test_IQM.py
+++ b/python/tests/backends/test_IQM.py
@@ -9,6 +9,7 @@
 import os
 import tempfile
 import time
+from typing import List
 from multiprocessing import Process
 import numpy as np
 
@@ -162,27 +163,25 @@ def kernel():
 def test_IQM_state_preparation():
 
     @cudaq.kernel
-    def kernel(vec: list[complex]):
+    def kernel(vec: List[complex]):
         qubits = cudaq.qvector(vec)
 
     state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
     counts = cudaq.sample(kernel, state)
+    counts.dump()
     assert '00' in counts
     assert '10' in counts
-    assert not '01' in counts
-    assert not '11' in counts
 
 
 def test_IQM_state_preparation_builder():
-    kernel, state = cudaq.make_kernel(list[complex])
+    kernel, state = cudaq.make_kernel(List[complex])
     qubits = kernel.qalloc(state)
 
     state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
     counts = cudaq.sample(kernel, state)
+    counts.dump()
     assert '00' in counts
     assert '10' in counts
-    assert not '01' in counts
-    assert not '11' in counts
 
 
 # leave for gdb debugging
diff --git a/python/tests/backends/test_IonQ.py b/python/tests/backends/test_IonQ.py
index 92a7ac8a5d..f468a1d9c8 100644
--- a/python/tests/backends/test_IonQ.py
+++ b/python/tests/backends/test_IonQ.py
@@ -9,6 +9,7 @@
 import cudaq, pytest, os, time
 from cudaq import spin
 import numpy as np
+from typing import List
 from multiprocessing import Process
 try:
     from utils.mock_qpu.ionq import startServer
@@ -160,7 +161,7 @@ def kernel():
 def test_ionq_state_preparation():
 
     @cudaq.kernel
-    def kernel(vec: list[complex]):
+    def kernel(vec: List[complex]):
         qubits = cudaq.qvector(vec)
 
     state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
@@ -172,7 +173,7 @@ def kernel(vec: list[complex]):
 
 
 def test_ionq_state_preparation_builder():
-    kernel, state = cudaq.make_kernel(list[complex])
+    kernel, state = cudaq.make_kernel(List[complex])
     qubits = kernel.qalloc(state)
 
     state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
diff --git a/python/tests/backends/test_OQC.py b/python/tests/backends/test_OQC.py
index 0dc40e4bec..1ff86c535c 100644
--- a/python/tests/backends/test_OQC.py
+++ b/python/tests/backends/test_OQC.py
@@ -9,7 +9,7 @@
 import os
 import sys
 import time
-
+from typing import List
 import pytest
 from multiprocessing import Process
 
@@ -162,7 +162,7 @@ def test_OQC_observe():
 def test_OQC_state_preparation():
 
     @cudaq.kernel
-    def kernel(vec: list[complex]):
+    def kernel(vec: List[complex]):
         qubits = cudaq.qvector(vec)
 
     state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
@@ -174,7 +174,7 @@ def kernel(vec: list[complex]):
 
 
 def test_OQC_state_preparation_builder():
-    kernel, state = cudaq.make_kernel(list[complex])
+    kernel, state = cudaq.make_kernel(List[complex])
     qubits = kernel.qalloc(state)
 
     state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
diff --git a/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py b/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py
index 70a1d29aa9..58176b4e32 100644
--- a/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py
+++ b/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py
@@ -9,6 +9,7 @@
 import cudaq, pytest, os, time
 from cudaq import spin
 import numpy as np
+from typing import List
 from multiprocessing import Process
 
 
@@ -113,7 +114,7 @@ def test_quantinuum_exp_pauli():
 
 
 def test_quantinuum_state_preparation():
-    kernel, state = cudaq.make_kernel(list[complex])
+    kernel, state = cudaq.make_kernel(List[complex])
     qubits = kernel.qalloc(state)
 
     state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
diff --git a/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py b/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
index 5576f46597..8471c10286 100644
--- a/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
+++ b/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
@@ -9,6 +9,7 @@
 import cudaq, pytest, os, time
 from cudaq import spin
 import numpy as np
+from typing import List
 from multiprocessing import Process
 
 
@@ -142,7 +143,7 @@ def kernel():
 def test_quantinuum_state_preparation():
 
     @cudaq.kernel
-    def kernel(vec: list[complex]):
+    def kernel(vec: List[complex]):
         qubits = cudaq.qvector(vec)
 
     state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
diff --git a/python/tests/backends/test_Quantinuum_builder.py b/python/tests/backends/test_Quantinuum_builder.py
index d20cb0d499..48d50b7419 100644
--- a/python/tests/backends/test_Quantinuum_builder.py
+++ b/python/tests/backends/test_Quantinuum_builder.py
@@ -8,6 +8,7 @@
 
 import cudaq, pytest, os, time
 import numpy as np
+from typing import List
 from cudaq import spin
 from multiprocessing import Process
 try:
@@ -147,7 +148,7 @@ def test_quantinuum_observe():
 
 
 def test_quantinuum_state_preparation():
-    kernel, state = cudaq.make_kernel(list[complex])
+    kernel, state = cudaq.make_kernel(List[complex])
     qubits = kernel.qalloc(state)
 
     state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
diff --git a/python/tests/backends/test_Quantinuum_kernel.py b/python/tests/backends/test_Quantinuum_kernel.py
index fc11224f5e..646f9cc787 100644
--- a/python/tests/backends/test_Quantinuum_kernel.py
+++ b/python/tests/backends/test_Quantinuum_kernel.py
@@ -10,6 +10,7 @@
 import numpy as np
 from cudaq import spin
 from multiprocessing import Process
+from typing import List
 try:
     from utils.mock_qpu.quantinuum import startServer
 except:
@@ -173,7 +174,7 @@ def kernel():
 def test_quantinuum_state_preparation():
 
     @cudaq.kernel
-    def kernel(vec: list[complex]):
+    def kernel(vec: List[complex]):
         qubits = cudaq.qvector(vec)
 
     state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]

From 77dbe449b2dfeb67aa0765b81b92be3d29d1fc7d Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 11 Jul 2024 13:43:34 -0700
Subject: [PATCH 18/50] Fix failing tests

---
 dictionary.dic                                | Bin 0 -> 9936 bytes
 lib/Optimizer/Builder/Intrinsics.cpp          |  30 +++++++++---------
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp |   4 +--
 python/tests/backends/test_IQM.py             |  20 +++++++-----
 runtime/common/BaseRestRemoteClient.h         |   4 +++
 .../execution/state_preparation_vector.cpp    |   3 +-
 6 files changed, 34 insertions(+), 27 deletions(-)
 create mode 100644 dictionary.dic

diff --git a/dictionary.dic b/dictionary.dic
new file mode 100644
index 0000000000000000000000000000000000000000..d5b1930b6ba86d2e1409b65c5d6b65c39e30fa80
GIT binary patch
literal 9936
zcmai)4UlA4Rmbm6_v^2j>G|yW+S$!wAc4h@K={x|h0Mp!&ZIx4x8K{{*;K^zbidi{
zOi%Z0zwXUWL?B=hh)N_tETw{=f&no^1PZ926kjT)FhLY4wTz*PhzdzzSph8zfB*Yl
z?@mz5uA2S7cka38oO|xM=bn4tR)bx?**sGBx2nBn_lSM-J4ZV0JDNvc`jVGkXa1jG
zsrtzO;CL9yk*h-T=LU?qEoICThmE=VfH8l;pMHWEe@Xu0{EhI}{~hsLX7^mX<#!Dx
z^r;~pjb4wggffoijX{MuS_owwEgF;7d&!uLux!k@uwu-#&^2a8=ovEyE+Plh%p!8G
z5F+P@5dJHL@IPM&{|mqs<X<gB?hA$Ry+}y=i@{aeuMyIItq}TkAmYu-gf|&;y>QW(
zmkVilr4YS02<iW7A@<)WM9zW`J;#OEvnE6?T}%NvJemUII4MNlX(9d32;sj4EJD9k
zNT=5dk$anv_L>kOzHq~sEwF^VvqJhcg!H>zh<)2a_;<iE`ddQy+d}wvh47yjq9+hy
ze^-dUJ+K1*&kNyyoe=(C5MuZ1h48&Wh&^|KE_&W1q~F~_`n^?1zk7t(^>!ise^rRy
zcL=fbogiTD7b54^gvj}IA##30h@1z6$a$X-dw)xa9ltF^zz2oc^B{Pd{vQ(3|HDG~
zKO%(xqeA!}1~=gQ1F-))%0q@f#cxhX|L0lEo0nKDnxC;)GOHF{^TGr0Vc($vVbQ!$
zSS*>>2$>+)V!=FXv1ks*L@%44v{*66EV{<C=$SvUxMAMH!qfWrq{S8Ud5b0Umlj=f
zqoa1u6oeJeTomHqyDfU={X)k1kj1?Dw8gS{)MCYaA6#6?o5Kl^fWAgZV7eAPQ@6Nb
z_JqiPs}T9`7cwyqTXfCm!4<~)jF13}C&P6&X|ZIUFZ4>LY_Vc$;3@&%vsf^{B&6SG
zg-q~QESAmpLELXfQ{YXD=1L(6;ChRmSrWofvsg6eEtbq(7F~0{ka_ul#YOWeA$p&%
zST;`!OC__O7P{sSEqdlLA%1@~fskjR_!cYXZ5CbgHH)5k#^M=s)gXMt*9{h3b6iNA
zowArWn-)tZ5HcTkSzI*lwwN~$S}d7AvRF2cTXfA67CrMliyJ08M7|*L{e*C(U|wah
zXpRdp_@u>(xkE_Yy-SE4pR~AWz6dJMJZ^Et{JpSX%=ayN=0`%tJvS_L%~cjR%=JPR
z_@YJ6tP7d{pA~MD%nn#0j_$GOnFobT%%_BfiusZdMSpG4HO~lH;PYAX&0@)1D<pB>
zBqYI}5@Of3Mc3>JvFmNZ%A&d7V%|Jtv0(noV$tN;q{@}Dxm`#Cy5FK_J_uIu*C&L;
z`Im&qe^N-Id|HT~a-%}eJWog-Sq5D;pt8lh@huk3=$PoPd4WaG+#_Uz`KFM7`?f{b
zjF0n<o)=jxnpaqK&9X($RD?MEHj5Q=QHVXiEW`m1SS*__3Gw&8S*#c{p?A+*1rko?
zM&Tm+j>UpGDP+7oA#w2*ixu-ui>~>Ckd5eD7G1M3srL<YyO4GKE{kRJu#k2CWg-52
z(qhG|kr+MvbeoWQyu)I}+#}q`n+L$tD=X$9A@TO77FW#Ig={QOS@g^gEpC{hDd=v+
zTqz`>zY^TQpKIX%^~)SMYMi0ig5NQN10?J0P^_`FWrhYw*vV{sy|T1mhGIvF!sKvl
z{lt=eDVpSHZ0$tR%p{KHiCE8MX9xYpS-*9`uXWoUGnWkMuF1{}?Q00kl%tk{$xdfN
zdbQc}BRQ<G>|7*A<iH$G!ClA~nau3`NVT@zqF(dsXZ;<&)eU>OR0MtPLA!Z@fmrR(
z5jPuZ4;unAEqgWP*_lv%(5-e+FLNPZ)^F*7nM1#u_ZDT@^q_yyul2gs%_f%R7>CQ6
z%+98(y^D=zv)b7+htd$srP78p92)j7?o!;fx(#&H&0*VEaTg0}%+lELgP^h5Xg0cg
z=Abl{D+>SY^iaR?f|@4cG|7NewN|5x{3~wP+aTFyr?9KmU>uEhD=?#C_DnM8oLpLR
zmqU}hQmFMHCUyPRC8asW?RphbN_H$oQ^((_Hce)LO*lI`z~fYseV7#J8fPlK(A=pO
z8m-<%Gm&6hmYVEXvbxi5oejEXHl{5)JDBua=X*YL>WszL+N~32DnUb$?KL}-?zL*|
zo!uTTpHGEU(q49ExKFo(Fq3iGpiiemS&i0NGo6H1v4Jp?ZnSEcst~|!AzAE0m+0y-
zE-CnDG8I`Am`UeoslY}Bt<h>WT9(by>`*L?nHpv&MUx$m`>o!Nf}>RSiW+Thu-9tu
zb{jj5J7LShMgWXDy_P0l`p8n*sbsZPQ$W};x?X8RV>~q6Xc1I47?>zU0}-b-Oq|O;
zr!g?&809J)Clb|GQ0?AnCh;N^&37s!5rp%Y=B-qyX|o3r(e*o9ey6?T^I+ZXmB^yj
zjc7jGgsQ=aikw^q2Zm%6VmzfF#`jvp62qCrvtFg-%7t?ogm<v9Q47owEcYtq$_Ax!
zc7D{VZSVRWJ9oqkD}#0^V7eu%fu-`iQs8AGfjtdzL#E6OG8Zh9CVs_6h6-5{n9)o0
zKnbc$Qi?5<*A(Yvt(CDvZ}&o*fMNt?0??UEuJ8J-g{38vONc_j>`bOaJF&p?AqOIR
zg_B8$rl$$pqlnN6H8O`0$>lL55&z>>hI`UIrubl2cShqFVl#ol1K*|lv0=S0BGk#u
zC2F9MkrO5yyn+K}la!eoIGN=&v=HpIYUeubR=XD{%W4&qm;n|fow5U*6A272uWUG;
ztk!RD<9rR=)9RXlQr%Vd#bzkdor*|y0=x<}UpAL&5FScm%7)B|lJV4_C3YJZ{N|p?
z#;ImqlN<q$v;D>?YOR7zGO*vOqd*iCjHe<Ji6^F#7b`Z~?WSLCG3AQ|GAE?;X0Oq#
zTa{V3Ht?^r*lgF%5suWuBod_O1~)tHYQ0tslq}(+KxS=iwgXnnSe!Z;rpe@Hqg!j&
zeZqT@t&>v};SDWNcq&VTR7}fdE|CLLD0Ma2X)?_M*<t-PYGx!(jg9BC&Pw_CLSRM)
zIO7sNEA1X3+aY{bC~7qDsnizJLgK8G5JRGzo9<J!e5lKjKat`*-?ie$AYmS%w9@W$
zJ2)igP^@dYPiM}xciQZhn(3LuYO&yvl$_bY)#^^8sqS#kCPTW7jeQM@4HfyUgVpj9
zBcsyy6ymGQ^-i@@$MGW3ZzA2N1vqroWATi@4hjh2<S?vy7(al(RQ$xk`hxcP6KfSS
z64Oz^8BH8-bXR+udS^0|qlu;U#WRHk`;Ko$o%1~$&3eq^dXpVV*!RE;4^T5WBhHCl
zb=wy=1*tQh$XCyjps+Dd1QktuB3<jj((00FvoP5SIk8l&V$=ZHE18Tfp^ecl6<DD0
zd_f^KnW|OEunJ-%K|(W<BAQ04^NBO&EET1F%3%Z2!kf-$|EZFzSSB+GOQq|~;AOIk
z1n6v-&@FpAu!`oAAsI1q!{Qe`+`Nw_IR4nAIyzLVwlx1Xmywo-_rG_{!DPsZ()s5!
zsn!Xbv|Ah*-PbML6z9__Q7S4%7zYNhopgG&Zo{tRm{uq=xKvsv7#MfERj+E?L@<7z
z&R8F_dE;irM=n+sl4E=%_I8t+O1`ygdNOrh^Fb<>Zy~`rp6=7?oXRPMH7(_n#zAAR
zyr<XWyaFdUU?_rk8;j}CN2aQ^d%N0v#^VqeIkOj1@U@et=!vFax6_X5o7sdV<B8d!
zT05d^fiq{~r^D3chIS_~hfqfBU{CpEAjBRp8^@Bsnu1>@k~jBu_w4S7UXME^Czpvx
zh3Y1W4=VPLA_bg)>5SYxhQYggHVTFYIH6J)A75Q!yNu=8=*S@XPPM1~Nm(<?@^Q5u
zC>*tZPBemT7RYccPew})$8UDmOJ-t#M$OmBWgFZ#uX%Eh$7zV9stmVT4SbV}<vFip
z(@w2dub#KExtI|nc_a>F&!e0pmaznb?Q-)PWhwKJl#OruT{0VQOrn-*JzT1`)E#4~
z@~-Tdr<BAO>7{C?>jz3dlMW|j4Q?V`Xl!=EU3?^_<8pR1(Y#>G_F>*R3uNPk!YPh(
zF&&=Sk(=%Ic5j!Y%{x})k|9C$N+Dmlsi>l4cBI)pt6lKijus2M4&9YP>E38<6D@SN
z_1kDNBq*gQdPbaPI|%kjg9XmgY>|S&0oHqRBv$0OGU*iaE8-ka@38u}8miZa2B>nA
z<MCpF+%}ga<=#}0Q>KP@xDUpRj$f^_3J_!iSmnS_Os99ES&-%54OBR^EUl_d+Gzvy
zFLON-FYbW=dm0JJ*`d<C$f!Bovux2eJF??<YUfy9{nRlM(}_Ph>Xc3?>t^G;D!Ipb
zUt;D;6gec_(y6uOHJ!wCAAr`<@g*FlO9A%8Rxdn$%qF-UaFsu1hg<CqwS42wFnQ#h
zQqk2QrqZqIjvwq+YuZ9B5o#`TiK@MfnvxlsNVVu}3xd&jdC?XfiQT=;2HRlH;hsU1
z=?s<MmUK~xOeVN>P!4g%Qsup~D$q4)Bmn}m%)NVq0VxAGBMJMeX(3;5QYi1?@96}$
z64p;MgRJ?@zK=vF_o8kl!=b5vgqTf+WaxA7_vxIpQZ!H%E6(Oek#x3G-PvI<VW3Sr
z<w`;Efflv`lAr2MGlv$)c<<owWxA5iJc?+MV-Js9ZVFv1Wku>;wW17xw*KyWegb7>
zd1^9=iBz0*au_6=^rnYIyOi}t;@q~V?sCfO?i%-0u`=PLS9guwYb2)2muxQ430L2I
zic6T4yshe{vl@SEt8um$)^<Y!+|01uS5}V|m{KYkPBatAPLIsa{yX7_f)B=05lQKb
z^`4xHRon%x#<;*?vtf<soEM)R?f9<UZ+acS?r~NWyVe8jt1LP0ckGUm4tkqg?Pi??
zEMj=lNI|SMtAVN=OT$+)gMFec+(J4AO=tRaR_%01wt;quTo1`M{$?$Ao7u&&^C^UL
zh}Kjb8iv1rjHZG-l<v2oV=nugMpeh|Vv(zHPV=<vwCg=83w3exm9pJ&=Or(wbsD?&
z0)P{E)F|6Dl(3BqGKDOq{%9nPPOV4M)kvVCe98_yBx|*~i)3!51{vMGU2;DCy)r|{
z$recRj|^0y#@-q6k+v-jMdN*1odls91ZKrOwX9BIxs*!<gudOsutPL@F#(nsmM~M$
z-2zjcoyY|K#h$HSlnyxqk#A1xWE*q2lqVeB!pTaJqzAt;m0LLG+BXz4oWikX>TvGk
z3U=?xWrDpOwtp&>8as_!B*>_pgC29%(YhOrb62OFkys*Oth-_=%Rq*@%iV~!YeoWz
zj??KrEil(4-Ex_A=bGGj*6opqD9Jc3>KY_F!xTS<f@j!D<(cI?$0!@!3^ox~QCw+Q
zifl`UBpZEP%V{bn2JMAVbi0)~z(|K;%j;|ShYLf5dW&o33WrGH-Xb@Y^5pB|eBNVj
zY`jqHldqLmow20X-eQR~m@M5Ya+T&SmC43lbiXm3uKWG_joD;KB^;)PE|C=x!*o`z
zh%FzdB5-&?wKJYvZeQG!wp^U6Nlc&0)O~VXyQYh-X-7o)aqu!#RhY;y?<bR}M7G-r
zsv{Y?yO>Q_I$~yq_cu^yp;R$6p6qf83^=e(LXn7ES`@avh7Llase9%UF`zW&&Xh|9
zx*yEeNm5)lv|Gd>2jyJCBc7~<>8?*W*&8)_RQj1WOBt(B>F3?KBsZ-qHaE@=)f+*r
z&COu60%)no2{RcrbbMRu;%Q4p{oLSXy0S{xpi#_WSkxoT{!@<UR>Hlc?gyw3>wYFP
z3a}jR595g{Z9Gmqv5+QG^=h}e)okBkNjfHur<Q%H1<sj@6E1(99B~qyYizeU3UUiv
zvNMqzx}$o*-{QVZaZdPgFHFUG>clZ5ou=$j#NE;EY+K=DF<ld*j<Gkkbf|@i1Gm=J
z1y*0R&YGNZPKS3nQ-hW!!Cd3O>Du#a>Qwd42Itp?NcIM5T=H1XB<Qi2E|{5bEZwq&
zbsPe|9ZE%%;D5h)euQr%2e=21@$KwKKlXcs?{*^H%lA3q3BC^r^}CgR*KyzeUHYD+
z?-%*D6}DXdca!_>>!LnC;M<g1w)gRUhqhCEJ2}I5h|9ggr?0{HQs^J#8>1-8QNKC9
zO<s%6K7R|Ik2AD;`KHvTo#y+;H~8kI_YCrHWgL7?xYRECo9Q<f$^6$yM<*Ju+V#st
zHtYEoz6HLH?;zh`+<LzjpL_?IcSgRs3H>|p{0cr%`;G8@75;Seejm?Yfp?zw_c8|Y
zTtM%=eCv5N-$C^LHS~X)=RAL(f&XsCA^IqCh5E&j@km79!2Z?97qjpW_vetY-Gc4E
z!}kdF$?$$B_Wd!w(EC06#`X4S>{mwk=h*#C=4Eo9Fa9o)qrT7J*9Z3b`f$|tTE_JX
z?EE@5j7PGr;{D=2U3W+QZorO@!*@+&kL31$Z$kgG=>N@VT({GwfGzXT1|q+GZ=XHt
z_q+Jy31mGVU+Vqc(Y)&UF?_j$%&UKF{?+ymQH<z0kL<_bxedL~;@?+9{M(UF?*lJJ
z_P!W->hrhJ+|Pn*QQP(K-ALSj9Dn%C$$#?xWMsGc+#C7zQD}$o&8>{@1MvCuZ(z?a
z%D?oz9v_w>JvT%?Jb+xu_%OcsI<z%V@Bb0SkDm1#LB<j?-hl2;F$a(G_Zh}@S7gU)
zXkVxOU-;XIWT@X1`aXcHpM&;jG@bxoJWgN0>Js05QJ;~>-=gXFSJ?dfFn%7rhrwKA
z??K{E?MGw}<9#`H9OHQ#T}Pv_y`DKs&}S`@nS!2y_XpVY;z-Uv@6%U_V&JFg^KJTm
zVxNzH7}+kJk1#h!cz+6cZ>0az_*!)Ru0X!<S^B;V`nS>bMAW{`cwfL=J&yd3Kz~&<
zZ~w%*{{P1BMRu$I8yUxg8XNN8h_4&8eT=`#KA*jcw$I`BwJ2s@g3X`BU*%|Qe}gU8
zB4-WSSC|(cl<fD>NBC2WcZfcE?l2$sMdNp{?Yr3a0Qzp`FOCfHEHX!5jd=8~-$(Jo
zlg!2Yu;quydI#^XkLL3IJg;KA_})Ojry}1x#kk%H{m-H~d^)mUeAh-iS4TF!2cBdk
z?=f`zAd)9u{r-`*GuZcO<i8o;B*2f+r;h(0g!c<k-+xE$d-1dS%)|fDNY8s1Yl->!
zCjL>|&5=!d{?<NS?}~hI2D|RYSGQryLyWf>$ru0YkWq@{&cGwudHB8z?Uu;*qTP;-
VAByJwAU5kik$nz7`}je>{|1;FG}!<E

literal 0
HcmV?d00001

diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index 2b8be438fd..0859199fe2 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -387,14 +387,6 @@ LogicalResult IRBuilder::loadIntrinsic(ModuleOp module, StringRef intrinName) {
       ParserConfig{module.getContext(), /*verifyAfterParse=*/false});
 }
 
-template <typename A>
-static std::vector<std::int32_t> asI32(const std::vector<A> &v) {
-  std::vector<std::int32_t> result(v.size());
-  for (auto iter : llvm::enumerate(v))
-    result[iter.index()] = static_cast<std::int32_t>(iter.value());
-  return result;
-}
-
 template <typename T>
 DenseElementsAttr createArrayAttr(const std::vector<T> &values, Type eleTy) {
   auto newValues = ArrayRef<T>(values.data(), values.size());
@@ -402,6 +394,17 @@ DenseElementsAttr createArrayAttr(const std::vector<T> &values, Type eleTy) {
   return DenseElementsAttr::get(tensorTy, newValues);
 }
 
+DenseElementsAttr createArrayAttr(const std::vector<bool> &values, Type eleTy) {
+  std::vector<std::byte> converted;
+  for (auto b : values) {
+    converted.push_back(std::byte(b));
+  }
+  auto newValues = ArrayRef<bool>(reinterpret_cast<bool *>(converted.data()),
+                                  converted.size());
+  auto tensorTy = RankedTensorType::get(converted.size(), eleTy);
+  return DenseElementsAttr::get(tensorTy, newValues);
+}
+
 template <typename A>
 cc::GlobalOp buildVectorOfConstantElements(Location loc, ModuleOp module,
                                            StringRef name,
@@ -465,25 +468,22 @@ IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
 cc::GlobalOp
 IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
                                 const std::vector<std::int16_t> &values) {
-  auto converted = asI32(values);
   return buildVectorOfConstantElements(loc, module, name, values, *this,
-                                       getI32Type());
+                                       getI16Type());
 }
 
 cc::GlobalOp
 IRBuilder::genVectorOfConstants(Location loc, ModuleOp module, StringRef name,
                                 const std::vector<std::int8_t> &values) {
-  auto converted = asI32(values);
   return buildVectorOfConstantElements(loc, module, name, values, *this,
-                                       getI32Type());
+                                       getI8Type());
 }
 
 cc::GlobalOp IRBuilder::genVectorOfConstants(Location loc, ModuleOp module,
                                              StringRef name,
                                              const std::vector<bool> &values) {
-  auto converted = asI32(values);
-  return buildVectorOfConstantElements(loc, module, name, converted, *this,
-                                       getI32Type());
+  return buildVectorOfConstantElements(loc, module, name, values, *this,
+                                       getI1Type());
 }
 
 Value IRBuilder::getByteSizeOfType(Location loc, Type ty) {
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index 58a5f4a3f9..28622fa598 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -251,9 +251,9 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
     generateNewValue = true;
   }
   if (generateNewValue) {
-    auto [memArr, data] = getArrayInMemory();
+    auto [memArr, _] = getArrayInMemory();
     OpBuilder::InsertionGuard guard(builder);
-    builder.setInsertionPointAfter(data.getDefiningOp());
+    builder.setInsertionPointAfter(memArr.getDefiningOp());
     Value size = builder.create<arith::ConstantIntOp>(argLoc, vec.size(), 64);
     Value newVec =
         builder.create<cudaq::cc::StdvecInitOp>(argLoc, strTy, memArr, size);
diff --git a/python/tests/backends/test_IQM.py b/python/tests/backends/test_IQM.py
index bf3746bce4..38e2b55363 100644
--- a/python/tests/backends/test_IQM.py
+++ b/python/tests/backends/test_IQM.py
@@ -161,27 +161,31 @@ def kernel():
 
 
 def test_IQM_state_preparation():
+    shots = 10000
 
     @cudaq.kernel
     def kernel(vec: List[complex]):
         qubits = cudaq.qvector(vec)
 
     state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
-    counts = cudaq.sample(kernel, state)
-    counts.dump()
-    assert '00' in counts
-    assert '10' in counts
+    counts = cudaq.sample(kernel, state, shots_count=shots)
+    assert assert_close(counts["00"], shots / 2, 2)
+    assert assert_close(counts["10"], shots / 2, 2)
+    assert assert_close(counts["01"], 0., 2)
+    assert assert_close(counts["11"], 0., 2)
 
 
 def test_IQM_state_preparation_builder():
+    shots = 10000
     kernel, state = cudaq.make_kernel(List[complex])
     qubits = kernel.qalloc(state)
 
     state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
-    counts = cudaq.sample(kernel, state)
-    counts.dump()
-    assert '00' in counts
-    assert '10' in counts
+    counts = cudaq.sample(kernel, state, shots_count=shots)
+    assert assert_close(counts["00"], shots / 2, 2)
+    assert assert_close(counts["10"], shots / 2, 2)
+    assert assert_close(counts["01"], 0., 2)
+    assert assert_close(counts["11"], 0., 2)
 
 
 # leave for gdb debugging
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index f9b7d0d736..9b2f51ffdb 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -21,6 +21,7 @@
 #include "cudaq/Optimizer/CodeGen/Passes.h"
 #include "cudaq/Optimizer/CodeGen/Pipelines.h"
 #include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
 #include "cudaq/Optimizer/Transforms/Passes.h"
 #include "llvm/Bitcode/BitcodeReader.h"
@@ -159,6 +160,9 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
         if (funcOp && (funcOp->hasAttr(cudaq::kernelAttrName) ||
                        funcOp.getName().startswith("__nvqpp__mlirgen__")))
           moduleOp.push_back(funcOp.clone());
+        // Add globals defined in the module.
+        if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op))
+          moduleOp.push_back(globalOp.clone());
       }
 
       if (args) {
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index 994390cde3..35d2b68619 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -55,7 +55,6 @@ void printCounts(cudaq::sample_result& result) {
 }
 
 int main() {
-
     {
       auto counts = cudaq::sample(test_complex_constant_array);
       printCounts(counts);
@@ -157,4 +156,4 @@ int main() {
 // CHECK: 10
 
 // CHECK: 01
-// CHECK: 11
\ No newline at end of file
+// CHECK: 11

From ebaf6c31d7adf7b4f2d5b407f069abd4b34285de Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 11 Jul 2024 16:30:38 -0700
Subject: [PATCH 19/50] Add remote sim tests

---
 lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp   |   6 +-
 runtime/common/BaseRestRemoteClient.h        |  21 +-
 targettests/Remote-Sim/state_init.cpp        |  95 +++++++++
 targettests/Remote-Sim/state_init_vector.cpp | 201 +++++++++++++++++++
 4 files changed, 314 insertions(+), 9 deletions(-)
 create mode 100644 targettests/Remote-Sim/state_init.cpp
 create mode 100644 targettests/Remote-Sim/state_init_vector.cpp

diff --git a/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp b/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
index 8d4be784db..153f8d5b56 100644
--- a/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
+++ b/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
@@ -43,7 +43,9 @@ struct VerifyNVQIRCallOpsPass
           cudaq::opt::NVQIRInvokeRotationWithControlBits,
           cudaq::opt::NVQIRInvokeWithControlRegisterOrBits,
           cudaq::opt::NVQIRPackSingleQubitInArray,
-          cudaq::opt::NVQIRReleasePackedQubitArray};
+          cudaq::opt::NVQIRReleasePackedQubitArray,
+          cudaq::getNumQubitsFromCudaqState,
+          };
       // It must be either NVQIR extension functions or in the allowed list.
       return std::find(NVQIR_FUNCS.begin(), NVQIR_FUNCS.end(), functionName) !=
                  NVQIR_FUNCS.end() ||
@@ -71,7 +73,7 @@ struct VerifyNVQIRCallOpsPass
         passFailed = true;
         return WalkResult::interrupt();
       } else if (!isa<LLVM::AddressOfOp, LLVM::AllocaOp, LLVM::BitcastOp,
-                      LLVM::ExtractValueOp, LLVM::GEPOp, LLVM::LoadOp,
+                      LLVM::ExtractValueOp, LLVM::GEPOp, LLVM::IntToPtrOp, LLVM::LoadOp,
                       LLVM::StoreOp>(op)) {
         // No pointers allowed except for the above operations.
         for (auto oper : op->getOperands()) {
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index 9b2f51ffdb..bde3a73011 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -155,19 +155,24 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
       auto moduleOp = builder.create<mlir::ModuleOp>();
       moduleOp->setAttrs((*module)->getAttrDictionary());
       for (auto &op : *module) {
-        auto funcOp = dyn_cast<mlir::func::FuncOp>(op);
-        // Add quantum kernels defined in the module.
-        if (funcOp && (funcOp->hasAttr(cudaq::kernelAttrName) ||
-                       funcOp.getName().startswith("__nvqpp__mlirgen__")))
-          moduleOp.push_back(funcOp.clone());
-        // Add globals defined in the module.
-        if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op))
+        if (auto funcOp = dyn_cast<mlir::func::FuncOp>(op)) {
+          // Add quantum kernels defined in the module.
+          if (funcOp->hasAttr(cudaq::kernelAttrName) ||
+              funcOp.getName().startswith("__nvqpp__mlirgen__") ||
+              funcOp.getBody().empty())
+            moduleOp.push_back(funcOp.clone());
+        }
+        if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op)) {
+          // Add globals defined in the module.
           moduleOp.push_back(globalOp.clone());
+        }
       }
 
       if (args) {
         cudaq::info("Run Quake Synth.\n");
         mlir::PassManager pm(&mlirContext);
+        moduleOp.getContext()->disableMultithreading();
+        pm.enableIRPrinting();
         pm.addPass(cudaq::opt::createQuakeSynthesizer(name, args));
         pm.addPass(mlir::createCanonicalizerPass());
         if (failed(pm.run(moduleOp)))
@@ -180,6 +185,8 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
       // Run client-side passes. `clientPasses` is empty right now, but the code
       // below accommodates putting passes into it.
       mlir::PassManager pm(&mlirContext);
+      moduleOp.getContext()->disableMultithreading();
+      pm.enableIRPrinting();
       std::string errMsg;
       llvm::raw_string_ostream os(errMsg);
       const std::string pipeline =
diff --git a/targettests/Remote-Sim/state_init.cpp b/targettests/Remote-Sim/state_init.cpp
new file mode 100644
index 0000000000..6677b4746c
--- /dev/null
+++ b/targettests/Remote-Sim/state_init.cpp
@@ -0,0 +1,95 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// REQUIRES: remote-sim
+
+// clang-format off
+// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu %s -o %t && %t
+// RUN: nvq++ %cpp_std --target remote-mqpu %s -o %t && %t // TODO: this fails to compile, do we need it?
+// clang-format on
+
+#include <cudaq.h>
+#include <iostream>
+
+
+__qpu__ void test_complex_array_param(cudaq::state* inState) {
+  cudaq::qvector q1(inState);
+}
+
+void printCounts(cudaq::sample_result& result) {
+  std::vector<std::string> values{};
+  for (auto &&[bits, counts] : result) {
+    values.push_back(bits);
+  }
+
+  std::sort(values.begin(), values.end());
+  for (auto &&bits : values) {
+    std::cout << bits << '\n';
+  }
+}
+
+int main() {
+    {
+      std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+      std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+      auto state = cudaq::state::from_data(vec);
+      auto state1 = cudaq::state::from_data(vec1);
+      {
+          // Passing state data as argument (kernel mode)
+          auto counts = cudaq::sample(test_complex_array_param, &state);
+          printCounts(counts);
+
+          counts = cudaq::sample(test_complex_array_param, &state1);
+          printCounts(counts);
+      }
+
+      // {
+      //     // Passing state data as argument (builder mode)
+      //     auto [kernel, state] = cudaq::make_kernel<cudaq::state*>();
+      //     auto qubits = kernel.qalloc(state);
+
+      //     auto counts = cudaq::sample(kernel, &state);
+      //     printCounts(counts);
+
+      //     counts = cudaq::sample(kernel, &state1);
+      //     printCounts(counts);
+      // }
+    }
+}
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 0001
+// CHECK: 0011
+// CHECK: 1001
+// CHECK: 1011
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 01
+// CHECK: 11
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 01
+// CHECK: 11
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 01
+// CHECK: 11
diff --git a/targettests/Remote-Sim/state_init_vector.cpp b/targettests/Remote-Sim/state_init_vector.cpp
new file mode 100644
index 0000000000..7e93b63dae
--- /dev/null
+++ b/targettests/Remote-Sim/state_init_vector.cpp
@@ -0,0 +1,201 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// REQUIRES: remote-sim
+
+// clang-format off
+// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu %s -o %t && %t
+// RUN: nvq++ %cpp_std --target remote-mqpu %s -o %t && %t // TODO: this fails to compile, do we need it?
+// clang-format on
+
+#include <cudaq.h>
+#include <iostream>
+
+
+
+__qpu__ void test_complex_constant_array() {
+   cudaq::qvector v(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+}
+
+__qpu__ void test_complex_constant_array2() {
+   cudaq::qvector v1(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+   cudaq::qvector v2(std::vector<cudaq::complex>({ 0., 0., M_SQRT1_2, M_SQRT1_2}));
+}
+
+__qpu__ void test_complex_constant_array3() {
+   cudaq::qvector v({
+    cudaq::complex(M_SQRT1_2),
+    cudaq::complex(M_SQRT1_2),
+    cudaq::complex(0.0),
+    cudaq::complex(0.0)
+  });
+}
+
+__qpu__ void test_complex_array_param(std::vector<cudaq::complex> inState) {
+  cudaq::qvector q1 = inState;
+}
+
+__qpu__ void test_real_constant_array() {
+  cudaq::qvector v({ M_SQRT1_2, M_SQRT1_2, 0., 0.});
+}
+
+__qpu__ void test_real_array_param(std::vector<cudaq::real> inState) {
+  cudaq::qvector q1 = inState;
+}
+
+__qpu__ void test_double_array_param(std::vector<double> inState) {
+  cudaq::qvector q = inState;
+}
+
+__qpu__ void test_float_array_param(std::vector<float> inState) {
+  cudaq::qvector q = inState;
+}
+
+void printCounts(cudaq::sample_result& result) {
+  std::vector<std::string> values{};
+  for (auto &&[bits, counts] : result) {
+    values.push_back(bits);
+  }
+
+  std::sort(values.begin(), values.end());
+  for (auto &&bits : values) {
+    std::cout << bits << '\n';
+  }
+}
+
+int main() {
+    // {
+    //   auto counts = cudaq::sample(test_complex_constant_array);
+    //   printCounts(counts);
+    // }
+
+    // {
+    //   auto counts = cudaq::sample(test_complex_constant_array2);
+    //   printCounts(counts);
+    // }
+
+    // {
+    //   auto counts = cudaq::sample(test_complex_constant_array3);
+    //   printCounts(counts);
+    // }
+
+    // {
+    //   auto counts = cudaq::sample(test_real_constant_array);
+    //   printCounts(counts);
+    // }
+
+    // {
+      std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+      std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+      {
+          // Passing state data as argument (kernel mode)
+          auto counts = cudaq::sample(test_complex_array_param, vec);
+          printCounts(counts);
+
+          counts = cudaq::sample(test_complex_array_param, vec1);
+          printCounts(counts);
+      }
+
+    //   {
+    //       // Passing state data as argument (builder mode)
+    //       auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
+    //       auto qubits = kernel.qalloc(v);
+
+    //       auto counts = cudaq::sample(kernel, vec);
+    //       printCounts(counts);
+
+    //       counts = cudaq::sample(kernel, vec1);
+    //       printCounts(counts);
+    //   }
+    // }
+
+    // {
+    //   std::vector<cudaq::real> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    //   std::vector<cudaq::real> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+    //   {
+    //       // Passing state data as argument (kernel mode)
+    //       auto counts = cudaq::sample(test_real_array_param, vec);
+    //       printCounts(counts);
+
+    //       counts = cudaq::sample(test_real_array_param, vec1);
+    //       printCounts(counts);
+    //   }
+
+    //   {
+    //       // Passing state data as argument (builder mode)
+    //       auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::real>>();
+    //       auto qubits = kernel.qalloc(v);
+
+    //       auto counts = cudaq::sample(kernel, vec);
+    //       printCounts(counts);
+
+    //       counts = cudaq::sample(kernel, vec1);
+    //       printCounts(counts);
+    //   }
+    // }
+
+    // Error message: "Invalid user-provided state data. Simulator is FP64 but state data is FP32."
+    // {
+    //   std::vector<double> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    //   std::vector<double> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+    //   {
+    //       // Passing state data as argument (kernel mode)
+    //       auto counts = cudaq::sample(test_double_array_param, vec);
+    //       printCounts(counts);
+
+    //       counts = cudaq::sample(test_double_array_param, vec1);
+    //       printCounts(counts);
+    //   }
+    // }
+
+    // UCX  ERROR Failed to allocate memory pool (name=mm_recv_desc) chunk: Out of memory
+    // {
+    //   std::vector<float> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    //   std::vector<float> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+    //   {
+    //       // Passing state data as argument (kernel mode)
+    //       auto counts = cudaq::sample(test_float_array_param, vec);
+    //       printCounts(counts);
+
+    //       counts = cudaq::sample(test_float_array_param, vec1);
+    //       printCounts(counts);
+    //   }
+    // }
+}
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 0001
+// CHECK: 0011
+// CHECK: 1001
+// CHECK: 1011
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 01
+// CHECK: 11
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 01
+// CHECK: 11
+
+// CHECK: 00
+// CHECK: 10
+
+// CHECK: 01
+// CHECK: 11

From 794f564d1ab5e9c98b2728008dffc51f90a7da1f Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 11 Jul 2024 16:32:25 -0700
Subject: [PATCH 20/50] Remove files added by mistake

---
 dictionary.dic | Bin 9936 -> 0 bytes
 program.py     |  24 ------------------------
 2 files changed, 24 deletions(-)
 delete mode 100644 dictionary.dic
 delete mode 100644 program.py

diff --git a/dictionary.dic b/dictionary.dic
deleted file mode 100644
index d5b1930b6ba86d2e1409b65c5d6b65c39e30fa80..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 9936
zcmai)4UlA4Rmbm6_v^2j>G|yW+S$!wAc4h@K={x|h0Mp!&ZIx4x8K{{*;K^zbidi{
zOi%Z0zwXUWL?B=hh)N_tETw{=f&no^1PZ926kjT)FhLY4wTz*PhzdzzSph8zfB*Yl
z?@mz5uA2S7cka38oO|xM=bn4tR)bx?**sGBx2nBn_lSM-J4ZV0JDNvc`jVGkXa1jG
zsrtzO;CL9yk*h-T=LU?qEoICThmE=VfH8l;pMHWEe@Xu0{EhI}{~hsLX7^mX<#!Dx
z^r;~pjb4wggffoijX{MuS_owwEgF;7d&!uLux!k@uwu-#&^2a8=ovEyE+Plh%p!8G
z5F+P@5dJHL@IPM&{|mqs<X<gB?hA$Ry+}y=i@{aeuMyIItq}TkAmYu-gf|&;y>QW(
zmkVilr4YS02<iW7A@<)WM9zW`J;#OEvnE6?T}%NvJemUII4MNlX(9d32;sj4EJD9k
zNT=5dk$anv_L>kOzHq~sEwF^VvqJhcg!H>zh<)2a_;<iE`ddQy+d}wvh47yjq9+hy
ze^-dUJ+K1*&kNyyoe=(C5MuZ1h48&Wh&^|KE_&W1q~F~_`n^?1zk7t(^>!ise^rRy
zcL=fbogiTD7b54^gvj}IA##30h@1z6$a$X-dw)xa9ltF^zz2oc^B{Pd{vQ(3|HDG~
zKO%(xqeA!}1~=gQ1F-))%0q@f#cxhX|L0lEo0nKDnxC;)GOHF{^TGr0Vc($vVbQ!$
zSS*>>2$>+)V!=FXv1ks*L@%44v{*66EV{<C=$SvUxMAMH!qfWrq{S8Ud5b0Umlj=f
zqoa1u6oeJeTomHqyDfU={X)k1kj1?Dw8gS{)MCYaA6#6?o5Kl^fWAgZV7eAPQ@6Nb
z_JqiPs}T9`7cwyqTXfCm!4<~)jF13}C&P6&X|ZIUFZ4>LY_Vc$;3@&%vsf^{B&6SG
zg-q~QESAmpLELXfQ{YXD=1L(6;ChRmSrWofvsg6eEtbq(7F~0{ka_ul#YOWeA$p&%
zST;`!OC__O7P{sSEqdlLA%1@~fskjR_!cYXZ5CbgHH)5k#^M=s)gXMt*9{h3b6iNA
zowArWn-)tZ5HcTkSzI*lwwN~$S}d7AvRF2cTXfA67CrMliyJ08M7|*L{e*C(U|wah
zXpRdp_@u>(xkE_Yy-SE4pR~AWz6dJMJZ^Et{JpSX%=ayN=0`%tJvS_L%~cjR%=JPR
z_@YJ6tP7d{pA~MD%nn#0j_$GOnFobT%%_BfiusZdMSpG4HO~lH;PYAX&0@)1D<pB>
zBqYI}5@Of3Mc3>JvFmNZ%A&d7V%|Jtv0(noV$tN;q{@}Dxm`#Cy5FK_J_uIu*C&L;
z`Im&qe^N-Id|HT~a-%}eJWog-Sq5D;pt8lh@huk3=$PoPd4WaG+#_Uz`KFM7`?f{b
zjF0n<o)=jxnpaqK&9X($RD?MEHj5Q=QHVXiEW`m1SS*__3Gw&8S*#c{p?A+*1rko?
zM&Tm+j>UpGDP+7oA#w2*ixu-ui>~>Ckd5eD7G1M3srL<YyO4GKE{kRJu#k2CWg-52
z(qhG|kr+MvbeoWQyu)I}+#}q`n+L$tD=X$9A@TO77FW#Ig={QOS@g^gEpC{hDd=v+
zTqz`>zY^TQpKIX%^~)SMYMi0ig5NQN10?J0P^_`FWrhYw*vV{sy|T1mhGIvF!sKvl
z{lt=eDVpSHZ0$tR%p{KHiCE8MX9xYpS-*9`uXWoUGnWkMuF1{}?Q00kl%tk{$xdfN
zdbQc}BRQ<G>|7*A<iH$G!ClA~nau3`NVT@zqF(dsXZ;<&)eU>OR0MtPLA!Z@fmrR(
z5jPuZ4;unAEqgWP*_lv%(5-e+FLNPZ)^F*7nM1#u_ZDT@^q_yyul2gs%_f%R7>CQ6
z%+98(y^D=zv)b7+htd$srP78p92)j7?o!;fx(#&H&0*VEaTg0}%+lELgP^h5Xg0cg
z=Abl{D+>SY^iaR?f|@4cG|7NewN|5x{3~wP+aTFyr?9KmU>uEhD=?#C_DnM8oLpLR
zmqU}hQmFMHCUyPRC8asW?RphbN_H$oQ^((_Hce)LO*lI`z~fYseV7#J8fPlK(A=pO
z8m-<%Gm&6hmYVEXvbxi5oejEXHl{5)JDBua=X*YL>WszL+N~32DnUb$?KL}-?zL*|
zo!uTTpHGEU(q49ExKFo(Fq3iGpiiemS&i0NGo6H1v4Jp?ZnSEcst~|!AzAE0m+0y-
zE-CnDG8I`Am`UeoslY}Bt<h>WT9(by>`*L?nHpv&MUx$m`>o!Nf}>RSiW+Thu-9tu
zb{jj5J7LShMgWXDy_P0l`p8n*sbsZPQ$W};x?X8RV>~q6Xc1I47?>zU0}-b-Oq|O;
zr!g?&809J)Clb|GQ0?AnCh;N^&37s!5rp%Y=B-qyX|o3r(e*o9ey6?T^I+ZXmB^yj
zjc7jGgsQ=aikw^q2Zm%6VmzfF#`jvp62qCrvtFg-%7t?ogm<v9Q47owEcYtq$_Ax!
zc7D{VZSVRWJ9oqkD}#0^V7eu%fu-`iQs8AGfjtdzL#E6OG8Zh9CVs_6h6-5{n9)o0
zKnbc$Qi?5<*A(Yvt(CDvZ}&o*fMNt?0??UEuJ8J-g{38vONc_j>`bOaJF&p?AqOIR
zg_B8$rl$$pqlnN6H8O`0$>lL55&z>>hI`UIrubl2cShqFVl#ol1K*|lv0=S0BGk#u
zC2F9MkrO5yyn+K}la!eoIGN=&v=HpIYUeubR=XD{%W4&qm;n|fow5U*6A272uWUG;
ztk!RD<9rR=)9RXlQr%Vd#bzkdor*|y0=x<}UpAL&5FScm%7)B|lJV4_C3YJZ{N|p?
z#;ImqlN<q$v;D>?YOR7zGO*vOqd*iCjHe<Ji6^F#7b`Z~?WSLCG3AQ|GAE?;X0Oq#
zTa{V3Ht?^r*lgF%5suWuBod_O1~)tHYQ0tslq}(+KxS=iwgXnnSe!Z;rpe@Hqg!j&
zeZqT@t&>v};SDWNcq&VTR7}fdE|CLLD0Ma2X)?_M*<t-PYGx!(jg9BC&Pw_CLSRM)
zIO7sNEA1X3+aY{bC~7qDsnizJLgK8G5JRGzo9<J!e5lKjKat`*-?ie$AYmS%w9@W$
zJ2)igP^@dYPiM}xciQZhn(3LuYO&yvl$_bY)#^^8sqS#kCPTW7jeQM@4HfyUgVpj9
zBcsyy6ymGQ^-i@@$MGW3ZzA2N1vqroWATi@4hjh2<S?vy7(al(RQ$xk`hxcP6KfSS
z64Oz^8BH8-bXR+udS^0|qlu;U#WRHk`;Ko$o%1~$&3eq^dXpVV*!RE;4^T5WBhHCl
zb=wy=1*tQh$XCyjps+Dd1QktuB3<jj((00FvoP5SIk8l&V$=ZHE18Tfp^ecl6<DD0
zd_f^KnW|OEunJ-%K|(W<BAQ04^NBO&EET1F%3%Z2!kf-$|EZFzSSB+GOQq|~;AOIk
z1n6v-&@FpAu!`oAAsI1q!{Qe`+`Nw_IR4nAIyzLVwlx1Xmywo-_rG_{!DPsZ()s5!
zsn!Xbv|Ah*-PbML6z9__Q7S4%7zYNhopgG&Zo{tRm{uq=xKvsv7#MfERj+E?L@<7z
z&R8F_dE;irM=n+sl4E=%_I8t+O1`ygdNOrh^Fb<>Zy~`rp6=7?oXRPMH7(_n#zAAR
zyr<XWyaFdUU?_rk8;j}CN2aQ^d%N0v#^VqeIkOj1@U@et=!vFax6_X5o7sdV<B8d!
zT05d^fiq{~r^D3chIS_~hfqfBU{CpEAjBRp8^@Bsnu1>@k~jBu_w4S7UXME^Czpvx
zh3Y1W4=VPLA_bg)>5SYxhQYggHVTFYIH6J)A75Q!yNu=8=*S@XPPM1~Nm(<?@^Q5u
zC>*tZPBemT7RYccPew})$8UDmOJ-t#M$OmBWgFZ#uX%Eh$7zV9stmVT4SbV}<vFip
z(@w2dub#KExtI|nc_a>F&!e0pmaznb?Q-)PWhwKJl#OruT{0VQOrn-*JzT1`)E#4~
z@~-Tdr<BAO>7{C?>jz3dlMW|j4Q?V`Xl!=EU3?^_<8pR1(Y#>G_F>*R3uNPk!YPh(
zF&&=Sk(=%Ic5j!Y%{x})k|9C$N+Dmlsi>l4cBI)pt6lKijus2M4&9YP>E38<6D@SN
z_1kDNBq*gQdPbaPI|%kjg9XmgY>|S&0oHqRBv$0OGU*iaE8-ka@38u}8miZa2B>nA
z<MCpF+%}ga<=#}0Q>KP@xDUpRj$f^_3J_!iSmnS_Os99ES&-%54OBR^EUl_d+Gzvy
zFLON-FYbW=dm0JJ*`d<C$f!Bovux2eJF??<YUfy9{nRlM(}_Ph>Xc3?>t^G;D!Ipb
zUt;D;6gec_(y6uOHJ!wCAAr`<@g*FlO9A%8Rxdn$%qF-UaFsu1hg<CqwS42wFnQ#h
zQqk2QrqZqIjvwq+YuZ9B5o#`TiK@MfnvxlsNVVu}3xd&jdC?XfiQT=;2HRlH;hsU1
z=?s<MmUK~xOeVN>P!4g%Qsup~D$q4)Bmn}m%)NVq0VxAGBMJMeX(3;5QYi1?@96}$
z64p;MgRJ?@zK=vF_o8kl!=b5vgqTf+WaxA7_vxIpQZ!H%E6(Oek#x3G-PvI<VW3Sr
z<w`;Efflv`lAr2MGlv$)c<<owWxA5iJc?+MV-Js9ZVFv1Wku>;wW17xw*KyWegb7>
zd1^9=iBz0*au_6=^rnYIyOi}t;@q~V?sCfO?i%-0u`=PLS9guwYb2)2muxQ430L2I
zic6T4yshe{vl@SEt8um$)^<Y!+|01uS5}V|m{KYkPBatAPLIsa{yX7_f)B=05lQKb
z^`4xHRon%x#<;*?vtf<soEM)R?f9<UZ+acS?r~NWyVe8jt1LP0ckGUm4tkqg?Pi??
zEMj=lNI|SMtAVN=OT$+)gMFec+(J4AO=tRaR_%01wt;quTo1`M{$?$Ao7u&&^C^UL
zh}Kjb8iv1rjHZG-l<v2oV=nugMpeh|Vv(zHPV=<vwCg=83w3exm9pJ&=Or(wbsD?&
z0)P{E)F|6Dl(3BqGKDOq{%9nPPOV4M)kvVCe98_yBx|*~i)3!51{vMGU2;DCy)r|{
z$recRj|^0y#@-q6k+v-jMdN*1odls91ZKrOwX9BIxs*!<gudOsutPL@F#(nsmM~M$
z-2zjcoyY|K#h$HSlnyxqk#A1xWE*q2lqVeB!pTaJqzAt;m0LLG+BXz4oWikX>TvGk
z3U=?xWrDpOwtp&>8as_!B*>_pgC29%(YhOrb62OFkys*Oth-_=%Rq*@%iV~!YeoWz
zj??KrEil(4-Ex_A=bGGj*6opqD9Jc3>KY_F!xTS<f@j!D<(cI?$0!@!3^ox~QCw+Q
zifl`UBpZEP%V{bn2JMAVbi0)~z(|K;%j;|ShYLf5dW&o33WrGH-Xb@Y^5pB|eBNVj
zY`jqHldqLmow20X-eQR~m@M5Ya+T&SmC43lbiXm3uKWG_joD;KB^;)PE|C=x!*o`z
zh%FzdB5-&?wKJYvZeQG!wp^U6Nlc&0)O~VXyQYh-X-7o)aqu!#RhY;y?<bR}M7G-r
zsv{Y?yO>Q_I$~yq_cu^yp;R$6p6qf83^=e(LXn7ES`@avh7Llase9%UF`zW&&Xh|9
zx*yEeNm5)lv|Gd>2jyJCBc7~<>8?*W*&8)_RQj1WOBt(B>F3?KBsZ-qHaE@=)f+*r
z&COu60%)no2{RcrbbMRu;%Q4p{oLSXy0S{xpi#_WSkxoT{!@<UR>Hlc?gyw3>wYFP
z3a}jR595g{Z9Gmqv5+QG^=h}e)okBkNjfHur<Q%H1<sj@6E1(99B~qyYizeU3UUiv
zvNMqzx}$o*-{QVZaZdPgFHFUG>clZ5ou=$j#NE;EY+K=DF<ld*j<Gkkbf|@i1Gm=J
z1y*0R&YGNZPKS3nQ-hW!!Cd3O>Du#a>Qwd42Itp?NcIM5T=H1XB<Qi2E|{5bEZwq&
zbsPe|9ZE%%;D5h)euQr%2e=21@$KwKKlXcs?{*^H%lA3q3BC^r^}CgR*KyzeUHYD+
z?-%*D6}DXdca!_>>!LnC;M<g1w)gRUhqhCEJ2}I5h|9ggr?0{HQs^J#8>1-8QNKC9
zO<s%6K7R|Ik2AD;`KHvTo#y+;H~8kI_YCrHWgL7?xYRECo9Q<f$^6$yM<*Ju+V#st
zHtYEoz6HLH?;zh`+<LzjpL_?IcSgRs3H>|p{0cr%`;G8@75;Seejm?Yfp?zw_c8|Y
zTtM%=eCv5N-$C^LHS~X)=RAL(f&XsCA^IqCh5E&j@km79!2Z?97qjpW_vetY-Gc4E
z!}kdF$?$$B_Wd!w(EC06#`X4S>{mwk=h*#C=4Eo9Fa9o)qrT7J*9Z3b`f$|tTE_JX
z?EE@5j7PGr;{D=2U3W+QZorO@!*@+&kL31$Z$kgG=>N@VT({GwfGzXT1|q+GZ=XHt
z_q+Jy31mGVU+Vqc(Y)&UF?_j$%&UKF{?+ymQH<z0kL<_bxedL~;@?+9{M(UF?*lJJ
z_P!W->hrhJ+|Pn*QQP(K-ALSj9Dn%C$$#?xWMsGc+#C7zQD}$o&8>{@1MvCuZ(z?a
z%D?oz9v_w>JvT%?Jb+xu_%OcsI<z%V@Bb0SkDm1#LB<j?-hl2;F$a(G_Zh}@S7gU)
zXkVxOU-;XIWT@X1`aXcHpM&;jG@bxoJWgN0>Js05QJ;~>-=gXFSJ?dfFn%7rhrwKA
z??K{E?MGw}<9#`H9OHQ#T}Pv_y`DKs&}S`@nS!2y_XpVY;z-Uv@6%U_V&JFg^KJTm
zVxNzH7}+kJk1#h!cz+6cZ>0az_*!)Ru0X!<S^B;V`nS>bMAW{`cwfL=J&yd3Kz~&<
zZ~w%*{{P1BMRu$I8yUxg8XNN8h_4&8eT=`#KA*jcw$I`BwJ2s@g3X`BU*%|Qe}gU8
zB4-WSSC|(cl<fD>NBC2WcZfcE?l2$sMdNp{?Yr3a0Qzp`FOCfHEHX!5jd=8~-$(Jo
zlg!2Yu;quydI#^XkLL3IJg;KA_})Ojry}1x#kk%H{m-H~d^)mUeAh-iS4TF!2cBdk
z?=f`zAd)9u{r-`*GuZcO<i8o;B*2f+r;h(0g!c<k-+xE$d-1dS%)|fDNY8s1Yl->!
zCjL>|&5=!d{?<NS?}~hI2D|RYSGQryLyWf>$ru0YkWq@{&cGwudHB8z?Uu;*qTP;-
VAByJwAU5kik$nz7`}je>{|1;FG}!<E

diff --git a/program.py b/program.py
deleted file mode 100644
index 0c5b92f20e..0000000000
--- a/program.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-import cudaq
-import numpy as np
-
-cudaq.set_target('iqm', url="http://localhost/cocos", **{"qpu-architecture": "Adonis"})
-
-@cudaq.kernel
-def kernel(vec: list[complex]):
-    qubits = cudaq.qvector(vec)
-
-state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.]
-counts = cudaq.sample(kernel, state)
-print(counts)
-assert '00' in counts
-assert '10' in counts
-assert not '01' in counts
-assert not '11' in counts
\ No newline at end of file

From ca7b683b43fc66728f835fb772c04642e8fef017 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 12 Jul 2024 09:19:57 -0700
Subject: [PATCH 21/50] Update lib/Optimizer/Transforms/LiftArrayAlloc.cpp

Co-authored-by: Ben Howe <141149032+bmhowe23@users.noreply.github.com>
---
 lib/Optimizer/Transforms/LiftArrayAlloc.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
index d541edcacb..53150aadcb 100644
--- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
+++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
@@ -96,9 +96,8 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
                                 PatternRewriter &rewriter) const override {
     SmallVector<Operation *> stores;
     bool toGlobal = false;
-    if (!isGoodCandidate(alloc, stores, dom, toGlobal)) {
+    if (!isGoodCandidate(alloc, stores, dom, toGlobal))
       return failure();
-    }
 
     LLVM_DEBUG(llvm::dbgs() << "Candidate was found\n");
     auto arrTy = cast<cudaq::cc::ArrayType>(alloc.getType().getElementType());

From bdf119e95da6f06f65913645f54aec937bb2b1b0 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 12 Jul 2024 09:22:12 -0700
Subject: [PATCH 22/50] Update runtime/common/BaseRemoteRESTQPU.h

Co-authored-by: Ben Howe <141149032+bmhowe23@users.noreply.github.com>
---
 runtime/common/BaseRemoteRESTQPU.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index c6c88c4d0c..381c94312e 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -382,11 +382,9 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     moduleOp.push_back(func.clone());
     moduleOp->setAttrs(m_module->getAttrDictionary());
 
-    for (auto &op : m_module.getOps()) {
-      if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op)) {
+    for (auto &op : m_module.getOps())
+      if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op))
         moduleOp.push_back(globalOp.clone());
-      }
-    }
 
     // Lambda to apply a specific pipeline to the given ModuleOp
     auto runPassPipeline = [&](const std::string &pipeline,

From 2c88a78544316e9a60a3c4f31155ea9a0d977476 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 12 Jul 2024 09:36:52 -0700
Subject: [PATCH 23/50] Update lib/Optimizer/Transforms/LiftArrayAlloc.cpp

Co-authored-by: Ben Howe <141149032+bmhowe23@users.noreply.github.com>
---
 lib/Optimizer/Transforms/LiftArrayAlloc.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
index 53150aadcb..20d9ae04dc 100644
--- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
+++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
@@ -165,9 +165,8 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
               ArrayRef<cudaq::cc::ExtractValueArg>{offset});
           continue;
         }
-        if (isa<cudaq::cc::StoreOp>(useuser)) {
+        if (isa<cudaq::cc::StoreOp>(useuser))
           toErase.push_back(useuser);
-        }
         isLive = true;
       }
       if (!isLive)

From 2d9e957540aecf58e4552e62ccf4c78f87d49a52 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 12 Jul 2024 16:03:42 -0700
Subject: [PATCH 24/50] Addressed CR comments, added tests

---
 include/cudaq/Optimizer/Transforms/Passes.h   |   2 -
 include/cudaq/Optimizer/Transforms/Passes.td  |  12 +-
 lib/Optimizer/Transforms/CMakeLists.txt       |   3 +-
 lib/Optimizer/Transforms/ConstPropComplex.cpp | 210 ++++++++++++++++++
 lib/Optimizer/Transforms/LiftArrayAlloc.cpp   | 134 +----------
 lib/Optimizer/Transforms/StatePreparation.cpp |  31 ++-
 .../cudaq/platform/py_alt_launch_kernel.cpp   |   5 +-
 .../default/rest/helpers/ionq/ionq.config     |   2 +-
 .../default/rest/helpers/iqm/iqm.config       |   2 +-
 .../default/rest/helpers/oqc/oqc.config       |   2 +-
 .../rest/helpers/quantinuum/quantinuum.config |   2 +-
 test/Quake/const_prop_complex.qke             | 133 +++++++++++
 test/Quake/lift_array.qke                     |  98 ++++----
 13 files changed, 433 insertions(+), 203 deletions(-)
 create mode 100644 lib/Optimizer/Transforms/ConstPropComplex.cpp
 create mode 100644 test/Quake/const_prop_complex.qke

diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index 57b79cdec2..996b6e56a7 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -36,12 +36,10 @@ createApplyOpSpecializationPass(bool computeActionOpt);
 std::unique_ptr<mlir::Pass> createDelayMeasurementsPass();
 std::unique_ptr<mlir::Pass> createExpandMeasurementsPass();
 std::unique_ptr<mlir::Pass> createLambdaLiftingPass();
-std::unique_ptr<mlir::Pass> createLiftArrayAllocPass();
 std::unique_ptr<mlir::Pass> createLowerToCFGPass();
 std::unique_ptr<mlir::Pass> createObserveAnsatzPass(std::vector<bool> &);
 std::unique_ptr<mlir::Pass> createQuakeAddMetadata();
 std::unique_ptr<mlir::Pass> createQuakeAddDeallocs();
-std::unique_ptr<mlir::Pass> createStatePreparation();
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer();
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, void *);
 std::unique_ptr<mlir::Pass> createRaiseToAffinePass();
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 1a2675d482..2f6a951551 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -118,6 +118,14 @@ def CombineQuantumAllocations :
   let dependentDialects = ["cudaq::cc::CCDialect", "quake::QuakeDialect"];
 }
 
+def ConstPropComplex : Pass<"const-prop-complex", "mlir::ModuleOp"> {
+  let summary = "Create and propagate complex constants.";
+  let description = [{
+    Rewrite the complex.CreateOp to complex.ConstantOp when possible.
+    Replace array pointer casts with element pointer casts.
+  }];
+}
+
 def ConvertToDirectCalls :
     Pass<"indirect-to-direct-calls", "mlir::func::FuncOp"> {
   let summary = "Convert calls to direct calls to Quake routines.";
@@ -532,7 +540,7 @@ def ObserveAnsatz : Pass<"observe-ansatz", "mlir::func::FuncOp"> {
   ];
 }
 
-def PrepareState : Pass<"state-prep", "mlir::ModuleOp"> {
+def StatePreparation : Pass<"state-prep", "mlir::ModuleOp"> {
   let summary =
     "Convert state vector data into gates";
   let description = [{
@@ -574,8 +582,6 @@ def PrepareState : Pass<"state-prep", "mlir::ModuleOp"> {
     }
     ```
   }];
-
-  let constructor = "cudaq::opt::createStatePreparation()";
 }
 
 def PromoteRefToVeqAlloc : Pass<"promote-qubit-allocation"> {
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index ac79e8dd03..78157ee94b 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -17,6 +17,7 @@ add_cudaq_library(OptTransforms
   ApplyOpSpecialization.cpp
   BasisConversion.cpp
   CombineQuantumAlloc.cpp
+  ConstPropComplex.cpp
   Decomposition.cpp
   DecompositionPatterns.cpp
   DelayMeasurements.cpp
@@ -40,13 +41,13 @@ add_cudaq_library(OptTransforms
   MultiControlDecomposition.cpp
   ObserveAnsatz.cpp
   PruneCtrlRelations.cpp
+  PySynthCallableBlockArgs.cpp
   QuakeAddMetadata.cpp
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
   RegToMem.cpp
   StateDecomposer.cpp
   StatePreparation.cpp
-  PySynthCallableBlockArgs.cpp
 
   DEPENDS
     OptTransformsPassIncGen
diff --git a/lib/Optimizer/Transforms/ConstPropComplex.cpp b/lib/Optimizer/Transforms/ConstPropComplex.cpp
new file mode 100644
index 0000000000..aa06f044e5
--- /dev/null
+++ b/lib/Optimizer/Transforms/ConstPropComplex.cpp
@@ -0,0 +1,210 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "PassDetails.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/Passes.h"
+
+namespace cudaq::opt {
+#define GEN_PASS_DEF_CONSTPROPCOMPLEX
+#include "cudaq/Optimizer/Transforms/Passes.h.inc"
+} // namespace cudaq::opt
+
+#define DEBUG_TYPE "const-prop-complex"
+
+using namespace mlir;
+
+namespace {
+
+// Replace array ptr casts that throw away the size by a cast to element
+// pointer.
+//
+//%1 = cc.cast %0 : (!cc.ptr<!cc.array<complex<f32> x 4>>) ->
+//!cc.ptr<!cc.array<complex<f32> x ?>>
+// ->
+//%1 = cc.cast %0 : (!cc.ptr<!cc.array<complex<f32> x 4>>) ->
+//!cc.ptr<complex<f32>>
+class CastArrayPtrPattern : public OpRewritePattern<cudaq::cc::CastOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(cudaq::cc::CastOp cast,
+                                PatternRewriter &rewriter) const override {
+
+    auto fromTy = cast.getOperand().getType();
+    auto toTy = cast.getType();
+
+    if (auto ptrFromTy = dyn_cast<cudaq::cc::PointerType>(fromTy)) {
+      if (auto arrayFromTy =
+              dyn_cast<cudaq::cc::ArrayType>(ptrFromTy.getElementType())) {
+        if (auto ptrToTy = dyn_cast<cudaq::cc::PointerType>(toTy)) {
+          if (auto arrayToTy =
+                  dyn_cast<cudaq::cc::ArrayType>(ptrToTy.getElementType())) {
+            if (arrayFromTy.getElementType() == arrayToTy.getElementType()) {
+              auto eleTy = arrayFromTy.getElementType();
+              auto elePtrType = cudaq::cc::PointerType::get(eleTy);
+              rewriter.replaceOpWithNewOp<cudaq::cc::CastOp>(cast, elePtrType,
+                                                             cast.getOperand());
+              return success();
+            }
+          }
+        }
+      }
+    }
+    return failure();
+  }
+};
+
+// Fold complex.create ops if the arguments are constants.
+class ComplexCreatePattern : public OpRewritePattern<complex::CreateOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(complex::CreateOp create,
+                                PatternRewriter &rewriter) const override {
+    auto re = create.getReal();
+    auto im = create.getImaginary();
+    auto reCon = re.getDefiningOp<arith::ConstantOp>();
+    auto imCon = im.getDefiningOp<arith::ConstantOp>();
+    if (reCon && imCon) {
+      auto aa = ArrayAttr::get(
+          rewriter.getContext(),
+          ArrayRef<Attribute>{reCon.getValue(), imCon.getValue()});
+      rewriter.replaceOpWithNewOp<complex::ConstantOp>(create, create.getType(),
+                                                       aa);
+      return success();
+    }
+    return failure();
+  }
+};
+
+// Fold arith.trunc ops if the argument is constant.
+class FloatTruncatePattern : public OpRewritePattern<arith::TruncFOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(arith::TruncFOp truncate,
+                                PatternRewriter &rewriter) const override {
+    auto val = truncate.getOperand();
+    auto valCon = val.getDefiningOp<arith::ConstantFloatOp>();
+    if (valCon) {
+      auto v = valCon.value().convertToDouble();
+      auto fTy = dyn_cast<FloatType>(truncate.getType());
+      rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(
+          truncate, APFloat{static_cast<float>(v)}, fTy);
+      return success();
+    }
+    return failure();
+  }
+};
+
+// Fold arith.ext ops if the argument is constant.
+class FloatExtendPattern : public OpRewritePattern<arith::ExtFOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(arith::ExtFOp extend,
+                                PatternRewriter &rewriter) const override {
+    auto val = extend.getOperand();
+    auto valCon = val.getDefiningOp<arith::ConstantFloatOp>();
+    if (valCon) {
+      auto v = valCon.value().convertToFloat();
+      auto fTy = dyn_cast<FloatType>(extend.getType());
+      rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(
+          extend, APFloat{static_cast<double>(v)}, fTy);
+      return success();
+    }
+    return failure();
+  }
+};
+
+// Fold complex.re ops if the argument is constant.
+class ComplexRePattern : public OpRewritePattern<complex::ReOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(complex::ReOp re,
+                                PatternRewriter &rewriter) const override {
+    auto val = re.getOperand();
+    auto valCon = val.getDefiningOp<complex::ConstantOp>();
+    if (valCon) {
+      auto attr = valCon.getValue();
+      auto real = cast<FloatAttr>(attr[0]).getValue();
+      auto fTy = dyn_cast<FloatType>(re.getType());
+      auto v = fTy.isF64() ? real.convertToDouble() : real.convertToFloat();
+      rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(re, APFloat{v}, fTy);
+      return success();
+    }
+    return failure();
+  }
+};
+
+// Fold complex.im ops if the argument is constant.
+class ComplexImPattern : public OpRewritePattern<complex::ImOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(complex::ImOp im,
+                                PatternRewriter &rewriter) const override {
+    auto val = im.getOperand();
+    auto valCon = val.getDefiningOp<complex::ConstantOp>();
+    if (valCon) {
+      auto attr = valCon.getValue();
+      auto imag = cast<FloatAttr>(attr[1]).getValue();
+      auto fTy = dyn_cast<FloatType>(im.getType());
+      auto v = fTy.isF64() ? imag.convertToDouble() : imag.convertToFloat();
+      rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(im, APFloat{v}, fTy);
+      return success();
+    }
+    return failure();
+  }
+};
+
+class ConstPropComplexPass
+    : public cudaq::opt::impl::ConstPropComplexBase<ConstPropComplexPass> {
+public:
+  using ConstPropComplexBase::ConstPropComplexBase;
+
+  void runOnOperation() override {
+    auto *ctx = &getContext();
+    auto module = getOperation();
+    for (Operation &op : *module.getBody()) {
+      auto func = dyn_cast<func::FuncOp>(op);
+      if (!func)
+        continue;
+      DominanceInfo domInfo(func);
+      std::string funcName = func.getName().str();
+      RewritePatternSet patterns(ctx);
+      patterns.insert<ComplexCreatePattern>(ctx);
+      patterns.insert<FloatExtendPattern>(ctx);
+      patterns.insert<FloatTruncatePattern>(ctx);
+      patterns.insert<ComplexRePattern>(ctx);
+      patterns.insert<ComplexImPattern>(ctx);
+      patterns.insert<CastArrayPtrPattern>(ctx);
+
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Before lifting constant array: " << func << '\n');
+
+      if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
+                                              std::move(patterns))))
+        signalPassFailure();
+
+      LLVM_DEBUG(llvm::dbgs()
+                 << "After lifting constant array: " << func << '\n');
+    }
+  }
+};
+} // namespace
diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
index 20d9ae04dc..1554acec06 100644
--- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
+++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
@@ -100,7 +100,7 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
       return failure();
 
     LLVM_DEBUG(llvm::dbgs() << "Candidate was found\n");
-    auto arrTy = cast<cudaq::cc::ArrayType>(alloc.getType().getElementType());
+    auto arrTy = cast<cudaq::cc::ArrayType>(alloc.getElementType());
     SmallVector<Attribute> values;
 
     // Every element of `stores` must be a cc::StoreOp with a ConstantOp as the
@@ -217,8 +217,8 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
     if (std::distance(alloc->getUses().begin(), alloc->getUses().end()) < size)
       return false;
 
-    //  Keep a scoreboard for every element in the array. Every element *must*
-    //  be stored to with a constant exactly one time.
+    // Keep a scoreboard for every element in the array. Every element *must*
+    // be stored to with a constant exactly one time.
     scoreboard.resize(size);
     for (int i = 0; i < size; i++)
       scoreboard[i] = nullptr;
@@ -284,19 +284,11 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
               scoreboard[0] = w;
               continue;
             }
-          // can be a cast only used for a quake.init_state or vector init
-          continue;
-        } else {
-          if (getWriteOp(cast, 0)) {
-            LLVM_DEBUG(llvm::dbgs()
-                       << "another cast used in store: " << *op << '\n');
-            return false;
-          }
-          // can be a cast only used for a quake.init_state or vector init
-          toGlobal = true;
-          continue;
+          return false;
         }
         LLVM_DEBUG(llvm::dbgs() << "unexpected cast: " << *op << '\n');
+        toGlobalUses.push_back(op);
+        toGlobal = true;
         continue;
       }
       LLVM_DEBUG(llvm::dbgs() << "unexpected use: " << *op << '\n');
@@ -341,111 +333,6 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
   mutable ModuleOp module;
 };
 
-// Fold complex.create ops if the arguments are constants.
-class ComplexCreatePattern : public OpRewritePattern<complex::CreateOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(complex::CreateOp create,
-                                PatternRewriter &rewriter) const override {
-    auto re = create.getReal();
-    auto im = create.getImaginary();
-    auto reCon = re.getDefiningOp<arith::ConstantOp>();
-    auto imCon = im.getDefiningOp<arith::ConstantOp>();
-    if (reCon && imCon) {
-      auto aa = ArrayAttr::get(
-          rewriter.getContext(),
-          ArrayRef<Attribute>{reCon.getValue(), imCon.getValue()});
-      rewriter.replaceOpWithNewOp<complex::ConstantOp>(create, create.getType(),
-                                                       aa);
-      return success();
-    }
-    return failure();
-  }
-};
-
-// Fold arith.trunc ops if the argument is constant.
-class FloatTruncatePattern : public OpRewritePattern<arith::TruncFOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(arith::TruncFOp truncate,
-                                PatternRewriter &rewriter) const override {
-    auto val = truncate.getOperand();
-    auto valCon = val.getDefiningOp<arith::ConstantFloatOp>();
-    if (valCon) {
-      auto v = valCon.value().convertToDouble();
-      auto fTy = dyn_cast<FloatType>(truncate.getType());
-      rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(
-          truncate, APFloat{static_cast<float>(v)}, fTy);
-      return success();
-    }
-    return failure();
-  }
-};
-
-// Fold arith.ext ops if the argument is constant.
-class FloatExtendPattern : public OpRewritePattern<arith::ExtFOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(arith::ExtFOp extend,
-                                PatternRewriter &rewriter) const override {
-    auto val = extend.getOperand();
-    auto valCon = val.getDefiningOp<arith::ConstantFloatOp>();
-    if (valCon) {
-      auto v = valCon.value().convertToFloat();
-      auto fTy = dyn_cast<FloatType>(extend.getType());
-      rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(
-          extend, APFloat{static_cast<double>(v)}, fTy);
-      return success();
-    }
-    return failure();
-  }
-};
-
-// Fold complex.re ops if the argument is constant.
-class ComplexRePattern : public OpRewritePattern<complex::ReOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(complex::ReOp re,
-                                PatternRewriter &rewriter) const override {
-    auto val = re.getOperand();
-    auto valCon = val.getDefiningOp<complex::ConstantOp>();
-    if (valCon) {
-      auto attr = valCon.getValue();
-      auto real = cast<FloatAttr>(attr[0]).getValue();
-      auto fTy = dyn_cast<FloatType>(re.getType());
-      auto v = fTy.isF64() ? real.convertToDouble() : real.convertToFloat();
-      rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(re, APFloat{v}, fTy);
-      return success();
-    }
-    return failure();
-  }
-};
-
-// Fold complex.im ops if the argument is constant.
-class ComplexImPattern : public OpRewritePattern<complex::ImOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(complex::ImOp im,
-                                PatternRewriter &rewriter) const override {
-    auto val = im.getOperand();
-    auto valCon = val.getDefiningOp<complex::ConstantOp>();
-    if (valCon) {
-      auto attr = valCon.getValue();
-      auto real = cast<FloatAttr>(attr[0]).getValue();
-      auto fTy = dyn_cast<FloatType>(im.getType());
-      auto v = fTy.isF64() ? real.convertToDouble() : real.convertToFloat();
-      rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(im, APFloat{v}, fTy);
-      return success();
-    }
-    return failure();
-  }
-};
-
 class LiftArrayAllocPass
     : public cudaq::opt::impl::LiftArrayAllocBase<LiftArrayAllocPass> {
 public:
@@ -462,11 +349,6 @@ class LiftArrayAllocPass
       std::string funcName = func.getName().str();
       RewritePatternSet patterns(ctx);
       patterns.insert<AllocaPattern>(ctx, domInfo, funcName, module);
-      patterns.insert<ComplexCreatePattern>(ctx);
-      patterns.insert<FloatExtendPattern>(ctx);
-      patterns.insert<FloatTruncatePattern>(ctx);
-      patterns.insert<ComplexRePattern>(ctx);
-      patterns.insert<ComplexImPattern>(ctx);
 
       LLVM_DEBUG(llvm::dbgs()
                  << "Before lifting constant array: " << func << '\n');
@@ -481,7 +363,3 @@ class LiftArrayAllocPass
   }
 };
 } // namespace
-
-std::unique_ptr<mlir::Pass> cudaq::opt::createLiftArrayAllocPass() {
-  return std::make_unique<LiftArrayAllocPass>();
-}
diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
index 8f84623a29..83e60cc734 100644
--- a/lib/Optimizer/Transforms/StatePreparation.cpp
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -23,9 +23,15 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Target/LLVMIR/TypeToLLVM.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/Passes.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include <span>
 
+namespace cudaq::opt {
+#define GEN_PASS_DEF_STATEPREPARATION
+#include "cudaq/Optimizer/Transforms/Passes.h.inc"
+} // namespace cudaq::opt
+
 #define DEBUG_TYPE "state-preparation"
 
 using namespace mlir;
@@ -34,7 +40,7 @@ using namespace mlir;
 /// For example:
 ///
 ///
-/// Before PrepareState (state-prep):
+/// Before StatePreparation (state-prep):
 ///
 /// module {
 ///   func.func @foo() attributes {
@@ -49,7 +55,7 @@ using namespace mlir;
 ///    !cc.array<complex<f32> x 4>
 /// }
 ///
-/// After PrepareState (state-prep):
+/// After StatePreparation (state-prep):
 ///
 /// module {
 ///   func.func @foo() attributes {
@@ -113,6 +119,8 @@ readConstantArray(mlir::OpBuilder &builder, cudaq::cc::GlobalOp &global) {
 LogicalResult transform(ModuleOp module, func::FuncOp funcOp) {
   auto builder = OpBuilder::atBlockBegin(&funcOp.getBody().front());
   auto toErase = std::vector<mlir::Operation *>();
+  auto succeeded = false;
+
   funcOp->walk([&](Operation *op) {
     if (auto initOp = dyn_cast<quake::InitializeStateOp>(op)) {
       toErase.push_back(initOp);
@@ -145,18 +153,24 @@ LogicalResult transform(ModuleOp module, func::FuncOp funcOp) {
             initOp.replaceAllUsesWith(qubits);
             toErase.push_back(addr);
             toErase.push_back(global);
+            succeeded = true;
           }
         }
       }
     }
   });
 
+  if (!succeeded) {
+    funcOp.emitOpError("StatePreparation failed to replace quake.init_state");
+    return failure();
+  }
+
   for (auto &op : toErase) {
     if (op->getUses().empty()) {
       op->erase();
     } else {
-      module.emitOpError("StatePreparation failed to remove quake.init_state "
-                         "or its dependencies.");
+      op->emitOpError("StatePreparation failed to remove quake.init_state "
+                      "or its dependencies.");
       return failure();
     }
   }
@@ -164,10 +178,11 @@ LogicalResult transform(ModuleOp module, func::FuncOp funcOp) {
   return success();
 }
 
-class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
+class StatePreparationPass
+    : public cudaq::opt::impl::StatePreparationBase<StatePreparationPass> {
 protected:
 public:
-  StatePreparation() = default;
+  using StatePreparationBase::StatePreparationBase;
 
   mlir::ModuleOp getModule() { return getOperation(); }
 
@@ -190,7 +205,3 @@ class StatePreparation : public cudaq::opt::PrepareStateBase<StatePreparation> {
 };
 
 } // namespace
-
-std::unique_ptr<mlir::Pass> cudaq::opt::createStatePreparation() {
-  return std::make_unique<StatePreparation>();
-}
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 6d238509ec..6d2afefb6d 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -463,7 +463,7 @@ py::object pyAltLaunchKernelR(const std::string &name, MlirModule module,
 }
 
 /// @brief Helper function to get boolean environment variable
-bool getEnvBool(const char *envName, bool defaultVal = false) {
+static bool getEnvBool(const char *envName, bool defaultVal = false) {
   if (auto envVal = std::getenv(envName)) {
     std::string tmp(envVal);
     std::transform(tmp.begin(), tmp.end(), tmp.begin(),
@@ -499,8 +499,9 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   // in their runtime.
   auto &platform = cudaq::get_platform();
   if (!platform.is_simulator() || platform.is_emulated()) {
+    pm.addPass(cudaq::opt::createConstPropComplex());
     pm.addPass(createCSEPass());
-    pm.addPass(cudaq::opt::createLiftArrayAllocPass());
+    pm.addPass(cudaq::opt::createLiftArrayAlloc());
     pm.addPass(cudaq::opt::createStatePreparation());
   }
   pm.addPass(createCanonicalizerPass());
diff --git a/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.config b/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.config
index c78a2b3e1e..053134b680 100644
--- a/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.config
+++ b/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.config
@@ -16,7 +16,7 @@ GEN_TARGET_BACKEND=true
 LINKLIBS="${LINKLIBS} -lcudaq-rest-qpu"
 
 # Define the lowering pipeline
-PLATFORM_LOWERING_CONFIG="canonicalize,cse,lift-array-value,state-prep,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),ionq-gate-set-mapping"
+PLATFORM_LOWERING_CONFIG="const-prop-complex,canonicalize,cse,lift-array-value,state-prep,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),ionq-gate-set-mapping"
 
 # Tell the rest-qpu that we are generating QIR.
 CODEGEN_EMISSION=qir-base
diff --git a/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.config b/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.config
index 2db0f2b235..3d98fd209e 100644
--- a/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.config
+++ b/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.config
@@ -18,7 +18,7 @@ LINKLIBS="${LINKLIBS} -lcudaq-rest-qpu"
 # Define the lowering pipeline, here we lower to Base QIR
 # Note: the runtime will dynamically substitute %QPU_ARCH% based on
 # qpu-architecture
-PLATFORM_LOWERING_CONFIG="canonicalize,cse,lift-array-value,state-prep,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg,qubit-mapping{device=file(%QPU_ARCH%)},delay-measurements,regtomem),iqm-gate-set-mapping"
+PLATFORM_LOWERING_CONFIG="const-prop-complex,canonicalize,cse,lift-array-value,state-prep,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg,qubit-mapping{device=file(%QPU_ARCH%)},delay-measurements,regtomem),iqm-gate-set-mapping"
 
 # Tell the rest-qpu that we are generating IQM JSON.
 CODEGEN_EMISSION=iqm
diff --git a/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.config b/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.config
index 042fb8dd8d..d447f49cf8 100644
--- a/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.config
+++ b/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.config
@@ -18,7 +18,7 @@ LINKLIBS="${LINKLIBS} -lcudaq-rest-qpu"
 # Define the lowering pipeline. Lucy has an 8-qubit ring topology, so mapping
 # uses ring(8).
 # Toshiko uses a Kagome lattice with 2-3 connectivity per qubit
-PLATFORM_LOWERING_CONFIG="canonicalize,cse,lift-array-value,state-prep,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg,qubit-mapping{device=file(%QPU_ARCH%)},regtomem)"
+PLATFORM_LOWERING_CONFIG="const-prop-complex,canonicalize,cse,lift-array-value,state-prep,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg,qubit-mapping{device=file(%QPU_ARCH%)},regtomem)"
 
 
 # Tell the rest-qpu that we are generating QIR.
diff --git a/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.config b/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.config
index bed7159b28..64452c8759 100644
--- a/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.config
+++ b/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.config
@@ -16,7 +16,7 @@ GEN_TARGET_BACKEND=true
 LINKLIBS="${LINKLIBS} -lcudaq-rest-qpu"
 
 # Define the lowering pipeline, here we lower to Adaptive QIR
-PLATFORM_LOWERING_CONFIG="canonicalize,cse,lift-array-value,state-prep,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping"
+PLATFORM_LOWERING_CONFIG="const-prop-complex,canonicalize,cse,lift-array-value,state-prep,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),quantinuum-gate-set-mapping"
 
 # Tell the rest-qpu that we are generating QIR.
 CODEGEN_EMISSION=qir-adaptive
diff --git a/test/Quake/const_prop_complex.qke b/test/Quake/const_prop_complex.qke
new file mode 100644
index 0000000000..4f0b5215cf
--- /dev/null
+++ b/test/Quake/const_prop_complex.qke
@@ -0,0 +1,133 @@
+// ========================================================================== //
+// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// All rights reserved.                                                       //
+//                                                                            //
+// This source code and the accompanying materials are made available under   //
+// the terms of the Apache License 2.0 which accompanies this distribution.   //
+// ========================================================================== //
+
+// RUN: cudaq-opt -const-prop-complex -cse -lift-array-value %s | FileCheck %s
+
+func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+    %cst = arith.constant 0.000000e+00 : f32
+    %cst_0 = arith.constant 0.70710678118654757 : f64
+    %0 = arith.truncf %cst_0 : f64 to f32
+    %1 = complex.create %0, %cst : complex<f32>
+    %2 = complex.create %cst, %cst : complex<f32>
+    %3 = cc.alloca !cc.array<complex<f32> x 4>
+    %4 = cc.cast %3 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+    cc.store %1, %4 : !cc.ptr<complex<f32>>
+    %5 = cc.compute_ptr %3[1] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+    cc.store %1, %5 : !cc.ptr<complex<f32>>
+    %6 = cc.compute_ptr %3[2] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+    cc.store %2, %6 : !cc.ptr<complex<f32>>
+    %7 = cc.compute_ptr %3[3] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+    cc.store %2, %7 : !cc.ptr<complex<f32>>
+    %8 = quake.alloca !quake.veq<2>
+    %9 = quake.init_state %8, %4 : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
+    return
+  }
+
+// CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = complex.constant [0.707106769 : f32, 0.000000e+00 : f32] : complex<f32>
+// CHECK:           %[[VAL_1:.*]] = complex.constant [0.000000e+00 : f32, 0.000000e+00 : f32] : complex<f32>
+// CHECK:           %[[VAL_2:.*]] = cc.alloca !cc.array<complex<f32> x 4>
+// CHECK:           %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr<complex<f32>>
+// CHECK:           %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_4]] : !cc.ptr<complex<f32>>
+// CHECK:           %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][2] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+// CHECK:           cc.store %[[VAL_1]], %[[VAL_5]] : !cc.ptr<complex<f32>>
+// CHECK:           %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][3] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+// CHECK:           cc.store %[[VAL_1]], %[[VAL_6]] : !cc.ptr<complex<f32>>
+// CHECK:           %[[VAL_7:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_8:.*]] = quake.init_state %[[VAL_7]], %[[VAL_3]] : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
+// CHECK:           return
+// CHECK:           }
+
+
+func.func private @__nvqpp_vectorCopyCtor(%0: !cc.ptr<i8>, %1: i64, %2: i64) -> !cc.ptr<i8>
+
+func.func @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v() -> !cc.stdvec<complex<f64>> attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  %cst = arith.constant -0.70710678118654757 : f64
+  %c16_i64 = arith.constant 16 : i64
+  %c4_i64 = arith.constant 4 : i64
+  %cst_0 = arith.constant 0.70710678118654757 : f64
+  %cst_1 = arith.constant 0.000000e+00 : f64
+  %0 = complex.create %cst_0, %cst_1 : complex<f64>
+  %1 = complex.create %cst_0, %cst_1 : complex<f64>
+  %2 = complex.create %cst_0, %cst_1 : complex<f64>
+  %3 = complex.create %cst, %cst_1 : complex<f64>
+  %4 = cc.alloca !cc.array<complex<f64> x 4>
+  %5 = cc.cast %4 : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+  cc.store %0, %5 : !cc.ptr<complex<f64>>
+  %6 = cc.compute_ptr %4[1] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+  cc.store %1, %6 : !cc.ptr<complex<f64>>
+  %7 = cc.compute_ptr %4[2] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+  cc.store %2, %7 : !cc.ptr<complex<f64>>
+  %8 = cc.compute_ptr %4[3] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+  cc.store %3, %8 : !cc.ptr<complex<f64>>
+  %9 = cc.cast %4 : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<i8>
+  %10 = call @__nvqpp_vectorCopyCtor(%9, %c4_i64, %c16_i64) : (!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
+  %11 = cc.stdvec_init %10, %c4_i64 : (!cc.ptr<i8>, i64) -> !cc.stdvec<complex<f64>>
+  return %11 : !cc.stdvec<complex<f64>>
+}
+
+// CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v() -> !cc.stdvec<complex<f64>> attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = complex.constant [0.70710678118654757, 0.000000e+00] : complex<f64>
+// CHECK:           %[[VAL_1:.*]] = complex.constant [-0.70710678118654757, 0.000000e+00] : complex<f64>
+// CHECK:           %[[VAL_2:.*]] = arith.constant 16 : i64
+// CHECK:           %[[VAL_3:.*]] = arith.constant 4 : i64
+// CHECK:           %[[VAL_4:.*]] = cc.alloca !cc.array<complex<f64> x 4>
+// CHECK:           %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_5]] : !cc.ptr<complex<f64>>
+// CHECK:           %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_4]][1] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_6]] : !cc.ptr<complex<f64>>
+// CHECK:           %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_4]][2] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_7]] : !cc.ptr<complex<f64>>
+// CHECK:           %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_4]][3] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK:           cc.store %[[VAL_1]], %[[VAL_8]] : !cc.ptr<complex<f64>>
+// CHECK:           %[[VAL_9:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<i8>
+// CHECK:           %[[VAL_10:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_9]], %[[VAL_3]], %[[VAL_2]]) : (!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
+// CHECK:           %[[VAL_11:.*]] = cc.stdvec_init %[[VAL_10]], %[[VAL_3]] : (!cc.ptr<i8>, i64) -> !cc.stdvec<complex<f64>>
+// CHECK:           return %[[VAL_11]] : !cc.stdvec<complex<f64>>
+// CHECK:         }
+
+
+func.func @test2() -> !quake.veq<2> {
+  %0 = cc.alloca !cc.array<f64 x 4>
+  %1 = cc.compute_ptr %0[0] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
+  %2 = arith.constant 1.0 : f64
+  cc.store %2, %1 : !cc.ptr<f64>
+  %3 = cc.compute_ptr %0[1] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
+  %4 = arith.constant 2.0 : f64
+  cc.store %4, %3 : !cc.ptr<f64>
+  %5 = cc.compute_ptr %0[2] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
+  %6 = arith.constant 6.0 : f64
+  cc.store %6, %5 : !cc.ptr<f64>
+  %7 = cc.compute_ptr %0[3] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
+  %8 = arith.constant 9.0 : f64
+  cc.store %8, %7 : !cc.ptr<f64>
+  %9 = quake.alloca !quake.veq<2>
+  %10 = quake.init_state %9, %0 : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
+  return %10 : !quake.veq<2>
+}
+
+// CHECK-LABEL:   func.func @test2() -> !quake.veq<2> {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 9.000000e+00 : f64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 6.000000e+00 : f64
+// CHECK:           %[[VAL_2:.*]] = arith.constant 2.000000e+00 : f64
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1.000000e+00 : f64
+// CHECK:           %[[VAL_4:.*]] = cc.alloca !cc.array<f64 x 4>
+// CHECK:           %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_4]][0] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
+// CHECK:           cc.store %[[VAL_3]], %[[VAL_5]] : !cc.ptr<f64>
+// CHECK:           %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_4]][1] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
+// CHECK:           cc.store %[[VAL_2]], %[[VAL_6]] : !cc.ptr<f64>
+// CHECK:           %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_4]][2] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
+// CHECK:           cc.store %[[VAL_1]], %[[VAL_7]] : !cc.ptr<f64>
+// CHECK:           %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_4]][3] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_8]] : !cc.ptr<f64>
+// CHECK:           %[[VAL_9:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_10:.*]] = quake.init_state %[[VAL_9]], %[[VAL_4]] : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
+// CHECK:           return %[[VAL_10]] : !quake.veq<2>
+// CHECK:         }
\ No newline at end of file
diff --git a/test/Quake/lift_array.qke b/test/Quake/lift_array.qke
index a8b9b337b2..73a450d42c 100644
--- a/test/Quake/lift_array.qke
+++ b/test/Quake/lift_array.qke
@@ -9,24 +9,21 @@
 // RUN: cudaq-opt -lift-array-value %s | FileCheck %s
 
 func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %cst_0 = arith.constant 0.70710678118654757 : f64
-    %0 = arith.truncf %cst_0 : f64 to f32
-    %1 = complex.create %0, %cst : complex<f32>
-    %2 = complex.create %cst, %cst : complex<f32>
-    %3 = cc.alloca !cc.array<complex<f32> x 4>
-    %4 = cc.cast %3 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-    cc.store %1, %4 : !cc.ptr<complex<f32>>
-    %5 = cc.compute_ptr %3[1] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-    cc.store %1, %5 : !cc.ptr<complex<f32>>
-    %6 = cc.compute_ptr %3[2] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-    cc.store %2, %6 : !cc.ptr<complex<f32>>
-    %7 = cc.compute_ptr %3[3] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-    cc.store %2, %7 : !cc.ptr<complex<f32>>
-    %8 = quake.alloca !quake.veq<2>
-    %9 = quake.init_state %8, %4 : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
-    return
-  }
+  %cst = complex.constant [0.707106769 : f32, 0.000000e+00 : f32] : complex<f32>
+  %cst_0 = complex.constant [0.000000e+00 : f32, 0.000000e+00 : f32] : complex<f32>
+  %0 = cc.alloca !cc.array<complex<f32> x 4>
+  %1 = cc.cast %0 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+  cc.store %cst, %1 : !cc.ptr<complex<f32>>
+  %2 = cc.compute_ptr %0[1] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+  cc.store %cst, %2 : !cc.ptr<complex<f32>>
+  %3 = cc.compute_ptr %0[2] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+  cc.store %cst_0, %3 : !cc.ptr<complex<f32>>
+  %4 = cc.compute_ptr %0[3] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+  cc.store %cst_0, %4 : !cc.ptr<complex<f32>>
+  %5 = quake.alloca !quake.veq<2>
+  %6 = quake.init_state %5, %1 : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
+  return
+}
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv.rodata_{{[0-9]+}} : !cc.ptr<!cc.array<complex<f32> x 4>>
@@ -36,31 +33,26 @@ func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_compl
 // CHECK:         }
 
 
-func.func private @__nvqpp_vectorCopyCtor(%0: !cc.ptr<i8>, %1: i64, %2: i64) -> !cc.ptr<i8>
+func.func private @__nvqpp_vectorCopyCtor(!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
 
 func.func @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v() -> !cc.stdvec<complex<f64>> attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-  %cst = arith.constant -0.70710678118654757 : f64
+  %cst = complex.constant [0.70710678118654757, 0.000000e+00] : complex<f64>
+  %cst_0 = complex.constant [-0.70710678118654757, 0.000000e+00] : complex<f64>
   %c16_i64 = arith.constant 16 : i64
   %c4_i64 = arith.constant 4 : i64
-  %cst_0 = arith.constant 0.70710678118654757 : f64
-  %cst_1 = arith.constant 0.000000e+00 : f64
-  %0 = complex.create %cst_0, %cst_1 : complex<f64>
-  %1 = complex.create %cst_0, %cst_1 : complex<f64>
-  %2 = complex.create %cst_0, %cst_1 : complex<f64>
-  %3 = complex.create %cst, %cst_1 : complex<f64>
-  %4 = cc.alloca !cc.array<complex<f64> x 4>
-  %5 = cc.cast %4 : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
-  cc.store %0, %5 : !cc.ptr<complex<f64>>
-  %6 = cc.compute_ptr %4[1] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
-  cc.store %1, %6 : !cc.ptr<complex<f64>>
-  %7 = cc.compute_ptr %4[2] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
-  cc.store %2, %7 : !cc.ptr<complex<f64>>
-  %8 = cc.compute_ptr %4[3] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
-  cc.store %3, %8 : !cc.ptr<complex<f64>>
-  %9 = cc.cast %4 : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<i8>
-  %10 = call @__nvqpp_vectorCopyCtor(%9, %c4_i64, %c16_i64) : (!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
-  %11 = cc.stdvec_init %10, %c4_i64 : (!cc.ptr<i8>, i64) -> !cc.stdvec<complex<f64>>
-  return %11 : !cc.stdvec<complex<f64>>
+  %0 = cc.alloca !cc.array<complex<f64> x 4>
+  %1 = cc.cast %0 : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+  cc.store %cst, %1 : !cc.ptr<complex<f64>>
+  %2 = cc.compute_ptr %0[1] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+  cc.store %cst, %2 : !cc.ptr<complex<f64>>
+  %3 = cc.compute_ptr %0[2] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+  cc.store %cst, %3 : !cc.ptr<complex<f64>>
+  %4 = cc.compute_ptr %0[3] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+  cc.store %cst_0, %4 : !cc.ptr<complex<f64>>
+  %5 = cc.cast %0 : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<i8>
+  %6 = call @__nvqpp_vectorCopyCtor(%5, %c4_i64, %c16_i64) : (!cc.ptr<i8>, i64, i64) -> !cc.ptr<i8>
+  %7 = cc.stdvec_init %6, %c4_i64 : (!cc.ptr<i8>, i64) -> !cc.stdvec<complex<f64>>
+  return %7 : !cc.stdvec<complex<f64>>
 }
   
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generator_1v() -> !cc.stdvec<complex<f64>> attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
@@ -74,22 +66,22 @@ func.func @__nvqpp__mlirgen__function_custom_h_generator_1._Z20custom_h_generato
 // CHECK:         }
 
 func.func @test2() -> !quake.veq<2> {
+  %cst = arith.constant 9.000000e+00 : f64
+  %cst_0 = arith.constant 6.000000e+00 : f64
+  %cst_1 = arith.constant 2.000000e+00 : f64
+  %cst_2 = arith.constant 1.000000e+00 : f64
   %0 = cc.alloca !cc.array<f64 x 4>
   %1 = cc.compute_ptr %0[0] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
-  %2 = arith.constant 1.0 : f64
-  cc.store %2, %1 : !cc.ptr<f64>
-  %3 = cc.compute_ptr %0[1] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
-  %4 = arith.constant 2.0 : f64
-  cc.store %4, %3 : !cc.ptr<f64>
-  %5 = cc.compute_ptr %0[2] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
-  %6 = arith.constant 6.0 : f64
-  cc.store %6, %5 : !cc.ptr<f64>
-  %7 = cc.compute_ptr %0[3] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
-  %8 = arith.constant 9.0 : f64
-  cc.store %8, %7 : !cc.ptr<f64>
-  %9 = quake.alloca !quake.veq<2>
-  %10 = quake.init_state %9, %0 : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
-  return %10 : !quake.veq<2>
+  cc.store %cst_2, %1 : !cc.ptr<f64>
+  %2 = cc.compute_ptr %0[1] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
+  cc.store %cst_1, %2 : !cc.ptr<f64>
+  %3 = cc.compute_ptr %0[2] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
+  cc.store %cst_0, %3 : !cc.ptr<f64>
+  %4 = cc.compute_ptr %0[3] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
+  cc.store %cst, %4 : !cc.ptr<f64>
+  %5 = quake.alloca !quake.veq<2>
+  %6 = quake.init_state %5, %0 : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
+  return %6 : !quake.veq<2>
 }
 
 // CHECK-LABEL:   func.func @test2() -> !quake.veq<2> {

From 138145646ceae2bc67e3695bc2de94ef4c1c8cbf Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 12 Jul 2024 18:31:29 -0700
Subject: [PATCH 25/50] Fix failing tests

---
 lib/Optimizer/Transforms/StatePreparation.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
index 83e60cc734..86fad793a5 100644
--- a/lib/Optimizer/Transforms/StatePreparation.cpp
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -119,11 +119,13 @@ readConstantArray(mlir::OpBuilder &builder, cudaq::cc::GlobalOp &global) {
 LogicalResult transform(ModuleOp module, func::FuncOp funcOp) {
   auto builder = OpBuilder::atBlockBegin(&funcOp.getBody().front());
   auto toErase = std::vector<mlir::Operation *>();
-  auto succeeded = false;
+  auto hasInitState = false;
+  auto replacedInitState = false;
 
   funcOp->walk([&](Operation *op) {
     if (auto initOp = dyn_cast<quake::InitializeStateOp>(op)) {
       toErase.push_back(initOp);
+      hasInitState = true;
       auto loc = op->getLoc();
       builder.setInsertionPointAfter(initOp);
       // Find the qvector alloc.
@@ -153,14 +155,14 @@ LogicalResult transform(ModuleOp module, func::FuncOp funcOp) {
             initOp.replaceAllUsesWith(qubits);
             toErase.push_back(addr);
             toErase.push_back(global);
-            succeeded = true;
+            replacedInitState = true;
           }
         }
       }
     }
   });
 
-  if (!succeeded) {
+  if (hasInitState && !replacedInitState) {
     funcOp.emitOpError("StatePreparation failed to replace quake.init_state");
     return failure();
   }

From 907b415e52c6db48c6269b5515c2de53b4f74349 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 12 Jul 2024 20:28:12 -0700
Subject: [PATCH 26/50] Fixed error message

---
 lib/Optimizer/Transforms/LiftArrayAlloc.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
index 1554acec06..2336e6a97d 100644
--- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
+++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
@@ -183,8 +183,8 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
       if (op->getUses().empty()) {
         rewriter.eraseOp(op);
       } else {
-        module.emitOpError("LiftArrayAlloc failed to remove quake.init_state "
-                           "or its dependencies.");
+        op->emitOpError("LiftArrayAlloc failed to remove cc::AllocOp "
+                        "or its uses.");
         return failure();
       }
     }

From 19a164c8483fce031613a664dcc19bf3ba26bd4f Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 12 Jul 2024 21:39:48 -0700
Subject: [PATCH 27/50] Fix failing tests

---
 test/AST-Quake/custom_op_concrete_matrix.cpp | 2 +-
 test/Quake/const_prop_complex.qke            | 2 +-
 tools/nvqpp/nvq++.in                         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/AST-Quake/custom_op_concrete_matrix.cpp b/test/AST-Quake/custom_op_concrete_matrix.cpp
index bb3b697005..3f350a6222 100644
--- a/test/AST-Quake/custom_op_concrete_matrix.cpp
+++ b/test/AST-Quake/custom_op_concrete_matrix.cpp
@@ -6,7 +6,7 @@
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
 
-// RUN: cudaq-quake %cpp_std %s | cudaq-opt -lift-array-value -get-concrete-matrix | FileCheck %s
+// RUN: cudaq-quake %cpp_std %s | cudaq-opt -const-prop-complex -lift-array-value -get-concrete-matrix | FileCheck %s
 
 #include <cudaq.h>
 
diff --git a/test/Quake/const_prop_complex.qke b/test/Quake/const_prop_complex.qke
index 4f0b5215cf..7b75d72ac9 100644
--- a/test/Quake/const_prop_complex.qke
+++ b/test/Quake/const_prop_complex.qke
@@ -6,7 +6,7 @@
 // the terms of the Apache License 2.0 which accompanies this distribution.   //
 // ========================================================================== //
 
-// RUN: cudaq-opt -const-prop-complex -cse -lift-array-value %s | FileCheck %s
+// RUN: cudaq-opt -const-prop-complex %s | FileCheck %s
 
 func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
     %cst = arith.constant 0.000000e+00 : f32
diff --git a/tools/nvqpp/nvq++.in b/tools/nvqpp/nvq++.in
index b1f55b57b1..2197e91ecb 100644
--- a/tools/nvqpp/nvq++.in
+++ b/tools/nvqpp/nvq++.in
@@ -708,7 +708,7 @@ fi
 if ${ENABLE_DEVICE_CODE_LOADERS}; then
 	RUN_OPT=true
 	OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "func.func(quake-add-metadata)")
-	OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "lift-array-value,func.func(get-concrete-matrix),device-code-loader{use-quake=1}")
+	OPT_PASSES=$(add_pass_to_pipeline "${OPT_PASSES}" "const-prop-complex,lift-array-value,func.func(get-concrete-matrix),device-code-loader{use-quake=1}")
 fi
 if ${ENABLE_LOWER_TO_CFG}; then
 	RUN_OPT=true

From 9d9b9e9fd44c779e5958b5c205335aa6e26b55ea Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Mon, 15 Jul 2024 11:05:14 -0700
Subject: [PATCH 28/50] Temp

---
 include/cudaq/Optimizer/Transforms/Passes.h   |  3 +-
 .../Optimizer/Transforms/SimulationData.h     | 50 +++++++++++++++++++
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp |  7 +--
 runtime/common/BaseRemoteRESTQPU.h            | 24 ++++++++-
 4 files changed, 79 insertions(+), 5 deletions(-)
 create mode 100644 include/cudaq/Optimizer/Transforms/SimulationData.h

diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index 996b6e56a7..40ee1465be 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -15,6 +15,7 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassRegistry.h"
+#include "SimulationData.h"
 
 namespace cudaq::opt {
 
@@ -41,7 +42,7 @@ std::unique_ptr<mlir::Pass> createObserveAnsatzPass(std::vector<bool> &);
 std::unique_ptr<mlir::Pass> createQuakeAddMetadata();
 std::unique_ptr<mlir::Pass> createQuakeAddDeallocs();
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer();
-std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, void *);
+std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, SimulationData::getSimulationDataFunc*, void *);
 std::unique_ptr<mlir::Pass> createRaiseToAffinePass();
 std::unique_ptr<mlir::Pass> createUnwindLoweringPass();
 
diff --git a/include/cudaq/Optimizer/Transforms/SimulationData.h b/include/cudaq/Optimizer/Transforms/SimulationData.h
new file mode 100644
index 0000000000..77f5fe37db
--- /dev/null
+++ b/include/cudaq/Optimizer/Transforms/SimulationData.h
@@ -0,0 +1,62 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+#include <cassert>
+#include <complex>
+#include <cstddef>
+#include <numbers>
+#include <vector>
+
+// cudaq::state is defined in the runtime. The compiler will never need to know
+// about its implementation and there should not be a circular build/library
+// dependence because of it. Simply forward declare it, as it is notional.
+namespace cudaq {
+class state;
+}
+
+/// Owns a host buffer of `size` elements of `elementSize` bytes each. The
+/// buffer must have been allocated with `new std::complex<float>[]` or
+/// `new std::complex<double>[]`; it is released when this object is destroyed.
+class SimulationData {
+public:
+  using getSimulationDataFunc = SimulationData(cudaq::state *);
+
+  SimulationData(void *data, std::size_t size, std::size_t elementSize)
+      : data(data), size(size), elementSize(elementSize) {}
+
+  // Move-only: a copy would release the same buffer twice.
+  SimulationData(const SimulationData &) = delete;
+  SimulationData &operator=(const SimulationData &) = delete;
+  SimulationData(SimulationData &&other) noexcept
+      : data(other.data), size(other.size), elementSize(other.elementSize) {
+    other.data = nullptr;
+  }
+
+  ~SimulationData() {
+    // `delete` through a `void *` is undefined, so restore the element type.
+    if (elementSize == sizeof(std::complex<float>))
+      delete[] static_cast<std::complex<float> *>(data);
+    else
+      delete[] static_cast<std::complex<double> *>(data);
+  }
+
+  /// Copy the buffer into a vector; `sizeof(T)` must equal `elementSize`.
+  template <typename T>
+  std::vector<T> toVector() const {
+    assert(sizeof(T) == elementSize && "incorrect element size in simulation data");
+    const T *first = static_cast<const T *>(data);
+    return std::vector<T>(first, first + size);
+  }
+
+private:
+  void *data;
+  std::size_t size;
+  std::size_t elementSize;
+};
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index c657f53ae7..cd7a0cfc9a 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -15,6 +15,7 @@
 #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
 #include "cudaq/Optimizer/Transforms/Passes.h"
+#include "cudaq/Optimizer/Transforms/SimulationData.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -359,7 +360,7 @@ class QuakeSynthesizer
 
 public:
   QuakeSynthesizer() = default;
-  QuakeSynthesizer(std::string_view kernel, void *a)
+  QuakeSynthesizer(std::string_view kernel, SimulationData::getSimulationDataFunc* getData, void *a)
       : kernelName(kernel), args(a) {}
 
   mlir::ModuleOp getModule() { return getOperation(); }
@@ -713,6 +714,6 @@ std::unique_ptr<mlir::Pass> cudaq::opt::createQuakeSynthesizer() {
 }
 
 std::unique_ptr<mlir::Pass>
-cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, void *a) {
-  return std::make_unique<QuakeSynthesizer>(kernelName, a);
+cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, SimulationData::getSimulationDataFunc* getData, void *a) {
+  return std::make_unique<QuakeSynthesizer>(kernelName, getData, a);
 }
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 30445dae45..42005600d2 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -22,6 +22,7 @@
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
 #include "cudaq/Optimizer/Transforms/Passes.h"
+#include "cudaq/Optimizer/Transforms/SimulationData.h"
 #include "cudaq/Support/Plugin.h"
 #include "cudaq/platform/qpu.h"
 #include "cudaq/platform/quantum_platform.h"
@@ -355,6 +356,27 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     return output_names;
   }
 
+  SimulationData readSimulationData(cudaq::state* s) {
+    void *dataPtr = nullptr;
+    void *dataPtr = nullptr;
+    auto stateVector = s->get_tensor();
+    auto precision = s->get_precision();
+    auto numElements = stateVector.get_num_elements();
+    auto elementSize = 0;
+    if (precision == SimulationState::precision::fp32) {
+      elementSize = sizeof(std::complex<float);
+      auto *hostData = new std::complex<float>[numElements];
+      s->to_host(hostData, numElements);
+      dataPtr = reinterpret_cast<void *>(hostData);
+    } else {
+      elementSize = sizeof(std::complex<double);
+      auto *hostData = new std::complex<double>[numElements];
+      s->to_host(hostData, numElements);
+      dataPtr = reinterpret_cast<void *>(hostData);
+    }
+    return SimulationData(dataPtr, numElements, elementSize);
+}
+
   /// @brief Extract the Quake representation for the given kernel name and
   /// lower it to the code format required for the specific backend. The
   /// lowering process is controllable via the configuration file in the
@@ -413,7 +435,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     if (updatedArgs) {
       cudaq::info("Run Quake Synth.\n");
       mlir::PassManager pm(&context);
-      pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
+      pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, readSimulationData, updatedArgs));
       pm.addPass(mlir::createCanonicalizerPass());
       if (disableMLIRthreading || enablePrintMLIREachPass)
         moduleOp.getContext()->disableMultithreading();

From bd002f201cdfa4343f22e65dec16ee74a06ba8e1 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 16 Jul 2024 09:22:17 -0700
Subject: [PATCH 29/50] Synthesize state pointers for remote sim

---
 include/cudaq/Optimizer/Transforms/Passes.h   |   2 +-
 .../Optimizer/Transforms/SimulationData.h     |  24 +--
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp | 139 ++++++++++++++++--
 .../cudaq/platform/py_alt_launch_kernel.cpp   |   2 +-
 runtime/common/BaseRemoteRESTQPU.h            |  23 +--
 runtime/common/BaseRestRemoteClient.h         |  42 +++++-
 unittests/Optimizer/QuakeSynthTester.cpp      |   2 +-
 7 files changed, 183 insertions(+), 51 deletions(-)

diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index 40ee1465be..0f63d7b3b2 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -42,7 +42,7 @@ std::unique_ptr<mlir::Pass> createObserveAnsatzPass(std::vector<bool> &);
 std::unique_ptr<mlir::Pass> createQuakeAddMetadata();
 std::unique_ptr<mlir::Pass> createQuakeAddDeallocs();
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer();
-std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, SimulationData::getSimulationDataFunc*, void *);
+std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, SimulationStateData::getDataFunc*, void *);
 std::unique_ptr<mlir::Pass> createRaiseToAffinePass();
 std::unique_ptr<mlir::Pass> createUnwindLoweringPass();
 
diff --git a/include/cudaq/Optimizer/Transforms/SimulationData.h b/include/cudaq/Optimizer/Transforms/SimulationData.h
index 77f5fe37db..62fcb12c2e 100644
--- a/include/cudaq/Optimizer/Transforms/SimulationData.h
+++ b/include/cudaq/Optimizer/Transforms/SimulationData.h
@@ -5,9 +5,14 @@
  * This source code and the accompanying materials are made available under    *
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
+
+#pragma once
+
 #include <numbers>
 #include <vector>
 
+#include <iostream>
+
 // cudaq::state is defined in the runtime. The compiler will never need to know
 // about its implementation and there should not be a circular build/library
 // dependence because of it. Simply forward declare it, as it is notional.
@@ -17,11 +22,11 @@ class state;
 
 
 /// Owns the data
-class SimulationData {
+class SimulationStateData {
  public:
-  typedef SimulationData (getSimulationDataFunc)(cudaq::state*);
+  typedef SimulationStateData (getDataFunc)(cudaq::state*);
 
-  SimulationData(void *data, std::size_t size, std::size_t elementSize): 
+  SimulationStateData(void *data, std::size_t size, std::size_t elementSize): 
     data(data), size(size), elementSize(elementSize) {}
   
   template <typename T> 
@@ -29,19 +34,20 @@ class SimulationData {
     assert(sizeof(T) == elementSize && "incorrect element size in simulation data");
     std::vector<T> result;
 
-    for (auto i = 0; i < size; i++) {
-      auto elePtr = reinterpret_cast<T*>(data + i*elementSize);
-      result[i] = *elePtr;
+    std::cout << "SimulationStateData:" << std::endl;
+    for (std::size_t i = 0; i < size; i++) {
+      auto elePtr = reinterpret_cast<T*>(data) + i;
+      result.push_back(*elePtr);
+      std::cout << *elePtr << std::endl;
     }
 
     return result;
   }
 
-  ~SimulationData() {
-    delete data;
+  ~SimulationStateData() {
+    delete reinterpret_cast<int*>(data);
   }
 
-private:
   void* data;
   std::size_t size;
   std::size_t elementSize;
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index cd7a0cfc9a..85fac7009c 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -26,6 +26,8 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
 
+#include <iostream>
+
 #define DEBUG_TYPE "quake-synthesizer"
 
 using namespace mlir;
@@ -125,6 +127,74 @@ createArrayInMemory(OpBuilder &builder, ModuleOp module, unsigned &counter,
   return {buffer, data};
 }
 
+template <typename ELETY, typename T, typename MAKER>
+LogicalResult
+synthesizeStateArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
+                         BlockArgument argument, ELETY eleTy, std::vector<T> &vec,
+                         MAKER makeElementValue) {
+  auto *ctx = builder.getContext();
+  auto argLoc = argument.getLoc();
+
+  auto strTy = cudaq::cc::StdvecType::get(eleTy);
+  auto arrTy = cudaq::cc::ArrayType::get(ctx, eleTy, vec.size());
+
+  builder.setInsertionPointToStart(argument.getOwner());
+  
+  // Iterate over the users of this state argument.
+  for (auto *argUser : argument.getUsers()) {
+    // Replace a calls to runtime function that reads the number of qubits
+    // with the log of the length, which is a synthesized constant.
+    if (auto numOfQubitsOp = dyn_cast<func::CallOp>(argUser)) {
+      if (auto calleeAttr = numOfQubitsOp.getCalleeAttr()) {
+        auto funcName = calleeAttr.getValue().str();
+        std::cout << "Call on state: " << funcName << std::endl;
+        if (funcName == cudaq::getNumQubitsFromCudaqState) {
+          Value numOfQubits = builder.create<arith::ConstantIntOp>(
+              argLoc, log2(vec.size()), builder.getI64Type());
+          numOfQubitsOp.replaceAllUsesWith(ValueRange{numOfQubits});
+          numOfQubitsOp.erase();
+          std::cout << "Removed getNumQubitsFromCudaqState" << std::endl;
+        } else {
+          argUser->emitError("Unexpected call on state argument");
+          return failure();
+        }
+      }
+    }
+  }
+
+  std::cout << "Synthesizing vec" << std::endl;
+  OpBuilder::InsertionGuard guard(builder);
+  auto [buffer, _] =
+      createArrayInMemory(builder, module, counter, argument, vec, arrTy);
+  auto ptrArrEleTy =
+      cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy));
+  Value memArr = builder.create<cudaq::cc::CastOp>(argLoc, ptrArrEleTy, buffer);
+
+  builder.setInsertionPointAfter(memArr.getDefiningOp());
+  Value size = builder.create<arith::ConstantIntOp>(argLoc, vec.size(), 64);
+  Value newVec =
+      builder.create<cudaq::cc::StdvecInitOp>(argLoc, strTy, memArr, size);
+  argument.replaceAllUsesWith(newVec);
+
+  std::cout << "Done Synthesizing vec" << std::endl;
+  
+  return success();
+}
+
+static LogicalResult
+synthesizeStateArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
+                         BlockArgument argument, std::vector<std::complex<float>> &vec) {
+  return synthesizeStateArgument<ComplexType>(builder, module, counter, argument,
+                                            ComplexType::get(builder.getF32Type()), vec, makeComplexElement<float>);
+}
+
+static LogicalResult
+synthesizeStateArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
+                         BlockArgument argument, std::vector<std::complex<double>> &vec) {
+  return synthesizeStateArgument<ComplexType>(builder, module, counter, argument,
+                                            ComplexType::get(builder.getF64Type()), vec, makeComplexElement<double>);
+}
+
 template <typename ELETY, typename T, typename MAKER>
 LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
@@ -132,9 +202,11 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          MAKER makeElementValue) {
   auto *ctx = builder.getContext();
   auto argTy = argument.getType();
+  
   assert(isa<cudaq::cc::StdvecType>(argTy));
   auto strTy = cast<cudaq::cc::StdvecType>(argTy);
   auto eleTy = cast<ELETY>(strTy.getElementType());
+
   builder.setInsertionPointToStart(argument.getOwner());
   auto argLoc = argument.getLoc();
 
@@ -357,11 +429,12 @@ class QuakeSynthesizer
 
   // The raw pointer to the runtime arguments.
   void *args;
+   SimulationStateData::getDataFunc* getStateData;
 
 public:
   QuakeSynthesizer() = default;
-  QuakeSynthesizer(std::string_view kernel, SimulationData::getSimulationDataFunc* getData, void *a)
-      : kernelName(kernel), args(a) {}
+  QuakeSynthesizer(std::string_view kernel, SimulationStateData::getDataFunc* getData, void *a)
+      : kernelName(kernel), args(a), getStateData(getData) {}
 
   mlir::ModuleOp getModule() { return getOperation(); }
 
@@ -500,19 +573,53 @@ class QuakeSynthesizer
 
       if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(type)) {
         if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
-          // Special case of a `cudaq::state*` which must be in the same address
-          // space. This references a container to a set of simulation
-          // amplitudes.
-          synthesizeRuntimeArgument<cudaq::state *>(
-              builder, argument, args, offset, sizeof(void *),
-              [=](OpBuilder &builder, cudaq::state **concrete) {
-                Value rawPtr = builder.create<arith::ConstantIntOp>(
-                    loc, reinterpret_cast<std::intptr_t>(*concrete),
-                    sizeof(void *) * 8);
-                auto stateTy = cudaq::cc::StateType::get(builder.getContext());
-                return builder.create<cudaq::cc::CastOp>(
-                    loc, cudaq::cc::PointerType::get(stateTy), rawPtr);
-              });
+          if (getStateData != nullptr) {
+            std::cout << "Reading state data:" << std::endl;
+              cudaq::state* concrete;
+              std::memcpy(&concrete, ((char *)args) + offset, sizeof(cudaq::state*));
+              std::cout << "Getting state data:" << std::endl;
+              auto stateData = getStateData(concrete);
+              if (stateData.elementSize == sizeof(std::complex<double>)) {
+                auto v = stateData.toVector<std::complex<double>>();
+                std::cout << "Read vector of double:" << std::endl;
+                for (auto e: v) {
+                  std::cout << e << "," <<std::endl;
+                }
+                if (failed(synthesizeStateArgument(builder, module, counter, argument, v))) {
+                  module.emitError("Failed to synthesize state*");
+                }
+              } else {
+                auto v = stateData.toVector<std::complex<float>>();
+                std::cout << "Read vector of float:" << std::endl;
+                for (auto e: v) {
+                  std::cout << e << "," <<std::endl;
+                }
+                if (failed(synthesizeStateArgument(builder, module, counter, argument, v)))
+                  module.emitError("Failed to synthesize state*");
+                
+                std::cout << "Synthesized float state" << std::endl;
+              }
+              std::cout << "Done synthesizing states" << std::endl;
+          }
+          
+
+          // // Special case of a `cudaq::state*` which must be in the same address
+          // // space. This references a container to a set of simulation
+          // // amplitudes.
+          // synthesizeRuntimeArgument<cudaq::state *>(
+          //     builder, argument, args, offset, sizeof(void *),
+          //     [=](OpBuilder &builder, cudaq::state **concrete) {
+          //       Value rawPtr = builder.create<arith::ConstantIntOp>(
+          //           loc, reinterpret_cast<std::intptr_t>(*concrete),
+          //           sizeof(void *) * 8);
+          //       auto stateTy = cudaq::cc::StateType::get(builder.getContext());
+          //       return builder.create<cudaq::cc::CastOp>(
+          //           loc, cudaq::cc::PointerType::get(stateTy), rawPtr);
+          //     });
+          else {
+            funcOp.emitOpError("synthesis: unsupported state argument type");
+            signalPassFailure();
+          }
           continue;
         }
         // N.B. Other pointers will not be materialized and may be in a
@@ -714,6 +821,6 @@ std::unique_ptr<mlir::Pass> cudaq::opt::createQuakeSynthesizer() {
 }
 
 std::unique_ptr<mlir::Pass>
-cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, SimulationData::getSimulationDataFunc* getData, void *a) {
+cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, SimulationStateData::getDataFunc* getData, void *a) {
   return std::make_unique<QuakeSynthesizer>(kernelName, getData, a);
 }
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 6d2afefb6d..16d5575228 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -491,7 +491,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
       getEnvBool("CUDAQ_MLIR_PRINT_EACH_PASS", false);
 
   PassManager pm(context);
-  pm.addPass(cudaq::opt::createQuakeSynthesizer(name, rawArgs));
+  pm.addPass(cudaq::opt::createQuakeSynthesizer(name, nullptr, rawArgs));
   pm.addPass(createCanonicalizerPass());
 
   // Run state preparation for quantum devices only.
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 42005600d2..471d171673 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -356,27 +356,6 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     return output_names;
   }
 
-  SimulationData readSimulationData(cudaq::state* s) {
-    void *dataPtr = nullptr;
-    void *dataPtr = nullptr;
-    auto stateVector = s->get_tensor();
-    auto precision = s->get_precision();
-    auto numElements = stateVector.get_num_elements();
-    auto elementSize = 0;
-    if (precision == SimulationState::precision::fp32) {
-      elementSize = sizeof(std::complex<float);
-      auto *hostData = new std::complex<float>[numElements];
-      s->to_host(hostData, numElements);
-      dataPtr = reinterpret_cast<void *>(hostData);
-    } else {
-      elementSize = sizeof(std::complex<double);
-      auto *hostData = new std::complex<double>[numElements];
-      s->to_host(hostData, numElements);
-      dataPtr = reinterpret_cast<void *>(hostData);
-    }
-    return SimulationData(dataPtr, numElements, elementSize);
-}
-
   /// @brief Extract the Quake representation for the given kernel name and
   /// lower it to the code format required for the specific backend. The
   /// lowering process is controllable via the configuration file in the
@@ -435,7 +414,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     if (updatedArgs) {
       cudaq::info("Run Quake Synth.\n");
       mlir::PassManager pm(&context);
-      pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, readSimulationData, updatedArgs));
+      pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, nullptr, updatedArgs));
       pm.addPass(mlir::createCanonicalizerPass());
       if (disableMLIRthreading || enablePrintMLIREachPass)
         moduleOp.getContext()->disableMultithreading();
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index d82457669d..65ff68c4ad 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -92,6 +92,39 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
                        });
   }
 
+  static SimulationStateData readSimulationStateData(cudaq::state* s) {
+    std::cout << "Reading sim state data" << std::endl;
+    void *dataPtr = nullptr;
+    auto stateVector = s->get_tensor();
+    auto precision = s->get_precision();
+    auto numElements = stateVector.get_num_elements();
+    auto elementSize = 0;
+    if (precision == SimulationState::precision::fp32) {
+      std::cout << "32 bit precision" << std::endl;
+      elementSize = sizeof(std::complex<float>);
+      auto *hostData = new std::complex<float>[numElements];
+      std::cout << "Reading host data" << std::endl;
+      s->to_host(hostData, numElements);
+      std::cout << "Host data:" << std::endl;
+      for (size_t i = 0; i< numElements; i++) {
+        std::cout << hostData[i] << std::endl;
+      }
+      dataPtr = reinterpret_cast<void *>(hostData);
+    } else {
+      std::cout << "64 bit precision" << std::endl;
+      elementSize = sizeof(std::complex<double>);
+      auto *hostData = new std::complex<double>[numElements];
+      std::cout << "Reading host data" << std::endl;
+      s->to_host(hostData, numElements);
+       std::cout << "Host data:" << std::endl;
+      for (size_t i = 0; i< numElements; i++) {
+        std::cout << hostData[i] << std::endl;
+      }
+      dataPtr = reinterpret_cast<void *>(hostData);
+    }
+    return SimulationStateData(dataPtr, numElements, elementSize);
+  }
+
 public:
   virtual void setConfig(
       const std::unordered_map<std::string, std::string> &configs) override {
@@ -172,7 +205,14 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
         mlir::PassManager pm(&mlirContext);
         moduleOp.getContext()->disableMultithreading();
         pm.enableIRPrinting();
-        pm.addPass(cudaq::opt::createQuakeSynthesizer(name, args));
+        auto &platform = cudaq::get_platform();
+        if (platform.is_simulator()) {
+          // For efficiency, we don't run state prep to convert states to gates on
+          // simulators, instead we synthesize them as vectors.
+          pm.addPass(cudaq::opt::createQuakeSynthesizer(name, readSimulationStateData, args));
+        } else {
+          pm.addPass(cudaq::opt::createQuakeSynthesizer(name, nullptr, args));
+        }
         pm.addPass(mlir::createCanonicalizerPass());
         if (failed(pm.run(moduleOp)))
           throw std::runtime_error("Could not successfully apply quake-synth.");
diff --git a/unittests/Optimizer/QuakeSynthTester.cpp b/unittests/Optimizer/QuakeSynthTester.cpp
index c7e6bce0c5..745043ebd1 100644
--- a/unittests/Optimizer/QuakeSynthTester.cpp
+++ b/unittests/Optimizer/QuakeSynthTester.cpp
@@ -54,7 +54,7 @@ LogicalResult runQuakeSynth(std::string_view kernelName, void *rawArgs,
   PassManager pm(module->getContext());
   module->getContext()->disableMultithreading();
   pm.enableIRPrinting();
-  pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, rawArgs));
+  pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, nullptr, rawArgs));
   pm.addPass(createCanonicalizerPass());
   pm.addPass(cudaq::opt::createExpandMeasurementsPass());
   pm.addNestedPass<func::FuncOp>(cudaq::opt::createClassicalMemToReg());

From 87aff5fa94fb8bfb9574cf81aa441ded7fe066c7 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 16 Jul 2024 09:26:20 -0700
Subject: [PATCH 30/50] Addressed CR comments

---
 targettests/execution/state_preparation_vector.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index 35d2b68619..1a1c7421aa 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -6,8 +6,13 @@
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
 
-// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --enable-mlir --target quantinuum --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --enable-mlir                               %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target ionq                     --emulate %s -o %t && %t | FileCheck %s
+// 2 different IQM machines for 2 different topologies
+// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
 
 #include <cudaq.h>
 #include <iostream>

From 0931e6fa870b22bce0d25780f9b837e66659196a Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 16 Jul 2024 11:25:56 -0700
Subject: [PATCH 31/50] Cleanup

---
 include/cudaq/Optimizer/Transforms/Passes.h   |   3 +-
 .../Optimizer/Transforms/SimulationData.h     |  28 ++--
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp | 148 +++++++++---------
 .../cudaq/platform/py_alt_launch_kernel.cpp   |   2 +-
 runtime/common/BaseRemoteRESTQPU.h            |   2 +-
 runtime/common/BaseRestRemoteClient.h         |   4 +-
 targettests/Remote-Sim/state_init.cpp         |  26 ---
 unittests/Optimizer/QuakeSynthTester.cpp      |   2 +-
 8 files changed, 98 insertions(+), 117 deletions(-)

diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index 0f63d7b3b2..7745734c49 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -42,7 +42,8 @@ std::unique_ptr<mlir::Pass> createObserveAnsatzPass(std::vector<bool> &);
 std::unique_ptr<mlir::Pass> createQuakeAddMetadata();
 std::unique_ptr<mlir::Pass> createQuakeAddDeallocs();
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer();
-std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, SimulationStateData::getDataFunc*, void *);
+std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, void *, SimulationStateData::getDataFunc*);
+std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, void *, bool);
 std::unique_ptr<mlir::Pass> createRaiseToAffinePass();
 std::unique_ptr<mlir::Pass> createUnwindLoweringPass();
 
diff --git a/include/cudaq/Optimizer/Transforms/SimulationData.h b/include/cudaq/Optimizer/Transforms/SimulationData.h
index 62fcb12c2e..9504dc7c08 100644
--- a/include/cudaq/Optimizer/Transforms/SimulationData.h
+++ b/include/cudaq/Optimizer/Transforms/SimulationData.h
@@ -29,20 +29,20 @@ class SimulationStateData {
   SimulationStateData(void *data, std::size_t size, std::size_t elementSize): 
     data(data), size(size), elementSize(elementSize) {}
   
-  template <typename T> 
-  std::vector<T> toVector() {
-    assert(sizeof(T) == elementSize && "incorrect element size in simulation data");
-    std::vector<T> result;
-
-    std::cout << "SimulationStateData:" << std::endl;
-    for (std::size_t i = 0; i < size; i++) {
-      auto elePtr = reinterpret_cast<T*>(data) + i;
-      result.push_back(*elePtr);
-      std::cout << *elePtr << std::endl;
-    }
-
-    return result;
-  }
+  // template <typename T> 
+  // std::vector<T> toVector() {
+  //   assert(sizeof(T) == elementSize && "incorrect element size in simulation data");
+  //   std::vector<T> result;
+
+  //   std::cout << "SimulationStateData:" << std::endl;
+  //   for (std::size_t i = 0; i < size; i++) {
+  //     auto elePtr = reinterpret_cast<T*>(data) + i;
+  //     result.push_back(*elePtr);
+  //     std::cout << *elePtr << std::endl;
+  //   }
+
+  //   return result;
+  // }
 
   ~SimulationStateData() {
     delete reinterpret_cast<int*>(data);
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index 85fac7009c..bde179512d 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -127,6 +127,19 @@ createArrayInMemory(OpBuilder &builder, ModuleOp module, unsigned &counter,
   return {buffer, data};
 }
 
+template <typename T> 
+std::vector<T> stateDataToVector(SimulationStateData& stateData) {
+  assert(sizeof(T) == stateData.elementSize && "incorrect element size in simulation data");
+  std::vector<T> result;
+
+  for (std::size_t i = 0; i < stateData.size; i++) {
+    auto elePtr = reinterpret_cast<T*>(stateData.data) + i;
+    result.push_back(*elePtr);
+  }
+
+  return result;
+}
+
 template <typename ELETY, typename T, typename MAKER>
 LogicalResult
 synthesizeStateArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
@@ -135,10 +148,11 @@ synthesizeStateArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
   auto *ctx = builder.getContext();
   auto argLoc = argument.getLoc();
 
-  auto strTy = cudaq::cc::StdvecType::get(eleTy);
+  //auto strTy = cudaq::cc::StdvecType::get(eleTy);
   auto arrTy = cudaq::cc::ArrayType::get(ctx, eleTy, vec.size());
 
   builder.setInsertionPointToStart(argument.getOwner());
+  auto toErase = std::vector<mlir::Operation *>();
   
   // Iterate over the users of this state argument.
   for (auto *argUser : argument.getUsers()) {
@@ -147,13 +161,11 @@ synthesizeStateArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
     if (auto numOfQubitsOp = dyn_cast<func::CallOp>(argUser)) {
       if (auto calleeAttr = numOfQubitsOp.getCalleeAttr()) {
         auto funcName = calleeAttr.getValue().str();
-        std::cout << "Call on state: " << funcName << std::endl;
         if (funcName == cudaq::getNumQubitsFromCudaqState) {
           Value numOfQubits = builder.create<arith::ConstantIntOp>(
               argLoc, log2(vec.size()), builder.getI64Type());
           numOfQubitsOp.replaceAllUsesWith(ValueRange{numOfQubits});
-          numOfQubitsOp.erase();
-          std::cout << "Removed getNumQubitsFromCudaqState" << std::endl;
+          toErase.push_back(numOfQubitsOp);
         } else {
           argUser->emitError("Unexpected call on state argument");
           return failure();
@@ -162,7 +174,6 @@ synthesizeStateArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
     }
   }
 
-  std::cout << "Synthesizing vec" << std::endl;
   OpBuilder::InsertionGuard guard(builder);
   auto [buffer, _] =
       createArrayInMemory(builder, module, counter, argument, vec, arrTy);
@@ -170,29 +181,34 @@ synthesizeStateArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
       cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy));
   Value memArr = builder.create<cudaq::cc::CastOp>(argLoc, ptrArrEleTy, buffer);
 
-  builder.setInsertionPointAfter(memArr.getDefiningOp());
-  Value size = builder.create<arith::ConstantIntOp>(argLoc, vec.size(), 64);
-  Value newVec =
-      builder.create<cudaq::cc::StdvecInitOp>(argLoc, strTy, memArr, size);
-  argument.replaceAllUsesWith(newVec);
-
-  std::cout << "Done Synthesizing vec" << std::endl;
+  // builder.setInsertionPointAfter(memArr.getDefiningOp());
+  // Value size = builder.create<arith::ConstantIntOp>(argLoc, vec.size(), 64);
+  // Value newVec =
+  //     builder.create<cudaq::cc::StdvecInitOp>(argLoc, strTy, memArr, size);
+  argument.replaceAllUsesWith(memArr);
   
-  return success();
-}
+  for (auto &op : toErase) {
+    op->erase();
+  }
 
-static LogicalResult
-synthesizeStateArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
-                         BlockArgument argument, std::vector<std::complex<float>> &vec) {
-  return synthesizeStateArgument<ComplexType>(builder, module, counter, argument,
-                                            ComplexType::get(builder.getF32Type()), vec, makeComplexElement<float>);
+  return success();
 }
 
 static LogicalResult
 synthesizeStateArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
-                         BlockArgument argument, std::vector<std::complex<double>> &vec) {
-  return synthesizeStateArgument<ComplexType>(builder, module, counter, argument,
+                         BlockArgument argument, SimulationStateData& stateData) {
+  
+  if (stateData.elementSize == sizeof(std::complex<double>)) {
+    auto vec = stateDataToVector<std::complex<double>>(stateData);
+    return synthesizeStateArgument<ComplexType>(builder, module, counter, argument,
                                             ComplexType::get(builder.getF64Type()), vec, makeComplexElement<double>);
+  } else if (stateData.elementSize == sizeof(std::complex<float>)) {
+    auto vec = stateDataToVector<std::complex<float>>(stateData);
+    return synthesizeStateArgument<ComplexType>(builder, module, counter, argument,
+                                            ComplexType::get(builder.getF32Type()), vec, makeComplexElement<float>);
+  }
+  module.emitError("unexpected element size in simulation state data");
+  return failure();
 }
 
 template <typename ELETY, typename T, typename MAKER>
@@ -429,12 +445,17 @@ class QuakeSynthesizer
 
   // The raw pointer to the runtime arguments.
   void *args;
-   SimulationStateData::getDataFunc* getStateData;
+  
+  // Function to read the state data, if any.
+  SimulationStateData::getDataFunc* getStateData;
+  
+  // Is the simulation running in the same address space as synthesis?
+  bool sameAddressSpace;
 
 public:
   QuakeSynthesizer() = default;
-  QuakeSynthesizer(std::string_view kernel, SimulationStateData::getDataFunc* getData, void *a)
-      : kernelName(kernel), args(a), getStateData(getData) {}
+  QuakeSynthesizer(std::string_view kernel, void *a,  SimulationStateData::getDataFunc* getData, bool sameSpace = false)
+      : kernelName(kernel), args(a), getStateData(getData), sameAddressSpace(sameSpace) {}
 
   mlir::ModuleOp getModule() { return getOperation(); }
 
@@ -573,51 +594,31 @@ class QuakeSynthesizer
 
       if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(type)) {
         if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
-          if (getStateData != nullptr) {
-            std::cout << "Reading state data:" << std::endl;
-              cudaq::state* concrete;
-              std::memcpy(&concrete, ((char *)args) + offset, sizeof(cudaq::state*));
-              std::cout << "Getting state data:" << std::endl;
-              auto stateData = getStateData(concrete);
-              if (stateData.elementSize == sizeof(std::complex<double>)) {
-                auto v = stateData.toVector<std::complex<double>>();
-                std::cout << "Read vector of double:" << std::endl;
-                for (auto e: v) {
-                  std::cout << e << "," <<std::endl;
-                }
-                if (failed(synthesizeStateArgument(builder, module, counter, argument, v))) {
-                  module.emitError("Failed to synthesize state*");
-                }
-              } else {
-                auto v = stateData.toVector<std::complex<float>>();
-                std::cout << "Read vector of float:" << std::endl;
-                for (auto e: v) {
-                  std::cout << e << "," <<std::endl;
-                }
-                if (failed(synthesizeStateArgument(builder, module, counter, argument, v)))
-                  module.emitError("Failed to synthesize state*");
-                
-                std::cout << "Synthesized float state" << std::endl;
-              }
-              std::cout << "Done synthesizing states" << std::endl;
-          }
-          
-
-          // // Special case of a `cudaq::state*` which must be in the same address
-          // // space. This references a container to a set of simulation
-          // // amplitudes.
-          // synthesizeRuntimeArgument<cudaq::state *>(
-          //     builder, argument, args, offset, sizeof(void *),
-          //     [=](OpBuilder &builder, cudaq::state **concrete) {
-          //       Value rawPtr = builder.create<arith::ConstantIntOp>(
-          //           loc, reinterpret_cast<std::intptr_t>(*concrete),
-          //           sizeof(void *) * 8);
-          //       auto stateTy = cudaq::cc::StateType::get(builder.getContext());
-          //       return builder.create<cudaq::cc::CastOp>(
-          //           loc, cudaq::cc::PointerType::get(stateTy), rawPtr);
-          //     });
-          else {
-            funcOp.emitOpError("synthesis: unsupported state argument type");
+          if (sameAddressSpace) {
+            // Special case of a `cudaq::state*` which must be in the same address
+            // space. This references a container to a set of simulation
+            // amplitudes.
+            synthesizeRuntimeArgument<cudaq::state *>(
+                builder, argument, args, offset, sizeof(void *),
+                [=](OpBuilder &builder, cudaq::state **concrete) {
+                  Value rawPtr = builder.create<arith::ConstantIntOp>(
+                      loc, reinterpret_cast<std::intptr_t>(*concrete),
+                      sizeof(void *) * 8);
+                  auto stateTy = cudaq::cc::StateType::get(builder.getContext());
+                  return builder.create<cudaq::cc::CastOp>(
+                      loc, cudaq::cc::PointerType::get(stateTy), rawPtr);
+                });
+          } else if (getStateData != nullptr) {
+            // Special case of running on a simulator in a different address space,
+            // when we know how to convert state to data.
+            cudaq::state* concrete;
+            std::memcpy(&concrete, ((char *)args) + offset, sizeof(cudaq::state*));
+            auto stateData = getStateData(concrete);
+            if (failed(synthesizeStateArgument(builder, module, counter, argument, stateData)))
+                module.emitError("Failed to synthesize state*");
+          } else {
+            // All other cases are not yet supported (i.e. quantum hardware).
+            funcOp.emitOpError("synthesis: unsupported argument type: state*");
             signalPassFailure();
           }
           continue;
@@ -821,6 +822,11 @@ std::unique_ptr<mlir::Pass> cudaq::opt::createQuakeSynthesizer() {
 }
 
 std::unique_ptr<mlir::Pass>
-cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, SimulationStateData::getDataFunc* getData, void *a) {
-  return std::make_unique<QuakeSynthesizer>(kernelName, getData, a);
+cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, void *a, SimulationStateData::getDataFunc* getData) {
+  return std::make_unique<QuakeSynthesizer>(kernelName, a, getData, false);
+}
+
+std::unique_ptr<mlir::Pass>
+cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, void *a, bool sameAddressSpace) {
+  return std::make_unique<QuakeSynthesizer>(kernelName, a, nullptr, sameAddressSpace);
 }
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 16d5575228..a0032cf82e 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -491,7 +491,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
       getEnvBool("CUDAQ_MLIR_PRINT_EACH_PASS", false);
 
   PassManager pm(context);
-  pm.addPass(cudaq::opt::createQuakeSynthesizer(name, nullptr, rawArgs));
+  pm.addPass(cudaq::opt::createQuakeSynthesizer(name, rawArgs, true));
   pm.addPass(createCanonicalizerPass());
 
   // Run state preparation for quantum devices only.
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 471d171673..3a35a019d2 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -414,7 +414,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     if (updatedArgs) {
       cudaq::info("Run Quake Synth.\n");
       mlir::PassManager pm(&context);
-      pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, nullptr, updatedArgs));
+      pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs, nullptr));
       pm.addPass(mlir::createCanonicalizerPass());
       if (disableMLIRthreading || enablePrintMLIREachPass)
         moduleOp.getContext()->disableMultithreading();
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index 65ff68c4ad..1ea51ff344 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -209,9 +209,9 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
         if (platform.is_simulator()) {
           // For efficiency, we don't run state prep to convert states to gates on
           // simulators, instead we synthesize them as vectors.
-          pm.addPass(cudaq::opt::createQuakeSynthesizer(name, readSimulationStateData, args));
+          pm.addPass(cudaq::opt::createQuakeSynthesizer(name, args, readSimulationStateData));
         } else {
-          pm.addPass(cudaq::opt::createQuakeSynthesizer(name, nullptr, args));
+          pm.addPass(cudaq::opt::createQuakeSynthesizer(name, args, nullptr));
         }
         pm.addPass(mlir::createCanonicalizerPass());
         if (failed(pm.run(moduleOp)))
diff --git a/targettests/Remote-Sim/state_init.cpp b/targettests/Remote-Sim/state_init.cpp
index 6677b4746c..4615e2ec08 100644
--- a/targettests/Remote-Sim/state_init.cpp
+++ b/targettests/Remote-Sim/state_init.cpp
@@ -65,31 +65,5 @@ int main() {
 // CHECK: 00
 // CHECK: 10
 
-// CHECK: 0001
-// CHECK: 0011
-// CHECK: 1001
-// CHECK: 1011
-
-// CHECK: 00
-// CHECK: 10
-
-// CHECK: 00
-// CHECK: 10
-
-// CHECK: 00
-// CHECK: 10
-
-// CHECK: 01
-// CHECK: 11
-
-// CHECK: 00
-// CHECK: 10
-
-// CHECK: 01
-// CHECK: 11
-
-// CHECK: 00
-// CHECK: 10
-
 // CHECK: 01
 // CHECK: 11
diff --git a/unittests/Optimizer/QuakeSynthTester.cpp b/unittests/Optimizer/QuakeSynthTester.cpp
index 745043ebd1..ede99e8e82 100644
--- a/unittests/Optimizer/QuakeSynthTester.cpp
+++ b/unittests/Optimizer/QuakeSynthTester.cpp
@@ -54,7 +54,7 @@ LogicalResult runQuakeSynth(std::string_view kernelName, void *rawArgs,
   PassManager pm(module->getContext());
   module->getContext()->disableMultithreading();
   pm.enableIRPrinting();
-  pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, nullptr, rawArgs));
+  pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, rawArgs, nullptr));
   pm.addPass(createCanonicalizerPass());
   pm.addPass(cudaq::opt::createExpandMeasurementsPass());
   pm.addNestedPass<func::FuncOp>(cudaq::opt::createClassicalMemToReg());

From cd528c75aa67258c0c75554b650706ce356997da Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 17 Jul 2024 15:23:37 -0700
Subject: [PATCH 32/50] Address CR comments

---
 dictionary.dic                                | Bin 0 -> 9936 bytes
 lib/Optimizer/Builder/Intrinsics.cpp          |   8 +-
 lib/Optimizer/Transforms/CMakeLists.txt       |   1 -
 lib/Optimizer/Transforms/ConstPropComplex.cpp |  70 ++--
 lib/Optimizer/Transforms/LiftArrayAlloc.cpp   |  27 +-
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp | 127 +++++---
 lib/Optimizer/Transforms/StateDecomposer.cpp  | 128 --------
 lib/Optimizer/Transforms/StateDecomposer.h    | 175 ----------
 lib/Optimizer/Transforms/StatePreparation.cpp | 301 ++++++++++++++++--
 python/cudaq/kernel/ast_bridge.py             |  14 +-
 .../cudaq/platform/py_alt_launch_kernel.cpp   |  14 +-
 runtime/common/BaseRemoteRESTQPU.h            |  13 +-
 runtime/common/CMakeLists.txt                 |   3 +-
 runtime/common/Environment.cpp                |  26 ++
 runtime/common/Environment.h                  |  16 +
 .../execution/state_preparation_vector.cpp    | 240 +++++++++-----
 test/Quake/const_prop_complex.qke             |  37 +++
 test/Quake/state_prep.qke                     |  97 +++---
 18 files changed, 702 insertions(+), 595 deletions(-)
 create mode 100644 dictionary.dic
 delete mode 100644 lib/Optimizer/Transforms/StateDecomposer.cpp
 delete mode 100644 lib/Optimizer/Transforms/StateDecomposer.h
 create mode 100644 runtime/common/Environment.cpp
 create mode 100644 runtime/common/Environment.h

diff --git a/dictionary.dic b/dictionary.dic
new file mode 100644
index 0000000000000000000000000000000000000000..d5b1930b6ba86d2e1409b65c5d6b65c39e30fa80
GIT binary patch
literal 9936
zcmai)4UlA4Rmbm6_v^2j>G|yW+S$!wAc4h@K={x|h0Mp!&ZIx4x8K{{*;K^zbidi{
zOi%Z0zwXUWL?B=hh)N_tETw{=f&no^1PZ926kjT)FhLY4wTz*PhzdzzSph8zfB*Yl
z?@mz5uA2S7cka38oO|xM=bn4tR)bx?**sGBx2nBn_lSM-J4ZV0JDNvc`jVGkXa1jG
zsrtzO;CL9yk*h-T=LU?qEoICThmE=VfH8l;pMHWEe@Xu0{EhI}{~hsLX7^mX<#!Dx
z^r;~pjb4wggffoijX{MuS_owwEgF;7d&!uLux!k@uwu-#&^2a8=ovEyE+Plh%p!8G
z5F+P@5dJHL@IPM&{|mqs<X<gB?hA$Ry+}y=i@{aeuMyIItq}TkAmYu-gf|&;y>QW(
zmkVilr4YS02<iW7A@<)WM9zW`J;#OEvnE6?T}%NvJemUII4MNlX(9d32;sj4EJD9k
zNT=5dk$anv_L>kOzHq~sEwF^VvqJhcg!H>zh<)2a_;<iE`ddQy+d}wvh47yjq9+hy
ze^-dUJ+K1*&kNyyoe=(C5MuZ1h48&Wh&^|KE_&W1q~F~_`n^?1zk7t(^>!ise^rRy
zcL=fbogiTD7b54^gvj}IA##30h@1z6$a$X-dw)xa9ltF^zz2oc^B{Pd{vQ(3|HDG~
zKO%(xqeA!}1~=gQ1F-))%0q@f#cxhX|L0lEo0nKDnxC;)GOHF{^TGr0Vc($vVbQ!$
zSS*>>2$>+)V!=FXv1ks*L@%44v{*66EV{<C=$SvUxMAMH!qfWrq{S8Ud5b0Umlj=f
zqoa1u6oeJeTomHqyDfU={X)k1kj1?Dw8gS{)MCYaA6#6?o5Kl^fWAgZV7eAPQ@6Nb
z_JqiPs}T9`7cwyqTXfCm!4<~)jF13}C&P6&X|ZIUFZ4>LY_Vc$;3@&%vsf^{B&6SG
zg-q~QESAmpLELXfQ{YXD=1L(6;ChRmSrWofvsg6eEtbq(7F~0{ka_ul#YOWeA$p&%
zST;`!OC__O7P{sSEqdlLA%1@~fskjR_!cYXZ5CbgHH)5k#^M=s)gXMt*9{h3b6iNA
zowArWn-)tZ5HcTkSzI*lwwN~$S}d7AvRF2cTXfA67CrMliyJ08M7|*L{e*C(U|wah
zXpRdp_@u>(xkE_Yy-SE4pR~AWz6dJMJZ^Et{JpSX%=ayN=0`%tJvS_L%~cjR%=JPR
z_@YJ6tP7d{pA~MD%nn#0j_$GOnFobT%%_BfiusZdMSpG4HO~lH;PYAX&0@)1D<pB>
zBqYI}5@Of3Mc3>JvFmNZ%A&d7V%|Jtv0(noV$tN;q{@}Dxm`#Cy5FK_J_uIu*C&L;
z`Im&qe^N-Id|HT~a-%}eJWog-Sq5D;pt8lh@huk3=$PoPd4WaG+#_Uz`KFM7`?f{b
zjF0n<o)=jxnpaqK&9X($RD?MEHj5Q=QHVXiEW`m1SS*__3Gw&8S*#c{p?A+*1rko?
zM&Tm+j>UpGDP+7oA#w2*ixu-ui>~>Ckd5eD7G1M3srL<YyO4GKE{kRJu#k2CWg-52
z(qhG|kr+MvbeoWQyu)I}+#}q`n+L$tD=X$9A@TO77FW#Ig={QOS@g^gEpC{hDd=v+
zTqz`>zY^TQpKIX%^~)SMYMi0ig5NQN10?J0P^_`FWrhYw*vV{sy|T1mhGIvF!sKvl
z{lt=eDVpSHZ0$tR%p{KHiCE8MX9xYpS-*9`uXWoUGnWkMuF1{}?Q00kl%tk{$xdfN
zdbQc}BRQ<G>|7*A<iH$G!ClA~nau3`NVT@zqF(dsXZ;<&)eU>OR0MtPLA!Z@fmrR(
z5jPuZ4;unAEqgWP*_lv%(5-e+FLNPZ)^F*7nM1#u_ZDT@^q_yyul2gs%_f%R7>CQ6
z%+98(y^D=zv)b7+htd$srP78p92)j7?o!;fx(#&H&0*VEaTg0}%+lELgP^h5Xg0cg
z=Abl{D+>SY^iaR?f|@4cG|7NewN|5x{3~wP+aTFyr?9KmU>uEhD=?#C_DnM8oLpLR
zmqU}hQmFMHCUyPRC8asW?RphbN_H$oQ^((_Hce)LO*lI`z~fYseV7#J8fPlK(A=pO
z8m-<%Gm&6hmYVEXvbxi5oejEXHl{5)JDBua=X*YL>WszL+N~32DnUb$?KL}-?zL*|
zo!uTTpHGEU(q49ExKFo(Fq3iGpiiemS&i0NGo6H1v4Jp?ZnSEcst~|!AzAE0m+0y-
zE-CnDG8I`Am`UeoslY}Bt<h>WT9(by>`*L?nHpv&MUx$m`>o!Nf}>RSiW+Thu-9tu
zb{jj5J7LShMgWXDy_P0l`p8n*sbsZPQ$W};x?X8RV>~q6Xc1I47?>zU0}-b-Oq|O;
zr!g?&809J)Clb|GQ0?AnCh;N^&37s!5rp%Y=B-qyX|o3r(e*o9ey6?T^I+ZXmB^yj
zjc7jGgsQ=aikw^q2Zm%6VmzfF#`jvp62qCrvtFg-%7t?ogm<v9Q47owEcYtq$_Ax!
zc7D{VZSVRWJ9oqkD}#0^V7eu%fu-`iQs8AGfjtdzL#E6OG8Zh9CVs_6h6-5{n9)o0
zKnbc$Qi?5<*A(Yvt(CDvZ}&o*fMNt?0??UEuJ8J-g{38vONc_j>`bOaJF&p?AqOIR
zg_B8$rl$$pqlnN6H8O`0$>lL55&z>>hI`UIrubl2cShqFVl#ol1K*|lv0=S0BGk#u
zC2F9MkrO5yyn+K}la!eoIGN=&v=HpIYUeubR=XD{%W4&qm;n|fow5U*6A272uWUG;
ztk!RD<9rR=)9RXlQr%Vd#bzkdor*|y0=x<}UpAL&5FScm%7)B|lJV4_C3YJZ{N|p?
z#;ImqlN<q$v;D>?YOR7zGO*vOqd*iCjHe<Ji6^F#7b`Z~?WSLCG3AQ|GAE?;X0Oq#
zTa{V3Ht?^r*lgF%5suWuBod_O1~)tHYQ0tslq}(+KxS=iwgXnnSe!Z;rpe@Hqg!j&
zeZqT@t&>v};SDWNcq&VTR7}fdE|CLLD0Ma2X)?_M*<t-PYGx!(jg9BC&Pw_CLSRM)
zIO7sNEA1X3+aY{bC~7qDsnizJLgK8G5JRGzo9<J!e5lKjKat`*-?ie$AYmS%w9@W$
zJ2)igP^@dYPiM}xciQZhn(3LuYO&yvl$_bY)#^^8sqS#kCPTW7jeQM@4HfyUgVpj9
zBcsyy6ymGQ^-i@@$MGW3ZzA2N1vqroWATi@4hjh2<S?vy7(al(RQ$xk`hxcP6KfSS
z64Oz^8BH8-bXR+udS^0|qlu;U#WRHk`;Ko$o%1~$&3eq^dXpVV*!RE;4^T5WBhHCl
zb=wy=1*tQh$XCyjps+Dd1QktuB3<jj((00FvoP5SIk8l&V$=ZHE18Tfp^ecl6<DD0
zd_f^KnW|OEunJ-%K|(W<BAQ04^NBO&EET1F%3%Z2!kf-$|EZFzSSB+GOQq|~;AOIk
z1n6v-&@FpAu!`oAAsI1q!{Qe`+`Nw_IR4nAIyzLVwlx1Xmywo-_rG_{!DPsZ()s5!
zsn!Xbv|Ah*-PbML6z9__Q7S4%7zYNhopgG&Zo{tRm{uq=xKvsv7#MfERj+E?L@<7z
z&R8F_dE;irM=n+sl4E=%_I8t+O1`ygdNOrh^Fb<>Zy~`rp6=7?oXRPMH7(_n#zAAR
zyr<XWyaFdUU?_rk8;j}CN2aQ^d%N0v#^VqeIkOj1@U@et=!vFax6_X5o7sdV<B8d!
zT05d^fiq{~r^D3chIS_~hfqfBU{CpEAjBRp8^@Bsnu1>@k~jBu_w4S7UXME^Czpvx
zh3Y1W4=VPLA_bg)>5SYxhQYggHVTFYIH6J)A75Q!yNu=8=*S@XPPM1~Nm(<?@^Q5u
zC>*tZPBemT7RYccPew})$8UDmOJ-t#M$OmBWgFZ#uX%Eh$7zV9stmVT4SbV}<vFip
z(@w2dub#KExtI|nc_a>F&!e0pmaznb?Q-)PWhwKJl#OruT{0VQOrn-*JzT1`)E#4~
z@~-Tdr<BAO>7{C?>jz3dlMW|j4Q?V`Xl!=EU3?^_<8pR1(Y#>G_F>*R3uNPk!YPh(
zF&&=Sk(=%Ic5j!Y%{x})k|9C$N+Dmlsi>l4cBI)pt6lKijus2M4&9YP>E38<6D@SN
z_1kDNBq*gQdPbaPI|%kjg9XmgY>|S&0oHqRBv$0OGU*iaE8-ka@38u}8miZa2B>nA
z<MCpF+%}ga<=#}0Q>KP@xDUpRj$f^_3J_!iSmnS_Os99ES&-%54OBR^EUl_d+Gzvy
zFLON-FYbW=dm0JJ*`d<C$f!Bovux2eJF??<YUfy9{nRlM(}_Ph>Xc3?>t^G;D!Ipb
zUt;D;6gec_(y6uOHJ!wCAAr`<@g*FlO9A%8Rxdn$%qF-UaFsu1hg<CqwS42wFnQ#h
zQqk2QrqZqIjvwq+YuZ9B5o#`TiK@MfnvxlsNVVu}3xd&jdC?XfiQT=;2HRlH;hsU1
z=?s<MmUK~xOeVN>P!4g%Qsup~D$q4)Bmn}m%)NVq0VxAGBMJMeX(3;5QYi1?@96}$
z64p;MgRJ?@zK=vF_o8kl!=b5vgqTf+WaxA7_vxIpQZ!H%E6(Oek#x3G-PvI<VW3Sr
z<w`;Efflv`lAr2MGlv$)c<<owWxA5iJc?+MV-Js9ZVFv1Wku>;wW17xw*KyWegb7>
zd1^9=iBz0*au_6=^rnYIyOi}t;@q~V?sCfO?i%-0u`=PLS9guwYb2)2muxQ430L2I
zic6T4yshe{vl@SEt8um$)^<Y!+|01uS5}V|m{KYkPBatAPLIsa{yX7_f)B=05lQKb
z^`4xHRon%x#<;*?vtf<soEM)R?f9<UZ+acS?r~NWyVe8jt1LP0ckGUm4tkqg?Pi??
zEMj=lNI|SMtAVN=OT$+)gMFec+(J4AO=tRaR_%01wt;quTo1`M{$?$Ao7u&&^C^UL
zh}Kjb8iv1rjHZG-l<v2oV=nugMpeh|Vv(zHPV=<vwCg=83w3exm9pJ&=Or(wbsD?&
z0)P{E)F|6Dl(3BqGKDOq{%9nPPOV4M)kvVCe98_yBx|*~i)3!51{vMGU2;DCy)r|{
z$recRj|^0y#@-q6k+v-jMdN*1odls91ZKrOwX9BIxs*!<gudOsutPL@F#(nsmM~M$
z-2zjcoyY|K#h$HSlnyxqk#A1xWE*q2lqVeB!pTaJqzAt;m0LLG+BXz4oWikX>TvGk
z3U=?xWrDpOwtp&>8as_!B*>_pgC29%(YhOrb62OFkys*Oth-_=%Rq*@%iV~!YeoWz
zj??KrEil(4-Ex_A=bGGj*6opqD9Jc3>KY_F!xTS<f@j!D<(cI?$0!@!3^ox~QCw+Q
zifl`UBpZEP%V{bn2JMAVbi0)~z(|K;%j;|ShYLf5dW&o33WrGH-Xb@Y^5pB|eBNVj
zY`jqHldqLmow20X-eQR~m@M5Ya+T&SmC43lbiXm3uKWG_joD;KB^;)PE|C=x!*o`z
zh%FzdB5-&?wKJYvZeQG!wp^U6Nlc&0)O~VXyQYh-X-7o)aqu!#RhY;y?<bR}M7G-r
zsv{Y?yO>Q_I$~yq_cu^yp;R$6p6qf83^=e(LXn7ES`@avh7Llase9%UF`zW&&Xh|9
zx*yEeNm5)lv|Gd>2jyJCBc7~<>8?*W*&8)_RQj1WOBt(B>F3?KBsZ-qHaE@=)f+*r
z&COu60%)no2{RcrbbMRu;%Q4p{oLSXy0S{xpi#_WSkxoT{!@<UR>Hlc?gyw3>wYFP
z3a}jR595g{Z9Gmqv5+QG^=h}e)okBkNjfHur<Q%H1<sj@6E1(99B~qyYizeU3UUiv
zvNMqzx}$o*-{QVZaZdPgFHFUG>clZ5ou=$j#NE;EY+K=DF<ld*j<Gkkbf|@i1Gm=J
z1y*0R&YGNZPKS3nQ-hW!!Cd3O>Du#a>Qwd42Itp?NcIM5T=H1XB<Qi2E|{5bEZwq&
zbsPe|9ZE%%;D5h)euQr%2e=21@$KwKKlXcs?{*^H%lA3q3BC^r^}CgR*KyzeUHYD+
z?-%*D6}DXdca!_>>!LnC;M<g1w)gRUhqhCEJ2}I5h|9ggr?0{HQs^J#8>1-8QNKC9
zO<s%6K7R|Ik2AD;`KHvTo#y+;H~8kI_YCrHWgL7?xYRECo9Q<f$^6$yM<*Ju+V#st
zHtYEoz6HLH?;zh`+<LzjpL_?IcSgRs3H>|p{0cr%`;G8@75;Seejm?Yfp?zw_c8|Y
zTtM%=eCv5N-$C^LHS~X)=RAL(f&XsCA^IqCh5E&j@km79!2Z?97qjpW_vetY-Gc4E
z!}kdF$?$$B_Wd!w(EC06#`X4S>{mwk=h*#C=4Eo9Fa9o)qrT7J*9Z3b`f$|tTE_JX
z?EE@5j7PGr;{D=2U3W+QZorO@!*@+&kL31$Z$kgG=>N@VT({GwfGzXT1|q+GZ=XHt
z_q+Jy31mGVU+Vqc(Y)&UF?_j$%&UKF{?+ymQH<z0kL<_bxedL~;@?+9{M(UF?*lJJ
z_P!W->hrhJ+|Pn*QQP(K-ALSj9Dn%C$$#?xWMsGc+#C7zQD}$o&8>{@1MvCuZ(z?a
z%D?oz9v_w>JvT%?Jb+xu_%OcsI<z%V@Bb0SkDm1#LB<j?-hl2;F$a(G_Zh}@S7gU)
zXkVxOU-;XIWT@X1`aXcHpM&;jG@bxoJWgN0>Js05QJ;~>-=gXFSJ?dfFn%7rhrwKA
z??K{E?MGw}<9#`H9OHQ#T}Pv_y`DKs&}S`@nS!2y_XpVY;z-Uv@6%U_V&JFg^KJTm
zVxNzH7}+kJk1#h!cz+6cZ>0az_*!)Ru0X!<S^B;V`nS>bMAW{`cwfL=J&yd3Kz~&<
zZ~w%*{{P1BMRu$I8yUxg8XNN8h_4&8eT=`#KA*jcw$I`BwJ2s@g3X`BU*%|Qe}gU8
zB4-WSSC|(cl<fD>NBC2WcZfcE?l2$sMdNp{?Yr3a0Qzp`FOCfHEHX!5jd=8~-$(Jo
zlg!2Yu;quydI#^XkLL3IJg;KA_})Ojry}1x#kk%H{m-H~d^)mUeAh-iS4TF!2cBdk
z?=f`zAd)9u{r-`*GuZcO<i8o;B*2f+r;h(0g!c<k-+xE$d-1dS%)|fDNY8s1Yl->!
zCjL>|&5=!d{?<NS?}~hI2D|RYSGQryLyWf>$ru0YkWq@{&cGwudHB8z?Uu;*qTP;-
VAByJwAU5kik$nz7`}je>{|1;FG}!<E

literal 0
HcmV?d00001

diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index 0859199fe2..5fce56bd90 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -388,13 +388,15 @@ LogicalResult IRBuilder::loadIntrinsic(ModuleOp module, StringRef intrinName) {
 }
 
 template <typename T>
-DenseElementsAttr createArrayAttr(const std::vector<T> &values, Type eleTy) {
+DenseElementsAttr createDenseElementsAttr(const std::vector<T> &values,
+                                          Type eleTy) {
   auto newValues = ArrayRef<T>(values.data(), values.size());
   auto tensorTy = RankedTensorType::get(values.size(), eleTy);
   return DenseElementsAttr::get(tensorTy, newValues);
 }
 
-DenseElementsAttr createArrayAttr(const std::vector<bool> &values, Type eleTy) {
+DenseElementsAttr createDenseElementsAttr(const std::vector<bool> &values,
+                                          Type eleTy) {
   std::vector<std::byte> converted;
   for (auto b : values) {
     converted.push_back(std::byte(b));
@@ -417,7 +419,7 @@ cc::GlobalOp buildVectorOfConstantElements(Location loc, ModuleOp module,
   builder.setInsertionPointToEnd(module.getBody());
   auto globalTy = cc::ArrayType::get(ctx, eleTy, values.size());
 
-  auto arrayAttr = createArrayAttr(values, eleTy);
+  auto arrayAttr = createDenseElementsAttr(values, eleTy);
   return builder.create<cudaq::cc::GlobalOp>(loc, globalTy, name, arrayAttr,
                                              /*constant=*/true,
                                              /*external=*/false);
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index 0b5aa6d23a..881625db21 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -47,7 +47,6 @@ add_cudaq_library(OptTransforms
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
   RegToMem.cpp
-  StateDecomposer.cpp
   StatePreparation.cpp
 
   DEPENDS
diff --git a/lib/Optimizer/Transforms/ConstPropComplex.cpp b/lib/Optimizer/Transforms/ConstPropComplex.cpp
index 7439b44a4b..d1ffc8d5af 100644
--- a/lib/Optimizer/Transforms/ConstPropComplex.cpp
+++ b/lib/Optimizer/Transforms/ConstPropComplex.cpp
@@ -29,45 +29,6 @@ using namespace mlir;
 
 namespace {
 
-// Replace array ptr casts that throw away the size by a cast to element
-// pointer.
-//
-//%1 = cc.cast %0 : (!cc.ptr<!cc.array<complex<f32> x 4>>) ->
-//! cc.ptr<!cc.array<complex<f32> x ?>>
-// ->
-//%1 = cc.cast %0 : (!cc.ptr<!cc.array<complex<f32> x 4>>) ->
-//! cc.ptr<complex<f32>>
-class CastArrayPtrPattern : public OpRewritePattern<cudaq::cc::CastOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(cudaq::cc::CastOp cast,
-                                PatternRewriter &rewriter) const override {
-
-    auto fromTy = cast.getOperand().getType();
-    auto toTy = cast.getType();
-
-    if (auto ptrFromTy = dyn_cast<cudaq::cc::PointerType>(fromTy)) {
-      if (auto arrayFromTy =
-              dyn_cast<cudaq::cc::ArrayType>(ptrFromTy.getElementType())) {
-        if (auto ptrToTy = dyn_cast<cudaq::cc::PointerType>(toTy)) {
-          if (auto arrayToTy =
-                  dyn_cast<cudaq::cc::ArrayType>(ptrToTy.getElementType())) {
-            if (arrayFromTy.getElementType() == arrayToTy.getElementType()) {
-              auto eleTy = arrayFromTy.getElementType();
-              auto elePtrType = cudaq::cc::PointerType::get(eleTy);
-              rewriter.replaceOpWithNewOp<cudaq::cc::CastOp>(cast, elePtrType,
-                                                             cast.getOperand());
-              return success();
-            }
-          }
-        }
-      }
-    }
-    return failure();
-  }
-};
-
 // Fold complex.create ops if the arguments are constants.
 class ComplexCreatePattern : public OpRewritePattern<complex::CreateOp> {
 public:
@@ -91,6 +52,35 @@ class ComplexCreatePattern : public OpRewritePattern<complex::CreateOp> {
   }
 };
 
+// Fold floating point cast ops if the argument is constant.
+class FloatCastPattern : public OpRewritePattern<cudaq::cc::CastOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(cudaq::cc::CastOp cast,
+                                PatternRewriter &rewriter) const override {
+    auto val = cast.getOperand();
+    auto valCon = val.getDefiningOp<arith::ConstantFloatOp>();
+    if (valCon) {
+      auto fTy = dyn_cast<FloatType>(cast.getType());
+      if (fTy == rewriter.getF64Type()) {
+        auto v = valCon.value().convertToFloat();
+        auto fTy = dyn_cast<FloatType>(cast.getType());
+        rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(
+            cast, APFloat{static_cast<double>(v)}, fTy);
+        return success();
+      } else if (fTy == rewriter.getF32Type()) {
+        auto v = valCon.value().convertToDouble();
+        auto fTy = dyn_cast<FloatType>(cast.getType());
+        rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(
+            cast, APFloat{static_cast<float>(v)}, fTy);
+        return success();
+      }
+    }
+    return failure();
+  }
+};
+
 // Fold arith.trunc ops if the argument is constant.
 class FloatTruncatePattern : public OpRewritePattern<arith::TruncFOp> {
 public:
@@ -189,11 +179,11 @@ class ConstPropComplexPass
       std::string funcName = func.getName().str();
       RewritePatternSet patterns(ctx);
       patterns.insert<ComplexCreatePattern>(ctx);
+      patterns.insert<FloatCastPattern>(ctx);
       patterns.insert<FloatExtendPattern>(ctx);
       patterns.insert<FloatTruncatePattern>(ctx);
       patterns.insert<ComplexRePattern>(ctx);
       patterns.insert<ComplexImPattern>(ctx);
-      patterns.insert<CastArrayPtrPattern>(ctx);
 
       LLVM_DEBUG(llvm::dbgs()
                  << "Before lifting constant array: " << func << '\n');
diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
index 2336e6a97d..9de57fad81 100644
--- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
+++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
@@ -180,13 +180,7 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
     }
 
     for (auto *op : toErase) {
-      if (op->getUses().empty()) {
-        rewriter.eraseOp(op);
-      } else {
-        op->emitOpError("LiftArrayAlloc failed to remove cc::AllocOp "
-                        "or its uses.");
-        return failure();
-      }
+      rewriter.eraseOp(op);
     }
     return success();
   }
@@ -259,7 +253,9 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
       return theStore;
     };
 
-    auto ptrArrEleTy = cudaq::cc::PointerType::get(arrTy.getElementType());
+    auto unsizedArrTy = cudaq::cc::ArrayType::get(arrEleTy);
+    auto ptrUnsizedArrTy = cudaq::cc::PointerType::get(unsizedArrTy);
+    auto ptrArrEleTy = cudaq::cc::PointerType::get(arrEleTy);
     for (auto &use : alloc->getUses()) {
       // All uses *must* be a degenerate cc.cast, cc.compute_ptr, or
       // cc.init_state.
@@ -278,6 +274,7 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
         return false;
       }
       if (auto cast = dyn_cast<cudaq::cc::CastOp>(op)) {
+        // Process casts that are used in store ops.
         if (cast.getType() == ptrArrEleTy) {
           if (auto w = getWriteOp(cast, 0))
             if (!scoreboard[0]) {
@@ -286,6 +283,20 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
             }
           return false;
         }
+        // Process casts that are used in quake.init_state.
+        if (cast.getType() == ptrUnsizedArrTy) {
+          if (getWriteOp(cast, 0))
+            LLVM_DEBUG(
+                llvm::dbgs()
+                << "unexpected use of array size removing cast in a store"
+                << *op << '\n');
+          continue;
+        }
+        if (isa<quake::InitializeStateOp>(op)) {
+          toGlobalUses.push_back(op);
+          toGlobal = true;
+          continue;
+        }
         LLVM_DEBUG(llvm::dbgs() << "unexpected cast: " << *op << '\n');
         toGlobalUses.push_back(op);
         toGlobal = true;
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index d8cc3e22d0..e309f86214 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -104,31 +104,22 @@ Value makeComplexElement(OpBuilder &builder, Location argLoc,
   return builder.create<complex::ConstantOp>(argLoc, eleTy, complexVal);
 }
 
-template <typename T>
-std::tuple<Value, Value>
-createArrayInMemory(OpBuilder &builder, ModuleOp module, unsigned &counter,
-                    BlockArgument argument, std::vector<T> &vec,
-                    cudaq::cc::ArrayType arrTy) {
-  auto argLoc = argument.getLoc();
-
-  // Stick global at end of Module.
-  std::string symbol = "__nvqpp_rodata_init_state." + std::to_string(counter++);
-
-  cudaq::IRBuilder irBuilder(builder);
-  irBuilder.genVectorOfConstants(argLoc, module, symbol, vec);
-
-  builder.setInsertionPointToStart(argument.getOwner());
-  auto buffer = builder.create<cudaq::cc::AddressOfOp>(
-      argLoc, cudaq::cc::PointerType::get(arrTy), symbol);
-  auto data = builder.create<cudaq::cc::LoadOp>(argLoc, arrTy, buffer);
-  return {buffer, data};
+/// Returns true if and only if a `cudaq::cc::StdvecDataOp` user of \p argument
+/// is itself used by a `quake.init_state` operation.
+static bool hasInitStateUse(BlockArgument argument) {
+  for (auto *argUser : argument.getUsers())
+    if (auto stdvecDataOp = dyn_cast<cudaq::cc::StdvecDataOp>(argUser))
+      for (auto *dataUser : stdvecDataOp->getUsers())
+        if (isa<quake::InitializeStateOp>(dataUser))
+          return true;
+  return false;
 }
 
-template <typename ELETY, typename T, typename MAKER>
+template <typename ELETY, typename T, typename ATTR, typename MAKER>
 LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument, std::vector<T> &vec,
-                         MAKER makeElementValue) {
+                         ATTR arrayAttr, MAKER makeElementValue) {
   auto *ctx = builder.getContext();
   auto argTy = argument.getType();
   assert(isa<cudaq::cc::StdvecType>(argTy));
@@ -138,24 +129,43 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
   auto argLoc = argument.getLoc();
 
   auto arrTy = cudaq::cc::ArrayType::get(ctx, eleTy, vec.size());
+  auto conArray = builder.create<cudaq::cc::ConstantArrayOp>(
+      argLoc, cudaq::cc::ArrayType::get(ctx, eleTy, vec.size()), arrayAttr);
+
   std::optional<Value> arrayInMemory;
-  std::optional<Value> conArray;
   auto ptrEleTy = cudaq::cc::PointerType::get(eleTy);
   bool generateNewValue = false;
 
   // Helper function that materializes the array in memory.
-  auto getArrayInMemory = [&]() -> std::tuple<Value, Value> {
+  auto getArrayInMemory = [&]() -> Value {
     if (arrayInMemory)
-      return {*arrayInMemory, *conArray};
+      return *arrayInMemory;
     OpBuilder::InsertionGuard guard(builder);
-    auto [buffer, data] =
-        createArrayInMemory(builder, module, counter, argument, vec, arrTy);
+    auto argLoc = argument.getLoc();
+
+    Value buffer;
+    if (hasInitStateUse(argument)) {
+      // Stick global at end of Module.
+      std::string symbol =
+          "__nvqpp_rodata_init_state." + std::to_string(counter++);
+
+      cudaq::IRBuilder irBuilder(builder);
+      irBuilder.genVectorOfConstants(argLoc, module, symbol, vec);
+
+      builder.setInsertionPointToStart(argument.getOwner());
+      buffer = builder.create<cudaq::cc::AddressOfOp>(
+          argLoc, cudaq::cc::PointerType::get(arrTy), symbol);
+    } else {
+      builder.setInsertionPointAfter(conArray);
+      buffer = builder.create<cudaq::cc::AllocaOp>(argLoc, arrTy);
+      builder.create<cudaq::cc::StoreOp>(argLoc, conArray, buffer);
+    }
+
     auto ptrArrEleTy =
         cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy));
     Value res = builder.create<cudaq::cc::CastOp>(argLoc, ptrArrEleTy, buffer);
     arrayInMemory = res;
-    conArray = data;
-    return {res, data};
+    return res;
   };
 
   auto replaceLoads = [&](cudaq::cc::ComputePtrOp elePtrOp,
@@ -208,11 +218,11 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
           if (index == cudaq::cc::ComputePtrOp::kDynamicIndex) {
             OpBuilder::InsertionGuard guard(builder);
             builder.setInsertionPoint(elePtrOp);
-            auto [memArr, conArray] = getArrayInMemory();
             Value getEle = builder.create<cudaq::cc::ExtractValueOp>(
                 elePtrOp.getLoc(), eleTy, conArray,
                 elePtrOp.getDynamicIndices()[0]);
             if (failed(replaceLoads(elePtrOp, getEle))) {
+              Value memArr = getArrayInMemory();
               builder.setInsertionPoint(elePtrOp);
               Value newComputedPtr = builder.create<cudaq::cc::ComputePtrOp>(
                   argLoc, ptrEleTy, memArr,
@@ -225,7 +235,7 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
           Value runtimeParam =
               makeElementValue(builder, argLoc, vec[index], eleTy);
           if (failed(replaceLoads(elePtrOp, runtimeParam))) {
-            auto [memArr, _] = getArrayInMemory();
+            Value memArr = getArrayInMemory();
             OpBuilder::InsertionGuard guard(builder);
             builder.setInsertionPoint(elePtrOp);
             Value newComputedPtr = builder.create<cudaq::cc::ComputePtrOp>(
@@ -240,7 +250,7 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
       // Check if there were other uses of `vec.data()` and simply forward the
       // constant array as materialized in memory.
       if (replaceOtherUses) {
-        auto [memArr, _] = getArrayInMemory();
+        Value memArr = getArrayInMemory();
         stdvecDataOp.replaceAllUsesWith(memArr);
       }
       continue;
@@ -252,7 +262,7 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
     generateNewValue = true;
   }
   if (generateNewValue) {
-    auto [memArr, _] = getArrayInMemory();
+    Value memArr = getArrayInMemory();
     OpBuilder::InsertionGuard guard(builder);
     builder.setInsertionPointAfter(memArr.getDefiningOp());
     Value size = builder.create<arith::ConstantIntOp>(argLoc, vec.size(), 64);
@@ -271,11 +281,16 @@ std::vector<std::int32_t> asI32(const std::vector<A> &v) {
   return result;
 }
 
+// TODO: consider using DenseArrayAttr here instead. NB: such a change may alter
+// the output of the constant array op.
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument, std::vector<bool> &vec) {
-  return synthesizeVectorArgument<IntegerType>(
-      builder, module, counter, argument, vec, makeIntegerElement<bool>);
+
+  auto arrayAttr = builder.getI32ArrayAttr(asI32(vec));
+  return synthesizeVectorArgument<IntegerType>(builder, module, counter,
+                                               argument, vec, arrayAttr,
+                                               makeIntegerElement<bool>);
 }
 
 static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
@@ -283,8 +298,10 @@ static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
                                               unsigned &counter,
                                               BlockArgument argument,
                                               std::vector<std::int8_t> &vec) {
-  return synthesizeVectorArgument<IntegerType>(
-      builder, module, counter, argument, vec, makeIntegerElement<std::int8_t>);
+  auto arrayAttr = builder.getI32ArrayAttr(asI32(vec));
+  return synthesizeVectorArgument<IntegerType>(builder, module, counter,
+                                               argument, vec, arrayAttr,
+                                               makeIntegerElement<std::int8_t>);
 }
 
 static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
@@ -292,8 +309,9 @@ static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
                                               unsigned &counter,
                                               BlockArgument argument,
                                               std::vector<std::int16_t> &vec) {
+  auto arrayAttr = builder.getI32ArrayAttr(asI32(vec));
   return synthesizeVectorArgument<IntegerType>(
-      builder, module, counter, argument, vec,
+      builder, module, counter, argument, vec, arrayAttr,
       makeIntegerElement<std::int16_t>);
 }
 
@@ -302,8 +320,9 @@ static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
                                               unsigned &counter,
                                               BlockArgument argument,
                                               std::vector<std::int32_t> &vec) {
+  auto arrayAttr = builder.getI32ArrayAttr(vec);
   return synthesizeVectorArgument<IntegerType>(
-      builder, module, counter, argument, vec,
+      builder, module, counter, argument, vec, arrayAttr,
       makeIntegerElement<std::int32_t>);
 }
 
@@ -312,39 +331,59 @@ static LogicalResult synthesizeVectorArgument(OpBuilder &builder,
                                               unsigned &counter,
                                               BlockArgument argument,
                                               std::vector<std::int64_t> &vec) {
+  auto arrayAttr = builder.getI64ArrayAttr(vec);
   return synthesizeVectorArgument<IntegerType>(
-      builder, module, counter, argument, vec,
+      builder, module, counter, argument, vec, arrayAttr,
       makeIntegerElement<std::int64_t>);
 }
 
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument, std::vector<float> &vec) {
+  auto arrayAttr = builder.getF32ArrayAttr(vec);
   return synthesizeVectorArgument<FloatType>(builder, module, counter, argument,
-                                             vec, makeFloatElement<float>);
+                                             vec, arrayAttr,
+                                             makeFloatElement<float>);
 }
 
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument, std::vector<double> &vec) {
+  auto arrayAttr = builder.getF64ArrayAttr(vec);
   return synthesizeVectorArgument<FloatType>(builder, module, counter, argument,
-                                             vec, makeFloatElement<double>);
+                                             vec, arrayAttr,
+                                             makeFloatElement<double>);
 }
 
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument,
                          std::vector<std::complex<float>> &vec) {
-  return synthesizeVectorArgument<ComplexType>(
-      builder, module, counter, argument, vec, makeComplexElement<float>);
+
+  std::vector<float> vec2;
+  for (auto c : vec) {
+    vec2.push_back(c.real());
+    vec2.push_back(c.imag());
+  }
+  auto arrayAttr = builder.getF32ArrayAttr(vec2);
+  return synthesizeVectorArgument<ComplexType>(builder, module, counter,
+                                               argument, vec, arrayAttr,
+                                               makeComplexElement<float>);
 }
 
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument,
                          std::vector<std::complex<double>> &vec) {
-  return synthesizeVectorArgument<ComplexType>(
-      builder, module, counter, argument, vec, makeComplexElement<double>);
+  std::vector<double> vec2;
+  for (auto c : vec) {
+    vec2.push_back(c.real());
+    vec2.push_back(c.imag());
+  }
+  auto arrayAttr = builder.getF64ArrayAttr(vec2);
+  return synthesizeVectorArgument<ComplexType>(builder, module, counter,
+                                               argument, vec, arrayAttr,
+                                               makeComplexElement<double>);
 }
 
 namespace {
diff --git a/lib/Optimizer/Transforms/StateDecomposer.cpp b/lib/Optimizer/Transforms/StateDecomposer.cpp
deleted file mode 100644
index 62ca8a9d73..0000000000
--- a/lib/Optimizer/Transforms/StateDecomposer.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/****************************************************************-*- C++ -*-****
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-#include "StateDecomposer.h"
-
-namespace cudaq::details {
-
-std::vector<std::size_t> grayCode(std::size_t numBits) {
-  std::vector<std::size_t> result(1ULL << numBits);
-  for (std::size_t i = 0; i < (1ULL << numBits); ++i)
-    result[i] = ((i >> 1) ^ i);
-  return result;
-}
-
-std::vector<std::size_t> getControlIndices(std::size_t numBits) {
-  auto code = grayCode(numBits);
-  std::vector<std::size_t> indices;
-  for (auto i = 0u; i < code.size(); ++i) {
-    // The position of the control in the lth CNOT gate is set to match
-    // the position where the lth and (l + 1)th bit strings g[l] and g[l+1] of
-    // the binary reflected Gray code differ.
-    auto position = std::log2(code[i] ^ code[(i + 1) % code.size()]);
-    // N.B: In CUDA Quantum we write the least significant bit (LSb) on the left
-    //
-    //  lsb -v
-    //       001
-    //         ^- msb
-    //
-    // Meaning that the bitstring 001 represents the number four instead of one.
-    // The above position calculation uses the 'normal' convention of writing
-    // numbers with the LSb on the left.
-    //
-    // Now, what we need to find out is the position of the 1 in the bitstring.
-    // If we take LSb as being position 0, then for the normal convention its
-    // position will be 0. Using CUDA Quantum convention it will be 2. Hence,
-    // we need to convert the position we find using:
-    //
-    // numBits - position - 1
-    //
-    // The extra -1 is to account for indices starting at 0. Using the above
-    // examples:
-    //
-    // bitstring: 001
-    // numBits: 3
-    // position: 0
-    //
-    // We have the converted position: 2, which is what we need.
-    indices.emplace_back(numBits - position - 1);
-  }
-  return indices;
-}
-
-std::vector<double> convertAngles(const std::span<double> alphas) {
-  // Implements Eq. (3) from https://arxiv.org/pdf/quant-ph/0407010.pdf
-  //
-  // N.B: The paper does fails to explicitly define what is the dot operator in
-  // the exponent of -1. Ref. 3 solves the mystery: its the bitwise inner
-  // product.
-  auto bitwiseInnerProduct = [](std::size_t a, std::size_t b) {
-    auto product = a & b;
-    auto sumOfProducts = 0;
-    while (product) {
-      sumOfProducts += product & 0b1 ? 1 : 0;
-      product = product >> 1;
-    }
-    return sumOfProducts;
-  };
-  std::vector<double> thetas(alphas.size(), 0);
-  for (std::size_t i = 0u; i < alphas.size(); ++i) {
-    for (std::size_t j = 0u; j < alphas.size(); ++j)
-      thetas[i] +=
-          bitwiseInnerProduct(j, ((i >> 1) ^ i)) & 0b1 ? -alphas[j] : alphas[j];
-    thetas[i] /= alphas.size();
-  }
-  return thetas;
-}
-
-std::vector<double> getAlphaZ(const std::span<double> data,
-                              std::size_t numQubits, std::size_t k) {
-  // Implements Eq. (5) from https://arxiv.org/pdf/quant-ph/0407010.pdf
-  std::vector<double> angles;
-  double divisor = static_cast<double>(1ULL << (k - 1));
-  for (std::size_t j = 1; j <= (1ULL << (numQubits - k)); ++j) {
-    double angle = 0.0;
-    for (std::size_t l = 1; l <= (1ULL << (k - 1)); ++l)
-      // N.B: There is an extra '-1' on these indices computations to account
-      // for the fact that our indices start at 0.
-      angle += data[(2 * j - 1) * (1 << (k - 1)) + l - 1] -
-               data[(2 * j - 2) * (1 << (k - 1)) + l - 1];
-    angles.push_back(angle / divisor);
-  }
-  return angles;
-}
-
-std::vector<double> getAlphaY(const std::span<double> data,
-                              std::size_t numQubits, std::size_t k) {
-  // Implements Eq. (8) from https://arxiv.org/pdf/quant-ph/0407010.pdf
-  // N.B: There is an extra '-1' on these indices computations to account for
-  // the fact that our indices start at 0.
-  std::vector<double> angles;
-  for (std::size_t j = 1; j <= (1ULL << (numQubits - k)); ++j) {
-    double numerator = 0;
-    for (std::size_t l = 1; l <= (1ULL << (k - 1)); ++l) {
-      numerator +=
-          std::pow(std::abs(data[(2 * j - 1) * (1 << (k - 1)) + l - 1]), 2);
-    }
-
-    double denominator = 0;
-    for (std::size_t l = 1; l <= (1ULL << k); ++l) {
-      denominator += std::pow(std::abs(data[(j - 1) * (1 << k) + l - 1]), 2);
-    }
-
-    if (denominator == 0.0) {
-      assert(numerator == 0.0 &&
-             "If the denominator is zero, the numerator must also be zero.");
-      angles.push_back(0.0);
-      continue;
-    }
-    angles.push_back(2.0 * std::asin(std::sqrt(numerator / denominator)));
-  }
-  return angles;
-}
-} // namespace cudaq::details
diff --git a/lib/Optimizer/Transforms/StateDecomposer.h b/lib/Optimizer/Transforms/StateDecomposer.h
deleted file mode 100644
index a09b8a64e9..0000000000
--- a/lib/Optimizer/Transforms/StateDecomposer.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-#include "PassDetails.h"
-#include "cudaq/Optimizer/Builder/Runtime.h"
-#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
-#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
-#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
-#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
-#include "cudaq/Optimizer/Transforms/Passes.h"
-#include "llvm/Support/Debug.h"
-#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Complex/IR/Complex.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Math/IR/Math.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Target/LLVMIR/TypeToLLVM.h"
-#include "mlir/Transforms/DialectConversion.h"
-#include "mlir/Transforms/RegionUtils.h"
-#include <span>
-
-namespace cudaq::details {
-
-/// @brief Converts angles of a uniformly controlled rotation to angles of
-/// non-controlled rotations.
-std::vector<double> convertAngles(const std::span<double> alphas);
-
-/// @brief Return the control indices dictated by the gray code implementation.
-///
-/// Here, numBits is the number of controls.
-std::vector<std::size_t> getControlIndices(std::size_t numBits);
-
-/// @brief Return angles required to implement a uniformly controlled z-rotation
-/// on the `kth` qubit.
-std::vector<double> getAlphaZ(const std::span<double> data,
-                              std::size_t numQubits, std::size_t k);
-
-/// @brief Return angles required to implement a uniformly controlled y-rotation
-/// on the `kth` qubit.
-std::vector<double> getAlphaY(const std::span<double> data,
-                              std::size_t numQubits, std::size_t k);
-} // namespace cudaq::details
-
-class StateGateBuilder {
-public:
-  StateGateBuilder(mlir::OpBuilder &b, mlir::Location &l, mlir::Value &q)
-      : builder(b), loc(l), qubits(q) {}
-
-  template <typename Op>
-  void applyRotationOp(double theta, std::size_t target) {
-    auto qubit = createQubitRef(target);
-    auto thetaValue = createAngleValue(theta);
-    builder.create<Op>(loc, thetaValue, mlir::ValueRange{}, qubit);
-  };
-
-  void applyX(std::size_t control, std::size_t target) {
-    auto qubitC = createQubitRef(control);
-    auto qubitT = createQubitRef(target);
-    builder.create<quake::XOp>(loc, qubitC, qubitT);
-  };
-
-private:
-  mlir::Value createQubitRef(std::size_t index) {
-    if (qubitRefs.contains(index)) {
-      return qubitRefs[index];
-    }
-
-    auto indexValue = builder.create<mlir::arith::ConstantIntOp>(
-        loc, index, builder.getIntegerType(64));
-    auto ref = builder.create<quake::ExtractRefOp>(loc, qubits, indexValue);
-    qubitRefs[index] = ref;
-    return ref;
-  }
-
-  mlir::Value createAngleValue(double angle) {
-    return builder.create<mlir::arith::ConstantFloatOp>(
-        loc, llvm::APFloat{angle}, builder.getF64Type());
-  }
-
-  mlir::OpBuilder &builder;
-  mlir::Location &loc;
-  mlir::Value &qubits;
-
-  std::unordered_map<std::size_t, mlir::Value> qubitRefs =
-      std::unordered_map<std::size_t, mlir::Value>();
-};
-
-class StateDecomposer {
-public:
-  StateDecomposer(StateGateBuilder &b, std::span<std::complex<double>> a)
-      : builder(b), amplitudes(a), numQubits(log2(a.size())) {}
-
-  /// @brief Decompose the input state vector data to a set of controlled
-  /// operations and rotations. This function takes as input a `OpBuilder`
-  /// and appends the operations of the decomposition to its internal
-  /// representation. This implementation follows the algorithm defined in
-  /// `https://arxiv.org/pdf/quant-ph/0407010.pdf`.
-  void decompose() {
-
-    // Decompose the state into phases and magnitudes.
-    bool needsPhaseEqualization = false;
-    std::vector<double> phases;
-    std::vector<double> magnitudes;
-    for (const auto &a : amplitudes) {
-      phases.push_back(std::arg(a));
-      magnitudes.push_back(std::abs(a));
-      // FIXME: remove magic number.
-      needsPhaseEqualization |= std::abs(phases.back()) > 1e-10;
-    }
-
-    // N.B: The algorithm, as described in the paper, creates a circuit that
-    // begins with a target state and brings it to the all zero state. Hence,
-    // this implementation do the two steps described in Section III in reverse
-    // order.
-
-    // Apply uniformly controlled y-rotations, the construction in Eq. (4).
-    for (std::size_t j = 1; j <= numQubits; ++j) {
-      auto k = numQubits - j + 1;
-      auto numControls = j - 1;
-      auto target = j - 1;
-      auto alphaYk = cudaq::details::getAlphaY(magnitudes, numQubits, k);
-      applyRotation<quake::RyOp>(alphaYk, numControls, target);
-    }
-
-    if (!needsPhaseEqualization)
-      return;
-
-    // Apply uniformly controlled z-rotations, the construction in Eq. (4).
-    for (std::size_t j = 1; j <= numQubits; ++j) {
-      auto k = numQubits - j + 1;
-      auto numControls = j - 1;
-      auto target = j - 1;
-      auto alphaZk = cudaq::details::getAlphaZ(phases, numQubits, k);
-      if (alphaZk.empty())
-        continue;
-      applyRotation<quake::RzOp>(alphaZk, numControls, target);
-    }
-  }
-
-private:
-  /// @brief Apply a uniformly controlled rotation on the target qubit.
-  template <typename Op>
-  void applyRotation(const std::span<double> alphas, std::size_t numControls,
-                     std::size_t target) {
-
-    // In our model the index 1 (i.e. |01>) in quantum state data
-    // corresponds to qubits[0] = 1 and qubits[1] = 0.
-    // Revert the order of qubits as the state preparation algorithm
-    // we use assumes the opposite.
-    auto qubitIndex = [&](std::size_t i) { return numQubits - i - 1; };
-
-    auto thetas = cudaq::details::convertAngles(alphas);
-    if (numControls == 0) {
-      builder.applyRotationOp<Op>(thetas[0], qubitIndex(target));
-      return;
-    }
-
-    auto controlIndices = cudaq::details::getControlIndices(numControls);
-    assert(thetas.size() == controlIndices.size());
-    for (auto [i, c] : llvm::enumerate(controlIndices)) {
-      builder.applyRotationOp<Op>(thetas[i], qubitIndex(target));
-      builder.applyX(qubitIndex(c), qubitIndex(target));
-    }
-  }
-
-  StateGateBuilder &builder;
-  std::span<std::complex<double>> amplitudes;
-  std::size_t numQubits;
-};
diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
index 86fad793a5..f7a104b2ae 100644
--- a/lib/Optimizer/Transforms/StatePreparation.cpp
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -7,7 +7,6 @@
  ******************************************************************************/
 
 #include "PassDetails.h"
-#include "StateDecomposer.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
 #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
@@ -36,12 +35,257 @@ namespace cudaq::opt {
 
 using namespace mlir;
 
+namespace cudaq::details {
+
+std::vector<std::size_t> grayCode(std::size_t numBits) {
+  std::vector<std::size_t> result(1ULL << numBits);
+  for (std::size_t i = 0; i < (1ULL << numBits); ++i)
+    result[i] = ((i >> 1) ^ i);
+  return result;
+}
+
+std::vector<std::size_t> getControlIndices(std::size_t numBits) {
+  auto code = grayCode(numBits);
+  std::vector<std::size_t> indices;
+  for (auto i = 0u; i < code.size(); ++i) {
+    // The position of the control in the lth CNOT gate is set to match
+    // the position where the lth and (l + 1)th bit strings g[l] and g[l+1] of
+    // the binary reflected Gray code differ.
+    auto position = std::log2(code[i] ^ code[(i + 1) % code.size()]);
+    // N.B: In CUDA Quantum we write the least significant bit (LSb) on the left
+    //
+    //  lsb -v
+    //       001
+    //         ^- msb
+    //
+    // Meaning that the bitstring 001 represents the number four instead of one.
+    // The above position calculation uses the 'normal' convention of writing
+    // numbers with the LSb on the right.
+    //
+    // Now, what we need to find out is the position of the 1 in the bitstring.
+    // If we take LSb as being position 0, then for the normal convention its
+    // position will be 0. Using CUDA Quantum convention it will be 2. Hence,
+    // we need to convert the position we find using:
+    //
+    // numBits - position - 1
+    //
+    // The extra -1 is to account for indices starting at 0. Using the above
+    // examples:
+    //
+    // bitstring: 001
+    // numBits: 3
+    // position: 0
+    //
+    // We have the converted position: 2, which is what we need.
+    indices.emplace_back(numBits - position - 1);
+  }
+  return indices;
+}
+
+std::vector<double> convertAngles(const std::span<double> alphas) {
+  // Implements Eq. (3) from https://arxiv.org/pdf/quant-ph/0407010.pdf
+  //
+  // N.B: The paper fails to explicitly define what the dot operator in the
+  // exponent of -1 is. Ref. 3 solves the mystery: it's the bitwise inner
+  // product.
+  auto bitwiseInnerProduct = [](std::size_t a, std::size_t b) {
+    auto product = a & b;
+    auto sumOfProducts = 0;
+    while (product) {
+      sumOfProducts += product & 0b1 ? 1 : 0;
+      product = product >> 1;
+    }
+    return sumOfProducts;
+  };
+  std::vector<double> thetas(alphas.size(), 0);
+  for (std::size_t i = 0u; i < alphas.size(); ++i) {
+    for (std::size_t j = 0u; j < alphas.size(); ++j)
+      thetas[i] +=
+          bitwiseInnerProduct(j, ((i >> 1) ^ i)) & 0b1 ? -alphas[j] : alphas[j];
+    thetas[i] /= alphas.size();
+  }
+  return thetas;
+}
+
+std::vector<double> getAlphaZ(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k) {
+  // Implements Eq. (5) from https://arxiv.org/pdf/quant-ph/0407010.pdf
+  std::vector<double> angles;
+  double divisor = static_cast<double>(1ULL << (k - 1));
+  for (std::size_t j = 1; j <= (1ULL << (numQubits - k)); ++j) {
+    double angle = 0.0;
+    for (std::size_t l = 1; l <= (1ULL << (k - 1)); ++l)
+      // N.B: There is an extra '-1' on these indices computations to account
+      // for the fact that our indices start at 0.
+      angle += data[(2 * j - 1) * (1 << (k - 1)) + l - 1] -
+               data[(2 * j - 2) * (1 << (k - 1)) + l - 1];
+    angles.push_back(angle / divisor);
+  }
+  return angles;
+}
+
+std::vector<double> getAlphaY(const std::span<double> data,
+                              std::size_t numQubits, std::size_t k) {
+  // Implements Eq. (8) from https://arxiv.org/pdf/quant-ph/0407010.pdf
+  // N.B: There is an extra '-1' on these indices computations to account for
+  // the fact that our indices start at 0.
+  std::vector<double> angles;
+  for (std::size_t j = 1; j <= (1ULL << (numQubits - k)); ++j) {
+    double numerator = 0;
+    for (std::size_t l = 1; l <= (1ULL << (k - 1)); ++l) {
+      numerator +=
+          std::pow(std::abs(data[(2 * j - 1) * (1 << (k - 1)) + l - 1]), 2);
+    }
+
+    double denominator = 0;
+    for (std::size_t l = 1; l <= (1ULL << k); ++l) {
+      denominator += std::pow(std::abs(data[(j - 1) * (1 << k) + l - 1]), 2);
+    }
+
+    if (denominator == 0.0) {
+      assert(numerator == 0.0 &&
+             "If the denominator is zero, the numerator must also be zero.");
+      angles.push_back(0.0);
+      continue;
+    }
+    angles.push_back(2.0 * std::asin(std::sqrt(numerator / denominator)));
+  }
+  return angles;
+}
+} // namespace cudaq::details
+
+class StateGateBuilder {
+public:
+  StateGateBuilder(mlir::OpBuilder &b, mlir::Location &l, mlir::Value &q)
+      : builder(b), loc(l), qubits(q) {}
+
+  template <typename Op>
+  void applyRotationOp(double theta, std::size_t target) {
+    auto qubit = createQubitRef(target);
+    auto thetaValue = createAngleValue(theta);
+    builder.create<Op>(loc, thetaValue, mlir::ValueRange{}, qubit);
+  };
+
+  void applyX(std::size_t control, std::size_t target) {
+    auto qubitC = createQubitRef(control);
+    auto qubitT = createQubitRef(target);
+    builder.create<quake::XOp>(loc, qubitC, qubitT);
+  };
+
+private:
+  mlir::Value createQubitRef(std::size_t index) {
+    if (qubitRefs.contains(index)) {
+      return qubitRefs[index];
+    }
+
+    auto indexValue = builder.create<mlir::arith::ConstantIntOp>(
+        loc, index, builder.getIntegerType(64));
+    auto ref = builder.create<quake::ExtractRefOp>(loc, qubits, indexValue);
+    qubitRefs[index] = ref;
+    return ref;
+  }
+
+  mlir::Value createAngleValue(double angle) {
+    return builder.create<mlir::arith::ConstantFloatOp>(
+        loc, llvm::APFloat{angle}, builder.getF64Type());
+  }
+
+  mlir::OpBuilder &builder;
+  mlir::Location &loc;
+  mlir::Value &qubits;
+
+  std::unordered_map<std::size_t, mlir::Value> qubitRefs =
+      std::unordered_map<std::size_t, mlir::Value>();
+};
+
+class StateDecomposer {
+public:
+  StateDecomposer(StateGateBuilder &b, std::span<std::complex<double>> a)
+      : builder(b), amplitudes(a), numQubits(log2(a.size())) {}
+
+  /// @brief Decompose the input state vector data into a set of controlled
+  /// operations and rotations. The operations of the decomposition are
+  /// emitted through the `StateGateBuilder` supplied at construction.
+  /// This implementation follows the algorithm defined in
+  /// `https://arxiv.org/pdf/quant-ph/0407010.pdf`.
+  void decompose() {
+
+    // Decompose the state into phases and magnitudes.
+    bool needsPhaseEqualization = false;
+    std::vector<double> phases;
+    std::vector<double> magnitudes;
+    for (const auto &a : amplitudes) {
+      phases.push_back(std::arg(a));
+      magnitudes.push_back(std::abs(a));
+      // FIXME: remove magic number.
+      needsPhaseEqualization |= std::abs(phases.back()) > 1e-10;
+    }
+
+    // N.B: The algorithm, as described in the paper, creates a circuit that
+    // begins with a target state and brings it to the all zero state. Hence,
+    // this implementation does the two steps described in Section III in
+    // reverse order.
+
+    // Apply uniformly controlled y-rotations, the construction in Eq. (4).
+    for (std::size_t j = 1; j <= numQubits; ++j) {
+      auto k = numQubits - j + 1;
+      auto numControls = j - 1;
+      auto target = j - 1;
+      auto alphaYk = cudaq::details::getAlphaY(magnitudes, numQubits, k);
+      applyRotation<quake::RyOp>(alphaYk, numControls, target);
+    }
+
+    if (!needsPhaseEqualization)
+      return;
+
+    // Apply uniformly controlled z-rotations, the construction in Eq. (4).
+    for (std::size_t j = 1; j <= numQubits; ++j) {
+      auto k = numQubits - j + 1;
+      auto numControls = j - 1;
+      auto target = j - 1;
+      auto alphaZk = cudaq::details::getAlphaZ(phases, numQubits, k);
+      if (alphaZk.empty())
+        continue;
+      applyRotation<quake::RzOp>(alphaZk, numControls, target);
+    }
+  }
+
+private:
+  /// @brief Apply a uniformly controlled rotation on the target qubit.
+  template <typename Op>
+  void applyRotation(const std::span<double> alphas, std::size_t numControls,
+                     std::size_t target) {
+
+    // In our model the index 1 (i.e. |01>) in quantum state data
+    // corresponds to qubits[0] = 1 and qubits[1] = 0.
+    // Reverse the order of qubits as the state preparation algorithm
+    // we use assumes the opposite.
+    auto qubitIndex = [&](std::size_t i) { return numQubits - i - 1; };
+
+    auto thetas = cudaq::details::convertAngles(alphas);
+    if (numControls == 0) {
+      builder.applyRotationOp<Op>(thetas[0], qubitIndex(target));
+      return;
+    }
+
+    auto controlIndices = cudaq::details::getControlIndices(numControls);
+    assert(thetas.size() == controlIndices.size());
+    for (auto [i, c] : llvm::enumerate(controlIndices)) {
+      builder.applyRotationOp<Op>(thetas[i], qubitIndex(target));
+      builder.applyX(qubitIndex(c), qubitIndex(target));
+    }
+  }
+
+  StateGateBuilder &builder;
+  std::span<std::complex<double>> amplitudes;
+  std::size_t numQubits;
+};
+
 /// Replace a qubit initialization from vectors with quantum gates.
 /// For example:
 ///
-///
 /// Before StatePreparation (state-prep):
-///
+/// ```
 /// module {
 ///   func.func @foo() attributes {
 ///     %0 = cc.address_of @foo.rodata_0 : !cc.ptr<!cc.array<complex<f32> x 4>>
@@ -54,9 +298,10 @@ using namespace mlir;
 ///      (0.000000e+00,0.000000e+00)]> : tensor<4xcomplex<f32>>) :
 ///    !cc.array<complex<f32> x 4>
 /// }
+/// ```
 ///
 /// After StatePreparation (state-prep):
-///
+/// ```
 /// module {
 ///   func.func @foo() attributes {
 ///     %0 = quake.alloca !quake.veq<2>
@@ -75,22 +320,16 @@ using namespace mlir;
 ///     return
 ///   }
 /// }
+/// ```
 
 namespace {
 
 std::vector<std::complex<double>>
-readConstantArray(mlir::OpBuilder &builder, cudaq::cc::GlobalOp &global) {
+readGlobalConstantArray(mlir::OpBuilder &builder, cudaq::cc::GlobalOp &global) {
   std::vector<std::complex<double>> result{};
 
   auto attr = global.getValue();
-  auto type = global.getType().getElementType();
-
-  auto arrayTy = dyn_cast<cudaq::cc::ArrayType>(type);
-  assert(arrayTy);
-  assert(attr.has_value());
-
-  auto elementsAttr = dyn_cast<mlir::ElementsAttr>(attr.value());
-  assert(elementsAttr);
+  auto elementsAttr = cast<mlir::ElementsAttr>(attr.value());
   auto eleTy = elementsAttr.getElementType();
   auto values = elementsAttr.getValues<mlir::Attribute>();
 
@@ -119,13 +358,10 @@ readConstantArray(mlir::OpBuilder &builder, cudaq::cc::GlobalOp &global) {
 LogicalResult transform(ModuleOp module, func::FuncOp funcOp) {
   auto builder = OpBuilder::atBlockBegin(&funcOp.getBody().front());
   auto toErase = std::vector<mlir::Operation *>();
-  auto hasInitState = false;
-  auto replacedInitState = false;
+  auto result = success();
 
   funcOp->walk([&](Operation *op) {
     if (auto initOp = dyn_cast<quake::InitializeStateOp>(op)) {
-      toErase.push_back(initOp);
-      hasInitState = true;
       auto loc = op->getLoc();
       builder.setInsertionPointAfter(initOp);
       // Find the qvector alloc.
@@ -134,10 +370,10 @@ LogicalResult transform(ModuleOp module, func::FuncOp funcOp) {
 
         // Find vector data.
         auto data = initOp.getOperand(1);
-        if (auto cast = dyn_cast<cudaq::cc::CastOp>(data.getDefiningOp())) {
+        auto cast = dyn_cast<cudaq::cc::CastOp>(data.getDefiningOp());
+        if (cast)
           data = cast.getOperand();
-          toErase.push_back(cast);
-        }
+
         if (auto addr =
                 dyn_cast<cudaq::cc::AddressOfOp>(data.getDefiningOp())) {
 
@@ -145,7 +381,7 @@ LogicalResult transform(ModuleOp module, func::FuncOp funcOp) {
           auto symbol = module.lookupSymbol(globalName);
           if (auto global = dyn_cast<cudaq::cc::GlobalOp>(symbol)) {
             // Read state initialization data from the global array.
-            auto vec = readConstantArray(builder, global);
+            auto vec = readGlobalConstantArray(builder, global);
 
             // Prepare state from vector data.
             auto gateBuilder = StateGateBuilder(builder, loc, qubits);
@@ -153,31 +389,26 @@ LogicalResult transform(ModuleOp module, func::FuncOp funcOp) {
             decomposer.decompose();
 
             initOp.replaceAllUsesWith(qubits);
+            toErase.push_back(initOp);
+            if (cast)
+              toErase.push_back(cast);
             toErase.push_back(addr);
             toErase.push_back(global);
-            replacedInitState = true;
+            return;
           }
         }
       }
+      funcOp.emitOpError(
+          "StatePreparation failed to find to replace quake.state_init");
+      result = failure();
     }
   });
 
-  if (hasInitState && !replacedInitState) {
-    funcOp.emitOpError("StatePreparation failed to replace quake.init_state");
-    return failure();
-  }
-
   for (auto &op : toErase) {
-    if (op->getUses().empty()) {
-      op->erase();
-    } else {
-      op->emitOpError("StatePreparation failed to remove quake.init_state "
-                      "or its dependencies.");
-      return failure();
-    }
+    op->erase();
   }
 
-  return success();
+  return result;
 }
 
 class StatePreparationPass
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index 25b80d9043..825ee78c2a 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -539,18 +539,18 @@ def __copyVectorAndCastElements(self, source, targetEleType):
         if (sourceEleType == targetEleType):
             return sourcePtr
 
-        sourceArrTy = cc.ArrayType.get(self.ctx, sourceEleType)
+        sourceArrType = cc.ArrayType.get(self.ctx, sourceEleType)
         sourceElePtrTy = cc.PointerType.get(self.ctx, sourceEleType)
-        sourceArrPtrTy = cc.PointerType.get(self.ctx, sourceArrTy)
+        sourceArrElePtrTy = cc.PointerType.get(self.ctx, sourceArrType)
         sourceValue = self.ifPointerThenLoad(sourcePtr)
-        sourceDataPtr = cc.StdvecDataOp(sourceArrPtrTy, sourceValue).result
+        sourceDataPtr = cc.StdvecDataOp(sourceArrElePtrTy, sourceValue).result
         sourceSize = cc.StdvecSizeOp(self.getIntegerType(), sourceValue).result
 
-        targetElePtrTy = cc.PointerType.get(self.ctx, targetEleType)
+        targetElePtrType = cc.PointerType.get(self.ctx, targetEleType)
         targetTy = cc.ArrayType.get(self.ctx, targetEleType)
-        targetArrPtrTy = cc.PointerType.get(self.ctx, targetTy)
+        targetArrElePtrTy = cc.PointerType.get(self.ctx, targetTy)
         targetVecTy = cc.StdvecType.get(self.ctx, targetEleType)
-        targetPtr = cc.AllocaOp(targetArrPtrTy,
+        targetPtr = cc.AllocaOp(targetArrElePtrTy,
                                 TypeAttr.get(targetEleType),
                                 seqSize=sourceSize).result
 
@@ -561,7 +561,7 @@ def bodyBuilder(iterVar):
                                       rawIndex).result
             loadedEle = cc.LoadOp(eleAddr).result
             castedEle = self.promoteOperandType(targetEleType, loadedEle)
-            targetEleAddr = cc.ComputePtrOp(targetElePtrTy, targetPtr,
+            targetEleAddr = cc.ComputePtrOp(targetElePtrType, targetPtr,
                                             [iterVar], rawIndex).result
             cc.StoreOp(castedEle, targetEleAddr)
 
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index f5cc0bec07..8496199d15 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -8,6 +8,7 @@
 
 #include "JITExecutionCache.h"
 #include "common/ArgumentWrapper.h"
+#include "common/Environment.h"
 #include "cudaq/Optimizer/Builder/Factory.h"
 #include "cudaq/Optimizer/CAPI/Dialects.h"
 #include "cudaq/Optimizer/CodeGen/OpenQASMEmitter.h"
@@ -493,18 +494,6 @@ py::object pyAltLaunchKernelR(const std::string &name, MlirModule module,
   return returnValue;
 }
 
-/// @brief Helper function to get boolean environment variable
-static bool getEnvBool(const char *envName, bool defaultVal = false) {
-  if (auto envVal = std::getenv(envName)) {
-    std::string tmp(envVal);
-    std::transform(tmp.begin(), tmp.end(), tmp.begin(),
-                   [](unsigned char c) { return std::tolower(c); });
-    if (tmp == "1" || tmp == "on" || tmp == "true" || tmp == "yes")
-      return true;
-  }
-  return defaultVal;
-}
-
 MlirModule synthesizeKernel(const std::string &name, MlirModule module,
                             cudaq::OpaqueArguments &runtimeArgs) {
   ScopedTraceWithContext(cudaq::TIMING_JIT, "synthesizeKernel", name);
@@ -531,7 +520,6 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   auto &platform = cudaq::get_platform();
   if (!platform.is_simulator() || platform.is_emulated()) {
     pm.addPass(cudaq::opt::createConstPropComplex());
-    pm.addPass(createCSEPass());
     pm.addPass(cudaq::opt::createLiftArrayAlloc());
     pm.addPass(cudaq::opt::createStatePreparation());
   }
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 30445dae45..00e91ffc33 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include "common/Environment.h"
 #include "common/ExecutionContext.h"
 #include "common/Executor.h"
 #include "common/FmtCore.h"
@@ -125,18 +126,6 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     delete jit;
   }
 
-  /// @brief Helper function to get boolean environment variable
-  bool getEnvBool(const char *envName, bool defaultVal = false) {
-    if (auto envVal = std::getenv(envName)) {
-      std::string tmp(envVal);
-      std::transform(tmp.begin(), tmp.end(), tmp.begin(),
-                     [](unsigned char c) { return std::tolower(c); });
-      if (tmp == "1" || tmp == "on" || tmp == "true" || tmp == "yes")
-        return true;
-    }
-    return defaultVal;
-  }
-
   virtual std::tuple<mlir::ModuleOp, mlir::MLIRContext *, void *>
   extractQuakeCodeAndContext(const std::string &kernelName, void *data) = 0;
   virtual void cleanupContext(mlir::MLIRContext *context) { return; }
diff --git a/runtime/common/CMakeLists.txt b/runtime/common/CMakeLists.txt
index 01b4b0a235..220c60efe1 100644
--- a/runtime/common/CMakeLists.txt
+++ b/runtime/common/CMakeLists.txt
@@ -17,6 +17,7 @@ set(COMMON_RUNTIME_SRC
   Resources.cpp
   Trace.cpp
   Future.cpp
+  Environment.cpp
   Executor.cpp
 )
 
@@ -83,7 +84,7 @@ endif()
 get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
 
-add_library(cudaq-mlir-runtime SHARED RuntimeMLIR.cpp JIT.cpp Logger.cpp)
+add_library(cudaq-mlir-runtime SHARED RuntimeMLIR.cpp Environment.cpp JIT.cpp Logger.cpp)
 set_property(GLOBAL APPEND PROPERTY CUDAQ_RUNTIME_LIBS cudaq-mlir-runtime)
 set_source_files_properties(JIT.cpp PROPERTIES COMPILE_FLAGS -fno-rtti)
 
diff --git a/runtime/common/Environment.cpp b/runtime/common/Environment.cpp
new file mode 100644
index 0000000000..e22e4a066e
--- /dev/null
+++ b/runtime/common/Environment.cpp
@@ -0,0 +1,26 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "Environment.h"
+#include <algorithm>
+#include <string>
+
+namespace cudaq {
+
+/// @brief Helper function to get boolean environment variable
+bool getEnvBool(const char *envName, bool defaultVal) {
+  if (auto envVal = std::getenv(envName)) {
+    std::string tmp(envVal);
+    std::transform(tmp.begin(), tmp.end(), tmp.begin(),
+                    [](unsigned char c) { return std::tolower(c); });
+    return (tmp == "1" || tmp == "on" || tmp == "true" || tmp == "y" || tmp == "yes");
+  }
+  return defaultVal;
+}
+
+} // namespace cudaq
diff --git a/runtime/common/Environment.h b/runtime/common/Environment.h
new file mode 100644
index 0000000000..9bbea871f9
--- /dev/null
+++ b/runtime/common/Environment.h
@@ -0,0 +1,18 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+namespace cudaq {
+
+/// @brief Helper function to get boolean environment variable.
+/// Returns `defaultVal` when `envName` is not set. When it is set, returns
+/// true only if its lowercased value is "1", "on", "true", "y", or "yes".
+bool getEnvBool(const char *envName, bool defaultVal = false);
+
+} // namespace cudaq
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index 1a1c7421aa..add36c5f31 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -6,7 +6,11 @@
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
 
-// RUN: nvq++ %cpp_std --enable-mlir                               %s -o %t && %t | FileCheck %s
+// Simulators
+// RUN: nvq++ %cpp_std --enable-mlir --target nvidia %s       -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --enable-mlir --target nvidia-fp64 %s  -o %t && %t | FileCheck %s
+
+// Quantum emulators
 // RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target ionq                     --emulate %s -o %t && %t | FileCheck %s
 // 2 different IQM machines for 2 different topologies
@@ -18,16 +22,26 @@
 #include <iostream>
 
 __qpu__ void test_complex_constant_array() {
-   cudaq::qvector v(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+  cudaq::qvector v(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+}
+
+#ifdef CUDAQ_SIMULATION_SCALAR_FP32
+__qpu__ void test_complex_constant_array_floating_point() {
+  cudaq::qvector v(std::vector<std::complex<float>>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
 }
+#else
+__qpu__ void test_complex_constant_array_floating_point() {
+  cudaq::qvector v(std::vector<std::complex<double>>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+}
+#endif
 
 __qpu__ void test_complex_constant_array2() {
-   cudaq::qvector v1(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
-   cudaq::qvector v2(std::vector<cudaq::complex>({ 0., 0., M_SQRT1_2, M_SQRT1_2}));
+  cudaq::qvector v1(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+  cudaq::qvector v2(std::vector<cudaq::complex>({ 0., 0., M_SQRT1_2, M_SQRT1_2}));
 }
 
 __qpu__ void test_complex_constant_array3() {
-   cudaq::qvector v({
+  cudaq::qvector v({
     cudaq::complex(M_SQRT1_2),
     cudaq::complex(M_SQRT1_2),
     cudaq::complex(0.0),
@@ -39,14 +53,44 @@ __qpu__ void test_complex_array_param(std::vector<cudaq::complex> inState) {
   cudaq::qvector q1 = inState;
 }
 
+#ifdef CUDAQ_SIMULATION_SCALAR_FP32
+__qpu__ void test_complex_array_param_floating_point(std::vector<std::complex<float>> inState) {
+  cudaq::qvector q1 = inState;
+}
+#else
+__qpu__ void test_complex_array_param_floating_point(std::vector<std::complex<double>> inState) {
+  cudaq::qvector q1 = inState;
+}
+#endif
+
 __qpu__ void test_real_constant_array() {
   cudaq::qvector v({ M_SQRT1_2, M_SQRT1_2, 0., 0.});
 }
 
+#ifdef CUDAQ_SIMULATION_SCALAR_FP32
+__qpu__ void test_real_constant_array_floating_point() {
+  cudaq::qvector v(std::vector<float>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+}
+#else
+__qpu__ void test_real_constant_array_floating_point() {
+  cudaq::qvector v(std::vector<double>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+}
+#endif
+
 __qpu__ void test_real_array_param(std::vector<cudaq::real> inState) {
   cudaq::qvector q1 = inState;
 }
 
+#ifdef CUDAQ_SIMULATION_SCALAR_FP32
+__qpu__ void test_real_array_param_floating_point(std::vector<float> inState) {
+  cudaq::qvector q1 = inState;
+}
+#else
+__qpu__ void test_real_array_param_floating_point(std::vector<double> inState) {
+  cudaq::qvector q1 = inState;
+}
+#endif
+
 void printCounts(cudaq::sample_result& result) {
   std::vector<std::string> values{};
   for (auto &&[bits, counts] : result) {
@@ -60,105 +104,155 @@ void printCounts(cudaq::sample_result& result) {
 }
 
 int main() {
-    {
-      auto counts = cudaq::sample(test_complex_constant_array);
-      printCounts(counts);
-    }
-
-    {
-      auto counts = cudaq::sample(test_complex_constant_array2);
-      printCounts(counts);
-    }
-
-    {
-      auto counts = cudaq::sample(test_complex_constant_array3);
-      printCounts(counts);
-    }
-
-    {
-      auto counts = cudaq::sample(test_real_constant_array);
-      printCounts(counts);
-    }
+  {
+    auto counts = cudaq::sample(test_complex_constant_array);
+    printCounts(counts);
+  }
 
-    {
-      std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-      std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
-      {
-          // Passing state data as argument (kernel mode)
-          auto counts = cudaq::sample(test_complex_array_param, vec);
-          printCounts(counts);
-
-          counts = cudaq::sample(test_complex_array_param, vec1);
-          printCounts(counts);
-      }
-
-      {
-          // Passing state data as argument (builder mode)
-          auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
-          auto qubits = kernel.qalloc(v);
-
-          auto counts = cudaq::sample(kernel, vec);
-          printCounts(counts);
-
-          counts = cudaq::sample(kernel, vec1);
-          printCounts(counts);
-      }
-    }
+// CHECK: 00
+// CHECK: 10
 
-    {
-      std::vector<cudaq::real> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-      std::vector<cudaq::real> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
-      {
-          // Passing state data as argument (kernel mode)
-          auto counts = cudaq::sample(test_real_array_param, vec);
-          printCounts(counts);
-
-          counts = cudaq::sample(test_real_array_param, vec1);
-          printCounts(counts);
-      }
-
-      {
-          // Passing state data as argument (builder mode)
-          auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::real>>();
-          auto qubits = kernel.qalloc(v);
-
-          auto counts = cudaq::sample(kernel, vec);
-          printCounts(counts);
-
-          counts = cudaq::sample(kernel, vec1);
-          printCounts(counts);
-      }
-    }
-}
+  {
+    auto counts = cudaq::sample(test_complex_constant_array_floating_point);
+    printCounts(counts);
+  }
 
 // CHECK: 00
 // CHECK: 10
 
+  {
+    auto counts = cudaq::sample(test_complex_constant_array2);
+    printCounts(counts);
+  }
+
 // CHECK: 0001
 // CHECK: 0011
 // CHECK: 1001
 // CHECK: 1011
 
+  {
+    auto counts = cudaq::sample(test_complex_constant_array3);
+    printCounts(counts);
+  }
+
 // CHECK: 00
 // CHECK: 10
 
+  {
+    auto counts = cudaq::sample(test_real_constant_array);
+    printCounts(counts);
+  }
+
 // CHECK: 00
 // CHECK: 10
 
+  {
+    auto counts = cudaq::sample(test_real_constant_array_floating_point);
+    printCounts(counts);
+  }
+
 // CHECK: 00
 // CHECK: 10
 
+  {
+    std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+    {
+        // Passing state data as argument (kernel mode)
+        auto counts = cudaq::sample(test_complex_array_param, vec);
+        printCounts(counts);
+
+// CHECK: 00
+// CHECK: 10
+
+        counts = cudaq::sample(test_complex_array_param, vec1);
+        printCounts(counts);
+
+// CHECK: 01
+// CHECK: 11
+    }
+    {
+        // Passing state data as argument (kernel mode)
+        auto counts = cudaq::sample(test_complex_array_param_floating_point, vec);
+        printCounts(counts);
+
+// CHECK: 00
+// CHECK: 10
+
+        counts = cudaq::sample(test_complex_array_param_floating_point, vec1);
+        printCounts(counts);
+
+// CHECK: 01
+// CHECK: 11
+    }
+
+    {
+        // Passing state data as argument (builder mode)
+        auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
+        auto qubits = kernel.qalloc(v);
+
+        auto counts = cudaq::sample(kernel, vec);
+        printCounts(counts);
+
+// CHECK: 00
+// CHECK: 10
+
+        counts = cudaq::sample(kernel, vec1);
+        printCounts(counts);
+
 // CHECK: 01
 // CHECK: 11
+    }
+  }
+
+  {
+    std::vector<cudaq::real> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+    std::vector<cudaq::real> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+    {
+        // Passing state data as argument (kernel mode)
+        auto counts = cudaq::sample(test_real_array_param, vec);
+        printCounts(counts);
 
 // CHECK: 00
 // CHECK: 10
 
+        counts = cudaq::sample(test_real_array_param, vec1);
+        printCounts(counts);
+
 // CHECK: 01
 // CHECK: 11
+    }
+    {
+        // Passing state data as argument (kernel mode)
+        auto counts = cudaq::sample(test_real_array_param_floating_point, vec);
+        printCounts(counts);
 
 // CHECK: 00
 // CHECK: 10
 
+        counts = cudaq::sample(test_real_array_param_floating_point, vec1);
+        printCounts(counts);
+
 // CHECK: 01
 // CHECK: 11
+    }
+
+    {
+        // Passing state data as argument (builder mode)
+        auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::real>>();
+        auto qubits = kernel.qalloc(v);
+
+        auto counts = cudaq::sample(kernel, vec);
+        printCounts(counts);
+
+// CHECK: 00
+// CHECK: 10
+
+        counts = cudaq::sample(kernel, vec1);
+        printCounts(counts);
+
+// CHECK: 01
+// CHECK: 11
+    }
+  }
+}
diff --git a/test/Quake/const_prop_complex.qke b/test/Quake/const_prop_complex.qke
index 7b75d72ac9..884a21486b 100644
--- a/test/Quake/const_prop_complex.qke
+++ b/test/Quake/const_prop_complex.qke
@@ -8,6 +8,43 @@
 
 // RUN: cudaq-opt -const-prop-complex %s | FileCheck %s
 
+func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %cst_0 = arith.constant 0.70710678118654757 : f64
+  %0 = cc.cast %cst_0 : (f64) -> f32
+  %1 = complex.create %0, %cst : complex<f32>
+  %2 = complex.create %cst, %cst : complex<f32>
+  %3 = cc.alloca !cc.array<complex<f32> x 4>
+  %4 = cc.cast %3 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+  cc.store %1, %4 : !cc.ptr<complex<f32>>
+  %5 = cc.compute_ptr %3[1] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+  cc.store %1, %5 : !cc.ptr<complex<f32>>
+  %6 = cc.compute_ptr %3[2] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+  cc.store %2, %6 : !cc.ptr<complex<f32>>
+  %7 = cc.compute_ptr %3[3] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+  cc.store %2, %7 : !cc.ptr<complex<f32>>
+  %8 = quake.alloca !quake.veq<2>
+  %9 = quake.init_state %8, %4 : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
+  return
+}
+
+// CHECK-LABEL:   func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = complex.constant [0.707106769 : f32, 0.000000e+00 : f32] : complex<f32>
+// CHECK:           %[[VAL_1:.*]] = complex.constant [0.000000e+00 : f32, 0.000000e+00 : f32] : complex<f32>
+// CHECK:           %[[VAL_2:.*]] = cc.alloca !cc.array<complex<f32> x 4>
+// CHECK:           %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr<complex<f32>>
+// CHECK:           %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_4]] : !cc.ptr<complex<f32>>
+// CHECK:           %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][2] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+// CHECK:           cc.store %[[VAL_1]], %[[VAL_5]] : !cc.ptr<complex<f32>>
+// CHECK:           %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][3] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
+// CHECK:           cc.store %[[VAL_1]], %[[VAL_6]] : !cc.ptr<complex<f32>>
+// CHECK:           %[[VAL_7:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_8:.*]] = quake.init_state %[[VAL_7]], %[[VAL_3]] : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
+// CHECK:           return
+// CHECK:           }
+
 func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
     %cst = arith.constant 0.000000e+00 : f32
     %cst_0 = arith.constant 0.70710678118654757 : f64
diff --git a/test/Quake/state_prep.qke b/test/Quake/state_prep.qke
index 3ba6d077bb..4289571b33 100644
--- a/test/Quake/state_prep.qke
+++ b/test/Quake/state_prep.qke
@@ -6,7 +6,7 @@
 // the terms of the Apache License 2.0 which accompanies this distribution.   //
 // ========================================================================== //
 
-// RUN: cudaq-opt -state-prep %s | FileCheck %s
+// RUN: cudaq-opt -state-prep -canonicalize %s | FileCheck %s
 
 module {
   func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
@@ -18,23 +18,19 @@ module {
   cc.global constant @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv.rodata_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<4xcomplex<f32>>) : !cc.array<complex<f32> x 4>
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<2>
-// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
-// CHECK:           %[[VAL_2:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_1]]] : (!quake.veq<2>, i64) -> !quake.ref
-// CHECK:           %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f64
-// CHECK:           quake.ry (%[[VAL_3]]) %[[VAL_2]] : (f64, !quake.ref) -> ()
-// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i64
-// CHECK:           %[[VAL_5:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_4]]] : (!quake.veq<2>, i64) -> !quake.ref
-// CHECK:           %[[VAL_6:.*]] = arith.constant 0.78539816339744839 : f64
-// CHECK:           quake.ry (%[[VAL_6]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
-// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
-// CHECK:           %[[VAL_7:.*]] = arith.constant 0.78539816339744839 : f64
-// CHECK:           quake.ry (%[[VAL_7]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
-// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0.78539816339744839 : f64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK:           %[[VAL_2:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_3:.*]] = quake.extract_ref %[[VAL_2]][1] : (!quake.veq<2>) -> !quake.ref
+// CHECK:           quake.ry (%[[VAL_1]]) %[[VAL_3]] : (f64, !quake.ref) -> ()
+// CHECK:           %[[VAL_4:.*]] = quake.extract_ref %[[VAL_2]][0] : (!quake.veq<2>) -> !quake.ref
+// CHECK:           quake.ry (%[[VAL_0]]) %[[VAL_4]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_3]]] %[[VAL_4]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           quake.ry (%[[VAL_0]]) %[[VAL_4]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_3]]] %[[VAL_4]] : (!quake.ref, !quake.ref) -> ()
 // CHECK:           return
 // CHECK:         }
 
-
  func.func @__nvqpp__mlirgen__function_test_real_constant_array._Z24test_real_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
     %0 = cc.address_of @__nvqpp__mlirgen__function_test_real_constant_array._Z24test_real_constant_arrayv.rodata_0 : !cc.ptr<!cc.array<f64 x 4>>
     %1 = quake.alloca !quake.veq<2>
@@ -44,19 +40,16 @@ module {
   cc.global constant @__nvqpp__mlirgen__function_test_real_constant_array._Z24test_real_constant_arrayv.rodata_0 (dense<[0.70710678118654757, 0.70710678118654757, 0.000000e+00, 0.000000e+00]> : tensor<4xf64>) : !cc.array<f64 x 4>
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_real_constant_array._Z24test_real_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<2>
-// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
-// CHECK:           %[[VAL_2:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_1]]] : (!quake.veq<2>, i64) -> !quake.ref
-// CHECK:           %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f64
-// CHECK:           quake.ry (%[[VAL_3]]) %[[VAL_2]] : (f64, !quake.ref) -> ()
-// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i64
-// CHECK:           %[[VAL_5:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_4]]] : (!quake.veq<2>, i64) -> !quake.ref
-// CHECK:           %[[VAL_6:.*]] = arith.constant 0.78539816339744839 : f64
-// CHECK:           quake.ry (%[[VAL_6]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
-// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
-// CHECK:           %[[VAL_7:.*]] = arith.constant 0.78539816339744839 : f64
-// CHECK:           quake.ry (%[[VAL_7]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
-// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0.78539816339744839 : f64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK:           %[[VAL_3:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_4:.*]] = quake.extract_ref %[[VAL_3]][1] : (!quake.veq<2>) -> !quake.ref
+// CHECK:           quake.ry (%[[VAL_1]]) %[[VAL_4]] : (f64, !quake.ref) -> ()
+// CHECK:           %[[VAL_5:.*]] = quake.extract_ref %[[VAL_3]][0] : (!quake.veq<2>) -> !quake.ref
+// CHECK:           quake.ry (%[[VAL_0]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_4]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           quake.ry (%[[VAL_0]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_4]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
 // CHECK:           return
 // CHECK:         }
 
@@ -70,19 +63,16 @@ module {
   cc.global constant @__nvqpp_rodata_init_state.0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<4xcomplex<f32>>) : !cc.array<complex<f32> x 4>
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_complex_array_param._Z24test_complex_array_paramSt6vectorISt7complexIfESaIS1_EE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<2>
-// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
-// CHECK:           %[[VAL_2:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_1]]] : (!quake.veq<2>, i64) -> !quake.ref
-// CHECK:           %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f64
-// CHECK:           quake.ry (%[[VAL_3]]) %[[VAL_2]] : (f64, !quake.ref) -> ()
-// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i64
-// CHECK:           %[[VAL_5:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_4]]] : (!quake.veq<2>, i64) -> !quake.ref
-// CHECK:           %[[VAL_6:.*]] = arith.constant 0.78539816339744839 : f64
-// CHECK:           quake.ry (%[[VAL_6]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
-// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
-// CHECK:           %[[VAL_7:.*]] = arith.constant 0.78539816339744839 : f64
-// CHECK:           quake.ry (%[[VAL_7]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
-// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0.78539816339744839 : f64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK:           %[[VAL_2:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_3:.*]] = quake.extract_ref %[[VAL_2]][1] : (!quake.veq<2>) -> !quake.ref
+// CHECK:           quake.ry (%[[VAL_1]]) %[[VAL_3]] : (f64, !quake.ref) -> ()
+// CHECK:           %[[VAL_4:.*]] = quake.extract_ref %[[VAL_2]][0] : (!quake.veq<2>) -> !quake.ref
+// CHECK:           quake.ry (%[[VAL_0]]) %[[VAL_4]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_3]]] %[[VAL_4]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           quake.ry (%[[VAL_0]]) %[[VAL_4]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_3]]] %[[VAL_4]] : (!quake.ref, !quake.ref) -> ()
 // CHECK:           return
 // CHECK:         }
 
@@ -95,20 +85,17 @@ module {
   }
   cc.global constant @__nvqpp_rodata_init_state.1 (dense<[0.707106769, 0.707106769, 0.000000e+00, 0.000000e+00]> : tensor<4xf32>) : !cc.array<f32 x 4>
 
-// CHECK-LABEL:     func.func @__nvqpp__mlirgen__function_test_real_array_param._Z21test_real_array_paramSt6vectorIfSaIfEE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<2>
-// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
-// CHECK:           %[[VAL_2:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_1]]] : (!quake.veq<2>, i64) -> !quake.ref
-// CHECK:           %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f64
-// CHECK:           quake.ry (%[[VAL_3]]) %[[VAL_2]] : (f64, !quake.ref) -> ()
-// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i64
-// CHECK:           %[[VAL_5:.*]] = quake.extract_ref %[[VAL_0]][%[[VAL_4]]] : (!quake.veq<2>, i64) -> !quake.ref
-// CHECK:           %[[VAL_6:.*]] = arith.constant 0.78539816339744839 : f64
-// CHECK:           quake.ry (%[[VAL_6]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
-// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
-// CHECK:           %[[VAL_7:.*]] = arith.constant 0.78539816339744839 : f64
-// CHECK:           quake.ry (%[[VAL_7]]) %[[VAL_5]] : (f64, !quake.ref) -> ()
-// CHECK:           quake.x [%[[VAL_2]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> ()
+// CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_real_array_param._Z21test_real_array_paramSt6vectorIfSaIfEE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0.78539816339744839 : f64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK:           %[[VAL_2:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_3:.*]] = quake.extract_ref %[[VAL_2]][1] : (!quake.veq<2>) -> !quake.ref
+// CHECK:           quake.ry (%[[VAL_1]]) %[[VAL_3]] : (f64, !quake.ref) -> ()
+// CHECK:           %[[VAL_4:.*]] = quake.extract_ref %[[VAL_2]][0] : (!quake.veq<2>) -> !quake.ref
+// CHECK:           quake.ry (%[[VAL_0]]) %[[VAL_4]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_3]]] %[[VAL_4]] : (!quake.ref, !quake.ref) -> ()
+// CHECK:           quake.ry (%[[VAL_0]]) %[[VAL_4]] : (f64, !quake.ref) -> ()
+// CHECK:           quake.x [%[[VAL_3]]] %[[VAL_4]] : (!quake.ref, !quake.ref) -> ()
 // CHECK:           return
 // CHECK:         }
 }

From 516e50e583f076b4e4ec97065e95a2d7caa031b3 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 17 Jul 2024 15:43:57 -0700
Subject: [PATCH 33/50] Format

---
 runtime/common/Environment.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/runtime/common/Environment.cpp b/runtime/common/Environment.cpp
index e22e4a066e..130349e543 100644
--- a/runtime/common/Environment.cpp
+++ b/runtime/common/Environment.cpp
@@ -17,8 +17,9 @@ bool getEnvBool(const char *envName, bool defaultVal = false) {
   if (auto envVal = std::getenv(envName)) {
     std::string tmp(envVal);
     std::transform(tmp.begin(), tmp.end(), tmp.begin(),
-                    [](unsigned char c) { return std::tolower(c); });
-    return (tmp == "1" || tmp == "on" || tmp == "true" || tmp == "y" || tmp == "yes");
+                   [](unsigned char c) { return std::tolower(c); });
+    return (tmp == "1" || tmp == "on" || tmp == "true" || tmp == "y" ||
+            tmp == "yes");
   }
   return defaultVal;
 }

From 6ccfc63ac556bf4acfa08d761bfefcccac7f14b4 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 17 Jul 2024 17:27:10 -0700
Subject: [PATCH 34/50] Make lift alloc more tolerant, fixed failing test

---
 lib/Optimizer/Transforms/LiftArrayAlloc.cpp | 68 +++++++++++++--------
 runtime/common/RuntimeMLIRCommonImpl.h      |  1 -
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
index 9de57fad81..47c19b402d 100644
--- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
+++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
@@ -46,42 +46,48 @@ std::vector<A> readConstantValues(SmallVectorImpl<Attribute> &vec, Type eleTy) {
     } else if constexpr (std::is_same_v<A, float>) {
       auto v = cast<FloatAttr>(a);
       result.emplace_back(v.getValue().convertToFloat());
-    } else {
-      assert(false && "unexpected type in constant array");
     }
   }
   return result;
 }
 
-void genVectorOfConstantsFromAttributes(cudaq::IRBuilder irBuilder,
-                                        Location loc, ModuleOp module,
-                                        StringRef name,
-                                        SmallVector<Attribute> &values,
-                                        Type eleTy) {
+LogicalResult genVectorOfConstantsFromAttributes(cudaq::IRBuilder irBuilder,
+                                                 Location loc, ModuleOp module,
+                                                 StringRef name,
+                                                 SmallVector<Attribute> &values,
+                                                 Type eleTy) {
 
   if (auto cTy = dyn_cast<ComplexType>(eleTy)) {
     auto floatTy = cTy.getElementType();
     if (floatTy == irBuilder.getF64Type()) {
       auto vals = readConstantValues<std::complex<double>>(values, cTy);
-      irBuilder.genVectorOfConstants(loc, module, name, vals);
-      return;
+      if (vals.size() == values.size()) {
+        irBuilder.genVectorOfConstants(loc, module, name, vals);
+        return success();
+      }
     } else if (floatTy == irBuilder.getF32Type()) {
       auto vals = readConstantValues<std::complex<float>>(values, cTy);
-      irBuilder.genVectorOfConstants(loc, module, name, vals);
-      return;
+      if (vals.size() == values.size()) {
+        irBuilder.genVectorOfConstants(loc, module, name, vals);
+        return success();
+      }
     }
   } else if (auto floatTy = dyn_cast<FloatType>(eleTy)) {
     if (floatTy == irBuilder.getF64Type()) {
       auto vals = readConstantValues<double>(values, floatTy);
-      irBuilder.genVectorOfConstants(loc, module, name, vals);
-      return;
+      if (vals.size() == values.size()) {
+        irBuilder.genVectorOfConstants(loc, module, name, vals);
+        return success();
+      }
     } else if (floatTy == irBuilder.getF32Type()) {
       auto vals = readConstantValues<float>(values, floatTy);
-      irBuilder.genVectorOfConstants(loc, module, name, vals);
-      return;
+      if (vals.size() == values.size()) {
+        irBuilder.genVectorOfConstants(loc, module, name, vals);
+        return success();
+      }
     }
   }
-  assert(false && "unexpected element type in constant array");
+  return failure();
 }
 } // namespace
 
@@ -128,10 +134,14 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
       // Build a new name based on the kernel name.
       std::string name = funcName + ".rodata_" + std::to_string(counter++);
       cudaq::IRBuilder irBuilder(rewriter.getContext());
-      genVectorOfConstantsFromAttributes(irBuilder, loc, module, name, values,
-                                         eleTy);
-      conGlobal = rewriter.create<cudaq::cc::AddressOfOp>(loc, ptrTy, name);
-      conArr = rewriter.create<cudaq::cc::LoadOp>(loc, arrTy, conGlobal);
+      if (succeeded(genVectorOfConstantsFromAttributes(irBuilder, loc, module,
+                                                       name, values, eleTy))) {
+        conGlobal = rewriter.create<cudaq::cc::AddressOfOp>(loc, ptrTy, name);
+        conArr = rewriter.create<cudaq::cc::LoadOp>(loc, arrTy, conGlobal);
+      } else {
+        conArr =
+            rewriter.create<cudaq::cc::ConstantArrayOp>(loc, arrTy, valuesAttr);
+      }
     } else {
       conArr =
           rewriter.create<cudaq::cc::ConstantArrayOp>(loc, arrTy, valuesAttr);
@@ -169,6 +179,14 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
           toErase.push_back(useuser);
         isLive = true;
       }
+      if (auto ist = dyn_cast<quake::InitializeStateOp>(user)) {
+        rewriter.setInsertionPointAfter(user);
+        LLVM_DEBUG(llvm::dbgs() << "replaced init_state\n");
+        assert(conGlobal && "global must be defined");
+        rewriter.replaceOpWithNewOp<quake::InitializeStateOp>(
+            ist, ist.getType(), ist.getTargets(), conGlobal);
+        continue;
+      }
       if (!isLive)
         toErase.push_back(user);
     }
@@ -292,16 +310,16 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
                 << *op << '\n');
           continue;
         }
-        if (isa<quake::InitializeStateOp>(op)) {
-          toGlobalUses.push_back(op);
-          toGlobal = true;
-          continue;
-        }
         LLVM_DEBUG(llvm::dbgs() << "unexpected cast: " << *op << '\n');
         toGlobalUses.push_back(op);
         toGlobal = true;
         continue;
       }
+      if (isa<quake::InitializeStateOp>(op)) {
+        toGlobalUses.push_back(op);
+        toGlobal = true;
+        continue;
+      }
       LLVM_DEBUG(llvm::dbgs() << "unexpected use: " << *op << '\n');
       toGlobalUses.push_back(op);
       toGlobal = true;
diff --git a/runtime/common/RuntimeMLIRCommonImpl.h b/runtime/common/RuntimeMLIRCommonImpl.h
index 586bcba422..c396136ce8 100644
--- a/runtime/common/RuntimeMLIRCommonImpl.h
+++ b/runtime/common/RuntimeMLIRCommonImpl.h
@@ -369,7 +369,6 @@ qirProfileTranslationFunction(const char *qirProfile, mlir::Operation *op,
   mlir::PassManager pm(context);
   if (printIntermediateMLIR)
     pm.enableIRPrinting();
-
   std::string errMsg;
   llvm::raw_string_ostream errOs(errMsg);
   cudaq::opt::addPipelineConvertToQIR(pm, qirProfile);

From 2226653e06791c24dc48a8d25405f6d2279d7e0c Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 17 Jul 2024 17:42:33 -0700
Subject: [PATCH 35/50] Removed unneeded changes

---
 lib/Optimizer/Transforms/LiftArrayAlloc.cpp   |  4 ++--
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp | 10 ++--------
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
index 47c19b402d..1867170141 100644
--- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
+++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
@@ -229,8 +229,8 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
     if (std::distance(alloc->getUses().begin(), alloc->getUses().end()) < size)
       return false;
 
-    // Keep a scoreboard for every element in the array. Every element *must*
-    // be stored to with a constant exactly one time.
+    // Keep a scoreboard for every element in the array. Every element *must* be
+    // stored to with a constant exactly one time.
     scoreboard.resize(size);
     for (int i = 0; i < size; i++)
       scoreboard[i] = nullptr;
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index e309f86214..5eb99d24cd 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -127,11 +127,9 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
   auto eleTy = cast<ELETY>(strTy.getElementType());
   builder.setInsertionPointToStart(argument.getOwner());
   auto argLoc = argument.getLoc();
-
-  auto arrTy = cudaq::cc::ArrayType::get(ctx, eleTy, vec.size());
   auto conArray = builder.create<cudaq::cc::ConstantArrayOp>(
       argLoc, cudaq::cc::ArrayType::get(ctx, eleTy, vec.size()), arrayAttr);
-
+  auto arrTy = cudaq::cc::ArrayType::get(ctx, eleTy, vec.size());
   std::optional<Value> arrayInMemory;
   auto ptrEleTy = cudaq::cc::PointerType::get(eleTy);
   bool generateNewValue = false;
@@ -141,8 +139,6 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
     if (arrayInMemory)
       return *arrayInMemory;
     OpBuilder::InsertionGuard guard(builder);
-    auto argLoc = argument.getLoc();
-
     Value buffer;
     if (hasInitStateUse(argument)) {
       // Stick global at end of Module.
@@ -250,7 +246,7 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
       // Check if there were other uses of `vec.data()` and simply forward the
       // constant array as materialized in memory.
       if (replaceOtherUses) {
-        Value memArr = getArrayInMemory();
+        auto memArr = getArrayInMemory();
         stdvecDataOp.replaceAllUsesWith(memArr);
       }
       continue;
@@ -286,7 +282,6 @@ std::vector<std::int32_t> asI32(const std::vector<A> &v) {
 static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument, std::vector<bool> &vec) {
-
   auto arrayAttr = builder.getI32ArrayAttr(asI32(vec));
   return synthesizeVectorArgument<IntegerType>(builder, module, counter,
                                                argument, vec, arrayAttr,
@@ -359,7 +354,6 @@ static LogicalResult
 synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          BlockArgument argument,
                          std::vector<std::complex<float>> &vec) {
-
   std::vector<float> vec2;
   for (auto c : vec) {
     vec2.push_back(c.real());

From 96598f2eee88a176ccdfd8400a0cad7ee89045c0 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 17 Jul 2024 18:01:11 -0700
Subject: [PATCH 36/50] Addressed more CR comments

---
 lib/Optimizer/Transforms/ConstPropComplex.cpp    | 10 ++++------
 lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp |  4 ++--
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/lib/Optimizer/Transforms/ConstPropComplex.cpp b/lib/Optimizer/Transforms/ConstPropComplex.cpp
index d1ffc8d5af..9fe626c1e1 100644
--- a/lib/Optimizer/Transforms/ConstPropComplex.cpp
+++ b/lib/Optimizer/Transforms/ConstPropComplex.cpp
@@ -178,12 +178,10 @@ class ConstPropComplexPass
       DominanceInfo domInfo(func);
       std::string funcName = func.getName().str();
       RewritePatternSet patterns(ctx);
-      patterns.insert<ComplexCreatePattern>(ctx);
-      patterns.insert<FloatCastPattern>(ctx);
-      patterns.insert<FloatExtendPattern>(ctx);
-      patterns.insert<FloatTruncatePattern>(ctx);
-      patterns.insert<ComplexRePattern>(ctx);
-      patterns.insert<ComplexImPattern>(ctx);
+      patterns
+          .insert<ComplexCreatePattern, FloatCastPattern, FloatExtendPattern,
+                  FloatTruncatePattern, ComplexRePattern, ComplexImPattern>(
+              ctx);
 
       LLVM_DEBUG(llvm::dbgs()
                  << "Before lifting constant array: " << func << '\n');
diff --git a/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp b/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp
index 26dc40f9a9..1c3ec42a1c 100644
--- a/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp
+++ b/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp
@@ -106,8 +106,8 @@ class GenerateDeviceCodeLoader
           LLVM_DEBUG(llvm::dbgs() << "adding declaration: " << op);
           declarations.push_back(&op);
         }
-      } else if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op)) {
-        LLVM_DEBUG(llvm::dbgs() << "adding global: " << op);
+      } else if (auto ccGlobalOp = dyn_cast<cudaq::cc::GlobalOp>(op)) {
+        LLVM_DEBUG(llvm::dbgs() << "adding global constants: " << op);
         declarations.push_back(&op);
       }
     }

From 63adba0fafd26f610787760ca6a964a776a7e770 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 18 Jul 2024 08:46:11 -0700
Subject: [PATCH 37/50] Fix failing test

---
 targettests/execution/state_preparation_vector.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index add36c5f31..c51a723460 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -7,8 +7,7 @@
  ******************************************************************************/
 
 // Simulators
-// RUN: nvq++ %cpp_std --enable-mlir --target nvidia %s       -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --enable-mlir --target nvidia-fp64 %s  -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --enable-mlir  %s                             -o %t && %t | FileCheck %s
 
 // Quantum emulators
 // RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s

From d3d11373bb0cb9489911dc586bb37119991aa0e0 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 18 Jul 2024 09:22:25 -0700
Subject: [PATCH 38/50] Added a test for cast pattern in const_prop_complex

---
 lib/Optimizer/Transforms/ConstPropComplex.cpp |  6 ++-
 .../execution/state_preparation_vector.cpp    | 17 ++++++
 test/Quake/const_prop_complex.qke             | 53 +++++++------------
 3 files changed, 40 insertions(+), 36 deletions(-)

diff --git a/lib/Optimizer/Transforms/ConstPropComplex.cpp b/lib/Optimizer/Transforms/ConstPropComplex.cpp
index 9fe626c1e1..939634bf83 100644
--- a/lib/Optimizer/Transforms/ConstPropComplex.cpp
+++ b/lib/Optimizer/Transforms/ConstPropComplex.cpp
@@ -63,13 +63,15 @@ class FloatCastPattern : public OpRewritePattern<cudaq::cc::CastOp> {
     auto valCon = val.getDefiningOp<arith::ConstantFloatOp>();
     if (valCon) {
       auto fTy = dyn_cast<FloatType>(cast.getType());
-      if (fTy == rewriter.getF64Type()) {
+      auto opTy = dyn_cast<FloatType>(cast.getOperand().getType());
+      if (fTy == rewriter.getF64Type() && opTy == rewriter.getF32Type()) {
         auto v = valCon.value().convertToFloat();
         auto fTy = dyn_cast<FloatType>(cast.getType());
         rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(
             cast, APFloat{static_cast<double>(v)}, fTy);
         return success();
-      } else if (fTy == rewriter.getF32Type()) {
+      } else if (fTy == rewriter.getF32Type() &&
+                 opTy == rewriter.getF64Type()) {
         auto v = valCon.value().convertToDouble();
         auto fTy = dyn_cast<FloatType>(cast.getType());
         rewriter.replaceOpWithNewOp<arith::ConstantFloatOp>(
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index c51a723460..8bce594ee6 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -20,6 +20,15 @@
 #include <cudaq.h>
 #include <iostream>
 
+__qpu__ float test_const_prop_cast() {
+  return M_SQRT1_2; // double-precision macro returned as float (narrowed)
+}
+
+__qpu__ void test_const_prop_cast_caller() {
+  auto c = test_const_prop_cast(); // deduced as float
+  cudaq::qvector v(std::vector<cudaq::complex>({ c, c, 0., 0.})); // 4 amplitudes
+}
+
 __qpu__ void test_complex_constant_array() {
   cudaq::qvector v(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
 }
@@ -103,6 +112,14 @@ void printCounts(cudaq::sample_result& result) {
 }
 
 int main() {
+  {
+    auto counts = cudaq::sample(test_const_prop_cast_caller);
+    printCounts(counts);
+  }
+
+// CHECK: 00
+// CHECK: 10
+
   {
     auto counts = cudaq::sample(test_complex_constant_array);
     printCounts(counts);
diff --git a/test/Quake/const_prop_complex.qke b/test/Quake/const_prop_complex.qke
index 884a21486b..2840d2cdaa 100644
--- a/test/Quake/const_prop_complex.qke
+++ b/test/Quake/const_prop_complex.qke
@@ -8,42 +8,27 @@
 
 // RUN: cudaq-opt -const-prop-complex %s | FileCheck %s
 
-func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-  %cst = arith.constant 0.000000e+00 : f32
-  %cst_0 = arith.constant 0.70710678118654757 : f64
-  %0 = cc.cast %cst_0 : (f64) -> f32
-  %1 = complex.create %0, %cst : complex<f32>
-  %2 = complex.create %cst, %cst : complex<f32>
-  %3 = cc.alloca !cc.array<complex<f32> x 4>
-  %4 = cc.cast %3 : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-  cc.store %1, %4 : !cc.ptr<complex<f32>>
-  %5 = cc.compute_ptr %3[1] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-  cc.store %1, %5 : !cc.ptr<complex<f32>>
-  %6 = cc.compute_ptr %3[2] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-  cc.store %2, %6 : !cc.ptr<complex<f32>>
-  %7 = cc.compute_ptr %3[3] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-  cc.store %2, %7 : !cc.ptr<complex<f32>>
-  %8 = quake.alloca !quake.veq<2>
-  %9 = quake.init_state %8, %4 : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
-  return
+func.func @__nvqpp__mlirgen__function_test_const_prop_cast_double() -> f32 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  %cst = arith.constant 0.70710678118654757 : f64
+  %0 = cc.cast %cst : (f64) -> f32 // narrowing cast of a constant, meant to be folded by the pass
+  return %0 : f32
 }
 
-// CHECK-LABEL:   func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-// CHECK:           %[[VAL_0:.*]] = complex.constant [0.707106769 : f32, 0.000000e+00 : f32] : complex<f32>
-// CHECK:           %[[VAL_1:.*]] = complex.constant [0.000000e+00 : f32, 0.000000e+00 : f32] : complex<f32>
-// CHECK:           %[[VAL_2:.*]] = cc.alloca !cc.array<complex<f32> x 4>
-// CHECK:           %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-// CHECK:           cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr<complex<f32>>
-// CHECK:           %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-// CHECK:           cc.store %[[VAL_0]], %[[VAL_4]] : !cc.ptr<complex<f32>>
-// CHECK:           %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][2] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-// CHECK:           cc.store %[[VAL_1]], %[[VAL_5]] : !cc.ptr<complex<f32>>
-// CHECK:           %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][3] : (!cc.ptr<!cc.array<complex<f32> x 4>>) -> !cc.ptr<complex<f32>>
-// CHECK:           cc.store %[[VAL_1]], %[[VAL_6]] : !cc.ptr<complex<f32>>
-// CHECK:           %[[VAL_7:.*]] = quake.alloca !quake.veq<2>
-// CHECK:           %[[VAL_8:.*]] = quake.init_state %[[VAL_7]], %[[VAL_3]] : (!quake.veq<2>, !cc.ptr<complex<f32>>) -> !quake.veq<2>
-// CHECK:           return
-// CHECK:           }
+// CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_const_prop_cast_double() -> f32 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0.707106769 : f32
+// CHECK:           return %[[VAL_0]] : f32
+// CHECK:         }
+
+func.func @__nvqpp__mlirgen__function_test_const_prop_cast_float() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  %cst = arith.constant 0.7071067 : f32
+  %0 = cc.cast %cst : (f32) -> f64 // widening cast of a constant, meant to be folded by the pass
+  return %0 : f64
+}
+
+// CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_const_prop_cast_float() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0.70710670948028564 : f64
+// CHECK:           return %[[VAL_0]] : f64
+// CHECK:         }
 
 func.func @__nvqpp__mlirgen__function_test_complex_constant_array._Z27test_complex_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
     %cst = arith.constant 0.000000e+00 : f32

From b3e8dcb1b7ccd75676817be0a224875799c2a611 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 18 Jul 2024 09:53:28 -0700
Subject: [PATCH 39/50] Fixed incorrect validation of InitStateOp

---
 lib/Optimizer/Dialect/Quake/QuakeOps.cpp      |  12 +-
 .../state_preparation_vector_sizes.cpp        | 250 ++++++++++++++++++
 2 files changed, 257 insertions(+), 5 deletions(-)
 create mode 100644 targettests/execution/state_preparation_vector_sizes.cpp

diff --git a/lib/Optimizer/Dialect/Quake/QuakeOps.cpp b/lib/Optimizer/Dialect/Quake/QuakeOps.cpp
index 4aef581938..093e209bc3 100644
--- a/lib/Optimizer/Dialect/Quake/QuakeOps.cpp
+++ b/lib/Optimizer/Dialect/Quake/QuakeOps.cpp
@@ -509,14 +509,16 @@ LogicalResult quake::ExtractRefOp::verify() {
 //===----------------------------------------------------------------------===//
 
 LogicalResult quake::InitializeStateOp::verify() {
-  auto veqTy = cast<quake::VeqType>(getTargets().getType());
-  if (veqTy.hasSpecifiedSize())
-    if (!std::has_single_bit(veqTy.getSize()))
-      return emitOpError("initialize state vector must be power of 2, but is " +
-                         std::to_string(veqTy.getSize()) + " instead.");
   auto ptrTy = cast<cudaq::cc::PointerType>(getState().getType());
   Type ty = ptrTy.getElementType();
   if (auto arrTy = dyn_cast<cudaq::cc::ArrayType>(ty)) {
+    if (!arrTy.isUnknownSize()) {
+      std::size_t size = arrTy.getSize();
+      if (!std::has_single_bit(size))
+        return emitOpError(
+            "initialize state vector must be power of 2, but is " +
+            std::to_string(size) + " instead.");
+    }
     if (!isa<FloatType, ComplexType>(arrTy.getElementType()))
       return emitOpError("invalid data pointer type");
   } else if (!isa<FloatType, ComplexType, cudaq::cc::StateType>(ty)) {
diff --git a/targettests/execution/state_preparation_vector_sizes.cpp b/targettests/execution/state_preparation_vector_sizes.cpp
new file mode 100644
index 0000000000..3c4d2a2ea7
--- /dev/null
+++ b/targettests/execution/state_preparation_vector_sizes.cpp
@@ -0,0 +1,250 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// Simulators
+// RUN: nvq++ %cpp_std --enable-mlir  %s                             -o %t && %t | FileCheck %s
+
+// Quantum emulators
+// RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target ionq                     --emulate %s -o %t && %t | FileCheck %s
+// 2 different IQM machines for 2 different topologies
+// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
+
+#include <algorithm>
+#include <cudaq.h>
+#include <iostream>
+#include <string>
+#include <vector>
+
+__qpu__ void test(std::vector<cudaq::complex> inState) {
+  cudaq::qvector q1 = inState; // qubits initialized from the given amplitudes
+}
+
+// Prints every sampled bitstring on its own line. The strings are sorted
+// first and reversed afterwards, so the order of the un-reversed strings
+// decides the order of the printed lines.
+void printCounts(cudaq::sample_result &result) {
+  std::vector<std::string> keys;
+  for (auto &&[bits, counts] : result)
+    keys.push_back(bits);
+  std::sort(keys.begin(), keys.end());
+
+  // Emit each string reversed so that the output reads as a binary number.
+  for (const auto &key : keys)
+    std::cout << std::string(key.rbegin(), key.rend()) << '\n';
+}
+
+void printState(const std::vector<cudaq::complex> &amplitudes) {
+  std::cout << "state: (size: " << amplitudes.size() << ") {";
+  for (const auto &amplitude : amplitudes)
+    std::cout << ' ' << amplitude;
+  std::cout << " }\n"; // yields "state: (size: N) { a0 a1 ... }"
+}
+
+int main() {
+  constexpr unsigned kMaxQubits = 5; // state sizes 2, 4, ..., 32
+  for (unsigned numQubits = 1; numQubits <= kMaxQubits; ++numQubits) {
+    const unsigned long long dimension = 1ULL << numQubits;
+    for (unsigned long long basis = 0; basis < dimension; ++basis) {
+      std::vector<cudaq::complex> amplitudes(dimension, 0.);
+      amplitudes[basis] = 1.;
+      printState(amplitudes);
+      auto counts = cudaq::sample(test, amplitudes);
+      printCounts(counts);
+      std::cout << '\n';
+    }
+  }
+}
+
+// CHECK:           state: (size: 2) { (1,0) (0,0) }
+// CHECK:           0
+
+// CHECK:           state: (size: 2) { (0,0) (1,0) }
+// CHECK:           1
+
+// CHECK:           state: (size: 4) { (1,0) (0,0) (0,0) (0,0) }
+// CHECK:           00
+
+// CHECK:           state: (size: 4) { (0,0) (1,0) (0,0) (0,0) }
+// CHECK:           01
+
+// CHECK:           state: (size: 4) { (0,0) (0,0) (1,0) (0,0) }
+// CHECK:           10
+
+// CHECK:           state: (size: 4) { (0,0) (0,0) (0,0) (1,0) }
+// CHECK:           11
+
+// CHECK:           state: (size: 8) { (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           000
+
+// CHECK:           state: (size: 8) { (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           001
+
+// CHECK:           state: (size: 8) { (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           010
+
+// CHECK:           state: (size: 8) { (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           011
+
+// CHECK:           state: (size: 8) { (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) }
+// CHECK:           100
+
+// CHECK:           state: (size: 8) { (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) }
+// CHECK:           101
+
+// CHECK:           state: (size: 8) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) }
+// CHECK:           110
+
+// CHECK:           state: (size: 8) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) }
+// CHECK:           111
+
+// CHECK:           state: (size: 16) { (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           0000
+
+// CHECK:           state: (size: 16) { (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           0001
+
+// CHECK:           state: (size: 16) { (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           0010
+
+// CHECK:           state: (size: 16) { (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           0011
+
+// CHECK:           state: (size: 16) { (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           0100
+
+// CHECK:           state: (size: 16) { (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           0101
+
+// CHECK:           state: (size: 16) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           0110
+
+// CHECK:           state: (size: 16) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           0111
+
+// CHECK:           state: (size: 16) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           1000
+
+// CHECK:           state: (size: 16) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           1001
+
+// CHECK:           state: (size: 16) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           1010
+
+// CHECK:           state: (size: 16) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           1011
+
+// CHECK:           state: (size: 16) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) }
+// CHECK:           1100
+
+// CHECK:           state: (size: 16) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) }
+// CHECK:           1101
+
+// CHECK:           state: (size: 16) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) }
+// CHECK:           1110
+
+// CHECK:           state: (size: 16) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) }
+// CHECK:           1111
+
+// CHECK:           state: (size: 32) { (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           00000
+
+// CHECK:           state: (size: 32) { (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           00001
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           00010
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           00011
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           00100
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           00101
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           00110
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           00111
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           01000
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           01001
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           01010
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           01011
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           01100
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           01101
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           01110
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           01111
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           10000
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           10001
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           10010
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           10011
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           10100
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           10101
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           10110
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           10111
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           11000
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           11001
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           11010
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) (0,0) }
+// CHECK:           11011
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) (0,0) }
+// CHECK:           11100
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) (0,0) }
+// CHECK:           11101
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (0,0) }
+// CHECK:           11110
+
+// CHECK:           state: (size: 32) { (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) }
+// CHECK:           11111
\ No newline at end of file

From a927660fb6d417c542f68c1f503c0d539aaf475a Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 18 Jul 2024 10:00:15 -0700
Subject: [PATCH 40/50] Addressed more comments

---
 lib/Optimizer/Transforms/LiftArrayAlloc.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
index 1867170141..a5896e201c 100644
--- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
+++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
@@ -197,9 +197,9 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
       toErase.push_back(alloc);
     }
 
-    for (auto *op : toErase) {
+    for (auto *op : toErase)
       rewriter.eraseOp(op);
-    }
+
     return success();
   }
 

From 22f2e5c3ad32674782e39e2bb049200213f4eb70 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 18 Jul 2024 10:51:03 -0700
Subject: [PATCH 41/50] Update lib/Optimizer/Transforms/StatePreparation.cpp

Co-authored-by: Bruno Schmitt <7152025+boschmitt@users.noreply.github.com>
---
 lib/Optimizer/Transforms/StatePreparation.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
index f7a104b2ae..53c6a972a0 100644
--- a/lib/Optimizer/Transforms/StatePreparation.cpp
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -52,7 +52,7 @@ std::vector<std::size_t> getControlIndices(std::size_t numBits) {
     // the position where the lth and (l + 1)th bit strings g[l] and g[l+1] of
     // the binary reflected Gray code differ.
     auto position = std::log2(code[i] ^ code[(i + 1) % code.size()]);
-    // N.B: In CUDA Quantum we write the least significant bit (LSb) on the left
+    // N.B: The algorithm expects the least significant bit (LSb) on the left
     //
     //  lsb -v
     //       001
@@ -63,8 +63,8 @@ std::vector<std::size_t> getControlIndices(std::size_t numBits) {
     // numbers with the LSb on the left.
     //
     // Now, what we need to find out is the position of the 1 in the bitstring.
-    // If we take LSb as being position 0, then for the normal convention its
-    // position will be 0. Using CUDA Quantum convention it will be 2. Hence,
+    // If we take LSB as being position 0, then for the normal convention its
+    // position will be 0. Using the algorithm's convention it will be 2. Hence,
     // we need to convert the position we find using:
     //
     // numBits - position - 1

From 69afc99b3b68ad7aa21aa515404893c5e11f4565 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 18 Jul 2024 11:33:45 -0700
Subject: [PATCH 42/50] Added a threshold option to StatePrep pass, added tests

---
 include/cudaq/Optimizer/Transforms/Passes.td  |  5 +++++
 lib/Optimizer/Transforms/StatePreparation.cpp | 20 +++++++++--------
 python/tests/backends/test_IQM.py             | 22 +++++++++++++++++++
 python/tests/backends/test_IonQ.py            | 22 +++++++++++++++++++
 python/tests/backends/test_OQC.py             | 22 +++++++++++++++++++
 .../test_Quantinuum_LocalEmulation_builder.py | 11 ++++++++++
 .../test_Quantinuum_LocalEmulation_kernel.py  | 12 ++++++++++
 .../tests/backends/test_Quantinuum_builder.py | 11 ++++++++++
 .../tests/backends/test_Quantinuum_kernel.py  | 11 ++++++++++
 9 files changed, 127 insertions(+), 9 deletions(-)

diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 80e31c66fc..f226b7044a 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -618,6 +618,11 @@ def StatePreparation : Pass<"state-prep", "mlir::ModuleOp"> {
     }
     ```
   }];
+
+  let options = [
+    Option<"phaseThreshold", "threshold", "double",
+      /*default=*/"1e-10", "Equalize the state if larger than the threshold">,
+  ];
 }
 
 def PromoteRefToVeqAlloc : Pass<"promote-qubit-allocation"> {
diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp
index 53c6a972a0..f0df842885 100644
--- a/lib/Optimizer/Transforms/StatePreparation.cpp
+++ b/lib/Optimizer/Transforms/StatePreparation.cpp
@@ -200,8 +200,10 @@ class StateGateBuilder {
 
 class StateDecomposer {
 public:
-  StateDecomposer(StateGateBuilder &b, std::span<std::complex<double>> a)
-      : builder(b), amplitudes(a), numQubits(log2(a.size())) {}
+  StateDecomposer(StateGateBuilder &b, std::span<std::complex<double>> a,
+                  double t)
+      : builder(b), amplitudes(a), numQubits(log2(a.size())),
+        phaseThreshold(t) {}
 
   /// @brief Decompose the input state vector data to a set of controlled
   /// operations and rotations. This function takes as input a `OpBuilder`
@@ -217,8 +219,7 @@ class StateDecomposer {
     for (const auto &a : amplitudes) {
       phases.push_back(std::arg(a));
       magnitudes.push_back(std::abs(a));
-      // FIXME: remove magic number.
-      needsPhaseEqualization |= std::abs(phases.back()) > 1e-10;
+      needsPhaseEqualization |= std::abs(phases.back()) > phaseThreshold;
     }
 
     // N.B: The algorithm, as described in the paper, creates a circuit that
@@ -279,6 +280,7 @@ class StateDecomposer {
   StateGateBuilder &builder;
   std::span<std::complex<double>> amplitudes;
   std::size_t numQubits;
+  double phaseThreshold;
 };
 
 /// Replace a qubit initialization from vectors with quantum gates.
@@ -355,7 +357,8 @@ readGlobalConstantArray(mlir::OpBuilder &builder, cudaq::cc::GlobalOp &global) {
   return result;
 }
 
-LogicalResult transform(ModuleOp module, func::FuncOp funcOp) {
+LogicalResult transform(ModuleOp module, func::FuncOp funcOp,
+                        double phaseThreshold) {
   auto builder = OpBuilder::atBlockBegin(&funcOp.getBody().front());
   auto toErase = std::vector<mlir::Operation *>();
   auto result = success();
@@ -385,7 +388,7 @@ LogicalResult transform(ModuleOp module, func::FuncOp funcOp) {
 
             // Prepare state from vector data.
             auto gateBuilder = StateGateBuilder(builder, loc, qubits);
-            auto decomposer = StateDecomposer(gateBuilder, vec);
+            auto decomposer = StateDecomposer(gateBuilder, vec, phaseThreshold);
             decomposer.decompose();
 
             initOp.replaceAllUsesWith(qubits);
@@ -398,8 +401,7 @@ LogicalResult transform(ModuleOp module, func::FuncOp funcOp) {
           }
         }
       }
-      funcOp.emitOpError(
-          "StatePreparation failed to find to replace quake.state_init");
+      funcOp.emitOpError("StatePreparation failed to replace quake.state_init");
       result = failure();
     }
   });
@@ -427,7 +429,7 @@ class StatePreparationPass
         continue;
       std::string kernelName = funcOp.getName().str();
 
-      auto result = transform(module, funcOp);
+      auto result = transform(module, funcOp, phaseThreshold);
       if (result.failed()) {
         funcOp.emitOpError("Failed to prepare state for '" + kernelName);
         signalPassFailure();
diff --git a/python/tests/backends/test_IQM.py b/python/tests/backends/test_IQM.py
index 38e2b55363..3408ef1602 100644
--- a/python/tests/backends/test_IQM.py
+++ b/python/tests/backends/test_IQM.py
@@ -174,6 +174,17 @@ def kernel(vec: List[complex]):
     assert assert_close(counts["01"], 0., 2)
     assert assert_close(counts["11"], 0., 2)
 
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0., 0., 0., 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert assert_close(counts["000"], shots / 2, 2)
+    assert assert_close(counts["100"], shots / 2, 2)
+    assert assert_close(counts["001"], 0., 2)
+    assert assert_close(counts["010"], 0., 2)
+    assert assert_close(counts["011"], 0., 2)
+    assert assert_close(counts["101"], 0., 2)
+    assert assert_close(counts["110"], 0., 2)
+    assert assert_close(counts["111"], 0., 2)
+
 
 def test_IQM_state_preparation_builder():
     shots = 10000
@@ -187,6 +198,17 @@ def test_IQM_state_preparation_builder():
     assert assert_close(counts["01"], 0., 2)
     assert assert_close(counts["11"], 0., 2)
 
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0., 0., 0., 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert assert_close(counts["000"], shots / 2, 2)
+    assert assert_close(counts["100"], shots / 2, 2)
+    assert assert_close(counts["001"], 0., 2)
+    assert assert_close(counts["010"], 0., 2)
+    assert assert_close(counts["011"], 0., 2)
+    assert assert_close(counts["101"], 0., 2)
+    assert assert_close(counts["110"], 0., 2)
+    assert assert_close(counts["111"], 0., 2)
+
 
 # leave for gdb debugging
 if __name__ == "__main__":
diff --git a/python/tests/backends/test_IonQ.py b/python/tests/backends/test_IonQ.py
index f468a1d9c8..dfba4c9f55 100644
--- a/python/tests/backends/test_IonQ.py
+++ b/python/tests/backends/test_IonQ.py
@@ -171,6 +171,17 @@ def kernel(vec: List[complex]):
     assert not '01' in counts
     assert not '11' in counts
 
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0., 0., 0., 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '000' in counts
+    assert '100' in counts
+    assert not '001' in counts
+    assert not '010' in counts
+    assert not '011' in counts
+    assert not '101' in counts
+    assert not '110' in counts
+    assert not '111' in counts
+
 
 def test_ionq_state_preparation_builder():
     kernel, state = cudaq.make_kernel(List[complex])
@@ -183,6 +194,17 @@ def test_ionq_state_preparation_builder():
     assert not '01' in counts
     assert not '11' in counts
 
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0., 0., 0., 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '000' in counts
+    assert '100' in counts
+    assert not '001' in counts
+    assert not '010' in counts
+    assert not '011' in counts
+    assert not '101' in counts
+    assert not '110' in counts
+    assert not '111' in counts
+
 
 # leave for gdb debugging
 if __name__ == "__main__":
diff --git a/python/tests/backends/test_OQC.py b/python/tests/backends/test_OQC.py
index 1ff86c535c..ee02efe4fc 100644
--- a/python/tests/backends/test_OQC.py
+++ b/python/tests/backends/test_OQC.py
@@ -172,6 +172,17 @@ def kernel(vec: List[complex]):
     assert not '01' in counts
     assert not '11' in counts
 
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0., 0., 0., 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '000' in counts
+    assert '100' in counts
+    assert not '001' in counts
+    assert not '010' in counts
+    assert not '011' in counts
+    assert not '101' in counts
+    assert not '110' in counts
+    assert not '111' in counts
+
 
 def test_OQC_state_preparation_builder():
     kernel, state = cudaq.make_kernel(List[complex])
@@ -184,6 +195,17 @@ def test_OQC_state_preparation_builder():
     assert not '01' in counts
     assert not '11' in counts
 
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0., 0., 0., 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '000' in counts
+    assert '100' in counts
+    assert not '001' in counts
+    assert not '010' in counts
+    assert not '011' in counts
+    assert not '101' in counts
+    assert not '110' in counts
+    assert not '111' in counts
+
 
 # leave for gdb debugging
 if __name__ == "__main__":
diff --git a/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py b/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py
index 58176b4e32..f03f1875bc 100644
--- a/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py
+++ b/python/tests/backends/test_Quantinuum_LocalEmulation_builder.py
@@ -124,6 +124,17 @@ def test_quantinuum_state_preparation():
     assert not '01' in counts
     assert not '11' in counts
 
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0., 0., 0., 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '000' in counts
+    assert '100' in counts
+    assert not '001' in counts
+    assert not '010' in counts
+    assert not '011' in counts
+    assert not '101' in counts
+    assert not '110' in counts
+    assert not '111' in counts
+
 
 # leave for gdb debugging
 if __name__ == "__main__":
diff --git a/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py b/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
index a32ad35f5f..0e21a5bf88 100644
--- a/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
+++ b/python/tests/backends/test_Quantinuum_LocalEmulation_kernel.py
@@ -153,6 +153,18 @@ def kernel(vec: List[complex]):
     assert not '01' in counts
     assert not '11' in counts
 
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0., 0., 0., 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '000' in counts
+    assert '100' in counts
+    assert not '001' in counts
+    assert not '010' in counts
+    assert not '011' in counts
+    assert not '101' in counts
+    assert not '110' in counts
+    assert not '111' in counts
+
+
 def test_arbitrary_unitary_synthesis():
     import numpy as np
     cudaq.register_operation("custom_h",
diff --git a/python/tests/backends/test_Quantinuum_builder.py b/python/tests/backends/test_Quantinuum_builder.py
index 48d50b7419..c0589552c9 100644
--- a/python/tests/backends/test_Quantinuum_builder.py
+++ b/python/tests/backends/test_Quantinuum_builder.py
@@ -158,6 +158,17 @@ def test_quantinuum_state_preparation():
     assert not '01' in counts
     assert not '11' in counts
 
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0., 0., 0., 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '000' in counts
+    assert '100' in counts
+    assert not '001' in counts
+    assert not '010' in counts
+    assert not '011' in counts
+    assert not '101' in counts
+    assert not '110' in counts
+    assert not '111' in counts
+
 
 # leave for gdb debugging
 if __name__ == "__main__":
diff --git a/python/tests/backends/test_Quantinuum_kernel.py b/python/tests/backends/test_Quantinuum_kernel.py
index 646f9cc787..b27c339419 100644
--- a/python/tests/backends/test_Quantinuum_kernel.py
+++ b/python/tests/backends/test_Quantinuum_kernel.py
@@ -184,6 +184,17 @@ def kernel(vec: List[complex]):
     assert not '01' in counts
     assert not '11' in counts
 
+    state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0., 0., 0., 0., 0.]
+    counts = cudaq.sample(kernel, state)
+    assert '000' in counts
+    assert '100' in counts
+    assert not '001' in counts
+    assert not '010' in counts
+    assert not '011' in counts
+    assert not '101' in counts
+    assert not '110' in counts
+    assert not '111' in counts
+
 
 # leave for gdb debugging
 if __name__ == "__main__":

From da3d3f97308d1a035b6f0c85ac291322bffa942a Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 18 Jul 2024 13:16:48 -0700
Subject: [PATCH 43/50] Update include/cudaq/Optimizer/Transforms/Passes.td

Co-authored-by: Bruno Schmitt <7152025+boschmitt@users.noreply.github.com>
---
 include/cudaq/Optimizer/Transforms/Passes.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index f226b7044a..1293cd2c5b 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -621,7 +621,7 @@ def StatePreparation : Pass<"state-prep", "mlir::ModuleOp"> {
 
   let options = [
     Option<"phaseThreshold", "threshold", "double",
-      /*default=*/"1e-10", "Equalize the state if larger than the threshold">,
+      /*default=*/"1e-10", "Threshold to trigger phase equalization">,
   ];
 }
 

From 801512c0c7a0dad19106ad7fbe786d4d7757702d Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 18 Jul 2024 15:07:03 -0700
Subject: [PATCH 44/50] Cleanup

---
 include/cudaq/Optimizer/Transforms/Passes.h   |   9 +-
 .../Optimizer/Transforms/SimulationData.h     |  25 ++--
 lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp    |   6 +-
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp | 137 +++++++++---------
 runtime/common/BaseRemoteRESTQPU.h            |   3 +-
 runtime/common/BaseRestRemoteClient.h         |  31 +---
 targettests/Remote-Sim/state_init.cpp         |  47 ++----
 7 files changed, 113 insertions(+), 145 deletions(-)

diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index 0b07b01cbb..ae9c6d2188 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -12,10 +12,10 @@
 // These transforms can generally be thought of as "optimizations" or "rewrites"
 // on the IR.
 
+#include "SimulationData.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassRegistry.h"
-#include "SimulationData.h"
 
 namespace cudaq::opt {
 
@@ -42,8 +42,11 @@ std::unique_ptr<mlir::Pass> createObserveAnsatzPass(std::vector<bool> &);
 std::unique_ptr<mlir::Pass> createQuakeAddMetadata();
 std::unique_ptr<mlir::Pass> createQuakeAddDeallocs();
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer();
-std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, const void *, SimulationStateData::getDataFunc*, std::size_t startingArgIdx = 0);
-std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view, const void *, bool sameAddressSpace = false);
+std::unique_ptr<mlir::Pass>
+createQuakeSynthesizer(std::string_view, const void *,
+                       SimulationStateData::getDataFunc *, std::size_t);
+std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view,
+                                                   const void *, bool);
 std::unique_ptr<mlir::Pass> createRaiseToAffinePass();
 std::unique_ptr<mlir::Pass> createUnwindLoweringPass();
 
diff --git a/include/cudaq/Optimizer/Transforms/SimulationData.h b/include/cudaq/Optimizer/Transforms/SimulationData.h
index 9504dc7c08..d0c8b3b5b4 100644
--- a/include/cudaq/Optimizer/Transforms/SimulationData.h
+++ b/include/cudaq/Optimizer/Transforms/SimulationData.h
@@ -20,19 +20,18 @@ namespace cudaq {
 class state;
 }
 
-
 /// Owns the data
 class SimulationStateData {
- public:
-  typedef SimulationStateData (getDataFunc)(cudaq::state*);
+public:
+  typedef SimulationStateData(getDataFunc)(cudaq::state *);
+
+  SimulationStateData(void *data, std::size_t size, std::size_t elementSize)
+      : data(data), size(size), elementSize(elementSize) {}
 
-  SimulationStateData(void *data, std::size_t size, std::size_t elementSize): 
-    data(data), size(size), elementSize(elementSize) {}
-  
-  // template <typename T> 
+  // template <typename T>
   // std::vector<T> toVector() {
-  //   assert(sizeof(T) == elementSize && "incorrect element size in simulation data");
-  //   std::vector<T> result;
+  //   assert(sizeof(T) == elementSize && "incorrect element size in simulation
+  //   data"); std::vector<T> result;
 
   //   std::cout << "SimulationStateData:" << std::endl;
   //   for (std::size_t i = 0; i < size; i++) {
@@ -44,13 +43,9 @@ class SimulationStateData {
   //   return result;
   // }
 
-  ~SimulationStateData() {
-    delete reinterpret_cast<int*>(data);
-  }
+  ~SimulationStateData() { delete reinterpret_cast<int *>(data); }
 
-  void* data;
+  void *data;
   std::size_t size;
   std::size_t elementSize;
 };
-
-
diff --git a/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp b/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
index 153f8d5b56..ed78fe7bd4 100644
--- a/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
+++ b/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
@@ -45,7 +45,7 @@ struct VerifyNVQIRCallOpsPass
           cudaq::opt::NVQIRPackSingleQubitInArray,
           cudaq::opt::NVQIRReleasePackedQubitArray,
           cudaq::getNumQubitsFromCudaqState,
-          };
+      };
       // It must be either NVQIR extension functions or in the allowed list.
       return std::find(NVQIR_FUNCS.begin(), NVQIR_FUNCS.end(), functionName) !=
                  NVQIR_FUNCS.end() ||
@@ -73,8 +73,8 @@ struct VerifyNVQIRCallOpsPass
         passFailed = true;
         return WalkResult::interrupt();
       } else if (!isa<LLVM::AddressOfOp, LLVM::AllocaOp, LLVM::BitcastOp,
-                      LLVM::ExtractValueOp, LLVM::GEPOp, LLVM::IntToPtrOp, LLVM::LoadOp,
-                      LLVM::StoreOp>(op)) {
+                      LLVM::ExtractValueOp, LLVM::GEPOp, LLVM::IntToPtrOp,
+                      LLVM::LoadOp, LLVM::StoreOp>(op)) {
         // No pointers allowed except for the above operations.
         for (auto oper : op->getOperands()) {
           if (isa<LLVM::LLVMPointerType>(oper.getType())) {
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index 839716a534..e1a708412a 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -118,33 +118,49 @@ static bool hasInitStateUse(BlockArgument argument) {
   return false;
 }
 
-template <typename T> 
-std::vector<T> stateDataToVector(SimulationStateData& stateData) {
-  assert(sizeof(T) == stateData.elementSize && "incorrect element size in simulation data");
+template <typename T>
+std::vector<T> stateDataToVector(SimulationStateData &stateData) {
+  assert(sizeof(T) == stateData.elementSize &&
+         "incorrect element size in simulation data");
   std::vector<T> result;
 
   for (std::size_t i = 0; i < stateData.size; i++) {
-    auto elePtr = reinterpret_cast<T*>(stateData.data) + i;
+    auto elePtr = reinterpret_cast<T *>(stateData.data) + i;
     result.push_back(*elePtr);
   }
 
   return result;
 }
 
-template <typename ELETY, typename T, typename MAKER>
-LogicalResult
-synthesizeStateArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
-                         BlockArgument argument, ELETY eleTy, std::vector<T> &vec,
-                         MAKER makeElementValue) {
-  auto *ctx = builder.getContext();
+template <typename T>
+Value createGlobalArray(OpBuilder &builder, ModuleOp module, unsigned &counter,
+                        BlockArgument argument, Type arrTy,
+                        std::vector<T> vec) {
+  OpBuilder::InsertionGuard guard(builder);
   auto argLoc = argument.getLoc();
 
-  //auto strTy = cudaq::cc::StdvecType::get(eleTy);
+  // Stick global at end of Module.
+  std::string symbol = "__nvqpp_rodata_init_state." + std::to_string(counter++);
+
+  cudaq::IRBuilder irBuilder(builder);
+  irBuilder.genVectorOfConstants(argLoc, module, symbol, vec);
+
+  builder.setInsertionPointToStart(argument.getOwner());
+  return builder.create<cudaq::cc::AddressOfOp>(
+      argLoc, cudaq::cc::PointerType::get(arrTy), symbol);
+}
+
+template <typename T>
+LogicalResult synthesizeStateArgument(OpBuilder &builder, ModuleOp module,
+                                      unsigned &counter, BlockArgument argument,
+                                      Type eleTy, std::vector<T> &vec) {
+  auto *ctx = builder.getContext();
+  auto argLoc = argument.getLoc();
   auto arrTy = cudaq::cc::ArrayType::get(ctx, eleTy, vec.size());
 
   builder.setInsertionPointToStart(argument.getOwner());
   auto toErase = std::vector<mlir::Operation *>();
-  
+
   // Iterate over the users of this state argument.
   for (auto *argUser : argument.getUsers()) {
     // Replace a calls to runtime function that reads the number of qubits
@@ -166,37 +182,33 @@ synthesizeStateArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
   }
 
   OpBuilder::InsertionGuard guard(builder);
-  auto [buffer, _] =
-      createArrayInMemory(builder, module, counter, argument, vec, arrTy);
+  auto buffer =
+      createGlobalArray(builder, module, counter, argument, arrTy, vec);
   auto ptrArrEleTy =
       cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy));
   Value memArr = builder.create<cudaq::cc::CastOp>(argLoc, ptrArrEleTy, buffer);
 
-  // builder.setInsertionPointAfter(memArr.getDefiningOp());
-  // Value size = builder.create<arith::ConstantIntOp>(argLoc, vec.size(), 64);
-  // Value newVec =
-  //     builder.create<cudaq::cc::StdvecInitOp>(argLoc, strTy, memArr, size);
   argument.replaceAllUsesWith(memArr);
-  
-  for (auto &op : toErase) {
+
+  for (auto &op : toErase)
     op->erase();
-  }
 
   return success();
 }
 
-static LogicalResult
-synthesizeStateArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
-                         BlockArgument argument, SimulationStateData& stateData) {
-  
+static LogicalResult synthesizeStateArgument(OpBuilder &builder,
+                                             ModuleOp module, unsigned &counter,
+                                             BlockArgument argument,
+                                             SimulationStateData &stateData) {
+
   if (stateData.elementSize == sizeof(std::complex<double>)) {
     auto vec = stateDataToVector<std::complex<double>>(stateData);
-    return synthesizeStateArgument<ComplexType>(builder, module, counter, argument,
-                                            ComplexType::get(builder.getF64Type()), vec, makeComplexElement<double>);
+    return synthesizeStateArgument(builder, module, counter, argument,
+                                   ComplexType::get(builder.getF64Type()), vec);
   } else if (stateData.elementSize == sizeof(std::complex<float>)) {
     auto vec = stateDataToVector<std::complex<float>>(stateData);
-    return synthesizeStateArgument<ComplexType>(builder, module, counter, argument,
-                                            ComplexType::get(builder.getF32Type()), vec, makeComplexElement<float>);
+    return synthesizeStateArgument(builder, module, counter, argument,
+                                   ComplexType::get(builder.getF32Type()), vec);
   }
   module.emitError("unexpected element size in simulation state data");
   return failure();
@@ -209,7 +221,7 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
                          ATTR arrayAttr, MAKER makeElementValue) {
   auto *ctx = builder.getContext();
   auto argTy = argument.getType();
-  
+
   assert(isa<cudaq::cc::StdvecType>(argTy));
   auto strTy = cast<cudaq::cc::StdvecType>(argTy);
   auto eleTy = cast<ELETY>(strTy.getElementType());
@@ -230,16 +242,8 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter,
     OpBuilder::InsertionGuard guard(builder);
     Value buffer;
     if (hasInitStateUse(argument)) {
-      // Stick global at end of Module.
-      std::string symbol =
-          "__nvqpp_rodata_init_state." + std::to_string(counter++);
-
-      cudaq::IRBuilder irBuilder(builder);
-      irBuilder.genVectorOfConstants(argLoc, module, symbol, vec);
-
-      builder.setInsertionPointToStart(argument.getOwner());
-      buffer = builder.create<cudaq::cc::AddressOfOp>(
-          argLoc, cudaq::cc::PointerType::get(arrTy), symbol);
+      buffer =
+          createGlobalArray(builder, module, counter, argument, arrTy, vec);
     } else {
       builder.setInsertionPointAfter(conArray);
       buffer = builder.create<cudaq::cc::AllocaOp>(argLoc, arrTy);
@@ -478,19 +482,13 @@ class QuakeSynthesizer
 
   // The raw pointer to the runtime arguments.
   const void *args;
-  
+
   // Function to read the state data, if any.
-  SimulationStateData::getDataFunc* getStateData = nullptr;
-  
+  SimulationStateData::getDataFunc *getStateData = nullptr;
+
   // Is the simulation running in the same address space as synthesis?
   bool sameAddressSpace = false;
 
-public:
-  QuakeSynthesizer() = default;
-  QuakeSynthesizer(std::string_view kernel, void *a, SimulationStateData::getDataFunc* getData, bool sameSpace)
-      : kernelName(kernel), args(a), getStateData(getData), sameAddressSpace(sameSpace) {}
-  const void *args;
-
   // The starting argument index to synthesize. Typically 0 but may be >0 for
   // partial synthesis. If >0, it is assumed that the first argument(s) are NOT
   // in `args`.
@@ -504,10 +502,10 @@ class QuakeSynthesizer
       : kernelName(kernel), args(a), sameAddressSpace(sameSpace) {}
 
   // Execution on a remote simulator
-  QuakeSynthesizer(std::string_view kernel, const void *a, SimulationStateData::getDataFunc* getData, std::size_t s)
+  QuakeSynthesizer(std::string_view kernel, const void *a,
+                   SimulationStateData::getDataFunc *getData, std::size_t s)
       : kernelName(kernel), args(a), getStateData(getData), startingArgIdx(s) {}
 
-
   mlir::ModuleOp getModule() { return getOperation(); }
 
   std::pair<std::size_t, std::vector<std::size_t>>
@@ -647,8 +645,8 @@ class QuakeSynthesizer
       if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(type)) {
         if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
           if (sameAddressSpace) {
-            // Special case of a `cudaq::state*` which must be in the same address
-            // space. This references a container to a set of simulation
+            // Special case of a `cudaq::state*` which must be in the same
+            // address space. This references a container to a set of simulation
             // amplitudes.
             synthesizeRuntimeArgument<cudaq::state *>(
                 builder, argument, args, offset, sizeof(void *),
@@ -656,21 +654,25 @@ class QuakeSynthesizer
                   Value rawPtr = builder.create<arith::ConstantIntOp>(
                       loc, reinterpret_cast<std::intptr_t>(*concrete),
                       sizeof(void *) * 8);
-                  auto stateTy = cudaq::cc::StateType::get(builder.getContext());
+                  auto stateTy =
+                      cudaq::cc::StateType::get(builder.getContext());
                   return builder.create<cudaq::cc::CastOp>(
                       loc, cudaq::cc::PointerType::get(stateTy), rawPtr);
                 });
           } else if (getStateData != nullptr) {
-            // Special case of running on a simulator in a different address space,
-            // when we know how to convert state to data.
-            cudaq::state* concrete;
-            std::memcpy(&concrete, ((char *)args) + offset, sizeof(cudaq::state*));
+            // Special case of running on a simulator in a different address
+            // space, when we know how to convert state to data.
+            cudaq::state *concrete;
+            std::memcpy(&concrete, ((const char *)args) + offset,
+                        sizeof(cudaq::state *));
             auto stateData = getStateData(concrete);
-            if (failed(synthesizeStateArgument(builder, module, counter, argument, stateData)))
-                module.emitError("Failed to synthesize state*");
+            if (failed(synthesizeStateArgument(builder, module, counter,
+                                               argument, stateData)))
+              module.emitError("Failed to synthesize state*");
           } else {
             // All other cases are not yet supported (i.e. quantum hardware).
-            funcOp.emitOpError("synthesis: unsupported argument type: state*");
+            funcOp.emitOpError("synthesis: unsupported argument type on "
+                               "quantum devices: state*");
             signalPassFailure();
           }
           continue;
@@ -879,16 +881,17 @@ std::unique_ptr<mlir::Pass> cudaq::opt::createQuakeSynthesizer() {
 }
 
 /// Execution on remote simulator
-cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, const void *a, SimulationStateData::getDataFunc* getData,
+std::unique_ptr<mlir::Pass>
+cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, const void *a,
+                                   SimulationStateData::getDataFunc *getData,
                                    std::size_t startingArgIdx = 0) {
-  return std::make_unique<QuakeSynthesizer>(kernelName, a, getData, startingArgIdx);
+  return std::make_unique<QuakeSynthesizer>(kernelName, a, getData,
+                                            startingArgIdx);
 }
 
 /// Execution on the same address space in a simulator or a quantum device
 std::unique_ptr<mlir::Pass>
-cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, const void *a, bool sameAddressSpace = false) {
+cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, const void *a,
+                                   bool sameAddressSpace = false) {
   return std::make_unique<QuakeSynthesizer>(kernelName, a, sameAddressSpace);
 }
-
-
-
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index f185f24643..da595b2493 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -403,7 +403,8 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     if (updatedArgs) {
       cudaq::info("Run Quake Synth.\n");
       mlir::PassManager pm(&context);
-      pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs, false));
+      pm.addPass(
+          cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs, false));
       pm.addPass(mlir::createCanonicalizerPass());
       if (disableMLIRthreading || enablePrintMLIREachPass)
         moduleOp.getContext()->disableMultithreading();
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index 66608327dc..639302fe04 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -92,34 +92,22 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
                        });
   }
 
-  static SimulationStateData readSimulationStateData(cudaq::state* s) {
-    std::cout << "Reading sim state data" << std::endl;
+  static SimulationStateData readSimulationStateData(cudaq::state *s) {
+    ;
     void *dataPtr = nullptr;
     auto stateVector = s->get_tensor();
     auto precision = s->get_precision();
     auto numElements = stateVector.get_num_elements();
     auto elementSize = 0;
     if (precision == SimulationState::precision::fp32) {
-      std::cout << "32 bit precision" << std::endl;
       elementSize = sizeof(std::complex<float>);
       auto *hostData = new std::complex<float>[numElements];
-      std::cout << "Reading host data" << std::endl;
       s->to_host(hostData, numElements);
-      std::cout << "Host data:" << std::endl;
-      for (size_t i = 0; i< numElements; i++) {
-        std::cout << hostData[i] << std::endl;
-      }
       dataPtr = reinterpret_cast<void *>(hostData);
     } else {
-      std::cout << "64 bit precision" << std::endl;
       elementSize = sizeof(std::complex<double>);
       auto *hostData = new std::complex<double>[numElements];
-      std::cout << "Reading host data" << std::endl;
       s->to_host(hostData, numElements);
-       std::cout << "Host data:" << std::endl;
-      for (size_t i = 0; i< numElements; i++) {
-        std::cout << hostData[i] << std::endl;
-      }
       dataPtr = reinterpret_cast<void *>(hostData);
     }
     return SimulationStateData(dataPtr, numElements, elementSize);
@@ -205,26 +193,19 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
       if (args) {
         cudaq::info("Run Quake Synth.\n");
         mlir::PassManager pm(&mlirContext);
-        moduleOp.getContext()->disableMultithreading();
-        pm.enableIRPrinting();
-        auto &platform = cudaq::get_platform();
-        
         // For efficiency, we don't run state prep to convert states to gates on
-        // simulators, instead we synthesize them as vectors.
-        pm.addPass(cudaq::opt::createQuakeSynthesizer(name, args, readSimulationStateData, startingArgIdx));
+        // remote simulators, instead we synthesize states as vectors.
+        // Pass the data reader function to the synthesizer for this purpose.
+        pm.addPass(cudaq::opt::createQuakeSynthesizer(
+            name, args, readSimulationStateData, startingArgIdx));
         pm.addPass(mlir::createCanonicalizerPass());
         if (failed(pm.run(moduleOp)))
           throw std::runtime_error("Could not successfully apply quake-synth.");
       }
 
-      // Note: do not run state preparation pass here since we are always
-      // using simulators.
-
       // Run client-side passes. `clientPasses` is empty right now, but the code
       // below accommodates putting passes into it.
       mlir::PassManager pm(&mlirContext);
-      moduleOp.getContext()->disableMultithreading();
-      pm.enableIRPrinting();
       std::string errMsg;
       llvm::raw_string_ostream os(errMsg);
       const std::string pipeline =
diff --git a/targettests/Remote-Sim/state_init.cpp b/targettests/Remote-Sim/state_init.cpp
index 4615e2ec08..735cb16f43 100644
--- a/targettests/Remote-Sim/state_init.cpp
+++ b/targettests/Remote-Sim/state_init.cpp
@@ -10,7 +10,6 @@
 
 // clang-format off
 // RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu %s -o %t && %t
-// RUN: nvq++ %cpp_std --target remote-mqpu %s -o %t && %t // TODO: this fails to compile, do we need it?
 // clang-format on
 
 #include <cudaq.h>
@@ -34,36 +33,22 @@ void printCounts(cudaq::sample_result& result) {
 }
 
 int main() {
-    {
-      std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-      std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
-      auto state = cudaq::state::from_data(vec);
-      auto state1 = cudaq::state::from_data(vec1);
-      {
-          // Passing state data as argument (kernel mode)
-          auto counts = cudaq::sample(test_complex_array_param, &state);
-          printCounts(counts);
-
-          counts = cudaq::sample(test_complex_array_param, &state1);
-          printCounts(counts);
-      }
-
-      // {
-      //     // Passing state data as argument (builder mode)
-      //     auto [kernel, state] = cudaq::make_kernel<cudaq::state*>();
-      //     auto qubits = kernel.qalloc(state);
-
-      //     auto counts = cudaq::sample(kernel, &state);
-      //     printCounts(counts);
-
-      //     counts = cudaq::sample(kernel, &state1);
-      //     printCounts(counts);
-      // }
-    }
+  std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.,  0., 0.,  0., 0.};
+  std::vector<cudaq::complex> vec1{0., 0.,  0., 0., 0., 0., M_SQRT1_2, M_SQRT1_2};
+  auto state = cudaq::state::from_data(vec);
+  auto state1 = cudaq::state::from_data(vec1);
+  {
+      // Passing state data as argument (kernel mode)
+      auto counts = cudaq::sample(test_complex_array_param, &state);
+      printCounts(counts);
+
+      counts = cudaq::sample(test_complex_array_param, &state1);
+      printCounts(counts);
+  }
 }
 
-// CHECK: 00
-// CHECK: 10
+// CHECK: 000
+// CHECK: 100
 
-// CHECK: 01
-// CHECK: 11
+// CHECK: 011
+// CHECK: 111

From e73ac1c948531cf89cbad989c9b1f6841699c518 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 18 Jul 2024 15:20:30 -0700
Subject: [PATCH 45/50] Add tests for vector data serialization for remote sim

---
 targettests/Remote-Sim/state_init_vector.cpp | 202 ++++++++++---------
 1 file changed, 107 insertions(+), 95 deletions(-)

diff --git a/targettests/Remote-Sim/state_init_vector.cpp b/targettests/Remote-Sim/state_init_vector.cpp
index 7e93b63dae..b8d6bdb3bb 100644
--- a/targettests/Remote-Sim/state_init_vector.cpp
+++ b/targettests/Remote-Sim/state_init_vector.cpp
@@ -10,14 +10,11 @@
 
 // clang-format off
 // RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu %s -o %t && %t
-// RUN: nvq++ %cpp_std --target remote-mqpu %s -o %t && %t // TODO: this fails to compile, do we need it?
 // clang-format on
 
 #include <cudaq.h>
 #include <iostream>
 
-
-
 __qpu__ void test_complex_constant_array() {
    cudaq::qvector v(std::vector<cudaq::complex>({ M_SQRT1_2, M_SQRT1_2, 0., 0.}));
 }
@@ -69,27 +66,41 @@ void printCounts(cudaq::sample_result& result) {
 }
 
 int main() {
-    // {
-    //   auto counts = cudaq::sample(test_complex_constant_array);
-    //   printCounts(counts);
-    // }
-
-    // {
-    //   auto counts = cudaq::sample(test_complex_constant_array2);
-    //   printCounts(counts);
-    // }
-
-    // {
-    //   auto counts = cudaq::sample(test_complex_constant_array3);
-    //   printCounts(counts);
-    // }
-
-    // {
-    //   auto counts = cudaq::sample(test_real_constant_array);
-    //   printCounts(counts);
-    // }
-
-    // {
+    {
+      auto counts = cudaq::sample(test_complex_constant_array);
+      printCounts(counts);
+    }
+
+// CHECK: 00
+// CHECK: 10
+
+    {
+      auto counts = cudaq::sample(test_complex_constant_array2);
+      printCounts(counts);
+    }
+
+// CHECK: 0001
+// CHECK: 0011
+// CHECK: 1001
+// CHECK: 1011
+
+    {
+      auto counts = cudaq::sample(test_complex_constant_array3);
+      printCounts(counts);
+    }
+
+// CHECK: 00
+// CHECK: 10
+
+    {
+      auto counts = cudaq::sample(test_real_constant_array);
+      printCounts(counts);
+    }
+
+// CHECK: 00
+// CHECK: 10
+
+    {
       std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
       std::vector<cudaq::complex> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
       {
@@ -101,92 +112,79 @@ int main() {
           printCounts(counts);
       }
 
-    //   {
-    //       // Passing state data as argument (builder mode)
-    //       auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
-    //       auto qubits = kernel.qalloc(v);
-
-    //       auto counts = cudaq::sample(kernel, vec);
-    //       printCounts(counts);
-
-    //       counts = cudaq::sample(kernel, vec1);
-    //       printCounts(counts);
-    //   }
-    // }
-
-    // {
-    //   std::vector<cudaq::real> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-    //   std::vector<cudaq::real> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
-    //   {
-    //       // Passing state data as argument (kernel mode)
-    //       auto counts = cudaq::sample(test_real_array_param, vec);
-    //       printCounts(counts);
-
-    //       counts = cudaq::sample(test_real_array_param, vec1);
-    //       printCounts(counts);
-    //   }
-
-    //   {
-    //       // Passing state data as argument (builder mode)
-    //       auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::real>>();
-    //       auto qubits = kernel.qalloc(v);
-
-    //       auto counts = cudaq::sample(kernel, vec);
-    //       printCounts(counts);
-
-    //       counts = cudaq::sample(kernel, vec1);
-    //       printCounts(counts);
-    //   }
-    // }
-
-    // Error message: "Invalid user-provided state data. Simulator is FP64 but state data is FP32."
-    // {
-    //   std::vector<double> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-    //   std::vector<double> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
-    //   {
-    //       // Passing state data as argument (kernel mode)
-    //       auto counts = cudaq::sample(test_double_array_param, vec);
-    //       printCounts(counts);
-
-    //       counts = cudaq::sample(test_double_array_param, vec1);
-    //       printCounts(counts);
-    //   }
-    // }
-
-    // UCX  ERROR Failed to allocate memory pool (name=mm_recv_desc) chunk: Out of memory
-    // {
-    //   std::vector<float> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
-    //   std::vector<float> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
-    //   {
-    //       // Passing state data as argument (kernel mode)
-    //       auto counts = cudaq::sample(test_float_array_param, vec);
-    //       printCounts(counts);
-
-    //       counts = cudaq::sample(test_float_array_param, vec1);
-    //       printCounts(counts);
-    //   }
-    // }
-}
-
 // CHECK: 00
 // CHECK: 10
 
-// CHECK: 0001
-// CHECK: 0011
-// CHECK: 1001
-// CHECK: 1011
+// CHECK: 01
+// CHECK: 11
+
+      {
+          // Passing state data as argument (builder mode)
+          auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::complex>>();
+          auto qubits = kernel.qalloc(v);
+
+          auto counts = cudaq::sample(kernel, vec);
+          printCounts(counts);
+
+          counts = cudaq::sample(kernel, vec1);
+          printCounts(counts);
+      }
+    }
 
 // CHECK: 00
 // CHECK: 10
 
+// CHECK: 01
+// CHECK: 11
+
+    {
+      std::vector<cudaq::real> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+      std::vector<cudaq::real> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+      {
+          // Passing state data as argument (kernel mode)
+          auto counts = cudaq::sample(test_real_array_param, vec);
+          printCounts(counts);
+
+          counts = cudaq::sample(test_real_array_param, vec1);
+          printCounts(counts);
+      }
+
 // CHECK: 00
 // CHECK: 10
 
+// CHECK: 01
+// CHECK: 11
+
+      {
+          // Passing state data as argument (builder mode)
+          auto [kernel, v] = cudaq::make_kernel<std::vector<cudaq::real>>();
+          auto qubits = kernel.qalloc(v);
+
+          auto counts = cudaq::sample(kernel, vec);
+          printCounts(counts);
+
+          counts = cudaq::sample(kernel, vec1);
+          printCounts(counts);
+      }
+
 // CHECK: 00
 // CHECK: 10
 
 // CHECK: 01
 // CHECK: 11
+    }
+
+    {
+      std::vector<double> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+      std::vector<double> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+
+      // Passing state data as argument (kernel mode)
+      auto counts = cudaq::sample(test_double_array_param, vec);
+      printCounts(counts);
+
+      counts = cudaq::sample(test_double_array_param, vec1);
+      printCounts(counts);
+    }
 
 // CHECK: 00
 // CHECK: 10
@@ -194,8 +192,22 @@ int main() {
 // CHECK: 01
 // CHECK: 11
 
+    {
+      std::vector<float> vec{M_SQRT1_2, M_SQRT1_2, 0., 0.};
+      std::vector<float> vec1{0., 0., M_SQRT1_2, M_SQRT1_2};
+
+      // Passing state data as argument (kernel mode)
+      auto counts = cudaq::sample(test_float_array_param, vec);
+      printCounts(counts);
+
+      counts = cudaq::sample(test_float_array_param, vec1);
+      printCounts(counts);
+    }
+
 // CHECK: 00
 // CHECK: 10
 
 // CHECK: 01
 // CHECK: 11
+}
+

From f4cc697ea42ff3d649f71704709bfbb061ce7059 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 19 Jul 2024 10:42:44 -0700
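Subject: [PATCH 46/50] Merge with main

Remove the <iostream> include from QuakeSynthesizer.cpp, the first of
two back-to-back "Run state preparation for quantum devices only"
blocks in py_alt_launch_kernel.cpp, and a stray empty statement in
readSimulationStateData.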
---
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp         |  2 --
 .../runtime/cudaq/platform/py_alt_launch_kernel.cpp   | 11 -----------
 runtime/common/BaseRestRemoteClient.h                 |  1 -
 3 files changed, 14 deletions(-)

diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index 3ddb72b2fe..edbb25daa3 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -26,8 +26,6 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
 
-#include <iostream>
-
 #define DEBUG_TYPE "quake-synthesizer"
 
 using namespace mlir;
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 1b2b8e55c5..dfd2384b38 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -514,17 +514,6 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   pm.addPass(cudaq::opt::createQuakeSynthesizer(name, rawArgs, true));
   pm.addPass(createCanonicalizerPass());
 
-  // Run state preparation for quantum devices only.
-  // Simulators have direct implementation of state initialization
-  // in their runtime.
-  auto &platform = cudaq::get_platform();
-  if (!platform.is_simulator() || platform.is_emulated()) {
-    pm.addPass(cudaq::opt::createConstPropComplex());
-    pm.addPass(cudaq::opt::createLiftArrayAlloc());
-    pm.addPass(cudaq::opt::createStatePreparation());
-  }
-  pm.addPass(createCanonicalizerPass());
-
   // Run state preparation for quantum devices only.
   // Simulators have direct implementation of state initialization
   // in their runtime.
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index b86a4d0ebf..a752d2d35f 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -93,7 +93,6 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
   }
 
   static SimulationStateData readSimulationStateData(cudaq::state *s) {
-    ;
     void *dataPtr = nullptr;
     auto stateVector = s->get_tensor();
     auto precision = s->get_precision();

From 585973fc3759c85bbaf2992214a92a92a9476231 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 19 Jul 2024 11:35:41 -0700
Subject: [PATCH 47/50] Add more synth tests

---
 unittests/Optimizer/QuakeSynthTester.cpp | 59 ++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/unittests/Optimizer/QuakeSynthTester.cpp b/unittests/Optimizer/QuakeSynthTester.cpp
index aa25940255..414d1d14c2 100644
--- a/unittests/Optimizer/QuakeSynthTester.cpp
+++ b/unittests/Optimizer/QuakeSynthTester.cpp
@@ -328,6 +328,65 @@ TEST(QuakeSynthTests, checkVectorOfInt) {
   EXPECT_EQ(countz.size(), 1);
 }
 
+TEST(QuakeSynthTests, checkStatePointerLocalSim) {
+  auto [kernel, thetas] = cudaq::make_kernel<cudaq::state*>();
+  auto theta = thetas[0];
+  auto phi = thetas[1];
+  auto q = kernel.qalloc(3);
+  kernel.x(q[0]);
+  kernel.ry(theta, q[1]);
+  kernel.ry(phi, q[2]);
+  kernel.x<cudaq::ctrl>(q[2], q[0]);
+  kernel.x<cudaq::ctrl>(q[0], q[1]);
+  kernel.ry(-theta, q[1]);
+  kernel.x<cudaq::ctrl>(q[0], q[1]);
+  kernel.x<cudaq::ctrl>(q[1], q[0]);
+
+  std::cout << kernel.to_quake() << '\n';
+
+  // Set the proper name for the kernel
+  auto properName = cudaq::runtime::cudaqGenPrefixName + kernel.name();
+
+  using namespace cudaq::spin;
+  cudaq::spin_op h = 5.907 - 2.1433 * x(0) * x(1) - 2.1433 * y(0) * y(1) +
+                     .21829 * z(0) - 6.125 * z(1);
+  cudaq::spin_op h3 = h + 9.625 - 9.625 * z(2) - 3.913119 * x(1) * x(2) -
+                      3.913119 * y(1) * y(2);
+
+  cudaq::state state = cudaq::state::from_data(std::vector<std::complex<double>>({.3591, .2569}));
+  double energy = cudaq::observe(kernel, h3, &state);
+  EXPECT_NEAR(energy, -2.045375, 1e-3);
+
+  // Map the kernel_builder to_quake output  to MLIR
+  auto context = cudaq::initializeMLIR();
+  auto module = parseSourceString<ModuleOp>(kernel.to_quake(), context.get());
+
+  // Create a struct defining the runtime args for the kernel
+  auto [args, offset] =
+      cudaq::mapToRawArgs(kernel.name(), std::vector<double>{.3591, .2569});
+
+  // Run quake-synth
+  EXPECT_TRUE(succeeded(runQuakeSynth(kernel.name(), args, module)));
+
+  // Get the function, make sure that it has no arguments
+  auto func = module->lookupSymbol<func::FuncOp>(properName);
+  EXPECT_TRUE(func);
+  EXPECT_TRUE(func.getArguments().empty());
+
+  func.dump();
+
+  // Lower to LLVM and create the JIT execution engine
+  EXPECT_TRUE(succeeded(lowerToLLVMDialect(*module)));
+  auto jitOrError = ExecutionEngine::create(*module);
+  EXPECT_TRUE(!!jitOrError);
+  std::unique_ptr<ExecutionEngine> jit = std::move(jitOrError.get());
+
+  // // Sample this new kernel processed with quake synth
+  energy = observeJitCode(jit.get(), h3, kernel.name());
+  // Should see the same thing as before.
+  EXPECT_NEAR(energy, -2.045375, 1e-3);
+}
+
 TEST(QuakeSynthTests, checkCallable) {
   auto [ansatz, thetas] = cudaq::make_kernel<std::vector<double>>();
   auto q = ansatz.qalloc(2);

From c88c51e53423c44ed9394c3264a036641e6c4e51 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 19 Jul 2024 14:07:03 -0700
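Subject: [PATCH 48/50] Cleanup

Collapse the two argument-taking QuakeSynthesizer constructors and the
two matching createQuakeSynthesizer overloads into a single one with
the parameter order
(kernelName, args, startingArgIdx, getData, sameAddressSpace).
The last three parameters get default values in Passes.h. Update the
callers in BaseRemoteRESTQPU.h and QuakeSynthTester.cpp to the new
signature.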
---
 include/cudaq/Optimizer/Transforms/Passes.h   |  6 +--
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp | 43 +++++++------------
 runtime/common/BaseRemoteRESTQPU.h            |  3 +-
 unittests/Optimizer/QuakeSynthTester.cpp      |  8 ++--
 4 files changed, 24 insertions(+), 36 deletions(-)

diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index ae9c6d2188..03af81ab35 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -44,9 +44,9 @@ std::unique_ptr<mlir::Pass> createQuakeAddDeallocs();
 std::unique_ptr<mlir::Pass> createQuakeSynthesizer();
 std::unique_ptr<mlir::Pass>
 createQuakeSynthesizer(std::string_view, const void *,
-                       SimulationStateData::getDataFunc *, std::size_t);
-std::unique_ptr<mlir::Pass> createQuakeSynthesizer(std::string_view,
-                                                   const void *, bool);
+                       std::size_t startingArgIdx = 0,
+                       SimulationStateData::getDataFunc *getData = nullptr,
+                       bool sameAddressSpace = false);
 std::unique_ptr<mlir::Pass> createRaiseToAffinePass();
 std::unique_ptr<mlir::Pass> createUnwindLoweringPass();
 
diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index edbb25daa3..8f36b03db8 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -481,28 +481,23 @@ class QuakeSynthesizer
   // The raw pointer to the runtime arguments.
   const void *args;
 
+  // The starting argument index to synthesize. Typically 0 but may be >0 for
+  // partial synthesis. If >0, it is assumed that the first argument(s) are NOT
+  // in `args`.
+  std::size_t startingArgIdx = 0;
+
   // Function to read the state data, if any.
   SimulationStateData::getDataFunc *getStateData = nullptr;
 
   // Is the simulation running in the same address space as synthesis?
   bool sameAddressSpace = false;
 
-  // The starting argument index to synthesize. Typically 0 but may be >0 for
-  // partial synthesis. If >0, it is assumed that the first argument(s) are NOT
-  // in `args`.
-  std::size_t startingArgIdx = 0;
-
 public:
   QuakeSynthesizer() = default;
-
-  // Execution in a same address space on a simulator, or a quantum device
-  QuakeSynthesizer(std::string_view kernel, const void *a, bool sameSpace)
-      : kernelName(kernel), args(a), sameAddressSpace(sameSpace) {}
-
-  // Execution on a remote simulator
-  QuakeSynthesizer(std::string_view kernel, const void *a,
-                   SimulationStateData::getDataFunc *getData, std::size_t s)
-      : kernelName(kernel), args(a), getStateData(getData), startingArgIdx(s) {}
+  QuakeSynthesizer(std::string_view kernel, const void *a, std::size_t s,
+                   SimulationStateData::getDataFunc *getData, bool sameSpace)
+      : kernelName(kernel), args(a), startingArgIdx(s), getStateData(getData),
+        sameAddressSpace(sameSpace) {}
 
   mlir::ModuleOp getModule() { return getOperation(); }
 
@@ -881,18 +876,10 @@ std::unique_ptr<mlir::Pass> cudaq::opt::createQuakeSynthesizer() {
   return std::make_unique<QuakeSynthesizer>();
 }
 
-/// Execution on remote simulator
-std::unique_ptr<mlir::Pass>
-cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, const void *a,
-                                   SimulationStateData::getDataFunc *getData,
-                                   std::size_t startingArgIdx = 0) {
-  return std::make_unique<QuakeSynthesizer>(kernelName, a, getData,
-                                            startingArgIdx);
-}
-
-/// Execution on the same address space in a simulator or a quantum device
-std::unique_ptr<mlir::Pass>
-cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, const void *a,
-                                   bool sameAddressSpace = false) {
-  return std::make_unique<QuakeSynthesizer>(kernelName, a, sameAddressSpace);
+std::unique_ptr<mlir::Pass> cudaq::opt::createQuakeSynthesizer(
+    std::string_view kernelName, const void *a, std::size_t startingArgIdx = 0,
+    SimulationStateData::getDataFunc *getData = nullptr,
+    bool sameAddressSpace = false) {
+  return std::make_unique<QuakeSynthesizer>(kernelName, a, startingArgIdx,
+                                            getData, sameAddressSpace);
 }
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index f6d914d250..2ea79e3f91 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -406,8 +406,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     if (updatedArgs) {
       cudaq::info("Run Quake Synth.\n");
       mlir::PassManager pm(&context);
-      pm.addPass(
-          cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs, false));
+      pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
       pm.addPass(mlir::createCanonicalizerPass());
       if (disableMLIRthreading || enablePrintMLIREachPass)
         moduleOp.getContext()->disableMultithreading();
diff --git a/unittests/Optimizer/QuakeSynthTester.cpp b/unittests/Optimizer/QuakeSynthTester.cpp
index 414d1d14c2..a74a1c82bb 100644
--- a/unittests/Optimizer/QuakeSynthTester.cpp
+++ b/unittests/Optimizer/QuakeSynthTester.cpp
@@ -54,7 +54,8 @@ LogicalResult runQuakeSynth(std::string_view kernelName, void *rawArgs,
   PassManager pm(module->getContext());
   module->getContext()->disableMultithreading();
   pm.enableIRPrinting();
-  pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, rawArgs, true));
+  pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, rawArgs, 0, nullptr,
+                                                true));
   pm.addPass(createCanonicalizerPass());
   pm.addPass(cudaq::opt::createExpandMeasurementsPass());
   pm.addNestedPass<func::FuncOp>(cudaq::opt::createClassicalMemToReg());
@@ -329,7 +330,7 @@ TEST(QuakeSynthTests, checkVectorOfInt) {
 }
 
 TEST(QuakeSynthTests, checkStatePointerLocalSim) {
-  auto [kernel, thetas] = cudaq::make_kernel<cudaq::state*>();
+  auto [kernel, thetas] = cudaq::make_kernel<cudaq::state *>();
   auto theta = thetas[0];
   auto phi = thetas[1];
   auto q = kernel.qalloc(3);
@@ -353,7 +354,8 @@ TEST(QuakeSynthTests, checkStatePointerLocalSim) {
   cudaq::spin_op h3 = h + 9.625 - 9.625 * z(2) - 3.913119 * x(1) * x(2) -
                       3.913119 * y(1) * y(2);
 
-  cudaq::state state = cudaq::state::from_data(std::vector<std::complex<double>>({.3591, .2569}));
+  cudaq::state state = cudaq::state::from_data(
+      std::vector<std::complex<double>>({.3591, .2569}));
   double energy = cudaq::observe(kernel, h3, &state);
   EXPECT_NEAR(energy, -2.045375, 1e-3);
 

From 10becb801b12d094955f12f7890478bf77c7f26d Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 19 Jul 2024 15:47:45 -0700
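Subject: [PATCH 49/50] Cleanup

Drop the default arguments from the createQuakeSynthesizer definition
in QuakeSynthesizer.cpp; the declaration in Passes.h already carries
them. Switch the callers in py_alt_launch_kernel.cpp and
BaseRestRemoteClient.h to the
(kernelName, args, startingArgIdx, getData, sameAddressSpace)
argument order.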
---
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp          | 6 +++---
 python/runtime/cudaq/platform/py_alt_launch_kernel.cpp | 2 +-
 runtime/common/BaseRestRemoteClient.h                  | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index 8f36b03db8..4a62afdf77 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -877,9 +877,9 @@ std::unique_ptr<mlir::Pass> cudaq::opt::createQuakeSynthesizer() {
 }
 
 std::unique_ptr<mlir::Pass> cudaq::opt::createQuakeSynthesizer(
-    std::string_view kernelName, const void *a, std::size_t startingArgIdx = 0,
-    SimulationStateData::getDataFunc *getData = nullptr,
-    bool sameAddressSpace = false) {
+    std::string_view kernelName, const void *a, std::size_t startingArgIdx,
+    SimulationStateData::getDataFunc *getData,
+    bool sameAddressSpace) {
   return std::make_unique<QuakeSynthesizer>(kernelName, a, startingArgIdx,
                                             getData, sameAddressSpace);
 }
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index dfd2384b38..93477941d3 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -511,7 +511,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
       getEnvBool("CUDAQ_MLIR_PRINT_EACH_PASS", false);
 
   PassManager pm(context);
-  pm.addPass(cudaq::opt::createQuakeSynthesizer(name, rawArgs, true));
+  pm.addPass(cudaq::opt::createQuakeSynthesizer(name, rawArgs, 0, nullptr, true));
   pm.addPass(createCanonicalizerPass());
 
   // Run state preparation for quantum devices only.
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index a752d2d35f..79ef3a5043 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -196,7 +196,7 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
         // remote simulators, instead we synthesize states as vectors.
         // Pass the data reader function to the synthesizer for this purpose.
         pm.addPass(cudaq::opt::createQuakeSynthesizer(
-            name, args, readSimulationStateData, startingArgIdx));
+            name, args, startingArgIdx, readSimulationStateData));
         pm.addPass(mlir::createCanonicalizerPass());
         if (failed(pm.run(moduleOp)))
           throw std::runtime_error("Could not successfully apply quake-synth.");

From a4d16e7f02b5a1f2cbb58edf9a3a3376f135497c Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 19 Jul 2024 15:50:47 -0700
Subject: [PATCH 50/50] Format

---
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp          | 3 +--
 python/runtime/cudaq/platform/py_alt_launch_kernel.cpp | 3 ++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
index 4a62afdf77..a32eb6d737 100644
--- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
+++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp
@@ -878,8 +878,7 @@ std::unique_ptr<mlir::Pass> cudaq::opt::createQuakeSynthesizer() {
 
 std::unique_ptr<mlir::Pass> cudaq::opt::createQuakeSynthesizer(
     std::string_view kernelName, const void *a, std::size_t startingArgIdx,
-    SimulationStateData::getDataFunc *getData,
-    bool sameAddressSpace) {
+    SimulationStateData::getDataFunc *getData, bool sameAddressSpace) {
   return std::make_unique<QuakeSynthesizer>(kernelName, a, startingArgIdx,
                                             getData, sameAddressSpace);
 }
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 93477941d3..353e36bcd4 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -511,7 +511,8 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
       getEnvBool("CUDAQ_MLIR_PRINT_EACH_PASS", false);
 
   PassManager pm(context);
-  pm.addPass(cudaq::opt::createQuakeSynthesizer(name, rawArgs, 0, nullptr, true));
+  pm.addPass(
+      cudaq::opt::createQuakeSynthesizer(name, rawArgs, 0, nullptr, true));
   pm.addPass(createCanonicalizerPass());
 
   // Run state preparation for quantum devices only.