diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h index 996b6e56a7..422032326c 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.h +++ b/include/cudaq/Optimizer/Transforms/Passes.h @@ -40,6 +40,8 @@ std::unique_ptr createLowerToCFGPass(); std::unique_ptr createObserveAnsatzPass(std::vector &); std::unique_ptr createQuakeAddMetadata(); std::unique_ptr createQuakeAddDeallocs(); +std::unique_ptr createStatePreparation(); +std::unique_ptr createStatePreparation(std::string_view, void *); std::unique_ptr createQuakeSynthesizer(); std::unique_ptr createQuakeSynthesizer(std::string_view, void *); std::unique_ptr createRaiseToAffinePass(); diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td index 8d2f0c1821..e5e15a8776 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.td +++ b/include/cudaq/Optimizer/Transforms/Passes.td @@ -512,6 +512,17 @@ def PruneCtrlRelations : Pass<"pruned-ctrl-form", "mlir::func::FuncOp"> { }]; } +def PrepareState : Pass<"state-prep", "mlir::ModuleOp"> { + let summary = + "Convert state vector data into gates"; + let description = [{ + Convert quake representation that includes qubit initialization + from data into qubit initialization using gates. + }]; + + let constructor = "cudaq::opt::createStatePreparation()"; +} + def QuakeSynthesize : Pass<"quake-synth", "mlir::ModuleOp"> { let summary = "Synthesize concrete quantum program from Quake code plus runtime values."; diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt index 7600efe276..173cec4538 100644 --- a/lib/Optimizer/Transforms/CMakeLists.txt +++ b/lib/Optimizer/Transforms/CMakeLists.txt @@ -42,6 +42,8 @@ add_cudaq_library(OptTransforms QuakeSynthesizer.cpp RefToVeqAlloc.cpp RegToMem.cpp + StateDecomposer.cpp + StatePreparation.cpp PySynthCallableBlockArgs.cpp DEPENDS diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 625e9727c6..0244adb3d3 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -1220,7 +1220,6 @@ class GenerateKernelExecution } continue; } - stVal = builder.create(loc, stVal.getType(), stVal, arg, idx); } diff --git a/lib/Optimizer/Transforms/StateDecomposer.cpp b/lib/Optimizer/Transforms/StateDecomposer.cpp new file mode 100644 index 0000000000..3105fad707 --- /dev/null +++ b/lib/Optimizer/Transforms/StateDecomposer.cpp @@ -0,0 +1,128 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "StateDecomposer.h" + +namespace cudaq::details { + +std::vector grayCode(std::size_t numBits) { + std::vector result(1ULL << numBits); + for (std::size_t i = 0; i < (1ULL << numBits); ++i) + result[i] = ((i >> 1) ^ i); + return result; +} + +std::vector getControlIndices(std::size_t numBits) { + auto code = grayCode(numBits); + std::vector indices; + for (auto i = 0u; i < code.size(); ++i) { + // The position of the control in the lth CNOT gate is set to match + // the position where the lth and (l + 1)th bit strings g[l] and g[l+1] of + // the binary reflected Gray code differ. + auto position = std::log2(code[i] ^ code[(i + 1) % code.size()]); + // N.B: In CUDA Quantum we write the least significant bit (LSb) on the left + // + // lsb -v + // 001 + // ^- msb + // + // Meaning that the bitstring 001 represents the number four instead of one. + // The above position calculation uses the 'normal' convention of writing + // numbers with the LSb on the left. + // + // Now, what we need to find out is the position of the 1 in the bitstring. + // If we take LSb as being position 0, then for the normal convention its + // position will be 0. Using CUDA Quantum convention it will be 2. Hence, + // we need to convert the position we find using: + // + // numBits - position - 1 + // + // The extra -1 is to account for indices starting at 0. Using the above + // examples: + // + // bitstring: 001 + // numBits: 3 + // position: 0 + // + // We have the converted position: 2, which is what we need. + indices.emplace_back(numBits - position - 1); + } + return indices; +} + +std::vector convertAngles(const std::span alphas) { + // Implements Eq. (3) from https://arxiv.org/pdf/quant-ph/0407010.pdf + // + // N.B: The paper does fails to explicitly define what is the dot operator in + // the exponent of -1. Ref. 3 solves the mystery: its the bitwise inner + // product. + auto bitwiseInnerProduct = [](std::size_t a, std::size_t b) { + auto product = a & b; + auto sumOfProducts = 0; + while (product) { + sumOfProducts += product & 0b1 ? 1 : 0; + product = product >> 1; + } + return sumOfProducts; + }; + std::vector thetas(alphas.size(), 0); + for (std::size_t i = 0u; i < alphas.size(); ++i) { + for (std::size_t j = 0u; j < alphas.size(); ++j) + thetas[i] += + bitwiseInnerProduct(j, ((i >> 1) ^ i)) & 0b1 ? -alphas[j] : alphas[j]; + thetas[i] /= alphas.size(); + } + return thetas; +} + +std::vector getAlphaZ(const std::span data, + std::size_t numQubits, std::size_t k) { + // Implements Eq. (5) from https://arxiv.org/pdf/quant-ph/0407010.pdf + std::vector angles; + double divisor = static_cast(1ULL << (k - 1)); + for (std::size_t j = 1; j <= (1ULL << (numQubits - k)); ++j) { + double angle = 0.0; + for (std::size_t l = 1; l <= (1ULL << (k - 1)); ++l) + // N.B: There is an extra '-1' on these indices computations to account + // for the fact that our indices start at 0. + angle += data[(2 * j - 1) * (1 << (k - 1)) + l - 1] - + data[(2 * j - 2) * (1 << (k - 1)) + l - 1]; + angles.push_back(angle / divisor); + } + return angles; +} + +std::vector getAlphaY(const std::span data, + std::size_t numQubits, std::size_t k) { + // Implements Eq. (8) from https://arxiv.org/pdf/quant-ph/0407010.pdf + // N.B: There is an extra '-1' on these indices computations to account for + // the fact that our indices start at 0. + std::vector angles; + for (std::size_t j = 1; j <= (1ULL << (numQubits - k)); ++j) { + double numerator = 0; + for (std::size_t l = 1; l <= (1ULL << (k - 1)); ++l) { + numerator += + std::pow(std::abs(data[(2 * j - 1) * (1 << (k - 1)) + l - 1]), 2); + } + + double denominator = 0; + for (std::size_t l = 1; l <= (1ULL << k); ++l) { + denominator += std::pow(std::abs(data[(j - 1) * (1 << k) + l - 1]), 2); + } + + if (denominator == 0.0) { + assert(numerator == 0.0 && + "If the denominator is zero, the numerator must also be zero."); + angles.push_back(0.0); + continue; + } + angles.push_back(2.0 * std::asin(std::sqrt(numerator / denominator))); + } + return angles; +} +} // namespace cudaq::details \ No newline at end of file diff --git a/lib/Optimizer/Transforms/StateDecomposer.h b/lib/Optimizer/Transforms/StateDecomposer.h new file mode 100644 index 0000000000..a698ac83c2 --- /dev/null +++ b/lib/Optimizer/Transforms/StateDecomposer.h @@ -0,0 +1,177 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "PassDetails.h" +#include "cudaq/Optimizer/Builder/Runtime.h" +#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h" +#include "cudaq/Optimizer/Dialect/CC/CCOps.h" +#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" +#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h" +#include "cudaq/Optimizer/Transforms/Passes.h" +#include "llvm/Support/Debug.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Complex/IR/Complex.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Target/LLVMIR/TypeToLLVM.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/RegionUtils.h" +#include + +#include + +namespace cudaq::details { + +/// @brief Converts angles of a uniformly controlled rotation to angles of +/// non-controlled rotations. +std::vector convertAngles(const std::span alphas); + +/// @brief Return the control indices dictated by the gray code implementation. +/// +/// Here, numBits is the number of controls. +std::vector getControlIndices(std::size_t numBits); + +/// @brief Return angles required to implement a uniformly controlled z-rotation +/// on the `kth` qubit. +std::vector getAlphaZ(const std::span data, + std::size_t numQubits, std::size_t k); + +/// @brief Return angles required to implement a uniformly controlled y-rotation +/// on the `kth` qubit. +std::vector getAlphaY(const std::span data, + std::size_t numQubits, std::size_t k); +} // namespace cudaq::details + +class StateGateBuilder { +public: + StateGateBuilder(mlir::OpBuilder &b, mlir::Location &l, mlir::Value &q) + : builder(b), loc(l), qubits(q) {} + + template + void applyRotationOp(double theta, std::size_t target) { + auto qubit = createQubitRef(target); + auto thetaValue = createAngleValue(theta); + builder.create(loc, thetaValue, mlir::ValueRange{}, qubit); + }; + + void applyX(std::size_t control, std::size_t target) { + auto qubitC = createQubitRef(control); + auto qubitT = createQubitRef(target); + builder.create(loc, qubitC, qubitT); + }; + +private: + mlir::Value createQubitRef(std::size_t index) { + if (qubitRefs.contains(index)) { + return qubitRefs[index]; + } + + auto indexValue = builder.create( + loc, index, builder.getIntegerType(64)); + auto ref = builder.create(loc, qubits, indexValue); + qubitRefs[index] = ref; + return ref; + } + + mlir::Value createAngleValue(double angle) { + return builder.create( + loc, llvm::APFloat{angle}, builder.getF64Type()); + } + + mlir::OpBuilder &builder; + mlir::Location &loc; + mlir::Value &qubits; + + std::unordered_map qubitRefs = + std::unordered_map(); +}; + +class StateDecomposer { +public: + StateDecomposer(StateGateBuilder &b, std::span> a) + : builder(b), amplitudes(a), numQubits(log2(a.size())) {} + + /// @brief Decompose the input state vector data to a set of controlled + /// operations and rotations. This function takes as input a `OpBuilder` + /// and appends the operations of the decomposition to its internal + /// representation. This implementation follows the algorithm defined in + /// `https://arxiv.org/pdf/quant-ph/0407010.pdf`. + void decompose() { + + // Decompose the state into phases and magnitudes. + bool needsPhaseEqualization = false; + std::vector phases; + std::vector magnitudes; + for (const auto &a : amplitudes) { + phases.push_back(std::arg(a)); + magnitudes.push_back(std::abs(a)); + // FIXME: remove magic number. + needsPhaseEqualization |= std::abs(phases.back()) > 1e-10; + } + + // N.B: The algorithm, as described in the paper, creates a circuit that + // begins with a target state and brings it to the all zero state. Hence, + // this implementation do the two steps described in Section III in reverse + // order. + + // Apply uniformly controlled y-rotations, the construction in Eq. (4). + for (std::size_t j = 1; j <= numQubits; ++j) { + auto k = numQubits - j + 1; + auto numControls = j - 1; + auto target = j - 1; + auto alphaYk = cudaq::details::getAlphaY(magnitudes, numQubits, k); + applyRotation(alphaYk, numControls, target); + } + + if (!needsPhaseEqualization) + return; + + // Apply uniformly controlled z-rotations, the construction in Eq. (4). + for (std::size_t j = 1; j <= numQubits; ++j) { + auto k = numQubits - j + 1; + auto numControls = j - 1; + auto target = j - 1; + auto alphaZk = cudaq::details::getAlphaZ(phases, numQubits, k); + if (alphaZk.empty()) + continue; + applyRotation(alphaZk, numControls, target); + } + } + +private: + /// @brief Apply a uniformly controlled rotation on the target qubit. + template + void applyRotation(const std::span alphas, std::size_t numControls, + std::size_t target) { + + // In our model the index 1 (i.e. |01>) in quantum state data + // corresponds to qubits[0]=1 and qubits[1] = 0. + // Revert the order of qubits as the state preparation algorithm + // we use assumes the opposite. + auto qubitIndex = [&](std::size_t i) { return numQubits - i - 1; }; + + auto thetas = cudaq::details::convertAngles(alphas); + if (numControls == 0) { + builder.applyRotationOp(thetas[0], qubitIndex(target)); + return; + } + + auto controlIndices = cudaq::details::getControlIndices(numControls); + assert(thetas.size() == controlIndices.size()); + for (auto [i, c] : llvm::enumerate(controlIndices)) { + builder.applyRotationOp(thetas[i], qubitIndex(target)); + builder.applyX(qubitIndex(c), qubitIndex(target)); + } + } + + StateGateBuilder &builder; + std::span> amplitudes; + std::size_t numQubits; +}; diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp b/lib/Optimizer/Transforms/StatePreparation.cpp new file mode 100644 index 0000000000..785e70b3f8 --- /dev/null +++ b/lib/Optimizer/Transforms/StatePreparation.cpp @@ -0,0 +1,370 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "PassDetails.h" +#include "StateDecomposer.h" +#include "cudaq/Optimizer/Builder/Runtime.h" +#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h" +#include "cudaq/Optimizer/Dialect/CC/CCOps.h" +#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" +#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h" +#include "cudaq/Optimizer/Transforms/Passes.h" +#include "llvm/Support/Debug.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Complex/IR/Complex.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Target/LLVMIR/TypeToLLVM.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/RegionUtils.h" +#include + +#define DEBUG_TYPE "state-preparation" + +using namespace mlir; + +/// Replace a qubit initialization from vectors with quantum gates. +/// For example: +/// +/// func.func @foo(%arg0 : !cc.stdvec>) { +/// %0 = cc.stdvec_size %arg0 : (!cc.stdvec>) -> i64 +/// %1 = math.cttz %0 : i64 +/// %2 = cc.stdvec_data %arg0 : (!cc.stdvec>) -> +/// !cc.ptr> %3 = quake.alloca !quake.veq[%1 : i64] %4 = +/// quake.init_state %3, %2 : (!quake.veq, !cc.ptr>) -> +/// !quake.veq return +/// } +/// +/// On a call that passes std::vector vec{M_SQRT1_2, 0., 0., +/// M_SQRT1_2} as arg0: +/// +/// func.func @foo(%arg0 : !cc.stdvec>) { +/// %0 = quake.alloca !quake.veq<2> +/// %c0_i64 = arith.constant 0 : i64 +/// %1 = quake.extract_ref %0[%c0_i64] : (!quake.veq<2>, i64) -> !quake.ref +/// %cst = arith.constant 1.5707963267948968 : f64 +/// quake.ry (%cst) %1 : (f64, !quake.ref) -> () +/// %c1_i64 = arith.constant 1 : i64 +/// %2 = quake.extract_ref %0[%c1_i64] : (!quake.veq<2>, i64) -> !quake.ref +/// %cst_0 = arith.constant 1.5707963267948966 : f64 +/// quake.ry (%cst_0) %2 : (f64, !quake.ref) -> () +/// quake.x [%1] %2 : (!quake.ref, !quake.ref) -> () +/// %cst_1 = arith.constant -1.5707963267948966 : f64 +/// quake.ry (%cst_1) %2 : (f64, !quake.ref) -> () +/// quake.x [%1] %2 : (!quake.ref, !quake.ref) -> () +/// return +/// } +/// +/// Note: the following synthesis and const prop passes will replace +/// the argument by a constant and propagate the values and vector size +/// through other instructions. + +namespace { + +template +concept IntegralType = + std::is_same::value || std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value; + +template +concept FloatingType = std::is_same::value; + +template +concept DoubleType = std::is_same::value; + +template +concept ComplexDataType = FloatingType || DoubleType || IntegralType; + +/// Input was complex but we prefer +/// complex. Make a copy, extending the values. +template +std::vector> convertToComplex(std::complex *data, + std::uint64_t size) { + auto convertData = std::vector>(size); + for (std::size_t i = 0; i < size; ++i) + convertData[i] = std::complex{static_cast(data[i].real()), + static_cast(data[i].imag())}; + return convertData; +} + +template +std::vector> convertToComplex(std::complex *data, + std::uint64_t size) { + return std::vector>(data, data + size); +} + +/// Input was float/double but we prefer complex. +/// Make a copy, extending or truncating the values. +template +std::vector> convertToComplex(From *data, + std::uint64_t size) { + auto convertData = std::vector>(size); + for (std::size_t i = 0; i < size; ++i) + convertData[i] = std::complex{static_cast(data[i]), + static_cast(0.0)}; + return convertData; +} + +LogicalResult +prepareStateFromVectorArgument(OpBuilder &builder, ModuleOp module, + unsigned &counter, BlockArgument argument, + std::vector> &vec) { + auto *ctx = builder.getContext(); + auto argLoc = argument.getLoc(); + + auto toErase = std::vector(); + + for (auto *argUser : argument.getUsers()) { + // Handle the `StdvecSize` and `quake.alloca` use case: + // - Replace a `vec.size()` with the vector length. + // - Replace the number of qubits calculation with the vector length + // logarithm. + // - Replace `quake.alloca` with a constant size qvector allocation. + if (auto stdvecSizeOp = dyn_cast(argUser)) { + builder.setInsertionPointAfter(stdvecSizeOp); + Value length = builder.create( + argLoc, vec.size(), stdvecSizeOp.getType()); + + Value numQubits = builder.create( + argLoc, log2(vec.size()), stdvecSizeOp.getType()); + + for (auto *sizeUser : argUser->getUsers()) { + if (auto countZeroesOp = + dyn_cast(sizeUser)) { + for (auto *numQubitsUser : sizeUser->getUsers()) { + if (auto quakeAllocaOp = dyn_cast(numQubitsUser)) { + builder.setInsertionPointAfter(quakeAllocaOp); + auto veqTy = quake::VeqType::get(ctx, log2(vec.size())); + Value newAlloc = builder.create(argLoc, veqTy); + quakeAllocaOp.replaceAllUsesWith(newAlloc); + toErase.push_back(quakeAllocaOp); + } + } + countZeroesOp.replaceAllUsesWith(numQubits); + toErase.push_back(countZeroesOp); + } + } + + stdvecSizeOp.replaceAllUsesWith(length); + toErase.push_back(stdvecSizeOp); + continue; + } + + // Handle the `StdvecDataOp` and `quake.init_state` use case: + // - Replace a `quake.init_state` with gates preparing the state. + if (auto stdvecDataOp = dyn_cast(argUser)) { + for (auto *dataUser : stdvecDataOp->getUsers()) { + if (auto initOp = dyn_cast(dataUser)) { + builder.setInsertionPointAfter(initOp); + // Find the qvector alloc instruction + auto qubits = initOp.getOperand(0); + + // Prepare state from vector data. + auto gateBuilder = StateGateBuilder(builder, argLoc, qubits); + auto decomposer = StateDecomposer(gateBuilder, vec); + decomposer.decompose(); + + initOp.replaceAllUsesWith(qubits); + toErase.push_back(initOp); + } + } + } + } + + for (auto &op : toErase) { + op->erase(); + } + + return success(); +} + +class StatePreparation : public cudaq::opt::PrepareStateBase { +protected: + // The name of the kernel to be synthesized + std::string kernelName; + + // The raw pointer to the runtime arguments. + void *args; + +public: + StatePreparation() = default; + StatePreparation(std::string_view kernel, void *a) + : kernelName(kernel), args(a) {} + + mlir::ModuleOp getModule() { return getOperation(); } + + std::pair> + getTargetLayout(FunctionType funcTy) { + auto bufferTy = cudaq::opt::factory::buildInvokeStructType(funcTy); + StringRef dataLayoutSpec = ""; + if (auto attr = + getModule()->getAttr(cudaq::opt::factory::targetDataLayoutAttrName)) + dataLayoutSpec = cast(attr); + auto dataLayout = llvm::DataLayout(dataLayoutSpec); + // Convert bufferTy to llvm. + llvm::LLVMContext context; + LLVMTypeConverter converter(funcTy.getContext()); + cudaq::opt::initializeTypeConversions(converter); + auto llvmDialectTy = converter.convertType(bufferTy); + LLVM::TypeToLLVMIRTranslator translator(context); + auto *llvmStructTy = + cast(translator.translateType(llvmDialectTy)); + auto *layout = dataLayout.getStructLayout(llvmStructTy); + auto strSize = layout->getSizeInBytes(); + std::vector fieldOffsets; + for (std::size_t i = 0, I = bufferTy.getMembers().size(); i != I; ++i) + fieldOffsets.emplace_back(layout->getElementOffset(i)); + return {strSize, fieldOffsets}; + } + + void runOnOperation() override final { + auto module = getModule(); + unsigned counter = 0; + + if (args == nullptr || kernelName.empty()) { + module.emitOpError( + "State preparation requires a kernel and the values of the " + "arguments passed when it is called."); + signalPassFailure(); + return; + } + + auto kernelNameInQuake = cudaq::runtime::cudaqGenPrefixName + kernelName; + // Get the function we care about (the one with kernelName) + auto funcOp = module.lookupSymbol(kernelNameInQuake); + if (!funcOp) { + module.emitOpError("The kernel '" + kernelName + + "' was not found in the module."); + signalPassFailure(); + return; + } + + // Create the builder. + auto builder = OpBuilder::atBlockBegin(&funcOp.getBody().front()); + auto arguments = funcOp.getArguments(); + auto structLayout = getTargetLayout(funcOp.getFunctionType()); + // Keep track of the stdVec sizes. + std::vector> stdVecInfo; + + for (auto iter : llvm::enumerate(arguments)) { + auto argNum = iter.index(); + auto argument = iter.value(); + std::size_t offset = structLayout.second[argNum]; + + // Get the argument type + auto type = argument.getType(); + + if (auto ptrTy = dyn_cast(type)) { + if (isa(ptrTy.getElementType())) { + funcOp.emitOpError( + "State preparation from cudaq::state is not supported."); + return; + } + } + + // If std::vector type, add it to the list of vector info. + // These will be processed when we reach the buffer's appendix. + if (auto vecTy = dyn_cast(type)) { + auto eleTy = vecTy.getElementType(); + if (!isa(eleTy)) { + continue; + } + char *ptrToSizeInBuffer = static_cast(args) + offset; + auto sizeFromBuffer = + *reinterpret_cast(ptrToSizeInBuffer); + unsigned bytesInType = [&eleTy]() { + if (auto complexTy = dyn_cast(eleTy)) + return 2 * cudaq::opt::convertBitsToBytes( + complexTy.getElementType().getIntOrFloatBitWidth()); + return cudaq::opt::convertBitsToBytes(eleTy.getIntOrFloatBitWidth()); + }(); + assert(bytesInType > 0 && "element must have a size"); + auto vectorSize = sizeFromBuffer / bytesInType; + stdVecInfo.emplace_back(argNum, eleTy, vectorSize); + continue; + } + } + + // For any `std::vector` arguments, we now know the sizes so let's replace + // the block arg with the actual vector element data. First get the pointer + // to the start of the buffer's appendix. + auto structSize = structLayout.first; + char *bufferAppendix = static_cast(args) + structSize; + for (auto [idx, eleTy, vecLength] : stdVecInfo) { + if (!eleTy) { + bufferAppendix += vecLength; + continue; + } + auto doVector = [&](T) { + auto *ptr = reinterpret_cast(bufferAppendix); + auto v = convertToComplex(ptr, vecLength); + if (failed(prepareStateFromVectorArgument(builder, module, counter, + arguments[idx], v))) + funcOp.emitOpError("state preparation failed for vector"); + bufferAppendix += vecLength * sizeof(T); + }; + if (auto ty = dyn_cast(eleTy)) { + switch (ty.getIntOrFloatBitWidth()) { + case 1: + doVector(false); + break; + case 8: + doVector(std::int8_t{}); + break; + case 16: + doVector(std::int16_t{}); + break; + case 32: + doVector(std::int32_t{}); + break; + case 64: + doVector(std::int64_t{}); + break; + default: + bufferAppendix += vecLength * cudaq::opt::convertBitsToBytes( + ty.getIntOrFloatBitWidth()); + funcOp.emitOpError( + "state preparation failed for vector."); + break; + } + continue; + } + if (eleTy == builder.getF32Type()) { + doVector(float{}); + continue; + } + if (eleTy == builder.getF64Type()) { + doVector(double{}); + continue; + } + if (eleTy == ComplexType::get(builder.getF32Type())) { + doVector(std::complex{}); + continue; + } + if (eleTy == ComplexType::get(builder.getF64Type())) { + doVector(std::complex{}); + continue; + } + } + } +}; + +} // namespace + +std::unique_ptr cudaq::opt::createStatePreparation() { + return std::make_unique(); +} + +std::unique_ptr +cudaq::opt::createStatePreparation(std::string_view kernelName, void *a) { + return std::make_unique(kernelName, a); +} diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index 476ee187bb..f2da95a8f3 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -475,6 +475,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module, PassManager pm(context); pm.addPass(createCanonicalizerPass()); + pm.addPass(cudaq::opt::createStatePreparation(name, rawArgs)); pm.addPass(cudaq::opt::createQuakeSynthesizer(name, rawArgs)); pm.addPass(cudaq::opt::createExpandMeasurementsPass()); pm.addNestedPass(cudaq::opt::createClassicalMemToReg()); diff --git a/python/tests/backends/test_Quantinuum_kernel.py b/python/tests/backends/test_Quantinuum_kernel.py index de072335bf..fc11224f5e 100644 --- a/python/tests/backends/test_Quantinuum_kernel.py +++ b/python/tests/backends/test_Quantinuum_kernel.py @@ -7,6 +7,7 @@ # ============================================================================ # import cudaq, pytest, os, time +import numpy as np from cudaq import spin from multiprocessing import Process try: @@ -169,6 +170,20 @@ def kernel(): result = cudaq.sample(kernel) +def test_quantinuum_state_preparation(): + + @cudaq.kernel + def kernel(vec: list[complex]): + qubits = cudaq.qvector(vec) + + state = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.] + counts = cudaq.sample(kernel, state) + assert '00' in counts + assert '10' in counts + assert not '01' in counts + assert not '11' in counts + + # leave for gdb debugging if __name__ == "__main__": loc = os.path.abspath(__file__) diff --git a/python/tests/kernel/test_kernel_qvector_init.py b/python/tests/kernel/test_kernel_qvector_init.py index ddaeb6cc4d..28260dcb4d 100644 --- a/python/tests/kernel/test_kernel_qvector_init.py +++ b/python/tests/kernel/test_kernel_qvector_init.py @@ -5,11 +5,18 @@ # This source code and the accompanying materials are made available under # # the terms of the Apache License 2.0 which accompanies this distribution. # # ============================================================================ # + +import os, sys import pytest import cudaq import numpy as np +## [PYTHON_VERSION_FIX] +skipIfPythonLessThan39 = pytest.mark.skipif( + sys.version_info < (3, 9), + reason="built-in collection types such as `list` not supported") + skipIfNvidiaFP64NotInstalled = pytest.mark.skipif( not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia-fp64')), reason='Could not find nvidia-fp64 in installation') @@ -18,30 +25,34 @@ not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia')), reason='Could not find nvidia in installation') +# state preparation and synthesis -# float -@skipIfNvidiaFP64NotInstalled -def test_kernel_float_params_f64(): +@skipIfPythonLessThan39 +def test_kernel_state_preparation(): cudaq.reset_target() - cudaq.set_target('nvidia-fp64') - f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)] + c = [1. / np.sqrt(2.), 1. / np.sqrt(2.), 0., 0.] @cudaq.kernel - def kernel(vec: list[float]): + def kernel(vec: list[complex]): q = cudaq.qvector(vec) - counts = cudaq.sample(kernel, f) - print(counts) - assert '11' in counts + synthesized = cudaq.synthesize(kernel, c) + assert 'quake.init_state' in kernel.__str__() + assert not 'quake.init_state' in synthesized.__str__() + + counts = cudaq.sample(synthesized) assert '00' in counts + assert '10' in counts -@skipIfNvidiaNotInstalled -def test_kernel_float_params_f32(): +# float + + +@skipIfPythonLessThan39 +def test_kernel_float_params(): cudaq.reset_target() - cudaq.set_target('nvidia') f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)] @@ -156,33 +167,9 @@ def kernel(): # complex -@skipIfNvidiaFP64NotInstalled -def test_kernel_complex_params_rotate_f64(): - cudaq.reset_target() - cudaq.set_target('nvidia-fp64') - - c = [0. + 0j, 0., 0., 1.] - - @cudaq.kernel - def kernel(vec: list[complex]): - q = cudaq.qvector(vec) - x(q.front()) - y(q.back()) - h(q) - mz(q) - - counts = cudaq.sample(kernel, c) - print(f'rotate: {counts}') - assert '11' in counts - assert '00' in counts - assert '01' in counts - assert '10' in counts - - -@skipIfNvidiaNotInstalled -def test_kernel_complex_params_rotate_f32(): +@skipIfPythonLessThan39 +def test_kernel_complex_params_rotate(): cudaq.reset_target() - cudaq.set_target('nvidia') c = [0. + 0j, 0., 0., 1.] @@ -202,27 +189,9 @@ def kernel(vec: list[complex]): assert '10' in counts -@skipIfNvidiaFP64NotInstalled -def test_kernel_complex_params_f64(): - cudaq.reset_target() - cudaq.set_target('nvidia-fp64') - - c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)] - - @cudaq.kernel - def kernel(vec: list[complex]): - q = cudaq.qvector(vec) - - counts = cudaq.sample(kernel, c) - print(counts) - assert '11' in counts - assert '00' in counts - - -@skipIfNvidiaNotInstalled -def test_kernel_complex_params_f32(): +@skipIfPythonLessThan39 +def test_kernel_complex_params(): cudaq.reset_target() - cudaq.set_target('nvidia') c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)] @@ -337,10 +306,9 @@ def kernel(): # np arrays -@skipIfNvidiaFP64NotInstalled -def test_kernel_dtype_complex_params_f64(): +@skipIfPythonLessThan39 +def test_kernel_dtype_complex_params(): cudaq.reset_target() - cudaq.set_target('nvidia-fp64') c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)] @@ -354,10 +322,9 @@ def kernel(vec: list[complex]): assert '00' in counts -@skipIfNvidiaFP64NotInstalled -def test_kernel_dtype_complex128_params_f64(): +@skipIfPythonLessThan39 +def test_kernel_dtype_complex128_params(): cudaq.reset_target() - cudaq.set_target('nvidia-fp64') c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)] @@ -425,27 +392,9 @@ def kernel(vec: list[complex]): assert '00' in counts -@skipIfNvidiaFP64NotInstalled -def test_kernel_amplitudes_complex_params_f64(): - cudaq.reset_target() - cudaq.set_target('nvidia-fp64') - - c = cudaq.amplitudes([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]) - - @cudaq.kernel - def kernel(vec: list[complex]): - q = cudaq.qvector(vec) - - counts = cudaq.sample(kernel, c) - print(counts) - assert '11' in counts - assert '00' in counts - - -@skipIfNvidiaNotInstalled -def test_kernel_amplitudes_complex_params_f32(): +@skipIfPythonLessThan39 +def test_kernel_amplitudes_complex_params(): cudaq.reset_target() - cudaq.set_target('nvidia') c = cudaq.amplitudes([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]) @@ -459,10 +408,9 @@ def kernel(vec: list[complex]): assert '00' in counts -@skipIfNvidiaFP64NotInstalled -def test_kernel_amplitudes_complex_from_capture_f64(): +@skipIfPythonLessThan39 +def test_kernel_amplitudes_complex_from_capture(): cudaq.reset_target() - cudaq.set_target('nvidia-fp64') c = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)] @@ -476,23 +424,6 @@ def kernel(vec: list[complex]): assert '00' in counts -@skipIfNvidiaNotInstalled -def test_kernel_amplitudes_complex_from_capture_f32(): - cudaq.reset_target() - cudaq.set_target('nvidia') - - c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)] - - @cudaq.kernel - def kernel(vec: list[complex]): - q = cudaq.qvector(cudaq.amplitudes(vec)) - - counts = cudaq.sample(kernel, c) - print(counts) - assert '11' in counts - assert '00' in counts - - @skipIfNvidiaFP64NotInstalled def test_kernel_simulation_dtype_np_array_from_capture_f64(): cudaq.reset_target() @@ -568,40 +499,9 @@ def kernel(): # test errors -@skipIfNvidiaFP64NotInstalled -def test_kernel_error_invalid_array_size_f64(): - cudaq.reset_target() - cudaq.set_target('nvidia-fp64') - - @cudaq.kernel - def kernel(): - qubits = cudaq.qvector(np.array([1., 0., 0.], dtype=complex)) - - with pytest.raises(RuntimeError) as e: - counts = cudaq.sample(kernel) - assert 'Invalid input state size for qvector init (not a power of 2)' in repr( - e) - - -@skipIfNvidiaFP64NotInstalled -def test_kernel_error_invalid_list_size_f64(): - cudaq.reset_target() - cudaq.set_target('nvidia-fp64') - - @cudaq.kernel - def kernel(): - qubits = cudaq.qvector([1., 0., 0.]) - - with pytest.raises(RuntimeError) as e: - counts = cudaq.sample(kernel) - assert 'Invalid input state size for qvector init (not a power of 2)' in repr( - e) - - -@skipIfNvidiaNotInstalled -def test_kernel_error_invalid_array_size_f32(): +@skipIfPythonLessThan39 +def test_kernel_error_invalid_array_size(): cudaq.reset_target() - cudaq.set_target('nvidia') @cudaq.kernel def kernel(): @@ -613,10 +513,9 @@ def kernel(): e) -@skipIfNvidiaNotInstalled -def test_kernel_error_invalid_list_size_f32(): +@skipIfPythonLessThan39 +def test_kernel_error_invalid_list_size(): cudaq.reset_target() - cudaq.set_target('nvidia') @cudaq.kernel def kernel(): @@ -629,6 +528,7 @@ def kernel(): def test_kernel_qvector_init_from_param_int(): + cudaq.reset_target() @cudaq.kernel def kernel(n: int): @@ -643,6 +543,8 @@ def kernel(n: int): def test_kernel_qvector_init_from_capture_int(): + cudaq.reset_target() + n = 2 @cudaq.kernel @@ -658,6 +560,7 @@ def kernel(): def test_kernel_qvector_init_from_int(): + cudaq.reset_target() @cudaq.kernel def kernel(): diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h index aa36a0c62d..08f41e60ec 100644 --- a/runtime/common/BaseRemoteRESTQPU.h +++ b/runtime/common/BaseRemoteRESTQPU.h @@ -401,6 +401,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU { if (updatedArgs) { cudaq::info("Run Quake Synth.\n"); mlir::PassManager pm(&context); + pm.addPass(cudaq::opt::createStatePreparation(kernelName, updatedArgs)); pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs)); if (disableMLIRthreading || enablePrintMLIREachPass) moduleOp.getContext()->disableMultithreading(); diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h index 520c20410f..0b9e8d313e 100644 --- a/runtime/common/BaseRestRemoteClient.h +++ b/runtime/common/BaseRestRemoteClient.h @@ -153,6 +153,7 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient { if (args) { cudaq::info("Run Quake Synth.\n"); mlir::PassManager pm(&mlirContext); + pm.addPass(cudaq::opt::createStatePreparation(name, args)); pm.addPass(cudaq::opt::createQuakeSynthesizer(name, args)); if (failed(pm.run(moduleOp))) throw std::runtime_error("Could not successfully apply quake-synth."); diff --git a/targettests/execution/from_state_complex.cpp b/targettests/execution/from_state_complex.cpp new file mode 100644 index 0000000000..5ca8813393 --- /dev/null +++ b/targettests/execution/from_state_complex.cpp @@ -0,0 +1,27 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s + +#include + +__qpu__ void test(std::vector inState) { + cudaq::qvector q = inState; +} + +// CHECK: size 2 + +int main() { + std::vector vec{M_SQRT1_2, 0., 0., M_SQRT1_2}; + auto counts = cudaq::sample(test, vec); + counts.dump(); + + printf("size %zu\n", counts.size()); + return !(counts.size() == 2); +} diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp new file mode 100644 index 0000000000..35a628c06a --- /dev/null +++ b/targettests/execution/state_preparation_vector.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s + +#include +#include + +__qpu__ void test(std::vector inState) { + cudaq::qvector q1 = inState; +} + +void printCounts(cudaq::sample_result& result) { + std::vector values{}; + for (auto &&[bits, counts] : result) { + values.push_back(bits); + } + + std::sort(values.begin(), values.end()); + for (auto &&bits : values) { + std::cout << bits << '\n'; + } +} + +int main() { + std::vector vec{M_SQRT1_2, M_SQRT1_2, 0., 0.}; + std::vector vec1{0., 0., M_SQRT1_2, M_SQRT1_2}; + { + // Passing state data as argument (kernel mode) + auto counts = cudaq::sample(test, vec); + printCounts(counts); + + counts = cudaq::sample(test, vec1); + printCounts(counts); + } + + { + // Passing state data as argument (builder mode) + auto [kernel, v] = cudaq::make_kernel>(); + auto qubits = kernel.qalloc(v); + + auto counts = cudaq::sample(kernel, vec); + printCounts(counts); + } +} + +// CHECK: 00 +// CHECK: 10 +// CHECK: 01 +// CHECK: 11 +// CHECK: 00 +// CHECK: 10