From 651afb49fa7807073d493571913d3b999d742cc4 Mon Sep 17 00:00:00 2001 From: Ben Howe <141149032+bmhowe23@users.noreply.github.com> Date: Mon, 15 Jul 2024 17:49:40 -0500 Subject: [PATCH] NVQC Optimizations for VQE (C++ and Python) (#1901) This set of changes optimizes VQE performance when running simulations on NVQC. The changes work for both C++ and Python applications. Details: allow cudaq::vqe() to be invoked with variadic arguments similar to how we invoke cudaq::sample() and cudaq::observe(). More specifically, that means that we allow the user to pass the concrete, non-variational arguments (the arguments not subject to VQE optimization) directly into the cudaq::vqe() call. Users must switch to this calling convention to achieve notable speedups. --- .../examples/cpp/algorithms/qaoa_maxcut.cpp | 6 +- .../sphinx/examples/cpp/algorithms/vqe_h2.cpp | 8 +- include/cudaq/Optimizer/Builder/Factory.h | 5 +- include/cudaq/Optimizer/Transforms/Passes.h | 4 +- include/cudaq/Optimizer/Transforms/Passes.td | 2 + lib/Optimizer/Builder/Factory.cpp | 9 +- .../Transforms/GenKernelExecution.cpp | 21 ++- lib/Optimizer/Transforms/QuakeSynthesizer.cpp | 54 ++++--- python/runtime/cudaq/algorithms/py_vqe.cpp | 121 +++++++++++++- .../cudaq/platform/py_alt_launch_kernel.cpp | 65 ++++++-- python/runtime/utils/PyRemoteSimulatorQPU.cpp | 49 ++++-- python/tests/remote/test_remote_code_exec.py | 25 +++ python/utils/OpaqueArguments.h | 12 +- runtime/common/BaseRemoteSimulatorQPU.h | 86 +++++++--- runtime/common/BaseRestRemoteClient.h | 108 ++++++++++--- runtime/common/KernelWrapper.h | 31 ++++ runtime/common/RemoteKernelExecutor.h | 18 ++- runtime/cudaq/algorithms/gradient.h | 57 ++++++- .../algorithms/gradients/central_difference.h | 6 + .../algorithms/gradients/forward_difference.h | 6 + .../algorithms/gradients/parameter_shift.h | 6 + runtime/cudaq/algorithms/vqe.h | 146 ++++++++++++++--- .../rest_server/helpers/RestRemoteServer.cpp | 94 ++++++++++- runtime/cudaq/platform/qpu.h | 10 ++ runtime/cudaq/platform/quantum_platform.cpp | 21 +++ runtime/cudaq/platform/quantum_platform.h | 11 ++ targettests/Remote-Sim/vqe_h2.cpp | 149 ++++++++++++++++++ 27 files changed, 977 insertions(+), 153 deletions(-) create mode 100644 targettests/Remote-Sim/vqe_h2.cpp diff --git a/docs/sphinx/examples/cpp/algorithms/qaoa_maxcut.cpp b/docs/sphinx/examples/cpp/algorithms/qaoa_maxcut.cpp index 63d55f51d3f..0c14b578c60 100644 --- a/docs/sphinx/examples/cpp/algorithms/qaoa_maxcut.cpp +++ b/docs/sphinx/examples/cpp/algorithms/qaoa_maxcut.cpp @@ -72,10 +72,8 @@ int main() { -M_PI / 8.0, M_PI / 8.0, n_params, std::mt19937::default_seed); // Call the optimizer - auto [opt_val, opt_params] = cudaq::vqe( - ansatz{}, Hp, optimizer, n_params, [&](std::vector params) { - return std::make_tuple(params, n_qubits, n_layers); - }); + auto [opt_val, opt_params] = + cudaq::vqe(ansatz{}, Hp, optimizer, n_params, n_qubits, n_layers); // Print the optimized value and the parameters printf("Optimal value = %.16lf\n", opt_val); diff --git a/docs/sphinx/examples/cpp/algorithms/vqe_h2.cpp b/docs/sphinx/examples/cpp/algorithms/vqe_h2.cpp index c38df10d15a..50c65da2d6a 100644 --- a/docs/sphinx/examples/cpp/algorithms/vqe_h2.cpp +++ b/docs/sphinx/examples/cpp/algorithms/vqe_h2.cpp @@ -104,18 +104,14 @@ int main() { so4_fabric ansatz; - auto argMapper = [&](std::vector x) { - return std::make_tuple(x, n_qubits, n_layers); - }; - // Run VQE. 
cudaq::optimizers::lbfgs optimizer; optimizer.initial_parameters = init_params; optimizer.max_eval = 20; optimizer.max_line_search_trials = 10; - cudaq::gradients::central_difference gradient(ansatz, argMapper); + cudaq::gradients::central_difference gradient; auto [opt_val, opt_params] = - cudaq::vqe(ansatz, gradient, H, optimizer, n_params, argMapper); + cudaq::vqe(ansatz, gradient, H, optimizer, n_params, n_qubits, n_layers); printf("Optimal value = %.16lf\n", opt_val); } diff --git a/include/cudaq/Optimizer/Builder/Factory.h b/include/cudaq/Optimizer/Builder/Factory.h index 4771f828c52..42c2d306b83 100644 --- a/include/cudaq/Optimizer/Builder/Factory.h +++ b/include/cudaq/Optimizer/Builder/Factory.h @@ -91,7 +91,10 @@ mlir::Type genArgumentBufferType(mlir::Type ty); /// ``` /// where the values of the vector argument are pass-by-value and appended to /// the end of the struct as a sequence of \i n double values. -cudaq::cc::StructType buildInvokeStructType(mlir::FunctionType funcTy); +/// +/// The leading `startingArgIdx + 1` parameters are omitted from the struct. +cudaq::cc::StructType buildInvokeStructType(mlir::FunctionType funcTy, + std::size_t startingArgIdx = 0); /// Return the LLVM-IR dialect type: `[length x i8]`. inline mlir::Type getStringType(mlir::MLIRContext *ctx, std::size_t length) { diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h index 996b6e56a70..4b15be99155 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.h +++ b/include/cudaq/Optimizer/Transforms/Passes.h @@ -41,7 +41,9 @@ std::unique_ptr createObserveAnsatzPass(std::vector &); std::unique_ptr createQuakeAddMetadata(); std::unique_ptr createQuakeAddDeallocs(); std::unique_ptr createQuakeSynthesizer(); -std::unique_ptr createQuakeSynthesizer(std::string_view, void *); +std::unique_ptr +createQuakeSynthesizer(std::string_view, const void *, + std::size_t startingArgIdx = 0); std::unique_ptr createRaiseToAffinePass(); std::unique_ptr createUnwindLoweringPass(); diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td index 1016351f7a2..e1a2d4f8b30 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.td +++ b/include/cudaq/Optimizer/Transforms/Passes.td @@ -240,6 +240,8 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> { let options = [ Option<"outputFilename", "output-filename", "std::string", /*default=*/"\"-\"", "Name of output file.">, + Option<"startingArgIdx", "starting-arg-idx", "std::size_t", /*default=*/"0", + "The starting argument index for the argsCreator.">, ]; } diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp index d22cfc097ce..df618fbc095 100644 --- a/lib/Optimizer/Builder/Factory.cpp +++ b/lib/Optimizer/Builder/Factory.cpp @@ -62,11 +62,14 @@ Type factory::genArgumentBufferType(Type ty) { return genBufferType(ty); } -cudaq::cc::StructType factory::buildInvokeStructType(FunctionType funcTy) { +cudaq::cc::StructType +factory::buildInvokeStructType(FunctionType funcTy, + std::size_t startingArgIdx) { auto *ctx = funcTy.getContext(); SmallVector eleTys; - for (auto inTy : funcTy.getInputs()) - eleTys.push_back(genBufferType(inTy)); + for (auto inTy : llvm::enumerate(funcTy.getInputs())) + if (inTy.index() >= startingArgIdx) + eleTys.push_back(genBufferType(inTy.value())); for (auto outTy : funcTy.getResults()) eleTys.push_back(genBufferType(outTy)); return cudaq::cc::StructType::get(ctx, eleTys); diff --git 
a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index be0326a4a49..c3ce43382bc 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -352,7 +352,7 @@ class GenerateKernelExecution builder.setInsertionPointToStart(entry); // Get the original function args - auto kernelArgTypes = devKernelTy.getInputs(); + auto kernelArgTypes = devKernelTy.getInputs().drop_front(startingArgIdx); // Init the struct Value stVal = builder.create(loc, msgStructTy); @@ -1531,8 +1531,23 @@ class GenerateKernelExecution funcTy, funcOp); // Generate the argsCreator function used by synthesis. - auto argsCreatorFunc = genKernelArgsCreatorFunction( - loc, builder, funcTy, structTy, classNameStr, hostFuncTy, hasThisPtr); + mlir::func::FuncOp argsCreatorFunc; + if (startingArgIdx == 0) { + argsCreatorFunc = + genKernelArgsCreatorFunction(loc, builder, funcTy, structTy, + classNameStr, hostFuncTy, hasThisPtr); + } else { + // We are operating in a very special case where we want the argsCreator + // function to ignore the first `startingArgIdx` arguments. In this + // situation, the argsCreator function will not be compatible with the + // other helper functions created in this pass, so it is assumed that + // the caller is OK with that. + auto structTy_argsCreator = + cudaq::opt::factory::buildInvokeStructType(funcTy, startingArgIdx); + argsCreatorFunc = genKernelArgsCreatorFunction( + loc, builder, funcTy, structTy_argsCreator, classNameStr, + hostFuncTy, hasThisPtr); + } // Generate a new mangled function on the host side to call the // callback function. diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp index 79548fd5fff..731b436edab 100644 --- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp +++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp @@ -55,14 +55,14 @@ class state; /// BlockArgument with it. template void synthesizeRuntimeArgument( - OpBuilder &builder, BlockArgument argument, void *args, std::size_t offset, - std::size_t typeSize, + OpBuilder &builder, BlockArgument argument, const void *args, + std::size_t offset, std::size_t typeSize, std::function &&opGenerator) { // Create an instance of the concrete type ConcreteType concrete; // Copy the void* struct member into that concrete instance - std::memcpy(&concrete, ((char *)args) + offset, typeSize); + std::memcpy(&concrete, ((const char *)args) + offset, typeSize); // Generate the MLIR Value (arith constant for example) auto runtimeArg = opGenerator(builder, &concrete); @@ -387,18 +387,26 @@ class QuakeSynthesizer std::string kernelName; // The raw pointer to the runtime arguments. - void *args; + const void *args; + + // The starting argument index to synthesize. Typically 0 but may be >0 for + // partial synthesis. If >0, it is assumed that the first argument(s) are NOT + // in `args`. 
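+ // Illustrative example (not from the original code): for a kernel with
+ // signature (std::vector<double>, int), a startingArgIdx of 1 means `args`
+ // holds only the serialized int; the leading std::vector<double> argument
+ // remains a block argument and is not synthesized away.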
+ std::size_t startingArgIdx = 0; public: QuakeSynthesizer() = default; - QuakeSynthesizer(std::string_view kernel, void *a) + QuakeSynthesizer(std::string_view kernel, const void *a) : kernelName(kernel), args(a) {} + QuakeSynthesizer(std::string_view kernel, const void *a, std::size_t s) + : kernelName(kernel), args(a), startingArgIdx(s) {} mlir::ModuleOp getModule() { return getOperation(); } std::pair> getTargetLayout(FunctionType funcTy) { - auto bufferTy = cudaq::opt::factory::buildInvokeStructType(funcTy); + auto bufferTy = + cudaq::opt::factory::buildInvokeStructType(funcTy, startingArgIdx); StringRef dataLayoutSpec = ""; if (auto attr = getModule()->getAttr(cudaq::opt::factory::targetDataLayoutAttrName)) @@ -449,10 +457,10 @@ class QuakeSynthesizer // Keep track of the stdVec sizes. std::vector> stdVecInfo; - for (auto iter : llvm::enumerate(arguments)) { - auto argNum = iter.index(); - auto argument = iter.value(); - std::size_t offset = structLayout.second[argNum]; + for (std::size_t argNum = startingArgIdx, end = arguments.size(); + argNum < end; argNum++) { + auto argument = arguments[argNum]; + std::size_t offset = structLayout.second[argNum - startingArgIdx]; // Get the argument type auto type = argument.getType(); @@ -560,9 +568,10 @@ class QuakeSynthesizer signalPassFailure(); return; } - char *ptrToSizeInBuffer = static_cast(args) + offset; + const char *ptrToSizeInBuffer = + static_cast(args) + offset; auto sizeFromBuffer = - *reinterpret_cast(ptrToSizeInBuffer); + *reinterpret_cast(ptrToSizeInBuffer); auto bytesInType = [&eleTy]() -> unsigned { if (isa(eleTy)) return 16 /*bytes: sizeof(ptr) + sizeof(i64)*/; @@ -589,8 +598,10 @@ class QuakeSynthesizer // TODO: for now we can ignore empty struct types. continue; } - char *ptrToSizeInBuffer = static_cast(args) + offset; - auto rawSize = *reinterpret_cast(ptrToSizeInBuffer); + const char *ptrToSizeInBuffer = + static_cast(args) + offset; + auto rawSize = + *reinterpret_cast(ptrToSizeInBuffer); stdVecInfo.emplace_back(argNum, Type{}, rawSize); continue; } @@ -604,7 +615,7 @@ class QuakeSynthesizer // the block arg with the actual vector element data. First get the pointer // to the start of the buffer's appendix. auto structSize = structLayout.first; - char *bufferAppendix = static_cast(args) + structSize; + const char *bufferAppendix = static_cast(args) + structSize; for (auto [idx, eleTy, vecLength] : stdVecInfo) { if (!eleTy) { // FIXME: Skip struct values. @@ -614,7 +625,7 @@ class QuakeSynthesizer continue; } auto doVector = [&](T) { - auto *ptr = reinterpret_cast(bufferAppendix); + auto *ptr = reinterpret_cast(bufferAppendix); std::vector v(ptr, ptr + vecLength); if (failed(synthesizeVectorArgument(builder, module, counter, arguments[idx], v))) @@ -667,7 +678,8 @@ class QuakeSynthesizer // of sizes that are encoded starting at bufferAppendix. // At the end of the block of sizes, the C-strings will be encoded. auto numberSpans = vecLength; - auto *spanSizes = reinterpret_cast(bufferAppendix); + auto *spanSizes = + reinterpret_cast(bufferAppendix); bufferAppendix += vecLength * sizeof(std::uint64_t); // These strings are reified in the following way: // - Create an array numberSpans in length and where each element @@ -726,7 +738,8 @@ class QuakeSynthesizer // Remove the old arguments. 
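+ // Only arguments at index >= startingArgIdx were synthesized above, so only
+ // those are erased below; any leading (unsynthesized) arguments remain in
+ // place.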
auto numArgs = funcOp.getNumArguments(); BitVector argsToErase(numArgs); - for (std::size_t argIndex = 0; argIndex < numArgs; ++argIndex) { + for (std::size_t argIndex = startingArgIdx; argIndex < numArgs; + ++argIndex) { argsToErase.set(argIndex); if (!funcOp.getBody().front().getArgument(argIndex).getUses().empty()) { funcOp.emitError("argument(s) still in use after synthesis."); @@ -745,6 +758,7 @@ std::unique_ptr<mlir::Pass> cudaq::opt::createQuakeSynthesizer() { } std::unique_ptr<mlir::Pass> -cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, void *a) { - return std::make_unique<QuakeSynthesizer>(kernelName, a); +cudaq::opt::createQuakeSynthesizer(std::string_view kernelName, const void *a, + std::size_t startingArgIdx) { + return std::make_unique<QuakeSynthesizer>(kernelName, a, startingArgIdx); } diff --git a/python/runtime/cudaq/algorithms/py_vqe.cpp b/python/runtime/cudaq/algorithms/py_vqe.cpp index c88328f54b7..61b676a4fc5 100644 --- a/python/runtime/cudaq/algorithms/py_vqe.cpp +++ b/python/runtime/cudaq/algorithms/py_vqe.cpp @@ -9,6 +9,7 @@ #include #include +#include "common/ArgumentWrapper.h" #include "common/JsonConvert.h" #include "common/SerializedCodeExecutionContext.h" #include "cudaq/Optimizer/Dialect/CC/CCTypes.h" @@ -26,6 +27,11 @@ namespace cudaq { void pyAltLaunchKernel(const std::string &, MlirModule, OpaqueArguments &, const std::vector<std::string> &); +void *pyGetKernelArgs(const std::string &name, MlirModule module, + cudaq::OpaqueArguments &runtimeArgs, + const std::vector<std::string> &names, + std::size_t startingArgIdx); + /// @brief Return the quantum kernel `FuncOp` from the given `ModuleOp`. mlir::func::FuncOp getKernelFuncOp(mlir::ModuleOp &mod, const std::string &kernelName) { @@ -57,14 +63,21 @@ bool isArgumentStdVec(MlirModule &module, const std::string &kernelName, return isa<cudaq::cc::StdvecType>(kernel.getArgument(argIdx).getType()); } -/// @brief Run `cudaq::observe` on the provided kernel and spin operator. -observe_result pyObserve(py::object &kernel, spin_op &spin_operator, - py::args args, const int shots, - bool argMapperProvided = false) { +/// @brief Return the kernel name and MLIR module for a kernel. +static inline std::pair<std::string, MlirModule> +getKernelNameAndModule(py::object &kernel) { if (py::hasattr(kernel, "compile")) kernel.attr("compile")(); auto kernelName = kernel.attr("name").cast<std::string>(); auto kernelMod = kernel.attr("module").cast<MlirModule>(); + return std::make_pair(kernelName, kernelMod); +} + +/// @brief Run `cudaq::observe` on the provided kernel and spin operator. +observe_result pyObserve(py::object &kernel, spin_op &spin_operator, + py::args args, const int shots, + bool argMapperProvided = false) { + auto [kernelName, kernelMod] = getKernelNameAndModule(kernel); auto &platform = cudaq::get_platform(); args = simplifiedValidateInputArguments(args); auto *argData = toOpaqueArgs(args, kernelMod, kernelName); @@ -98,6 +111,69 @@ observe_result pyObserve(py::object &kernel, spin_op &spin_operator, .value(); } +/// @brief Return whether or not \p kernel is compatible with the remote VQE +/// implementation that requires the variational parameters to be the first +/// argument in the kernel. 
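+/// For example (illustrative), a kernel declared as
+/// `def kernel(angles: list[float], num_qubits: int)` is compatible, whereas
+/// `def kernel(num_qubits: int, angles: list[float])` is not.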
+static bool firstArgIsCompatibleWithRemoteVQE(py::object &kernel) { + auto [kernelName, kernelMod] = getKernelNameAndModule(kernel); + auto kernelFunc = getKernelFuncOp(kernelMod, kernelName); + if (kernelFunc.getNumArguments() < 1) + return false; + auto firstKernelArgTy = kernelFunc.getArgument(0).getType(); + if (auto stdVec = dyn_cast<cudaq::cc::StdvecType>(firstKernelArgTy)) { + auto eleTy = stdVec.getElementType(); + return isa<mlir::FloatType>(eleTy); + } else { + return false; + } +} + +/// @brief Perform VQE on a remote platform using the C++ capabilities. This +/// function is used for many of the pyVQE variants below, so some of the +/// parameters may be nullptr. +static optimization_result +pyVQE_remote_cpp(cudaq::quantum_platform &platform, py::object &kernel, + spin_op &hamiltonian, cudaq::optimizer &optimizer, + cudaq::gradient *gradient, py::function *argumentMapper, + const int n_params, const int shots) { + auto [kernelName, kernelMod] = getKernelNameAndModule(kernel); + auto ctx = std::make_unique<cudaq::ExecutionContext>("observe", /*shots=*/0); + ctx->kernelName = kernelName; + ctx->spin = &hamiltonian; + platform.set_exec_ctx(ctx.get()); + + constexpr std::size_t startingArgIdx = 1; + cudaq::OpaqueArguments args; + void *kernelArgs = nullptr; + if (argumentMapper) { + std::vector<double> myArg(n_params); + py::list py_list = py::cast(myArg); + py::tuple result = (*argumentMapper)(py_list); + py::args runtimeArgs = result; + + // Serialize arguments (all concrete parameters except for the first one) + // into kernelArgs buffer space. + auto kernelFunc = getKernelFuncOp(kernelMod, kernelName); + cudaq::packArgs( + args, runtimeArgs, kernelFunc, + [](OpaqueArguments &, py::object &) { return false; }, startingArgIdx); + } + kernelArgs = pyGetKernelArgs(kernelName, kernelMod, args, /*names=*/{}, + startingArgIdx); + + // Need to form cudaq::ArgWrapper and pass that into launchVQE. + std::vector<std::string> names; + auto *wrapper = new cudaq::ArgWrapper{unwrap(kernelMod), names, kernelArgs}; + + platform.launchVQE(kernelName, wrapper, gradient, hamiltonian, optimizer, + n_params, shots); + platform.reset_exec_ctx(); + delete wrapper; + if (kernelArgs) + std::free(kernelArgs); + return ctx->optResult.value_or(optimization_result{}); +} + /// @brief Perform VQE on a remote platform. This function is used for many of /// the pyVQE variants below, so some of the parameters may be nullptr. static optimization_result @@ -185,11 +261,28 @@ pyVQE_remote(cudaq::quantum_platform &platform, py::object &kernel, return result; } +/// @brief Throw an exception instructing the user how to achieve optimal +/// performance. +static void throwPerformanceError() { + throw std::runtime_error( + "ERROR: Achieving optimal VQE performance on this platform requires the " + "first parameter in the kernel to be the variational parameters (list " + "of floats). Please update your VQE kernel to have list[float] as its " + "first parameter.\n"); +} + /// @brief Run `cudaq.vqe()` without a gradient strategy. 
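+/// On platforms that support remote VQE (see `supports_remote_vqe()`), the
+/// entire optimization loop is offloaded to the server in a single request;
+/// otherwise the optimizer drives repeated `observe` calls from the client.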
optimization_result pyVQE(py::object &kernel, spin_op &hamiltonian, cudaq::optimizer &optimizer, const int n_params, const int shots = -1) { auto &platform = cudaq::get_platform(); + if (platform.supports_remote_vqe()) { + if (firstArgIsCompatibleWithRemoteVQE(kernel)) + return pyVQE_remote_cpp(platform, kernel, hamiltonian, optimizer, + /*gradient=*/nullptr, /*argumentMapper=*/nullptr, + n_params, shots); + throwPerformanceError(); + } if (platform.supports_remote_serialized_code()) return pyVQE_remote(platform, kernel, hamiltonian, optimizer, /*gradient=*/nullptr, /*argumentMapper=*/nullptr, @@ -209,6 +302,13 @@ optimization_result pyVQE(py::object &kernel, spin_op &hamiltonian, cudaq::optimizer &optimizer, const int n_params, py::function &argumentMapper, const int shots = -1) { auto &platform = cudaq::get_platform(); + if (platform.supports_remote_vqe()) { + if (firstArgIsCompatibleWithRemoteVQE(kernel)) + return pyVQE_remote_cpp(platform, kernel, hamiltonian, optimizer, + /*gradient=*/nullptr, &argumentMapper, n_params, + shots); + throwPerformanceError(); + } if (platform.supports_remote_serialized_code()) return pyVQE_remote(platform, kernel, hamiltonian, optimizer, /*gradient=*/nullptr, &argumentMapper, n_params, shots); @@ -235,6 +335,13 @@ optimization_result pyVQE(py::object &kernel, cudaq::gradient &gradient, // to allow for the calculation of the gradient vector with the // provided gradient strategy. auto &platform = cudaq::get_platform(); + if (platform.supports_remote_vqe()) { + if (firstArgIsCompatibleWithRemoteVQE(kernel)) + return pyVQE_remote_cpp(platform, kernel, hamiltonian, optimizer, + &gradient, + /*argumentMapper=*/nullptr, n_params, shots); + throwPerformanceError(); + } if (platform.supports_remote_serialized_code()) return pyVQE_remote(platform, kernel, hamiltonian, optimizer, &gradient, /*argumentMapper=*/nullptr, n_params, shots); @@ -267,6 +374,12 @@ optimization_result pyVQE(py::object &kernel, cudaq::gradient &gradient, // to allow for the calculation of the gradient vector with the // provided gradient strategy. auto &platform = cudaq::get_platform(); + if (platform.supports_remote_vqe()) { + if (firstArgIsCompatibleWithRemoteVQE(kernel)) + return pyVQE_remote_cpp(platform, kernel, hamiltonian, optimizer, + &gradient, &argumentMapper, n_params, shots); + throwPerformanceError(); + } if (platform.supports_remote_serialized_code()) return pyVQE_remote(platform, kernel, hamiltonian, optimizer, &gradient, &argumentMapper, n_params, shots); diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index 6639e9b240c..392b1fa8fab 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -66,12 +66,19 @@ static std::unique_ptr cudaqStateStorage = std::tuple jitAndCreateArgs(const std::string &name, MlirModule module, cudaq::OpaqueArguments &runtimeArgs, - const std::vector<std::string> &names, Type returnType) { + const std::vector<std::string> &names, Type returnType, + std::size_t startingArgIdx = 0) { ScopedTraceWithContext(cudaq::TIMING_JIT, "jitAndCreateArgs", name); auto mod = unwrap(module); auto cloned = mod.clone(); auto context = cloned.getContext(); + // Do not cache the JIT if we are running with startingArgIdx > 0 because a) + // we won't be executing right after JIT-ing, and b) we might get called + // later with startingArgIdx == 0, and we need that JIT to be performed and + // cached. 
+ const bool allowCache = startingArgIdx == 0; + // Have we JIT compiled this before? auto hash = llvm::hash_code{0}; mod.walk([&hash](Operation *op) { @@ -80,7 +87,7 @@ jitAndCreateArgs(const std::string &name, MlirModule module, auto hashKey = static_cast(hash); ExecutionEngine *jit = nullptr; - if (jitCache->hasJITEngine(hashKey)) { + if (allowCache && jitCache->hasJITEngine(hashKey)) { jit = jitCache->getJITEngine(hashKey); } else { ScopedTraceWithContext(cudaq::TIMING_JIT, @@ -90,7 +97,8 @@ jitAndCreateArgs(const std::string &name, MlirModule module, pm.addNestedPass( cudaq::opt::createPySynthCallableBlockArgs(names)); pm.addPass(cudaq::opt::createGenerateDeviceCodeLoader(/*genAsQuake=*/true)); - pm.addPass(cudaq::opt::createGenerateKernelExecution()); + pm.addPass(cudaq::opt::createGenerateKernelExecution( + {.startingArgIdx = startingArgIdx})); pm.addPass(cudaq::opt::createLambdaLiftingPass()); cudaq::opt::addPipelineConvertToQIR(pm); @@ -137,7 +145,8 @@ jitAndCreateArgs(const std::string &name, MlirModule module, auto uniqueJit = std::move(jitOrError.get()); jit = uniqueJit.release(); - jitCache->cache(hashKey, jit); + if (allowCache) + jitCache->cache(hashKey, jit); } // We need to append the return type to the OpaqueArguments here @@ -227,9 +236,14 @@ jitAndCreateArgs(const std::string &name, MlirModule module, std::tuple pyAltLaunchKernelBase(const std::string &name, MlirModule module, Type returnType, cudaq::OpaqueArguments &runtimeArgs, - const std::vector &names) { - auto [jit, rawArgs, size, returnOffset] = - jitAndCreateArgs(name, module, runtimeArgs, names, returnType); + const std::vector &names, + std::size_t startingArgIdx = 0) { + // Do not allow kernel execution if we are running with startingArgIdx > 0. + // This is used in remote VQE execution. + const bool launch = startingArgIdx == 0; + + auto [jit, rawArgs, size, returnOffset] = jitAndCreateArgs( + name, module, runtimeArgs, names, returnType, startingArgIdx); auto mod = unwrap(module); auto thunkName = name + ".thunk"; @@ -308,16 +322,18 @@ pyAltLaunchKernelBase(const std::string &name, MlirModule module, auto kernelReg = reinterpret_cast(*regFuncPtr); kernelReg(); - auto &platform = cudaq::get_platform(); - if (platform.is_remote() || platform.is_emulated()) { - auto *wrapper = new cudaq::ArgWrapper{mod, names, rawArgs}; - cudaq::altLaunchKernel(name.c_str(), thunk, - reinterpret_cast(wrapper), size, - (uint64_t)returnOffset); - delete wrapper; - } else - cudaq::altLaunchKernel(name.c_str(), thunk, rawArgs, size, - (uint64_t)returnOffset); + if (launch) { + auto &platform = cudaq::get_platform(); + if (platform.is_remote() || platform.is_emulated()) { + auto *wrapper = new cudaq::ArgWrapper{mod, names, rawArgs}; + cudaq::altLaunchKernel(name.c_str(), thunk, + reinterpret_cast(wrapper), size, + (uint64_t)returnOffset); + delete wrapper; + } else + cudaq::altLaunchKernel(name.c_str(), thunk, rawArgs, size, + (uint64_t)returnOffset); + } return std::make_tuple(rawArgs, size, returnOffset); } @@ -388,6 +404,21 @@ void pyAltLaunchKernel(const std::string &name, MlirModule module, std::free(rawArgs); } +/// @brief Serialize \p runtimeArgs into a flat buffer starting at +/// \p startingArgIdx (0-based). This does not execute the kernel. This is +/// useful for VQE applications when you want to serialize the constant +/// parameters that are not being optimized. The caller is responsible for +/// executing `std::free()` on the return value. 
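+/// Example (illustrative): for a kernel with signature (list[float], int) and
+/// a `startingArgIdx` of 1, only the trailing `int` is packed into the
+/// returned buffer; the variational list[float] is bound later (e.g.,
+/// server-side on each VQE iteration).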
+void *pyGetKernelArgs(const std::string &name, MlirModule module, + cudaq::OpaqueArguments &runtimeArgs, + const std::vector &names, + std::size_t startingArgIdx) { + auto noneType = mlir::NoneType::get(unwrap(module).getContext()); + auto [rawArgs, size, returnOffset] = pyAltLaunchKernelBase( + name, module, noneType, runtimeArgs, names, startingArgIdx); + return rawArgs; +} + inline unsigned int byteSize(mlir::Type ty) { if (isa(ty)) { auto eleTy = cast(ty).getElementType(); diff --git a/python/runtime/utils/PyRemoteSimulatorQPU.cpp b/python/runtime/utils/PyRemoteSimulatorQPU.cpp index f59c296a1d0..b39f56f54ff 100644 --- a/python/runtime/utils/PyRemoteSimulatorQPU.cpp +++ b/python/runtime/utils/PyRemoteSimulatorQPU.cpp @@ -22,6 +22,35 @@ class PyRemoteSimulatorQPU : public cudaq::BaseRemoteSimulatorQPU { virtual bool isEmulated() override { return true; } + void launchVQE(const std::string &name, const void *kernelArgs, + cudaq::gradient *gradient, cudaq::spin_op H, + cudaq::optimizer &optimizer, const int n_params, + const std::size_t shots) override { + cudaq::ExecutionContext *executionContextPtr = + getExecutionContextForMyThread(); + + auto *wrapper = reinterpret_cast(kernelArgs); + auto m_module = wrapper->mod; + auto *mlirContext = m_module->getContext(); + + if (executionContextPtr && executionContextPtr->name == "tracer") + return; + + auto ctx = std::make_unique("observe", shots); + ctx->kernelName = name; + ctx->spin = &H; + if (shots > 0) + ctx->shots = shots; + + std::string errorMsg; + const bool requestOkay = m_client->sendRequest( + *mlirContext, *executionContextPtr, /*serializedCodeContext=*/nullptr, + gradient, &optimizer, n_params, m_simName, name, /*kernelFunc=*/nullptr, + wrapper->rawArgs, /*argSize=*/0, &errorMsg); + if (!requestOkay) + throw std::runtime_error("Failed to launch VQE. Error: " + errorMsg); + } + void launchKernel(const std::string &name, void (*kernelFunc)(void *), void *args, std::uint64_t voidStarSize, std::uint64_t resultOffset) override { @@ -35,13 +64,8 @@ class PyRemoteSimulatorQPU : public cudaq::BaseRemoteSimulatorQPU { auto *mlirContext = m_module->getContext(); cudaq::ExecutionContext *executionContextPtr = - [&]() -> cudaq::ExecutionContext * { - std::scoped_lock lock(m_contextMutex); - const auto iter = m_contexts.find(std::this_thread::get_id()); - if (iter == m_contexts.end()) - return nullptr; - return iter->second; - }(); + getExecutionContextForMyThread(); + // Default context for a 'fire-and-ignore' kernel launch; i.e., no context // was set before launching the kernel. Use a static variable per thread to // set up a single-shot execution context for this case. @@ -52,6 +76,7 @@ class PyRemoteSimulatorQPU : public cudaq::BaseRemoteSimulatorQPU { std::string errorMsg; const bool requestOkay = m_client->sendRequest( *mlirContext, executionContext, /*serializedCodeContext=*/nullptr, + /*vqe_gradient=*/nullptr, /*vqe_optimizer=*/nullptr, /*vqe_n_params=*/0, m_simName, name, kernelFunc, wrapper->rawArgs, voidStarSize, &errorMsg); if (!requestOkay) throw std::runtime_error("Failed to launch kernel. 
Error: " + errorMsg); @@ -84,13 +109,8 @@ class PyNvcfSimulatorQPU : public cudaq::BaseNvcfSimulatorQPU { auto *mlirContext = m_module->getContext(); cudaq::ExecutionContext *executionContextPtr = - [&]() -> cudaq::ExecutionContext * { - std::scoped_lock lock(m_contextMutex); - const auto iter = m_contexts.find(std::this_thread::get_id()); - if (iter == m_contexts.end()) - return nullptr; - return iter->second; - }(); + getExecutionContextForMyThread(); + // Default context for a 'fire-and-ignore' kernel launch; i.e., no context // was set before launching the kernel. Use a static variable per thread to // set up a single-shot execution context for this case. @@ -101,6 +121,7 @@ class PyNvcfSimulatorQPU : public cudaq::BaseNvcfSimulatorQPU { std::string errorMsg; const bool requestOkay = m_client->sendRequest( *mlirContext, executionContext, /*serializedCodeContext=*/nullptr, + /*vqe_gradient=*/nullptr, /*vqe_optimizer=*/nullptr, /*vqe_n_params=*/0, m_simName, name, kernelFunc, wrapper->rawArgs, voidStarSize, &errorMsg); if (!requestOkay) throw std::runtime_error("Failed to launch kernel. Error: " + errorMsg); diff --git a/python/tests/remote/test_remote_code_exec.py b/python/tests/remote/test_remote_code_exec.py index 33af45ddb95..960f22a8dfc 100644 --- a/python/tests/remote/test_remote_code_exec.py +++ b/python/tests/remote/test_remote_code_exec.py @@ -297,6 +297,31 @@ def kernel(angles: list[float], num_qubits: int): assert assert_close(parameter[0], 0.5840908448487905, 1e-3) +@skipIfPythonLessThan39 +def test_vqe_perf_warning(): + hamiltonian = 5.907 - 2.1433 * spin.x(0) * spin.x(1) - 2.1433 * spin.y( + 0) * spin.y(1) + .21829 * spin.z(0) - 6.125 * spin.z(1) + + @cudaq.kernel + def kernel(num_qubits: int, angles: list[float]): + qvector = cudaq.qvector(num_qubits) + x(qvector[0]) + ry(angles[0], qvector[1]) + x.ctrl(qvector[1], qvector[0]) + + optimizer = cudaq.optimizers.Adam() + grad = cudaq.gradients.CentralDifference() + + num_qubits = 2 + with pytest.raises(RuntimeError) as error: + energy, parameter = cudaq.vqe(kernel=kernel, + gradient_strategy=grad, + spin_operator=hamiltonian, + optimizer=optimizer, + argument_mapper=lambda x: (num_qubits, x), + parameter_count=1) + + # This is a helper function used by parameterized tests below. 
@pytest.mark.skip def test_complex_vqe_named_lambda(optimizer, gradient): diff --git a/python/utils/OpaqueArguments.h b/python/utils/OpaqueArguments.h index f7b1f4bc988..661f5efb5d1 100644 --- a/python/utils/OpaqueArguments.h +++ b/python/utils/OpaqueArguments.h @@ -185,18 +185,18 @@ inline std::string mlirTypeToString(mlir::Type ty) { return msg; } -inline void -packArgs(OpaqueArguments &argData, py::args args, - mlir::func::FuncOp kernelFuncOp, - const std::function - &backupHandler) { +inline void packArgs(OpaqueArguments &argData, py::args args, + mlir::func::FuncOp kernelFuncOp, + const std::function &backupHandler, + std::size_t startingArgIdx = 0) { if (kernelFuncOp.getNumArguments() != args.size()) throw std::runtime_error("Invalid runtime arguments - kernel expected " + std::to_string(kernelFuncOp.getNumArguments()) + " but was provided " + std::to_string(args.size()) + " arguments."); - for (std::size_t i = 0; i < args.size(); i++) { + for (std::size_t i = startingArgIdx; i < args.size(); i++) { py::object arg = args[i]; auto kernelArgTy = kernelFuncOp.getArgument(i).getType(); llvm::TypeSwitch(kernelArgTy) diff --git a/runtime/common/BaseRemoteSimulatorQPU.h b/runtime/common/BaseRemoteSimulatorQPU.h index 79cb796103e..fd73b89c7c8 100644 --- a/runtime/common/BaseRemoteSimulatorQPU.h +++ b/runtime/common/BaseRemoteSimulatorQPU.h @@ -14,15 +14,16 @@ #include "common/RuntimeMLIR.h" #include "common/SerializedCodeExecutionContext.h" #include "cudaq.h" +#include "cudaq/algorithms/gradient.h" +#include "cudaq/algorithms/optimizer.h" #include "cudaq/platform/qpu.h" #include "cudaq/platform/quantum_platform.h" #include namespace cudaq { -// TODO - Remove this once the public NVQC deployment supports this capability. -static inline bool serializedCodeExecOverride() { - if (auto envVal = std::getenv("CUDAQ_SER_CODE_EXEC")) { +static inline bool getEnvVarBool(const char *envVarName) { + if (auto envVal = std::getenv(envVarName)) { std::string tmp(envVal); std::transform(tmp.begin(), tmp.end(), tmp.begin(), [](unsigned char c) { return std::tolower(c); }); @@ -32,6 +33,16 @@ static inline bool serializedCodeExecOverride() { return false; } +// TODO - Remove this once the public NVQC deployment supports this capability. +static inline bool remoteVQEExecOverride() { + return getEnvVarBool("CUDAQ_REMOTE_VQE"); +} + +// TODO - Remove this once the public NVQC deployment supports this capability. +static inline bool serializedCodeExecOverride() { + return getEnvVarBool("CUDAQ_SER_CODE_EXEC"); +} + // Remote QPU: delegating the execution to a remotely-hosted server, which can // reinstate the execution context and JIT-invoke the kernel. class BaseRemoteSimulatorQPU : public cudaq::QPU { @@ -42,6 +53,16 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { std::unique_ptr m_mlirContext; std::unique_ptr m_client; + /// @brief Return a pointer to the execution context for this thread. It will + /// return `nullptr` if it was not found in `m_contexts`. + cudaq::ExecutionContext *getExecutionContextForMyThread() { + std::scoped_lock lock(m_contextMutex); + const auto iter = m_contexts.find(std::this_thread::get_id()); + if (iter == m_contexts.end()) + return nullptr; + return iter->second; + } + public: BaseRemoteSimulatorQPU() : QPU(), @@ -57,6 +78,10 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { // Conditional feedback is handled by the server side. 
virtual bool supportsConditionalFeedback() override { return true; } + // VQE is executed fully on the server without the need to go back and forth + // in between observe calls + virtual bool supportsRemoteVQE() override { return true; } + // Remote serializable code is executed fully on the server without the need // to go back and forth in between observe calls (see // launchSerializedCodeExecution). @@ -80,6 +105,31 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { execution_queue->enqueue(task); } + void launchVQE(const std::string &name, const void *kernelArgs, + cudaq::gradient *gradient, cudaq::spin_op H, + cudaq::optimizer &optimizer, const int n_params, + const std::size_t shots) override { + cudaq::ExecutionContext *executionContextPtr = + getExecutionContextForMyThread(); + + if (executionContextPtr && executionContextPtr->name == "tracer") + return; + + auto ctx = std::make_unique("observe", shots); + ctx->kernelName = name; + ctx->spin = &H; + if (shots > 0) + ctx->shots = shots; + + std::string errorMsg; + const bool requestOkay = m_client->sendRequest( + *m_mlirContext, *executionContextPtr, /*serializedCodeContext=*/nullptr, + gradient, &optimizer, n_params, m_simName, name, /*kernelFunc=*/nullptr, + kernelArgs, /*argSize=*/0, &errorMsg); + if (!requestOkay) + throw std::runtime_error("Failed to launch VQE. Error: " + errorMsg); + } + void launchKernel(const std::string &name, void (*kernelFunc)(void *), void *args, std::uint64_t voidStarSize, std::uint64_t resultOffset) override { @@ -89,13 +139,7 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { name, qpu_id, m_simName); cudaq::ExecutionContext *executionContextPtr = - [&]() -> cudaq::ExecutionContext * { - std::scoped_lock lock(m_contextMutex); - const auto iter = m_contexts.find(std::this_thread::get_id()); - if (iter == m_contexts.end()) - return nullptr; - return iter->second; - }(); + getExecutionContextForMyThread(); if (executionContextPtr && executionContextPtr->name == "tracer") { return; @@ -109,10 +153,10 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { cudaq::ExecutionContext &executionContext = executionContextPtr ? *executionContextPtr : defaultContext; std::string errorMsg; - const bool requestOkay = - m_client->sendRequest(*m_mlirContext, executionContext, - /*serializedCodeContext=*/nullptr, m_simName, - name, kernelFunc, args, voidStarSize, &errorMsg); + const bool requestOkay = m_client->sendRequest( + *m_mlirContext, executionContext, /*serializedCodeContext=*/nullptr, + /*vqe_gradient=*/nullptr, /*vqe_optimizer=*/nullptr, /*vqe_n_params=*/0, + m_simName, name, kernelFunc, args, voidStarSize, &errorMsg); if (!requestOkay) throw std::runtime_error("Failed to launch kernel. 
Error: " + errorMsg); } @@ -127,13 +171,7 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { name, qpu_id, m_simName); cudaq::ExecutionContext *executionContextPtr = - [&]() -> cudaq::ExecutionContext * { - std::scoped_lock lock(m_contextMutex); - const auto iter = m_contexts.find(std::this_thread::get_id()); - if (iter == m_contexts.end()) - return nullptr; - return iter->second; - }(); + getExecutionContextForMyThread(); if (executionContextPtr && executionContextPtr->name == "tracer") { return; @@ -150,6 +188,7 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { std::string errorMsg; const bool requestOkay = m_client->sendRequest( *m_mlirContext, executionContext, &serializeCodeExecutionObject, + /*vqe_gradient=*/nullptr, /*vqe_optimizer=*/nullptr, /*vqe_n_params=*/0, m_simName, name, /*kernelFunc=*/nullptr, /*args=*/nullptr, /*voidStarSize=*/0, &errorMsg); if (!requestOkay) @@ -240,6 +279,11 @@ class BaseNvcfSimulatorQPU : public cudaq::BaseRemoteSimulatorQPU { m_client->setConfig(clientConfigs); } + // VQE is executed fully on the server without the need to go back and forth + // in between observe calls (see launchVQE). + // TODO - set this to true when NVQC supports this. + virtual bool supportsRemoteVQE() override { return remoteVQEExecOverride(); } + // Remote serializable code is executed fully on the server without the need // to go back and forth in between observe calls (see // launchSerializedCodeExecution). diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h index fb95c38f7a2..ada308c7799 100644 --- a/runtime/common/BaseRestRemoteClient.h +++ b/runtime/common/BaseRestRemoteClient.h @@ -111,8 +111,10 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient { std::string constructKernelPayload(mlir::MLIRContext &mlirContext, const std::string &name, - void (*kernelFunc)(void *), void *args, - std::uint64_t voidStarSize) { + void (*kernelFunc)(void *), + const void *args, + std::uint64_t voidStarSize, + std::size_t startingArgIdx) { if (cudaq::__internal__::isLibraryMode(name)) { // Library mode: retrieve the embedded bitcode in the executable. 
const auto path = llvm::sys::fs::getMainExecutable(nullptr, nullptr); @@ -174,7 +176,8 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient { if (args) { cudaq::info("Run Quake Synth.\n"); mlir::PassManager pm(&mlirContext); - pm.addPass(cudaq::opt::createQuakeSynthesizer(name, args)); + pm.addPass( + cudaq::opt::createQuakeSynthesizer(name, args, startingArgIdx)); pm.addPass(mlir::createCanonicalizerPass()); if (failed(pm.run(moduleOp))) throw std::runtime_error("Could not successfully apply quake-synth."); @@ -210,12 +213,53 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient { return llvm::encodeBase64(mlirCode); } } + cudaq::RestRequest constructVQEJobRequest( + mlir::MLIRContext &mlirContext, cudaq::ExecutionContext &io_context, + const std::string &backendSimName, const std::string &kernelName, + const void *kernelArgs, cudaq::gradient *gradient, + cudaq::optimizer &optimizer, const int n_params) { + cudaq::RestRequest request(io_context, version()); + + request.opt = RestRequestOptFields(); + request.opt->optimizer_n_params = n_params; + request.opt->optimizer_type = get_optimizer_type(optimizer); + request.opt->optimizer_ptr = &optimizer; + request.opt->gradient_ptr = gradient; + if (gradient) + request.opt->gradient_type = get_gradient_type(*gradient); + + request.entryPoint = kernelName; + request.passes = serverPasses; + request.format = cudaq::CodeFormat::MLIR; + request.code = + constructKernelPayload(mlirContext, kernelName, /*kernelFunc=*/nullptr, + /*kernelArgs=*/kernelArgs, + /*argsSize=*/0, /*startingArgIdx=*/1); + request.simulator = backendSimName; + // Remote server seed + // Note: unlike local executions whereby a static instance of the simulator + // is seeded once when `cudaq::set_random_seed` is called, thus not being + // re-seeded between executions. For remote executions, we use the runtime + // level seed value to seed a random number generator to seed the server. + // i.e., consecutive remote executions on the server from the same client + // session (where `cudaq::set_random_seed` is called), get new random seeds + // for each execution. The sequence is still deterministic based on the + // runtime-level seed value. 
+ request.seed = [&]() { + std::uniform_int_distribution seedGen( + std::numeric_limits::min(), + std::numeric_limits::max()); + return seedGen(randEngine); + }(); + return request; + } cudaq::RestRequest constructJobRequest( mlir::MLIRContext &mlirContext, cudaq::ExecutionContext &io_context, cudaq::SerializedCodeExecutionContext *serializedCodeContext, const std::string &backendSimName, const std::string &kernelName, - void (*kernelFunc)(void *), void *kernelArgs, std::uint64_t argsSize) { + void (*kernelFunc)(void *), const void *kernelArgs, + std::uint64_t argsSize) { cudaq::RestRequest request(io_context, version()); if (serializedCodeContext) @@ -257,19 +301,22 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient { cudaq::IRPayLoad stateIrPayload1, stateIrPayload2; stateIrPayload1.entryPoint = kernelName1; - stateIrPayload1.ir = constructKernelPayload(mlirContext, kernelName1, - nullptr, args1, argsSize1); + stateIrPayload1.ir = + constructKernelPayload(mlirContext, kernelName1, nullptr, args1, + argsSize1, /*startingArgIdx=*/0); stateIrPayload2.entryPoint = kernelName2; - stateIrPayload2.ir = constructKernelPayload(mlirContext, kernelName2, - nullptr, args2, argsSize2); + stateIrPayload2.ir = + constructKernelPayload(mlirContext, kernelName2, nullptr, args2, + argsSize2, /*startingArgIdx=*/0); // First kernel of the overlap calculation request.code = stateIrPayload1.ir; request.entryPoint = stateIrPayload1.entryPoint; // Second kernel of the overlap calculation request.overlapKernel = stateIrPayload2; } else if (serializedCodeContext == nullptr) { - request.code = constructKernelPayload(mlirContext, kernelName, kernelFunc, - kernelArgs, argsSize); + request.code = + constructKernelPayload(mlirContext, kernelName, kernelFunc, + kernelArgs, argsSize, /*startingArgIdx=*/0); } request.simulator = backendSimName; // Remote server seed @@ -294,17 +341,26 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient { sendRequest(mlir::MLIRContext &mlirContext, cudaq::ExecutionContext &io_context, cudaq::SerializedCodeExecutionContext *serializedCodeContext, - const std::string &backendSimName, const std::string &kernelName, - void (*kernelFunc)(void *), void *kernelArgs, - std::uint64_t argsSize, std::string *optionalErrorMsg) override { + cudaq::gradient *vqe_gradient, cudaq::optimizer *vqe_optimizer, + const int vqe_n_params, const std::string &backendSimName, + const std::string &kernelName, void (*kernelFunc)(void *), + const void *kernelArgs, std::uint64_t argsSize, + std::string *optionalErrorMsg) override { if (isDisallowed(io_context.name)) throw std::runtime_error( io_context.name + " operation is not supported with cudaq target remote-mqpu!"); - cudaq::RestRequest request = constructJobRequest( - mlirContext, io_context, serializedCodeContext, backendSimName, - kernelName, kernelFunc, kernelArgs, argsSize); + cudaq::RestRequest request = [&]() { + if (vqe_n_params > 0) + return constructVQEJobRequest(mlirContext, io_context, backendSimName, + kernelName, kernelArgs, vqe_gradient, + *vqe_optimizer, vqe_n_params); + return constructJobRequest(mlirContext, io_context, serializedCodeContext, + backendSimName, kernelName, kernelFunc, + kernelArgs, argsSize); + }(); + if (request.code.empty() && (serializedCodeContext == nullptr || serializedCodeContext->source_code.empty())) { if (optionalErrorMsg) @@ -705,9 +761,11 @@ class BaseNvcfRuntimeClient : public cudaq::BaseRemoteRestRuntimeClient { sendRequest(mlir::MLIRContext &mlirContext, 
cudaq::ExecutionContext &io_context, cudaq::SerializedCodeExecutionContext *serializedCodeContext, - const std::string &backendSimName, const std::string &kernelName, - void (*kernelFunc)(void *), void *kernelArgs, - std::uint64_t argsSize, std::string *optionalErrorMsg) override { + cudaq::gradient *vqe_gradient, cudaq::optimizer *vqe_optimizer, + const int vqe_n_params, const std::string &backendSimName, + const std::string &kernelName, void (*kernelFunc)(void *), + const void *kernelArgs, std::uint64_t argsSize, + std::string *optionalErrorMsg) override { if (isDisallowed(io_context.name)) throw std::runtime_error( io_context.name + @@ -735,9 +793,15 @@ class BaseNvcfRuntimeClient : public cudaq::BaseRemoteRestRuntimeClient { } } // Construct the base `cudaq-qpud` request payload. - cudaq::RestRequest request = constructJobRequest( - mlirContext, io_context, serializedCodeContext, backendSimName, - kernelName, kernelFunc, kernelArgs, argsSize); + cudaq::RestRequest request = [&]() { + if (vqe_n_params > 0) + return constructVQEJobRequest(mlirContext, io_context, backendSimName, + kernelName, kernelArgs, vqe_gradient, + *vqe_optimizer, vqe_n_params); + return constructJobRequest(mlirContext, io_context, serializedCodeContext, + backendSimName, kernelName, kernelFunc, + kernelArgs, argsSize); + }(); if (request.code.empty() && (serializedCodeContext == nullptr || serializedCodeContext->source_code.empty())) { diff --git a/runtime/common/KernelWrapper.h b/runtime/common/KernelWrapper.h index 29b44d16e3c..e0af46e71ee 100644 --- a/runtime/common/KernelWrapper.h +++ b/runtime/common/KernelWrapper.h @@ -422,6 +422,8 @@ class WrapperFunctionHandlerHelper { public: using ArgTuple = std::tuple...>; using ArgIndices = std::make_index_sequence::value>; + using ArgIndicesPlus1 = + std::make_index_sequence<1 + std::tuple_size::value>; template static void invoke(CallableT &&func, const char *argData, @@ -436,6 +438,22 @@ class WrapperFunctionHandlerHelper { ArgIndices{}); } + // Specialization when the 1st std::vector argument has been excluded + // from the serialized args, but now you want to call it. + template + static void invoke(CallableT &&func, const std::vector &vec_parms, + const char *argData, std::size_t argSize) { + ArgTuple argsTuple; + // Deserialize buffer to args tuple + if (!deserialize(argData, argSize, argsTuple, ArgIndices{})) + throw std::runtime_error( + "Failed to deserialize arguments for wrapper function call"); + // Call the wrapped function with args tuple + auto newArgsTuple = std::tuple_cat(std::make_tuple(vec_parms), argsTuple); + WrapperFunctionHandlerCaller::call(std::forward(func), + newArgsTuple, ArgIndicesPlus1{}); + } + private: // Helper to deserialize a flat args buffer into typed args tuple. template @@ -476,6 +494,19 @@ void invokeCallableWithSerializedArgs(const char *argData, std::size_t argSize, InvokeArgTs...>::invoke(std::forward(func), argData, argSize); } +// Invoke a typed callable (functions) with a std::vec + serialized +// `args`. +template +void invokeCallableWithSerializedArgs_vec(const std::vector &vec_parms, + const char *argData, + std::size_t argSize, + CallableT &&func) { + WrapperFunctionHandlerHelper< + std::remove_reference_t, + InvokeArgTs...>::invoke(std::forward(func), vec_parms, argData, + argSize); +} + // Wrapper for quantum kernel invocation, i.e., `kernel(args...)`. // In library mode, if the remote platform is used, we redirect it to the // platform's `launchKernel` instead of invoking it. 
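The `_vec` specialization above exists so that a remote server can re-bind the variational vector on every optimizer iteration while the remaining, already-serialized arguments stay fixed. A minimal, self-contained sketch of the underlying technique (plain C++17; names are illustrative, not code from the patch):

#include <tuple>
#include <utility>
#include <vector>

// Mimics the approach in WrapperFunctionHandlerHelper::invoke: take the fixed
// (formerly serialized) arguments as a tuple, prepend the variational vector
// with std::tuple_cat, and apply the kernel to the combined tuple.
template <typename Callable, typename... Args>
void invoke_with_vec(const std::vector<double> &x,
                     std::tuple<Args...> fixedArgs, Callable &&kernel) {
  auto all = std::tuple_cat(std::make_tuple(x), std::move(fixedArgs));
  std::apply(std::forward<Callable>(kernel), std::move(all));
}

// Usage: invoke_with_vec({0.59}, std::make_tuple(2), ansatz) calls
// ansatz(std::vector<double>{0.59}, 2).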
diff --git a/runtime/common/RemoteKernelExecutor.h b/runtime/common/RemoteKernelExecutor.h index 8647a2a5ff8..969d76b92b3 100644 --- a/runtime/common/RemoteKernelExecutor.h +++ b/runtime/common/RemoteKernelExecutor.h @@ -24,6 +24,8 @@ class MLIRContext; } namespace cudaq { class ExecutionContext; +class gradient; +class optimizer; class SerializedCodeExecutionContext; /// Base interface encapsulating a CUDA-Q runtime server capable of @@ -49,7 +51,14 @@ class RemoteRuntimeServer std::string_view ir, std::string_view kernelName, void *kernelArgs, std::uint64_t argsSize, std::size_t seed) = 0; - + // Handle incoming VQE requests + virtual void handleVQERequest(std::size_t reqId, + cudaq::ExecutionContext &io_context, + const std::string &backendSimName, + std::string_view ir, cudaq::gradient *gradient, + cudaq::optimizer &optimizer, const int n_params, + std::string_view kernelName, + std::size_t seed) = 0; // Destructor virtual ~RemoteRuntimeServer() = default; }; @@ -79,9 +88,10 @@ class RemoteRuntimeClient virtual bool sendRequest(mlir::MLIRContext &mlirContext, ExecutionContext &io_context, SerializedCodeExecutionContext *serializedCodeContext, - const std::string &backendSimName, const std::string &kernelName, - void (*kernelFunc)(void *), void *kernelArgs, - std::uint64_t argsSize, + cudaq::gradient *vqe_gradient, cudaq::optimizer *vqe_optimizer, + const int vqe_n_params, const std::string &backendSimName, + const std::string &kernelName, void (*kernelFunc)(void *), + const void *kernelArgs, std::uint64_t argsSize, std::string *optionalErrorMsg = nullptr) = 0; // Destructor virtual ~RemoteRuntimeClient() = default; diff --git a/runtime/cudaq/algorithms/gradient.h b/runtime/cudaq/algorithms/gradient.h index e094d7bb286..fe848fde561 100644 --- a/runtime/cudaq/algorithms/gradient.h +++ b/runtime/cudaq/algorithms/gradient.h @@ -37,12 +37,22 @@ class gradient { /// The parameterized ansatz, a quantum kernel expression std::function)> ansatz_functor; + // As an alternative to an ArgsMapper, we can have serialized arguments + // (excluding the initial std::vector variational parameters). + std::vector serializedArgs; + // Given the parameters x and the spin_op h, compute the // expected value with respect to the ansatz. double getExpectedValue(std::vector &x, spin_op h) { return cudaq::observe(ansatz_functor, h, x); } + // Copy constructor. Derived classes should implement the clone() method. + gradient(const gradient &o) { + ansatz_functor = o.ansatz_functor; + serializedArgs = o.serializedArgs; + } + public: /// Constructor, takes the quantum kernel with prescribed signature gradient(std::function)> &&kernel) @@ -66,14 +76,48 @@ class gradient { }; } + /// Take the quantum kernel and concrete arguments for all arguments except + /// the first std::vector argument, which is used for the variational + /// parameters for the gradient. Serialize and save those arguments into this + /// object. (Useful for NVQC.) + template + void setArgs(QuantumKernel &kernel, Args &&...args) { + static_assert( + std::is_invocable_v, Args...>, + "Kernel must be invocable with std::vector and Args..."); + // Serialize all the parameters except for the first std::vector + // parameter. The serialized ones will be saved and used later during each + // ansatz_functor invocation. 
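+ // Illustrative usage (identifiers hypothetical):
+ //   cudaq::gradients::central_difference grad;
+ //   grad.setArgs(kernel, n_qubits, n_layers);
+ // After this, the gradient can be evaluated without an ArgMapper because
+ // the non-variational arguments are already bound.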
+ serializedArgs = serializeArgs(std::forward(args)...); + ansatz_functor = [&](std::vector x) { + cudaq::invokeCallableWithSerializedArgs_vec...>( + x, serializedArgs.data(), serializedArgs.size(), + std::forward(kernel)); + }; + } + + /// Set the kernel after the gradient has been constructed. Use of this + /// function requires that the kernel ONLY accept the variational parameters. + /// It cannot have any non-variational parameters. + template + void setKernel(QuantumKernel &kernel) { + static_assert(std::is_invocable_v>, + "Kernel must be invocable with std::vector"); + ansatz_functor = kernel; + } + /// Constructor, takes a callable that must have the /// prescribed call signature (void(std::vector)) - template + template >>> gradient(KernelT &kernel) { - if (kernel.getNumParams() != 1) - throw std::invalid_argument( - "Callable kernel from cudaq::make_kernel must " - "have 1 std::vector argument. Provide an ArgMapper if not."); + if constexpr (has_name::value) + if (kernel.getNumParams() != 1) + throw std::invalid_argument( + "Callable kernel from cudaq::make_kernel must " + "have 1 std::vector argument. Provide an ArgMapper if " + "not."); ansatz_functor = [&](std::vector x) { return cudaq::invokeKernel(std::forward(kernel), x); }; @@ -107,6 +151,9 @@ class gradient { const std::function)> &func, double funcAtX) = 0; + /// Clone the object. Must be implemented by derived classes. + virtual std::unique_ptr clone() = 0; + virtual ~gradient() = default; }; } // namespace cudaq diff --git a/runtime/cudaq/algorithms/gradients/central_difference.h b/runtime/cudaq/algorithms/gradients/central_difference.h index 08bb72ea926..79840f4a98a 100644 --- a/runtime/cudaq/algorithms/gradients/central_difference.h +++ b/runtime/cudaq/algorithms/gradients/central_difference.h @@ -17,6 +17,12 @@ class central_difference : public gradient { using gradient::gradient; double step = 1e-4; + virtual std::unique_ptr clone() override { + auto newGrad = std::make_unique(*this); + newGrad->step = this->step; + return newGrad; + } + void compute(const std::vector &x, std::vector &dx, const spin_op &h, double exp_h) override { auto tmpX = x; diff --git a/runtime/cudaq/algorithms/gradients/forward_difference.h b/runtime/cudaq/algorithms/gradients/forward_difference.h index 3663ae2cda1..777a1843d64 100644 --- a/runtime/cudaq/algorithms/gradients/forward_difference.h +++ b/runtime/cudaq/algorithms/gradients/forward_difference.h @@ -19,6 +19,12 @@ class forward_difference : public gradient { using gradient::gradient; double step = 1e-4; + virtual std::unique_ptr clone() override { + auto newGrad = std::make_unique(*this); + newGrad->step = this->step; + return newGrad; + } + /// @brief Compute the `forward_difference` gradient void compute(const std::vector &x, std::vector &dx, const spin_op &h, double funcAtX) override { diff --git a/runtime/cudaq/algorithms/gradients/parameter_shift.h b/runtime/cudaq/algorithms/gradients/parameter_shift.h index cb5963aed9d..18b131620f4 100644 --- a/runtime/cudaq/algorithms/gradients/parameter_shift.h +++ b/runtime/cudaq/algorithms/gradients/parameter_shift.h @@ -16,6 +16,12 @@ class parameter_shift : public gradient { using gradient::gradient; double shiftScalar = 0.5; + virtual std::unique_ptr clone() override { + auto newGrad = std::make_unique(*this); + newGrad->shiftScalar = this->shiftScalar; + return newGrad; + } + void compute(const std::vector &x, std::vector &dx, const spin_op &h, double exp_h) override { auto tmpX = x; diff --git a/runtime/cudaq/algorithms/vqe.h 
b/runtime/cudaq/algorithms/vqe.h index f97d4574817..47dfc3696ed 100644 --- a/runtime/cudaq/algorithms/vqe.h +++ b/runtime/cudaq/algorithms/vqe.h @@ -7,12 +7,48 @@ ******************************************************************************/ #pragma once +#include "cudaq/gradients.h" #include "gradient.h" #include "observe.h" #include "optimizer.h" +#include namespace cudaq { +namespace __internal__ { +/// \brief This is an internal helper function to reduce duplicated code in the +/// user-facing `vqe()` functions below. Users should not directly call this +/// function. +template , Args...>>> +static inline optimization_result +remote_vqe(cudaq::quantum_platform &platform, QuantumKernel &&kernel, + cudaq::spin_op &H, cudaq::optimizer &optimizer, + cudaq::gradient *gradient, const int n_params, + const std::size_t shots, Args &&...args) { + auto ctx = std::make_unique("observe", shots); + ctx->kernelName = cudaq::getKernelName(kernel); + ctx->spin = &H; + platform.set_exec_ctx(ctx.get()); + auto serializedArgsBuffer = serializeArgs(args...); + platform.launchVQE(ctx->kernelName, serializedArgsBuffer.data(), gradient, H, + optimizer, n_params, shots); + platform.reset_exec_ctx(); + return ctx->optResult.value_or(optimization_result{}); +} + +static inline void print_arg_mapper_warning() { + printf( + "WARNING: Usage of ArgMapper type on this platform will result in " + "suboptimal performance. Consider updating your code to update your " + "kernel to use this signature (std::function, " + "arg1, arg2, ...)>) and pass concrete arguments to cudaq::vqe() for " + "the non-variational arguments.\n"); +} + +} // namespace __internal__ + /// /// \brief Compute the minimal eigenvalue of \p H with VQE. /// @@ -24,6 +60,8 @@ namespace cudaq { /// gradients. /// \param n_params The number of variational parameters in the ansatz quantum /// kernel callable. +/// \param args Non-variational arguments to \p kernel that will be passed to +/// \p kernel on each invocation during VQE. /// \returns The optimal value and corresponding parameters as a /// cudaq::optimization_result (std::tuple>) /// @@ -52,22 +90,31 @@ namespace cudaq { /// auto [val, params] = cudaq::vqe(ansatz{}, H, optimizer, 1); /// \endcode /// -template +template , Args...>>> optimization_result vqe(QuantumKernel &&kernel, cudaq::spin_op H, - cudaq::optimizer &optimizer, const int n_params) { + cudaq::optimizer &optimizer, const int n_params, + Args &&...args) { static_assert( - std::is_invocable_v>, + std::is_invocable_v, Args...>, "Invalid parameterized quantum kernel expression. Must have " - "void(std::vector) signature, or provide " + "void(std::vector, ) signature, or provide " "std::tuple(std::vector) ArgMapper function object."); if (optimizer.requiresGradients()) { throw std::invalid_argument("Provided cudaq::optimizer requires gradients. " "Please provide a cudaq::gradient instance."); } + auto &platform = cudaq::get_platform(); + if (platform.supports_remote_vqe()) + return __internal__::remote_vqe(platform, kernel, H, optimizer, + /*gradient=*/nullptr, n_params, /*shots=*/0, + args...); + return optimizer.optimize(n_params, [&](const std::vector &x, std::vector &grad_vec) { - double e = cudaq::observe(kernel, H, x); + double e = cudaq::observe(kernel, H, x, args...); return e; }); } @@ -84,6 +131,8 @@ optimization_result vqe(QuantumKernel &&kernel, cudaq::spin_op H, /// gradients. /// \param n_params The number of variational parameters in the ansatz quantum /// kernel callable. 
@@ -84,6 +131,8 @@ optimization_result vqe(QuantumKernel &&kernel, cudaq::spin_op H,
 /// gradients.
 /// \param n_params The number of variational parameters in the ansatz quantum
 /// kernel callable.
+/// \param args Non-variational arguments to \p kernel that will be passed to
+/// \p kernel on each invocation during VQE.
 /// \returns The optimal value and corresponding parameters as a
 /// cudaq::optimization_result (std::tuple<double, std::vector<double>>)
 ///
@@ -112,23 +161,32 @@ optimization_result vqe(QuantumKernel &&kernel, cudaq::spin_op H,
 ///   auto [val, params] = cudaq::vqe(/*shots*/ 100, ansatz{}, H, optimizer, 1);
 /// \endcode
 ///
-template <typename QuantumKernel>
+template <typename QuantumKernel, typename... Args,
+          typename = std::enable_if_t<std::is_invocable_v<
+              QuantumKernel, std::vector<double>, Args...>>>
 optimization_result vqe(std::size_t shots, QuantumKernel &&kernel,
                         cudaq::spin_op H, cudaq::optimizer &optimizer,
-                        const int n_params) {
+                        const int n_params, Args &&...args) {
   static_assert(
-      std::is_invocable_v<QuantumKernel, std::vector<double>>,
+      std::is_invocable_v<QuantumKernel, std::vector<double>, Args...>,
       "Invalid parameterized quantum kernel expression. Must have "
-      "void(std::vector<double>) signature, or provide "
+      "void(std::vector<double>, <Args...>) signature, or provide "
      "std::tuple(std::vector<double>) ArgMapper function object.");
   if (optimizer.requiresGradients()) {
    throw std::invalid_argument("Provided cudaq::optimizer requires gradients. "
                                "Please provide a cudaq::gradient instance.");
  }
+  auto &platform = cudaq::get_platform();
+  if (platform.supports_remote_vqe())
+    return __internal__::remote_vqe(platform, kernel, H, optimizer,
+                                    /*gradient=*/nullptr, n_params, shots,
+                                    args...);
+
   return optimizer.optimize(n_params, [&](const std::vector<double> &x,
                                           std::vector<double> &grad_vec) {
-    double e = cudaq::observe(shots, kernel, H, x);
+    observe_options options{static_cast<int>(shots), cudaq::noise_model{}};
+    double e = cudaq::observe(options, kernel, H, x, args...);
     return e;
   });
 }
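The shots-based overload now funnels the shot count through `observe_options`
so the trailing arguments can ride along. Reusing the names from the sketch
above (illustrative only):

  // Same ansatz and Hamiltonian as the previous sketch, now with a finite
  // shot budget; n_qubits is still forwarded verbatim to the kernel.
  auto [opt_val, opt_params] = cudaq::vqe(/*shots=*/1000, ansatz{}, h,
                                          optimizer, /*n_params=*/1,
                                          /*n_qubits=*/2);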
@@ -147,6 +205,8 @@ optimization_result vqe(std::size_t shots, QuantumKernel &&kernel,
 /// the minimal eigenvalue of \p H.
 /// \param n_params The number of variational parameters in the ansatz quantum
 /// kernel callable.
+/// \param args Non-variational arguments to \p kernel that will be passed to
+/// \p kernel on each invocation during VQE.
 /// \returns The optimal value and corresponding parameters as a
 /// cudaq::optimization_result (std::tuple<double, std::vector<double>>)
 ///
@@ -176,22 +236,45 @@ optimization_result vqe(std::size_t shots, QuantumKernel &&kernel,
 ///       cudaq::vqe(ansatz, gradient, H, optimizer, 1);
 /// \endcode
 ///
-template <typename QuantumKernel>
+template <typename QuantumKernel, typename... Args,
+          typename = std::enable_if_t<std::is_invocable_v<
+              QuantumKernel, std::vector<double>, Args...>>>
 optimization_result vqe(QuantumKernel &&kernel, cudaq::gradient &gradient,
                         cudaq::spin_op H, cudaq::optimizer &optimizer,
-                        const int n_params) {
+                        const int n_params, Args &&...args) {
   static_assert(
-      std::is_invocable_v<QuantumKernel, std::vector<double>>,
+      std::is_invocable_v<QuantumKernel, std::vector<double>> ||
+          std::is_invocable_v<QuantumKernel, std::vector<double>, Args...>,
       "Invalid parameterized quantum kernel expression. Must have "
-      "void(std::vector<double>) signature, or provide "
+      "void(std::vector<double>, <Args...>) signature, or provide "
      "std::tuple(std::vector<double>) ArgMapper function object.");
+
+  auto &platform = cudaq::get_platform();
+  if (platform.supports_remote_vqe())
+    return __internal__::remote_vqe(platform, kernel, H, optimizer, &gradient,
+                                    n_params,
+                                    /*shots=*/0, args...);
+
   auto requires_grad = optimizer.requiresGradients();
-  return optimizer.optimize(n_params, [&](const std::vector<double> &x,
-                                          std::vector<double> &grad_vec) {
-    double e = cudaq::observe(kernel, H, x);
+  // If there are additional arguments, we need to clone the gradient and
+  // provide it the concrete arguments.
+  // Note: the strange initialization of newGrad is to avoid a C++17 compiler
+  // error that happens because the `swap` is ambiguous between the unique_ptr
+  // and the qubit swap.
+  std::unique_ptr<cudaq::gradient> newGrad = [&]() {
     if (requires_grad) {
-      gradient.compute(x, grad_vec, H, e);
+      auto newGrad_ = gradient.clone();
+      if constexpr (sizeof...(args) > 0)
+        newGrad_->setArgs(kernel, args...);
+      return newGrad_;
     }
+    return std::unique_ptr<cudaq::gradient>();
+  }();
+  return optimizer.optimize(n_params, [&](const std::vector<double> &x,
+                                          std::vector<double> &grad_vec) {
+    double e = cudaq::observe(kernel, H, x, args...);
+    if (requires_grad)
+      newGrad->compute(x, grad_vec, H, e);
     return e;
   });
 }
@@ -248,7 +331,11 @@ optimization_result vqe(QuantumKernel &&kernel, cudaq::gradient &gradient,
 ///       });
 /// \endcode
 ///
-template <typename QuantumKernel, typename ArgMapper>
+template <typename QuantumKernel, typename ArgMapper,
+          typename = std::enable_if_t<
+              std::is_invocable_v<ArgMapper, std::vector<double>> ||
+              std::is_invocable_v<ArgMapper, std::vector<double> &> ||
+              std::is_invocable_v<ArgMapper, const std::vector<double> &>>>
 optimization_result vqe(QuantumKernel &&kernel, cudaq::spin_op H,
                         cudaq::optimizer &optimizer, const int n_params,
                         ArgMapper &&argsMapper) {
@@ -258,6 +345,9 @@ optimization_result vqe(QuantumKernel &&kernel, cudaq::spin_op H,
     "Please provide a cudaq::gradient instance. Make sure the gradient is "
     "aware of the ArgMapper.");
   }
+  if (cudaq::get_platform().supports_remote_vqe())
+    __internal__::print_arg_mapper_warning();
+
   return optimizer.optimize(n_params, [&](const std::vector<double> &x,
                                           std::vector<double> &grad_vec) {
     auto args = argsMapper(x);
@@ -323,7 +413,11 @@ optimization_result vqe(QuantumKernel &&kernel, cudaq::spin_op H,
 ///       });
 /// \endcode
 ///
-template <typename QuantumKernel, typename ArgMapper>
+template <typename QuantumKernel, typename ArgMapper,
+          typename = std::enable_if_t<
+              std::is_invocable_v<ArgMapper, std::vector<double>> ||
+              std::is_invocable_v<ArgMapper, std::vector<double> &> ||
+              std::is_invocable_v<ArgMapper, const std::vector<double> &>>>
 optimization_result vqe(std::size_t shots, QuantumKernel &&kernel,
                         cudaq::spin_op H, cudaq::optimizer &optimizer,
                         const int n_params, ArgMapper &&argsMapper) {
@@ -333,6 +427,9 @@ optimization_result vqe(std::size_t shots, QuantumKernel &&kernel,
     "Please provide a cudaq::gradient instance. Make sure the gradient is "
     "aware of the ArgMapper.");
   }
+  if (cudaq::get_platform().supports_remote_vqe())
+    __internal__::print_arg_mapper_warning();
+
   return optimizer.optimize(n_params, [&](const std::vector<double> &x,
                                           std::vector<double> &grad_vec) {
     auto args = argsMapper(x);
@@ -373,11 +470,18 @@ optimization_result vqe(std::size_t shots, QuantumKernel &&kernel,
 /// \p H. This function will use the custom ArgMapper to map input variational
 /// parameters to a tuple for use in evaluating the kernel function.
 ///
-template <typename QuantumKernel, typename ArgMapper>
+template <typename QuantumKernel, typename ArgMapper,
+          typename = std::enable_if_t<
+              std::is_invocable_v<ArgMapper, std::vector<double>> ||
+              std::is_invocable_v<ArgMapper, std::vector<double> &> ||
+              std::is_invocable_v<ArgMapper, const std::vector<double> &>>>
 optimization_result vqe(QuantumKernel &&kernel, cudaq::gradient &gradient,
                         cudaq::spin_op H, cudaq::optimizer &optimizer,
                         const int n_params, ArgMapper &&argsMapper) {
   bool requiresGrad = optimizer.requiresGradients();
+  if (cudaq::get_platform().supports_remote_vqe())
+    __internal__::print_arg_mapper_warning();
+
   return optimizer.optimize(n_params, [&](const std::vector<double> &x,
                                           std::vector<double> &grad_vec) {
     auto args = argsMapper(x);
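The ArgMapper overloads remain for backward compatibility, but on platforms
with remote VQE support they fall back to per-iteration launches, hence the
warning. A before/after sketch of the migration the warning asks for (names
are illustrative):

// Before: an ArgMapper closes over the concrete arguments.
auto argMapper = [&](std::vector<double> x) {
  return std::make_tuple(x, n_qubits, n_layers);
};
auto [v1, p1] = cudaq::vqe(ansatz, H, optimizer, n_params, argMapper);

// After: pass the concrete arguments directly; on NVQC the whole
// optimization loop can then run server-side.
auto [v2, p2] = cudaq::vqe(ansatz, H, optimizer, n_params, n_qubits, n_layers);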
diff --git a/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp b/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp
index c8ad3046f0c..c6ec69ff701 100644
--- a/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp
+++ b/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp
@@ -259,6 +259,80 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
   // Stop the server.
   virtual void stop() override { m_server->stop(); }

+  virtual void handleVQERequest(std::size_t reqId,
+                                cudaq::ExecutionContext &io_context,
+                                const std::string &backendSimName,
+                                std::string_view ir, cudaq::gradient *gradient,
+                                cudaq::optimizer &optimizer, const int n_params,
+                                std::string_view kernelName,
+                                std::size_t seed) override {
+    cudaq::optimization_result result;
+
+    // If we're changing the backend, load the new simulator library from file.
+    if (m_simHandle.name != backendSimName) {
+      if (m_simHandle.libHandle)
+        dlclose(m_simHandle.libHandle);
+
+      m_simHandle =
+          SimulatorHandle(backendSimName, loadNvqirSimLib(backendSimName));
+    }
+
+    if (seed != 0)
+      cudaq::set_random_seed(seed);
+    simulationStart = std::chrono::high_resolution_clock::now();
+
+    auto &requestInfo = m_codeTransform[reqId];
+    if (requestInfo.format == cudaq::CodeFormat::LLVM) {
+      throw std::runtime_error("CodeFormat::LLVM is not supported with VQE. "
+                               "Use CodeFormat::MLIR instead.");
+    } else {
+      llvm::SourceMgr sourceMgr;
+      sourceMgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBufferCopy(ir),
+                                   llvm::SMLoc());
+      auto module = parseSourceFile<mlir::ModuleOp>(sourceMgr, m_mlirContext.get());
+      if (!module)
+        throw std::runtime_error("Failed to parse the input MLIR code");
+      auto engine = jitMlirCode(*module, requestInfo.passes);
+      const std::string entryPointFunc =
+          std::string(cudaq::runtime::cudaqGenPrefixName) +
+          std::string(kernelName);
+      auto fnPtr =
+          getValueOrThrow(engine->lookup(entryPointFunc),
+                          "Failed to look up entry-point function symbol");
+      if (!fnPtr)
+        throw std::runtime_error("Failed to get entry function");
+
+      // quake-to-qir translates cc.stdvec<f64> to !llvm.struct<(ptr<f64>,
+      // i64)>, so we need to provide the inputs in this format. Make a lambda
+      // to convert between the two formats.
+      struct stdvec_struct {
+        const double *ptr;
+        std::size_t size;
+      };
+      auto fn = reinterpret_cast<void (*)(stdvec_struct)>(fnPtr);
+      auto fnWrapper = [fn](const std::vector<double> &x) {
+        fn({x.data(), x.size()});
+      };
+
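The `stdvec_struct` bridge above is the whole ABI story: the JIT'ed entry
point consumes a (pointer, size) pair where the C++ signature had
`std::vector<double>`. A standalone illustration of the same adapter pattern,
with `entry` standing in for the looked-up JIT symbol (plain C++, no MLIR
involved):

#include <cstddef>
#include <cstdio>
#include <vector>

// The JIT'ed entry point sees a (ptr, size) struct, not a std::vector.
struct stdvec_struct {
  const double *ptr;
  std::size_t size;
};

// Stand-in for the entry-point symbol resolved from the JIT engine.
void entry(stdvec_struct v) {
  for (std::size_t i = 0; i < v.size; i++)
    printf("theta[%zu] = %f\n", i, v.ptr[i]);
}

int main() {
  auto fn = &entry;
  // Same shape as fnWrapper above: adapt vector-based callers to the ABI.
  auto fnWrapper = [fn](const std::vector<double> &x) {
    fn({x.data(), x.size()});
  };
  fnWrapper({0.1, 0.2, 0.3});
}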
+      // Construct the gradient object.
+      if (gradient)
+        gradient->setKernel(fnWrapper);
+
+      bool requiresGrad = optimizer.requiresGradients();
+      auto theSpin = **io_context.spin;
+
+      result = optimizer.optimize(n_params, [&](const std::vector<double> &x,
+                                                std::vector<double> &grad_vec) {
+        double e = cudaq::observe(fnWrapper, theSpin, x);
+        if (requiresGrad)
+          gradient->compute(x, grad_vec, theSpin, e);
+        return e;
+      });
+    }
+    simulationEnd = std::chrono::high_resolution_clock::now();
+    io_context.optResult = result;
+  }
+
   virtual void handleRequest(std::size_t reqId,
                              cudaq::ExecutionContext &io_context,
                              const std::string &backendSimName,
@@ -595,7 +669,25 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
     m_codeTransform[reqId] = CodeTransformInfo(request.format, request.passes);
     json resultJson;
-    if (request.executionContext.name == "state-overlap") {
+    std::vector<char> decodedCodeIr;
+    auto errorCode = llvm::decodeBase64(request.code, decodedCodeIr);
+    if (errorCode) {
+      LLVMConsumeError(llvm::wrap(std::move(errorCode)));
+      throw std::runtime_error("Failed to decode input IR");
+    }
+    std::string_view codeStr(decodedCodeIr.data(), decodedCodeIr.size());
+
+    if (request.opt.has_value() && request.opt->optimizer) {
+      if (!request.opt->optimizer_n_params.has_value())
+        throw std::runtime_error(
+            "Cannot run optimizer without providing optimizer_n_params");
+
+      handleVQERequest(
+          reqId, request.executionContext, request.simulator, codeStr,
+          request.opt->gradient.get(), *request.opt->optimizer,
+          *request.opt->optimizer_n_params, request.entryPoint, request.seed);
+      resultJson["executionContext"] = request.executionContext;
+    } else if (request.executionContext.name == "state-overlap") {
       if (!request.overlapKernel.has_value())
         throw std::runtime_error("Missing overlap kernel data.");
       std::vector<char> decodedCodeIr1, decodedCodeIr2;
diff --git a/runtime/cudaq/platform/qpu.h b/runtime/cudaq/platform/qpu.h
index 274d9577737..fefc83d8091 100644
--- a/runtime/cudaq/platform/qpu.h
+++ b/runtime/cudaq/platform/qpu.h
@@ -19,6 +19,8 @@
 #include <optional>

 namespace cudaq {
+class gradient;
+class optimizer;
 class SerializedCodeExecutionContext;

 /// Expose the function that will return the current ExecutionManager
@@ -136,6 +138,9 @@ class QPU : public registry::RegisteredType<QPU> {
   /// @brief Return whether this QPU has conditional feedback support
   virtual bool supportsConditionalFeedback() { return false; }

+  /// @brief Return whether this QPU has remote VQE execution support
+  virtual bool supportsRemoteVQE() { return false; }
+
   /// @brief Return whether this QPU has support for remote serialized code
   /// execution
   virtual bool supportsRemoteSerializedCode() { return false; }
@@ -160,6 +165,11 @@ class QPU : public registry::RegisteredType<QPU> {
   virtual void resetExecutionContext() = 0;
   virtual void setTargetBackend(const std::string &backend) {}

+  virtual void launchVQE(const std::string &name, const void *kernelArgs,
+                         cudaq::gradient *gradient, cudaq::spin_op H,
+                         cudaq::optimizer &optimizer, const int n_params,
+                         const std::size_t shots) {}
+
   /// Launch the kernel with given name (to extract its Quake representation).
   /// The raw function pointer is also provided, as are the runtime arguments,
   /// as a struct-packed void pointer and its corresponding size.
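Client code can observe the same capability flag the new `vqe()` overloads
consult. A tiny probe; the printed values are an expectation based on this
patch, not a guarantee for every target:

#include <cudaq.h>
#include <cstdio>

int main() {
  auto &platform = cudaq::get_platform();
  // Expected to print 1 under --target remote-mqpu (whose QPU overrides
  // supportsRemoteVQE) and 0 for purely local simulation targets.
  printf("remote VQE support: %d\n",
         static_cast<int>(platform.supports_remote_vqe(/*qpuId=*/0)));
}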
diff --git a/runtime/cudaq/platform/quantum_platform.cpp b/runtime/cudaq/platform/quantum_platform.cpp
index 722434442fb..c676bc04dfb 100644
--- a/runtime/cudaq/platform/quantum_platform.cpp
+++ b/runtime/cudaq/platform/quantum_platform.cpp
@@ -126,6 +126,27 @@ bool quantum_platform::supports_conditional_feedback(
   return platformQPUs[qpu_id]->supportsConditionalFeedback();
 }

+bool quantum_platform::supports_remote_vqe(const std::size_t qpu_id) const {
+  return platformQPUs[qpu_id]->supportsRemoteVQE();
+}
+
+void quantum_platform::launchVQE(const std::string kernelName,
+                                 const void *kernelArgs,
+                                 cudaq::gradient *gradient, cudaq::spin_op H,
+                                 cudaq::optimizer &optimizer,
+                                 const int n_params, const std::size_t shots) {
+  std::size_t qpu_id = 0;
+
+  auto tid = std::hash<std::thread::id>{}(std::this_thread::get_id());
+  auto iter = threadToQpuId.find(tid);
+  if (iter != threadToQpuId.end())
+    qpu_id = iter->second;
+
+  auto &qpu = platformQPUs[qpu_id];
+  qpu->launchVQE(kernelName, kernelArgs, gradient, H, optimizer, n_params,
+                 shots);
+}
+
 bool quantum_platform::supports_remote_serialized_code(
     const std::size_t qpu_id) const {
   return platformQPUs[qpu_id]->supportsRemoteSerializedCode();
diff --git a/runtime/cudaq/platform/quantum_platform.h b/runtime/cudaq/platform/quantum_platform.h
index b7641d1af84..f5985aafa0b 100644
--- a/runtime/cudaq/platform/quantum_platform.h
+++ b/runtime/cudaq/platform/quantum_platform.h
@@ -24,6 +24,8 @@
 namespace cudaq {

 class QPU;
+class gradient;
+class optimizer;
 class SerializedCodeExecutionContext;

 /// Typedefs for defining the connectivity structure of a QPU
@@ -114,6 +116,9 @@ class quantum_platform {
   /// @brief Return true if QPU is locally emulating a remote QPU
   bool is_emulated(const std::size_t qpuId = 0) const;

+  /// @brief Return whether the QPU has support for fully remote VQE execution
+  bool supports_remote_vqe(const std::size_t qpuId = 0) const;
+
   /// @brief Set the noise model for future invocations of
   /// quantum kernels.
   void set_noise(const noise_model *model);
@@ -132,6 +137,12 @@ class quantum_platform {
   /// @brief Enqueue a general task that runs on the specified QPU
   void enqueueAsyncTask(const std::size_t qpu_id, std::function<void()> &f);

+  /// @brief Launch a VQE operation on the platform.
+  void launchVQE(const std::string kernelName, const void *kernelArgs,
+                 cudaq::gradient *gradient, cudaq::spin_op H,
+                 cudaq::optimizer &optimizer, const int n_params,
+                 const std::size_t shots);
+
   // This method is the hook for the kernel rewrites to invoke
   // quantum kernels.
   void launchKernel(std::string kernelName, void (*kernelFunc)(void *),
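`quantum_platform::launchVQE` routes to a QPU using the platform's
thread-to-QPU map: registered worker threads get their own QPU, and any
unregistered thread falls back to QPU 0. A standalone mock of that lookup
logic (the map contents are invented for the demo):

#include <cstdio>
#include <functional>
#include <thread>
#include <unordered_map>

int main() {
  std::unordered_map<std::size_t, std::size_t> threadToQpuId; // tid -> QPU
  auto self = std::hash<std::thread::id>{}(std::this_thread::get_id());
  auto qpuFor = [&](std::size_t tid) {
    auto iter = threadToQpuId.find(tid);
    return iter != threadToQpuId.end() ? iter->second : std::size_t{0};
  };
  printf("before registration -> QPU %zu\n", qpuFor(self)); // falls back to 0
  threadToQpuId[self] = 1; // as async worker threads would be registered
  printf("after registration  -> QPU %zu\n", qpuFor(self)); // now 1
}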
diff --git a/targettests/Remote-Sim/vqe_h2.cpp b/targettests/Remote-Sim/vqe_h2.cpp
new file mode 100644
index 00000000000..e1764a121a0
--- /dev/null
+++ b/targettests/Remote-Sim/vqe_h2.cpp
@@ -0,0 +1,149 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.   *
+ ******************************************************************************/
+
+// REQUIRES: remote-sim
+
+// clang-format off
+// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu --remote-mqpu-auto-launch 1 %s -o %t && %t
+// clang-format on
+
+#include <cudaq.h>
+#include <cudaq/algorithm.h>
+#include <cudaq/gradients.h>
+#include <cudaq/optimizers.h>
+#include <cassert>
+
+// Here we build up a CUDA-Q kernel with N layers, each containing an
+// arrangement of random SO(4) rotations. The algorithm leverages the CUDA-Q
+// VQE support to compute the ground state of the Hydrogen molecule.
+
+// The SO4 entangler, written as a free-function CUDA-Q kernel since it is a
+// pure-device quantum kernel.
+__qpu__ void so4(cudaq::qubit &q, cudaq::qubit &r,
+                 const std::vector<double> &thetas) {
+  ry(thetas[0], q);
+  ry(thetas[1], r);
+
+  h(r);
+  cx(q, r);
+  h(r);
+
+  ry(thetas[2], q);
+  ry(thetas[3], r);
+
+  h(r);
+  cx(q, r);
+  h(r);
+
+  ry(thetas[4], q);
+  ry(thetas[5], r);
+
+  h(r);
+  cx(q, r);
+  h(r);
+}
+
+// The SO4 fabric CUDA-Q kernel. The number of qubits and entangling layers
+// are passed in as concrete (non-variational) arguments.
+struct so4_fabric {
+  void operator()(std::vector<double> params, int n_qubits,
+                  int n_layers) __qpu__ {
+    cudaq::qvector q(n_qubits);
+
+    x(q[0]);
+    x(q[2]);
+
+    const int block_size = 2;
+    int counter = 0;
+    for (int i = 0; i < n_layers; i++) {
+      // first layer of so4 blocks (even)
+      for (int k = 0; k < n_qubits; k += 2) {
+        auto subq = q.slice(k, block_size);
+        auto so4_params = cudaq::slice_vector(params, counter, 6);
+        so4(subq[0], subq[1], so4_params);
+        counter += 6;
+      }
+
+      // second layer of so4 blocks (odd)
+      for (int k = 1; k + block_size < n_qubits; k += 2) {
+        auto subq = q.slice(k, block_size);
+        auto so4_params = cudaq::slice_vector(params, counter, 6);
+        so4(subq[0], subq[1], so4_params);
+        counter += 6;
+      }
+    }
+  }
+};
+
+int main() {
+  // Build the H2 spin op from raw data (inlined here rather than read from a
+  // file).
+  std::vector<double> h2_data{0, 0, 0, 0, -0.10647701149499994, 0.0,
+                              1, 1, 1, 1, 0.0454063328691, 0.0,
+                              1, 1, 3, 3, 0.0454063328691, 0.0,
+                              3, 3, 1, 1, 0.0454063328691, 0.0,
+                              3, 3, 3, 3, 0.0454063328691, 0.0,
+                              2, 0, 0, 0, 0.170280101353, 0.0,
+                              2, 2, 0, 0, 0.120200490713, 0.0,
+                              2, 0, 2, 0, 0.168335986252, 0.0,
+                              2, 0, 0, 2, 0.165606823582, 0.0,
+                              0, 2, 0, 0, -0.22004130022499996, 0.0,
+                              0, 2, 2, 0, 0.165606823582, 0.0,
+                              0, 2, 0, 2, 0.174072892497, 0.0,
+                              0, 0, 2, 0, 0.17028010135300004, 0.0,
+                              0, 0, 2, 2, 0.120200490713, 0.0,
+                              0, 0, 0, 2, -0.22004130022499999, 0.0,
+                              15};
+  cudaq::spin_op H(h2_data, /*nQubits*/ 4);
+
+  // For this 4-qubit Hamiltonian: 3 SO(4) blocks per layer, 6 parameters per
+  // block, so 18 parameters per layer and 36 in total for 2 layers.
+  int n_layers = 2, n_qubits = H.num_qubits(), block_size = 2, p_counter = 0;
+  int n_blocks_per_layer = 2 * (n_qubits / block_size) - 1;
+  int n_params = n_layers * 6 * n_blocks_per_layer;
+  printf("%d qubit Hamiltonian -> %d parameters\n", n_qubits, n_params);
+
+  // Define the initial parameters and ansatz.
+  auto init_params =
+      cudaq::random_vector(-1, 1, n_params, std::mt19937::default_seed);
+
+  so4_fabric ansatz;
+
+  // Run VQE with lbfgs + central_difference
+  {
+    cudaq::optimizers::lbfgs optimizer;
+    optimizer.initial_parameters = init_params;
+    optimizer.max_eval = 20;
+    optimizer.max_line_search_trials = 10;
+    cudaq::gradients::central_difference gradient;
+    auto [opt_val, opt_params] = cudaq::vqe(ansatz, gradient, H, optimizer,
+                                            n_params, n_qubits, n_layers);
+    printf("Optimal value = %.16lf\n", opt_val);
+    assert(std::abs(opt_val - -1.1164613629294273) < 1e-3);
+  }
+  // Run VQE with cobyla
+  {
+    cudaq::optimizers::cobyla optimizer;
+    optimizer.initial_parameters = init_params;
+    optimizer.max_eval = 100;
+    auto [opt_val, opt_params] =
+        cudaq::vqe(ansatz, H, optimizer, n_params, n_qubits, n_layers);
+    printf("Optimal value = %.16lf\n", opt_val);
+    assert(std::abs(opt_val - -1.0769400650758392) < 1e-3);
+  }
+  // Run VQE with cobyla and a fixed number of shots
+  {
+    cudaq::optimizers::cobyla optimizer;
+    optimizer.initial_parameters = init_params;
+    optimizer.max_eval = 100;
+    auto [opt_val, opt_params] = cudaq::vqe(
+        /*shots=*/1000, ansatz, H, optimizer, n_params, n_qubits, n_layers);
+    printf("Optimal value = %.16lf\n", opt_val);
+    assert(std::abs(opt_val - -1.0769400650758392) < 1e-3);
+  }
+}