diff --git a/.github/workflows/nvqc_regression_tests.yml b/.github/workflows/nvqc_regression_tests.yml index 5057ae5710..afa4f006bf 100644 --- a/.github/workflows/nvqc_regression_tests.yml +++ b/.github/workflows/nvqc_regression_tests.yml @@ -127,7 +127,8 @@ jobs: # pauli_word: https://github.com/NVIDIA/cuda-quantum/issues/1957 # nested_vectors: related to vector of pauli_words (https://github.com/NVIDIA/cuda-quantum/issues/1957) # custom_operation: https://github.com/NVIDIA/cuda-quantum/issues/1985 - if [[ "$filename" != *"unsupport_args"* ]] && [[ "$filename" != *"state_overlap"* ]] && [[ "$filename" != *"compile_errors"* ]] && [[ "$filename" != *"nested_vectors"* ]] && [[ "$filename" != *"pauli_word"* ]] && [[ "$filename" != *"custom_operation"* ]]; then + # return_values: only supported in 0.8 NVQC service. + if [[ "$filename" != *"unsupport_args"* ]] && [[ "$filename" != *"state_overlap"* ]] && [[ "$filename" != *"compile_errors"* ]] && [[ "$filename" != *"nested_vectors"* ]] && [[ "$filename" != *"pauli_word"* ]] && [[ "$filename" != *"custom_operation"* ]] && [[ "$filename" != *"return_values"* ]]; then echo "$filename" nvqc_config="" # Look for a --remote-mqpu-auto-launch to determine the number of QPUs diff --git a/docs/sphinx/using/backends/platform.rst b/docs/sphinx/using/backends/platform.rst index fd6709dd95..d30d04c2f5 100644 --- a/docs/sphinx/using/backends/platform.rst +++ b/docs/sphinx/using/backends/platform.rst @@ -267,4 +267,7 @@ language constructs within quantum kernels may not yet be fully supported. * - Single-level nested `std::vector` of supported `std::vector` types - `std::vector>`, `std::vector`, etc. - Number of top-level elements (as a 64-bit integer) followed sizes in bytes of element vectors (as a contiguous array of 64-bit integers) then serialized data of the inner vectors. 
- \ No newline at end of file + +For CUDA-Q kernels that return a value, the remote platform supports the following simple return types: +`bool`, integral types (e.g., `int` or `std::size_t`), and floating-point types (`float` or `double`). +This requires MLIR-based compilation to be enabled (:code:`--enable-mlir`). diff --git a/runtime/common/BaseRemoteSimulatorQPU.h b/runtime/common/BaseRemoteSimulatorQPU.h index 1fbed52564..0c897cd4b3 100644 --- a/runtime/common/BaseRemoteSimulatorQPU.h +++ b/runtime/common/BaseRemoteSimulatorQPU.h @@ -126,8 +126,17 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { // set up a single-shot execution context for this case. static thread_local cudaq::ExecutionContext defaultContext("sample", /*shots=*/1); + // This is a kernel invocation outside the CUDA-Q APIs (sample/observe). + const bool isDirectInvocation = !executionContextPtr; cudaq::ExecutionContext &executionContext = executionContextPtr ? *executionContextPtr : defaultContext; + + // Populate the conditional feedback metadata if this is a direct + // invocation (not otherwise populated by cudaq::sample) + if (isDirectInvocation) + executionContext.hasConditionalsOnMeasureResults = + cudaq::kernelHasConditionalFeedback(name); + std::string errorMsg; const bool requestOkay = m_client->sendRequest( *m_mlirContext, executionContext, /*serializedCodeContext=*/nullptr, @@ -135,6 +144,30 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU { m_simName, name, kernelFunc, args, voidStarSize, &errorMsg); if (!requestOkay) throw std::runtime_error("Failed to launch kernel. 
Error: " + errorMsg); + if (isDirectInvocation && + !executionContext.invocationResultBuffer.empty()) { + if (executionContext.invocationResultBuffer.size() + resultOffset > + voidStarSize) + throw std::runtime_error( + "Unexpected result: return type size of " + + std::to_string(executionContext.invocationResultBuffer.size()) + + " bytes overflows the argument buffer."); + // Currently, we only support result buffer serialization on little-endian + // CPUs (x86, ARM, PPC64LE). + // Note: the NVQC service always uses little-endian. If + // the client (e.g., compiled from source) is built for big-endian, we + // throw an error if result buffer data is returned. + if (llvm::sys::IsBigEndianHost) + throw std::runtime_error( + "Serializing the result buffer from a remote kernel invocation is " + "not supported for BigEndian CPU architectures."); + + char *resultBuf = reinterpret_cast(args) + resultOffset; + // Copy the result data to the args buffer. + std::memcpy(resultBuf, executionContext.invocationResultBuffer.data(), + executionContext.invocationResultBuffer.size()); + executionContext.invocationResultBuffer.clear(); + } } void diff --git a/runtime/common/ExecutionContext.h b/runtime/common/ExecutionContext.h index d7f763dd62..70c0827e49 100644 --- a/runtime/common/ExecutionContext.h +++ b/runtime/common/ExecutionContext.h @@ -100,6 +100,11 @@ class ExecutionContext { /// register after execution. Empty means no reordering. std::vector reorderIdx; + /// @brief A buffer containing the return value of a kernel invocation. + /// Note: this is only needed for invocations that cannot return a + /// `sample_result`. 
+ std::vector invocationResultBuffer; + /// @brief The Constructor, takes the name of the context /// @param n The name of the context ExecutionContext(const std::string n) : name(n) {} diff --git a/runtime/common/JsonConvert.h b/runtime/common/JsonConvert.h index 0c770dd258..67da14cea0 100644 --- a/runtime/common/JsonConvert.h +++ b/runtime/common/JsonConvert.h @@ -153,6 +153,9 @@ inline void to_json(json &j, const ExecutionContext &context) { if (context.amplitudeMaps.has_value()) j["amplitudeMaps"] = context.amplitudeMaps.value(); + + if (!context.invocationResultBuffer.empty()) + j["invocationResultBuffer"] = context.invocationResultBuffer; } inline void from_json(const json &j, ExecutionContext &context) { @@ -214,6 +217,9 @@ inline void from_json(const json &j, ExecutionContext &context) { if (j.contains("amplitudeMaps")) context.amplitudeMaps = j["amplitudeMaps"]; + + if (j.contains("invocationResultBuffer")) + context.invocationResultBuffer = j["invocationResultBuffer"]; } // Enum data to denote the payload format. 
diff --git a/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp b/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp index f54f07299a..b532ba171c 100644 --- a/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp +++ b/runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp @@ -420,7 +420,7 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer { io_context.hasConditionalsOnMeasureResults) { // Need to run simulation shot-by-shot cudaq::sample_result counts; - invokeMlirKernel(m_mlirContext, ir, requestInfo.passes, + invokeMlirKernel(io_context, m_mlirContext, ir, requestInfo.passes, std::string(kernelName), io_context.shots, [&](std::size_t i) { // Reset the context and get the single @@ -436,7 +436,7 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer { io_context.result = counts; platform.set_exec_ctx(&io_context); } else { - invokeMlirKernel(m_mlirContext, ir, requestInfo.passes, + invokeMlirKernel(io_context, m_mlirContext, ir, requestInfo.passes, std::string(kernelName)); } } @@ -537,7 +537,8 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer { } void - invokeMlirKernel(std::unique_ptr &contextPtr, + invokeMlirKernel(cudaq::ExecutionContext &io_context, + std::unique_ptr &contextPtr, std::string_view irString, const std::vector &passes, const std::string &entryPointFn, std::size_t numTimes = 1, @@ -549,21 +550,56 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer { if (!module) throw std::runtime_error("Failed to parse the input MLIR code"); auto engine = jitMlirCode(*module, passes); + llvm::SmallVector returnArg; const std::string entryPointFunc = std::string(cudaq::runtime::cudaqGenPrefixName) + entryPointFn; - auto fnPtr = - getValueOrThrow(engine->lookup(entryPointFunc), - "Failed to look up entry-point function symbol"); - if (!fnPtr) - throw std::runtime_error("Failed to get entry function"); + if (auto funcOp = 
module->lookupSymbol(entryPointFunc)) { + auto funcTy = funcOp.getFunctionType(); + auto returnTy = funcTy.getReturnType(); + // These are the return types that we support. + if (returnTy.isF32()) { + io_context.invocationResultBuffer.resize(sizeof(float)); + returnArg.push_back(io_context.invocationResultBuffer.data()); + } else if (returnTy.isF64()) { + io_context.invocationResultBuffer.resize(sizeof(double)); + returnArg.push_back(io_context.invocationResultBuffer.data()); + } else if (returnTy.isInteger(1)) { + static_assert(sizeof(bool) == sizeof(char), + "Incompatible boolean data type. CUDA-Q kernels expect " + "sizeof(bool) == sizeof(char)."); + io_context.invocationResultBuffer.resize(sizeof(bool)); + returnArg.push_back(io_context.invocationResultBuffer.data()); + } else if (returnTy.isIntOrIndex()) { + io_context.invocationResultBuffer.resize( + (returnTy.getIntOrFloatBitWidth() + 7) / 8); + returnArg.push_back(io_context.invocationResultBuffer.data()); + } + } - auto fn = reinterpret_cast(fnPtr); - simulationStart = std::chrono::high_resolution_clock::now(); - for (std::size_t i = 0; i < numTimes; ++i) { - // Invoke the kernel - fn(); - if (postExecCallback) { - postExecCallback(i); + // Note: currently, we only return data from the kernel on single-shot + // execution. Once we enable arbitrary sample return types, we can run this + // in a loop and return a vector of the return type. 
+ if (numTimes == 1 && !returnArg.empty()) { + simulationStart = std::chrono::high_resolution_clock::now(); + llvm::Error error = engine->invokePacked(entryPointFunc, returnArg); + if (error) + throw std::runtime_error("JIT invocation failed"); + if (postExecCallback) + postExecCallback(0); + } else { + auto fnPtr = + getValueOrThrow(engine->lookup(entryPointFunc), + "Failed to look up entry-point function symbol"); + if (!fnPtr) + throw std::runtime_error("Failed to get entry function"); + + auto fn = reinterpret_cast(fnPtr); + simulationStart = std::chrono::high_resolution_clock::now(); + for (std::size_t i = 0; i < numTimes; ++i) { + // Invoke the kernel + fn(); + if (postExecCallback) + postExecCallback(i); } } } diff --git a/targettests/Remote-Sim/return_values.cpp b/targettests/Remote-Sim/return_values.cpp new file mode 100644 index 0000000000..a78e8a2c30 --- /dev/null +++ b/targettests/Remote-Sim/return_values.cpp @@ -0,0 +1,101 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +// REQUIRES: remote-sim +// REQUIRES: c++20 + +// clang-format off +// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu --remote-mqpu-auto-launch 1 %s -o %t && %t +// clang-format on + +#include +#include + +struct rwpe { + double operator()(const int n_iter, double mu, double sigma) __qpu__ { + int iteration = 0; + + // Allocate the qubits + cudaq::qvector q(2); + + // Alias them + auto &aux = q.front(); + auto &target = q.back(); + + x(q[1]); + + while (iteration < n_iter) { + h(aux); + rz(1.0 - (mu / sigma), aux); + rz(.25 / sigma, target); + x(aux, target); + rz(-.25 / sigma, target); + x(aux, target); + h(aux); + if (mz(aux)) { + x(aux); + mu += sigma * .6065; + } else { + mu -= sigma * .6065; + } + + sigma *= .7951; + iteration += 1; + } + + return 2. * mu; + } +}; + +struct returnTrue { + bool operator()() __qpu__ { + cudaq::qubit q; + x(q); + return mz(q); + } +}; + +struct returnFalse { + bool operator()() __qpu__ { + cudaq::qubit q, r; + x(q); + return mz(q) && mz(r); + } +}; + +struct returnInt { + int operator()(int iters) __qpu__ { + cudaq::qubit q; + int count = 0; + for (int i = 0; i < iters; ++i) { + h(q); + if (mz(q)) { + count++; + x(q); + } + } + return count; + } +}; + +int main() { + int n_iterations = 24; + double mu = 0.7951, sigma = 0.6065; + auto phase = rwpe{}(n_iterations, mu, sigma); + + assert(std::abs(phase - 0.49) < 0.05); + + assert(returnTrue{}()); + + assert(!returnFalse{}()); + cudaq::set_random_seed(123); + const int oneCount = returnInt{}(1000); + std::cout << "One count = " << oneCount << "\n"; + // We expect ~ 50% one. + assert(oneCount > 100); +}