Skip to content

Commit

Permalink
[NVQC] Support direct kernel invocation with a return value (#1969)
Browse files Browse the repository at this point in the history
* Enable simple return values from remote execution in MLIR/NVQC mode

* Don't test return values with 0.7 nvqc service

* Update runtime/common/BaseRemoteSimulatorQPU.h

Co-authored-by: Eric Schweitz <eschweitz@nvidia.com>

* Update runtime/common/ExecutionContext.h

Co-authored-by: Eric Schweitz <eschweitz@nvidia.com>

* Update runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp

Co-authored-by: Eric Schweitz <eschweitz@nvidia.com>

* Update runtime/cudaq/platform/default/rest_server/helpers/RestRemoteServer.cpp

Co-authored-by: Eric Schweitz <eschweitz@nvidia.com>

* Add error checks and docs

* Code format

---------

Co-authored-by: Eric Schweitz <eschweitz@nvidia.com>
  • Loading branch information
1tnguyen and schweitzpgi authored Jul 25, 2024
1 parent fead557 commit 478e00c
Show file tree
Hide file tree
Showing 7 changed files with 202 additions and 17 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/nvqc_regression_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ jobs:
# pauli_word: https://github.com/NVIDIA/cuda-quantum/issues/1957
# nested_vectors: related to vector of pauli_words (https://github.com/NVIDIA/cuda-quantum/issues/1957)
# custom_operation: https://github.com/NVIDIA/cuda-quantum/issues/1985
if [[ "$filename" != *"unsupport_args"* ]] && [[ "$filename" != *"state_overlap"* ]] && [[ "$filename" != *"compile_errors"* ]] && [[ "$filename" != *"nested_vectors"* ]] && [[ "$filename" != *"pauli_word"* ]] && [[ "$filename" != *"custom_operation"* ]]; then
# return_values: only supported in 0.8 NVQC service.
if [[ "$filename" != *"unsupport_args"* ]] && [[ "$filename" != *"state_overlap"* ]] && [[ "$filename" != *"compile_errors"* ]] && [[ "$filename" != *"nested_vectors"* ]] && [[ "$filename" != *"pauli_word"* ]] && [[ "$filename" != *"custom_operation"* ]] && [[ "$filename" != *"return_values"* ]]; then
echo "$filename"
nvqc_config=""
# Look for a --remote-mqpu-auto-launch to determine the number of QPUs
Expand Down
5 changes: 4 additions & 1 deletion docs/sphinx/using/backends/platform.rst
Original file line number Diff line number Diff line change
Expand Up @@ -267,4 +267,7 @@ language constructs within quantum kernels may not yet be fully supported.
* - Single-level nested `std::vector` of supported `std::vector` types
- `std::vector<std::vector<int>>`, `std::vector<cudaq::pauli_word>`, etc.
- Number of top-level elements (as a 64-bit integer) followed by the sizes in bytes of the element vectors (as a contiguous array of 64-bit integers), then the serialized data of the inner vectors.


For CUDA-Q kernels that return a value, the remote platform supports returning simple data types of
`bool`, integral (e.g., `int` or `std::size_t`), and floating-point types (`float` or `double`)
when MLIR-based compilation is enabled (:code:`--enable-mlir`).
33 changes: 33 additions & 0 deletions runtime/common/BaseRemoteSimulatorQPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,15 +126,48 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU {
// set up a single-shot execution context for this case.
static thread_local cudaq::ExecutionContext defaultContext("sample",
/*shots=*/1);
// This is a kernel invocation outside the CUDA-Q APIs (sample/observe).
const bool isDirectInvocation = !executionContextPtr;
cudaq::ExecutionContext &executionContext =
executionContextPtr ? *executionContextPtr : defaultContext;

// Populate the conditional feedback metadata if this is a direct
// invocation (not otherwise populated by cudaq::sample)
if (isDirectInvocation)
executionContext.hasConditionalsOnMeasureResults =
cudaq::kernelHasConditionalFeedback(name);

std::string errorMsg;
const bool requestOkay = m_client->sendRequest(
*m_mlirContext, executionContext, /*serializedCodeContext=*/nullptr,
/*vqe_gradient=*/nullptr, /*vqe_optimizer=*/nullptr, /*vqe_n_params=*/0,
m_simName, name, kernelFunc, args, voidStarSize, &errorMsg);
if (!requestOkay)
throw std::runtime_error("Failed to launch kernel. Error: " + errorMsg);
if (isDirectInvocation &&
!executionContext.invocationResultBuffer.empty()) {
if (executionContext.invocationResultBuffer.size() + resultOffset >
voidStarSize)
throw std::runtime_error(
"Unexpected result: return type size of " +
std::to_string(executionContext.invocationResultBuffer.size()) +
" bytes overflows the argument buffer.");
// Currently, we only support result buffer serialization on LittleEndian
// CPUs (x86, ARM, PPC64LE).
// Note: NVQC service will always be using LE. If
// the client (e.g., compiled from source) is built for big-endian, we
// will throw an error if result buffer data is returned.
if (llvm::sys::IsBigEndianHost)
throw std::runtime_error(
"Serializing the result buffer from a remote kernel invocation is "
"not supported for BigEndian CPU architectures.");

char *resultBuf = reinterpret_cast<char *>(args) + resultOffset;
// Copy the result data to the args buffer.
std::memcpy(resultBuf, executionContext.invocationResultBuffer.data(),
executionContext.invocationResultBuffer.size());
executionContext.invocationResultBuffer.clear();
}
}

void
Expand Down
5 changes: 5 additions & 0 deletions runtime/common/ExecutionContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,11 @@ class ExecutionContext {
/// register after execution. Empty means no reordering.
std::vector<std::size_t> reorderIdx;

/// @brief A buffer containing the return value of a kernel invocation.
/// Note: this is only needed for invocations that are unable to return a
/// `sample_result` (e.g., direct kernel launches outside sample/observe).
std::vector<char> invocationResultBuffer;

/// @brief The Constructor, takes the name of the context
/// @param n The name of the context
ExecutionContext(const std::string n) : name(n) {}
Expand Down
6 changes: 6 additions & 0 deletions runtime/common/JsonConvert.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ inline void to_json(json &j, const ExecutionContext &context) {

if (context.amplitudeMaps.has_value())
j["amplitudeMaps"] = context.amplitudeMaps.value();

if (!context.invocationResultBuffer.empty())
j["invocationResultBuffer"] = context.invocationResultBuffer;
}

inline void from_json(const json &j, ExecutionContext &context) {
Expand Down Expand Up @@ -214,6 +217,9 @@ inline void from_json(const json &j, ExecutionContext &context) {

if (j.contains("amplitudeMaps"))
context.amplitudeMaps = j["amplitudeMaps"];

if (j.contains("invocationResultBuffer"))
context.invocationResultBuffer = j["invocationResultBuffer"];
}

// Enum data to denote the payload format.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
io_context.hasConditionalsOnMeasureResults) {
// Need to run simulation shot-by-shot
cudaq::sample_result counts;
invokeMlirKernel(m_mlirContext, ir, requestInfo.passes,
invokeMlirKernel(io_context, m_mlirContext, ir, requestInfo.passes,
std::string(kernelName), io_context.shots,
[&](std::size_t i) {
// Reset the context and get the single
Expand All @@ -436,7 +436,7 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
io_context.result = counts;
platform.set_exec_ctx(&io_context);
} else {
invokeMlirKernel(m_mlirContext, ir, requestInfo.passes,
invokeMlirKernel(io_context, m_mlirContext, ir, requestInfo.passes,
std::string(kernelName));
}
}
Expand Down Expand Up @@ -537,7 +537,8 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
}

void
invokeMlirKernel(std::unique_ptr<MLIRContext> &contextPtr,
invokeMlirKernel(cudaq::ExecutionContext &io_context,
std::unique_ptr<MLIRContext> &contextPtr,
std::string_view irString,
const std::vector<std::string> &passes,
const std::string &entryPointFn, std::size_t numTimes = 1,
Expand All @@ -549,21 +550,56 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
if (!module)
throw std::runtime_error("Failed to parse the input MLIR code");
auto engine = jitMlirCode(*module, passes);
llvm::SmallVector<void *> returnArg;
const std::string entryPointFunc =
std::string(cudaq::runtime::cudaqGenPrefixName) + entryPointFn;
auto fnPtr =
getValueOrThrow(engine->lookup(entryPointFunc),
"Failed to look up entry-point function symbol");
if (!fnPtr)
throw std::runtime_error("Failed to get entry function");
if (auto funcOp = module->lookupSymbol<LLVM::LLVMFuncOp>(entryPointFunc)) {
auto funcTy = funcOp.getFunctionType();
auto returnTy = funcTy.getReturnType();
// These are the returned types that we support.
if (returnTy.isF32()) {
io_context.invocationResultBuffer.resize(sizeof(float));
returnArg.push_back(io_context.invocationResultBuffer.data());
} else if (returnTy.isF64()) {
io_context.invocationResultBuffer.resize(sizeof(double));
returnArg.push_back(io_context.invocationResultBuffer.data());
} else if (returnTy.isInteger(1)) {
static_assert(sizeof(bool) == sizeof(char),
"Incompatible boolean data type. CUDA-Q kernels expect "
"sizeof(bool) == sizeof(char).");
io_context.invocationResultBuffer.resize(sizeof(bool));
returnArg.push_back(io_context.invocationResultBuffer.data());
} else if (returnTy.isIntOrIndex()) {
io_context.invocationResultBuffer.resize(
(returnTy.getIntOrFloatBitWidth() + 7) / 8);
returnArg.push_back(io_context.invocationResultBuffer.data());
}
}

auto fn = reinterpret_cast<void (*)()>(fnPtr);
simulationStart = std::chrono::high_resolution_clock::now();
for (std::size_t i = 0; i < numTimes; ++i) {
// Invoke the kernel
fn();
if (postExecCallback) {
postExecCallback(i);
// Note: currently, we only return data from the kernel on single-shot
// execution. Once arbitrary sample return types are enabled, we can run this
// in a loop and return a vector of the return type.
if (numTimes == 1 && !returnArg.empty()) {
simulationStart = std::chrono::high_resolution_clock::now();
llvm::Error error = engine->invokePacked(entryPointFunc, returnArg);
if (error)
throw std::runtime_error("JIT invocation failed");
if (postExecCallback)
postExecCallback(0);
} else {
auto fnPtr =
getValueOrThrow(engine->lookup(entryPointFunc),
"Failed to look up entry-point function symbol");
if (!fnPtr)
throw std::runtime_error("Failed to get entry function");

auto fn = reinterpret_cast<void (*)()>(fnPtr);
simulationStart = std::chrono::high_resolution_clock::now();
for (std::size_t i = 0; i < numTimes; ++i) {
// Invoke the kernel
fn();
if (postExecCallback)
postExecCallback(i);
}
}
}
Expand Down
101 changes: 101 additions & 0 deletions targettests/Remote-Sim/return_values.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*******************************************************************************
* Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. *
* All rights reserved. *
* *
* This source code and the accompanying materials are made available under *
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/

// REQUIRES: remote-sim
// REQUIRES: c++20

// clang-format off
// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu --remote-mqpu-auto-launch 1 %s -o %t && %t
// clang-format on

#include <cudaq.h>

#include <cassert>
#include <cmath>
#include <iostream>

// Random-walk phase estimation style kernel (presumably; inferred from the
// name and the caller's use of the result as a phase — TODO confirm).
// Runs `n_iter` rounds of measurement-conditioned feedback, updating the
// classical walk parameters (mu, sigma) from each mid-circuit result, and
// returns the final estimate `2 * mu` as a floating-point value.
struct rwpe {
  double operator()(const int n_iter, double mu, double sigma) __qpu__ {
    int iteration = 0;

    // Allocate the qubits
    cudaq::qvector q(2);

    // Alias them
    auto &aux = q.front();
    auto &target = q.back();

    // Prepare the target in |1>. Use the `target` alias consistently instead
    // of indexing `q[1]` directly (same qubit — q.back() == q[1] here).
    x(target);

    while (iteration < n_iter) {
      h(aux);
      rz(1.0 - (mu / sigma), aux);
      rz(.25 / sigma, target);
      x<cudaq::ctrl>(aux, target);
      rz(-.25 / sigma, target);
      x<cudaq::ctrl>(aux, target);
      h(aux);
      // Mid-circuit measurement with conditional feedback: the classical
      // center mu is nudged up or down depending on the outcome, and the
      // auxiliary qubit is reset to |0> when it measured |1>.
      if (mz(aux)) {
        x(aux);
        mu += sigma * .6065;
      } else {
        mu -= sigma * .6065;
      }

      // Narrow the walk each iteration.
      sigma *= .7951;
      iteration += 1;
    }

    return 2. * mu;
  }
};

// Kernel with a boolean return value that is always `true`: the single qubit
// is flipped to |1> before being measured.
struct returnTrue {
  bool operator()() __qpu__ {
    cudaq::qubit qb;
    x(qb);
    return mz(qb);
  }
};

// Kernel with a boolean return value that is always `false`: `first` is
// flipped to |1> (measures true) while `second` stays in |0> (measures
// false), so the conjunction is false.
struct returnFalse {
  bool operator()() __qpu__ {
    cudaq::qubit first;
    cudaq::qubit second;
    x(first);
    return mz(first) && mz(second);
  }
};

// Kernel with an integer return value: performs `iters` Hadamard+measure
// rounds and counts how many yield `1`, flipping the qubit back to |0>
// whenever it was measured in |1>.
struct returnInt {
  int operator()(int iters) __qpu__ {
    cudaq::qubit qb;
    int ones = 0;
    for (int round = 0; round < iters; ++round) {
      h(qb);
      if (mz(qb)) {
        ++ones;
        x(qb);
      }
    }
    return ones;
  }
};

int main() {
  // Floating-point return value from a kernel with conditional feedback.
  const int n_iterations = 24;
  const double mu = 0.7951;
  const double sigma = 0.6065;
  const auto phase = rwpe{}(n_iterations, mu, sigma);
  assert(std::abs(phase - 0.49) < 0.05);

  // Boolean return values.
  assert(returnTrue{}());
  assert(!returnFalse{}());

  // Integer return value; with a fair coin over 1000 shots we expect ~50%
  // ones, so anything above 100 is a comfortable lower bound.
  cudaq::set_random_seed(123);
  const int oneCount = returnInt{}(1000);
  std::cout << "One count = " << oneCount << "\n";
  assert(oneCount > 100);
}

0 comments on commit 478e00c

Please sign in to comment.