Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NVQC] Support direct kernel invocation with a return value #1969

Merged
merged 11 commits into from
Jul 25, 2024
3 changes: 2 additions & 1 deletion .github/workflows/nvqc_regression_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,8 @@ jobs:
# unsupport_args and compile_errors are compile error tests
# pauli_word: https://github.com/NVIDIA/cuda-quantum/issues/1957
# nested_vectors: related to vector of pauli_words (https://github.com/NVIDIA/cuda-quantum/issues/1957)
if [[ "$filename" != *"unsupport_args"* ]] && [[ "$filename" != *"state_overlap"* ]] && [[ "$filename" != *"compile_errors"* ]] && [[ "$filename" != *"nested_vectors"* ]] && [[ "$filename" != *"pauli_word"* ]]; then
# return_values: only supported in 0.8 NVQC service.
if [[ "$filename" != *"unsupport_args"* ]] && [[ "$filename" != *"state_overlap"* ]] && [[ "$filename" != *"compile_errors"* ]] && [[ "$filename" != *"nested_vectors"* ]] && [[ "$filename" != *"pauli_word"* ]] && [[ "$filename" != *"return_values"* ]]; then
echo "$filename"
nvqc_config=""
# Look for a --remote-mqpu-auto-launch to determine the number of QPUs
Expand Down
23 changes: 23 additions & 0 deletions runtime/common/BaseRemoteSimulatorQPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,15 +126,38 @@ class BaseRemoteSimulatorQPU : public cudaq::QPU {
// set up a single-shot execution context for this case.
static thread_local cudaq::ExecutionContext defaultContext("sample",
/*shots=*/1);
// This is a kernel invocation outside the CUDA-Q APIs (sample/observe).
const bool isDirectInvocation = !executionContextPtr;
cudaq::ExecutionContext &executionContext =
executionContextPtr ? *executionContextPtr : defaultContext;

// Populate the conditional feedback metadata if this is a direct
// invocation (not otherwise populated by cudaq::sample)
if (isDirectInvocation)
executionContext.hasConditionalsOnMeasureResults =
cudaq::kernelHasConditionalFeedback(name);

std::string errorMsg;
const bool requestOkay = m_client->sendRequest(
*m_mlirContext, executionContext, /*serializedCodeContext=*/nullptr,
/*vqe_gradient=*/nullptr, /*vqe_optimizer=*/nullptr, /*vqe_n_params=*/0,
m_simName, name, kernelFunc, args, voidStarSize, &errorMsg);
if (!requestOkay)
throw std::runtime_error("Failed to launch kernel. Error: " + errorMsg);
if (isDirectInvocation &&
!executionContext.invocationResultBuffer.empty()) {
if (executionContext.invocationResultBuffer.size() + resultOffset >
voidStarSize)
throw std::runtime_error(
"Unexpected result: return type size of " +
std::to_string(executionContext.invocationResultBuffer.size()) +
" bytes overflows the argument buffer.");
char *resultBuf = reinterpret_cast<char *>(args) + resultOffset;
// Copy the result data to the args buffer.
std::memcpy(resultBuf, executionContext.invocationResultBuffer.data(),
executionContext.invocationResultBuffer.size());
executionContext.invocationResultBuffer.clear();
}
}

void
Expand Down
5 changes: 5 additions & 0 deletions runtime/common/ExecutionContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,11 @@ class ExecutionContext {
/// register after execution. Empty means no reordering.
std::vector<std::size_t> reorderIdx;

/// @brief A buffer containing the return value of a kernel invocation.
/// Note: this is only needed for invocations that cannot return a
/// `sample_result`.
std::vector<char> invocationResultBuffer;
1tnguyen marked this conversation as resolved.
Show resolved Hide resolved

/// @brief The Constructor, takes the name of the context
/// @param n The name of the context
ExecutionContext(const std::string n) : name(n) {}
Expand Down
6 changes: 6 additions & 0 deletions runtime/common/JsonConvert.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ inline void to_json(json &j, const ExecutionContext &context) {

if (context.amplitudeMaps.has_value())
j["amplitudeMaps"] = context.amplitudeMaps.value();

if (!context.invocationResultBuffer.empty())
j["invocationResultBuffer"] = context.invocationResultBuffer;
}

inline void from_json(const json &j, ExecutionContext &context) {
Expand Down Expand Up @@ -214,6 +217,9 @@ inline void from_json(const json &j, ExecutionContext &context) {

if (j.contains("amplitudeMaps"))
context.amplitudeMaps = j["amplitudeMaps"];

if (j.contains("invocationResultBuffer"))
context.invocationResultBuffer = j["invocationResultBuffer"];
}

// Enum data to denote the payload format.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
io_context.hasConditionalsOnMeasureResults) {
// Need to run simulation shot-by-shot
cudaq::sample_result counts;
invokeMlirKernel(m_mlirContext, ir, requestInfo.passes,
invokeMlirKernel(io_context, m_mlirContext, ir, requestInfo.passes,
std::string(kernelName), io_context.shots,
[&](std::size_t i) {
// Reset the context and get the single
Expand All @@ -436,7 +436,7 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
io_context.result = counts;
platform.set_exec_ctx(&io_context);
} else {
invokeMlirKernel(m_mlirContext, ir, requestInfo.passes,
invokeMlirKernel(io_context, m_mlirContext, ir, requestInfo.passes,
std::string(kernelName));
}
}
Expand Down Expand Up @@ -537,7 +537,8 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
}

void
invokeMlirKernel(std::unique_ptr<MLIRContext> &contextPtr,
invokeMlirKernel(cudaq::ExecutionContext &io_context,
std::unique_ptr<MLIRContext> &contextPtr,
std::string_view irString,
const std::vector<std::string> &passes,
const std::string &entryPointFn, std::size_t numTimes = 1,
Expand All @@ -549,21 +550,54 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
if (!module)
throw std::runtime_error("Failed to parse the input MLIR code");
auto engine = jitMlirCode(*module, passes);
llvm::SmallVector<void *> returnArg;
1tnguyen marked this conversation as resolved.
Show resolved Hide resolved
const std::string entryPointFunc =
std::string(cudaq::runtime::cudaqGenPrefixName) + entryPointFn;
auto fnPtr =
getValueOrThrow(engine->lookup(entryPointFunc),
"Failed to look up entry-point function symbol");
if (!fnPtr)
throw std::runtime_error("Failed to get entry function");
if (auto funcOp = module->lookupSymbol<LLVM::LLVMFuncOp>(entryPointFunc)) {
auto funcTy = funcOp.getFunctionType();
auto returnTy = funcTy.getReturnType();
// These are the returned types that we support.
1tnguyen marked this conversation as resolved.
Show resolved Hide resolved
if (returnTy.isF32()) {
io_context.invocationResultBuffer.resize(sizeof(float));
returnArg.push_back(io_context.invocationResultBuffer.data());
} else if (returnTy.isF64()) {
io_context.invocationResultBuffer.resize(sizeof(double));
returnArg.push_back(io_context.invocationResultBuffer.data());
} else if (returnTy.isInteger(1)) {
io_context.invocationResultBuffer.resize(sizeof(bool));
1tnguyen marked this conversation as resolved.
Show resolved Hide resolved
returnArg.push_back(io_context.invocationResultBuffer.data());
} else if (returnTy.isIntOrIndex()) {
io_context.invocationResultBuffer.resize(
(returnTy.getIntOrFloatBitWidth() + 7) / 8);
returnArg.push_back(io_context.invocationResultBuffer.data());
}
}

auto fn = reinterpret_cast<void (*)()>(fnPtr);
simulationStart = std::chrono::high_resolution_clock::now();
for (std::size_t i = 0; i < numTimes; ++i) {
// Invoke the kernel
fn();
if (postExecCallback) {
postExecCallback(i);
// Note: currently, we only return data from kernel on single-shot
// execution. Once we enable arbitrary sample return type, we can run this
// in a loop and return a vector of return type.
if (numTimes == 1 && !returnArg.empty()) {
simulationStart = std::chrono::high_resolution_clock::now();
llvm::Error error = engine->invokePacked(entryPointFunc, returnArg);
if (error)
throw std::runtime_error("JIT invocation failed");
if (postExecCallback)
postExecCallback(0);
} else {
auto fnPtr =
getValueOrThrow(engine->lookup(entryPointFunc),
"Failed to look up entry-point function symbol");
if (!fnPtr)
throw std::runtime_error("Failed to get entry function");

auto fn = reinterpret_cast<void (*)()>(fnPtr);
simulationStart = std::chrono::high_resolution_clock::now();
for (std::size_t i = 0; i < numTimes; ++i) {
// Invoke the kernel
fn();
if (postExecCallback)
postExecCallback(i);

}
}
}
Expand Down
101 changes: 101 additions & 0 deletions targettests/Remote-Sim/return_values.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*******************************************************************************
* Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. *
* All rights reserved. *
* *
* This source code and the accompanying materials are made available under *
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/

// REQUIRES: remote-sim
// REQUIRES: c++20

// clang-format off
// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu --remote-mqpu-auto-launch 1 %s -o %t && %t
// clang-format on

#include <cudaq.h>

#include <cassert>
#include <cmath>
#include <iostream>

/// Iterative two-qubit kernel that returns a `double`
/// (presumably "random-walk phase estimation", going by the name -- confirm).
///
/// Each round applies a fixed gate sequence to the pair, measures the first
/// qubit, and moves `mu` up or down by `sigma * .6065` depending on the
/// outcome; `sigma` is then scaled by `.7951`.
///
/// @param n_iter Number of rounds to execute.
/// @param mu     Running estimate, updated every round.
/// @param sigma  Running step width, shrunk every round.
/// @return `2. * mu` after the final round.
struct rwpe {
  double operator()(const int n_iter, double mu, double sigma) __qpu__ {
    // Two-qubit register; the first qubit is measured each round, the last
    // one is only acted on by gates.
    cudaq::qvector q(2);
    auto &ancilla = q.front();
    auto &system = q.back();

    // Flip the last qubit once, before the first round.
    x(system);

    for (int round = 0; round < n_iter; ++round) {
      h(ancilla);
      rz(1.0 - (mu / sigma), ancilla);
      rz(.25 / sigma, system);
      x<cudaq::ctrl>(ancilla, system);
      rz(-.25 / sigma, system);
      x<cudaq::ctrl>(ancilla, system);
      h(ancilla);

      // Branch on the mid-circuit measurement: a `1` outcome flips the
      // measured qubit back and raises the estimate, a `0` lowers it.
      if (mz(ancilla)) {
        x(ancilla);
        mu += sigma * .6065;
      } else {
        mu -= sigma * .6065;
      }

      sigma *= .7951;
    }

    return 2. * mu;
  }
};

/// Kernel returning a `bool`: applies `x` to a freshly allocated qubit and
/// returns its measurement result.
struct returnTrue {
  bool operator()() __qpu__ {
    cudaq::qubit flipped;
    x(flipped);
    return mz(flipped);
  }
};

/// Kernel returning a `bool`: allocates two qubits, applies `x` to the first
/// only, and returns the logical AND of both measurement results.
struct returnFalse {
  bool operator()() __qpu__ {
    cudaq::qubit flipped;
    cudaq::qubit untouched;
    x(flipped);
    // `untouched` never has a gate applied to it before being measured.
    return mz(flipped) && mz(untouched);
  }
};

/// Kernel returning an `int`: repeats "apply `h`, measure" on a single qubit
/// and counts how many measurements came back `1`.
///
/// @param iters Number of `h` + measure rounds to run.
/// @return Number of rounds whose measurement result was `1`.
struct returnInt {
  int operator()(int iters) __qpu__ {
    cudaq::qubit q;
    int ones = 0;
    for (int round = 0; round < iters; ++round) {
      h(q);
      if (mz(q)) {
        ones += 1;
        // Flip the qubit back after a `1` outcome before the next round.
        x(q);
      }
    }
    return ones;
  }
};

int main() {
int n_iterations = 24;
double mu = 0.7951, sigma = 0.6065;
auto phase = rwpe{}(n_iterations, mu, sigma);

assert(std::abs(phase - 0.49) < 0.05);

assert(returnTrue{}());

assert(!returnFalse{}());
cudaq::set_random_seed(123);
const int oneCount = returnInt{}(1000);
std::cout << "One count = " << oneCount << "\n";
// We expect ~ 50% one.
assert(oneCount > 100);
}
Loading