Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
sacpis committed Jul 18, 2024
2 parents aa3b157 + 42ee0e0 commit 96b4cbf
Show file tree
Hide file tree
Showing 263 changed files with 11,047 additions and 2,383 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/config/gitlab_commits.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
nvidia-mgpu-repo: cuda-quantum/cuquantum-mgpu.git
nvidia-mgpu-commit: 191b415f12e6d636e4ab81094ce6619a774910dd
nvidia-mgpu-commit: 59b8ed189989d6d2d944e41d8fbc5881b289c83c
11 changes: 10 additions & 1 deletion .github/workflows/config/md_link_check_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,20 @@
],
"ignorePatterns": [
{
"pattern": "^https://www.gnu.org/prep/standards/standards.html"
"pattern": "^https://www.gnu.org/"
},
{
"pattern": "^https://gcc.gnu.org/"
},
{
"pattern": "^https://epubs.siam.org/doi/10.1137/S0097539796300921"
},
{
"pattern": "^https://epubs.siam.org/doi/10.1137/090774999"
},
{
"pattern": "^https://epubs.siam.org/doi/10.1137/090771806"
},
{
"pattern": "^https://vscode.dev/"
},
Expand Down
9 changes: 9 additions & 0 deletions .github/workflows/config/spelling_allowlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ NVIDIA
NVQC
NVQIR
OQC
ORCA
OpenACC
OpenMP
OpenMPI
Expand Down Expand Up @@ -95,6 +96,7 @@ SLES
SLURM
SVD
Superpositions
TBI
TCP
TableGen
Toffoli
Expand All @@ -121,6 +123,7 @@ bitcode
bitstring
bitstrings
boolean
boson
broadcasted
buildable
callable
Expand Down Expand Up @@ -196,6 +199,7 @@ inlined
inlining
instantiation
instantiations
interferometer
interoperational
intrinsics
iterable
Expand Down Expand Up @@ -239,12 +243,17 @@ qubits
qudit
qudits
reStructuredText
reconfigurable
runtime
runtimes
rvalue
scalability
scalable
sexualized
shifter
shifters
splitter
splitters
statevector
struct
structs
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nvqc_regression_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ jobs:
# Test all remote-sim tests
for filename in `find targettests/Remote-Sim -name '*.cpp'`; do
# unsupport_args is a compile error test
if [[ "$filename" != *"unsupport_args"* ]]; then
if [[ "$filename" != *"unsupport_args"* ]] && [[ "$filename" != *"state_overlap"* ]]; then
echo "$filename"
nvqc_config=""
# Look for a --remote-mqpu-auto-launch to determine the number of QPUs
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/prebuilt_binaries.yml
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ jobs:
fi
${PYTHON} -m pip install cuda_quantum*.whl
${PYTHON} -m pip install pytest numpy
${PYTHON} -m pip install pytest numpy psutil
${PYTHON} -m pytest -v /home/cudaq/python \
--ignore /home/cudaq/python/backends \
--ignore /home/cudaq/python/domains
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/python_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ jobs:
--build-arg base_image=$base_image \
--build-arg python_version=${{ inputs.python_version }} \
--build-arg cuda_quantum_wheel=$wheelname \
--build-arg preinstalled_modules="numpy pytest" \
--build-arg preinstalled_modules="numpy pytest psutil" \
--build-arg pip_install_flags=${{ matrix.pip_install_flags }} \
--build-arg optional_dependencies=$([ "$(uname -m)" == "x86_64" ] && echo cudart)
Expand Down
21 changes: 17 additions & 4 deletions cmake/Modules/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,22 @@
# the terms of the Apache License 2.0 which accompanies this distribution. #
# ============================================================================ #

file(GLOB CONFIG_FILES CUDAQ*Config.cmake)
file (GLOB LANG_FILES CMake*)
install(FILES NVQIRConfig.cmake DESTINATION lib/cmake/nvqir)
set(CONFIG_FILES
CUDAQCommonConfig.cmake
CUDAQEmDefaultConfig.cmake
CUDAQNloptConfig.cmake
CUDAQSpinConfig.cmake
CUDAQConfig.cmake
CUDAQEnsmallenConfig.cmake
CUDAQPlatformDefaultConfig.cmake
)
set(LANG_FILES
CMakeCUDAQCompiler.cmake.in
CMakeCUDAQInformation.cmake
CMakeDetermineCUDAQCompiler.cmake
CMakeTestCUDAQCompiler.cmake
)

install(FILES ${CONFIG_FILES} DESTINATION lib/cmake/cudaq)
install(FILES ${LANG_FILES} DESTINATION lib/cmake/cudaq)
install(FILES CUDAQConfig.cmake DESTINATION lib/cmake/cudaq)
install(FILES NVQIRConfig.cmake DESTINATION lib/cmake/nvqir)
9 changes: 3 additions & 6 deletions cmake/Modules/NVQIRConfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,10 @@ get_filename_component(NVQIR_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
include(CMakeFindDependencyMacro)

get_filename_component(PARENT_DIRECTORY ${NVQIR_CMAKE_DIR} DIRECTORY)
set (CUDAQCommon_DIR "${PARENT_DIRECTORY}/cudaq")
set (CUDAQSpin_DIR "${PARENT_DIRECTORY}/cudaq")
set(fmt_DIR "${PARENT_DIRECTORY}/fmt")

find_dependency(CUDAQSpin REQUIRED)
find_dependency(CUDAQCommon REQUIRED)
find_dependency(fmt REQUIRED)
find_dependency(CUDAQSpin REQUIRED HINTS "${PARENT_DIRECTORY}/cudaq")
find_dependency(CUDAQCommon REQUIRED HINTS "${PARENT_DIRECTORY}/cudaq")
find_dependency(fmt REQUIRED HINTS "${PARENT_DIRECTORY}/cudaq")

if(NOT TARGET nvqir::nvqir)
include("${NVQIR_CMAKE_DIR}/NVQIRTargets.cmake")
Expand Down
8 changes: 8 additions & 0 deletions docker/build/devdeps.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends git \
&& cd /llvm-project && git checkout $llvm_commit \
&& apt-get autoremove -y --purge && apt-get clean && rm -rf /var/lib/apt/lists/*

# Apply customization for https://github.com/NVIDIA/cuda-quantum/issues/1421
ADD ./tpls/customizations/llvm/llvm_pr71968_mod.diff /
RUN cd /llvm-project && git apply /llvm_pr71968_mod.diff && rm /llvm_pr71968_mod.diff

# Apply customization for https://github.com/NVIDIA/cuda-quantum/issues/1799
ADD ./tpls/customizations/llvm/fix_region_simplification.diff /
RUN cd /llvm-project && git apply /fix_region_simplification.diff && rm /fix_region_simplification.diff

# Build the LLVM libraries and compiler toolchain needed to build CUDA-Q;
# The safest option to avoid any compatibility issues is to build an application using these libraries
# with the same compiler toolchain that the libraries were compiled with.
Expand Down
32 changes: 9 additions & 23 deletions docker/release/cudaq.nvqc.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,14 @@
ARG base_image=nvcr.io/nvidia/nightly/cuda-quantum:latest
FROM $base_image as nvcf_image

ADD scripts/nvqc_proxy.py /
# Run the tar command and then uncomment ADD cudaq.tar.gz ... in order to
# override the installation.
# tar czvf /workspaces/cuda-quantum/cudaq.tar.gz -C /usr/local/cudaq .
# ADD cudaq.tar.gz /opt/nvidia/cudaq

# Launch script: launch cudaq-qpud (nvcf mode) with MPI ranks == Number of NVIDIA GPUs
# IMPORTANT:
# (1) NVCF function must set container environment variable `NUM_GPUS`
# equal to the number of GPUs on the target platform. This will allow clients to query
# the function capability (number of GPUs) by looking at function info. The below
# entry point script helps prevent mis-configuration by checking that functions are
# created and deployed appropriately.
# (2) NVCF function must set container environment variable `NVQC_REST_PAYLOAD_VERSION` equal
# to the RestRequest payload version with which `cudaq-qpud` in the deployment Docker image was compiled.
# Failure to do so will result in early exits of the entry point command, thus deployment failure.
RUN echo 'cat /opt/nvidia/cudaq/build_info.txt;' \
'EXPECTED_REST_PAYLOAD_VERSION="$(cudaq-qpud --type nvcf --schema-version | grep -o "CUDA-Q REST API version: \S*" | cut -d ":" -f 2 | tr -d " ")" ;' \
'if [[ "$NVQC_REST_PAYLOAD_VERSION" != "$EXPECTED_REST_PAYLOAD_VERSION" ]]; ' \
' then echo "Invalid Deployment: NVQC_REST_PAYLOAD_VERSION environment variable ($NVQC_REST_PAYLOAD_VERSION) does not match cudaq-qpud (expected $EXPECTED_REST_PAYLOAD_VERSION)." && exit 1; fi;' \
'python3 /nvqc_proxy.py & ' \
'if [[ "$NUM_GPUS" == "$(nvidia-smi --list-gpus | wc -l)" ]]; then ' \
'while true; do ' \
'mpiexec -np $(nvidia-smi --list-gpus | wc -l) cudaq-qpud --type nvcf --port 3031;' \
'done; ' \
'else echo "Invalid Deployment: Number of GPUs does not match the hardware" && exit 1; fi' > launch.sh
RUN sudo mkdir /nvqc_scripts
ADD tools/cudaq-qpud/nvqc_proxy.py /nvqc_scripts
ADD tools/cudaq-qpud/json_request_runner.py /nvqc_scripts
ADD scripts/nvqc_launch.sh /nvqc_scripts

# Start the cudaq-qpud service
ENTRYPOINT ["bash", "-l", "launch.sh"]
ENTRYPOINT ["bash", "-l", "/nvqc_scripts/nvqc_launch.sh"]
2 changes: 2 additions & 0 deletions docs/sphinx/api/languages/cpp_api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ Platform
.. doxygenclass:: cudaq::quantum_platform
:members:

.. doxygenclass:: cudaq::SerializedCodeExecutionContext

.. doxygentypedef:: cudaq::QuantumTask

.. doxygentypedef:: cudaq::QubitConnectivity
Expand Down
3 changes: 2 additions & 1 deletion docs/sphinx/api/languages/python_api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ Kernel Execution
.. autofunction:: cudaq::get_state
.. autofunction:: cudaq::get_state_async
.. autofunction:: cudaq::vqe
.. autofunction:: cudaq::draw
.. autofunction:: cudaq::draw
.. autofunction:: cudaq::translate

Backend Configuration
=============================
Expand Down
53 changes: 35 additions & 18 deletions docs/sphinx/examples/cpp/algorithms/grover.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,44 +3,61 @@
// nvq++ grover.cpp -o grover.x && ./grover.x
// ```

#include <cmath>
#include <cudaq.h>
#include <numbers>

__qpu__ void reflect_about_uniform(cudaq::qview<> q) {
auto ctrlQubits = q.front(q.size() - 1);
auto &lastQubit = q.back();
__qpu__ void reflect_about_uniform(cudaq::qvector<> &qs) {
auto ctrlQubits = qs.front(qs.size() - 1);
auto &lastQubit = qs.back();

// Compute (U) Action (V) produces
// U V U::Adjoint
cudaq::compute_action(
[&]() {
h(q);
x(q);
h(qs);
x(qs);
},
[&]() { z<cudaq::ctrl>(ctrlQubits, lastQubit); });
}

struct run_grover {
template <typename CallableKernel>
__qpu__ auto operator()(const int n_qubits, const int n_iterations,
CallableKernel &&oracle) {
cudaq::qvector q(n_qubits);
h(q);
__qpu__ auto operator()(const int n_qubits, CallableKernel &&oracle) {
int n_iterations = round(0.25 * std::numbers::pi * sqrt(2 ^ n_qubits));

cudaq::qvector qs(n_qubits);
h(qs);
for (int i = 0; i < n_iterations; i++) {
oracle(q);
reflect_about_uniform(q);
oracle(qs);
reflect_about_uniform(qs);
}
mz(q);
mz(qs);
}
};

struct oracle {
void operator()(cudaq::qvector<> &q) __qpu__ {
z<cudaq::ctrl>(q[0], q[2]);
z<cudaq::ctrl>(q[1], q[2]);
const long target_state;

void operator()(cudaq::qvector<> &qs) __qpu__ {
cudaq::compute_action(
[&]() {
for (int i = 1; i <= qs.size(); ++i) {
auto target_bit_set = (1 << (qs.size() - i)) & target_state;
if (!target_bit_set)
x(qs[i - 1]);
}
},
[&]() {
auto ctrlQubits = qs.front(qs.size() - 1);
z<cudaq::ctrl>(ctrlQubits, qs.back());
});
}
};

int main() {
auto counts = cudaq::sample(run_grover{}, 3, 1, oracle{});
counts.dump();
int main(int argc, char *argv[]) {
auto secret = 1 < argc ? strtol(argv[1], NULL, 2) : 0b1011;
oracle compute_oracle{.target_state = secret};
auto counts = cudaq::sample(run_grover{}, 4, compute_oracle);
printf("Found string %s\n", counts.most_probable().c_str());
}
6 changes: 2 additions & 4 deletions docs/sphinx/examples/cpp/algorithms/qaoa_maxcut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,8 @@ int main() {
-M_PI / 8.0, M_PI / 8.0, n_params, std::mt19937::default_seed);

// Call the optimizer
auto [opt_val, opt_params] = cudaq::vqe(
ansatz{}, Hp, optimizer, n_params, [&](std::vector<double> params) {
return std::make_tuple(params, n_qubits, n_layers);
});
auto [opt_val, opt_params] =
cudaq::vqe(ansatz{}, Hp, optimizer, n_params, n_qubits, n_layers);

// Print the optimized value and the parameters
printf("Optimal value = %.16lf\n", opt_val);
Expand Down
8 changes: 2 additions & 6 deletions docs/sphinx/examples/cpp/algorithms/vqe_h2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,18 +104,14 @@ int main() {

so4_fabric ansatz;

auto argMapper = [&](std::vector<double> x) {
return std::make_tuple(x, n_qubits, n_layers);
};

// Run VQE.
cudaq::optimizers::lbfgs optimizer;
optimizer.initial_parameters = init_params;
optimizer.max_eval = 20;
optimizer.max_line_search_trials = 10;
cudaq::gradients::central_difference gradient(ansatz, argMapper);
cudaq::gradients::central_difference gradient;
auto [opt_val, opt_params] =
cudaq::vqe(ansatz, gradient, H, optimizer, n_params, argMapper);
cudaq::vqe(ansatz, gradient, H, optimizer, n_params, n_qubits, n_layers);

printf("Optimal value = %.16lf\n", opt_val);
}
3 changes: 2 additions & 1 deletion docs/sphinx/examples/cpp/basics/cuquantum_backends.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ struct ghz {
};

int main() {
auto counts = cudaq::sample(/*shots=*/100, ghz{}, 28);
auto shots_count = 1024 * 1024;
auto counts = cudaq::sample(shots_count, ghz{}, 28);

if (!cudaq::mpi::is_initialized() || cudaq::mpi::rank() == 0) {
counts.dump();
Expand Down
25 changes: 17 additions & 8 deletions docs/sphinx/examples/cpp/other/trotter_kernel_mode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@
// This is because the CPU-only backend has difficulty handling
// 30+ qubit simulations.

int STEPS = 10; // set to around 25 qubits for `nvidia` target
int SPINS = 11; // set to around 100 for `nvidia` target
int SPINS = 11; // set to around 25 qubits for `nvidia` target
int STEPS = 10; // set to around 100 for `nvidia` target

// Compile and run with:
// ```
// nvq++ --enable-mlir -v trotter_kernel_mode.cpp -o trotter.x -target nvidia &&
Expand Down Expand Up @@ -121,23 +122,31 @@ int run_steps(int steps, int spins) {
auto words = term_words(ham);
auto magnetization_exp_val = cudaq::observe(
trotter{}, average_magnetization, &state, coefficients, words, dt);
expResults.emplace_back(magnetization_exp_val.expectation());
auto result = magnetization_exp_val.expectation();
expResults.emplace_back(result);
state = cudaq::get_state(trotter{}, &state, coefficients, words, dt);
const auto stop = std::chrono::high_resolution_clock::now();
auto duration =
std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
runtimeMs.emplace_back(duration.count() / 1000.0);
auto timeInSeconds = duration.count() / 1000.0 / 1000.0;
runtimeMs.emplace_back(timeInSeconds);
std::cout << "Step " << i << ": time [s]: " << timeInSeconds
<< ", result: " << result << std::endl;
}
std::cout << std::endl;

std::cout << "Runtime [ms]: [";
// Print runtimes and results (useful for plotting).
std::cout << "Step times [s]: [";
for (const auto &x : runtimeMs)
std::cout << x << ", ";
std::cout << "]\n" << std::endl;
std::cout << "]" << std::endl;

std::cout << "Results: [";
for (const auto &x : expResults)
std::cout << x << ", ";
std::cout << "]\n" << std::endl;
std::cout << "]" << std::endl;

std::cout << std::endl;
return 0;
}

Expand All @@ -147,6 +156,6 @@ int main() {
const auto stop = std::chrono::high_resolution_clock::now();
auto duration =
std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
std::cout << "Total running time:" << duration.count() / 1000.0 / 1000.0
std::cout << "Total running time: " << duration.count() / 1000.0 / 1000.0
<< "s" << std::endl;
}
Loading

0 comments on commit 96b4cbf

Please sign in to comment.