From ddac8dd9fc0bee70a3f456df68b8aac38576c856 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Fri, 14 Jun 2024 15:58:35 -0700 Subject: [PATCH] Enable debug flags in compilation (#2734) Summary: - Enable debug flags in compilation for CUDA and HIP variants Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2734 Reviewed By: spcyppt Differential Revision: D58607704 Pulled By: q10 fbshipit-source-id: a1670e107d0bdf087e59d118f8a0c8871ff75bfd --- .github/scripts/fbgemm_gpu_build.bash | 48 ++++++++++--------------- .github/scripts/fbgemm_gpu_install.bash | 36 ++++++++++--------- .github/scripts/fbgemm_gpu_test.bash | 9 +++++ .github/scripts/nova_dir.bash | 2 +- fbgemm_gpu/setup.py | 20 ++++++++--- 5 files changed, 63 insertions(+), 52 deletions(-) diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index 8fb80735bd..0b979c9a14 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -159,17 +159,10 @@ __configure_fbgemm_gpu_build_rocm () { print_exec conda env config vars set ${env_prefix} PYTORCH_ROCM_ARCH="${arch_list}" echo "[BUILD] Setting ROCm build args ..." - # shellcheck disable=SC2155 - local cxx_flags="-DTORCH_USE_HIP_DSA" - build_args=( --package_variant=rocm # HIP_ROOT_DIR now required for HIP to be correctly detected by CMake -DHIP_ROOT_DIR=/opt/rocm - # Enable device-side assertions in HIP - # https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line - -DCMAKE_C_FLAGS="'${cxx_flags}'" - -DCMAKE_CXX_FLAGS="'${cxx_flags}'" ) } @@ -251,26 +244,14 @@ __configure_fbgemm_gpu_build_genai () { done } +# shellcheck disable=SC2120 __configure_fbgemm_gpu_build () { - local fbgemm_variant="$1" - local fbgemm_variant_targets="$2" - if [ "$fbgemm_variant" == "" ]; then - echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT" - echo "Example(s):" - echo " ${FUNCNAME[0]} cpu # CPU-only variant using Clang" - echo " ${FUNCNAME[0]} cuda # CUDA variant for default target(s)" - echo " ${FUNCNAME[0]} cuda '7.0;8.0' # CUDA variant for custom target(s)" - echo " ${FUNCNAME[0]} rocm # ROCm variant for default target(s)" - echo " ${FUNCNAME[0]} rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" - return 1 - else - echo "################################################################################" - echo "# Configure FBGEMM-GPU Build" - echo "#" - echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" - echo "################################################################################" - echo "" - fi + echo "################################################################################" + echo "# Configure FBGEMM-GPU Build" + echo "#" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" + echo "################################################################################" + echo "" # shellcheck disable=SC2155 local env_prefix=$(env_name_or_prefix "${env_name}") @@ -302,6 +283,13 @@ __configure_fbgemm_gpu_build () { --verbose ) + # Set debugging options + if [ "$fbgemm_release_channel" != "release" ] || [ "$BUILD_DEBUG" -eq 1 ]; then + build_args+=( + --debug + ) + fi + # shellcheck disable=SC2145 echo "[BUILD] FBGEMM_GPU build arguments have been set: ${build_args[@]}" } @@ -502,8 +490,8 @@ run_fbgemm_gpu_postbuild_checks () { return 1 fi - __print_library_infos - __verify_library_symbols + __print_library_infos || return 1 + __verify_library_symbols || return 1 } ################################################################################ @@ -531,7 +519,7 @@ build_fbgemm_gpu_package () { # Set up and configure the build __build_fbgemm_gpu_common_pre_steps || return 1 - __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 + __configure_fbgemm_gpu_build || return 1 echo "################################################################################" echo "# Build FBGEMM-GPU Package (Wheel)" @@ -596,7 +584,7 @@ build_fbgemm_gpu_install () { # Set up and configure the build __build_fbgemm_gpu_common_pre_steps || return 1 - __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 + __configure_fbgemm_gpu_build || return 1 echo "################################################################################" echo "# Build + Install FBGEMM-GPU Package" diff --git a/.github/scripts/fbgemm_gpu_install.bash b/.github/scripts/fbgemm_gpu_install.bash index 8bf477b1f2..6c93c53d02 100644 --- a/.github/scripts/fbgemm_gpu_install.bash +++ b/.github/scripts/fbgemm_gpu_install.bash @@ -31,19 +31,6 @@ __install_print_dependencies_info () { echo "" } -__install_list_subpackages_info () { - # shellcheck disable=SC2086,SC2155 - local fbgemm_gpu_packages=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(dir(fbgemm_gpu))") - # shellcheck disable=SC2086,SC2155 - local experimental_packages=$(conda run ${env_prefix} python -c "import fbgemm_gpu.experimental; print(dir(fbgemm_gpu.experimental))") - echo "################################################################################" - echo "[CHECK] FBGEMM_GPU Experimental Packages" - echo "[CHECK] fbgemm_gpu: ${fbgemm_gpu_packages}" - echo "[CHECK] fbgemm_gpu.experimental: ${experimental_packages}" - echo "################################################################################" - echo "" -} - __install_fetch_version_and_variant_info () { echo "[INSTALL] Checking imports and symbols ..." (test_python_import_package "${env_name}" fbgemm_gpu) || return 1 @@ -62,6 +49,23 @@ __install_fetch_version_and_variant_info () { echo "" } +__install_list_subpackages_info () { + # shellcheck disable=SC2086,SC2155 + local fbgemm_gpu_packages=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(dir(fbgemm_gpu))") + + if [ "$installed_fbgemm_gpu_variant" == "cuda" ] || [ "$installed_fbgemm_gpu_variant" == "genai" ]; then + # shellcheck disable=SC2086,SC2155 + local experimental_packages=$(conda run ${env_prefix} python -c "import fbgemm_gpu.experimental; print(dir(fbgemm_gpu.experimental))") + fi + + echo "################################################################################" + echo "[CHECK] FBGEMM_GPU Experimental Packages" + echo "[CHECK] fbgemm_gpu: ${fbgemm_gpu_packages}" + echo "[CHECK] fbgemm_gpu.experimental: ${experimental_packages}" + echo "################################################################################" + echo "" +} + __install_check_operator_registrations () { echo "[INSTALL] Check for operator registrations ..." if [ "$installed_fbgemm_gpu_variant" == "genai" ]; then @@ -103,12 +107,12 @@ __fbgemm_gpu_post_install_checks () { # Print PyTorch and CUDA versions for sanity check __install_print_dependencies_info - # List out FBGEMM_GPU subpackages - __install_list_subpackages_info - # Fetch the version and variant info from the package __install_fetch_version_and_variant_info + # List out FBGEMM_GPU subpackages + __install_list_subpackages_info + echo "[INSTALL] Check for installation of Python sources ..." if [ "$installed_fbgemm_gpu_variant" != "genai" ]; then (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1 diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 4b8a535447..7e167698db 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -74,6 +74,10 @@ __configure_fbgemm_gpu_test_cpu () { } __configure_fbgemm_gpu_test_cuda () { + # Disabled by default; enable for debugging + # shellcheck disable=SC2086 + # print_exec conda env config vars set ${env_prefix} CUDA_LAUNCH_BLOCKING=1 + ignored_tests=( ./tbe/ssd/ssd_split_table_batched_embeddings_test.py ) @@ -407,6 +411,11 @@ test_fbgemm_gpu_setup_and_pip_install () { echo "# Run Result : $([ $retcode -eq 0 ] && echo "PASSED" || echo "FAILED")" echo "################################################################################" + if [ $retcode -eq 0 ]; then + # Clean out environment only if there were no errors + conda remove -n "$env_name" -y --all + fi + cd - || return 1 return $retcode } diff --git a/.github/scripts/nova_dir.bash b/.github/scripts/nova_dir.bash index 8d6c6f0eb8..a6ad595f26 100644 --- a/.github/scripts/nova_dir.bash +++ b/.github/scripts/nova_dir.bash @@ -17,4 +17,4 @@ export BUILD_FROM_NOVA=1 if [[ "$CONDA_ENV" != "" ]]; then export CONDA_RUN="conda run --no-capture-output -p ${CONDA_ENV}" && echo "$CONDA_RUN"; fi if [[ "$CU_VERSION" == "cu118" ]]; then export TORCH_CUDA_ARCH_LIST='7.0;8.0' && echo "$TORCH_CUDA_ARCH_LIST"; fi if [[ "$CU_VERSION" == "cu121" ]]; then export TORCH_CUDA_ARCH_LIST='7.0;8.0;9.0;9.0a' && echo "$TORCH_CUDA_ARCH_LIST"; fi -if [[ "$CU_VERSION" == "cu124" ]]; then export TORCH_CUDA_ARCH_LIST='8.0;9.0;9.0a' && echo "$TORCH_CUDA_ARCH_LIST"; fi +if [[ "$CU_VERSION" == "cu124" ]]; then export TORCH_CUDA_ARCH_LIST='7.0;8.0;9.0;9.0a' && echo "$TORCH_CUDA_ARCH_LIST"; fi diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index a27949609d..cdb131f558 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -39,6 +39,11 @@ def from_args(cls, argv: List[str]): action="store_true", help="Print verbose logs during the build.", ) + parser.add_argument( + "--debug", + action="store_true", + help="Enable DEBUG features in compilation such as PyTorch device-side assertions.", + ) parser.add_argument( "--dryrun", action="store_true", @@ -237,7 +242,7 @@ def _get_cxx11_abi(): _get_cxx11_abi(), ] - cxx_args = [] + cxx_flags = [] if self.args.verbose: print("[SETUP.PY] Building in VERBOSE mode ...") @@ -245,6 +250,11 @@ def _get_cxx11_abi(): ["-DCMAKE_VERBOSE_MAKEFILE=ON", "-DCMAKE_EXPORT_COMPILE_COMMANDS=TRUE"] ) + if self.args.debug: + # Enable device-side assertions in CUDA and HIP + # https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line + cxx_flags.extend(["-DTORCH_USE_CUDA_DSA", "-DTORCH_USE_HIP_DSA"]) + if self.args.package_variant == "cpu": print("[SETUP.PY] Building the CPU-ONLY variant of FBGEMM_GPU ...") cmake_args.append("-DFBGEMM_CPU_ONLY=ON") @@ -258,7 +268,7 @@ def _get_cxx11_abi(): if self.args.nccl_lib_path: nccl_root = os.path.dirname(os.path.dirname(self.args.nccl_lib_path)) - cxx_args.extend([f"-L{nccl_root}/lib"]) + cxx_flags.extend([f"-L{nccl_root}/lib"]) cmake_args.extend( [ f"-DNCCL_INCLUDE_DIRS={nccl_root}/include", @@ -270,7 +280,7 @@ def _get_cxx11_abi(): print("[SETUP.PY] Setting CMake flags ...") path = self.args.cxxprefix - cxx_args.extend( + cxx_flags.extend( [ "-fopenmp=libgomp", "-stdlib=libstdc++", @@ -286,8 +296,8 @@ def _get_cxx11_abi(): cmake_args.extend( [ - f"-DCMAKE_C_FLAGS='{' '.join(cxx_args)}'", - f"-DCMAKE_CXX_FLAGS='{' '.join(cxx_args)}'", + f"-DCMAKE_C_FLAGS='{' '.join(cxx_flags)}'", + f"-DCMAKE_CXX_FLAGS='{' '.join(cxx_flags)}'", ] )