From 06ab0d80c0cf57698ca22579ab4ed3b8868e67e1 Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Wed, 4 Sep 2024 05:27:36 +0000 Subject: [PATCH] temp --- .github/workflows/build.yml | 14 ---- CMakeLists.txt | 43 ++++++---- README.md | 13 ++- src/nvapi.c | 31 +++----- src/nvml.c | 152 ++++++++++++++++++++++++++++++++++++ src/utils.c | 24 ++++++ src/utils.h | 16 +++- 7 files changed, 236 insertions(+), 57 deletions(-) create mode 100644 src/nvml.c diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c43c500..2e9606b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -24,20 +24,6 @@ jobs: - name: Checkout uses: actions/checkout@v4 - - if: runner.os == 'Linux' - name: Install CUDA toolkit on Linux - uses: Jimver/cuda-toolkit@v0.2.16 - with: - method: network - sub-packages: '["nvcc", "nvml-dev"]' - - - if: runner.os == 'Windows' - name: Install CUDA toolkit on Windows - uses: Jimver/cuda-toolkit@v0.2.16 - with: - method: network - sub-packages: '["cudart", "nvcc", "nvml_dev"]' - - name: Configure project run: > cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index f4068c3..0297396 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,41 +1,50 @@ # Specify the minimum required version of CMake cmake_minimum_required(VERSION 3.16) -# Include the FetchContent module -include(FetchContent) +# Include the ExternalProject module +include(ExternalProject) # Define the project name and programming language project(nvidia-pstated C) -# Find the CUDAToolkit package -find_package(CUDAToolkit REQUIRED COMPONENTS nvml) - -# Declare the nvapi package -FetchContent_Declare( - nvapi - +# +ExternalProject_Add(nvapi URL https://download.nvidia.com/XFree86/nvapi-open-source-sdk/R555-OpenSource.tar URL_HASH SHA256=71339c274a6a633f19b6bd358c7f3045063c6bc106b7dc488aaa7360a6d2b9d7 + + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory /R555-OpenSource /include ) -# Download and make the nvapi content 
available for use -FetchContent_MakeAvailable(nvapi) +# +ExternalProject_Add(nvml + URL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-nvml-dev-12-6_12.6.68-1_amd64.deb + URL_HASH SHA256=fda6d4fdf26e20db4ca4950489033f4c6747c7473db3f9dc0529d56f2cc237de + + CONFIGURE_COMMAND "" + BUILD_COMMAND ${CMAKE_COMMAND} -E tar xf /data.tar.xz + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory /usr/local/cuda-12.6/targets/x86_64-linux/include /include +) # Define the executable target add_executable(nvidia-pstated src/main.c src/nvapi.c + src/nvml.c src/utils.c ) -# Include directories for the target -target_include_directories(nvidia-pstated SYSTEM PRIVATE - ${nvapi_SOURCE_DIR}/R555-OpenSource +# +add_dependencies(nvidia-pstated + nvapi + nvml ) -# Link libraries -target_link_libraries(nvidia-pstated PRIVATE - CUDA::nvml +# Include directories for the target +target_include_directories(nvidia-pstated SYSTEM PRIVATE + ${CMAKE_CURRENT_BINARY_DIR}/nvapi-prefix/include + ${CMAKE_CURRENT_BINARY_DIR}/nvml-prefix/include ) # Conditional linking for Linux platform diff --git a/README.md b/README.md index af8ebd4..aaef383 100644 --- a/README.md +++ b/README.md @@ -8,13 +8,24 @@ A daemon that automatically manages the performance states of NVIDIA GPUs. #### Linux -Make sure you have the proprietary NVIDIA driver and the packages providing `libnvidia-api.so.1` and `libnvidia-ml.so.1` installed. +Make sure the proprietary NVIDIA driver is installed. + +You will need the following libraries: + +- `libnvidia-api.so.1` +- `libnvidia-ml.so.1` + +Packages that provide these libraries: - ArchLinux: `nvidia-utils` - Debian: `libnvidia-api1` or `libnvidia-tesla-api1` (depending on the GPU and driver installed) On Debian derivatives, you can use `apt search libnvidia-api.so.1` and `apt search libnvidia-ml.so.1` to find the package you need. +Note that you MUST run this daemon at the host level, i.e. where the CUDA Driver is available. 
You can NOT run this daemon in a container. + +![nvidia-container-stack](https://cloud.githubusercontent.com/assets/3028125/12213714/5b208976-b632-11e5-8406-38d379ec46aa.png) + #### Windows Make sure the NVIDIA driver is installed. diff --git a/src/nvapi.c b/src/nvapi.c index ede95ab..a9d0e4a 100644 --- a/src/nvapi.c +++ b/src/nvapi.c @@ -2,13 +2,8 @@ #include #include -#ifdef _WIN32 - #include -#elif __linux__ - #include -#endif - #include "nvapi.h" +#include "utils.h" /***** ***** ***** ***** ***** TYPES ***** ***** ***** ***** *****/ @@ -65,22 +60,22 @@ NvAPI_Status NvAPI_GetErrorMessage(NvAPI_Status nr, NvAPI_ShortString szDesc) { } NvAPI_Status NvAPI_Initialize() { - // Check the platform and load the appropriate NvAPI library + // Load the appropriate NvAPI library #ifdef _WIN32 if (!lib) { - lib = LoadLibrary("nvapi64.dll"); + lib = library_open("nvapi64.dll"); } if (!lib) { - lib = LoadLibrary("nvapi.dll"); + lib = library_open("nvapi.dll"); } #elif __linux__ if (!lib) { - lib = dlopen("libnvidia-api.so.1", RTLD_LAZY); + lib = library_open("libnvidia-api.so.1"); } if (!lib) { - lib = dlopen("libnvidia-api.so", RTLD_LAZY); + lib = library_open("libnvidia-api.so"); } #endif @@ -97,11 +92,7 @@ NvAPI_Status NvAPI_Initialize() { nvapi_QueryInterface_t nvapi_QueryInterface; // Get the address of the nvapi_QueryInterface function from the loaded library - #ifdef _WIN32 - nvapi_QueryInterface = (nvapi_QueryInterface_t) GetProcAddress((HMODULE) lib, "nvapi_QueryInterface"); - #elif __linux__ - nvapi_QueryInterface = (nvapi_QueryInterface_t) dlsym(lib, "nvapi_QueryInterface"); - #endif + nvapi_QueryInterface = (nvapi_QueryInterface_t) library_proc(lib, "nvapi_QueryInterface"); // If the function pointer is still null, gathering the address failed if (!nvapi_QueryInterface) { @@ -144,12 +135,8 @@ NvAPI_Status NvAPI_Unload() { _NvAPI_Initialize = NULL; _NvAPI_Unload = NULL; - // Free the loaded library based on the platform - #ifdef _WIN32 - FreeLibrary((HMODULE) lib); 
-      #elif __linux__
-        dlclose(lib);
-      #endif
+      // Free the loaded library
+      library_close(lib);
     }
   }
 
diff --git a/src/nvml.c b/src/nvml.c
new file mode 100644
index 0000000..6476e5d
--- /dev/null
+++ b/src/nvml.c
@@ -0,0 +1,167 @@
+#include <stdio.h>
+
+#include "nvml.h"
+#include "utils.h"
+
+/***** ***** ***** ***** ***** TYPES ***** ***** ***** ***** *****/
+
+// Signatures of the NVML entry points that are resolved at runtime.
+// Output parameters are pointers, matching how the wrappers below are declared.
+typedef nvmlReturn_t (*nvmlDeviceGetHandleByIndex_v2_t)(unsigned int, nvmlDevice_t *);
+typedef nvmlReturn_t (*nvmlDeviceGetName_t)(nvmlDevice_t, char *, unsigned int);
+typedef nvmlReturn_t (*nvmlDeviceGetTemperature_t)(nvmlDevice_t, nvmlTemperatureSensors_t, unsigned int *);
+typedef nvmlReturn_t (*nvmlDeviceGetUtilizationRates_t)(nvmlDevice_t, nvmlUtilization_t *);
+typedef const char * (*nvmlErrorString_t)(nvmlReturn_t);
+typedef nvmlReturn_t (*nvmlInit_v2_t)(void);
+typedef nvmlReturn_t (*nvmlShutdown_t)(void);
+
+/***** ***** ***** ***** ***** VARIABLES ***** ***** ***** ***** *****/
+
+// Handle of the dynamically loaded NVML library (NULL while it is not loaded)
+static void * lib;
+
+// Addresses of the resolved NVML functions (NULL while they are not resolved)
+static nvmlDeviceGetHandleByIndex_v2_t _nvmlDeviceGetHandleByIndex_v2;
+static nvmlDeviceGetName_t _nvmlDeviceGetName;
+static nvmlDeviceGetTemperature_t _nvmlDeviceGetTemperature;
+static nvmlDeviceGetUtilizationRates_t _nvmlDeviceGetUtilizationRates;
+static nvmlErrorString_t _nvmlErrorString;
+static nvmlInit_v2_t _nvmlInit_v2;
+static nvmlShutdown_t _nvmlShutdown;
+
+/***** ***** ***** ***** ***** MACROS ***** ***** ***** ***** *****/
+
+// Return NVML_ERROR_UNINITIALIZED from the enclosing function if the given pointer is NULL
+#define NVML_POINTER(pointer) do { \
+  if ((pointer) == NULL) { \
+    return NVML_ERROR_UNINITIALIZED; \
+  } \
+} while(0)
+
+/***** ***** ***** ***** ***** IMPLEMENTATION ***** ***** ***** ***** *****/
+
+// Forward the call to the dynamically resolved nvmlDeviceGetHandleByIndex_v2
+nvmlReturn_t nvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t * device) {
+  // Ensure the function pointer is valid
+  NVML_POINTER(_nvmlDeviceGetHandleByIndex_v2);
+
+  // Pass the output pointer through unchanged so the library writes into the caller's storage
+  return _nvmlDeviceGetHandleByIndex_v2(index, device);
+}
+
+// Forward the call to the dynamically resolved nvmlDeviceGetName
+nvmlReturn_t nvmlDeviceGetName(nvmlDevice_t device, char * name, unsigned int length) {
+  // Ensure the function pointer is valid
+  NVML_POINTER(_nvmlDeviceGetName);
+
+  // Invoke the function using the provided parameters
+  return _nvmlDeviceGetName(device, name, length);
+}
+
+// Forward the call to the dynamically resolved nvmlDeviceGetTemperature
+nvmlReturn_t nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int * temp) {
+  // Ensure the function pointer is valid
+  NVML_POINTER(_nvmlDeviceGetTemperature);
+
+  // Pass the output pointer through unchanged so the library writes into the caller's storage
+  return _nvmlDeviceGetTemperature(device, sensorType, temp);
+}
+
+// Forward the call to the dynamically resolved nvmlDeviceGetUtilizationRates
+nvmlReturn_t nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t * utilization) {
+  // Ensure the function pointer is valid
+  NVML_POINTER(_nvmlDeviceGetUtilizationRates);
+
+  // Pass the output pointer through unchanged so the library writes into the caller's storage
+  return _nvmlDeviceGetUtilizationRates(device, utilization);
+}
+
+// Forward the call to the dynamically resolved nvmlErrorString (empty string while it is not resolved)
+const char * nvmlErrorString(nvmlReturn_t result) {
+  // Ensure the function pointer is valid
+  if (_nvmlErrorString == NULL) {
+    return "";
+  }
+
+  // Invoke the function using the provided parameters
+  return _nvmlErrorString(result);
+}
+
+// Load the NVML library, resolve the wrapped functions and forward the call to the resolved nvmlInit_v2
+nvmlReturn_t nvmlInit_v2(void) {
+  // Check the platform and load the appropriate NVML library
+  #ifdef _WIN32
+    if (!lib) {
+      lib = library_open("nvml64.dll");
+    }
+
+    if (!lib) {
+      lib = library_open("nvml.dll");
+    }
+  #elif __linux__
+    if (!lib) {
+      lib = library_open("libnvidia-ml.so.1");
+    }
+
+    if (!lib) {
+      lib = library_open("libnvidia-ml.so");
+    }
+  #endif
+
+  // If the library handle is still not initialized, loading the library failed
+  if (!lib) {
+    // Print an error message indicating failure to load the NVML library
+    fprintf(stderr, "Unable to load NVML library\n");
+
+    // Return an error status indicating that the library was not found
+    return NVML_ERROR_LIBRARY_NOT_FOUND;
+  }
+
+  // Retrieve the addresses of specific NVML functions
+  _nvmlDeviceGetHandleByIndex_v2 = (nvmlDeviceGetHandleByIndex_v2_t) library_proc(lib, "nvmlDeviceGetHandleByIndex_v2");
+  _nvmlDeviceGetName = (nvmlDeviceGetName_t) library_proc(lib, "nvmlDeviceGetName");
+  _nvmlDeviceGetTemperature = (nvmlDeviceGetTemperature_t) library_proc(lib, "nvmlDeviceGetTemperature");
+  _nvmlDeviceGetUtilizationRates = (nvmlDeviceGetUtilizationRates_t) library_proc(lib, "nvmlDeviceGetUtilizationRates");
+  _nvmlErrorString = (nvmlErrorString_t) library_proc(lib, "nvmlErrorString");
+  _nvmlInit_v2 = (nvmlInit_v2_t) library_proc(lib, "nvmlInit_v2");
+  _nvmlShutdown = (nvmlShutdown_t) library_proc(lib, "nvmlShutdown");
+
+  // Ensure the function pointer is valid
+  NVML_POINTER(_nvmlInit_v2);
+
+  // Invoke the function using the provided parameters
+  return _nvmlInit_v2();
+}
+
+// Forward the call to the resolved nvmlShutdown and, on success, unload the library and reset all state
+nvmlReturn_t nvmlShutdown(void) {
+  // Ensure the function pointer is valid
+  NVML_POINTER(_nvmlShutdown);
+
+  // Invoke the function using the provided parameters
+  nvmlReturn_t ret = _nvmlShutdown();
+
+  // If the function call was successful, proceed with cleanup
+  if (ret == NVML_SUCCESS) {
+    // If the library handle is initialized
+    if (lib) {
+      // Nullify all the function pointers to prevent further use
+      _nvmlDeviceGetHandleByIndex_v2 = NULL;
+      _nvmlDeviceGetName = NULL;
+      _nvmlDeviceGetTemperature = NULL;
+      _nvmlDeviceGetUtilizationRates = NULL;
+      _nvmlErrorString = NULL;
+      _nvmlInit_v2 = NULL;
+      _nvmlShutdown = NULL;
+
+      // Free the loaded library
+      library_close(lib);
+
+      // Forget the stale handle so that a later nvmlInit_v2 loads the library again
+      lib = NULL;
+    }
+  }
+
+  // Return the status of the function call
+  return ret;
+}
diff --git a/src/utils.c b/src/utils.c
index 0919a97..91a183c 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -5,6 +5,57 @@
 #include
 #include
 
+// Load the dynamic library with the given file name; returns its handle, or NULL on failure
+void * library_open(const char * filename) {
+  #ifdef _WIN32
+    // Use the ANSI variant explicitly, since the file name is a narrow string
+    return (void *) LoadLibraryA(filename);
+  #elif __linux__
+    return dlopen(filename, RTLD_LAZY);
+  #else
+    // Dynamic loading is not implemented for this platform
+    (void) filename;
+    return NULL;
+  #endif
+}
+
+// Look up the address of a symbol in a loaded library; returns NULL if the handle is NULL or the lookup fails
+void * library_proc(void * handle, const char * symbol) {
+  // A NULL handle never refers to a library opened by library_open
+  if (handle == NULL) {
+    return NULL;
+  }
+
+  #ifdef _WIN32
+    return (void *) GetProcAddress((HMODULE) handle, symbol);
+  #elif __linux__
+    return dlsym(handle, symbol);
+  #else
+    // Dynamic loading is not implemented for this platform
+    (void) symbol;
+    return NULL;
+  #endif
+}
+
+// Unload a library opened by library_open; returns 0 on success and a non-zero value on failure
+int library_close(void * handle) {
+  // There is nothing to unload without a handle
+  if (handle == NULL) {
+    return -1;
+  }
+
+  #ifdef _WIN32
+    // FreeLibrary reports success with a non-zero value
+    return FreeLibrary((HMODULE) handle) ? 0 : -1;
+  #elif __linux__
+    // dlclose already returns 0 on success and a non-zero value on failure
+    return dlclose(handle);
+  #else
+    // Dynamic loading is not implemented for this platform
+    return -1;
+  #endif
+}
+
 bool parse_ulong(const char *arg, unsigned long *value) {
   // Check if the input or output argument is invalid
   if (arg == NULL || value == NULL) {
diff --git a/src/utils.h b/src/utils.h
index b0f9a56..b6f2dd2 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -4,6 +4,12 @@
 #include
 #include
 
+#ifdef _WIN32
+  #include <windows.h>
+#elif __linux__
+  #include <dlfcn.h>
+#endif
+
 /***** ***** ***** ***** ***** MACROS ***** ***** ***** ***** *****/
 
 // Macro to check if a condition is true and jump to a label if it is not
@@ -16,7 +22,7 @@
     /* Jump to the specified label */ \
     goto label; \
   } \
-} while (0);
+} while (0)
 
 // Macro to check if a condition is false and jump to a label if it is not
 #define ASSERT_FALSE(call, label) do { \
@@ -28,7 +34,7 @@
     /* Jump to the specified label */ \
     goto label; \
   } \
-} while (0);
+} while (0)
 
 // Macro to check if there is a next argument
 #define HAS_NEXT_ARG (i + 1 < argc)
@@ -43,9 +49,14 @@
   \
   /* Set the pointer to NULL */ \
   ptr = NULL; \
-} while (0);
+} while (0)
 
 /***** ***** ***** ***** ***** FUNCTIONS ***** ***** ***** ***** *****/
 
+// Platform-independent wrappers around the dynamic library loader
+void * library_open(const char * filename);
+void * library_proc(void * handle, const char * symbol);
+int library_close(void * handle);
+
 bool parse_ulong(const char *arg, unsigned long *value);
 bool parse_ulong_array(const char *arg, const char *delimiter, const size_t max_count, unsigned long *values, size_t *count);