From 37ea39d6aeba3d1accc17f9eec37c2cb156135f2 Mon Sep 17 00:00:00 2001 From: Solar Designer Date: Sat, 30 Dec 2023 00:13:32 +0100 Subject: [PATCH] argon2-opencl: Prefer GWS that is a multiple of "cores" Fixes #5402 --- src/opencl_argon2_fmt_plug.c | 49 ++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/src/opencl_argon2_fmt_plug.c b/src/opencl_argon2_fmt_plug.c index fc92c44d00..944c709793 100644 --- a/src/opencl_argon2_fmt_plug.c +++ b/src/opencl_argon2_fmt_plug.c @@ -285,7 +285,9 @@ static int is_power_of_two(uint32_t x) { return (x & (x - 1)) == 0; } + #define AUTOTUNE_HANDLE_CLERROR(cl_error, message) if (CL_SUCCESS != (cl_error)) return CL_OUT_OF_RESOURCES; + static cl_int autotune(argon2_type type, uint32_t lanes, uint32_t segment_blocks, cl_command_queue profiling_queue, cl_event* profiling_event) { uint32_t index = index_best_kernel_params(type, lanes, segment_blocks); @@ -322,7 +324,7 @@ static cl_int autotune(argon2_type type, uint32_t lanes, uint32_t segment_blocks if (DEVICE_USE_LOCAL_MEMORY) { size_t shmemSize = THREADS_PER_LANE * best_lanes_per_block * best_jobs_per_block * sizeof(cl_ulong); if (shmemSize > get_local_memory_size(gpu_id)) - printf("-- Overflowing %u KB / %u KB local GPU memory --\n", + printf("-- Overflowing %u KiB / %u KiB local device memory --\n", (uint32_t)(shmemSize / 1024), (uint32_t)(get_local_memory_size(gpu_id) / 1024)); @@ -510,19 +512,35 @@ static void reset(struct db_main *db) //---------------------------------------------------------------------------------------------------------------------------- // Create OpenCL objects //---------------------------------------------------------------------------------------------------------------------------- - // Use all GPU memory by default + // Use almost all GPU memory by default + unsigned int warps = 6, limit, target; if (gpu_amd(device_info[gpu_id])) { - MAX_KEYS_PER_CRYPT = get_max_mem_alloc_size(gpu_id) / max_memory_size; + limit = get_max_mem_alloc_size(gpu_id) / max_memory_size; } else { - MAX_KEYS_PER_CRYPT = get_global_memory_size(gpu_id) * 15 / 16 / (max_memory_size + ARGON2_PREHASH_DIGEST_LENGTH); + if (gpu_nvidia(device_info[gpu_id])) { + unsigned int major = 0, minor = 0; + get_compute_capability(gpu_id, &major, &minor); + if (major == 5) /* NVIDIA Maxwell */ + warps = 2; + } + limit = get_global_memory_size(gpu_id) * 31 / 32 / (max_memory_size + ARGON2_PREHASH_DIGEST_LENGTH); } - MAX_KEYS_PER_CRYPT -= MAX_KEYS_PER_CRYPT & (MAX_KEYS_PER_CRYPT > 128 ? 3 : 1); // Make it even or multiple of 4 + unsigned int cores = get_processors_count(gpu_id); + unsigned int threads_per_crypt = THREADS_PER_LANE; + if (!benchmark_running) + threads_per_crypt *= max_salt_lanes; + do { + target = (cores * warps + threads_per_crypt - 1) / threads_per_crypt; + } while (target > limit && --warps > 1); + if (target > limit) + target = limit; + if (target > 16) + target -= target & (target > 128 ? 3 : 1); // Make it even or multiple of 4 + MAX_KEYS_PER_CRYPT = target; // Load GWS from config/command line opencl_get_user_preferences(FORMAT_NAME); - if (global_work_size && !self_test_running) { - MAX_KEYS_PER_CRYPT = MAX(1, global_work_size / (THREADS_PER_LANE * max_salt_lanes)); - printf("\nCustom GWS result on MAX_KEYS_PER_CRYPT = %u", MAX_KEYS_PER_CRYPT); - } + if (global_work_size && !self_test_running) + MAX_KEYS_PER_CRYPT = MAX(1, global_work_size / threads_per_crypt); MEM_FREE(saved_key); MEM_FREE(saved_len); MEM_FREE(crypted); @@ -579,6 +597,7 @@ static void reset(struct db_main *db) best_kernel_params = mem_calloc(ARGON2_NUM_TYPES * max_salt_lanes * max_segment_blocks, sizeof(struct kernel_run_params)); // Manage GPU memory + int need_linefeed = benchmark_running; do { // CPU memory to transfer to and from the GPU if (memory_in) { @@ -597,10 +616,14 @@ static void reset(struct db_main *db) memory_buffer = NULL; } memory_buffer = clCreateBuffer(context[gpu_id], CL_MEM_READ_WRITE, max_memory_size, NULL, &ret_code); - if (ocl_always_show_ws || !self_test_running) - printf("\nTrying to use %zu MB / %u MB GPU memory.\n", - max_memory_size / 1048576, - (uint32_t)(get_global_memory_size(gpu_id) / 1048576)); + if (ocl_always_show_ws || !self_test_running) { + printf("%sTrying to compute %u hashes at a time using %u of %u MiB device memory\n", + need_linefeed ? "\n" : "", + MAX_KEYS_PER_CRYPT, + (unsigned int)(max_memory_size / 0x100000), + (uint32_t)(get_global_memory_size(gpu_id) / 0x100000)); + need_linefeed = 0; + } // Check kernel execution if (ret_code == CL_SUCCESS) {