From 943f51a34dffdd5f2c87aced503ea32af519a21f Mon Sep 17 00:00:00 2001 From: Stephen Akridge Date: Sun, 22 Dec 2019 10:15:06 -0800 Subject: [PATCH] Build CL by default and include it in the release package. Stub out unimplemented interfaces. --- Makefile | 1 + src/Makefile | 56 +++++++++++++++-------- src/{cuda-headers => common}/gpu_common.h | 0 src/{cuda-crypt => crypt-if}/chacha.h | 7 +-- src/cuda-crypt/chacha20_core.cu | 17 +++++-- src/cuda-crypt/chacha_cbc.cu | 2 +- src/gpu-common.mk | 3 +- src/opencl-crypt/cl_chacha.cpp | 34 ++++++++++++++ src/opencl-ecc-ed25519/main.cpp | 2 + src/opencl-ecc-ed25519/sign.cpp | 2 +- src/opencl-ecc-ed25519/verify.cpp | 14 +++++- src/opencl-platform/cl_init_platform.cpp | 30 ++++++------ src/opencl-poh-verify/cl_poh_verify.cpp | 17 +++++++ 13 files changed, 139 insertions(+), 46 deletions(-) rename src/{cuda-headers => common}/gpu_common.h (100%) rename src/{cuda-crypt => crypt-if}/chacha.h (86%) create mode 100644 src/opencl-crypt/cl_chacha.cpp create mode 100644 src/opencl-poh-verify/cl_poh_verify.cpp diff --git a/Makefile b/Makefile index bd4551b..f26b846 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,7 @@ install: mkdir -p $(DESTDIR) ifneq ($(OS),Darwin) cp -f src/$(V)/libcuda-crypt.so $(DESTDIR) + cp -f src/$(V)/libcl-crypt.so $(DESTDIR) endif ls -lh $(DESTDIR) diff --git a/src/Makefile b/src/Makefile index 6df37f6..9408053 100644 --- a/src/Makefile +++ b/src/Makefile @@ -19,19 +19,33 @@ LIB=cuda-crypt CL_ECC_TEST_BIN=cl_ed25519_verify CL_LIB=cl-crypt +CHACHA_IF_DIR:=crypt-if + CL_HEADER_DIR:=opencl-platform CUDA_HEADER_DIR:=cuda-headers CUDA_SHA256_DIR:=cuda-sha256 +CUDA_DIR ?= /usr/local/cuda + CXX ?= g++ -CFLAGS+=-DENDIAN_NEUTRAL -DLTC_NO_ASM -I$(CUDA_HEADER_DIR) -I$(CUDA_SHA256_DIR) -#use -DUSE_RDTSC for Windows compilation -CL_CFLAGS=-fPIC -std=c++11 -DENDIAN_NEUTRAL -DOPENCL_VARIANT -DLTC_NO_ASM -I$(CUDA_HEADER_DIR) -I$(CUDA_SHA256_DIR) -I$(CL_HEADER_DIR) -Icommon/ -CUDA_PATH ?= /usr/local/cuda-9.1 +CFLAGS_COMMON:=-DENDIAN_NEUTRAL -DLTC_NO_ASM -I$(CHACHA_IF_DIR) +CFLAGS+=$(CFLAGS_COMMON) -I$(CUDA_HEADER_DIR) -I$(CUDA_SHA256_DIR) -all: $V/$(CHACHA_TEST_BIN) $V/$(ECC_TEST_BIN) $(V)/lib$(LIB).so +#use -DUSE_RDTSC for Windows compilation +CL_CFLAGS_common:=-fPIC -std=c++11 $(CFLAGS_COMMON) -DOPENCL_VARIANT \ + -I$(CL_HEADER_DIR) -Icommon/ \ + -I$(CUDA_DIR)/targets/x86_64-linux/include $(HOST_CFLAGS) +CL_CFLAGS_release:=$(CL_CFLAGS_common) -O3 +CL_CFLAGS_debug:=$(CL_CFLAGS_common) -O0 -g +CL_CFLAGS:=$(CL_CFLAGS_$V) + +all: $(V)/$(CHACHA_TEST_BIN) \ + $(V)/$(ECC_TEST_BIN) \ + $(V)/$(CL_ECC_TEST_BIN) \ + $(V)/lib$(LIB).so \ + $(V)/lib$(CL_LIB).so ECC_DIR:=cuda-ecc-ed25519 @@ -45,7 +59,7 @@ $V/seed.o: $(SEED_SRCS) @mkdir -p $(@D) $(NVCC) -rdc=true $(CFLAGS) -c $< -o $@ -SIGN_SRCS:=$(addprefix $(ECC_DIR)/,sign.cu sha512.h ge.h sc.h fe.cu ../$(CUDA_HEADER_DIR)/gpu_common.h ed25519.h) +SIGN_SRCS:=$(addprefix $(ECC_DIR)/,sign.cu sha512.h ge.h sc.h fe.cu ../common/gpu_common.h ed25519.h) $V/sign.o: $(SIGN_SRCS) @mkdir -p $(@D) $(NVCC) -rdc=true $(CFLAGS) -c $< -o $@ @@ -60,6 +74,18 @@ $V/gpu_ctx.o: $(addprefix $(ECC_DIR)/,gpu_ctx.cu gpu_ctx.h) $(NVCC) -rdc=true $(CFLAGS) -c $< -o $@ CL_ECC_DIR:=opencl-ecc-ed25519 +CL_CRYPT_DIR:=opencl-crypt +CL_POH_VERIFY_DIR:=opencl-poh-verify + +CL_POH_VERIFY_SRCS:=$(CL_POH_VERIFY_DIR)/cl_poh_verify.cpp +$V/cl_poh_verify.o: $(CL_POH_VERIFY_SRCS) + @mkdir -p $(@D) + $(CXX) $(CL_CFLAGS) -I$(ECC_DIR) -c $< -o $@ + +CL_CHACHA_SRCS:=$(CL_CRYPT_DIR)/cl_chacha.cpp +$V/cl_chacha.o: $(CL_CHACHA_SRCS) + @mkdir -p $(@D) + $(CXX) $(CL_CFLAGS) -I$(ECC_DIR) -c $< -o $@ CL_SIGN_SRCS:=$(CL_ECC_DIR)/sign.cpp $(ECC_DIR)/fe.cu $(ECC_DIR)/ed25519.h $V/cl_sign.o: $(CL_SIGN_SRCS) @@ -76,7 +102,7 @@ $V/cl_gpu_ctx.o: $(addprefix $(CL_ECC_DIR)/,gpu_ctx.cpp gpu_ctx.h) $(CXX) $(CL_CFLAGS) -I$(ECC_DIR) -c $< -o $@ CHACHA_DIR:=cuda-crypt -CHACHA_SRCS:=$(addprefix $(CHACHA_DIR)/,chacha_cbc.cu chacha.h common.cu) +CHACHA_SRCS:=$(addprefix $(CHACHA_DIR)/,chacha_cbc.cu common.cu) $(CHACHA_IF_DIR)/chacha.h $V/chacha_cbc.o: $(CHACHA_SRCS) @mkdir -p $(@D) @@ -99,23 +125,17 @@ $V/poh_verify.o: $(POH_SRCS) @mkdir -p $(@D) $(NVCC) -rdc=true $(CFLAGS) -c $< -o $@ -CL_CPU_GPU_OBJS=$(addprefix $V/,cl_init_platform.o cl_verify.o cl_gpu_ctx.o cl_sign.o) +CL_CPU_GPU_OBJS=$(addprefix $V/,cl_init_platform.o cl_verify.o cl_gpu_ctx.o cl_sign.o cl_chacha.o cl_poh_verify.o) -$V/cl_crypt-dlink.o: $(CL_CPU_GPU_OBJS) - ar rvs $@ $^ - -$V/lib$(CL_LIB).a: $V/cl_crypt-dlink.o $(CL_CPU_GPU_OBJS) - ar rcs $@ $^ - -$V/lib$(CL_LIB).so: $V/cl_crypt-dlink.o $(CL_CPU_GPU_OBJS) - $(CXX) -shared --shared $^ -o $@ +$V/lib$(CL_LIB).so: $(CL_CPU_GPU_OBJS) + $(CXX) -shared $^ -lOpenCL -o $@ $V/cl_ecc_main.o: $(CL_ECC_DIR)/main.cpp $(ECC_DIR)/ed25519.h @mkdir -p $(@D) - $(CXX) $(CL_CFLAGS) -I$(ECC_DIR) -c $< -o $@ + $(CXX) $(CL_CFLAGS) -pthread -I$(ECC_DIR) -c $< -o $@ $V/$(CL_ECC_TEST_BIN): $V/cl_ecc_main.o $V/lib$(CL_LIB).so - $(CXX) $(CL_CFLAGS) -Wl,-v -L$(CUDA_PATH)/lib64 -L$V -lpthread $^ -lOpenCL -o $@ + $(CXX) $(CL_CFLAGS) -L$(CUDA_DIR)/lib64 -L$V -pthread $< -l$(CL_LIB) -lOpenCL -o $@ CPU_GPU_OBJS=$(addprefix $V/,chacha_cbc.o aes_cbc.o verify.o poh_verify.o gpu_ctx.o sign.o seed.o keypair.o) diff --git a/src/cuda-headers/gpu_common.h b/src/common/gpu_common.h similarity index 100% rename from src/cuda-headers/gpu_common.h rename to src/common/gpu_common.h diff --git a/src/cuda-crypt/chacha.h b/src/crypt-if/chacha.h similarity index 86% rename from src/cuda-crypt/chacha.h rename to src/crypt-if/chacha.h index cb4b291..806b427 100644 --- a/src/cuda-crypt/chacha.h +++ b/src/crypt-if/chacha.h @@ -13,9 +13,10 @@ extern "C" { #define CHACHA_ROUNDS 500 #define SAMPLE_SIZE 32 -void __host__ __device__ chacha20_ctr_encrypt(const uint8_t *in, uint8_t *out, size_t in_len, - const uint8_t key[CHACHA_KEY_SIZE], const uint8_t nonce[CHACHA_NONCE_SIZE], - uint32_t counter); +void chacha20_ctr_encrypt(const uint8_t *in, uint8_t *out, size_t in_len, + const uint8_t key[CHACHA_KEY_SIZE], + const uint8_t nonce[CHACHA_NONCE_SIZE], + uint32_t counter); void cuda_chacha20_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t in_len, const uint8_t key[CHACHA_KEY_SIZE], uint8_t* ivec); diff --git a/src/cuda-crypt/chacha20_core.cu b/src/cuda-crypt/chacha20_core.cu index 1e4fc28..e8a9577 100644 --- a/src/cuda-crypt/chacha20_core.cu +++ b/src/cuda-crypt/chacha20_core.cu @@ -57,10 +57,11 @@ static void __host__ __device__ chacha20_encrypt(const u32 input[16], } } -void __host__ __device__ chacha20_ctr_encrypt(const uint8_t *in, uint8_t *out, size_t in_len, - const uint8_t key[CHACHA_KEY_SIZE], - const uint8_t nonce[CHACHA_NONCE_SIZE], - uint32_t counter) +void __host__ __device__ +chacha20_ctr_encrypt_device(const uint8_t *in, uint8_t *out, size_t in_len, + const uint8_t key[CHACHA_KEY_SIZE], + const uint8_t nonce[CHACHA_NONCE_SIZE], + uint32_t counter) { uint32_t input[16]; uint8_t buf[64]; @@ -105,4 +106,10 @@ void __host__ __device__ chacha20_ctr_encrypt(const uint8_t *in, uint8_t *out, s } } - +void chacha20_ctr_encrypt(const uint8_t *in, uint8_t *out, size_t in_len, + const uint8_t key[CHACHA_KEY_SIZE], + const uint8_t nonce[CHACHA_NONCE_SIZE], + uint32_t counter) +{ + chacha20_ctr_encrypt_device(in, out, in_len, key, nonce, counter); +} diff --git a/src/cuda-crypt/chacha_cbc.cu b/src/cuda-crypt/chacha_cbc.cu index 1ff4c11..6e96f0b 100644 --- a/src/cuda-crypt/chacha_cbc.cu +++ b/src/cuda-crypt/chacha_cbc.cu @@ -142,7 +142,7 @@ __global__ void chacha_ctr_encrypt_kernel(const unsigned char* input, unsigned c size_t i = (size_t)(blockIdx.x * blockDim.x + threadIdx.x); if (i < num_keys) { - chacha20_ctr_encrypt(input, &output[i * length], length, &keys[i * CHACHA_KEY_SIZE], &nonces[i * CHACHA_NONCE_SIZE], 0); + chacha20_ctr_encrypt_device(input, &output[i * length], length, &keys[i * CHACHA_KEY_SIZE], &nonces[i * CHACHA_NONCE_SIZE], 0); } } diff --git a/src/gpu-common.mk b/src/gpu-common.mk index a29595f..759c124 100644 --- a/src/gpu-common.mk +++ b/src/gpu-common.mk @@ -1,7 +1,8 @@ NVCC:=nvcc GPU_PTX_ARCH:=compute_35 GPU_ARCHS?=sm_37,sm_50,sm_61,sm_70 +HOST_CFLAGS:=-Wall -Werror -fPIC -Wno-strict-aliasing GPU_CFLAGS:=--gpu-code=$(GPU_ARCHS),$(GPU_PTX_ARCH) --gpu-architecture=$(GPU_PTX_ARCH) -CFLAGS_release:=-Icommon --ptxas-options=-v $(GPU_CFLAGS) -O3 -Xcompiler "-Wall -Werror -fPIC -Wno-strict-aliasing" +CFLAGS_release:=-Icommon --ptxas-options=-v $(GPU_CFLAGS) -O3 -Xcompiler "$(HOST_CFLAGS)" CFLAGS_debug:=$(CFLAGS_release) -g CFLAGS:=$(CFLAGS_$V) diff --git a/src/opencl-crypt/cl_chacha.cpp b/src/opencl-crypt/cl_chacha.cpp new file mode 100644 index 0000000..638f563 --- /dev/null +++ b/src/opencl-crypt/cl_chacha.cpp @@ -0,0 +1,34 @@ +#include +#include +#include + +#include "chacha.h" + +void chacha_cbc_encrypt_many_sample(const uint8_t* in, + void* sha_state_arg, + size_t length, + const uint8_t* keys, + uint8_t* ivecs, + uint32_t num_keys, + const uint64_t* samples, + uint32_t num_samples, + uint64_t starting_block_offset, + float* time_us) +{ + fprintf(stderr, "chacha_cbc_encrypt_many_sample not implemented\n"); + exit(1); +} + +void chacha_end_sha_state(const void* sha_state, uint8_t* out, uint32_t num_keys) +{ + fprintf(stderr, "chacha_end_sha_state not implemented\n"); + exit(1); +} + +void chacha_init_sha_state(void* sha_state, uint32_t num_keys) +{ + fprintf(stderr, "chacha_init_sha_state not implemented\n"); + exit(1); +} + + diff --git a/src/opencl-ecc-ed25519/main.cpp b/src/opencl-ecc-ed25519/main.cpp index bc831ac..776b4e3 100644 --- a/src/opencl-ecc-ed25519/main.cpp +++ b/src/opencl-ecc-ed25519/main.cpp @@ -17,6 +17,8 @@ extern void ed25519_free_gpu_mem(); +bool g_verbose = false; + typedef struct { size_t size; uint64_t num_retransmits; diff --git a/src/opencl-ecc-ed25519/sign.cpp b/src/opencl-ecc-ed25519/sign.cpp index 9bc96b5..f9078ef 100644 --- a/src/opencl-ecc-ed25519/sign.cpp +++ b/src/opencl-ecc-ed25519/sign.cpp @@ -92,7 +92,7 @@ void ed25519_sign_many(const gpu_Elems* elems, size_t num_threads_per_block = 64; size_t num_blocks = ROUND_UP_DIV(total_signatures, num_threads_per_block) * num_threads_per_block; - LOG("signing blocks: %d threads_per_block: %d\n", num_blocks, num_threads_per_block); + LOG("signing blocks: %zu threads_per_block: %zu\n", num_blocks, num_threads_per_block); /* __kernel void ed25519_sign_kernel(__global unsigned char* packets, diff --git a/src/opencl-ecc-ed25519/verify.cpp b/src/opencl-ecc-ed25519/verify.cpp index 9c52b51..457fddd 100644 --- a/src/opencl-ecc-ed25519/verify.cpp +++ b/src/opencl-ecc-ed25519/verify.cpp @@ -127,7 +127,6 @@ void ed25519_verify_many(const gpu_Elems* elems, num_elems, total_signatures, total_packets, message_size); size_t out_size = total_signatures * sizeof(uint8_t); - size_t offsets_size = total_signatures * sizeof(uint32_t); uint32_t total_packets_size = total_packets * message_size; @@ -157,7 +156,7 @@ void ed25519_verify_many(const gpu_Elems* elems, size_t num_threads_per_block = 64; size_t num_blocks = ROUND_UP_DIV(total_signatures, num_threads_per_block) * num_threads_per_block; - LOG("num_blocks: %d threads_per_block: %d keys: %d out: %p\n", + LOG("num_blocks: %zu threads_per_block: %zu keys: %d out: %p\n", num_blocks, num_threads_per_block, (int)total_packets, out); CL_ERR( clSetKernelArg(ed25519_verify_kernel, 0, sizeof(cl_mem), (void *)&cur_ctx->packets) ); @@ -192,3 +191,14 @@ const char* ed25519_license() { "Licensed under the Apache License, Version 2.0 " ""; } + +// Supported by the cuda lib, so stub them here. +int cuda_host_register(void* ptr, size_t size, unsigned int flags) +{ + return 0; +} + +int cuda_host_unregister(void* ptr) +{ + return 0; +} diff --git a/src/opencl-platform/cl_init_platform.cpp b/src/opencl-platform/cl_init_platform.cpp index 1a9111f..d10b469 100644 --- a/src/opencl-platform/cl_init_platform.cpp +++ b/src/opencl-platform/cl_init_platform.cpp @@ -200,7 +200,7 @@ bool cl_check_init(cl_uint sel_device_type) { */ bool cl_check_init(void) { - if(cl_is_init == true) { + if (cl_is_init == true) { return true; } else { cout << "OpenCL platform query & init..." << endl; @@ -212,11 +212,10 @@ bool cl_check_init(void) { string kernel_src; cl_device_id device; - cl_platform_id platform; cl_uint platform_num = 0; cl_platform_id* platform_list = NULL; - cl_uint device_num = 0; + cl_uint num_devices = 0; cl_device_id* device_list = NULL; size_t attr_size = 0; @@ -232,8 +231,7 @@ bool cl_check_init(void) { cout << "Platforms found: " << platform_num << endl; /* list all platforms and VENDOR/VERSION properties */ - for(int platf=0; platf +#include +#include +#include + +extern "C" { +int poh_verify_many(uint8_t* hashes, + const uint64_t* num_hashes_arr, + size_t num_elems, + uint8_t use_non_default_stream) +{ + fprintf(stderr, "poh_verify_many not implemented."); + exit(1); + return 0; +} +} +