From 943f51a34dffdd5f2c87aced503ea32af519a21f Mon Sep 17 00:00:00 2001
From: Stephen Akridge <sakridge@gmail.com>
Date: Sun, 22 Dec 2019 10:15:06 -0800
Subject: [PATCH] Build CL by default and include it in the release package.

Stub out unimplemented interfaces.
---
 Makefile                                  |  1 +
 src/Makefile                              | 56 +++++++++++++++--------
 src/{cuda-headers => common}/gpu_common.h |  0
 src/{cuda-crypt => crypt-if}/chacha.h     |  7 +--
 src/cuda-crypt/chacha20_core.cu           | 17 +++++--
 src/cuda-crypt/chacha_cbc.cu              |  2 +-
 src/gpu-common.mk                         |  3 +-
 src/opencl-crypt/cl_chacha.cpp            | 34 ++++++++++++++
 src/opencl-ecc-ed25519/main.cpp           |  2 +
 src/opencl-ecc-ed25519/sign.cpp           |  2 +-
 src/opencl-ecc-ed25519/verify.cpp         | 14 +++++-
 src/opencl-platform/cl_init_platform.cpp  | 30 ++++++------
 src/opencl-poh-verify/cl_poh_verify.cpp   | 17 +++++++
 13 files changed, 139 insertions(+), 46 deletions(-)
 rename src/{cuda-headers => common}/gpu_common.h (100%)
 rename src/{cuda-crypt => crypt-if}/chacha.h (86%)
 create mode 100644 src/opencl-crypt/cl_chacha.cpp
 create mode 100644 src/opencl-poh-verify/cl_poh_verify.cpp

diff --git a/Makefile b/Makefile
index bd4551b..f26b846 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,7 @@ install:
 	mkdir -p $(DESTDIR)
 ifneq ($(OS),Darwin)
 	cp -f src/$(V)/libcuda-crypt.so $(DESTDIR)
+	cp -f src/$(V)/libcl-crypt.so $(DESTDIR)
 endif
 	ls -lh $(DESTDIR)
 
diff --git a/src/Makefile b/src/Makefile
index 6df37f6..9408053 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -19,19 +19,33 @@ LIB=cuda-crypt
 CL_ECC_TEST_BIN=cl_ed25519_verify
 CL_LIB=cl-crypt
 
+CHACHA_IF_DIR:=crypt-if
+
 CL_HEADER_DIR:=opencl-platform
 
 CUDA_HEADER_DIR:=cuda-headers
 CUDA_SHA256_DIR:=cuda-sha256
 
+CUDA_DIR ?= /usr/local/cuda
+
 CXX ?= g++
-CFLAGS+=-DENDIAN_NEUTRAL -DLTC_NO_ASM -I$(CUDA_HEADER_DIR) -I$(CUDA_SHA256_DIR)
-#use -DUSE_RDTSC for Windows compilation
-CL_CFLAGS=-fPIC -std=c++11 -DENDIAN_NEUTRAL -DOPENCL_VARIANT -DLTC_NO_ASM -I$(CUDA_HEADER_DIR) -I$(CUDA_SHA256_DIR) -I$(CL_HEADER_DIR) -Icommon/
 
-CUDA_PATH ?= /usr/local/cuda-9.1
+CFLAGS_COMMON:=-DENDIAN_NEUTRAL -DLTC_NO_ASM -I$(CHACHA_IF_DIR)
+CFLAGS+=$(CFLAGS_COMMON) -I$(CUDA_HEADER_DIR) -I$(CUDA_SHA256_DIR)
 
-all: $V/$(CHACHA_TEST_BIN) $V/$(ECC_TEST_BIN) $(V)/lib$(LIB).so
+#use -DUSE_RDTSC for Windows compilation
+CL_CFLAGS_common:=-fPIC -std=c++11 $(CFLAGS_COMMON) -DOPENCL_VARIANT \
+		  -I$(CL_HEADER_DIR) -Icommon/ \
+		  -I$(CUDA_DIR)/targets/x86_64-linux/include $(HOST_CFLAGS)
+CL_CFLAGS_release:=$(CL_CFLAGS_common) -O3
+CL_CFLAGS_debug:=$(CL_CFLAGS_common) -O0 -g
+CL_CFLAGS:=$(CL_CFLAGS_$V)
+
+all: $(V)/$(CHACHA_TEST_BIN) \
+     $(V)/$(ECC_TEST_BIN) \
+     $(V)/$(CL_ECC_TEST_BIN) \
+     $(V)/lib$(LIB).so \
+     $(V)/lib$(CL_LIB).so
 
 ECC_DIR:=cuda-ecc-ed25519
 
@@ -45,7 +59,7 @@ $V/seed.o: $(SEED_SRCS)
 	@mkdir -p $(@D)
 	$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
 
-SIGN_SRCS:=$(addprefix $(ECC_DIR)/,sign.cu sha512.h ge.h sc.h fe.cu ../$(CUDA_HEADER_DIR)/gpu_common.h ed25519.h)
+SIGN_SRCS:=$(addprefix $(ECC_DIR)/,sign.cu sha512.h ge.h sc.h fe.cu ../common/gpu_common.h ed25519.h)
 $V/sign.o: $(SIGN_SRCS)
 	@mkdir -p $(@D)
 	$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
@@ -60,6 +74,18 @@ $V/gpu_ctx.o: $(addprefix $(ECC_DIR)/,gpu_ctx.cu gpu_ctx.h)
 	$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
 
 CL_ECC_DIR:=opencl-ecc-ed25519
+CL_CRYPT_DIR:=opencl-crypt
+CL_POH_VERIFY_DIR:=opencl-poh-verify
+
+CL_POH_VERIFY_SRCS:=$(CL_POH_VERIFY_DIR)/cl_poh_verify.cpp
+$V/cl_poh_verify.o: $(CL_POH_VERIFY_SRCS)
+	@mkdir -p $(@D)
+	$(CXX) $(CL_CFLAGS) -I$(ECC_DIR) -c $< -o $@
+
+CL_CHACHA_SRCS:=$(CL_CRYPT_DIR)/cl_chacha.cpp
+$V/cl_chacha.o: $(CL_CHACHA_SRCS)
+	@mkdir -p $(@D)
+	$(CXX) $(CL_CFLAGS) -I$(ECC_DIR) -c $< -o $@
 
 CL_SIGN_SRCS:=$(CL_ECC_DIR)/sign.cpp $(ECC_DIR)/fe.cu $(ECC_DIR)/ed25519.h
 $V/cl_sign.o: $(CL_SIGN_SRCS)
@@ -76,7 +102,7 @@ $V/cl_gpu_ctx.o: $(addprefix $(CL_ECC_DIR)/,gpu_ctx.cpp gpu_ctx.h)
 	$(CXX) $(CL_CFLAGS) -I$(ECC_DIR) -c $< -o $@
 	
 CHACHA_DIR:=cuda-crypt
-CHACHA_SRCS:=$(addprefix $(CHACHA_DIR)/,chacha_cbc.cu chacha.h common.cu)
+CHACHA_SRCS:=$(addprefix $(CHACHA_DIR)/,chacha_cbc.cu common.cu) $(CHACHA_IF_DIR)/chacha.h
 
 $V/chacha_cbc.o: $(CHACHA_SRCS)
 	@mkdir -p $(@D)
@@ -99,23 +125,17 @@ $V/poh_verify.o: $(POH_SRCS)
 	@mkdir -p $(@D)
 	$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
 
-CL_CPU_GPU_OBJS=$(addprefix $V/,cl_init_platform.o cl_verify.o cl_gpu_ctx.o cl_sign.o)
+CL_CPU_GPU_OBJS=$(addprefix $V/,cl_init_platform.o cl_verify.o cl_gpu_ctx.o cl_sign.o cl_chacha.o cl_poh_verify.o)
 
-$V/cl_crypt-dlink.o: $(CL_CPU_GPU_OBJS)
-	ar rvs $@ $^
-
-$V/lib$(CL_LIB).a: $V/cl_crypt-dlink.o $(CL_CPU_GPU_OBJS)
-	ar rcs $@ $^
-
-$V/lib$(CL_LIB).so: $V/cl_crypt-dlink.o $(CL_CPU_GPU_OBJS)
-	$(CXX) -shared --shared $^ -o $@
+$V/lib$(CL_LIB).so: $(CL_CPU_GPU_OBJS)
+	$(CXX) -shared $^ -lOpenCL -o $@
 
 $V/cl_ecc_main.o: $(CL_ECC_DIR)/main.cpp $(ECC_DIR)/ed25519.h
 	@mkdir -p $(@D)
-	$(CXX) $(CL_CFLAGS) -I$(ECC_DIR) -c $< -o $@
+	$(CXX) $(CL_CFLAGS) -pthread -I$(ECC_DIR) -c $< -o $@
 
 $V/$(CL_ECC_TEST_BIN): $V/cl_ecc_main.o $V/lib$(CL_LIB).so
-	$(CXX) $(CL_CFLAGS) -Wl,-v -L$(CUDA_PATH)/lib64 -L$V -lpthread $^ -lOpenCL -o $@
+	$(CXX) $(CL_CFLAGS) -L$(CUDA_DIR)/lib64 -L$V -pthread $< -l$(CL_LIB) -lOpenCL -o $@
 
 CPU_GPU_OBJS=$(addprefix $V/,chacha_cbc.o aes_cbc.o verify.o poh_verify.o gpu_ctx.o sign.o seed.o keypair.o)
 
diff --git a/src/cuda-headers/gpu_common.h b/src/common/gpu_common.h
similarity index 100%
rename from src/cuda-headers/gpu_common.h
rename to src/common/gpu_common.h
diff --git a/src/cuda-crypt/chacha.h b/src/crypt-if/chacha.h
similarity index 86%
rename from src/cuda-crypt/chacha.h
rename to src/crypt-if/chacha.h
index cb4b291..806b427 100644
--- a/src/cuda-crypt/chacha.h
+++ b/src/crypt-if/chacha.h
@@ -13,9 +13,10 @@ extern "C" {
 #define CHACHA_ROUNDS 500
 #define SAMPLE_SIZE 32
 
-void __host__ __device__ chacha20_ctr_encrypt(const uint8_t *in, uint8_t *out, size_t in_len,
-                                              const uint8_t key[CHACHA_KEY_SIZE], const uint8_t nonce[CHACHA_NONCE_SIZE],
-                                              uint32_t counter);
+void chacha20_ctr_encrypt(const uint8_t *in, uint8_t *out, size_t in_len,
+                          const uint8_t key[CHACHA_KEY_SIZE],
+                          const uint8_t nonce[CHACHA_NONCE_SIZE],
+                          uint32_t counter);
 
 void cuda_chacha20_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t in_len,
                                const uint8_t key[CHACHA_KEY_SIZE], uint8_t* ivec);
diff --git a/src/cuda-crypt/chacha20_core.cu b/src/cuda-crypt/chacha20_core.cu
index 1e4fc28..e8a9577 100644
--- a/src/cuda-crypt/chacha20_core.cu
+++ b/src/cuda-crypt/chacha20_core.cu
@@ -57,10 +57,11 @@ static void __host__ __device__ chacha20_encrypt(const u32 input[16],
     }
 }
 
-void __host__ __device__ chacha20_ctr_encrypt(const uint8_t *in, uint8_t *out, size_t in_len,
-                                              const uint8_t key[CHACHA_KEY_SIZE],
-                                              const uint8_t nonce[CHACHA_NONCE_SIZE],
-                                              uint32_t counter)
+void __host__ __device__
+chacha20_ctr_encrypt_device(const uint8_t *in, uint8_t *out, size_t in_len,
+                            const uint8_t key[CHACHA_KEY_SIZE],
+                            const uint8_t nonce[CHACHA_NONCE_SIZE],
+                            uint32_t counter)
 {
   uint32_t input[16];
   uint8_t buf[64];
@@ -105,4 +106,10 @@ void __host__ __device__ chacha20_ctr_encrypt(const uint8_t *in, uint8_t *out, s
   }
 }
 
-
+void chacha20_ctr_encrypt(const uint8_t *in, uint8_t *out, size_t in_len,
+                          const uint8_t key[CHACHA_KEY_SIZE],
+                          const uint8_t nonce[CHACHA_NONCE_SIZE],
+                          uint32_t counter)
+{
+    chacha20_ctr_encrypt_device(in, out, in_len, key, nonce, counter);
+}
diff --git a/src/cuda-crypt/chacha_cbc.cu b/src/cuda-crypt/chacha_cbc.cu
index 1ff4c11..6e96f0b 100644
--- a/src/cuda-crypt/chacha_cbc.cu
+++ b/src/cuda-crypt/chacha_cbc.cu
@@ -142,7 +142,7 @@ __global__ void chacha_ctr_encrypt_kernel(const unsigned char* input, unsigned c
     size_t i = (size_t)(blockIdx.x * blockDim.x + threadIdx.x);
 
     if (i < num_keys) {
-        chacha20_ctr_encrypt(input, &output[i * length], length, &keys[i * CHACHA_KEY_SIZE], &nonces[i * CHACHA_NONCE_SIZE], 0);
+        chacha20_ctr_encrypt_device(input, &output[i * length], length, &keys[i * CHACHA_KEY_SIZE], &nonces[i * CHACHA_NONCE_SIZE], 0);
     }
 }
 
diff --git a/src/gpu-common.mk b/src/gpu-common.mk
index a29595f..759c124 100644
--- a/src/gpu-common.mk
+++ b/src/gpu-common.mk
@@ -1,7 +1,8 @@
 NVCC:=nvcc
 GPU_PTX_ARCH:=compute_35
 GPU_ARCHS?=sm_37,sm_50,sm_61,sm_70
+HOST_CFLAGS:=-Wall -Werror -fPIC -Wno-strict-aliasing
 GPU_CFLAGS:=--gpu-code=$(GPU_ARCHS),$(GPU_PTX_ARCH) --gpu-architecture=$(GPU_PTX_ARCH)
-CFLAGS_release:=-Icommon --ptxas-options=-v $(GPU_CFLAGS) -O3 -Xcompiler "-Wall -Werror -fPIC -Wno-strict-aliasing"
+CFLAGS_release:=-Icommon --ptxas-options=-v $(GPU_CFLAGS) -O3 -Xcompiler "$(HOST_CFLAGS)"
 CFLAGS_debug:=$(CFLAGS_release) -g
 CFLAGS:=$(CFLAGS_$V)
diff --git a/src/opencl-crypt/cl_chacha.cpp b/src/opencl-crypt/cl_chacha.cpp
new file mode 100644
index 0000000..638f563
--- /dev/null
+++ b/src/opencl-crypt/cl_chacha.cpp
@@ -0,0 +1,34 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "chacha.h"
+
+void chacha_cbc_encrypt_many_sample(const uint8_t* in,
+                                    void* sha_state_arg,
+                                    size_t length,
+                                    const uint8_t* keys,
+                                    uint8_t* ivecs,
+                                    uint32_t num_keys,
+                                    const uint64_t* samples,
+                                    uint32_t num_samples,
+                                    uint64_t starting_block_offset,
+                                    float* time_us)
+{
+    fprintf(stderr, "chacha_cbc_encrypt_many_sample not implemented\n");
+    exit(1);
+}
+
+void chacha_end_sha_state(const void* sha_state, uint8_t* out, uint32_t num_keys)
+{
+    fprintf(stderr, "chacha_end_sha_state not implemented\n");
+    exit(1);
+}
+
+void chacha_init_sha_state(void* sha_state, uint32_t num_keys)
+{
+    fprintf(stderr, "chacha_init_sha_state not implemented\n");
+    exit(1);
+}
+
+
diff --git a/src/opencl-ecc-ed25519/main.cpp b/src/opencl-ecc-ed25519/main.cpp
index bc831ac..776b4e3 100644
--- a/src/opencl-ecc-ed25519/main.cpp
+++ b/src/opencl-ecc-ed25519/main.cpp
@@ -17,6 +17,8 @@
 
 extern void ed25519_free_gpu_mem();
 
+bool g_verbose = false;
+
 typedef struct {
     size_t size;
     uint64_t num_retransmits;
diff --git a/src/opencl-ecc-ed25519/sign.cpp b/src/opencl-ecc-ed25519/sign.cpp
index 9bc96b5..f9078ef 100644
--- a/src/opencl-ecc-ed25519/sign.cpp
+++ b/src/opencl-ecc-ed25519/sign.cpp
@@ -92,7 +92,7 @@ void ed25519_sign_many(const gpu_Elems* elems,
 
     size_t num_threads_per_block = 64;
     size_t num_blocks = ROUND_UP_DIV(total_signatures, num_threads_per_block) * num_threads_per_block;
-    LOG("signing blocks: %d threads_per_block: %d\n", num_blocks, num_threads_per_block);
+    LOG("signing blocks: %zu threads_per_block: %zu\n", num_blocks, num_threads_per_block);
 	
     /*
 	__kernel void ed25519_sign_kernel(__global unsigned char* packets,
diff --git a/src/opencl-ecc-ed25519/verify.cpp b/src/opencl-ecc-ed25519/verify.cpp
index 9c52b51..457fddd 100644
--- a/src/opencl-ecc-ed25519/verify.cpp
+++ b/src/opencl-ecc-ed25519/verify.cpp
@@ -127,7 +127,6 @@ void ed25519_verify_many(const gpu_Elems* elems,
         num_elems, total_signatures, total_packets, message_size);
 
     size_t out_size = total_signatures * sizeof(uint8_t);
-    size_t offsets_size = total_signatures * sizeof(uint32_t);
 
     uint32_t total_packets_size = total_packets * message_size;
 
@@ -157,7 +156,7 @@ void ed25519_verify_many(const gpu_Elems* elems,
 
     size_t num_threads_per_block = 64;
     size_t num_blocks = ROUND_UP_DIV(total_signatures, num_threads_per_block) * num_threads_per_block;
-    LOG("num_blocks: %d threads_per_block: %d keys: %d out: %p\n",
+    LOG("num_blocks: %zu threads_per_block: %zu keys: %d out: %p\n",
            num_blocks, num_threads_per_block, (int)total_packets, out);                     
                              
     CL_ERR( clSetKernelArg(ed25519_verify_kernel, 0, sizeof(cl_mem), (void *)&cur_ctx->packets) );
@@ -192,3 +191,14 @@ const char* ed25519_license() {
           "Licensed under the Apache License, Version 2.0 "
           "<http://www.apache.org/licenses/LICENSE-2.0>";
 }
+
+// Supported by the cuda lib, so stub them here.
+int cuda_host_register(void* ptr, size_t size, unsigned int flags)
+{
+    return 0;
+}
+
+int cuda_host_unregister(void* ptr)
+{
+    return 0;
+}
diff --git a/src/opencl-platform/cl_init_platform.cpp b/src/opencl-platform/cl_init_platform.cpp
index 1a9111f..d10b469 100644
--- a/src/opencl-platform/cl_init_platform.cpp
+++ b/src/opencl-platform/cl_init_platform.cpp
@@ -200,7 +200,7 @@ bool cl_check_init(cl_uint sel_device_type) {
 */
 bool cl_check_init(void) {
 
-    if(cl_is_init == true) {
+    if (cl_is_init == true) {
         return true;
     } else {
         cout << "OpenCL platform query & init..." << endl;
@@ -212,11 +212,10 @@ bool cl_check_init(void) {
     string kernel_src;
 
     cl_device_id device;
-    cl_platform_id platform;
     cl_uint platform_num = 0;
     cl_platform_id* platform_list = NULL;
 
-    cl_uint device_num = 0;
+    cl_uint num_devices = 0;
     cl_device_id* device_list = NULL;
 
     size_t attr_size = 0;
@@ -232,8 +231,7 @@ bool cl_check_init(void) {
     cout << "Platforms found: " << platform_num << endl;
 
     /* list all platforms and VENDOR/VERSION properties */
-    for(int platf=0; platf<platform_num; platf++)
-    {
+    for (cl_uint platf = 0; platf < platform_num; platf++) {
         /* get attribute CL_PLATFORM_VENDOR */
         CL_ERR( clGetPlatformInfo(platform_list[platf],
                 CL_PLATFORM_VENDOR, 0, NULL, &attr_size));
@@ -248,33 +246,36 @@ bool cl_check_init(void) {
 
         /* get attribute size CL_PLATFORM_VERSION */
         CL_ERR( clGetPlatformInfo(platform_list[platf],
-                CL_PLATFORM_VERSION, 0, NULL, &attr_size));
+                                  CL_PLATFORM_VERSION,
+                                  0, NULL, &attr_size));
         attr_data = new char[attr_size];
         DIE(attr_data == NULL, "alloc attr_data");
 
         /* get data size CL_PLATFORM_VERSION */
         CL_ERR( clGetPlatformInfo(platform_list[platf],
-                CL_PLATFORM_VERSION, attr_size, attr_data, NULL));
+                                  CL_PLATFORM_VERSION,
+                                  attr_size, attr_data, NULL));
         cout << attr_data << endl;
         delete[] attr_data;
 
         /* get num of available OpenCL devices type ALL on the selected platform */
-        if(clGetDeviceIDs(platform_list[platf], 
-            query_device_type, 0, NULL, &device_num) != CL_SUCCESS) {
-            device_num = 0;
+        if (clGetDeviceIDs(platform_list[platf],
+                           query_device_type, 0,
+                           NULL, &num_devices) != CL_SUCCESS) {
+            num_devices = 0;
             continue;
         }
 
-        device_list = new cl_device_id[device_num];
+        device_list = new cl_device_id[num_devices];
         DIE(device_list == NULL, "alloc devices");
 
         /* get all available OpenCL devices type ALL on the selected platform */
         CL_ERR( clGetDeviceIDs(platform_list[platf], query_device_type,
-            device_num, device_list, NULL));
-        cout << "\tDevices found " << device_num  << endl;
+            num_devices, device_list, NULL));
+        cout << "\tDevices found " << num_devices  << endl;
 
         /* list all devices and TYPE/VERSION properties */
-        for(int dev=0; dev<device_num; dev++)
+        for(cl_uint dev=0; dev < num_devices; dev++)
         {
             /* get attribute size */
             CL_ERR( clGetDeviceInfo(device_list[dev], CL_DEVICE_NAME,
@@ -291,7 +292,6 @@ bool cl_check_init(void) {
             string tmpAttrData = attr_data;
             
             // always select last device of type GPU
-            platform = platform_list[platf];
             device = device_list[dev];
 
             delete[] attr_data;
diff --git a/src/opencl-poh-verify/cl_poh_verify.cpp b/src/opencl-poh-verify/cl_poh_verify.cpp
new file mode 100644
index 0000000..f556f51
--- /dev/null
+++ b/src/opencl-poh-verify/cl_poh_verify.cpp
@@ -0,0 +1,17 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+extern "C" {
+int poh_verify_many(uint8_t* hashes,
+                    const uint64_t* num_hashes_arr,
+                    size_t num_elems,
+                    uint8_t use_non_default_stream)
+{
+    fprintf(stderr, "poh_verify_many not implemented.");
+    exit(1);
+    return 0;
+}
+}
+