kachi-group · nhatdongdang · Jul 3, 2024 · Jul 3, 2024 · Jul 3, 2024 · Jul 3, 2024
diff --git a/.gitignore b/.gitignore
@@ -40,3 +40,5 @@ bin/
 tensors/
 results.csv
 tensors/
+perf.data
+perf.data.old
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.16)
 # Set the project name
 project(ichida-algo)
 
-set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -Wall -Wextra")
+set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -fopenmp -Wall -Wextra")
 
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED True)
@@ -23,7 +23,7 @@ include_directories(include)
 add_executable(speed_cpu ${SOURCE_FILES})
 # add_executable(benchmark ${SRC_DIR}/matrix.c ${BENCHMARK_DIR}/benchmark.c)
 
-target_link_libraries(speed_cpu m)
+target_link_libraries(speed_cpu m pthread)
 # target_link_libraries(benchmark m)
 
 

diff --git a/Makefile b/Makefile
@@ -1,6 +1,6 @@
 .PHONY: all test clean run build run_test
 
-all: build
+all: rebuild
 
 clean:
 	rm -f test/results.csv
@@ -12,6 +12,10 @@ build: clean
 	cmake -Bbuild
 	$(MAKE) -C ./build
 	mv ./build/speed_cpu ./
+
+rebuild:
+	$(MAKE) -C ./build
+	mv ./build/speed_cpu ./
 
 run: build
 	./speed_demo_cpu.sh ./weights_and_biases.txt ./tensors
@@ -20,7 +24,7 @@ run_test: build
 	./speed_cpu ./weights_and_biases.txt ./tensors
 
 test: build
-	./speed_demo_cpu.sh ./weights_and_biases.txt ./tensors
+	./speed_cpu ./weights_and_biases.txt ./tensors 1
 	mv ./results.csv ./test
 	python3 ./test/verify_csv.py
 

diff --git a/speed_demo_cpu.sh b/speed_demo_cpu.sh
@@ -16,7 +16,7 @@ if [ ! -f "$binary" ]; then
 fi
 
 start_time=$(date +%s)
-./$binary "$weights_and_biases" "$input_tensor_dir"
+./$binary "$weights_and_biases" "$input_tensor_dir" 1
 
 end_time=$(date +%s)
 execution_time=$((end_time - start_time))

diff --git a/src/main.c b/src/main.c
@@ -3,10 +3,12 @@
 #include "util.h"
 #include <dirent.h>
 #include <inttypes.h>
+#include <omp.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
+#include <unistd.h>
 
 typedef float f32;
 typedef unsigned char u8;
@@ -29,6 +31,7 @@ void propagate_fwd(const matrix* weights, const vector* inputs, vector* results,
     vector_add_inplace(results->len, biases->data, results->data);
 }
 
+// Basic version, too many aligned_alloc
 u8 infer(vector* input) {
     vector* outputs[NUM_LAYERS];
     outputs[0] = new_vec_aligned(98);
@@ -54,7 +57,7 @@ u8 infer(vector* input) {
     propagate_fwd(weights[6], outputs[5], outputs[6], biases[6]);
     softmax_inplace(outputs[6]->data, 52);
 
-    u8 pred = get_max(outputs[6]);
+    u8 pred = getv_max_i(outputs[6]->data, 52);
 
     free(outputs[0]->data);
     free(outputs[0]);
@@ -74,7 +77,9 @@ u8 infer(vector* input) {
     return pred;
 }
 
-u8 infer_reuse_layers(vector* input) {
+// Somewhat experimental, minumum number of alligned_alloc without breaking things.
+// This code fucking sucks but its fast so uhhhh
+u8 infer_reuse_layers_thread(vector* input, matrix** weights, vector** biases) {
     vector* outputs[NUM_LAYERS];
     outputs[0] = new_vec_aligned(98);
     outputs[1] = new_vec_aligned(65);
@@ -114,77 +119,24 @@ u8 infer_reuse_layers(vector* input) {
     propagate_fwd(weights[6], outputs[1], outputs[0], biases[6]);
     softmax_inplace(outputs[0]->data, 52);
 
-    u8 pred = get_max(outputs[0]);
+    u8 prediction = getv_max_i(outputs[0]->data, 52);
 
     free(outputs[0]->data);
     free(outputs[0]);
     free(outputs[1]->data);
     free(outputs[1]);
 
-    return pred;
-}
-
-u8 infer_reuse_input(vector* input) {
-    vector* outputs[NUM_LAYERS];
-    outputs[0] = new_vec_aligned(98);
-
-    propagate_fwd(weights[0], input, outputs[0], biases[0]);
-    relu_inplace(outputs[0]->data, 98);
-
-    input->len = 65;
-    memset(input->data, 0, 65 * sizeof(f32));
-
-    propagate_fwd(weights[1], outputs[0], input, biases[1]);
-    relu_inplace(input->data, 65);
-
-    outputs[0]->len = 50;
-    memset(outputs[0]->data, 0, 50 * sizeof(f32));
-
-    propagate_fwd(weights[2], input, outputs[0], biases[2]);
-    relu_inplace(outputs[0]->data, 50);
-
-    input->len = 30;
-    memset(input->data, 0, 30 * sizeof(f32));
-
-    propagate_fwd(weights[3], outputs[0], input, biases[3]);
-    relu_inplace(input->data, 30);
-
-    outputs[0]->len = 25;
-    memset(outputs[0]->data, 0, 25 * sizeof(f32));
-
-    propagate_fwd(weights[4], input, outputs[0], biases[4]);
-    relu_inplace(outputs[0]->data, 25);
-
-    input->len = 40;
-    memset(input->data, 0, 40 * sizeof(f32));
-
-    propagate_fwd(weights[5], outputs[0], input, biases[5]);
-    relu_inplace(input->data, 40);
-
-    outputs[0]->len = 52;
-    memset(outputs[0]->data, 0, 52 * sizeof(f32));
-
-    propagate_fwd(weights[6], input, outputs[0], biases[6]);
-    softmax_inplace(outputs[0]->data, 52);
-
-    u8 pred = get_max(outputs[0]);
-
-    free(outputs[0]->data);
-    free(outputs[0]);
-
-    input->len = 225;
-
-    return pred;
+    return prediction;
 }
 
 int main(int argc, char* argv[]) {
-    if (argc < 3) {
-        printf("Not enough arguments. Usage: speed_cpu <path_to_model.txt> <tensors_dir/>");
+    if (argc < 4) {
+        printf("Not enough arguments. Usage: speed_cpu <path_to_model.txt> <tensors_dir/> <number_of_inferences>\n");
         return EXIT_FAILURE;
     }
 
     // Start timing
-    struct timeval stop, start;
+    struct timeval stop, start, preinf;
     gettimeofday(&start, NULL);
 
     // Dimensions of target model are hardcoded
@@ -204,20 +156,17 @@ int main(int argc, char* argv[]) {
     biases[5] = new_vec_aligned(40);
     biases[6] = new_vec_aligned(52);
 
-    vector* input = new_vec_aligned(TENSOR_SIZE);
-
     read_model(weights, biases, argv[1]);
 
     // Transpose weights to column major
     for (int i = 0; i < NUM_LAYERS; i++)
         transpose_mat_inplace(weights[i]);
 
+    // Set up preliminary counts and data
     const char* directory_path = argv[2];
     int input_count = file_count(directory_path);
     printf("Number of input tensors: %d\n", input_count);
 
-    // +1 because file idx starts at 1
-    u8* results = (u8*)malloc(input_count * sizeof(u8));
     f32* tensors = (f32*)aligned_alloc(SIMD_ALGN, TSIZE_ALGN_BYTES * input_count);
 
     // Read and process inputs
@@ -241,11 +190,47 @@ int main(int argc, char* argv[]) {
     free(file_path);
     free(file_num_str);
 
-    // Run inference
+    // Time pre-inference
+    gettimeofday(&preinf, NULL);
+    printf("Pre inference (model read, tensor read, transpose) took %lu us\n",
+           (preinf.tv_sec - start.tv_sec) * 1000000 + preinf.tv_usec - start.tv_usec);
+
+    int iter_per_in = atoi(argv[3]);
+    // int NUM_THREADS = sysconf(_SC_NPROCESSORS_ONLN);
+
+    if (iter_per_in > 1)
+#pragma omp parallel
+    {
+        int force = 0;
+        u8* results_local = (u8*)malloc(input_count * sizeof(u8));
+
+        for (int i = 0; i < input_count; i++) {
+            // printf("Thread %d: Processing input %d\n", omp_get_thread_num(), i);
+
+            vector* input = new_vec_aligned(TSIZE_ALGN_BYTES / sizeof(f32));
+            memcpy(input->data, (f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * i], TSIZE_ALGN_BYTES);
+
+#pragma omp for
+            for (int j = 0; j < iter_per_in - 1; j++) {
+                // Using global memory for model seems to be faster
+                results_local[i] = infer_reuse_layers_thread(input, weights, biases);
+                force += results_local[i];
+            }
+
+            free(input->data);
+            free(input);
+        }
+
+        free(results_local);
+        printf("Thread %d: %d\n", omp_get_thread_num(), force);
+    }
+
+    // Output for csv
+    vector* input = new_vec_aligned(TENSOR_SIZE);
+    u8* results = (u8*)malloc(input_count * sizeof(u8));
     for (int i = 0; i < input_count; i++) {
         input->data = (f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * i];
-        // for (int i = 0; i < 100000; i++)
-        results[i] = infer_reuse_layers(input);
+        results[i] = infer_reuse_layers_thread(input, weights, biases);
     }
 
     // Write to csv file
@@ -258,7 +243,7 @@ int main(int argc, char* argv[]) {
 
     // Time taken
     gettimeofday(&stop, NULL);
-    printf("took %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
+    printf("Full run took %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
 
     return EXIT_SUCCESS;
 }
diff --git a/src/matrix.c b/src/matrix.c
@@ -38,7 +38,7 @@ vector* new_vec_aligned(int len) {
     return new_vec;
 }
 
-// Ver. Artemis Rosman
+// ver. Artemis Rosman simd_intrin 2x8
 static void kernel(const float* in, const float* wg, float* rs, int start_row, int start_col, int w_width) {
     // printf("Kernel at row %d col %d\n", start_row, start_col);
     __m256 res = _mm256_load_ps(&rs[start_col]);
@@ -76,10 +76,19 @@ void relu_inplace(f32* dest, int len) {
     }
 }
 
+// Hacky but fast and accurate for existing inputs
+static double fastexp(double x) {
+    i64 tmp = (i64)(1512775 * x + 1072632447);
+    tmp <<= 32;
+    double result;
+    memcpy(&result, &tmp, sizeof(result));
+    return result;
+}
+
 void softmax_inplace(f32* dest, int len) {
     float res = 0.0f;
     for (int i = 0; i < len; i++) {
-        res += exp(dest[i]);
+        res += fastexp(dest[i]);
     }
     for (int i = 0; i < len; i++) {
         dest[i] /= res;
@@ -111,12 +120,12 @@ void transpose_mat_inplace(matrix* in) {
 }
 
 // Get result from output layer
-u8 get_max(vector* a) {
+u8 getv_max_i(f32* in, int len) {
     int idx = 0;
-    float res = (a->data)[0];
-    for (int i = 0; i < a->len; i++) {
-        if (res < (a->data)[i]) {
-            res = (a->data)[i];
+    float res = in[0];
+    for (int i = 0; i < len; i++) {
+        if (res < in[i]) {
+            res = in[i];
             idx = i;
         }
     }

diff --git a/src/matrix.h b/src/matrix.h
@@ -2,9 +2,10 @@
 
 typedef float f32;
 typedef unsigned char u8;
+typedef signed long i64;
 
 #define KERN_COLS 8
-#define KERN_ROWS 4
+#define KERN_ROWS 2
 #define SIMD_ALGN 64
 
 typedef struct vector {
@@ -33,4 +34,4 @@ void softmax_inplace(f32* dest, int len);
 
 void transpose_mat_inplace(matrix* in);
 
-u8 get_max(vector* a);
+u8 getv_max_i(f32* a, int len);