
General performance and readability optimisation pass #16

Status: Closed · wants to merge 3 commits (showing changes from 2 commits)
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.16)
# Set the project name
project(ichida-algo)

set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -Wall -Wextra")
set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -mfma -funroll-loops -Wall -Wextra")
set(CMAKE_C_STANDARD 99)
set(CMAKE_C_STANDARD_REQUIRED True)
set(CMAKE_VERBOSE_MAKEFILE ON)
10 changes: 5 additions & 5 deletions include/matrix.h
@@ -3,15 +3,15 @@
typedef struct {
int rows;
int cols;
float* data;
float* data __attribute__((aligned(32)));
} matrix;

matrix* new_matrix(int rows, int cols);

void matrix_mul(const matrix* a, const matrix* b, const matrix* result);
void matrix_mul(const float* inputs, const float* weights, float* __restrict__ results, int res_rows, int w_cols);

void matrix_add(matrix* a, const matrix* b);
void matrix_add_inplace_1d(const float* src, float* __restrict__ dest, int rows);

void relu(matrix* a);
void relu(float* dest, int rows);

void softmax(matrix* a);
void softmax(float* dest, int rows);
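matrix.c is not included in this diff excerpt, so the implementations behind the new raw-pointer signatures are not shown. As a rough editor's sketch only (not the PR's actual code, and assuming row-major weight storage with the caller sizing all buffers), implementations consistent with these declarations might look like:

```c
#include <math.h>

// Sketch: results = W * inputs for a column vector, with W stored row-major
// as a (res_rows x w_cols) matrix.
void matrix_mul(const float* inputs, const float* weights, float* __restrict__ results,
                int res_rows, int w_cols) {
    for (int i = 0; i < res_rows; i++) {
        float sum = 0.0f;
        for (int j = 0; j < w_cols; j++) {
            sum += weights[i * w_cols + j] * inputs[j];
        }
        results[i] = sum;
    }
}

// Sketch: dest[i] += src[i] over a column vector of length rows.
void matrix_add_inplace_1d(const float* src, float* __restrict__ dest, int rows) {
    for (int i = 0; i < rows; i++) {
        dest[i] += src[i];
    }
}

// Sketch: element-wise ReLU.
void relu(float* dest, int rows) {
    for (int i = 0; i < rows; i++) {
        if (dest[i] < 0.0f) {
            dest[i] = 0.0f;
        }
    }
}

// Sketch: numerically stable softmax over a column vector.
void softmax(float* dest, int rows) {
    float max = dest[0];
    for (int i = 1; i < rows; i++) {
        if (dest[i] > max) {
            max = dest[i];
        }
    }
    float sum = 0.0f;
    for (int i = 0; i < rows; i++) {
        dest[i] = expf(dest[i] - max);
        sum += dest[i];
    }
    for (int i = 0; i < rows; i++) {
        dest[i] /= sum;
    }
}
```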
22 changes: 22 additions & 0 deletions script/convertbin.py
@@ -0,0 +1,22 @@
import os
import struct

def read_and_convert_file(file_path, output_file):
    with open(file_path, 'r') as file:
        content = file.read().strip()
        float_values = [float(x) for x in content.split(',')]

    with open(output_file, 'wb') as bin_file:
        for value in float_values:
            bin_file.write(struct.pack('f', value))

if __name__ == "__main__":
    directory = "./txttensors"  # Replace with the path to your directory

    for i in range(1, 53):  # Assuming files are named 01out.txt to 52out.txt
        file_name = f"{i:02d}out.txt"
        file_path = os.path.join(directory, file_name)
        output_file = os.path.join(directory, f"{i:02d}out.bin")

        read_and_convert_file(file_path, output_file)
        print(f"Binary file saved to {output_file}")
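The script writes each tensor (225 values for this model) as consecutive native-endian 32-bit floats with no header, which is the layout main.c expects when it freads a tensor into its slot. As a quick sanity check, one of the generated blobs can be read back in C as sketched below; this is an editor's illustration, and the file path is only an example:

```c
#include <stdio.h>
#include <stdlib.h>

#define TENSOR_SIZE 225

// Sketch: read one converted tensor back and print its first few values.
int main(void) {
    float tensor[TENSOR_SIZE];
    FILE* file = fopen("./txttensors/01out.bin", "rb");
    if (file == NULL) {
        perror("fopen");
        return EXIT_FAILURE;
    }
    size_t count = fread(tensor, sizeof(float), TENSOR_SIZE, file);
    fclose(file);
    if (count != TENSOR_SIZE) {
        fprintf(stderr, "expected %d floats, got %zu\n", TENSOR_SIZE, count);
        return EXIT_FAILURE;
    }
    printf("%f %f %f\n", tensor[0], tensor[1], tensor[2]);
    return EXIT_SUCCESS;
}
```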
152 changes: 84 additions & 68 deletions src/main.c
@@ -1,18 +1,25 @@
#include "../include/matrix.h"

#define _DEFAULT_SOURCE // Exposes d_type/DT_REG from <dirent.h> on glibc
#include <dirent.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#define NUM_LAYERS 7
#define TENSOR_SIZE 225

typedef uint8_t u8;
typedef float f32;

matrix* weights[NUM_LAYERS];
matrix* biases[NUM_LAYERS];

char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I', 'i',
'J', 'j', 'K', 'k', 'L', 'l', 'M', 'm', 'N', 'n', 'O', 'o', 'P', 'p', 'Q', 'q', 'R', 'r',
'S', 's', 'T', 't', 'U', 'u', 'V', 'v', 'W', 'w', 'X', 'x', 'Y', 'y', 'Z', 'z'};
static char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I', 'i',
'J', 'j', 'K', 'k', 'L', 'l', 'M', 'm', 'N', 'n', 'O', 'o', 'P', 'p', 'Q', 'q', 'R', 'r',
'S', 's', 'T', 't', 'U', 'u', 'V', 'v', 'W', 'w', 'X', 'x', 'Y', 'y', 'Z', 'z'};

void process_weights_str(char* line, int layer) {
char* token;
@@ -65,45 +72,25 @@ void read_model(const char* file_name) {
fclose(file);
}

void read_tensor(matrix* a, const char* fileName) {
FILE* file = fopen(fileName, "r");
char* line = NULL;
size_t len = 0;

getline(&line, &len, file);
char* token;
float value;
const char* delimiter = ",";
token = strtok(line, delimiter);

for (int i = 0; i < 225; i++) {
value = strtof(token, NULL);
(a->data)[i] = value;
token = strtok(NULL, delimiter);
}
free(line);
fclose(file);
}

void propagate_fwd(const matrix* weights, const matrix* input_layer, matrix* output_layer, const matrix* biases) {
matrix_mul(weights, input_layer, output_layer);
matrix_add(output_layer, biases);
void propagate_fwd(const matrix* weights, const matrix* inputs, matrix* outputs, const matrix* biases) {
matrix_mul(inputs->data, weights->data, outputs->data, outputs->rows, weights->cols);
matrix_add_inplace_1d(biases->data, outputs->data, outputs->rows);
}

// Get result from output layer
int get_max(matrix* a) {
u8 get_max(float* src, int rows) {
int idx = 0;
float res = (a->data)[0];
for (int i = 0; i < a->rows; i++) {
if (res < (a->data)[i]) {
res = (a->data)[i];
float res = src[0];
for (int i = 0; i < rows; i++) {
if (res < src[i]) {
res = src[i];
idx = i;
}
}
return idx;
}

int infer(matrix* input) {
u8 infer(matrix* input) {
matrix* mdl_layers[NUM_LAYERS];
mdl_layers[0] = new_matrix(98, 1);
mdl_layers[1] = new_matrix(65, 1);
@@ -114,36 +101,54 @@ int infer(matrix* input) {
mdl_layers[6] = new_matrix(52, 1);

propagate_fwd(weights[0], input, mdl_layers[0], biases[0]);
relu(mdl_layers[0]);
relu(mdl_layers[0]->data, mdl_layers[0]->rows);

propagate_fwd(weights[1], mdl_layers[0], mdl_layers[1], biases[1]);
relu(mdl_layers[1]);
relu(mdl_layers[1]->data, mdl_layers[1]->rows);

propagate_fwd(weights[2], mdl_layers[1], mdl_layers[2], biases[2]);
relu(mdl_layers[2]);
relu(mdl_layers[2]->data, mdl_layers[2]->rows);

propagate_fwd(weights[3], mdl_layers[2], mdl_layers[3], biases[3]);
relu(mdl_layers[3]);
relu(mdl_layers[3]->data, mdl_layers[3]->rows);

propagate_fwd(weights[4], mdl_layers[3], mdl_layers[4], biases[4]);
relu(mdl_layers[4]);
relu(mdl_layers[4]->data, mdl_layers[4]->rows);

propagate_fwd(weights[5], mdl_layers[4], mdl_layers[5], biases[5]);
relu(mdl_layers[5]);
relu(mdl_layers[5]->data, mdl_layers[5]->rows);

propagate_fwd(weights[6], mdl_layers[5], mdl_layers[6], biases[6]);
softmax(mdl_layers[6]);
softmax(mdl_layers[6]->data, mdl_layers[6]->rows);

return get_max(mdl_layers[6]);
return get_max(mdl_layers[6]->data, mdl_layers[6]->rows);
}

int file_count(const char* dir_path) {
struct dirent* entry;
DIR* dir = opendir(dir_path);

// Count inputs
int num_inputs = 0;
while ((entry = readdir(dir)) != NULL) {
if (entry->d_type == DT_REG)
num_inputs++;
}

closedir(dir);
return num_inputs;
}

int main(int argc, char* argv[]) {
if (argc < 3) {
printf("Not enough arguments.");
printf("Not enough arguments. Usage: speed_cpu <path_to_model.txt> <tensors_dir/>");
return EXIT_FAILURE;
}

// Start timing
struct timeval stop, start;
gettimeofday(&start, NULL);

// TODO: find a way to load static weights and biases
// Load model (ideally this memory would be initialised at compile time to improve speed; a sketch of that idea follows this hunk)
// Dimensions of target model are hardcoded
weights[0] = new_matrix(98, 225);
weights[1] = new_matrix(65, 98);
weights[2] = new_matrix(50, 65);
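One possible direction for the compile-time-loading TODO above, purely an editor's sketch and not part of this PR: generate a C header from the trained weights ahead of time and link the arrays in as constants. The header name, array names, and placeholder contents below are illustrative assumptions.

```c
/* model_weights.h -- hypothetical generated header (e.g. emitted by a script
   similar to convertbin.py). Names and contents are assumptions, not the PR's code. */
#ifndef MODEL_WEIGHTS_H
#define MODEL_WEIGHTS_H

// Layer 0: 98 x 225 weights and 98 biases, row-major. A real generator would
// emit the trained values here instead of the zero placeholders.
static const float layer0_weights[98 * 225] = {0.0f /* , ... generated values */};
static const float layer0_biases[98] = {0.0f /* , ... generated values */};

// ... one pair of arrays per remaining layer ...

#endif
```

main.c could then copy these arrays into the matrix structs (or point at them) instead of parsing the text model at startup.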
@@ -162,48 +167,59 @@ int main(int argc, char* argv[]) {

read_model(argv[1]);

// Run program
// Holding place for input tensors
matrix* input = new_matrix(225, 1);

// ---------------------------------------------------------------------------------- Read from dir

const char* directory_path = argv[2];
struct dirent* entry;
DIR* dir = opendir(directory_path);
int input_count = file_count(directory_path);
printf("Number of input tensors: %d\n", input_count);

matrix* input = new_matrix(225, 1);
// One result slot per input tensor; tensor slots (and results) are 0-indexed, file numbering starts at 1
u8* results = (u8*)malloc(input_count * sizeof(u8));

// Read and process inputs
char* file_name = (char*)malloc((100) * sizeof(char));
char* file_num_str = (char*)malloc((100) * sizeof(char));
__attribute__((aligned(32))) f32* tensors = (f32*)malloc(sizeof(f32) * TENSOR_SIZE * input_count);

int file_num;
int size = 0;
while ((entry = readdir(dir)) != NULL) {
if (entry->d_type == DT_REG) {
size++;
}
}
char* file_path = (char*)malloc((256) * sizeof(char));
char* file_num_str = (char*)malloc((50) * sizeof(char));

int* results = (int*)malloc((size + 1) * sizeof(int));
dir = opendir(directory_path);
// Read all tensors into tensors arr
DIR* dir = opendir(directory_path);
struct dirent* entry;
while ((entry = readdir(dir)) != NULL) {
if (entry->d_type == DT_REG) {
// Get input number
strcpy(file_num_str, entry->d_name);
file_num_str[strlen(entry->d_name) - 7] = '\0';
file_num = atoi(entry->d_name);
strcpy(file_name, directory_path);
strcat(file_name, "/");
strcat(file_name, entry->d_name);
read_tensor(input, file_name);
results[file_num] = infer(input);
int file_num = atoi(entry->d_name);

// Get full path to file
strcpy(file_path, directory_path);
strcat(file_path, "/");
strcat(file_path, entry->d_name);

// Read tensor into full array
FILE* file = fopen(file_path, "rb");

// Offset into correct sector of array
fread(tensors + ((file_num - 1) * TENSOR_SIZE), sizeof(f32), TENSOR_SIZE, file);
Review comment (Collaborator): The current method for reading the input using a binary blob is causing issues with the CI tests. Please create a separate function specifically for reading the input using a binary blob. I recommend creating a corresponding target in the Makefile for this new method of reading input.

Reply (Contributor, author): I would rather change the CI, since we definitely aren't sticking with txt for long.

Reply (Collaborator): The CI works with any inputs located in the ./tensors folder. To ensure compatibility with the CI, we need to decide whether to convert all files in ./tensors to binary blobs.
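For reference, the kind of helper the reviewer asks for could look roughly like the sketch below. This is an editor's illustration only; the function name and error handling are assumptions, and the suggested Makefile target is not shown.

```c
// Sketch: read one TENSOR_SIZE-float binary tensor into its slot in the
// tensors array. Returns 0 on success, -1 on failure.
static int read_tensor_bin(f32* tensors, int file_num, const char* file_path) {
    FILE* file = fopen(file_path, "rb");
    if (file == NULL)
        return -1;
    size_t count = fread(tensors + ((file_num - 1) * TENSOR_SIZE), sizeof(f32), TENSOR_SIZE, file);
    fclose(file);
    return count == TENSOR_SIZE ? 0 : -1;
}
```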

fclose(file);
}
}

free(file_name);
free(file_path);
free(file_num_str);
closedir(dir);

for (int i = 0; i < input_count; i++) {
input->data = tensors + (i * TENSOR_SIZE);
results[i] = infer(input);
}

// Write to csv file
FILE* csv_file = fopen("results.csv", "w+");
fprintf(csv_file, "image_number, guess\n");
for (int i = 1; i <= size; i++) {
for (int i = 0; i < input_count; i++) {
fprintf(csv_file, "%d, %c\n", i, letters[results[i]]);
}
fclose(csv_file);