Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Second performance pass and multithreading #23

Merged
merged 7 commits into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,5 @@ bin/
tensors/
results.csv
tensors/
perf.data
perf.data.old
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.16)
# Set the project name
project(ichida-algo)

set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -Wall -Wextra")
set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -fopenmp -Wall -Wextra")

set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED True)
Expand All @@ -23,7 +23,7 @@ include_directories(include)
add_executable(speed_cpu ${SOURCE_FILES})
# add_executable(benchmark ${SRC_DIR}/matrix.c ${BENCHMARK_DIR}/benchmark.c)

target_link_libraries(speed_cpu m)
target_link_libraries(speed_cpu m pthread)
# target_link_libraries(benchmark m)


Expand Down
8 changes: 6 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
.PHONY: all test clean run build run_test

all: build
all: rebuild

clean:
rm -f test/results.csv
Expand All @@ -12,6 +12,10 @@ build: clean
cmake -Bbuild
$(MAKE) -C ./build
mv ./build/speed_cpu ./

rebuild:
$(MAKE) -C ./build
mv ./build/speed_cpu ./

run: build
./speed_demo_cpu.sh ./weights_and_biases.txt ./tensors
Expand All @@ -20,7 +24,7 @@ run_test: build
./speed_cpu ./weights_and_biases.txt ./tensors

test: build
./speed_demo_cpu.sh ./weights_and_biases.txt ./tensors
./speed_cpu ./weights_and_biases.txt ./tensors 1
mv ./results.csv ./test
python3 ./test/verify_csv.py

Expand Down
2 changes: 1 addition & 1 deletion speed_demo_cpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if [ ! -f "$binary" ]; then
fi

start_time=$(date +%s)
./$binary "$weights_and_biases" "$input_tensor_dir"
./$binary "$weights_and_biases" "$input_tensor_dir" 1

end_time=$(date +%s)
execution_time=$((end_time - start_time))
Expand Down
121 changes: 53 additions & 68 deletions src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
#include "util.h"
#include <dirent.h>
#include <inttypes.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

typedef float f32;
typedef unsigned char u8;
Expand All @@ -29,6 +31,7 @@ void propagate_fwd(const matrix* weights, const vector* inputs, vector* results,
vector_add_inplace(results->len, biases->data, results->data);
}

// Basic version, too many aligned_alloc
u8 infer(vector* input) {
vector* outputs[NUM_LAYERS];
outputs[0] = new_vec_aligned(98);
Expand All @@ -54,7 +57,7 @@ u8 infer(vector* input) {
propagate_fwd(weights[6], outputs[5], outputs[6], biases[6]);
softmax_inplace(outputs[6]->data, 52);

u8 pred = get_max(outputs[6]);
u8 pred = getv_max_i(outputs[6]->data, 52);

free(outputs[0]->data);
free(outputs[0]);
Expand All @@ -74,7 +77,9 @@ u8 infer(vector* input) {
return pred;
}

u8 infer_reuse_layers(vector* input) {
// Somewhat experimental, minumum number of alligned_alloc without breaking things.
// This code fucking sucks but its fast so uhhhh
u8 infer_reuse_layers_thread(vector* input, matrix** weights, vector** biases) {
vector* outputs[NUM_LAYERS];
outputs[0] = new_vec_aligned(98);
outputs[1] = new_vec_aligned(65);
Expand Down Expand Up @@ -114,77 +119,24 @@ u8 infer_reuse_layers(vector* input) {
propagate_fwd(weights[6], outputs[1], outputs[0], biases[6]);
softmax_inplace(outputs[0]->data, 52);

u8 pred = get_max(outputs[0]);
u8 prediction = getv_max_i(outputs[0]->data, 52);

free(outputs[0]->data);
free(outputs[0]);
free(outputs[1]->data);
free(outputs[1]);

return pred;
}

u8 infer_reuse_input(vector* input) {
vector* outputs[NUM_LAYERS];
outputs[0] = new_vec_aligned(98);

propagate_fwd(weights[0], input, outputs[0], biases[0]);
relu_inplace(outputs[0]->data, 98);

input->len = 65;
memset(input->data, 0, 65 * sizeof(f32));

propagate_fwd(weights[1], outputs[0], input, biases[1]);
relu_inplace(input->data, 65);

outputs[0]->len = 50;
memset(outputs[0]->data, 0, 50 * sizeof(f32));

propagate_fwd(weights[2], input, outputs[0], biases[2]);
relu_inplace(outputs[0]->data, 50);

input->len = 30;
memset(input->data, 0, 30 * sizeof(f32));

propagate_fwd(weights[3], outputs[0], input, biases[3]);
relu_inplace(input->data, 30);

outputs[0]->len = 25;
memset(outputs[0]->data, 0, 25 * sizeof(f32));

propagate_fwd(weights[4], input, outputs[0], biases[4]);
relu_inplace(outputs[0]->data, 25);

input->len = 40;
memset(input->data, 0, 40 * sizeof(f32));

propagate_fwd(weights[5], outputs[0], input, biases[5]);
relu_inplace(input->data, 40);

outputs[0]->len = 52;
memset(outputs[0]->data, 0, 52 * sizeof(f32));

propagate_fwd(weights[6], input, outputs[0], biases[6]);
softmax_inplace(outputs[0]->data, 52);

u8 pred = get_max(outputs[0]);

free(outputs[0]->data);
free(outputs[0]);

input->len = 225;

return pred;
return prediction;
}

int main(int argc, char* argv[]) {
if (argc < 3) {
printf("Not enough arguments. Usage: speed_cpu <path_to_model.txt> <tensors_dir/>");
if (argc < 4) {
printf("Not enough arguments. Usage: speed_cpu <path_to_model.txt> <tensors_dir/> <number_of_inferences>\n");
return EXIT_FAILURE;
}

// Start timing
struct timeval stop, start;
struct timeval stop, start, preinf;
gettimeofday(&start, NULL);

// Dimensions of target model are hardcoded
Expand All @@ -204,20 +156,17 @@ int main(int argc, char* argv[]) {
biases[5] = new_vec_aligned(40);
biases[6] = new_vec_aligned(52);

vector* input = new_vec_aligned(TENSOR_SIZE);

read_model(weights, biases, argv[1]);

// Transpose weights to column major
for (int i = 0; i < NUM_LAYERS; i++)
transpose_mat_inplace(weights[i]);

// Set up preliminary counts and data
const char* directory_path = argv[2];
int input_count = file_count(directory_path);
printf("Number of input tensors: %d\n", input_count);

// +1 because file idx starts at 1
u8* results = (u8*)malloc(input_count * sizeof(u8));
f32* tensors = (f32*)aligned_alloc(SIMD_ALGN, TSIZE_ALGN_BYTES * input_count);

// Read and process inputs
Expand All @@ -241,11 +190,47 @@ int main(int argc, char* argv[]) {
free(file_path);
free(file_num_str);

// Run inference
// Time pre-inference
gettimeofday(&preinf, NULL);
printf("Pre inference (model read, tensor read, transpose) took %lu us\n",
(preinf.tv_sec - start.tv_sec) * 1000000 + preinf.tv_usec - start.tv_usec);

int iter_per_in = atoi(argv[3]);
// int NUM_THREADS = sysconf(_SC_NPROCESSORS_ONLN);

if (iter_per_in > 1)
#pragma omp parallel
{
int force = 0;
u8* results_local = (u8*)malloc(input_count * sizeof(u8));

for (int i = 0; i < input_count; i++) {
// printf("Thread %d: Processing input %d\n", omp_get_thread_num(), i);

vector* input = new_vec_aligned(TSIZE_ALGN_BYTES / sizeof(f32));
memcpy(input->data, (f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * i], TSIZE_ALGN_BYTES);

#pragma omp for
for (int j = 0; j < iter_per_in - 1; j++) {
// Using global memory for model seems to be faster
results_local[i] = infer_reuse_layers_thread(input, weights, biases);
force += results_local[i];
}

free(input->data);
free(input);
}

free(results_local);
printf("Thread %d: %d\n", omp_get_thread_num(), force);
}

// Output for csv
vector* input = new_vec_aligned(TENSOR_SIZE);
u8* results = (u8*)malloc(input_count * sizeof(u8));
for (int i = 0; i < input_count; i++) {
input->data = (f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * i];
// for (int i = 0; i < 100000; i++)
results[i] = infer_reuse_layers(input);
results[i] = infer_reuse_layers_thread(input, weights, biases);
}

// Write to csv file
Expand All @@ -258,7 +243,7 @@ int main(int argc, char* argv[]) {

// Time taken
gettimeofday(&stop, NULL);
printf("took %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
printf("Full run took %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);

return EXIT_SUCCESS;
}
23 changes: 16 additions & 7 deletions src/matrix.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ vector* new_vec_aligned(int len) {
return new_vec;
}

// Ver. Artemis Rosman
// ver. Artemis Rosman simd_intrin 2x8
static void kernel(const float* in, const float* wg, float* rs, int start_row, int start_col, int w_width) {
// printf("Kernel at row %d col %d\n", start_row, start_col);
__m256 res = _mm256_load_ps(&rs[start_col]);
Expand Down Expand Up @@ -76,10 +76,19 @@ void relu_inplace(f32* dest, int len) {
}
}

// Hacky but fast and accurate for existing inputs
static double fastexp(double x) {
i64 tmp = (i64)(1512775 * x + 1072632447);
tmp <<= 32;
double result;
memcpy(&result, &tmp, sizeof(result));
return result;
}

void softmax_inplace(f32* dest, int len) {
float res = 0.0f;
for (int i = 0; i < len; i++) {
res += exp(dest[i]);
res += fastexp(dest[i]);
}
for (int i = 0; i < len; i++) {
dest[i] /= res;
Expand Down Expand Up @@ -111,12 +120,12 @@ void transpose_mat_inplace(matrix* in) {
}

// Get result from output layer
u8 get_max(vector* a) {
u8 getv_max_i(f32* in, int len) {
int idx = 0;
float res = (a->data)[0];
for (int i = 0; i < a->len; i++) {
if (res < (a->data)[i]) {
res = (a->data)[i];
float res = in[0];
for (int i = 0; i < len; i++) {
if (res < in[i]) {
res = in[i];
idx = i;
}
}
Expand Down
5 changes: 3 additions & 2 deletions src/matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

typedef float f32;
typedef unsigned char u8;
typedef signed long i64;

#define KERN_COLS 8
#define KERN_ROWS 4
#define KERN_ROWS 2
#define SIMD_ALGN 64

typedef struct vector {
Expand Down Expand Up @@ -33,4 +34,4 @@ void softmax_inplace(f32* dest, int len);

void transpose_mat_inplace(matrix* in);

u8 get_max(vector* a);
u8 getv_max_i(f32* a, int len);
rozukke marked this conversation as resolved.
Show resolved Hide resolved