Merge pull request #26 from nhatdongdang/feat/gpu-multithread

Gpu multithread
kachi-group · Jul 5, 2024 · 2e2c1e5 · 2e2c1e5
2 parents a4b925b + e8db37a
commit 2e2c1e5
Show file tree

Hide file tree

Showing 10 changed files with 321 additions and 103 deletions.
diff --git a/benchmark/matrix_add/Makefile b/benchmark/matrix_add/Makefile
@@ -0,0 +1,19 @@
+# Builds every benchmark version in versions/*.cu together with benchmark.cu
+# into bin/<name>.exe; `make plot` additionally runs plot.py.
+compile = nvcc -O3 -arch=sm_75 --use_fast_math
+SRC_DIR := versions
+BIN_DIR := bin
+SRC_FILES := $(wildcard $(SRC_DIR)/*.cu)
+EXECUTABLES := $(patsubst $(SRC_DIR)/%.cu, $(BIN_DIR)/%, $(SRC_FILES))
+
+# These targets name actions, not files; declaring them phony keeps them
+# working even if a file called "all", "clean" or "plot" appears here.
+.PHONY: all clean plot
+
+all: clean $(EXECUTABLES)
+
+# Start every build from an empty output directory.
+clean:
+	rm -f -r $(BIN_DIR)
+	mkdir $(BIN_DIR)
+
+# Each version is linked with the shared driver in benchmark.cu.
+$(BIN_DIR)/%: $(SRC_DIR)/%.cu
+	$(compile) $< benchmark.cu -o $@.exe
+
+plot: all
+	python3 ./plot.py
+
+
diff --git a/benchmark/matrix_add/benchmark.cu b/benchmark/matrix_add/benchmark.cu
@@ -0,0 +1,13 @@
+#include "template.cuh"
+#include <stdio.h>
+#include <time.h>
+
+// Benchmark driver: prints (without a trailing newline) the number of seconds
+// returned by time() for the iteration count given in argv[1], or for 100000
+// iterations when no argument is supplied.
+int main(int argc, char* argv[]) {
+    // atol() yields 0 when argv[1] is not numeric; the long value is narrowed
+    // to int by time()'s parameter.
+    const long n = (argc > 1) ? atol(argv[1]) : 100000;
+    printf("%f", time(n));
+}
diff --git a/benchmark/matrix_add/benchmark_plot.png b/benchmark/matrix_add/benchmark_plot.png
diff --git a/benchmark/matrix_add/plot.py b/benchmark/matrix_add/plot.py
@@ -0,0 +1,50 @@
+import os
+import subprocess
+import matplotlib.pyplot as plt
+
+result = subprocess.run(['make'], capture_output=True, text=True)
+# Define the folder containing the executables
+folder_path = './bin' # Change this to your bin folder path
+
+# Define the input sizes to test
+start=10000
+end=10000
+step=100000
+
+input_sizes = list(range(start, end+1, step))
+# Initialize a dictionary to store runtimes for each executable
+runtimes = {exe: [] for exe in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, exe))}
+
+# Loop through each executable
+for exe in runtimes.keys():
+ exe_path = os.path.join(folder_path, exe)
+
+ # Loop through each input size
+ for n in range(start,end+1,step):
+ # Run the executable with the input size and capture its output
+ result = subprocess.run([exe_path, str(n)], capture_output=True, text=True)
+
+ # Parse the output to get the runtime
+ runtime = float(result.stdout.strip())
+ print(exe,runtime)
+
+ # Append the runtime to the corresponding executable list
+ runtimes[exe].append(runtime)
+
+# Plot the data
+plt.figure(figsize=(12, 6))
+
+# Loop through each executable and plot the runtimes
+for exe, times in runtimes.items():
+ plt.plot(input_sizes, times, marker='o', label=exe)
+
+plt.xlabel('Iterations')
+plt.ylabel('Runtime (s)')
+plt.title('Benchmark of Function Versions')
+plt.legend()
+plt.grid(True)
+plt.tight_layout()
+
+output_file = 'benchmark_plot.png' # Specify your desired output file name and format
+plt.savefig(output_file)
+# Show the plot
diff --git a/benchmark/matrix_add/template.cuh b/benchmark/matrix_add/template.cuh
@@ -0,0 +1,10 @@
+#pragma once
+
+typedef struct { // Matrix descriptor: dimensions plus a pointer to the element buffer.
+ int rows; // number of rows
+ int cols; // number of columns
+ float* data; // element array; the allocators in versions/1.cu and versions/cpu.cu size it as rows * cols floats
+} matrix;
+
+double time(int n); // Implemented by each file in versions/: runs matrix_add n times and returns the clock() time spent, in seconds.
+matrix* new_matrix_d(int rows, int cols); // Allocates a matrix with malloc whose data buffer comes from cudaMalloc.
diff --git a/benchmark/matrix_add/versions/1.cu b/benchmark/matrix_add/versions/1.cu
@@ -0,0 +1,44 @@
+#include "../template.cuh"
+
+// Allocates a matrix descriptor and a rows * cols float buffer with malloc.
+// The buffer contents are left uninitialised.
+// Returns NULL if either allocation fails; the caller owns both allocations
+// and releases them with free(m->data) followed by free(m).
+matrix* new_matrix(int rows, int cols) {
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    if (res == NULL) {
+        return NULL;
+    }
+    res->rows = rows;
+    res->cols = cols;
+
+    // Multiply in size_t so a large rows * cols cannot overflow int.
+    const size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
+    res->data = (float*)malloc(bytes);
+    // malloc(0) may legitimately return NULL, so only a non-empty request
+    // that yields NULL counts as a failure.
+    if (res->data == NULL && bytes != 0) {
+        free(res);
+        return NULL;
+    }
+    return res;
+}
+
+// Allocates a matrix descriptor with malloc whose data buffer is rows * cols
+// floats obtained from cudaMalloc (contents uninitialised).
+// Returns NULL if the host allocation or cudaMalloc fails; on success the
+// caller releases it with cudaFree(m->data) followed by free(m).
+matrix* new_matrix_d(int rows, int cols) {
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    if (res == NULL) {
+        return NULL;
+    }
+    res->rows = rows;
+    res->cols = cols;
+    res->data = NULL;
+
+    // Multiply in size_t so a large rows * cols cannot overflow int.
+    const size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
+    if (cudaMalloc((void**)&(res->data), bytes) != cudaSuccess) {
+        // Do not hand back a descriptor whose data pointer is unusable.
+        free(res);
+        return NULL;
+    }
+    return res;
+}
+
+// In-place element-wise add: a[i] += b[i].
+// Each thread handles at most one element, selected from the x components of
+// its block and thread indices; threads whose index is >= rows do nothing.
+__global__ void matrix_add(float *a, float *b, int rows)
+{
+    const int i = threadIdx.x + blockDim.x * blockIdx.x;
+    if (i >= rows) {
+        return;
+    }
+    a[i] = a[i] + b[i];
+}
+
+// Benchmarks n back-to-back launches of matrix_add on two 100000-element
+// device vectors and returns the clock() time spent, in seconds.
+// Returns -1.0 if allocation, stream creation, a launch or execution failed.
+double time(int n) {
+    const int row = 100000;
+    const int thread = 1024;
+    // Ceil-div so every one of the `row` elements is covered by a thread.
+    const int block = (row + thread - 1) / thread;
+
+    matrix* a = new_matrix_d(row, 1);
+    matrix* b = new_matrix_d(row, 1);
+    double seconds = -1.0;
+    cudaStream_t stream1;
+
+    if (a != NULL && b != NULL && cudaStreamCreate(&stream1) == cudaSuccess) {
+        // cudaMalloc does not initialise memory; give both operands defined
+        // contents and finish that work before the timed region starts.
+        cudaMemsetAsync(a->data, 0, row * sizeof(float), stream1);
+        cudaMemsetAsync(b->data, 0, row * sizeof(float), stream1);
+        cudaStreamSynchronize(stream1);
+
+        clock_t start = clock();
+        for (int i = 0; i < n; i++) {
+            // Launch the full grid; a <<<1,1>>> launch would only add element 0.
+            matrix_add<<<block, thread, 0, stream1>>>(a->data, b->data, row);
+        }
+        cudaError_t launch_status = cudaGetLastError();
+        // Launches only enqueue work. Wait for the stream to drain so the
+        // measurement covers kernel execution, not just enqueueing.
+        cudaError_t run_status = cudaStreamSynchronize(stream1);
+        clock_t stop = clock();
+
+        if (launch_status == cudaSuccess && run_status == cudaSuccess) {
+            seconds = (double)(stop - start) / CLOCKS_PER_SEC;
+        }
+        cudaStreamDestroy(stream1);
+    }
+
+    // Release the device buffers and their host-side descriptors.
+    if (a != NULL) {
+        cudaFree(a->data);
+        free(a);
+    }
+    if (b != NULL) {
+        cudaFree(b->data);
+        free(b);
+    }
+    return seconds;
+}
diff --git a/benchmark/matrix_add/versions/cpu.cu b/benchmark/matrix_add/versions/cpu.cu
@@ -0,0 +1,37 @@
+#include "../template.cuh"
+
+// Allocates a matrix descriptor and a rows * cols float buffer with malloc.
+// The buffer contents are left uninitialised.
+// Returns NULL if either allocation fails; the caller owns both allocations
+// and releases them with free(m->data) followed by free(m).
+matrix* new_matrix(int rows, int cols) {
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    if (res == NULL) {
+        return NULL;
+    }
+    res->rows = rows;
+    res->cols = cols;
+
+    // Multiply in size_t so a large rows * cols cannot overflow int.
+    const size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
+    res->data = (float*)malloc(bytes);
+    // malloc(0) may legitimately return NULL, so only a non-empty request
+    // that yields NULL counts as a failure.
+    if (res->data == NULL && bytes != 0) {
+        free(res);
+        return NULL;
+    }
+    return res;
+}
+
+// Allocates a matrix descriptor with malloc whose data buffer is rows * cols
+// floats obtained from cudaMalloc (contents uninitialised).
+// Returns NULL if the host allocation or cudaMalloc fails; on success the
+// caller releases it with cudaFree(m->data) followed by free(m).
+matrix* new_matrix_d(int rows, int cols) {
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    if (res == NULL) {
+        return NULL;
+    }
+    res->rows = rows;
+    res->cols = cols;
+    res->data = NULL;
+
+    // Multiply in size_t so a large rows * cols cannot overflow int.
+    const size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
+    if (cudaMalloc((void**)&(res->data), bytes) != cudaSuccess) {
+        // Do not hand back a descriptor whose data pointer is unusable.
+        free(res);
+        return NULL;
+    }
+    return res;
+}
+
+// Host reference version of the element-wise add: a[i] += b[i] for the first
+// `rows` elements. `a` is updated in place, `b` is only read.
+void matrix_add(float* a, float* b, int rows) {
+    int i = 0;
+    while (i < rows) {
+        a[i] = a[i] + b[i];
+        ++i;
+    }
+}
+
+// Benchmarks n sequential host calls of matrix_add on two 100000-element
+// vectors and returns the clock() time spent, in seconds.
+// Returns -1.0 if the vectors could not be allocated.
+double time(int n) {
+    const int row = 100000;
+    matrix* a = new_matrix(row, 1);
+    matrix* b = new_matrix(row, 1);
+    double seconds = -1.0;
+
+    if (a != NULL && b != NULL && a->data != NULL && b->data != NULL) {
+        // malloc leaves the buffers indeterminate; give the operands defined
+        // values so the timed loop never reads uninitialised floats.
+        for (int i = 0; i < row; i++) {
+            a->data[i] = 0.0f;
+            b->data[i] = 0.0f;
+        }
+
+        clock_t start = clock();
+        for (int i = 0; i < n; i++) {
+            matrix_add(a->data, b->data, row);
+        }
+        seconds = (double)(clock() - start) / CLOCKS_PER_SEC;
+    }
+
+    // Release both vectors and their descriptors.
+    if (a != NULL) {
+        free(a->data);
+        free(a);
+    }
+    if (b != NULL) {
+        free(b->data);
+        free(b);
+    }
+    return seconds;
+}