Skip to content

Commit

Permalink
Merge pull request #45 from kachi-group/main-staging
Browse files Browse the repository at this point in the history
Main staging
  • Loading branch information
nhatdongdang committed Jul 8, 2024
2 parents e5d2909 + cb46223 commit e63a2cc
Show file tree
Hide file tree
Showing 39 changed files with 664 additions and 148 deletions.
50 changes: 25 additions & 25 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,34 +1,34 @@
name: CI

on:
push:
branches: main
paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt']
pull_request:
branches: main
paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt']
push:
branches: main
paths: ["**.cu", "**.c", "**.cpp", "**.h", "**CMakeLists.txt"]
pull_request:
branches: main
paths: ["**.cu", "**.c", "**.cpp", "**.h", "**CMakeLists.txt"]

jobs:
build-and-test:
runs-on: ubuntu-latest
build-and-test:
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v4
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Setup python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Setup python
uses: actions/setup-python@v5
with:
python-version: "3.10"

- name: Install dependencies
run: |
pip install pandas
- name: Install dependencies
run: |
pip install pandas
- name: Build project
run: |
make build
- name: Run test suite
run: |
make test
- name: Build project
run: |
make build
- name: Run test suite
run: |
make test_cpu
38 changes: 22 additions & 16 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,30 +1,36 @@
cmake_minimum_required(VERSION 3.16)

# Builds the CPU executable (speed_cpu) and, when a CUDA compiler is available,
# the CUDA/MPI executable (speed_gpu).
project(ichida-algo LANGUAGES C CXX)

set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED ON)
set(CMAKE_VERBOSE_MAKEFILE ON)

# -march=native ties the binary to the build machine. The default (ON) keeps
# the behaviour this project had before the option existed.
option(ICHIDA_NATIVE_ARCH "Tune speed_cpu for the build machine (-march=native)" ON)

set(SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
set(INC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include")
set(CUDA_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cudasrc")

# ---- CPU build ---------------------------------------------------------------

# CONFIGURE_DEPENDS re-runs the glob at build time so new sources are noticed.
file(GLOB_RECURSE SOURCE_FILES CONFIGURE_DEPENDS "${SRC_DIR}/*.c")

add_executable(speed_cpu ${SOURCE_FILES})
target_include_directories(speed_cpu PRIVATE "${INC_DIR}")

# Attached to the target instead of overwriting CMAKE_C_FLAGS, so flags given
# by the user or the toolchain are no longer discarded.
target_compile_options(speed_cpu PRIVATE
  -O3
  -ffast-math
  -funroll-loops
  -Wall
  -Wextra
  $<$<BOOL:${ICHIDA_NATIVE_ARCH}>:-march=native>
)

# Imported targets supply the OpenMP compile flag and the thread/OpenMP runtime
# libraries that used to be spelled out as -fopenmp, pthread and gomp.
find_package(Threads REQUIRED)
find_package(OpenMP REQUIRED COMPONENTS C)
target_link_libraries(speed_cpu PRIVATE m Threads::Threads OpenMP::OpenMP_C)

# ---- Optional GPU build ------------------------------------------------------

# check_language() probes for a usable CUDA compiler without failing the
# configure step; it replaces the deprecated find_package(CUDA) module.
include(CheckLanguage)
check_language(CUDA)

if(CMAKE_CUDA_COMPILER)
  enable_language(CUDA)

  # Kept as a global flag string, exactly as before.
  # NOTE(review): presumably the device-link step of the separable compilation
  # also needs -arch from here; confirm before moving these to
  # target_compile_options().
  string(APPEND CMAKE_CUDA_FLAGS
    " -Xptxas -O3 --use_fast_math -Xcompiler -march=native -unroll-aggressive -arch=sm_80")

  find_package(MPI REQUIRED)

  file(GLOB_RECURSE CUDA_SOURCE_FILES CONFIGURE_DEPENDS "${CUDA_SRC_DIR}/*.cu")
  add_executable(speed_gpu ${CUDA_SOURCE_FILES})
  set_target_properties(speed_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

  # The project include directory used to reach this target through the
  # directory-scoped include_directories(); it is now listed explicitly.
  # NOTE(review): plain MPI variables are used instead of MPI::MPI_CXX because
  # the imported target may forward host-compiler-only options to nvcc;
  # confirm before switching.
  target_include_directories(speed_gpu PRIVATE "${INC_DIR}" ${MPI_CXX_INCLUDE_DIRS})
  target_link_libraries(speed_gpu PRIVATE m ${MPI_CXX_LIBRARIES})
else()
  message(STATUS "CUDA not found, only CPU version will be built.")
endif()


49 changes: 24 additions & 25 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,37 +1,36 @@
.PHONY: all test clean run build run_test
.PHONY: all clean build run_cpu run_gpu test_cpu test_gpu bench stat

all: rebuild
# Default iterations
iterations ?= 1000

all: build

clean:
rm -f test/results.csv
rm -f results.csv
rm -rf build
rm -f speed_cpu
rm -f speed_cpu speed_gpu

build: clean
cmake -Bbuild
$(MAKE) -C ./build
mv ./build/speed_cpu ./

rebuild:
$(MAKE) -C ./build
mv ./build/speed_cpu ./

run: build
./speed_demo_cpu.sh ./weights_and_biases.txt ./tensors

run_test: build
./speed_cpu ./weights_and_biases.txt ./tensors

test: build
./speed_cpu ./weights_and_biases.txt ./tensors 1
mv ./results.csv ./test
python3 ./test/verify_csv.py
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
$(MAKE) -C build
cp -u build/speed_cpu ./
if [ -f build/speed_gpu ]; then cp -u build/speed_gpu ./; fi

bench: build
./build/benchmark
run_cpu: build
./speed_cpu ./weights_and_biases.txt ./tensors $(iterations)

stat: build
python3 ./benchmark/stat.py
run_gpu: build
n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations)

test_cpu: build
./speed_cpu ./weights_and_biases.txt ./tensors $(iterations)
mv ./results.csv ./test
python3 ./test/verify_csv.py

test_gpu: build
n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations)
mv ./results.csv ./test
python3 ./test/verify_csv.py
62 changes: 0 additions & 62 deletions benchmark/benchmark.c

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
19 changes: 19 additions & 0 deletions benchmark/gpu/matrix_add/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Builds one benchmark executable per CUDA source in versions/; every variant
# is compiled together with the shared driver benchmark.cu.
compile = nvcc -O3 -arch=sm_75 --use_fast_math
SRC_DIR := versions
BIN_DIR := bin
SRC_FILES := $(wildcard $(SRC_DIR)/*.cu)
# Target names include the .exe suffix so they match the files the recipe
# actually writes; otherwise make can never see a target as up to date.
EXECUTABLES := $(patsubst $(SRC_DIR)/%.cu,$(BIN_DIR)/%.exe,$(SRC_FILES))

# These names are commands, not files.
.PHONY: all clean plot

# The order-only prerequisite guarantees bin/ exists even when there are no
# sources, because plot.py lists that directory.
all: $(EXECUTABLES) | $(BIN_DIR)

$(BIN_DIR):
	mkdir -p $@

# Rebuild a variant when its own source, the shared driver or the shared
# header changes.
$(BIN_DIR)/%.exe: $(SRC_DIR)/%.cu benchmark.cu template.cuh | $(BIN_DIR)
	$(compile) $< benchmark.cu -o $@

# Removes all build output, including executables of variants whose source
# has since been deleted.
clean:
	rm -rf $(BIN_DIR)

plot: all
	python3 ./plot.py


13 changes: 13 additions & 0 deletions benchmark/gpu/matrix_add/benchmark.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#include "template.cuh"
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Benchmark driver: runs the linked kernel variant and prints its result.
//
// Usage: <exe> [iterations]
//   iterations  non-negative decimal count forwarded to time(); 100000 when
//               the argument is omitted.
//
// Prints the value returned by time() with "%f" on stdout.
// Returns 0 on success and 1 when the argument is not a valid int.
int main(int argc, char* argv[]) {
    long n = 100000;
    if (argc > 1) {
        char* end = NULL;
        n = strtol(argv[1], &end, 10);
        // Reject empty input, trailing garbage, negatives and anything that
        // would be truncated when narrowed to the int that time() takes.
        if (end == argv[1] || *end != '\0' || n < 0 || n > INT_MAX) {
            fprintf(stderr, "invalid iteration count: %s\n", argv[1]);
            return 1;
        }
    }
    printf("%f", time((int)n));
    return 0;
}
50 changes: 50 additions & 0 deletions benchmark/gpu/matrix_add/plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os
import subprocess
import matplotlib.pyplot as plt

# Build the benchmark executables first and stop right away if the build
# fails, so that missing or stale binaries are never benchmarked.
build = subprocess.run(['make'], capture_output=True, text=True)
if build.returncode != 0:
    raise SystemExit(f"make failed with exit code {build.returncode}:\n{build.stderr}")

# Define the folder containing the executables
folder_path = './bin'  # Change this to your bin folder path

# Define the input sizes to test (inclusive range)
start = 10000
end = 10000
step = 100000

input_sizes = list(range(start, end + 1, step))

# Sorted so that the run order and the legend order are reproducible
executables = sorted(
    name for name in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, name))
)

# Map each executable name to the runtimes measured for it, one per input size
runtimes = {exe: [] for exe in executables}

for exe in executables:
    exe_path = os.path.join(folder_path, exe)

    for n in input_sizes:
        # Run the executable with the input size and capture its output
        result = subprocess.run([exe_path, str(n)], capture_output=True, text=True)
        if result.returncode != 0:
            raise SystemExit(
                f"{exe_path} {n} failed with exit code {result.returncode}:\n{result.stderr}"
            )

        # The executable prints a single number: its runtime
        runtime = float(result.stdout.strip())
        print(exe, runtime)

        runtimes[exe].append(runtime)

# Plot one line per executable
plt.figure(figsize=(12, 6))

for exe, times in runtimes.items():
    plt.plot(input_sizes, times, marker='o', label=exe)

plt.xlabel('Iterations')
plt.ylabel('Runtime (s)')
plt.title('Benchmark of Function Versions')
plt.legend()
plt.grid(True)
plt.tight_layout()

# The figure is only written to disk; nothing is displayed interactively
output_file = 'benchmark_plot.png'  # Specify your desired output file name and format
plt.savefig(output_file)
10 changes: 10 additions & 0 deletions benchmark/gpu/matrix_add/template.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#pragma once

// Matrix descriptor shared by benchmark.cu and the kernel variants in versions/.
typedef struct {
int rows; // number of rows
int cols; // number of columns
float* data; // element buffer of rows * cols floats (malloc'd by new_matrix, cudaMalloc'd by new_matrix_d in versions/1.cu)
} matrix;

// Runs the variant's benchmark and returns a double that benchmark.cu prints
// with "%f"; versions/1.cu uses n as the number of kernel launches and returns
// elapsed seconds.
// NOTE(review): shares its name with time() from <time.h>, which benchmark.cu
// also includes - this appears to rely on C++ overloading; confirm before
// compiling any of this as plain C.
double time(int n);
// Allocates a descriptor for a rows x cols matrix; versions/1.cu backs `data`
// with cudaMalloc.
matrix* new_matrix_d(int rows, int cols);
44 changes: 44 additions & 0 deletions benchmark/gpu/matrix_add/versions/1.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#include "../template.cuh"

// Allocates a rows x cols float matrix in host memory.
//
// The element values are left uninitialized. Returns NULL when either the
// descriptor or the element buffer cannot be allocated; the caller owns the
// result and releases it with free(m->data) followed by free(m).
matrix* new_matrix(int rows, int cols) {
    matrix* res = (matrix*)malloc(sizeof(matrix));
    if (res == NULL) {
        return NULL;
    }
    res->rows = rows;
    res->cols = cols;

    // Multiply in size_t so large dimensions cannot overflow int arithmetic.
    size_t count = (size_t)rows * (size_t)cols;
    res->data = (float*)malloc(count * sizeof(float));
    // malloc(0) may legitimately return NULL, so only a non-empty request
    // that comes back NULL is treated as a failure.
    if (res->data == NULL && count != 0) {
        free(res);
        return NULL;
    }
    return res;
}

// Allocates a rows x cols float matrix whose element buffer is obtained from
// cudaMalloc; the descriptor itself is a host allocation.
//
// The element values are left uninitialized. Returns NULL when the descriptor
// or the cudaMalloc'd buffer cannot be allocated; the caller releases the
// result with cudaFree(m->data) followed by free(m).
matrix* new_matrix_d(int rows, int cols) {
    matrix* res = (matrix*)malloc(sizeof(matrix));
    if (res == NULL) {
        return NULL;
    }
    res->rows = rows;
    res->cols = cols;
    res->data = NULL;

    // Multiply in size_t so large dimensions cannot overflow int arithmetic.
    size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
    if (cudaMalloc((void**)&(res->data), bytes) != cudaSuccess) {
        free(res);
        return NULL;
    }
    return res;
}

// Element-wise in-place addition: a[i] += b[i] for every index i below `rows`.
// Each thread handles the single index derived from its block and thread
// position; threads whose index is out of range do nothing.
__global__ void matrix_add(float *a, float*b ,int rows)
{
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= rows) {
        return;
    }
    a[i] = a[i] + b[i];
}

double time(int n) {
int row=100000;
matrix* a = new_matrix_d(row, 1);
matrix* b = new_matrix_d(row, 1);
cudaStream_t stream1;
cudaStreamCreate ( &stream1);

int thread=1024;
int block=((row+thread-1)/thread);

clock_t start = clock();
for(int i=0;i<n;i++){
matrix_add<<<1,1,0,stream1>>>(a->data,b->data,row);
}
double seconds = (double)(clock() - (double)start) / CLOCKS_PER_SEC;
return seconds;
}
Loading

0 comments on commit e63a2cc

Please sign in to comment.