kachi-group · johnathanchann · Jun 28, 2024 · Jul 1, 2024 · Jul 1, 2024 · Jul 1, 2024
diff --git a/.clang-format b/.clang-format
@@ -1,5 +1,5 @@
-BasedOnStyle: LLVM
-IndentWidth: 4
-PointerAlignment: Left
-ColumnLimit: 120
+# Start from the LLVM preset; the keys below override individual settings.
+BasedOnStyle: LLVM
+# Indent by four columns.
+IndentWidth: 4
+# Attach '*' and '&' to the type rather than to the identifier.
+PointerAlignment: Left
+# Wrap lines at 120 columns.
+ColumnLimit: 120
 AlwaysBreakTemplateDeclarations: true
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -1,34 +1,34 @@
-name: CI
-
-on:
-  push:
-  branches: main
-  paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt'] 
-  pull_request:
-  branches: main
-  paths: ['**.cu','**.c','**.cpp', '**.h', '**CMakeLists.txt']
-
-jobs:
-  build-and-test:
-  runs-on: ubuntu-latest
-
-  steps:
-  - name: Checkout code
-  uses: actions/checkout@v4
-
-  - name: Setup python
-  uses: actions/setup-python@v5
-  with:
-  python-version: '3.10' 
-
-  - name: Install dependencies
-  run: |
-  pip install pandas
-
-  - name: Build project
-  run: |
-  make build
- 
-  - name: Run test suite
-  run: |
-  make test
+# Builds the project and runs the CPU test target on pushes and pull requests
+# against main that touch C/C++/CUDA sources or CMake files.
+name: CI
+
+on:
+  push:
+    branches: main
+    paths:
+      - "**.cu"
+      - "**.c"
+      - "**.cpp"
+      - "**.h"
+      - "**CMakeLists.txt"
+  pull_request:
+    branches: main
+    paths:
+      - "**.cu"
+      - "**.c"
+      - "**.cpp"
+      - "**.h"
+      - "**CMakeLists.txt"
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup python
+        uses: actions/setup-python@v5
+        with:
+          # Quoted so YAML does not read the version as the float 3.1.
+          python-version: "3.10"
+
+      - name: Install dependencies
+        run: |
+          pip install pandas
+
+      - name: Build project
+        run: |
+          make build
+
+      # Only the CPU target is exercised here.
+      - name: Run test suite
+        run: |
+          make test_cpu
diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml
@@ -1,33 +1,33 @@
-name: cpp-linter
-on:
- pull_request:
- branches: main
- paths: ['**.cu','**.cpp','**.c', '**.h', '**CMakeLists.txt']
- push:
- branches: main
- paths: ['**.cu','**.cpp','**.c', '**.h', '**CMakeLists.txt']
-
-permissions:
-  contents: write
-  pull-requests: write
-  actions: write
-
-jobs:
- cpp-linter:
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - uses: cpp-linter/cpp-linter-action@v2
- id: linter
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- with:
- style: 'file' # Use .clang-format config file. 
- tidy-checks: '-*' # disable clang-tidy checks. 
- version: 17
- thread-comments: true
- format-review: true
-
- - name: Run clang-format
- if: steps.linter.outputs.clang-format-checks-failed > 0
- run: exit 1
+# Runs clang-format (via cpp-linter-action) on pushes and pull requests against
+# main that touch C/C++/CUDA sources or CMake files.
+name: cpp-linter
+on:
+  pull_request:
+    branches: main
+    paths:
+      - "**.cu"
+      - "**.cpp"
+      - "**.c"
+      - "**.h"
+      - "**CMakeLists.txt"
+  push:
+    branches: main
+    paths:
+      - "**.cu"
+      - "**.cpp"
+      - "**.c"
+      - "**.h"
+      - "**CMakeLists.txt"
+
+# Write scopes granted to the workflow's GITHUB_TOKEN.
+permissions:
+  contents: write
+  pull-requests: write
+  actions: write
+
+jobs:
+  cpp-linter:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: cpp-linter/cpp-linter-action@v2
+        id: linter
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          style: "file" # Use .clang-format config file.
+          tidy-checks: "-*" # disable clang-tidy checks.
+          version: 17
+          thread-comments: true
+          format-review: true
+
+      # Fail the job when the linter step reports any clang-format failures.
+      - name: Run clang-format
+        if: steps.linter.outputs.clang-format-checks-failed > 0
+        run: exit 1
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,30 +1,37 @@
 cmake_minimum_required(VERSION 3.16)
 
-# Set the project name
-project(ichida-algo)
+project(ichida-algo LANGUAGES C CXX)
 
 set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -fopenmp -Wall -Wextra")
-
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED True)
-# set(CMAKE_VERBOSE_MAKEFILE ON)
+set(CMAKE_VERBOSE_MAKEFILE ON)
 
-set(SRC_DIR src)
 set(INC_DIR include)
-set(LIB_DIR lib)
-set(TEST_DIR test)
-set(BENCHMARK_DIR benchmark)
+set(SRC_DIR src)
+set(CUDA_SRC_DIR cuda/src)
 
-# Source files
-file(GLOB_RECURSE SOURCE_FILES ${SRC_DIR}/*.c)
+include_directories(${INC_DIR})
 
-include_directories(include)
+file(GLOB_RECURSE SOURCE_FILES ${SRC_DIR}/*.c)
 
 add_executable(speed_cpu ${SOURCE_FILES})
-# add_executable(benchmark ${SRC_DIR}/matrix.c ${BENCHMARK_DIR}/benchmark.c)
-
-target_link_libraries(speed_cpu m pthread)
-# target_link_libraries(benchmark m)
+# speed_cpu is an executable, so nothing consumes its link interface: PRIVATE.
+# gomp is the GNU OpenMP runtime library.
+target_link_libraries(speed_cpu PRIVATE m pthread gomp)
+
+# Probe for a usable CUDA compiler with CheckLanguage instead of the deprecated
+# FindCUDA module. CMAKE_CUDA_COMPILER is only set when the language can really
+# be enabled, so enable_language(CUDA) cannot abort configuration on CPU-only
+# hosts. nvcc must be on PATH or named through the CUDACXX environment variable.
+include(CheckLanguage)
+check_language(CUDA)
+
+if(CMAKE_CUDA_COMPILER)
+  enable_language(CUDA)
+
+  # GPU architecture passed to nvcc; override with -DICHIDA_CUDA_ARCH=sm_XX.
+  set(ICHIDA_CUDA_ARCH "sm_80" CACHE STRING "nvcc -arch value used to build speed_gpu")
+
+  # Appended so that flags already supplied by the user or toolchain survive.
+  string(APPEND CMAKE_CUDA_FLAGS
+    " -Xptxas -O3 --use_fast_math -Xcompiler -march=native -unroll-aggressive -arch=${ICHIDA_CUDA_ARCH}")
+
+  # The GPU binary is launched through mpirun, so MPI is mandatory here.
+  find_package(MPI REQUIRED)
+
+  # CONFIGURE_DEPENDS re-runs the glob at build time so new .cu files are seen.
+  file(GLOB_RECURSE CUDA_SOURCE_FILES CONFIGURE_DEPENDS ${CUDA_SRC_DIR}/*.cu)
+  add_executable(speed_gpu ${CUDA_SOURCE_FILES})
+  set_target_properties(speed_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+  # MPI headers and libraries are attached to speed_gpu only, so they no longer
+  # leak onto speed_cpu through a directory-scoped include_directories().
+  target_include_directories(speed_gpu PRIVATE ${MPI_CXX_INCLUDE_DIRS})
+  target_link_libraries(speed_gpu PRIVATE m ${MPI_CXX_LIBRARIES})
+else()
+  message(STATUS "CUDA not found, only CPU version will be built.")
+endif()
 
 
 
diff --git a/LICENSE b/LICENSE
@@ -1,21 +1,21 @@
-MIT License
-
-Copyright (c) 2024 kachi-group
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+MIT License
+
+Copyright (c) 2024 kachi-group
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Makefile b/Makefile
@@ -1,37 +1,36 @@
-.PHONY: all test clean run build run_test
+.PHONY: all clean build run_cpu run_gpu test_cpu test_gpu bench stat
 
-all: rebuild
+# Default iterations
+iterations ?= 1000
+
+all: build
 
 clean:
  rm -f test/results.csv
  rm -f results.csv
  rm -rf build
- rm -f speed_cpu
+ rm -f speed_cpu speed_gpu
 
 build: clean
- cmake -Bbuild
- $(MAKE) -C ./build
- mv ./build/speed_cpu ./
-
-rebuild:
- $(MAKE) -C ./build
- mv ./build/speed_cpu ./
-
-run: build
- ./speed_demo_cpu.sh ./weights_and_biases.txt ./tensors
-
-run_test: build
- ./speed_cpu ./weights_and_biases.txt ./tensors
-
-test: build
- ./speed_cpu ./weights_and_biases.txt ./tensors 1
- mv ./results.csv ./test
- python3 ./test/verify_csv.py
+ cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
+ $(MAKE) -C build
+ cp -u build/speed_cpu ./
+ if [ -f build/speed_gpu ]; then cp -u build/speed_gpu ./; fi
 
-bench: build
- ./build/benchmark
+# Run the CPU binary on ./weights_and_biases.txt and ./tensors, passing
+# $(iterations) as its third argument. The `build` prerequisite rebuilds first.
+run_cpu: build
+ ./speed_cpu ./weights_and_biases.txt ./tensors $(iterations)
 
-stat: build
- python3 ./benchmark/stat.py
+# Launch the GPU binary under mpirun with as many processes as nvidia-smi lists
+# GPUs. The count is taken by the recipe's own shell rather than by make's
+# shell function, and the recipe stops with a clear message when no GPU is
+# reported instead of handing "-np 0" to mpirun.
+run_gpu: build
+	n_gpus=$$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
+	if [ $$n_gpus -lt 1 ]; then \
+		echo "run_gpu: nvidia-smi reported no GPUs; cannot launch speed_gpu" >&2; \
+		exit 1; \
+	fi; \
+	mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations)
 
+# Run the CPU binary with $(iterations) as its third argument, move the
+# ./results.csv found in the working directory afterwards (presumably written
+# by speed_cpu) into ./test, then run test/verify_csv.py.
+test_cpu: build
+ ./speed_cpu ./weights_and_biases.txt ./tensors $(iterations)
+ mv ./results.csv ./test
+ python3 ./test/verify_csv.py
 
+# Run the GPU binary under mpirun with as many processes as nvidia-smi lists
+# GPUs, move ./results.csv into ./test, then run test/verify_csv.py.
+# The GPU count is taken by the recipe's own shell rather than by make's shell
+# function, and the recipe stops with a clear message when no GPU is reported
+# instead of handing "-np 0" to mpirun.
+test_gpu: build
+	n_gpus=$$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
+	if [ $$n_gpus -lt 1 ]; then \
+		echo "test_gpu: nvidia-smi reported no GPUs; cannot launch speed_gpu" >&2; \
+		exit 1; \
+	fi; \
+	mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations)
+	mv ./results.csv ./test
+	python3 ./test/verify_csv.py
diff --git a/cuda/benchmark/matrix_add/Makefile b/cuda/benchmark/matrix_add/Makefile
@@ -0,0 +1,19 @@
+# Builds one benchmark binary per CUDA source in versions/, each linked with
+# the shared driver benchmark.cu, and optionally plots the timings.
+
+# nvcc invocation shared by every benchmark binary.
+compile = nvcc -O3 -arch=sm_75 --use_fast_math
+SRC_DIR := versions
+BIN_DIR := bin
+SRC_FILES := $(wildcard $(SRC_DIR)/*.cu)
+# One $(BIN_DIR)/<name>.exe per $(SRC_DIR)/<name>.cu. The target name now
+# matches the file the rule really writes.
+EXECUTABLES := $(patsubst $(SRC_DIR)/%.cu,$(BIN_DIR)/%.exe,$(SRC_FILES))
+
+# None of these names is a file produced by its rule.
+.PHONY: all clean plot
+
+# Always starts from an empty $(BIN_DIR).
+# NOTE(review): relies on prerequisites being processed left to right, so do
+# not run this target with -j.
+all: clean $(EXECUTABLES)
+
+clean:
+	rm -rf $(BIN_DIR)
+	mkdir -p $(BIN_DIR)
+
+# benchmark.cu is compiled into every binary, so it is a prerequisite too.
+$(BIN_DIR)/%.exe: $(SRC_DIR)/%.cu benchmark.cu
+	$(compile) $< benchmark.cu -o $@
+
+plot: all
+	python3 ./plot.py
+
+
diff --git a/cuda/benchmark/matrix_add/benchmark.cu b/cuda/benchmark/matrix_add/benchmark.cu
@@ -0,0 +1,13 @@
+#include "template.cuh"
+#include <stdio.h>
+#include <time.h>
+
+/*
+ * Benchmark driver: takes the problem size from the first command-line
+ * argument (100000 when none is given), passes it to time(), and prints the
+ * returned value with "%f" and no trailing newline.
+ * NOTE(review): time(long) is presumably declared in template.cuh -- confirm.
+ */
+int main(int argc, char* argv[]) {
+    long n = (argc > 1) ? atol(argv[1]) : 100000;
+    printf("%f", time(n));
+}
diff --git a/cuda/benchmark/matrix_add/benchmark_plot.png b/cuda/benchmark/matrix_add/benchmark_plot.png
diff --git a/cuda/benchmark/matrix_add/plot.py b/cuda/benchmark/matrix_add/plot.py
@@ -0,0 +1,50 @@
+import os
+import subprocess
+import matplotlib.pyplot as plt
+
+result = subprocess.run(['make'], capture_output=True, text=True)
+# Define the folder containing the executables
+folder_path = './bin' # Change this to your bin folder path
+
+# Define the input sizes to test
+start=10000
+end=10000
+step=100000
+
+input_sizes = list(range(start, end+1, step))
+# Initialize a dictionary to store runtimes for each executable
+runtimes = {exe: [] for exe in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, exe))}
+
+# Loop through each executable
+for exe in runtimes.keys():
+ exe_path = os.path.join(folder_path, exe)
+
+ # Loop through each input size
+ for n in range(start,end+1,step):
+ # Run the executable with the input size and capture its output
+ result = subprocess.run([exe_path, str(n)], capture_output=True, text=True)
+
+ # Parse the output to get the runtime
+ runtime = float(result.stdout.strip())
+ print(exe,runtime)
+
+ # Append the runtime to the corresponding executable list
+ runtimes[exe].append(runtime)
+
+# Plot the data
+plt.figure(figsize=(12, 6))
+
+# Loop through each executable and plot the runtimes
+for exe, times in runtimes.items():
+ plt.plot(input_sizes, times, marker='o', label=exe)
+
+plt.xlabel('Iterations')
+plt.ylabel('Runtime (s)')
+plt.title('Benchmark of Function Versions')
+plt.legend()
+plt.grid(True)
+plt.tight_layout()
+
+output_file = 'benchmark_plot.png' # Specify your desired output file name and format
+plt.savefig(output_file)
+# Show the plot