forked from karpathy/llm.c
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Makefile
284 lines (255 loc) · 10.3 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
CC ?= clang
CFLAGS = -Ofast -Wno-unused-result -Wno-ignored-pragmas -Wno-unknown-attributes
LDFLAGS =
LDLIBS = -lm
INCLUDES =
CFLAGS_COND = -march=native
# Find nvcc
SHELL_UNAME = $(shell uname)
REMOVE_FILES = rm -f
OUTPUT_FILE = -o $@
CUDA_OUTPUT_FILE = -o $@
# NVCC flags
# -t=0 is short for --threads, 0 = number of CPUs on the machine
NVCC_FLAGS = -O3 -t=0 --use_fast_math -std=c++17
NVCC_LDFLAGS = -lcublas -lcublasLt
NVCC_INCLUDES =
NVCC_LDLIBS =
NCLL_INCUDES =
NVCC_CUDNN =
# By default we don't build with cudnn because it blows up compile time from a few seconds to ~minute
USE_CUDNN ?= 0
# We will place .o files in the `build` directory (create it if it doesn't exist)
BUILD_DIR = build
ifeq ($(OS), Windows_NT)
$(shell if not exist $(BUILD_DIR) mkdir $(BUILD_DIR))
REMOVE_BUILD_OBJECT_FILES := del $(BUILD_DIR)\*.obj
else
$(shell mkdir -p $(BUILD_DIR))
REMOVE_BUILD_OBJECT_FILES := rm -f $(BUILD_DIR)/*.o
endif
# Function to check if a file exists in the PATH
ifneq ($(OS), Windows_NT)
define file_exists_in_path
$(which $(1) 2>/dev/null)
endef
else
define file_exists_in_path
$(shell where $(1) 2>nul)
endef
endif
ifneq ($(CI),true) # if not in CI, then use the GPU query
ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY=
ifneq ($(call file_exists_in_path, __nvcc_device_query),)
GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query)
GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY))
endif
endif
endif
# set to defaults if - make GPU_COMPUTE_CAPABILITY= otherwise use the compute capability detected above
ifneq ($(GPU_COMPUTE_CAPABILITY),)
NVCC_FLAGS += --generate-code arch=compute_$(GPU_COMPUTE_CAPABILITY),code=[compute_$(GPU_COMPUTE_CAPABILITY),sm_$(GPU_COMPUTE_CAPABILITY)]
endif
# autodect a lot of various supports on current platform
$(info ---------------------------------------------)
ifneq ($(OS), Windows_NT)
NVCC := $(shell which nvcc 2>/dev/null)
# Function to test if the compiler accepts a given flag.
define check_and_add_flag
$(eval FLAG_SUPPORTED := $(shell printf "int main() { return 0; }\n" | $(CC) $(1) -x c - -o /dev/null 2>/dev/null && echo 'yes'))
ifeq ($(FLAG_SUPPORTED),yes)
CFLAGS += $(1)
endif
endef
# Check each flag and add it if supported
$(foreach flag,$(CFLAGS_COND),$(eval $(call check_and_add_flag,$(flag))))
else
CFLAGS :=
REMOVE_FILES = del *.exe,*.obj,*.lib,*.exp,*.pdb && del
SHELL_UNAME := Windows
ifneq ($(shell where nvcc 2> nul),"")
NVCC := nvcc
else
NVCC :=
endif
CC := cl
CFLAGS = /Idev /Zi /nologo /W4 /WX- /diagnostics:column /sdl /O2 /Oi /Ot /GL /D _DEBUG /D _CONSOLE /D _UNICODE /D UNICODE /Gm- /EHsc /MD /GS /Gy /fp:fast /Zc:wchar_t /Zc:forScope /Zc:inline /permissive- \
/external:W3 /Gd /TP /wd4996 /Fd$@.pdb /FC /openmp:llvm
LDFLAGS :=
LDLIBS :=
INCLUDES :=
NVCC_FLAGS += -I"dev"
ifeq ($(WIN_CI_BUILD),1)
$(info Windows CI build)
OUTPUT_FILE = /link /OUT:$@
CUDA_OUTPUT_FILE = -o $@
else
$(info Windows local build)
OUTPUT_FILE = /link /OUT:$@ && copy /Y $@ $@.exe
CUDA_OUTPUT_FILE = -o $@ && copy /Y $@.exe $@
endif
endif
# Check and include cudnn if available
# You can override the path to cudnn frontend by setting CUDNN_FRONTEND_PATH on the make command line
# By default, we look for it in HOME/cudnn-frontend/include and ./cudnn-frontend/include
# Refer to the README for cuDNN install instructions
ifeq ($(USE_CUDNN), 1)
ifeq ($(SHELL_UNAME), Linux)
ifeq ($(shell [ -d $(HOME)/cudnn-frontend/include ] && echo "exists"), exists)
$(info ✓ cuDNN found, will run with flash-attention)
CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include
else ifeq ($(shell [ -d cudnn-frontend/include ] && echo "exists"), exists)
$(info ✓ cuDNN found, will run with flash-attention)
CUDNN_FRONTEND_PATH ?= cudnn-frontend/include
else
$(error ✗ cuDNN not found. See the README for install instructions and the Makefile for hard-coded paths)
endif
NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH)
NVCC_LDFLAGS += -lcudnn
NVCC_FLAGS += -DENABLE_CUDNN
NVCC_CUDNN = $(BUILD_DIR)/cudnn_att.o
else
ifneq ($(OS), Windows_NT)
$(info → cuDNN is not supported on MAC OS right now)
else
$(info ✓ Windows cuDNN found, will run with flash-attention)
ifeq ($(shell if exist "$(HOMEDRIVE)$(HOMEPATH)\cudnn-frontend\include" (echo exists)),exists)
CUDNN_FRONTEND_PATH ?= $(HOMEDRIVE)$(HOMEPATH)\cudnn-frontend\include #override on command line if different location
else ifeq ($(shell if exist "cudnn-frontend\include" (echo exists)),exists)
CUDNN_FRONTEND_PATH ?= cudnn-frontend\include #override on command line if different location
else
$(error ✗ cuDNN not found. See the README for install instructions and the Makefile for hard-coded paths)
endif
CUDNN_INCLUDE_PATH ?= -I"C:\Program Files\NVIDIA\CUDNN\v9.1\include\12.4"
CUDNN_FRONTEND_PATH += $(CUDNN_INCLUDE_PATH)
NVCC_FLAGS += --std c++20 -Xcompiler "/std:c++20" -Xcompiler "/EHsc /W0 /nologo /Ox /FS" -maxrregcount=0 --machine 64
NVCC_CUDNN = $(BUILD_DIR)\cudnn_att.obj
NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH)
NVCC_LDFLAGS += -L"C:\Program Files\NVIDIA\CUDNN\v9.1\lib\12.4\x64" -lcudnn
NVCC_FLAGS += -DENABLE_CUDNN
endif
endif
else
$(info → cuDNN is manually disabled by default, run make with `USE_CUDNN=1` to try to enable)
endif
# Check if OpenMP is available
# This is done by attempting to compile an empty file with OpenMP flags
# OpenMP makes the code a lot faster so I advise installing it
# e.g. on MacOS: brew install libomp
# e.g. on Ubuntu: sudo apt-get install libomp-dev
# later, run the program by prepending the number of threads, e.g.: OMP_NUM_THREADS=8 ./gpt2
# First, check if NO_OMP is set to 1, if not, proceed with the OpenMP checks
ifeq ($(NO_OMP), 1)
$(info OpenMP is manually disabled)
else
ifneq ($(OS), Windows_NT)
# Detect if running on macOS or Linux
ifeq ($(SHELL_UNAME), Darwin)
# Check for Homebrew's libomp installation in different common directories
ifeq ($(shell [ -d /opt/homebrew/opt/libomp/lib ] && echo "exists"), exists)
# macOS with Homebrew on ARM (Apple Silicon)
CFLAGS += -Xclang -fopenmp -DOMP
LDFLAGS += -L/opt/homebrew/opt/libomp/lib
LDLIBS += -lomp
INCLUDES += -I/opt/homebrew/opt/libomp/include
$(info ✓ OpenMP found)
else ifeq ($(shell [ -d /usr/local/opt/libomp/lib ] && echo "exists"), exists)
# macOS with Homebrew on Intel
CFLAGS += -Xclang -fopenmp -DOMP
LDFLAGS += -L/usr/local/opt/libomp/lib
LDLIBS += -lomp
INCLUDES += -I/usr/local/opt/libomp/include
$(info ✓ OpenMP found)
else
$(info ✗ OpenMP not found)
endif
else
# Check for OpenMP support in GCC or Clang on Linux
ifeq ($(shell echo | $(CC) -fopenmp -x c -E - > /dev/null 2>&1; echo $$?), 0)
CFLAGS += -fopenmp -DOMP
LDLIBS += -lgomp
$(info ✓ OpenMP found)
else
$(info ✗ OpenMP not found)
endif
endif
endif
endif
# Check if NCCL is available, include if so, for multi-GPU training
ifeq ($(NO_MULTI_GPU), 1)
$(info → Multi-GPU (NCCL) is manually disabled)
else
ifneq ($(OS), Windows_NT)
# Detect if running on macOS or Linux
ifeq ($(SHELL_UNAME), Darwin)
$(info ✗ Multi-GPU on CUDA on Darwin is not supported, skipping NCCL support)
else ifeq ($(shell dpkg -l | grep -q nccl && echo "exists"), exists)
$(info ✓ NCCL found, OK to train with multiple GPUs)
NVCC_FLAGS += -DMULTI_GPU
NVCC_LDLIBS += -lnccl
else
$(info ✗ NCCL is not found, disabling multi-GPU support)
$(info ---> On Linux you can try install NCCL with `sudo apt install libnccl2 libnccl-dev`)
endif
endif
endif
# Attempt to find and include OpenMPI on the system
OPENMPI_DIR ?= /usr/lib/x86_64-linux-gnu/openmpi
OPENMPI_LIB_PATH = $(OPENMPI_DIR)/lib/
OPENMPI_INCLUDE_PATH = $(OPENMPI_DIR)/include/
ifeq ($(NO_USE_MPI), 1)
$(info → MPI is manually disabled)
else ifeq ($(shell [ -d $(OPENMPI_LIB_PATH) ] && [ -d $(OPENMPI_INCLUDE_PATH) ] && echo "exists"), exists)
$(info ✓ MPI enabled)
NVCC_INCLUDES += -I$(OPENMPI_INCLUDE_PATH)
NVCC_LDFLAGS += -L$(OPENMPI_LIB_PATH)
NVCC_LDLIBS += -lmpi
NVCC_FLAGS += -DUSE_MPI
else
$(info ✗ MPI not found)
endif
# Precision settings, default to bf16 but ability to override
PRECISION ?= BF16
VALID_PRECISIONS := FP32 FP16 BF16
ifeq ($(filter $(PRECISION),$(VALID_PRECISIONS)),)
$(error Invalid precision $(PRECISION), valid precisions are $(VALID_PRECISIONS))
endif
ifeq ($(PRECISION), FP32)
PFLAGS = -DENABLE_FP32
else ifeq ($(PRECISION), FP16)
PFLAGS = -DENABLE_FP16
else
PFLAGS = -DENABLE_BF16
endif
# PHONY means these targets will always be executed
.PHONY: all train_gpt2 test_gpt2 train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu profile_gpt2cu
# Add targets
TARGETS = train_gpt2 test_gpt2
# Conditional inclusion of CUDA targets
ifeq ($(NVCC),)
$(info ✗ nvcc not found, skipping GPU/CUDA builds)
else
$(info ✓ nvcc found, including GPU/CUDA support)
TARGETS += train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu $(NVCC_CUDNN)
endif
$(info ---------------------------------------------)
all: $(TARGETS)
train_gpt2: train_gpt2.c
$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE)
test_gpt2: test_gpt2.c
$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE)
$(NVCC_CUDNN): llmc/cudnn_att.cpp
$(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_INCLUDES) -o $@
train_gpt2cu: train_gpt2.cu $(NVCC_CUDNN)
$(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)
train_gpt2fp32cu: train_gpt2_fp32.cu
$(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)
test_gpt2cu: test_gpt2.cu $(NVCC_CUDNN)
$(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)
test_gpt2fp32cu: test_gpt2_fp32.cu
$(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)
profile_gpt2cu: profile_gpt2.cu $(NVCC_CUDNN)
$(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)
clean:
$(REMOVE_FILES) $(TARGETS)
$(REMOVE_BUILD_OBJECT_FILES)