-
Notifications
You must be signed in to change notification settings - Fork 4
/
benchmark.py
136 lines (115 loc) · 4.38 KB
/
benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python
# Benchmark driver for openai-gemm kernels vs cuBLAS, run through PyTorch.
# pycuda.autoinit creates the CUDA context as an import side effect.
import pycuda.autoinit
import numpy as np
import pycuda.driver as drv
import torch
from openai_gemm_pytorch import matmul

# Report which GPU the benchmark is running on.
# Parenthesized single-argument print is valid in both Python 2 and 3.
print(drv.Context.get_current().get_device().name())
# Benchmark cases, one tuple per GEMM: (m, n, k, AT, BT).
# C is m x n, computed from A and B with k the shared (inner) dimension;
# AT / BT select whether the A / B operand is used transposed (the loop
# below allocates the transposed storage shape, then views it back with
# .t()).  The commented-out cases at the end are extra-large shapes left
# disabled by the original authors.
config = (
    # m, n, k, AT, BT (row order)
    (   16, 1760, 1760, False, False),
    (   32, 1760, 1760, False, False),
    (   64, 1760, 1760, False, False),
    (  128, 1760, 1760, False, False),
    ( 7000, 1760, 1760, False, False),
    (   16, 2048, 2048, False, False),
    (   32, 2048, 2048, False, False),
    (   64, 2048, 2048, False, False),
    (  128, 2048, 2048, False, False),
    ( 7000, 2048, 2048, False, False),
    (   16, 2560, 2560, False, False),
    (   32, 2560, 2560, False, False),
    (   64, 2560, 2560, False, False),
    (  128, 2560, 2560, False, False),
    ( 7000, 2560, 2560, False, False),
    (   16, 4096, 4096, False, False),
    (   32, 4096, 4096, False, False),
    (   64, 4096, 4096, False, False),
    (  128, 4096, 4096, False, False),
    ( 7000, 4096, 4096, False, False),
    (   16, 1760, 1760, False, True),
    (   32, 1760, 1760, False, True),
    (   64, 1760, 1760, False, True),
    (  128, 1760, 1760, False, True),
    ( 7000, 1760, 1760, False, True),
    (   16, 2048, 2048, False, True),
    (   32, 2048, 2048, False, True),
    (   64, 2048, 2048, False, True),
    (  128, 2048, 2048, False, True),
    ( 7000, 2048, 2048, False, True),
    (   16, 2560, 2560, False, True),
    (   32, 2560, 2560, False, True),
    (   64, 2560, 2560, False, True),
    (  128, 2560, 2560, False, True),
    ( 7000, 2560, 2560, False, True),
    (   16, 4096, 4096, False, True),
    (   32, 4096, 4096, False, True),
    (   64, 4096, 4096, False, True),
    (  128, 4096, 4096, False, True),
    ( 7000, 4096, 4096, False, True),
    ( 7133, 1760, 1760, True , False),
    ( 7133, 2048, 2048, True , False),
    ( 7133, 2560, 2560, True , False),
    ( 7133, 4096, 4096, True , False),
    ( 9124, 5124, 1760, False, False),
    ( 9124, 5124, 2048, False, False),
    ( 9124, 5124, 2560, False, False),
    ( 9124, 5124, 4096, False, False),
    ( 9124, 5124, 1760, False, True),
    ( 9124, 5124, 2048, False, True),
    ( 9124, 5124, 2560, False, True),
    ( 9124, 5124, 4096, False, True),
    ( 8457,   35, 1760, False, False),
    ( 8457,   35, 2048, False, False),
    ( 8457,   35, 2560, False, False),
    ( 8457,   35, 4096, False, False),
    ( 8457,   35, 1760, False, True),
    ( 8457,   35, 2048, False, True),
    ( 8457,   35, 2560, False, True),
    ( 8457,   35, 4096, False, True),
    (   16, 7680, 2560, False, False),
    (   32, 7680, 2560, False, False),
    (   64, 7680, 2560, False, False),
    (  128, 7680, 2560, False, False),
    (   16, 7680, 2560, False, True),
    (   32, 7680, 2560, False, True),
    (   64, 7680, 2560, False, True),
    (  128, 7680, 2560, False, True),
    (   16, 3072, 1024, False, False),
    (   32, 3072, 1024, False, False),
    (   64, 3072, 1024, False, False),
    (  128, 3072, 1024, False, False),
    (   16, 3072, 1024, False, True),
    (   32, 3072, 1024, False, True),
    (   64, 3072, 1024, False, True),
    (  128, 3072, 1024, False, True),
    ( 7435, 3072, 1024, True , False),
    ( 5481, 7680, 2560, True , False),
    # (60000,   32,   32, True , False),
    # (60000,  256,  256, True , False),
    # ( 4096, 4096,   32, True , False),
    # ( 3456, 3456,   32, True , False),
    # (  896,  896,   32, True , False),
)
# Emit a markdown table: per-case fp32 and fp16 times for the fastest
# OpenAI kernel vs the cuBLAS reference, plus their ratio.
# Parenthesized single-argument prints work identically under Python 2 and 3.
print("| M| N| K| Op|OpenAI_32|cuBLAS_32|ratio_32|OpenAI_16|cuBLAS_16|ratio_16|")
print("|------|------|------|---|---------|---------|--------|---------|---------|--------|")
for m, n, k, at, bt in config:
    # Allocate each operand in its physical (row-major) storage shape;
    # transposed operands are stored transposed and viewed back via .t().
    dimA = (k,m) if at else (m,k)
    dimB = (n,k) if bt else (k,n)
    dimC = (m,n)
    opA = 'T' if at else 'N'
    opB = 'T' if bt else 'N'
    op = opA + opB
    dtype_data = list()
    # Benchmark the same shape in fp32 and fp16. #np.float32, np.float16,
    for dtype in ('torch.cuda.FloatTensor', 'torch.cuda.HalfTensor'):
        A = torch.randn(dimA).type(dtype)
        B = torch.randn(dimB).type(dtype)
        C = torch.randn(dimC).type(dtype)
        if at: A = A.t()
        if bt: B = B.t()
        # bench=True appears to return a list of per-kernel timing tuples
        # with the cuBLAS reference appended last -- TODO(review): confirm
        # against openai_gemm_pytorch.matmul.
        data = matmul(A, B, C, bench=True)
        cublas = data.pop()
        # Fastest OpenAI kernel; min() replaces sorted(data)[0] (only the
        # minimum is needed, no full sort required).
        openai = min(data)
        # Entry [1] holds the timing value used for the ratio.
        text = "%9.0f|%9.0f|%8.1f" % (openai[1], cublas[1], openai[1] / cublas[1])
        dtype_data.append(text)
    print("|%6d|%6d|%6d|%3s|%s|" % (m, n, k, op, "|".join(dtype_data)))