-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.cpp
153 lines (136 loc) · 5.89 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#include <utils.h>
#include <iostream>
#include <memory>
//#include <pack.hpp>
#include <test_reference.h>
// Element types of the A and B input matrices used by this driver.
// With DA_TYPE = uint8_t the calls in main() resolve to the first
// gemmx8x8s32 overload declared below.
using DA_TYPE = uint8_t;
using DB_TYPE = uint8_t;
// Integer GEMM entry point under test, defined elsewhere: 8-bit A and B
// inputs, int32_t C output. Overload for unsigned 8-bit A.
// transa/transb/offsetc are passed as pointers to a single character
// (main() passes the address of one char for the transpose flags and the
// literal "c" for offsetc). ao/bo may be null in this driver.
// NOTE(review): presumably computes C = alpha*op(A)*op(B) + beta*C plus the
// co offsets, BLAS gemm_s8u8s32-style -- confirm against the implementation.
void gemmx8x8s32(const char *transa, const char *transb, const char *offsetc,
dim_t M, dim_t N, dim_t K, float alpha, const uint8_t *A, dim_t LDA,
const uint8_t *ao, const uint8_t *B, dim_t LDB, const uint8_t *bo,
float beta, int32_t *C, dim_t LDC, const int32_t *co);
// Same entry point, overload for signed 8-bit A (A and ao are int8_t);
// B stays unsigned.
void gemmx8x8s32(const char *transa, const char *transb, const char *offsetc,
dim_t M, dim_t N, dim_t K, float alpha, const int8_t *A, dim_t LDA,
const int8_t *ao, const uint8_t *B, dim_t LDB, const uint8_t *bo,
float beta, int32_t *C, dim_t LDC, const int32_t *co);
// Timer defined elsewhere; main() takes the difference of two readings as the
// elapsed time of one gemm call. NOTE(review): unit presumably seconds -- confirm.
extern double FLA_Clock();
// Value every element of C (and of the reference matrix in validate()) is
// filled with before the gemm runs.
constexpr int32_t C_VAL = 12;
double validate(const char *transA, const char *transB, const char *offsetc,
int m, int n, int k, float alpha, const DA_TYPE *A, int ldA,
const DA_TYPE *ao, const DB_TYPE *B, int ldB, const DB_TYPE *bo,
float beta, int32_t *C, int ldC, const int32_t *co) {
test_params_t p = {};
p.transA = *transA;
p.transB = *transB;
p.M = m;
p.K = k;
p.N = n;
p.alpha = alpha;
p.beta = beta;
p.lda = ldA;
p.ldb = ldB;
;
p.igemm_params = {};
if (ao) p.igemm_params._oa = (*ao);
if (bo) p.igemm_params._ob = (*bo);
if(offsetc) p.igemm_params.offsetc = *offsetc;
p.off = {};
int ldCref = determineLd(m, n, 0);
p.ldc = ldCref;
std::unique_ptr<int32_t[]> Cref(new int32_t[determineSize(m, n, 0)]);
fillMatrix(m, n, Cref.get(), ldCref, C_VAL);
ref_gemm_t<DA_TYPE, DB_TYPE>::call(p, m, n, A, B, Cref.get(), co);
showMatrix(m, n, Cref.get(), ldCref, "Cref");
return maxAbsDiff<double>(m, n, Cref.get(), ldCref, C, ldC);
}
int main() {
// constexpr int ldPlus = 1032;
// //for(int n =1000; n<1033;n++){
// constexpr int m=1000;
// constexpr int n=1000;
// constexpr int k=1000;
dim_t last = 1991;
dim_t first = 200;
dim_t inc = 200;
dim_t nrepeats = 3;
test_seed_t seed {};
dim_t m, n, k;
char trans[] = {'n', 't'};
float alpha = 1.5;
float beta = 2.0;
const char *offsetC = "c";
// uint8_t add_val = 4;
std::unique_ptr<int32_t[]> cop(
new int32_t[last]);
randomMatrix<int32_t>(last, 1, cop.get(), 1, 1, 13, seed);
showMatrix( 1, last, cop.get(), 1, "co");
//fillMatrix(last,1,cop.get(),1, 5);
const int32_t *co = (const int32_t *)cop.get();
const DA_TYPE *ao = nullptr; //&add_val;
const DB_TYPE *bo = nullptr; //&add_val;
for (auto transA : trans)
for (auto transB : trans) {
std::cout << "seed " << seed << " MR: " << MR << " NR: " << NR
<< " -- transA: " << transA << " transB: " << transB
<< std::endl;
printf("%% time G_OPS diff \n");
for (dim_t size = last; size >= first; size -= inc) {
/* we will only time cases where all three matrices are square */
m = n = k = size;
bool trA = transA == 't' || transA == 'T';
bool trB = transB == 't' || transB == 'T';
// m=1;
// n=1;
dim_t ldA = determineLd(m, k, size, trA);
dim_t ldB = determineLd(k, n, size, trB);
dim_t ldC = determineLd(m, n, size);
// std::cout<<m<<","<<n<<","<<ldC<<","<<determineSize(m,n,ldC)<<std::endl;
std::unique_ptr<DA_TYPE[]> A(
new DA_TYPE[determineSize(m, k, ldA, trA)]);
std::unique_ptr<DB_TYPE[]> B(
new DB_TYPE[determineSize(k, n, ldB, trB)]);
std::unique_ptr<int32_t[]> C(
new int32_t[determineSize(m, n, ldC)]);
#if !defined(LOW_0_127)
randomMatrix<DA_TYPE>(m, k, A.get(), ldA, seed);
#else
randomMatrix<DA_TYPE>(m, k, A.get(), ldA, 0, 127, seed);
#endif
randomMatrix<DB_TYPE>(k, n, B.get(), ldB, seed);
// linMatrix(m,k, A.get(),ldA, (uint8_t)1);
// linMatrix(k,n, B.get(),ldB, (uint8_t)100);
showMatrix(m, k, A.get(), ldA, "A");
showMatrix(k, n, B.get(), ldB, "B");
double dtime, dtime_best;
// showMatrix( ((m+MR-1) & (-MR)),((k+3) & (-4)) ,Apack.get(),
// ((m+MR-1) & (-MR)), "Apack");
// showMatrix(((k+3) & (-4)), ((n+NR-1) & (-NR)) ,Bpack.get(), ((k+3)
// & (-4)), "Bpack");
// gbp<MR, NR, int32_t, uint8_t>(((k+3) & (-4)), Apack.get(),
// Bpack.get(), C.get(), ldC);
auto gops = 2.0 * m * n * k * 1e-09;
// std::cout<<"//Begin//"<<std::endl;
int irep = 0;
for (irep = 0; irep < nrepeats; irep++) {
fillMatrix(m, n, C.get(), ldC, C_VAL);
auto dtime = FLA_Clock();
gemmx8x8s32(&transA, &transB, offsetC, m, n, k, alpha,
A.get(), ldA, ao, B.get(), ldB, bo, beta, C.get(),
ldC, co);
dtime = FLA_Clock() - dtime;
if (irep == 0)
dtime_best = dtime;
else
dtime_best = (dtime < dtime_best ? dtime : dtime_best);
}
auto diff = validate(&transA, &transB, offsetC, m, n, k, alpha,
A.get(), ldA, ao, B.get(), ldB, bo, beta, C.get(), ldC,
co);
showMatrix(m, n, C.get(), ldC, "C");
printf("%5d %8.4le %8.4le %8.4le \n", n, dtime_best,
gops / dtime_best, diff);
// std::cout<<"//End"<<std::endl;
}
}
return 0;
}