-
Notifications
You must be signed in to change notification settings - Fork 67
/
vvau.hpp
276 lines (257 loc) · 11 KB
/
vvau.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
/******************************************************************************
* Copyright (c) 2019, Xilinx, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*******************************************************************************/
/*******************************************************************************
*
* Authors: Giulio Gambardella <giuliog@xilinx.com>
*
* \file vvau.hpp
*
* This file lists the templated functions used to implement a
* Vector-Vector-Activation Unit (used for depthwise separable convolutions)
*
*******************************************************************************/
#ifndef VVAU_HPP
#define VVAU_HPP
#include "hls_stream.h"
#include "mac.hpp"
#include "interpret.hpp"
/**
 * \brief Vector vector activate function
 *
 * The function performs the multiplication between a weight vector and the input activation vector,
 * accumulating the results and then applying an activation function on the accumulated result.
 * It is used to implement depth-wise separable convolution.
 *
 * For every repetition the function reads (Channels / PE) * Kernel_2 elements from \p in and
 * writes Channels / PE elements to \p out (one output element after every Kernel_2 inputs).
 *
 * \tparam Channels   Number of channels (expected to be a multiple of PE; the fold count uses
 *                    integer division, so remainder channels would not be processed)
 * \tparam Kernel_2   Kernel * Kernel dimension (Kernel ^ 2 if square)
 * \tparam SIMD       Number of input columns computed in parallel, must be set to 1
 * \tparam PE         Number of output rows computed in parallel
 * \tparam MMV        Number of output pixels computed in parallel
 * \tparam TSrcI      DataType of the input activation (as used in the MAC)
 * \tparam TDstI      DataType of the output activation (as generated by the activation)
 * \tparam TWeightI   DataType of the weights (as used in the MAC)
 * \tparam TI         DataType of the input stream - safely deducible from the parameters
 * \tparam TO         DataType of the output stream - safely deducible from the parameters
 * \tparam TW         DataType of the weights matrix - safely deducible from the parameters
 * \tparam TA         DataType of the activation class (e.g. thresholds) - safely deducible from the parameters
 * \tparam R          Datatype for the resource used for FPGA implementation of the MAC - safely deducible from the parameters
 *
 * \param in          Input stream
 * \param out         Output stream
 * \param weights     Weights matrix (currently supports BinaryWeights or FixedPointWeights)
 * \param activation  Activation class
 * \param reps        Number of times the function has to be repeatedly executed (e.g. number of images);
 *                    must be non-negative, as it is converted to unsigned for the loop bound
 * \param r           Resource type for the hardware implementation of the MAC block
 */
template<
  unsigned Channels, unsigned Kernel_2, unsigned SIMD, unsigned PE, unsigned MMV,
  typename TSrcI = Identity, typename TDstI = Identity, typename TWeightI = Identity,
  typename TI, typename TO, typename TW, typename TA, typename R
>
void Vector_Vector_Activate_Batch(hls::stream<TI> &in,
                  hls::stream<TO> &out,
                  TW  const &weights,
                  TA  const &activation,
                  int const  reps,
                  R const &r) {
  static_assert(SIMD == 1, "SIMD parallelism not yet supported.");

  // how many different rows each neuron will compute
  // alternatively: number of vertical matrix chunks
  constexpr unsigned  NF = Channels / PE;

  // how many synapse groups each row is split into
  // alternatively: number of horizontal matrix chunks
  // always equal to # kernel pixels since no SIMD
  constexpr unsigned  SF = Kernel_2;

  // one accumulator per (output pixel, processing element) pair
  decltype(activation.init(0,0))  accu[MMV][PE];
#pragma HLS ARRAY_PARTITION variable=accu complete dim=0

  unsigned  nf   = 0;  // current neuron fold, 0 .. NF-1
  unsigned  sf   = 0;  // current synapse fold, 0 .. SF-1
  unsigned  tile = 0;  // invariant: tile = nf*SF + sf

  // everything merged into a common iteration space (one "big" loop instead
  // of smaller nested loops) to get the pipelining the way we want
  constexpr unsigned  TOTAL_FOLD = NF * SF;
  // int * unsigned is evaluated as unsigned: a negative reps would wrap around
  unsigned const  total_iters = reps * TOTAL_FOLD;
  for(unsigned  i = 0; i < total_iters; i++) {
#pragma HLS pipeline style=flp II=1
    TI  inElem;
    inElem = in.read();

    // Threshold Initialisation: (re)seed all accumulators at the start of a neuron fold
    if(sf == 0) {
      for(unsigned  pe = 0; pe < PE; pe++) {
#pragma HLS UNROLL
        for(unsigned  mmv = 0; mmv < MMV; mmv++) {
#pragma HLS UNROLL
          accu[mmv][pe] = activation.init(nf, pe);
        }
      }
    }

    // compute matrix-vector product for each processing element
    auto const &w = weights.weights(tile);
    for(unsigned  pe = 0; pe < PE; pe++) {
#pragma HLS UNROLL
      auto const  wgt = TWeightI()(w[pe]);
      for(unsigned  mmv = 0; mmv < MMV; mmv++) {
#pragma HLS UNROLL
        auto const  act = TSrcI()(inElem, mmv);
        // SIMD == 1: only weight lane 0 exists; each PE works on its own input lane
        accu[mmv][pe] += mul(wgt[0], act(pe,mmv), r);
      }
    }

    // keep track of which folded synapse/neuron we are processing
    ++tile;
    if(++sf == SF) {
      // produce output and clear accumulators
      auto  outElem = TDstI().template operator()<TO>();
      for(unsigned  pe = 0; pe < PE; pe++) {
#pragma HLS UNROLL
        for(unsigned  mmv = 0; mmv < MMV; mmv++) {
#pragma HLS UNROLL
          outElem(pe,mmv,1) = activation.activate(nf, pe, accu[mmv][pe]);
        }
      }
      out.write(outElem);

      // next folded neuron or image
      sf = 0;
      if(++nf == NF) {
        nf   = 0;
        tile = 0;
      }
    }
  }
}
/**
 * \brief Vector vector activate with streaming weights
 *
 * The function performs the multiplication between a weight vector and the input activation vector,
 * accumulating the results and then applying an activation function on the accumulated result.
 * It is used to implement depth-wise separable convolution. The weights are supplied from a stream
 * input to facilitate memory-compute decoupling.
 *
 * For every repetition the function reads (Channels / PE) * Kernel_2 elements from both \p in and
 * \p weights, and writes Channels / PE elements to \p out (one output element after every
 * Kernel_2 inputs).
 *
 * \tparam Channels   Number of channels (expected to be a multiple of PE; the fold count uses
 *                    integer division, so remainder channels would not be processed)
 * \tparam Kernel_2   Kernel * Kernel dimension (Kernel ^ 2 if square)
 * \tparam SIMD       Number of input columns computed in parallel, must be set to 1
 * \tparam PE         Number of output rows computed in parallel
 * \tparam MMV        Number of output pixels computed in parallel
 * \tparam TSrcI      DataType of the input activation (as used in the MAC)
 * \tparam TDstI      DataType of the output activation (as generated by the activation)
 * \tparam TWeightI   DataType of the weights (as used in the MAC)
 * \tparam TW         DataType of a single weight on the weight stream; must provide TW::width.
 *                    It only appears as TW::width in the stream type, so it cannot be deduced
 *                    and has to be specified explicitly
 * \tparam TI         DataType of the input stream - safely deducible from the parameters
 * \tparam TO         DataType of the output stream - safely deducible from the parameters
 * \tparam TA         DataType of the activation class (e.g. thresholds) - safely deducible from the parameters
 * \tparam R          Datatype for the resource used for FPGA implementation of the MAC - safely deducible from the parameters
 *
 * \param in          Input stream
 * \param out         Output stream
 * \param weights     Stream of packed weights, one element per input element; the weights of
 *                    processing element pe occupy bits
 *                    [(pe+1)*SIMD*TW::width-1 : pe*SIMD*TW::width] of each element
 * \param activation  Activation class
 * \param reps        Number of times the function has to be repeatedly executed (e.g. number of images);
 *                    must be non-negative, as it is converted to unsigned for the loop bound
 * \param r           Resource type for the hardware implementation of the MAC block
 */
template<
  unsigned Channels, unsigned Kernel_2, unsigned SIMD, unsigned PE, unsigned MMV,
  typename TSrcI = Identity, typename TDstI = Identity, typename TWeightI = Identity, typename TW,
  typename TI, typename TO, typename TA, typename R
>
void Vector_Vector_Activate_Stream_Batch(
  hls::stream<TI> &in,
  hls::stream<TO> &out,
  hls::stream<ap_uint<PE*SIMD*TW::width>> &weights,
  TA  const &activation,
  int const  reps,
  R const &r
) {
  static_assert(SIMD == 1, "SIMD parallelism not yet supported.");

  // how many different rows each neuron will compute
  // alternatively: number of vertical matrix chunks
  constexpr unsigned  NF = Channels / PE;

  // how many synapse groups each row is split into
  // alternatively: number of horizontal matrix chunks
  // always equal to # kernel pixels since no SIMD
  constexpr unsigned  SF = Kernel_2;

  // one accumulator per (output pixel, processing element) pair
  decltype(activation.init(0,0))  accu[MMV][PE];
#pragma HLS ARRAY_PARTITION variable=accu complete dim=0

  // Fold counters. No tile index is kept here: the weights arrive in order
  // on the stream instead of being looked up by position.
  unsigned  nf = 0;  // current neuron fold, 0 .. NF-1
  unsigned  sf = 0;  // current synapse fold, 0 .. SF-1

  // everything merged into a common iteration space (one "big" loop instead
  // of smaller nested loops) to get the pipelining the way we want
  constexpr unsigned  TOTAL_FOLD = NF * SF;
  // int * unsigned is evaluated as unsigned: a negative reps would wrap around
  unsigned const  total_iters = reps * TOTAL_FOLD;
  for(unsigned  i = 0; i < total_iters; i++) {
#pragma HLS pipeline style=flp II=1
    TI  inElem;
    inElem = in.read();

    // Threshold Initialisation: (re)seed all accumulators at the start of a neuron fold
    if(sf == 0) {
      for(unsigned  pe = 0; pe < PE; pe++) {
#pragma HLS UNROLL
        for(unsigned  mmv = 0; mmv < MMV; mmv++) {
#pragma HLS UNROLL
          accu[mmv][pe] = activation.init(nf, pe);
        }
      }
    }

    // Packed and unpacked weight representations
    ap_uint<PE * SIMD * TW::width> const  W_packed = weights.read();
    Weights_Tile<SIMD, TW, PE>  w;
#pragma HLS ARRAY_PARTITION variable=w.m_weights complete dim=0
    // slice the packed word into one weight group per processing element
    for(unsigned  pe = 0; pe < PE; pe++) {
#pragma HLS UNROLL
      w.m_weights[pe] = W_packed((pe+1)*SIMD*TW::width-1, pe*SIMD*TW::width);
    }

    // multiply-accumulate for each processing element
    for(unsigned  pe = 0; pe < PE; pe++) {
#pragma HLS UNROLL
      auto const  wgt = TWeightI()(w[pe]);
      for(unsigned  mmv = 0; mmv < MMV; mmv++) {
#pragma HLS UNROLL
        auto const  act = TSrcI()(inElem, mmv);
        // SIMD == 1: only weight lane 0 exists; each PE works on its own input lane
        accu[mmv][pe] += mul(wgt[0], act(pe,mmv), r);
      }
    }

    // keep track of which folded synapse/neuron we are processing
    if(++sf == SF) {
      // produce output and clear accumulators
      auto  outElem = TDstI().template operator()<TO>();
      for(unsigned  pe = 0; pe < PE; pe++) {
#pragma HLS UNROLL
        for(unsigned  mmv = 0; mmv < MMV; mmv++) {
#pragma HLS UNROLL
          outElem(pe,mmv,1) = activation.activate(nf, pe, accu[mmv][pe]);
        }
      }
      out.write(outElem);

      // next folded neuron or image
      sf = 0;
      if(++nf == NF) {
        nf = 0;
      }
    }
  }
}
#endif