-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathintrin_impl.h
453 lines (404 loc) · 15.5 KB
/
intrin_impl.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
/*!
 * \brief The sign of the given number: -1 if negative, 1 if positive, 0 otherwise.
 * \note The argument is evaluated up to twice, so do not pass expressions with side effects.
 */
#define _SIGNAL(x) ((x) < 0 ? -1 : ((x) > 0 ? 1 : 0))
/*!
 * \brief Floor of the base-2 logarithm of x, computed on x truncated to 32 bits.
 *        Meant to be given an exact power of 2.
 * \note Yields 0 when the truncated value is 0. The guard tests the truncated value
 *       rather than x itself because __builtin_clz(0) is undefined.
 * \note The argument is evaluated twice, so do not pass expressions with side effects.
 */
#define _LOG2(x) ((uint32_t)(x) ? (31 - __builtin_clz((uint32_t)(x))) : 0)
/*!
 * \brief Configure two state registers of the DSA with one "ss_cfg_param" intrinsic.
 * \param idx1 Index of the first register; placed in the immediate starting at bit 0.
 * \param val1 The value for the first register.
 * \param s1 Flag of the first register; placed at bit 10 of the immediate.
 * \param idx2 Index of the second register; placed in the immediate starting at bit 5.
 * \param val2 The value for the second register.
 * \param s2 Flag of the second register; when set, bit 11 and every bit above it are set.
 * \note The indices are not masked, so an index wider than 5 bits spills into the next field.
 */
inline void CONFIG_PARAM(int idx1, REG val1, bool s1,
                         int idx2, REG val2, bool s2) {
  int imm = idx1;
  imm |= ((int) idx2) << 5;
  imm |= s1 << 10;
  if (s2) {
    /* All bits from bit 11 upwards, which makes the immediate negative. */
    imm |= ~((1 << 11) - 1);
  }
  INTRINSIC_RRI("ss_cfg_param", val1, val2, (uint64_t) imm);
}
/*!
 * \brief Configure a single state register of the DSA with one "ss_cfg_param" intrinsic.
 *        The second register operand of the intrinsic is given the constant 0.
 * \param idx1 Index of the register; placed in the immediate starting at bit 0.
 * \param val1 The value for the register.
 * \param s1 Flag of the register; placed at bit 10 of the immediate.
 */
inline void CONFIG_PARAM(int idx1, REG val1, bool s1) {
  int flag_bit = s1 << 10;
  int imm = idx1 | flag_bit;
  INTRINSIC_RRI("ss_cfg_param", val1, (uint64_t) 0, (uint64_t) imm);
}
/*!
 * \brief Set the predicate that selects which lanes receive the broadcast control code.
 *        Writes bitmask to the DSARF::TBC register with the register flag set.
 * \param bitmask: The bit mask of the lanes.
 *                 The number of lanes can be configured with the environment variable LANES=n.
 *                 By default, it is 8.
 * \note Lanes are numbered from 0, i.e. 0..(n-1). The n-th lane is the shared scratchpad.
 */
inline void SS_CONTEXT(REG bitmask) {
  CONFIG_PARAM(DSARF::TBC,
               bitmask,
               1);
}
/*!
 * \brief Load a spatial architecture configuration.
 *        The configuration is produced by the spatial scheduler.
 *        Writes addr to DSARF::CSA and size to DSARF::CFS, both with the register flag clear.
 * \param addr: The array holding the configuration bitstream.
 * \param size: The size of that array in bytes.
 */
inline void SS_CONFIG(REG addr, REG size) {
  CONFIG_PARAM(DSARF::CSA, addr, 0,
               DSARF::CFS, size, 0);
}
/*!
 * \brief Drop all the ongoing data requests while keeping the configuration.
 *        Issued as SS_CONFIG with address 0 and size 0.
 */
inline void SS_RESET() {
  SS_CONFIG(/*addr=*/(uint64_t) 0,
            /*size=*/(uint64_t) 0);
}
/*!
 * \brief Reset all live streams after all ports have finished.
 *        Issued as SS_CONFIG with address 0 and size 1.
 */
inline void SS_STREAM_RESET() {
  SS_CONFIG(/*addr=*/(uint64_t) 0,
            /*size=*/(uint64_t) 1);
}
/*!
 * \brief Pack the data types of a stream into one bitmask.
 *        Each field holds _LOG2 of the corresponding argument, so pass powers of 2.
 * \param dtype The direct memory access data type. Bits [0:1]
 * \param const_type The const stream data type. Bits [2:3]
 * \param index_type The indirect index data type. Bits [4:5]
 * \param ind_s2d The indirect stride2d address data type. Bits [6:7]
 * \param l1d_type The indirect inner dimension length data type. Bits [8:9]
 * \return The packed bitmask.
 * \note The fields are not masked; an argument whose log2 exceeds 3 spills into the next field.
 */
inline uint64_t DTYPE_MASK(int dtype = 0,
                           int const_type = 0,
                           int index_type = 0,
                           int ind_s2d = 0,
                           int l1d_type = 0) {
  /* Ordered from the most significant field down to the least significant one. */
  const int fields[5] = {
    _LOG2(l1d_type),
    _LOG2(ind_s2d),
    _LOG2(index_type),
    _LOG2(const_type),
    _LOG2(dtype),
  };
  uint64_t packed = 0;
  for (int i = 0; i < 5; ++i) {
    packed = (packed << 2) | fields[i];
  }
  return packed;
}
/*!
 * \brief Set one configuration field of a port through the "ss_cfg_port" intrinsic.
 *        The immediate carries the field from bit 0, leaves bit 4 clear, and carries
 *        the port from bit 5.
 * \param port The port to be configured.
 * \param field The field of the port to be configured.
 * \param value The value the field is set to.
 */
inline void SS_CONFIG_PORT(int port, int field, REG value) {
  uint64_t imm = ((uint64_t) port) << 5;
  imm |= field;
  INTRINSIC_RI("ss_cfg_port", value, imm);
}
/*! \brief The next stream instantiated from this port will be repeated n times. */
inline void SS_REPEAT_PORT(int port, REG n) {
SS_CONFIG_PORT(port, DPF_PortRepeat, MUL(n, 1 << DSA_REPEAT_DIGITAL_POINT));
}
/*! \brief A delta will be applied on the times of repeat after each time the repeat is done. */
inline void SS_REPEAT_STRETCH(int port, REG n) {
SS_CONFIG_PORT(port, DPF_PortRepeatStretch, MUL(n, 1 << DSA_REPEAT_DIGITAL_POINT));
}
/*!
 * \brief Set the registers related to a 1-d stream.
 * \param start Written to register SAR.
 * \param stride1d Written to register I1D.
 * \param length Written to register L1D.
 * \param dtype The data type of the stream; packed into register CSR.
 * \param ctype The const stream data type; packed into register CSR.
 * \note CSR is the only register written here with its register flag set.
 */
inline void CONFIG_1D_STREAM(REG start, REG stride1d, REG length, int dtype, int ctype) {
  uint64_t type_bits = DTYPE_MASK(dtype, ctype, 0);
  CONFIG_PARAM(DSARF::SAR, start, 0,
               DSARF::L1D, length, 0);
  CONFIG_PARAM(DSARF::CSR, type_bits, 1,
               DSARF::I1D, stride1d, 0);
}
/*!
 * \brief Pack the attributes of a linear stream into one bitmask.
 *        Every argument is truncated to the width of its field.
 * \param port Bits [0:6].
 * \param padding Bits [11:13].
 * \param action Bit [14].
 * \param dimension Bits [15:16].
 * \param operation Bits [7:9].
 * \param memory Bit [10].
 * \return The packed bitmask.
 */
inline uint64_t LINEAR_STREAM_MASK(int port, int padding, int action, int dimension,
                                   int operation, int memory) {
  uint64_t packed = (uint64_t) (port & 127);
  packed |= ((uint64_t) (operation & 7)) << 7;
  packed |= ((uint64_t) (memory & 1)) << 10;
  packed |= ((uint64_t) (padding & 7)) << 11;
  packed |= ((uint64_t) (action & 1)) << 14;
  packed |= ((uint64_t) (dimension & 3)) << 15;
  return packed;
}
/*!
 * \brief Pack the attributes of an indirect memory stream into one bitmask.
 * \param port The destination port. Starts at bit 0.
 * \param memory The type of the memory, 0: dma, 1: spad. Starts at bit 10.
 * \param ind A 3-bit hot vector. Starts at bit 11.
 *            xx1: the index comes from a port, otherwise 0;
 *            x1x: the offset address comes from a port, otherwise it is stride2d * outer;
 *            1xx: the length comes from a port, otherwise it is the l1d register.
 * \param dim The dimension of the stream. Starts at bit 15.
 *            INSTANTIATE_1D_INDIRECT passes 0 and SS_INDIRECT_2D_READ passes 1.
 * \param operation The memory operation. Starts at bit 7.
 * \param penetrate Index stream state penetration. Bit 14.
 * \param associate Index stream state association. Bit 16.
 * \return The packed bitmask.
 * \note No argument is masked, so an out-of-range value spills into the higher fields.
 */
inline uint64_t INDIRECT_STREAM_MASK(int port,
                                     int memory,
                                     int ind,
                                     int dim,
                                     MemoryOperation operation,
                                     bool penetrate,
                                     bool associate) {
  uint64_t packed = (uint64_t) port;
  packed |= ((uint64_t) ((int) operation)) << 7;
  packed |= ((uint64_t) memory) << 10;
  packed |= ((uint64_t) ind) << 11;
  packed |= ((uint64_t) penetrate) << 14;
  packed |= ((uint64_t) dim) << 15;
  packed |= ((uint64_t) associate) << 16;
  return packed;
}
/*!
 * \brief Instantiate a 1d linear stream. (dtype*)(a + i*stride1d)
 *        Configures the 1-d stream registers, then issues "ss_lin_strm" with the
 *        dimension field set to 0.
 * \param addr The initial value of the state machine.
 * \param stride The stride after each access.
 * \param length The number of trip counts.
 * \param port The source/destination port.
 * \param padding The mode of padding. Refer rf.h:Padding for more details.
 *                It is unused by a write stream; pass 0 as a placeholder.
 * \param action 0: access; 1: generate the affine linear value sequence to the port.
 * \param operation 0: read, 1: write, 2-7: atomic +, -, *, /, min, and max.
 * \param memory 0: memory, 1: spad.
 * \param dtype The data type of this stream.
 * \param ctype The const stream data type.
 */
inline void INSTANTIATE_1D_STREAM(REG addr, REG stride, REG length,
                                  int port, int padding, int action, int operation,
                                  int memory, int dtype, int ctype) {
  CONFIG_1D_STREAM(addr, stride, length, dtype, ctype);
  const int dimension_1d = 0;
  uint64_t descriptor =
      LINEAR_STREAM_MASK(port, padding, action, dimension_1d, operation, memory);
  INTRINSIC_R("ss_lin_strm", descriptor);
}
/*!
 * \brief Feed several consts to a port.
 *        The port is set to repeat n times, then a generated 1d stream of length 1,
 *        stride 0 and initial value `value` is instantiated on it.
 * \param port: The destination port.
 * \param value: The value to be fed. The data type is 8-byte.
 * \param n: The number of elements to be fed to the port.
 * \param cbyte: The data type of the constant.
 */
inline void SS_CONST(int port, REG value, REG n, int cbyte = 8) {
  const int memory_operation = 0;
  const int memory_source = 0;
  const int word_byte = 1;
  SS_REPEAT_PORT(port, n);
  INSTANTIATE_1D_STREAM(/*Initial*/value, /*Stride*/(uint64_t) 0, /*N*/1,
                        port, DP_NoPadding, DSA_Generate,
                        memory_operation, memory_source, word_byte, cbyte);
}
/*!
 * \brief Insert a barrier for the accelerator. Refer rf.h to see the masks.
 *        Issues "ss_wait" with a zero-initialized destination and a zero immediate.
 * \param mask The mask of the barrier.
 */
inline void SS_WAIT(REG mask) {
  REG unused_dst((uint64_t) 0);
  INTRINSIC_DRI("ss_wait", unused_dst, mask, (uint64_t) 0);
}
/*!
 * \brief Block the control host and wait for everything to be done on the accelerator.
 *        Calls SS_WAIT with every bit of the mask set.
 */
inline void SS_WAIT_ALL() {
  SS_WAIT(REG(~0ull));
}
/*!
 * \brief Write a value from CGRA to the register file.
 *        Sets register CSR from dtype, then issues "ss_recv" with the immediate
 *        (port << 5) | 1.
 * \param port: The source port.
 * \param dtype: The data type of the received value.
 * \return The register written by the "ss_recv" intrinsic.
 */
inline REG SS_RECV(int port, int dtype = 8) {
  CONFIG_PARAM(DSARF::CSR, DTYPE_MASK(dtype), 0);
  int imm = (port << 5) | 1;
  REG received;
  REG zero_src((uint64_t) 0);
  INTRINSIC_DRI("ss_recv", received, zero_src, imm);
  return received;
}
/*!
* \brief Forward value from output port to the input port.
* \param output_port: The data source port.
* \param input_port: The destination data port.
* \param n: The number of data forwarded.
* \param dtype: The data type of each element forwarded.
*/
inline void SS_RECURRENCE(int oport, int iport, REG n, int dtype = 8) {
CONFIG_PARAM(DSARF::L1D, n, false, DSARF::CSR, _LOG2(dtype), false);
CONFIG_PARAM(DSARF::I1D, (uint64_t) 1, false);
REG port(iport | (oport << 7));
INTRINSIC_R("ss_wr_rd", port);
}
/*!
 * \brief Set the registers related to a 2d stream.
 *        The 1-d registers are set through CONFIG_1D_STREAM first.
 * \param addr Register SAR
 * \param stride1d Register I1D
 * \param length Register L1D
 * \param stride2d Register I2D
 * \param stretch Register E2D
 * \param n Register L2D
 * \param dtype The data type of the stream; packed into register CSR.
 * \param ctype The const stream data type; packed into register CSR.
 */
inline void CONFIG_2D_STREAM(REG addr, REG stride1d, REG length,
                             REG stride2d, REG stretch, REG n, int dtype, int ctype) {
  /* Inner dimension. */
  CONFIG_1D_STREAM(addr, stride1d, length, dtype, ctype);
  /* Outer dimension. */
  CONFIG_PARAM(DSARF::E2D, stretch, 0,
               DSARF::L2D, n, 0);
  CONFIG_PARAM(DSARF::I2D, stride2d, 0);
}
/*!
 * \brief Instantiate a 2d linear stream.
 *        Configures the 2d stream registers, then issues "ss_lin_strm" with the
 *        dimension field set to 1.
 * \param addr, stride1d, l1d, stride2d, stretch, n, dtype, ctype
 *        Forwarded to CONFIG_2D_STREAM in this order.
 * \param port, padding, action, op, mem
 *        Packed by LINEAR_STREAM_MASK as port, padding, action, operation and memory.
 */
inline void INSTANTIATE_2D_STREAM(REG addr, REG stride1d, REG l1d, REG stride2d, REG stretch, REG n,
                                  int port, int padding, int action, int op, int mem,
                                  int dtype, int ctype) {
  CONFIG_2D_STREAM(addr, stride1d, l1d, stride2d, stretch, n, dtype, ctype);
  const int dimension_2d = 1;
  uint64_t descriptor = LINEAR_STREAM_MASK(port, padding, action, dimension_2d, op, mem);
  INTRINSIC_R("ss_lin_strm", descriptor);
}
/*!
 * \brief Set the registers related to a 3d stream.
 *        The 1-d and 2-d registers are set through CONFIG_2D_STREAM first.
 * \param addr, stride_1d, l1d, stride_2d, stretch_2d1d, n_2d, dtype, ctype
 *        Forwarded to CONFIG_2D_STREAM in this order.
 * \param delta_stretch_3d2d Register DE2D
 * \param delta_stride_3d2d Register DI2D
 * \param delta_length_3d1d Register E3D1D
 * \param delta_length_3d2d Register E3D2D
 * \param stride_3d Register I3D
 * \param n_3d Register L3D
 */
inline void CONFIG_3D_STREAM(REG addr, REG stride_1d, REG l1d, REG stride_2d, REG stretch_2d1d, REG n_2d,
                             REG delta_stretch_3d2d, REG delta_stride_3d2d,
                             REG delta_length_3d1d, REG delta_length_3d2d,
                             REG stride_3d, REG n_3d, int dtype, int ctype) {
  /* The two inner dimensions. */
  CONFIG_2D_STREAM(addr, stride_1d, l1d, stride_2d, stretch_2d1d, n_2d, dtype, ctype);
  /* The outermost dimension. */
  CONFIG_PARAM(DSARF::DE2D, delta_stretch_3d2d, 0, DSARF::DI2D, delta_stride_3d2d, 0);
  CONFIG_PARAM(DSARF::E3D1D, delta_length_3d1d, 0, DSARF::E3D2D, delta_length_3d2d, 0);
  CONFIG_PARAM(DSARF::I3D, stride_3d, 0, DSARF::L3D, n_3d, 0);
}
/*!
 * \brief Instantiate a 3d linear stream.
 *        Configures the 3d stream registers, then issues "ss_lin_strm" with the
 *        dimension field set to 2.
 * \param addr, stride_1d, l1d, stride_2d, stretch_2d1d, n_2d, delta_stretch_3d2d,
 *        delta_stride_3d2d, delta_length_3d1d, delta_length_3d2d, stride_3d, n_3d, dtype, ctype
 *        Forwarded to CONFIG_3D_STREAM.
 * \param port, padding, action, op, mem
 *        Packed by LINEAR_STREAM_MASK as port, padding, action, operation and memory.
 */
inline void INSTANTIATE_3D_STREAM(REG addr, REG stride_1d, REG l1d, REG stride_2d,
                                  REG stretch_2d1d, REG n_2d,
                                  REG delta_stretch_3d2d, REG delta_stride_3d2d,
                                  REG delta_length_3d1d, REG delta_length_3d2d,
                                  REG stride_3d, REG n_3d,
                                  int port, int padding, int action, int op, int mem,
                                  int dtype, int ctype) {
  CONFIG_3D_STREAM(addr, stride_1d, l1d, stride_2d, stretch_2d1d, n_2d,
                   delta_stretch_3d2d, delta_stride_3d2d,
                   delta_length_3d1d, delta_length_3d2d,
                   stride_3d, n_3d, dtype, ctype);
  const int dimension_3d = 2;
  uint64_t descriptor = LINEAR_STREAM_MASK(port, padding, action, dimension_3d, op, mem);
  INTRINSIC_R("ss_lin_strm", descriptor);
}
/*!
* \brief Periodically feed two consts to a port. [(val1 x v1_repeat), (val2 x v2_repeat)] x iters
* \param port: The destination port.
* \param val1: The first value to be fed. The data type is 8-byte.
* \param v1_repeat: The repeat times of val1.
* \param val2: The second value to be fed. The data type is 8-byte.
* \param v2_repeat: The repeat :qtimes of val2.
* \param iters: The times of repeating the periods.
* \param ctype: The data type of this const stream.
*/
inline void SS_2D_CONST(int port, REG v1, REG r1, REG v2, REG r2, REG iters, int cbyte = 8) {
REG stride_2d = SUB(v2, v1);
REG stretch_2d1d = SUB(r2, r1);
INSTANTIATE_3D_STREAM(v1, (uint64_t) 0, r1, stride_2d, /*stretch 2d1d*/stretch_2d1d,
/*n2d*/(uint64_t) 2,
/*delta stretch 3d2d*/(uint64_t) 0,
/*delta stride 3d2d*/(uint64_t) 0,
/*delta length 3d1d*/(uint64_t) 0,
/*delta length 3d2d*/(uint64_t) 0,
/*stride 3d*/(uint64_t) 0,
/*n3d*/iters, port, 0, DSA_Generate, 0, 0, 1, cbyte);
}
/*!
 * \brief Instantiate a 1d indirect stream a[b[i]]
 * \param target_port The port passed to INDIRECT_STREAM_MASK as the destination port.
 * \param target_type The data type of the stream; packed into register CSR.
 * \param idx_port The index port; written to register INDP.
 * \param index_type The data type of the index; packed into register CSR.
 * \param start Written to register SAR.
 * \param stride1d Written to register I1D.
 * \param len Written to register L1D.
 * \param memory The type of the memory.
 * \param operation The memory operation.
 * \param penetrate Forwarded to INDIRECT_STREAM_MASK.
 * \param associate Forwarded to INDIRECT_STREAM_MASK.
 */
inline void INSTANTIATE_1D_INDIRECT(int target_port, int target_type, int idx_port, int index_type,
                                    REG start, REG stride1d, REG len, int memory,
                                    MemoryOperation operation, bool penetrate, bool associate = false) {
  uint64_t type_bits = DTYPE_MASK(target_type, 0, index_type);
  CONFIG_PARAM(DSARF::INDP, idx_port, 0,
               DSARF::SAR, start, 0);
  CONFIG_PARAM(DSARF::L1D, len, 0,
               DSARF::CSR, type_bits, 0);
  CONFIG_PARAM(DSARF::I1D, stride1d, 0);
  /* Only the index comes from a port, and the stream is 1d. */
  const int ind_mode = 1;
  const int dim = 0;
  uint64_t descriptor =
      INDIRECT_STREAM_MASK(target_port, memory, ind_mode, dim, operation, penetrate, associate);
  INTRINSIC_R("ss_ind_strm", descriptor);
}
/*!
 * \brief Allocate [start, end) on the spad to be buffet buffer.
 *        The value written to DSARF::BR holds start in its low 32 bits and end in
 *        its high 32 bits.
 * \param start The close set of the starting address.
 * \param end The open set of the end address.
 * \note Both bounds are packed through unsigned 32-bit values. This avoids
 *       left-shifting a negative signed value (SS_BUFFET_DEALLOC passes -1) and keeps
 *       a negative start from sign-extending over the half that holds end.
 *       (-1, -1) still yields an all-ones value.
 */
inline void SS_BUFFET_ALLOC(int start, int end) {
  uint64_t low = (uint32_t) start;
  uint64_t high = ((uint64_t) (uint32_t) end) << 32;
  /* Kept as int64_t so that CONFIG_PARAM receives the same argument type as before. */
  int64_t mask = (int64_t) (high | low);
  CONFIG_PARAM(DSARF::BR, mask, 0);
}
/*!
 * \brief Deallocate the buffet buffer.
 *        Issued as SS_BUFFET_ALLOC with -1 for both the start and the end address.
 */
inline void SS_BUFFET_DEALLOC() {
  const int no_address = -1;
  SS_BUFFET_ALLOC(no_address, no_address);
}
/*!
 * \brief The attributes of a indirect 2d stream.
 * \code{c}
 * for (i=0; i<m; ++i)
 *   for (j=0; j<n[i]+i*stretch; ++j)
 *     a[b[i]+c[j]]
 * \endcode
 * \note For the three *_port members, -1 (the default) means the value does not
 *       come from a port; see SS_INDIRECT_2D_READ.
 */
struct Indirect2DAttr {
  /*! \brief The port destination. */
  int dest_port;
  /*! \brief The data type of this stream. */
  int dtype;
  /*! \brief The port of b[i]. */
  int start_port = -1;
  /*! \brief The type of b[i]. */
  int start_dtype = 0;
  /*! \brief The address of a. */
  REG start;
  /*! \brief The port of c[j]. */
  int idx_port = -1;
  /*! \brief The type of c[j]. */
  int idx_dtype = 0;
  /*! \brief The port of n[i]. */
  int l1d_port = -1;
  /*! \brief The type of n[i]. */
  int l1d_dtype = 0;
  /*! \brief The inner dimension can optionally be a constant when l1d_port is -1. */
  REG l1d;
  /*! \brief The value of m. */
  REG l2d;
  /*! \brief The value of stretch above. */
  REG stretch{(uint64_t) 0};
  /*! \brief The type of memory. */
  int memory;
  /*! \brief Index stream state penetration. */
  bool penetrate = false;
  /*! \brief Index stream state association. */
  bool associate = false;
  /*! \brief If a address generation, the dtype of the stream. */
  int ctype = 0;
};
/*!
 * \brief Instantiate a 2-d indirect read stream.
 *        A port attribute equal to -1 means that value does not come from a port:
 *        its bit in the indirect mode is left clear and port 0 is encoded in its place.
 * \param i2a The attributes of an indirect 2d stream. It is dereferenced unconditionally.
 */
inline void SS_INDIRECT_2D_READ(const Indirect2DAttr *i2a) {
  const bool has_idx = i2a->idx_port != -1;
  const bool has_start = i2a->start_port != -1;
  const bool has_l1d = i2a->l1d_port != -1;

  /* Bit 0: index from a port; bit 1: start from a port; bit 2: length from a port. */
  int ind_mode = 0;
  if (has_idx) {
    ind_mode |= 1;
  }
  if (has_start) {
    ind_mode |= 2;
  }
  if (has_l1d) {
    ind_mode |= 4;
  }

  /* The three ports share register INDP, 7 bits apart. */
  int idx_port = has_idx ? i2a->idx_port : 0;
  int start_port = has_start ? i2a->start_port : 0;
  int l1d_port = has_l1d ? i2a->l1d_port : 0;
  int port_mask = idx_port | (start_port << 7) | (l1d_port << 14);
  CONFIG_PARAM(DSARF::INDP, port_mask, 0, DSARF::L1D, i2a->l1d, 0);

  int dtype_mask =
      DTYPE_MASK(i2a->dtype, i2a->ctype, i2a->idx_dtype, i2a->start_dtype, i2a->l1d_dtype);
  CONFIG_PARAM(DSARF::E2D, i2a->stretch, 0, DSARF::CSR, dtype_mask, 0);
  CONFIG_PARAM(DSARF::SAR, i2a->start, 0, DSARF::L2D, i2a->l2d, 0);

  const int dim = 1;
  uint64_t descriptor = INDIRECT_STREAM_MASK(i2a->dest_port, i2a->memory, ind_mode, dim,
                                             DMO_Read, i2a->penetrate, i2a->associate);
  INTRINSIC_R("ss_ind_strm", descriptor);
}