/***************************************************************************************************
* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once

#include <cute/config.hpp>

#include <cute/atom/copy_atom.hpp>
#include <cute/algorithm/copy.hpp>

#include <cute/tensor_impl.hpp>
#include <cute/tensor_predicate.hpp>

namespace cute
{
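
// naive_cooperative_copy
// Fallback elementwise copy: thread tid handles elements tid, tid + NumThreads, tid + 2*NumThreads, ...
// For example, with NumThreads = 4 and size(src) = 10, thread 1 copies elements 1, 5, and 9.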
template <uint32_t NumThreads,
class SrcEngine, class SrcLayout,
class DstEngine, class DstLayout>
CUTE_HOST_DEVICE void
naive_cooperative_copy(uint32_t const& tid,
Tensor<SrcEngine, SrcLayout> const& src,
Tensor<DstEngine, DstLayout> & dst)
{
auto N = size(src);
if (tid < N) {
uint32_t upper_bound = (N / NumThreads) * NumThreads;
CUTE_UNROLL
for (uint32_t i = 0; i < upper_bound; i += NumThreads) { // All in-bounds
dst[tid + i] = src[tid + i];
}
if (N % NumThreads != 0) { // Likely static condition
uint32_t final_idx = tid + upper_bound;
if (final_idx < N) { // Final in-bounds
dst[final_idx] = src[final_idx];
}
}
}
}
// Accept mutable temporaries
template <uint32_t NumThreads,
class SrcEngine, class SrcLayout,
class DstEngine, class DstLayout>
CUTE_HOST_DEVICE void
naive_cooperative_copy(uint32_t const& tid,
Tensor<SrcEngine, SrcLayout> const& src,
Tensor<DstEngine, DstLayout> && dst)
{
return naive_cooperative_copy(tid, src, dst);
}
// A heuristic to determine a "good" permutation of two tensors for later vectorization and thr-assignment
template <class AEngine, class ALayout,
class BEngine, class BLayout>
CUTE_HOST_DEVICE constexpr
auto
heuristic_permutation(Tensor<AEngine, ALayout> const& a,
Tensor<BEngine, BLayout> const& b)
{
constexpr bool swizzleA = get_swizzle_t<AEngine>::num_bits != 0 or
get_swizzle_t<ALayout>::num_bits != 0;
constexpr bool swizzleB = get_swizzle_t<BEngine>::num_bits != 0 or
get_swizzle_t<BLayout>::num_bits != 0;
auto a_inv = right_inverse(get_nonswizzle_portion(a.layout()));
auto b_inv = right_inverse(get_nonswizzle_portion(b.layout()));
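// Score each tensor: prefer the permutation of a swizzled tensor (bit 2), then of an smem tensor (bit 1),
// then of the tensor whose layout right-inverse covers more elements (bit 0).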
constexpr uint8_t scoreA = (uint8_t(swizzleA) << 2) |
(uint8_t(is_smem<AEngine>::value) << 1) |
(uint8_t(size(a_inv) > size(b_inv)) << 0);
constexpr uint8_t scoreB = (uint8_t(swizzleB) << 2) |
(uint8_t(is_smem<BEngine>::value) << 1) |
(uint8_t(size(b_inv) > size(a_inv)) << 0);
if constexpr (scoreA >= scoreB) {
return a_inv;
} else {
return b_inv;
}
}
// cooperative_copy<NumThreads, MaxVecBits>(tid, src, dst)
// Use NumThreads to copy Tensor src to Tensor dst with element-wise vectorization up to MaxVecBits.
// @pre 0 <= @a tid < NumThreads
// @pre Tensors @a src and @a dst are aligned up to MaxVecBits.
// That is, pointers and dynamic strides are assumed to be aligned up to MaxVecBits.
//
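// Illustrative usage sketch (the names gptr, smem_buf, gA, sA and the 32x32 float tile are
// hypothetical, not part of this header): 128 threads stage a gmem tile into smem with up to
// 128-bit vectorization, assuming both pointers are 16-byte aligned.
//
//   __shared__ alignas(16) float smem_buf[32 * 32];
//   Tensor gA = make_tensor(make_gmem_ptr(gptr),     make_layout(Shape<_32,_32>{}));  // gptr: 16B-aligned float const*
//   Tensor sA = make_tensor(make_smem_ptr(smem_buf), make_layout(Shape<_32,_32>{}));
//   cooperative_copy<128, 128>(threadIdx.x, gA, sA);
//   __syncthreads();  // make the staged tile visible to all threads in the block
//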
template <uint32_t NumThreads, uint32_t MaxVecBits,
class SrcEngine, class SrcLayout,
class DstEngine, class DstLayout>
CUTE_HOST_DEVICE
void
cooperative_copy(uint32_t const& tid,
Tensor<SrcEngine, SrcLayout> const& src,
Tensor<DstEngine, DstLayout> & dst)
{
// Assumes the shapes are static; could be generalized or given a fallback path
CUTE_STATIC_ASSERT_V(is_static<decltype(shape(src))>{} && is_static<decltype(shape(dst))>{});
CUTE_STATIC_ASSERT_V(size(src) == size(dst));
// Assumes the value types are the same; could be generalized or given a fallback path
static_assert(cute::is_same<typename SrcEngine::value_type, typename DstEngine::value_type>::value);
static_assert(MaxVecBits == sizeof_bits_v<typename SrcEngine::value_type> ||
MaxVecBits == 8 || MaxVecBits == 16 || MaxVecBits == 32 || MaxVecBits == 64 || MaxVecBits == 128,
"Expected MaxVecBits to be value size or 8 or 16 or 32 or 64 or 128 for alignment and performance.");
// Check that the tensors are likely shared across threads: either gmem or smem
static_assert((is_gmem<SrcEngine>::value || is_smem<SrcEngine>::value),
"cooperative_copy expects shared gmem or smem source tensor.");
static_assert((is_gmem<DstEngine>::value || is_smem<DstEngine>::value),
"cooperative_copy expects shared gmem or smem destination tensor.");
// Precondition on tid in DEBUG
assert(tid < NumThreads);
// Precondition on pointer alignment in DEBUG
assert(is_byte_aligned<ceil_div(MaxVecBits,8u)>(raw_pointer_cast(src.data())));
assert(is_byte_aligned<ceil_div(MaxVecBits,8u)>(raw_pointer_cast(dst.data())));
#if 0
if (thread0()) {
print(" "); print("cooperative_copy\n");
print(" "); print("NumThreads: "); print(NumThreads); print("\n");
print(" "); print("MaxVecBits: "); print(MaxVecBits); print("\n");
print(" "); print("src: "); print(src); print("\n");
print(" "); print("dst: "); print(dst); print("\n");
}
#ifdef __CUDA_ARCH__
__syncthreads();
#endif
#endif
// The common layout of the two tensors that can be vectorized over elements and threads
// vidx -> coord
auto common_layout = heuristic_permutation(src, dst);
// Apply
// (V, rest)
Tensor src_a = coalesce(logical_divide(src, common_layout), Shape<_1,_1>{});
Tensor dst_a = coalesce(logical_divide(dst, common_layout), Shape<_1,_1>{});
//
// Determine vectorization of elems and thrs based on src/dst size and number of threads
// NOTE: This heuristic promotes parallelization over vectorization
//
// The number of elements and number of bits
constexpr int elem_bits = sizeof_bits_v<typename SrcEngine::value_type>;
constexpr int total_elem = size(SrcLayout{});
// The number of elements that can be vectorized in values
constexpr int common_elem = decltype(max_common_vector(src_a, dst_a))::value;
#if 0
if (thread0()) {
print(" "); print("common_layout: "); print(common_layout); print("\n");
print(" "); print("src_a: "); print(src_a); print("\n");
print(" "); print("dst_a: "); print(dst_a); print("\n");
}
#ifdef __CUDA_ARCH__
__syncthreads();
#endif
#endif
//
if constexpr (total_elem % NumThreads != 0) {
// No attempt to find a static partitioning pattern; fall back to the dynamically indexed slow path
if constexpr (common_elem > 1 && MaxVecBits > elem_bits) {
// If the common vector is non-trivial and MaxVecBits permits wider-than-element accesses, vectorize the fallback copy
constexpr auto max_align_src = elem_bits * decltype(max_alignment(src_a.layout()))::value;
constexpr auto max_align_dst = elem_bits * decltype(max_alignment(dst_a.layout()))::value;
constexpr auto vec_bits = gcd(max_align_src, max_align_dst, MaxVecBits);
using VecType = uint_bit_t<vec_bits>;
static_assert(vec_bits % elem_bits == 0, "Expected divisibility");
static_assert((vec_bits >= 8), "No support for subbyte copying");
Tensor src_v = recast<VecType const>(src_a);
Tensor dst_v = recast<VecType >(dst_a);
#if 0
if (thread0()) {
print(" "); print("cooperative_copy -- naive\n");
print(" "); print("src_v: "); print(src_v); print("\n");
print(" "); print("dst_v: "); print(dst_v); print("\n");
}
#ifdef __CUDA_ARCH__
__syncthreads();
#endif
#endif
naive_cooperative_copy<NumThreads>(tid, src_v, dst_v);
} else {
naive_cooperative_copy<NumThreads>(tid, src_a, dst_a);
}
} else {
// If the tensors can be equally partitioned by the threads,
// compute vectorization widths in elements and threads.
// If there are too many threads to allow a fully vectorized copy, truncate the vectorization
constexpr int total_bits = total_elem * elem_bits;
constexpr int max_bits_per_thr = total_bits / NumThreads;
// At least elem_bits, at most common_bits
constexpr int common_bits = common_elem * elem_bits;
constexpr int vec_bits = cute::max(elem_bits, cute::gcd(common_bits, int(MaxVecBits), max_bits_per_thr));
// Should account for vec_bits < 8 and/or vec_elem <= 1
// And also account for subbyte types, which could cause race conditions
// Want to ENFORCE sufficient vectorization in those cases
static_assert(vec_bits % elem_bits == 0, "Expected divisibility");
static_assert(vec_bits >= 8, "No support for subbyte copying");
using VecType = uint_bit_t<vec_bits>;
constexpr int vec_elem = vec_bits / elem_bits;
constexpr int vec_thrs = cute::min(int(NumThreads), total_elem / vec_elem);
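// Worked example: 2048 half_t elements (elem_bits = 16) copied by NumThreads = 256 with MaxVecBits = 128
// gives max_bits_per_thr = 128; if common_elem is a multiple of 8, then vec_bits = 128, vec_elem = 8,
// and vec_thrs = 256, i.e. each thread issues a single 128-bit copy.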
//
// Determine the partitioning patterns for the vec_elems and vec_thrs
//
// Distribute the rest of the V*T to some consistent portion outside of the common_layout, if needed
auto common_domain_src = domain_distribute(shape(src_a), Int<vec_elem*vec_thrs>{});
auto common_domain_dst = domain_distribute(shape(dst_a), Int<vec_elem*vec_thrs>{});
// Assert this for now; a fallback path could be taken here instead
CUTE_STATIC_ASSERT_V(size(common_domain_src) == Int<vec_elem*vec_thrs>{});
CUTE_STATIC_ASSERT_V(compatible(common_domain_src, common_domain_dst) ||
compatible(common_domain_dst, common_domain_src));
// Use the "more specific" domain for the extra elements of V*T
auto common_domain = conditional_return(compatible(common_domain_src, common_domain_dst),
common_domain_dst, common_domain_src);
// Construct the tiler
auto tiler_vt = common_domain.with_shape(Int<vec_elem>{}, Int<vec_thrs>{});
// Apply and slice
Tensor src_v = logical_divide(src_a, tiler_vt)(make_coord(_,tid),_);
Tensor dst_v = logical_divide(dst_a, tiler_vt)(make_coord(_,tid),_);
#if 0
if (thread0()) {
print(" "); print("cooperative_copy -- vec\n");
print(" "); print("Used vector: "); print(vec_elem); print("\n");
print(" "); print("Used threads: "); print(vec_thrs); print("\n");
print(" "); print("tiler_vt: "); print(tiler_vt); print("\n");
print(" "); print("src_v: "); print(src_v); print("\n");
print(" "); print("dst_v: "); print(dst_v); print("\n");
print(" "); print("recast<VecType const>(src_v): "); print(recast<VecType const>(src_v)); print("\n");
print(" "); print("recast<VecType >(dst_v): "); print(recast<VecType >(dst_v)); print("\n");
}
#ifdef __CUDA_ARCH__
__syncthreads();
#endif
#endif
// If we're using all threads (static) or the tid is in-range (dynamic)
if (vec_thrs == NumThreads or tid < vec_thrs) {
return copy_if(TrivialPredTensor{}, recast<VecType const>(src_v), recast<VecType>(dst_v));
}
}
}
// Default max-vectorization size to value_type size
template <uint32_t NumThreads,
class SrcEngine, class SrcLayout,
class DstEngine, class DstLayout>
CUTE_HOST_DEVICE
void
cooperative_copy(uint32_t const& tid,
Tensor<SrcEngine, SrcLayout> const& src,
Tensor<DstEngine, DstLayout> & dst)
{
constexpr uint32_t MaxVecBits = sizeof_bits_v<typename SrcEngine::value_type>;
return cooperative_copy<NumThreads, MaxVecBits>(tid, src, dst);
}
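// For example, with float tensors cooperative_copy<128>(tid, src, dst) is equivalent to
// cooperative_copy<128, 32>(tid, src, dst).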
//
// Accept mutable temporaries
//
template <uint32_t NumThreads,
class SrcEngine, class SrcLayout,
class DstEngine, class DstLayout>
CUTE_HOST_DEVICE
void
cooperative_copy(uint32_t const& tid,
Tensor<SrcEngine, SrcLayout> const& src,
Tensor<DstEngine, DstLayout> && dst)
{
return cooperative_copy<NumThreads>(tid, src, dst);
}
template <uint32_t NumThreads, uint32_t MaxVecBits,
class SrcEngine, class SrcLayout,
class DstEngine, class DstLayout>
CUTE_HOST_DEVICE
void
cooperative_copy(uint32_t const& tid,
Tensor<SrcEngine, SrcLayout> const& src,
Tensor<DstEngine, DstLayout> && dst)
{
return cooperative_copy<NumThreads, MaxVecBits>(tid, src, dst);
}
} // end namespace cute