36__device__
void copy(
const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
38 static_assert(is_detected<is_tuple, DimAccessOrderTuple>::value);
39 constexpr auto I0 = Number<0>{};
40 constexpr auto I1 = Number<1>{};
42 const auto& in_grid_desc =
layout(src_tensor).GetUnrolledDescriptor();
43 const auto& out_grid_desc =
layout(dst_tensor).GetUnrolledDescriptor();
45 using SrcShapeType = remove_cvref_t<
decltype(
shape(src_tensor))>;
46 constexpr index_t num_dims = SrcShapeType::Size();
48 constexpr auto thread_slice_lengths =
49 generate_sequence_v2([](
auto I) {
return size(SrcShapeType{}.At(I)); }, Number<num_dims>{});
50 constexpr auto dim_access_order = generate_sequence_v2(
51 [](
auto I) {
return DimAccessOrderTuple{}.At(I); }, Number<num_dims>{});
53 if constexpr(SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
56 auto transfer = ThreadwiseTensorSliceTransfer_v7<
57 Tuple<typename SrcTensorType::TensorElementType>,
58 Tuple<typename DstTensorType::TensorElementType>,
59 decltype(tie(in_grid_desc)),
60 decltype(tie(out_grid_desc)),
61 tensor_operation::element_wise::PassThrough,
62 Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
63 decltype(thread_slice_lengths),
64 decltype(dim_access_order),
68 Sequence<true>>{in_grid_desc,
69 make_tuple(src_tensor.GetMultiIdxOffsets()),
71 make_tuple(dst_tensor.GetMultiIdxOffsets()),
72 tensor_operation::element_wise::PassThrough{}};
74 transfer.Run(tie(in_grid_desc),
75 tie(src_tensor.GetBuffer()),
77 tie(dst_tensor.GetBuffer()));
79 else if constexpr(!SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
82 const auto src_slice_origin_idxs =
83 generate_tuple([&](
auto) {
return I0; }, Number<num_dims>{});
86 ThreadwiseTensorSliceTransfer_v1r3<
typename SrcTensorType::TensorElementType,
87 typename DstTensorType::TensorElementType,
88 remove_cvref_t<
decltype(in_grid_desc)>,
89 remove_cvref_t<
decltype(out_grid_desc)>,
90 tensor_operation::element_wise::PassThrough,
91 decltype(thread_slice_lengths),
92 decltype(dim_access_order),
95 InMemoryDataOperationEnum::Set,
98 dst_tensor.GetMultiIdxOffsets(),
99 tensor_operation::element_wise::PassThrough{}};
101 transfer.Run(in_grid_desc,
102 src_slice_origin_idxs,
103 src_tensor.GetBuffer(),
105 dst_tensor.GetBuffer());
107 else if constexpr(SrcTensorType::IsDynamicBuffer && !DstTensorType::IsDynamicBuffer)
110 const auto dst_slice_origin_idxs =
111 generate_tuple([&](
auto) {
return I0; }, Number<num_dims>{});
112 auto transfer = ThreadwiseTensorSliceTransfer_v2<
113 std::remove_const_t<typename SrcTensorType::TensorElementType>,
114 std::remove_const_t<typename DstTensorType::TensorElementType>,
115 remove_cvref_t<
decltype(in_grid_desc)>,
116 remove_cvref_t<
decltype(out_grid_desc)>,
117 decltype(thread_slice_lengths),
118 decltype(dim_access_order),
123 false>{in_grid_desc, src_tensor.GetMultiIdxOffsets()};
125 transfer.Run(in_grid_desc,
126 src_tensor.GetBuffer(),
128 dst_slice_origin_idxs,
129 dst_tensor.GetBuffer());
134 static_for<0, SrcShapeType::Size(), 1>{}([&](
auto i) { dst_tensor(i) = src_tensor(i); });
181 DstTensorType& dst_tensor,
184 static_assert(SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer);
185 static_assert(is_detected<is_tuple, DimAccessOrderTuple>::value);
187 const auto& in_grid_desc =
layout(src_tensor).GetUnrolledDescriptor();
188 const auto& out_grid_desc =
layout(dst_tensor).GetUnrolledDescriptor();
190 using SrcShapeType = remove_cvref_t<
decltype(
shape(src_tensor))>;
191 constexpr index_t num_dims = SrcShapeType::Size();
193 constexpr auto tile_lengths_seq =
194 generate_sequence_v2([](
auto I) {
return size(SrcShapeType{}.At(I)); }, Number<num_dims>{});
195 constexpr auto thread_layout_seq =
196 generate_sequence_v2([](
auto I) {
return size<I>(ThreadShape{}); }, Number<num_dims>{});
197 constexpr auto dim_access_order = generate_sequence_v2(
198 [](
auto I) {
return DimAccessOrderTuple{}.At(I); }, Number<num_dims>{});
200 using ThisThreadBlock = ThisThreadBlock<size(ThreadShape{})>;
203 auto transfer = ThreadGroupTensorSliceTransfer_v7<
205 Tuple<typename SrcTensorType::TensorElementType>,
206 Tuple<typename DstTensorType::TensorElementType>,
207 decltype(tie(in_grid_desc)),
208 decltype(tie(out_grid_desc)),
209 tensor_operation::element_wise::PassThrough,
210 Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
211 std::remove_const_t<
decltype(tile_lengths_seq)>,
212 std::remove_const_t<
decltype(thread_layout_seq)>,
213 std::remove_const_t<
decltype(dim_access_order)>,
214 std::remove_const_t<
decltype(dim_access_order)>,
218 Sequence<true>>{in_grid_desc,
219 make_tuple(src_tensor.GetMultiIdxOffsets()),
221 make_tuple(dst_tensor.GetMultiIdxOffsets()),
222 tensor_operation::element_wise::PassThrough{}};
224 transfer.Run(tie(in_grid_desc),
225 tie(src_tensor.GetBuffer()),
227 tie(dst_tensor.GetBuffer()));
__device__ void blockwise_copy(const SrcTensorType &src_tensor, DstTensorType &dst_tensor, const Layout< ThreadShape, ThreadUnrolledDesc > &thread_layout)
Perform optimized blockwise copy between two tensors. Tensors must have the same size.
Definition copy.hpp:180