18template <
typename SliceLengths,
24 typename SrcDimAccessOrder,
25 typename DstDimAccessOrder,
26 typename SrcVectorTensorLengths,
27 typename DstVectorTensorLengths,
28 typename SrcVectorTensorContiguousDimOrder,
29 typename DstVectorTensorContiguousDimOrder,
30 bool SrcResetCoordinateAfterRun,
33 bool DstResetCoordinateAfterRun>
51 const Index& src_slice_origin,
52 const DstDesc& dst_desc,
53 const Index& dst_slice_origin)
59 "wrong! current implementation assume SrcData and DstData are same type");
62 static_assert(SliceLengths::At(i) % SrcVectorTensorLengths::At(i) == 0 &&
63 SliceLengths::At(i) % DstVectorTensorLengths::At(i) == 0,
78 template <
typename SrcBuffer,
typename SrcStepHacks>
80 RunRead(
const SrcDesc& src_desc,
const SrcBuffer& src_buf,
const SrcStepHacks& src_step_hacks)
88 "wrong! SrcBuffer and SrcData data type are inconsistent");
91 constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{};
96 SrcVectorTensorContiguousDimOrder{}),
99 SrcVectorTensorContiguousDimOrder{});
101 constexpr auto src_vector_desc =
106 constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths;
108 constexpr auto src_dim_access_order = SrcDimAccessOrder{};
110 constexpr auto ordered_src_access_lengths =
116 Index forward_step_idx;
119 forward_step_idx(j) = (i.value == j.value) ? src_vector_tensor_lengths[i] : 0;
123 src_desc, forward_step_idx, src_step_hacks[
I0][i]);
130 Index backward_step_idx;
133 backward_step_idx(j) = (i.value == j.value) ? -src_vector_tensor_lengths[i] : 0;
137 src_desc, backward_step_idx, src_step_hacks[
I1][i]);
142 static_ford<
decltype(ordered_src_access_lengths)>{}([&](
auto ordered_src_access_idx) {
144 constexpr auto forward_sweep = [&]() {
147 forward_sweep_(
I0) =
true;
150 index_t tmp = ordered_src_access_idx[
I0];
153 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
156 forward_sweep_(i) = tmp % 2 == 0;
159 return forward_sweep_;
163 constexpr auto src_data_idx = [&]() {
167 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
168 : ordered_src_access_lengths[i] - 1 -
169 ordered_src_access_idx[i];
173 src_vector_tensor_lengths;
178 using src_vector_t =
typename decltype(src_vector)::type;
180 const bool is_src_valid =
184 src_vector.template AsType<src_vector_t>()(
I0) =
185 src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid);
191 constexpr index_t src_vector_offset =
192 src_vector_desc.CalculateOffset(src_vector_idx);
194 constexpr index_t buffer_offset =
195 buffer_desc_.CalculateOffset(src_data_idx + src_vector_idx);
201 constexpr auto move_on_dim = [&]()
constexpr {
205 move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
209 ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
218 if constexpr(move_on_dim[i])
220 if constexpr(forward_sweep[i])
223 src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
228 src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
235 if constexpr(SrcResetCoordinateAfterRun)
237 const auto src_reset_step =
244 template <
typename DstBuffer,
typename DstStepHacks>
246 RunWrite(
const DstDesc& dst_desc, DstBuffer& dst_buf,
const DstStepHacks& dst_step_hacks)
254 "wrong! SrcBuffer or DstBuffer data type is wrong");
257 constexpr auto dst_vector_tensor_lengths = DstVectorTensorLengths{};
262 DstVectorTensorContiguousDimOrder{}),
265 DstVectorTensorContiguousDimOrder{});
267 constexpr auto dst_vector_desc =
272 constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths;
274 constexpr auto dst_dim_access_order = DstDimAccessOrder{};
276 constexpr auto ordered_dst_access_lengths =
282 Index forward_step_idx;
285 forward_step_idx(j) = (i.value == j.value) ? dst_vector_tensor_lengths[i] : 0;
289 dst_desc, forward_step_idx, dst_step_hacks[
I0][i]);
296 Index backward_step_idx;
299 backward_step_idx(j) = (i.value == j.value) ? -dst_vector_tensor_lengths[i] : 0;
303 dst_desc, backward_step_idx, dst_step_hacks[
I1][i]);
308 static_ford<
decltype(ordered_dst_access_lengths)>{}([&](
auto ordered_dst_access_idx) {
310 constexpr auto forward_sweep = [&]() {
313 forward_sweep_(
I0) =
true;
319 tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
322 forward_sweep_(i) = tmp % 2 == 0;
325 return forward_sweep_;
329 constexpr auto dst_data_idx = [&]() {
333 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
334 : ordered_dst_access_lengths[i] - 1 -
335 ordered_dst_access_idx[i];
339 dst_vector_tensor_lengths;
348 constexpr index_t buffer_offset =
349 buffer_desc_.CalculateOffset(dst_data_idx + dst_vector_idx);
351 constexpr index_t dst_vector_offset =
352 dst_vector_desc.CalculateOffset(dst_vector_idx);
358 using dst_vector_t =
typename decltype(dst_vector)::type;
361 const bool is_dst_valid =
365 dst_coord_.GetOffset(),
367 dst_vector.template AsType<dst_vector_t>()[
Number<0>{}]);
369 constexpr auto move_on_dim = [&]()
constexpr {
373 move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
377 ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
386 if constexpr(move_on_dim[i])
388 if constexpr(forward_sweep[i])
391 dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
396 dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
403 if constexpr(DstResetCoordinateAfterRun)
405 const auto dst_reset_step =
412 template <
typename SrcBuffer>
413 __device__
void RunRead(
const SrcDesc& src_desc,
const SrcBuffer& src_buf)
415 constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform();
419 constexpr auto src_step_hacks =
423 RunRead(src_desc, src_buf, src_step_hacks);
426 template <
typename DstBuffer>
427 __device__
void RunWrite(
const DstDesc& dst_desc, DstBuffer& dst_buf)
429 constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform();
433 constexpr auto dst_step_hacks =
437 RunWrite(dst_desc, dst_buf, dst_step_hacks);
442 constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{};
444 constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths;
446 constexpr auto src_dim_access_order = SrcDimAccessOrder{};
448 constexpr auto ordered_src_access_lengths =
452 constexpr auto forward_sweep = [&]() {
455 forward_sweep_(
I0) =
true;
458 index_t tmp = ordered_src_access_lengths[
I0] - 1;
461 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
464 forward_sweep_(i) = tmp % 2 == 0;
467 return forward_sweep_;
472 constexpr auto src_data_idx = [&]() {
476 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
480 src_vector_tensor_lengths;
484 constexpr auto reset_src_data_step = [&]() {
485 Index reset_src_data_step_;
489 return reset_src_data_step_;
492 return reset_src_data_step;
497 constexpr auto dst_vector_tensor_lengths = DstVectorTensorLengths{};
499 constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths;
501 constexpr auto dst_dim_access_order = DstDimAccessOrder{};
503 constexpr auto ordered_dst_access_lengths =
507 constexpr auto forward_sweep = [&]() {
510 forward_sweep_(
I0) =
true;
513 index_t tmp = ordered_dst_access_lengths[
I0] - 1;
516 tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
519 forward_sweep_(i) = tmp % 2 == 0;
522 return forward_sweep_;
527 constexpr auto dst_data_idx = [&]() {
531 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
535 dst_vector_tensor_lengths;
539 constexpr auto reset_dst_data_step = [&]() {
540 Index reset_dst_data_step_;
544 return reset_dst_data_step_;
547 return reset_dst_data_step;
552 const Index& src_slice_origin_step_idx)
555 const auto adjusted_step_idx =
556 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
566 template <
typename SrcMoveSliceWindowStepHack>
569 const Index& src_slice_origin_step_idx,
570 const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
573 const auto adjusted_step_idx =
574 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
579 src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
585 const Index& dst_slice_origin_step_idx)
588 const auto adjusted_step_idx =
589 DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
599 static constexpr auto buffer_desc_ =
602 static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();
604 StaticBuffer<AddressSpaceEnum::Vgpr, SrcData, buffer_size_, true> buffer_;
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
Definition tensor_description/tensor_descriptor.hpp:444
__host__ __device__ constexpr void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
Definition tensor_description/tensor_descriptor.hpp:508
__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
Definition tensor_descriptor_helper.hpp:49
InMemoryDataOperationEnum
Definition ck.hpp:277
@ Set
Definition ck.hpp:278
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
__host__ __device__ constexpr bool coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition tensor_description/tensor_descriptor.hpp:560
integral_constant< index_t, N > Number
Definition number.hpp:12
@ Lds
Definition amd_address_space.hpp:18
@ Global
Definition amd_address_space.hpp:17
__host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence< Is... >)
Definition utility/container_helper.hpp:380
__host__ __device__ constexpr Y type_convert(X x)
Definition utility/type_convert.hpp:98
__host__ __device__ constexpr auto container_reorder_given_old2new(const Array< TData, NSize > &old_array, Sequence< IRs... > old2new)
Definition utility/container_helper.hpp:54
__host__ __device__ constexpr auto to_multi_index(const T &x)
Definition array_multi_index.hpp:28
__host__ __device__ constexpr auto generate_tuple(F &&f, Number< N >)
Definition tuple_helper.hpp:21
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto container_reverse_exclusive_scan(const Array< TData, NSize > &x, Reduce f, TData init)
Definition utility/container_helper.hpp:213
__host__ __device__ constexpr auto make_tensor_coordinate(const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
Definition tensor_description/tensor_descriptor.hpp:407
__host__ __device__ constexpr auto container_reorder_given_new2old(const Array< TData, NSize > &old_array, Sequence< IRs... >)
Definition utility/container_helper.hpp:43
Array< index_t, N > MultiIndex
Definition array_multi_index.hpp:12
typename vector_type_maker< T, N >::type vector_type_maker_t
Definition dtype_vector.hpp:54
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
__device__ void RunRead(const SrcDesc &src_desc, const SrcBuffer &src_buf)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:413
__device__ void MoveDstSliceWindow(const DstDesc &dst_desc, const Index &dst_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:584
ck::ThreadwiseTensorSliceTransfer_v5r1< ThreadSliceLengths, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorTensorLengths, DstVectorTensorLengths, SrcVectorTensorContiguousDimOrder, DstVectorTensorContiguousDimOrder, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun >::SrcCoord decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition threadwise_tensor_slice_transfer_v5r1.hpp:44
__device__ void SetDstSliceOrigin(const DstDesc &dst_desc, const Index &dst_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:73
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:551
ck::ThreadwiseTensorSliceTransfer_v5r1< ThreadSliceLengths, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorTensorLengths, DstVectorTensorLengths, SrcVectorTensorContiguousDimOrder, DstVectorTensorContiguousDimOrder, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun >::DstCoordStep decltype(make_tensor_coordinate_step(DstDesc{}, Index{})) DstCoordStep
Definition threadwise_tensor_slice_transfer_v5r1.hpp:48
__device__ void SetSrcSliceOrigin(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:68
__device__ void RunWrite(const DstDesc &dst_desc, DstBuffer &dst_buf, const DstStepHacks &dst_step_hacks)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:246
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx, const SrcMoveSliceWindowStepHack &src_move_slice_window_step_hack)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:568
ck::ThreadwiseTensorSliceTransfer_v5r1< ThreadSliceLengths, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorTensorLengths, DstVectorTensorLengths, SrcVectorTensorContiguousDimOrder, DstVectorTensorContiguousDimOrder, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun >::I0 static constexpr auto I0
Definition threadwise_tensor_slice_transfer_v5r1.hpp:38
static __device__ constexpr auto GetSrcCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v5r1.hpp:440
__device__ constexpr ThreadwiseTensorSliceTransfer_v5r1(const SrcDesc &src_desc, const Index &src_slice_origin, const DstDesc &dst_desc, const Index &dst_slice_origin)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:50
ck::ThreadwiseTensorSliceTransfer_v5r1< ThreadSliceLengths, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorTensorLengths, DstVectorTensorLengths, SrcVectorTensorContiguousDimOrder, DstVectorTensorContiguousDimOrder, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun >::SrcCoordStep decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition threadwise_tensor_slice_transfer_v5r1.hpp:47
ck::ThreadwiseTensorSliceTransfer_v5r1< ThreadSliceLengths, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorTensorLengths, DstVectorTensorLengths, SrcVectorTensorContiguousDimOrder, DstVectorTensorContiguousDimOrder, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun >::Index MultiIndex< nDim > Index
Definition threadwise_tensor_slice_transfer_v5r1.hpp:42
ck::ThreadwiseTensorSliceTransfer_v5r1< ThreadSliceLengths, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorTensorLengths, DstVectorTensorLengths, SrcVectorTensorContiguousDimOrder, DstVectorTensorContiguousDimOrder, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun >::nDim static constexpr index_t nDim
Definition threadwise_tensor_slice_transfer_v5r1.hpp:41
ck::ThreadwiseTensorSliceTransfer_v5r1< ThreadSliceLengths, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorTensorLengths, DstVectorTensorLengths, SrcVectorTensorContiguousDimOrder, DstVectorTensorContiguousDimOrder, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun >::DstCoord decltype(make_tensor_coordinate(DstDesc{}, Index{})) DstCoord
Definition threadwise_tensor_slice_transfer_v5r1.hpp:45
__device__ void RunWrite(const DstDesc &dst_desc, DstBuffer &dst_buf)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:427
static __device__ constexpr auto GetDstCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v5r1.hpp:495
ck::ThreadwiseTensorSliceTransfer_v5r1< ThreadSliceLengths, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorTensorLengths, DstVectorTensorLengths, SrcVectorTensorContiguousDimOrder, DstVectorTensorContiguousDimOrder, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun >::I1 static constexpr auto I1
Definition threadwise_tensor_slice_transfer_v5r1.hpp:39
__device__ void RunRead(const SrcDesc &src_desc, const SrcBuffer &src_buf, const SrcStepHacks &src_step_hacks)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:80
static constexpr value_type value
Definition utility/integral_constant.hpp:13
Definition utility/math.hpp:34
Definition functional2.hpp:33
Definition functional3.hpp:97