GridwiseBatchedGemmSoftmaxGemm_Wmma< ADataType, B0DataType, Acc0DataType, B1DataType, Acc1DataType, CShuffleDataType, CDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc, B0GridDesc, B1GridDesc, CGridDesc_M_N, MPerBlock, LPerBlock, KPerBlock, AK1Value, BK1Value, NPerBlock, LTilePerBlock, L1Value, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, AEnableLds, ABlockLdsExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0ThreadTransferSrcResetCoordinateAfterRun, B0EnableLds, B0BlockLdsExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1ThreadTransferSrcResetCoordinateAfterRun, B1EnableLds, B1BlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, PadN, MaskOutUpperTriangle, NumGemmKPrefetchStage, LoopSched, PipelineVer > Struct Template Reference
Classes |
Public Types |
Static Public Member Functions |
Static Public Attributes |
List of all members
ck::GridwiseBatchedGemmSoftmaxGemm_Wmma< ADataType, B0DataType, Acc0DataType, B1DataType, Acc1DataType, CShuffleDataType, CDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc, B0GridDesc, B1GridDesc, CGridDesc_M_N, MPerBlock, LPerBlock, KPerBlock, AK1Value, BK1Value, NPerBlock, LTilePerBlock, L1Value, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, AEnableLds, ABlockLdsExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0ThreadTransferSrcResetCoordinateAfterRun, B0EnableLds, B0BlockLdsExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1ThreadTransferSrcResetCoordinateAfterRun, B1EnableLds, B1BlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, PadN, MaskOutUpperTriangle, NumGemmKPrefetchStage, LoopSched, PipelineVer > Struct Template Reference
#include <gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp>
Classes | |
| struct | SharedMemTrait |
Public Types | |
| using | ThisThreadBlock = ThisThreadBlock<BlockSize> |
| using | GridwiseGemmPipe |
| using | CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock |
| using | DefaultBlock2CTileMap |
Static Public Member Functions | |
| __host__ static __device__ constexpr auto | MakeABlockDescriptor () |
| __host__ static __device__ constexpr auto | MakeB0BlockDescriptor () |
| __host__ static __device__ constexpr auto | MakeB1BlockDescriptor () |
| __host__ static __device__ constexpr auto | MakeABlockSliceCopyStep () |
| __host__ static __device__ constexpr auto | MakeB0BlockSliceCopyStep () |
| __host__ static __device__ constexpr auto | MakeB1BlockSliceCopyStep () |
| template<typename ABlockDesc_> | |
| __host__ static __device__ constexpr auto | MakeAWaveDescriptor (const ABlockDesc_ &) |
| template<typename B0BlockDesc_> | |
| __host__ static __device__ constexpr auto | MakeB0WaveDescriptor (const B0BlockDesc_ &) |
| template<typename A1BlockDesc_AL0_M_AL1> | |
| __host__ static __device__ constexpr auto | MakeA1WaveDescriptor_L0_M0_M1_M2_L1 (const A1BlockDesc_AL0_M_AL1 &) |
| template<typename B1BlockDesc_> | |
| __host__ static __device__ constexpr auto | MakeB1WaveDescriptor (const B1BlockDesc_ &) |
| __host__ static __device__ constexpr auto | GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat () |
| __host__ static __device__ constexpr index_t | GetSharedMemoryNumberOfByte () |
| template<typename Block2CTileMap> | |
| __host__ static __device__ constexpr bool | CheckValidity (const AGridDesc &a_grid_desc, const B0GridDesc &b0_grid_desc, const B1GridDesc &b1_grid_desc, const CGridDesc_M_N &c_grid_desc_m_n, const Block2CTileMap &block_2_ctile_map) |
| __host__ static __device__ constexpr bool | CalculateHasMainKBlockLoop (index_t K) |
| __host__ static __device__ constexpr auto | MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock (const CGridDesc_M_N &c_grid_desc_m_n) |
| __host__ static __device__ constexpr auto | MakeDefaultBlock2CTileMap (const CGridDesc_M_N &c_grid_desc_m_n, index_t, index_t) |
| template<bool HasMainKBlockLoop, typename C0MatrixMask, typename Block2CTileMap = DefaultBlock2CTileMap> | |
| static __device__ void | Run (const ADataType *__restrict__ p_a_grid, const B0DataType *__restrict__ p_b0_grid, const B1DataType *__restrict__ p_b1_grid, CDataType *__restrict__ p_c_grid, void *__restrict__ p_shared, const AGridDesc &a_grid_desc, const B0GridDesc &b0_grid_desc, const B1GridDesc &b1_grid_desc, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock &c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation &a_element_op, const B0ElementwiseOperation &b0_element_op, const AccElementwiseOperation &acc_element_op, const B1ElementwiseOperation &b1_element_op, const CElementwiseOperation &c_element_op, const C0MatrixMask &c0_matrix_mask, const Block2CTileMap &block_2_ctile_map) |
Static Public Attributes | |
| static constexpr auto | I0 = Number<0>{} |
| static constexpr auto | I1 = Number<1>{} |
| static constexpr auto | I2 = Number<2>{} |
| static constexpr auto | I3 = Number<3>{} |
| static constexpr auto | I4 = Number<4>{} |
| static constexpr auto | I5 = Number<5>{} |
| static constexpr auto | I6 = Number<6>{} |
| static constexpr auto | I7 = Number<7>{} |
| static constexpr auto | AK1 = Number<AK1Value>{} |
| static constexpr auto | BK0 = Number<KPerBlock / BK1Value>{} |
| static constexpr auto | BK1 = Number<BK1Value>{} |
| static constexpr auto | L0PerBlock = LTilePerBlock / L1Value |
| static constexpr auto | AL0 = Number<L0PerBlock / 2>{} |
| static constexpr auto | AL1 = Number<L1Value>{} |
| static constexpr auto | BL0 = Number<L0PerBlock>{} |
| static constexpr auto | BL1 = Number<L1Value>{} |
| static constexpr auto | MWaves = MPerBlock / (MRepeat * MPerWmma) |
| static constexpr auto | LWaves = LPerBlock / (LRepeat * LPerWmma) |
| static constexpr auto | NWaves = NPerBlock / (NRepeat * NPerWmma) |
| static constexpr auto | WmmaK = 16 |
| static constexpr auto | WmmaL = 16 |
Member Typedef Documentation
◆ CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
| using ck::GridwiseBatchedGemmSoftmaxGemm_Wmma< ADataType, B0DataType, Acc0DataType, B1DataType, Acc1DataType, CShuffleDataType, CDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc, B0GridDesc, B1GridDesc, CGridDesc_M_N, MPerBlock, LPerBlock, KPerBlock, AK1Value, BK1Value, NPerBlock, LTilePerBlock, L1Value, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, AEnableLds, ABlockLdsExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0ThreadTransferSrcResetCoordinateAfterRun, B0EnableLds, B0BlockLdsExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1ThreadTransferSrcResetCoordinateAfterRun, B1EnableLds, B1BlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, PadN, MaskOutUpperTriangle, NumGemmKPrefetchStage, LoopSched, PipelineVer >::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock |
Initial value:
remove_cvref_t<decltype(MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
CGridDesc_M_N{}))>
__host__ static __device__ constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N &c_grid_desc_m_n)
Definition gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp:653
◆ DefaultBlock2CTileMap
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
| using ck::GridwiseBatchedGemmSoftmaxGemm_Wmma< ADataType, B0DataType, Acc0DataType, B1DataType, Acc1DataType, CShuffleDataType, CDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc, B0GridDesc, B1GridDesc, CGridDesc_M_N, MPerBlock, LPerBlock, KPerBlock, AK1Value, BK1Value, NPerBlock, LTilePerBlock, L1Value, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, AEnableLds, ABlockLdsExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0ThreadTransferSrcResetCoordinateAfterRun, B0EnableLds, B0BlockLdsExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1ThreadTransferSrcResetCoordinateAfterRun, B1EnableLds, B1BlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, PadN, MaskOutUpperTriangle, NumGemmKPrefetchStage, LoopSched, PipelineVer >::DefaultBlock2CTileMap |
Initial value:
remove_cvref_t<decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1))>
__host__ static __device__ constexpr auto MakeDefaultBlock2CTileMap(const CGridDesc_M_N &c_grid_desc_m_n, index_t, index_t)
Definition gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp:672
◆ GridwiseGemmPipe
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
| using ck::GridwiseBatchedGemmSoftmaxGemm_Wmma< ADataType, B0DataType, Acc0DataType, B1DataType, Acc1DataType, CShuffleDataType, CDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc, B0GridDesc, B1GridDesc, CGridDesc_M_N, MPerBlock, LPerBlock, KPerBlock, AK1Value, BK1Value, NPerBlock, LTilePerBlock, L1Value, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, AEnableLds, ABlockLdsExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0ThreadTransferSrcResetCoordinateAfterRun, B0EnableLds, B0BlockLdsExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1ThreadTransferSrcResetCoordinateAfterRun, B1EnableLds, B1BlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, PadN, MaskOutUpperTriangle, NumGemmKPrefetchStage, LoopSched, PipelineVer >::GridwiseGemmPipe |
Initial value:
remove_cvref_t<decltype(GridwiseGemmPipeline_Selector<PipelineVer,
NumGemmKPrefetchStage,
LoopSched,
AEnableLds,
B0EnableLds>())>
constexpr auto GridwiseGemmPipeline_Selector()
Definition gridwise_gemm_pipeline_selector.hpp:31
◆ ThisThreadBlock
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
| using ck::GridwiseBatchedGemmSoftmaxGemm_Wmma< ADataType, B0DataType, Acc0DataType, B1DataType, Acc1DataType, CShuffleDataType, CDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc, B0GridDesc, B1GridDesc, CGridDesc_M_N, MPerBlock, LPerBlock, KPerBlock, AK1Value, BK1Value, NPerBlock, LTilePerBlock, L1Value, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, AEnableLds, ABlockLdsExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0ThreadTransferSrcResetCoordinateAfterRun, B0EnableLds, B0BlockLdsExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1ThreadTransferSrcResetCoordinateAfterRun, B1EnableLds, B1BlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, PadN, MaskOutUpperTriangle, NumGemmKPrefetchStage, LoopSched, PipelineVer >::ThisThreadBlock = ThisThreadBlock<BlockSize> |
Member Function Documentation
◆ CalculateHasMainKBlockLoop()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
inline static constexpr |
◆ CheckValidity()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
template<typename Block2CTileMap>
|
inline static constexpr |
◆ GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
inline static constexpr |
◆ GetSharedMemoryNumberOfByte()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
inline static constexpr |
◆ MakeA1WaveDescriptor_L0_M0_M1_M2_L1()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
template<typename A1BlockDesc_AL0_M_AL1>
|
inline static constexpr |
◆ MakeABlockDescriptor()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
inline static constexpr |
◆ MakeABlockSliceCopyStep()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
inline static constexpr |
◆ MakeAWaveDescriptor()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
template<typename ABlockDesc_>
|
inline static constexpr |
◆ MakeB0BlockDescriptor()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
inline static constexpr |
◆ MakeB0BlockSliceCopyStep()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
inline static constexpr |
◆ MakeB0WaveDescriptor()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
template<typename B0BlockDesc_>
|
inline static constexpr |
◆ MakeB1BlockDescriptor()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
inline static constexpr |
◆ MakeB1BlockSliceCopyStep()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
inline static constexpr |
◆ MakeB1WaveDescriptor()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
template<typename B1BlockDesc_>
|
inline static constexpr |
◆ MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
inline static constexpr |
◆ MakeDefaultBlock2CTileMap()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
inline static constexpr |
◆ Run()
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
template<bool HasMainKBlockLoop, typename C0MatrixMask, typename Block2CTileMap = DefaultBlock2CTileMap>
|
inline static |
Member Data Documentation
◆ AK1
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ AL0
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ AL1
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ BK0
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ BK1
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ BL0
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ BL1
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ I0
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ I1
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ I2
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ I3
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ I4
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ I5
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ I6
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ I7
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ L0PerBlock
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ LWaves
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ MWaves
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ NWaves
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ WmmaK
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ WmmaL
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
The documentation for this struct was generated from the following file: