GridwiseGemmDl_km_kn_mn_v1r3< BlockSize, FloatAB, FloatAcc, FloatC, CGlobalMemoryDataOperation, AGridDesc_K0_M_K1, BGridDesc_K0_N_K1, CGridDesc_M_N, MPerBlock, NPerBlock, K0PerBlock, K1Value, M1PerThreadM111, N1PerThreadN111, KPerThread, M11N11ThreadClusterM110Xs, M11N11ThreadClusterN110Xs, ABlockTransferThreadSliceLengths_K0_M0_M1_K1, ABlockTransferThreadClusterLengths_K0_M0_M1_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, ABlockTransferSrcVectorTensorContiguousDimOrder, ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, BBlockTransferThreadSliceLengths_K0_N0_N1_K1, BBlockTransferThreadClusterLengths_K0_N0_N1_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, BBlockTransferSrcVectorTensorContiguousDimOrder, BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector > Struct Template Reference
ck::GridwiseGemmDl_km_kn_mn_v1r3< BlockSize, FloatAB, FloatAcc, FloatC, CGlobalMemoryDataOperation, AGridDesc_K0_M_K1, BGridDesc_K0_N_K1, CGridDesc_M_N, MPerBlock, NPerBlock, K0PerBlock, K1Value, M1PerThreadM111, N1PerThreadN111, KPerThread, M11N11ThreadClusterM110Xs, M11N11ThreadClusterN110Xs, ABlockTransferThreadSliceLengths_K0_M0_M1_K1, ABlockTransferThreadClusterLengths_K0_M0_M1_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, ABlockTransferSrcVectorTensorContiguousDimOrder, ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, BBlockTransferThreadSliceLengths_K0_N0_N1_K1, BBlockTransferThreadClusterLengths_K0_N0_N1_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, BBlockTransferSrcVectorTensorContiguousDimOrder, BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector > Struct Template Reference
#include <gridwise_gemm_dl_v1r3.hpp>
Public Types | |
| using | AGridDesc_K0_M0_M1_K1 = decltype(MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_K0_M_K1{})) |
| using | BGridDesc_K0_N0_N1_K1 = decltype(MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_K0_N_K1{})) |
| using | CGridDesc_M0_M10_M11_N0_N10_N11 |
| using | Block2CTileMap = decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{})) |
Static Public Member Functions | |
| __host__ static __device__ constexpr index_t | GetSharedMemoryNumberOfByte () |
| __host__ static __device__ constexpr bool | CheckValidity (const AGridDesc_K0_M_K1 &a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 &b_grid_desc_k0_n_k1, const CGridDesc_M_N &c_grid_desc_m_n) |
| __host__ static __device__ constexpr index_t | CalculateGridSize (index_t M, index_t N) |
| __host__ static __device__ constexpr bool | CalculateHasMainKBlockLoop (index_t K0) |
| __host__ static __device__ constexpr bool | CalculateHasDoubleTailKBlockLoop (index_t K0) |
| __host__ static __device__ constexpr auto | MakeAGridDescriptor_K0_M0_M1_K1 (const AGridDesc_K0_M_K1 &a_grid_desc_k0_m_k1) |
| __host__ static __device__ constexpr auto | MakeBGridDescriptor_K0_N0_N1_K1 (const BGridDesc_K0_N_K1 &b_grid_desc_k0_n_k1) |
| __host__ static __device__ constexpr auto | MakeCGridDescriptor_M0_M10_M11_N0_N10_N11 (const CGridDesc_M_N &c_grid_desc_m_n) |
| __host__ static __device__ constexpr auto | MakeDefaultBlock2CTileMap (const CGridDesc_M_N &c_grid_desc_m_n) |
| template<bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop> | |
| static __device__ void | Run (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, FloatAB *__restrict__ p_shared_block, const AGridDesc_K0_M0_M1_K1 &a_grid_desc_k0_m0_m1_k1, const BGridDesc_K0_N0_N1_K1 &b_grid_desc_k0_n0_n1_k1, const CGridDesc_M0_M10_M11_N0_N10_N11 &c_grid_desc_m0_m10_m11_n0_n10_n11, const Block2CTileMap &block_2_ctile_map, integral_constant< bool, HasMainKBlockLoop >, integral_constant< bool, HasDoubleTailKBlockLoop >) |
Static Public Attributes | |
| static constexpr auto | I0 = Number<0>{} |
| static constexpr auto | I1 = Number<1>{} |
| static constexpr auto | I2 = Number<2>{} |
| static constexpr auto | I3 = Number<3>{} |
| static constexpr auto | K1 = Number<K1Value>{} |
Member Typedef Documentation
◆ AGridDesc_K0_M0_M1_K1
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
| using ck::GridwiseGemmDl_km_kn_mn_v1r3< BlockSize, FloatAB, FloatAcc, FloatC, CGlobalMemoryDataOperation, AGridDesc_K0_M_K1, BGridDesc_K0_N_K1, CGridDesc_M_N, MPerBlock, NPerBlock, K0PerBlock, K1Value, M1PerThreadM111, N1PerThreadN111, KPerThread, M11N11ThreadClusterM110Xs, M11N11ThreadClusterN110Xs, ABlockTransferThreadSliceLengths_K0_M0_M1_K1, ABlockTransferThreadClusterLengths_K0_M0_M1_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, ABlockTransferSrcVectorTensorContiguousDimOrder, ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, BBlockTransferThreadSliceLengths_K0_N0_N1_K1, BBlockTransferThreadClusterLengths_K0_N0_N1_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, BBlockTransferSrcVectorTensorContiguousDimOrder, BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector >::AGridDesc_K0_M0_M1_K1 = decltype(MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_K0_M_K1{})) |
◆ BGridDesc_K0_N0_N1_K1
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
| using ck::GridwiseGemmDl_km_kn_mn_v1r3< BlockSize, FloatAB, FloatAcc, FloatC, CGlobalMemoryDataOperation, AGridDesc_K0_M_K1, BGridDesc_K0_N_K1, CGridDesc_M_N, MPerBlock, NPerBlock, K0PerBlock, K1Value, M1PerThreadM111, N1PerThreadN111, KPerThread, M11N11ThreadClusterM110Xs, M11N11ThreadClusterN110Xs, ABlockTransferThreadSliceLengths_K0_M0_M1_K1, ABlockTransferThreadClusterLengths_K0_M0_M1_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, ABlockTransferSrcVectorTensorContiguousDimOrder, ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, BBlockTransferThreadSliceLengths_K0_N0_N1_K1, BBlockTransferThreadClusterLengths_K0_N0_N1_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, BBlockTransferSrcVectorTensorContiguousDimOrder, BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector >::BGridDesc_K0_N0_N1_K1 = decltype(MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_K0_N_K1{})) |
◆ Block2CTileMap
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
| using ck::GridwiseGemmDl_km_kn_mn_v1r3< BlockSize, FloatAB, FloatAcc, FloatC, CGlobalMemoryDataOperation, AGridDesc_K0_M_K1, BGridDesc_K0_N_K1, CGridDesc_M_N, MPerBlock, NPerBlock, K0PerBlock, K1Value, M1PerThreadM111, N1PerThreadN111, KPerThread, M11N11ThreadClusterM110Xs, M11N11ThreadClusterN110Xs, ABlockTransferThreadSliceLengths_K0_M0_M1_K1, ABlockTransferThreadClusterLengths_K0_M0_M1_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, ABlockTransferSrcVectorTensorContiguousDimOrder, ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, BBlockTransferThreadSliceLengths_K0_N0_N1_K1, BBlockTransferThreadClusterLengths_K0_N0_N1_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, BBlockTransferSrcVectorTensorContiguousDimOrder, BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector >::Block2CTileMap = decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{})) |
◆ CGridDesc_M0_M10_M11_N0_N10_N11
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
| using ck::GridwiseGemmDl_km_kn_mn_v1r3< BlockSize, FloatAB, FloatAcc, FloatC, CGlobalMemoryDataOperation, AGridDesc_K0_M_K1, BGridDesc_K0_N_K1, CGridDesc_M_N, MPerBlock, NPerBlock, K0PerBlock, K1Value, M1PerThreadM111, N1PerThreadN111, KPerThread, M11N11ThreadClusterM110Xs, M11N11ThreadClusterN110Xs, ABlockTransferThreadSliceLengths_K0_M0_M1_K1, ABlockTransferThreadClusterLengths_K0_M0_M1_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, ABlockTransferSrcVectorTensorContiguousDimOrder, ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, BBlockTransferThreadSliceLengths_K0_N0_N1_K1, BBlockTransferThreadClusterLengths_K0_N0_N1_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, BBlockTransferSrcVectorTensorContiguousDimOrder, BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector >::CGridDesc_M0_M10_M11_N0_N10_N11 |
Initial value:
decltype(MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(CGridDesc_M_N{}))
__host__ static __device__ constexpr auto MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(const CGridDesc_M_N &c_grid_desc_m_n)
Definition gridwise_gemm_dl_v1r3.hpp:208
Member Function Documentation
◆ CalculateGridSize()
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
| __host__ static __device__ constexpr index_t CalculateGridSize (index_t M, index_t N)
inline static constexpr |
◆ CalculateHasDoubleTailKBlockLoop()
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
| __host__ static __device__ constexpr bool CalculateHasDoubleTailKBlockLoop (index_t K0)
inline static constexpr |
◆ CalculateHasMainKBlockLoop()
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
| __host__ static __device__ constexpr bool CalculateHasMainKBlockLoop (index_t K0)
inline static constexpr |
◆ CheckValidity()
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
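| __host__ static __device__ constexpr bool CheckValidity (const AGridDesc_K0_M_K1 &a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 &b_grid_desc_k0_n_k1, const CGridDesc_M_N &c_grid_desc_m_n)
inline static constexpr |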
◆ GetSharedMemoryNumberOfByte()
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
| __host__ static __device__ constexpr index_t GetSharedMemoryNumberOfByte ()
inline static constexpr |
◆ MakeAGridDescriptor_K0_M0_M1_K1()
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
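| __host__ static __device__ constexpr auto MakeAGridDescriptor_K0_M0_M1_K1 (const AGridDesc_K0_M_K1 &a_grid_desc_k0_m_k1)
inline static constexpr |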
◆ MakeBGridDescriptor_K0_N0_N1_K1()
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
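| __host__ static __device__ constexpr auto MakeBGridDescriptor_K0_N0_N1_K1 (const BGridDesc_K0_N_K1 &b_grid_desc_k0_n_k1)
inline static constexpr |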
◆ MakeCGridDescriptor_M0_M10_M11_N0_N10_N11()
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
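| __host__ static __device__ constexpr auto MakeCGridDescriptor_M0_M10_M11_N0_N10_N11 (const CGridDesc_M_N &c_grid_desc_m_n)
inline static constexpr |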
◆ MakeDefaultBlock2CTileMap()
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
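| __host__ static __device__ constexpr auto MakeDefaultBlock2CTileMap (const CGridDesc_M_N &c_grid_desc_m_n)
inline static constexpr |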
◆ Run()
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
template<bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
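| static __device__ void Run (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, FloatAB *__restrict__ p_shared_block, const AGridDesc_K0_M0_M1_K1 &a_grid_desc_k0_m0_m1_k1, const BGridDesc_K0_N0_N1_K1 &b_grid_desc_k0_n0_n1_k1, const CGridDesc_M0_M10_M11_N0_N10_N11 &c_grid_desc_m0_m10_m11_n0_n10_n11, const Block2CTileMap &block_2_ctile_map, integral_constant< bool, HasMainKBlockLoop >, integral_constant< bool, HasDoubleTailKBlockLoop >)
inline static |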
Member Data Documentation
◆ I0
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
| static constexpr auto I0 = Number<0>{}
static constexpr |
◆ I1
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
| static constexpr auto I1 = Number<1>{}
static constexpr |
◆ I2
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
| static constexpr auto I2 = Number<2>{}
static constexpr |
◆ I3
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
| static constexpr auto I3 = Number<3>{}
static constexpr |
◆ K1
template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1Value, index_t M1PerThreadM111, index_t N1PerThreadN111, index_t KPerThread, typename M11N11ThreadClusterM110Xs, typename M11N11ThreadClusterN110Xs, typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, typename ABlockTransferSrcVectorTensorContiguousDimOrder, typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, typename BBlockTransferSrcVectorTensorContiguousDimOrder, typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, typename CThreadTransferSrcDstAccessOrder, index_t CThreadTransferSrcDstVectorDim, index_t CThreadTransferDstScalarPerVector>
| static constexpr auto K1 = Number<K1Value>{}
static constexpr |
The documentation for this struct was generated from the following file: