BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type > Struct Template Reference

BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type > Struct Template Reference

Composable Kernel: ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type > Struct Template Reference
ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type > Struct Template Reference

#include <blockwise_gemm_dlops_v2r2.hpp>

Public Types

using AIndex = MultiIndex<3>
using BIndex = MultiIndex<3>
using CIndex = MultiIndex<4>

Public Member Functions

__device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2 ()
template<typename CM0M1N0N1ThreadDesc, typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
__device__ void Run (const CM0M1N0N1ThreadDesc &, const ABlockBuffer &a_block_buf, const BBlockBuffer &b_block_buf, CThreadBuffer &c_thread_buf) const

Static Public Member Functions

__host__ static __device__ constexpr auto MakeAKM0M1BlockDescriptor (const AKMBlockDesc &)
__host__ static __device__ constexpr auto MakeBKN0N1BlockDescriptor (const BKNBlockDesc &)
__host__ static __device__ constexpr auto MakeCM0M100M101M11N0N100N101N11ToMNBlockAdaptor ()
__host__ static __device__ constexpr auto MakeCM0M100M101M11N0N100N101N11ToM0M1N0N1BlockAdaptor ()
__host__ static __device__ constexpr auto GetCM0M1N0N1ThreadTensorLengths ()
static __device__ CIndex CalculateCM0M1N0N1ThreadOriginOnBlock (index_t thread_id)
__host__ static __device__ constexpr index_t GetABlockAlignment ()
__host__ static __device__ constexpr auto GetBBlockAlignment ()

Static Public Attributes

static constexpr auto I0 = Number<0>{}
static constexpr auto I1 = Number<1>{}
static constexpr auto I2 = Number<2>{}
static constexpr auto I3 = Number<3>{}
static constexpr index_t K = AKMBlockDesc{}.GetLength(I0)
static constexpr index_t M = AKMBlockDesc{}.GetLength(I1)
static constexpr index_t N = BKNBlockDesc{}.GetLength(I1)
static constexpr index_t M100 = M1N1ThreadClusterM100
static constexpr index_t N100 = M1N1ThreadClusterN100
static constexpr index_t M101 = M1N1ThreadClusterM101
static constexpr index_t N101 = M1N1ThreadClusterN101
static constexpr index_t M11 = M1PerThreadM11
static constexpr index_t N11 = N1PerThreadN11
static constexpr index_t M1 = M1N1ThreadClusterM100 * M1N1ThreadClusterM101 * M1PerThreadM11
static constexpr index_t N1 = M1N1ThreadClusterN100 * M1N1ThreadClusterN101 * N1PerThreadN11
static constexpr index_t M0 = M / M1
static constexpr index_t N0 = N / N1
static constexpr auto a_k_m0_m1_block_desc_ = MakeAKM0M1BlockDescriptor(AKMBlockDesc{})
static constexpr auto b_k_n0_n1_block_desc_ = MakeBKN0N1BlockDescriptor(BKNBlockDesc{})

Member Typedef Documentation

◆ AIndex

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
using ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::AIndex = MultiIndex<3>

◆ BIndex

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
using ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::BIndex = MultiIndex<3>

◆ CIndex

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
using ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::CIndex = MultiIndex<4>

Constructor & Destructor Documentation

◆ BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2()

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
__device__ ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2 ( )
inline

Member Function Documentation

◆ CalculateCM0M1N0N1ThreadOriginOnBlock()

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
__device__ CIndex ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::CalculateCM0M1N0N1ThreadOriginOnBlock ( index_t thread_id)
inline static

◆ GetABlockAlignment()

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
__host__ static __device__ constexpr index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::GetABlockAlignment ( )
inline static constexpr

◆ GetBBlockAlignment()

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
__host__ static __device__ constexpr auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::GetBBlockAlignment ( )
inline static constexpr

◆ GetCM0M1N0N1ThreadTensorLengths()

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
__host__ static __device__ constexpr auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::GetCM0M1N0N1ThreadTensorLengths ( )
inline static constexpr

◆ MakeAKM0M1BlockDescriptor()

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
__host__ static __device__ constexpr auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::MakeAKM0M1BlockDescriptor ( const AKMBlockDesc & )
inline static constexpr

◆ MakeBKN0N1BlockDescriptor()

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
__host__ static __device__ constexpr auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::MakeBKN0N1BlockDescriptor ( const BKNBlockDesc & )
inline static constexpr

◆ MakeCM0M100M101M11N0N100N101N11ToM0M1N0N1BlockAdaptor()

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
__host__ static __device__ constexpr auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::MakeCM0M100M101M11N0N100N101N11ToM0M1N0N1BlockAdaptor ( )
inline static constexpr

◆ MakeCM0M100M101M11N0N100N101N11ToMNBlockAdaptor()

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
__host__ static __device__ constexpr auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::MakeCM0M100M101M11N0N100N101N11ToMNBlockAdaptor ( )
inline static constexpr

◆ Run()

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
template<typename CM0M1N0N1ThreadDesc, typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
__device__ void ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::Run ( const CM0M1N0N1ThreadDesc & ,
const ABlockBuffer & a_block_buf,
const BBlockBuffer & b_block_buf,
CThreadBuffer & c_thread_buf ) const
inline

Member Data Documentation

◆ a_k_m0_m1_block_desc_

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::a_k_m0_m1_block_desc_ = MakeAKM0M1BlockDescriptor(AKMBlockDesc{})
static constexpr

◆ b_k_n0_n1_block_desc_

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::b_k_n0_n1_block_desc_ = MakeBKN0N1BlockDescriptor(BKNBlockDesc{})
static constexpr

◆ I0

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::I0 = Number<0>{}
static constexpr

◆ I1

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::I1 = Number<1>{}
static constexpr

◆ I2

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::I2 = Number<2>{}
static constexpr

◆ I3

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::I3 = Number<3>{}
static constexpr

◆ K

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::K = AKMBlockDesc{}.GetLength(I0)
static constexpr

◆ M

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::M = AKMBlockDesc{}.GetLength(I1)
static constexpr

◆ M0

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::M0 = M / M1
static constexpr

◆ M1

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::M1 = M1N1ThreadClusterM100 * M1N1ThreadClusterM101 * M1PerThreadM11
static constexpr

◆ M100

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::M100 = M1N1ThreadClusterM100
static constexpr

◆ M101

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::M101 = M1N1ThreadClusterM101
static constexpr

◆ M11

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::M11 = M1PerThreadM11
static constexpr

◆ N

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::N = BKNBlockDesc{}.GetLength(I1)
static constexpr

◆ N0

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::N0 = N / N1
static constexpr

◆ N1

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::N1 = M1N1ThreadClusterN100 * M1N1ThreadClusterN101 * N1PerThreadN11
static constexpr

◆ N100

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::N100 = M1N1ThreadClusterN100
static constexpr

◆ N101

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::N101 = M1N1ThreadClusterN101
static constexpr

◆ N11

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc, index_t M1PerThreadM11, index_t N1PerThreadN11, index_t KPerThread, index_t M1N1ThreadClusterM100, index_t M1N1ThreadClusterN100, index_t M1N1ThreadClusterM101, index_t M1N1ThreadClusterN101, index_t AThreadCopyScalarPerVector_M11, index_t BThreadCopyScalarPerVector_N11, typename enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::N11 = N1PerThreadN11
static constexpr

The documentation for this struct was generated from the following file: