#include <blockwise_gemm_dlops_v2r2.hpp>
|
| __device__ | BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2 () |
| template<typename CM0M1N0N1ThreadDesc, typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer> |
| __device__ void | Run (const CM0M1N0N1ThreadDesc &, const ABlockBuffer &a_block_buf, const BBlockBuffer &b_block_buf, CThreadBuffer &c_thread_buf) const |
◆ AIndex
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| using ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::AIndex = MultiIndex<3> |
◆ BIndex
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| using ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::BIndex = MultiIndex<3> |
◆ CIndex
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| using ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::CIndex = MultiIndex<4> |
◆ BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| __device__ ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2 |
( |
| ) |
|
|
inline |
◆ CalculateCM0M1N0N1ThreadOriginOnBlock()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| __device__ CIndex ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::CalculateCM0M1N0N1ThreadOriginOnBlock |
( |
index_t | thread_id | ) |
|
|
inline static |
◆ GetABlockAlignment()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| __host__ static __device__ constexpr index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::GetABlockAlignment |
( |
| ) |
|
|
inline static constexpr |
◆ GetBBlockAlignment()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| __host__ static __device__ constexpr auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::GetBBlockAlignment |
( |
| ) |
|
|
inline static constexpr |
◆ GetCM0M1N0N1ThreadTensorLengths()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| __host__ static __device__ constexpr auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::GetCM0M1N0N1ThreadTensorLengths |
( |
| ) |
|
|
inline static constexpr |
◆ MakeAKM0M1BlockDescriptor()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| __host__ static __device__ constexpr auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::MakeAKM0M1BlockDescriptor |
( |
const AKMBlockDesc & | | ) |
|
|
inline static constexpr |
◆ MakeBKN0N1BlockDescriptor()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| __host__ static __device__ constexpr auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::MakeBKN0N1BlockDescriptor |
( |
const BKNBlockDesc & | | ) |
|
|
inline static constexpr |
◆ MakeCM0M100M101M11N0N100N101N11ToM0M1N0N1BlockAdaptor()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| __host__ static __device__ constexpr auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::MakeCM0M100M101M11N0N100N101N11ToM0M1N0N1BlockAdaptor |
( |
| ) |
|
|
inline static constexpr |
◆ MakeCM0M100M101M11N0N100N101N11ToMNBlockAdaptor()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| __host__ static __device__ constexpr auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::MakeCM0M100M101M11N0N100N101N11ToMNBlockAdaptor |
( |
| ) |
|
|
inline static constexpr |
◆ Run()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
template<typename CM0M1N0N1ThreadDesc, typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
| __device__ void ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::Run |
( |
const CM0M1N0N1ThreadDesc & | , |
|
|
const ABlockBuffer & | a_block_buf, |
|
|
const BBlockBuffer & | b_block_buf, |
|
|
CThreadBuffer & | c_thread_buf ) const |
|
inline |
◆ a_k_m0_m1_block_desc_
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::a_k_m0_m1_block_desc_ = MakeAKM0M1BlockDescriptor(AKMBlockDesc{}) |
|
static constexpr |
◆ b_k_n0_n1_block_desc_
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::b_k_n0_n1_block_desc_ = MakeBKN0N1BlockDescriptor(BKNBlockDesc{}) |
|
static constexpr |
◆ I0
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::I0 = Number<0>{} |
|
static constexpr |
◆ I1
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::I1 = Number<1>{} |
|
static constexpr |
◆ I2
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::I2 = Number<2>{} |
|
static constexpr |
◆ I3
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::I3 = Number<3>{} |
|
static constexpr |
◆ K
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::K = AKMBlockDesc{}.GetLength(I0) |
|
static constexpr |
◆ M
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::M = AKMBlockDesc{}.GetLength(I1) |
|
static constexpr |
◆ M0
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::M0 = M / M1 |
|
static constexpr |
◆ M1
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::M1 = M1N1ThreadClusterM100 * M1N1ThreadClusterM101 * M1PerThreadM11 |
|
static constexpr |
◆ M100
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::M100 = M1N1ThreadClusterM100 |
|
static constexpr |
◆ M101
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::M101 = M1N1ThreadClusterM101 |
|
static constexpr |
◆ M11
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::M11 = M1PerThreadM11 |
|
static constexpr |
◆ N
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::N = BKNBlockDesc{}.GetLength(I1) |
|
static constexpr |
◆ N0
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::N0 = N / N1 |
|
static constexpr |
◆ N1
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::N1 = M1N1ThreadClusterN100 * M1N1ThreadClusterN101 * N1PerThreadN11 |
|
static constexpr |
◆ N100
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::N100 = M1N1ThreadClusterN100 |
|
static constexpr |
◆ N101
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::N101 = M1N1ThreadClusterN101 |
|
static constexpr |
◆ N11
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename AKMBlockDesc, typename BKNBlockDesc,
index_t M1PerThreadM11,
index_t N1PerThreadN11,
index_t KPerThread,
index_t M1N1ThreadClusterM100,
index_t M1N1ThreadClusterN100,
index_t M1N1ThreadClusterM101,
index_t M1N1ThreadClusterN101,
index_t AThreadCopyScalarPerVector_M11,
index_t BThreadCopyScalarPerVector_N11, typename
enable_if< AKMBlockDesc::IsKnownAtCompileTime() &&BKNBlockDesc::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2< BlockSize, FloatA, FloatB, FloatC, AKMBlockDesc, BKNBlockDesc, M1PerThreadM11, N1PerThreadN11, KPerThread, M1N1ThreadClusterM100, M1N1ThreadClusterN100, M1N1ThreadClusterM101, M1N1ThreadClusterN101, AThreadCopyScalarPerVector_M11, BThreadCopyScalarPerVector_N11, type >::N11 = N1PerThreadN11 |
|
static constexpr |
The documentation for this struct was generated from the following file: