F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy > Struct Template Reference

F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy > Struct Template Reference

Composable Kernel: ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy > Struct Template Reference
ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy > Struct Template Reference

#include <mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp>

Inheritance diagram for ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >:
ck_tile::FlatmmPipelineAGmemBGmemCRegV1< Problem, F16xMXF4FlatmmPipelineAgBgCrPolicy >

Public Types

using Underlying = FlatmmPipelineAGmemBGmemCRegV1<Problem, PipelinePolicy>
using ADataType = remove_cvref_t<typename Problem::ADataType>
using BDataType = remove_cvref_t<typename Problem::QuantType>
using CDataType = remove_cvref_t<typename Problem::CDataType>
using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>
using ComputeType = ADataType
using ALayout = remove_cvref_t<typename Problem::ALayout>
using BLayout = remove_cvref_t<typename Problem::BLayout>
using CLayout = remove_cvref_t<typename Problem::CLayout>
using BlockFlatmm
using WG = remove_cvref_t<decltype(config.template at<0>())>
using BlockTile = remove_cvref_t<typename BlockGemmShape::BlockTile>
using BlockWarps = remove_cvref_t<typename BlockGemmShape::BlockWarps>
using WarpTile = remove_cvref_t<typename BlockGemmShape::WarpTile>
Public Types inherited from ck_tile::FlatmmPipelineAGmemBGmemCRegV1< Problem, F16xMXF4FlatmmPipelineAgBgCrPolicy >
using ADataType
using BDataType
using CDataType
using BlockGemmShape
using ALayout
using BLayout
using CLayout
using BlockFlatmm
using WG
using BlockTile
using BlockWarps
using WarpTile

Public Member Functions

template<typename ADramBlockWindowTmp, typename AElementFunction, typename BFlatBlockWindowTmp, typename DequantBFlatWindow>
CK_TILE_HOST_DEVICE auto operator() (ADramBlockWindowTmp a_copy_dram_window_, const AElementFunction &a_element_func, const BFlatBlockWindowTmp &b_flat_dram_block_window_tmp, const DequantBFlatWindow &scale_b_flat_window, const index_t num_loop, const index_t k_padded_zeros, void *p_smem_ping, void *p_smem_pong) const
template<typename ADramBlockWindowTmp, typename BFlatBlockWindowTmp, typename DequantBFlatWindow>
CK_TILE_DEVICE auto operator() (const ADramBlockWindowTmp &a_dram_block_window_tmp, const BFlatBlockWindowTmp &b_flat_dram_block_window_tmp, const DequantBFlatWindow &scale_b_flat_window, const index_t num_loop, const index_t k_padded_zeros, void *p_smem_ping, void *p_smem_pong) const
template<typename ADramBlockWindowTmp, typename BFlatBlockWindowTmp, typename DequantBFlatWindow>
CK_TILE_DEVICE auto operator() (const ADramBlockWindowTmp &a_dram_block_window_tmp, const BFlatBlockWindowTmp &b_flat_dram_block_window_tmp, const DequantBFlatWindow &scale_b_flat_window, const index_t num_loop, void *p_smem_ping, void *p_smem_pong) const
Public Member Functions inherited from ck_tile::FlatmmPipelineAGmemBGmemCRegV1< Problem, F16xMXF4FlatmmPipelineAgBgCrPolicy >
CK_TILE_HOST_DEVICE auto operator() (const ADramBlockWindowTmp &a_dram_block_window_tmp, const AElementFunction &a_element_func, const BFlatBlockWindowTmp &b_flat_dram_block_window_tmp, index_t num_loop, void *p_smem_ping, void *p_smem_pong) const

Static Public Member Functions

static constexpr index_t GetVectorSizeA ()
static constexpr index_t GetVectorSizeB ()
static constexpr index_t GetVectorSizeC ()
static CK_TILE_HOST_DEVICE constexpr auto SchedulerPerM (index_t dsread_perM, index_t dswrite_perM, index_t load_perM)
static CK_TILE_HOST_DEVICE constexpr auto HotLoopScheduler ()
static CK_TILE_HOST_DEVICE constexpr auto Last2ndHotLoopScheduler ()
static CK_TILE_HOST_DEVICE constexpr auto LastHotLoopScheduler ()
static CK_TILE_HOST_DEVICE constexpr auto GetADramTileDistribution ()
Static Public Member Functions inherited from ck_tile::FlatmmPipelineAGmemBGmemCRegV1< Problem, F16xMXF4FlatmmPipelineAgBgCrPolicy >
static constexpr index_t GetVectorSizeA ()
static constexpr index_t GetVectorSizeB ()
static constexpr index_t GetVectorSizeC ()
static CK_TILE_HOST const std::string GetName ()
static CK_TILE_HOST_DEVICE constexpr auto TransposeC ()
static CK_TILE_HOST_DEVICE constexpr index_t GetSmemSize ()
static CK_TILE_HOST_DEVICE constexpr auto SchedulerPerM (index_t dsread_perM, index_t dswrite_perM, index_t load_perM)
static CK_TILE_HOST_DEVICE constexpr auto HotLoopScheduler ()
static CK_TILE_HOST_DEVICE constexpr auto Last2ndHotLoopScheduler ()
static CK_TILE_HOST_DEVICE constexpr auto LastHotLoopScheduler ()

Static Public Attributes

static constexpr auto config
static constexpr index_t DsWritePreIssue = 3
static constexpr index_t DsReadPreload = 2
static constexpr index_t BlockSize = Problem::kBlockSize
static constexpr index_t WaveSize = get_warp_size()
static constexpr index_t kMPerBlock = BlockGemmShape::kM
static constexpr index_t kNPerBlock = BlockGemmShape::kN
static constexpr index_t kKPerBlock = BlockGemmShape::kK
static constexpr index_t flatKPerWarp = Problem::flatKPerWarp
static constexpr index_t flatNPerWarp = Problem::flatNPerWarp
static constexpr bool kPadM = Problem::kPadM
static constexpr bool kPadN = Problem::kPadN
static constexpr bool kPadK = Problem::kPadK
static constexpr index_t kLdsAlignmentInBytes = 16
static constexpr index_t NumWaveGroups = Problem::NumWaveGroups
static constexpr bool UsePersistentKernel = Problem::Traits::UsePersistentKernel
static constexpr auto I0 = number<0>()
static constexpr auto I1 = number<1>()
static constexpr auto I2 = number<2>()
static constexpr auto idxM = I0
static constexpr auto idxN = I1
static constexpr auto idxK = I2
static constexpr index_t MWarp = config.template at<1>()
static constexpr index_t NWarp = config.template at<2>()
static constexpr index_t MIterPerWarp = kMPerBlock / (MWarp * WG::kM)
static constexpr index_t NIterPerWarp = kNPerBlock / (NWarp * WG::kN)
static constexpr index_t KIterPerWarp = kKPerBlock / WG::kK
static constexpr index_t KFlatPerBlockPerIter = flatKPerWarp
static constexpr index_t NFlatPerBlockPerIter = flatNPerWarp
static constexpr index_t MPerBlockPerIter = kMPerBlock / MIterPerWarp
static constexpr index_t KPerBlockPerIter = kKPerBlock / KIterPerWarp
static constexpr int MXFP4PackedSize = 2
static constexpr index_t AK1 = Problem::VectorLoadSize / sizeof(ADataType)
static constexpr index_t BK1 = Problem::VectorLoadSize / sizeof(BDataType) * MXFP4PackedSize
static constexpr index_t m_preload
static constexpr int ContinuousKPerThread = Problem::ContinuousKPerThread
static constexpr int ContinuousScaleNPerThread = Problem::ContinuousScaleNPerThread
static constexpr int ContinuousScaleKPerThread = Problem::ContinuousScaleKPerThread
static constexpr int ScaleKFlatPerWarp
static constexpr int XDLK_PerThread
static constexpr int XDL_PerWeightK = 4
static constexpr int XDL_PerScaleK = XDL_PerWeightK * ContinuousScaleKPerThread
static constexpr int XDL_PerScaleN = ContinuousScaleNPerThread
static constexpr int MXFP4KPerWarp = KIterPerWarp / XDL_PerWeightK
static constexpr int ScaleKPerWarp = KIterPerWarp / XDL_PerScaleK
static constexpr int ScaleNPerWarp = NIterPerWarp / XDL_PerScaleN
static constexpr int MXFP4K_PerScaleK = MXFP4KPerWarp / ScaleKPerWarp
static constexpr bool HasHotLoop = Problem::HasHotLoop
static constexpr auto TailNum = Problem::TailNum
static constexpr index_t mfma_per_wg = 1
static constexpr index_t dsread_per_wg
static constexpr index_t dsread_num_perK = dsread_per_wg * MIterPerWarp
static constexpr index_t dswrite_num_perK = dsread_num_perK / (MWarp * NWarp)
static constexpr index_t dswrite_rep = (dswrite_num_perK + MIterPerWarp - 1) / MIterPerWarp
static constexpr index_t Aload_num_perK = dswrite_num_perK
static constexpr index_t Aload_rep = dswrite_rep
static constexpr index_t Bload_num_perK = kNPerBlock * WG::kK / NWarp / BK1 / WaveSize
static constexpr index_t ScaleBload_K1 = ContinuousScaleNPerThread * ContinuousScaleKPerThread
static constexpr index_t ScaleBload_num
static constexpr index_t Bload_total_num
static constexpr index_t KPerScaleLoad = KIterPerWarp / ScaleBload_num
static constexpr index_t HalfMIter = (MIterPerWarp + 1) / 2
static constexpr index_t Bload_rep = (Bload_num_perK + HalfMIter - 1) / HalfMIter
static constexpr index_t mfma_perM_perK = NIterPerWarp * mfma_per_wg
static constexpr index_t dswrite_mIter = (DsWritePreIssue - 1) % MIterPerWarp
static constexpr index_t dswrite_kIter = (DsWritePreIssue - 1) / MIterPerWarp
static constexpr bool DoubleSmemBuffer = false
Static Public Attributes inherited from ck_tile::FlatmmPipelineAGmemBGmemCRegV1< Problem, F16xMXF4FlatmmPipelineAgBgCrPolicy >
static constexpr auto config
static constexpr index_t DsWritePreIssue
static constexpr index_t DsReadPreload
static constexpr index_t BlockSize
static constexpr index_t WaveSize
static constexpr index_t kMPerBlock
static constexpr index_t kNPerBlock
static constexpr index_t kKPerBlock
static constexpr index_t flatKPerWarp
static constexpr index_t flatNPerWarp
static constexpr bool kPadM
static constexpr bool kPadN
static constexpr bool kPadK
static constexpr index_t kLdsAlignmentInBytes
static constexpr index_t NumWaveGroups
static constexpr bool UsePersistentKernel
static constexpr auto I0
static constexpr auto I1
static constexpr auto I2
static constexpr auto idxM
static constexpr auto idxN
static constexpr auto idxK
static constexpr index_t MWarp
static constexpr index_t NWarp
static constexpr index_t MIterPerWarp
static constexpr index_t NIterPerWarp
static constexpr index_t KIterPerWarp
static constexpr index_t KFlatPerBlockPerIter
static constexpr index_t NFlatPerBlockPerIter
static constexpr index_t MPerBlockPerIter
static constexpr index_t KPerBlockPerIter
static constexpr index_t K1
static constexpr index_t m_preload
static constexpr bool HasHotLoop
static constexpr auto TailNum
static constexpr index_t mfma_per_wg
static constexpr index_t dsread_per_wg
static constexpr index_t dsread_num_perK
static constexpr index_t dswrite_num_perK
static constexpr index_t dswrite_rep
static constexpr index_t Aload_num_perK
static constexpr index_t Aload_rep
static constexpr index_t Bload_num_perK
static constexpr index_t HalfMIter
static constexpr index_t Bload_rep
static constexpr index_t mfma_perM_perK
static constexpr index_t dswrite_mIter
static constexpr index_t dswrite_kIter
static constexpr bool DoubleSmemBuffer

Member Typedef Documentation

◆ ADataType

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
using ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::ADataType = remove_cvref_t<typename Problem::ADataType>

◆ ALayout

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
using ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::ALayout = remove_cvref_t<typename Problem::ALayout>

◆ BDataType

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
using ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::BDataType = remove_cvref_t<typename Problem::QuantType>

◆ BLayout

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
using ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::BLayout = remove_cvref_t<typename Problem::BLayout>

◆ BlockFlatmm

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
using ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::BlockFlatmm
Initial value: (expression not preserved in this extract; the entries below are the symbols it references)
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition type_traits.hpp:21

◆ BlockGemmShape

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
using ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>

◆ BlockTile

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
using ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::BlockTile = remove_cvref_t<typename BlockGemmShape::BlockTile>

◆ BlockWarps

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
using ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::BlockWarps = remove_cvref_t<typename BlockGemmShape::BlockWarps>

◆ CDataType

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
using ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::CDataType = remove_cvref_t<typename Problem::CDataType>

◆ CLayout

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
using ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::CLayout = remove_cvref_t<typename Problem::CLayout>

◆ ComputeType

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
using ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::ComputeType = ADataType

◆ Underlying

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
using ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::Underlying = FlatmmPipelineAGmemBGmemCRegV1<Problem, PipelinePolicy>

◆ WarpTile

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
using ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::WarpTile = remove_cvref_t<typename BlockGemmShape::WarpTile>

◆ WG

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
using ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::WG = remove_cvref_t<decltype(config.template at<0>())>

Member Function Documentation

◆ GetADramTileDistribution()

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::GetADramTileDistribution ( )
inline static constexpr

◆ GetVectorSizeA()

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
constexpr index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::GetVectorSizeA ( )
inline static constexpr

◆ GetVectorSizeB()

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
constexpr index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::GetVectorSizeB ( )
inline static constexpr

◆ GetVectorSizeC()

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
constexpr index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::GetVectorSizeC ( )
inline static constexpr

◆ HotLoopScheduler()

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::HotLoopScheduler ( )
inline static constexpr

◆ Last2ndHotLoopScheduler()

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::Last2ndHotLoopScheduler ( )
inline static constexpr

◆ LastHotLoopScheduler()

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::LastHotLoopScheduler ( )
inline static constexpr

◆ operator()() [1/3]

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
template<typename ADramBlockWindowTmp, typename AElementFunction, typename BFlatBlockWindowTmp, typename DequantBFlatWindow>
CK_TILE_HOST_DEVICE auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::operator() ( ADramBlockWindowTmp a_copy_dram_window_,
const AElementFunction & a_element_func,
const BFlatBlockWindowTmp & b_flat_dram_block_window_tmp,
const DequantBFlatWindow & scale_b_flat_window,
const index_t num_loop,
const index_t k_padded_zeros,
void * p_smem_ping,
void * p_smem_pong ) const
inline

◆ operator()() [2/3]

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
template<typename ADramBlockWindowTmp, typename BFlatBlockWindowTmp, typename DequantBFlatWindow>
CK_TILE_DEVICE auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::operator() ( const ADramBlockWindowTmp & a_dram_block_window_tmp,
const BFlatBlockWindowTmp & b_flat_dram_block_window_tmp,
const DequantBFlatWindow & scale_b_flat_window,
const index_t num_loop,
const index_t k_padded_zeros,
void * p_smem_ping,
void * p_smem_pong ) const
inline

◆ operator()() [3/3]

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
template<typename ADramBlockWindowTmp, typename BFlatBlockWindowTmp, typename DequantBFlatWindow>
CK_TILE_DEVICE auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::operator() ( const ADramBlockWindowTmp & a_dram_block_window_tmp,
const BFlatBlockWindowTmp & b_flat_dram_block_window_tmp,
const DequantBFlatWindow & scale_b_flat_window,
const index_t num_loop,
void * p_smem_ping,
void * p_smem_pong ) const
inline

◆ SchedulerPerM()

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::SchedulerPerM ( index_t dsread_perM,
index_t dswrite_perM,
index_t load_perM )
inline static constexpr

Member Data Documentation

◆ AK1

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::AK1 = Problem::VectorLoadSize / sizeof(ADataType)
static constexpr

◆ Aload_num_perK

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::Aload_num_perK = dswrite_num_perK
static constexpr

◆ Aload_rep

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::Aload_rep = dswrite_rep
static constexpr

◆ BK1

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::BK1 = Problem::VectorLoadSize / sizeof(BDataType) * MXFP4PackedSize
static constexpr

◆ Bload_num_perK

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::Bload_num_perK = kNPerBlock * WG::kK / NWarp / BK1 / WaveSize
static constexpr

◆ Bload_rep

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::Bload_rep = (Bload_num_perK + HalfMIter - 1) / HalfMIter
static constexpr

◆ Bload_total_num

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::Bload_total_num
static constexpr
Initial value: (expression not preserved in this extract; the entries below are the symbols it references)
static constexpr index_t KIterPerWarp
Definition mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp:113
static constexpr index_t ScaleBload_num
Definition mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp:170
static constexpr index_t Bload_num_perK
Definition mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp:168

◆ BlockSize

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::BlockSize = Problem::kBlockSize
static constexpr

◆ config

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::config
static constexpr
Initial value:
= BlockFlatmm::BlockPolicy::template GetWarpGemmMWarpNWarp<Problem>()

◆ ContinuousKPerThread

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
int ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::ContinuousKPerThread = Problem::ContinuousKPerThread
static constexpr

◆ ContinuousScaleKPerThread

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
int ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::ContinuousScaleKPerThread = Problem::ContinuousScaleKPerThread
static constexpr

◆ ContinuousScaleNPerThread

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
int ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::ContinuousScaleNPerThread = Problem::ContinuousScaleNPerThread
static constexpr

◆ DoubleSmemBuffer

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
bool ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::DoubleSmemBuffer = false
static constexpr

◆ dsread_num_perK

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::dsread_num_perK = dsread_per_wg * MIterPerWarp
static constexpr

◆ dsread_per_wg

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::dsread_per_wg
static constexpr
Initial value:
= WG::kM * WG::kK * sizeof(ADataType) / WaveSize / Problem::VectorLoadSize
remove_cvref_t< typename Problem::ADataType > ADataType
Definition flatmm_pipeline_agmem_bgmem_creg_v1.hpp:48
static constexpr index_t WaveSize
Definition flatmm_pipeline_agmem_bgmem_creg_v1.hpp:69

◆ DsReadPreload

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::DsReadPreload = 2
static constexpr

◆ dswrite_kIter

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::dswrite_kIter = (DsWritePreIssue - 1) / MIterPerWarp
static constexpr

◆ dswrite_mIter

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::dswrite_mIter = (DsWritePreIssue - 1) % MIterPerWarp
static constexpr

◆ dswrite_num_perK

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::dswrite_num_perK = dsread_num_perK / (MWarp * NWarp)
static constexpr

◆ dswrite_rep

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::dswrite_rep = (dswrite_num_perK + MIterPerWarp - 1) / MIterPerWarp
static constexpr

◆ DsWritePreIssue

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::DsWritePreIssue = 3
static constexpr

◆ flatKPerWarp

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::flatKPerWarp = Problem::flatKPerWarp
static constexpr

◆ flatNPerWarp

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::flatNPerWarp = Problem::flatNPerWarp
static constexpr

◆ HalfMIter

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::HalfMIter = (MIterPerWarp + 1) / 2
static constexpr

◆ HasHotLoop

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
bool ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::HasHotLoop = Problem::HasHotLoop
static constexpr

◆ I0

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::I0 = number<0>()
static constexpr

◆ I1

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::I1 = number<1>()
static constexpr

◆ I2

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::I2 = number<2>()
static constexpr

◆ idxK

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::idxK = I2
static constexpr

◆ idxM

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::idxM = I0
static constexpr

◆ idxN

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::idxN = I1
static constexpr

◆ KFlatPerBlockPerIter

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::KFlatPerBlockPerIter = flatKPerWarp
static constexpr

◆ KIterPerWarp

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::KIterPerWarp = kKPerBlock / WG::kK
static constexpr

◆ kKPerBlock

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::kKPerBlock = BlockGemmShape::kK
static constexpr

◆ kLdsAlignmentInBytes

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::kLdsAlignmentInBytes = 16
static constexpr

◆ kMPerBlock

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::kMPerBlock = BlockGemmShape::kM
static constexpr

◆ kNPerBlock

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::kNPerBlock = BlockGemmShape::kN
static constexpr

◆ kPadK

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
bool ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::kPadK = Problem::kPadK
static constexpr

◆ kPadM

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
bool ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::kPadM = Problem::kPadM
static constexpr

◆ kPadN

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
bool ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::kPadN = Problem::kPadN
static constexpr

◆ KPerBlockPerIter

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::KPerBlockPerIter = kKPerBlock / KIterPerWarp
static constexpr

◆ KPerScaleLoad

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::KPerScaleLoad = KIterPerWarp / ScaleBload_num
static constexpr

◆ m_preload

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::m_preload
static constexpr
Initial value: (expression not preserved in this extract; the entries below are the symbols it references)
static constexpr index_t MIterPerWarp
Definition flatmm_pipeline_agmem_bgmem_creg_v1.hpp:103
static constexpr index_t KIterPerWarp
Definition flatmm_pipeline_agmem_bgmem_creg_v1.hpp:105
static constexpr index_t DsReadPreload
Definition flatmm_pipeline_agmem_bgmem_creg_v1.hpp:66

◆ mfma_per_wg

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::mfma_per_wg = 1
static constexpr

◆ mfma_perM_perK

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::mfma_perM_perK = NIterPerWarp * mfma_per_wg
static constexpr

◆ MIterPerWarp

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::MIterPerWarp = kMPerBlock / (MWarp * WG::kM)
static constexpr

◆ MPerBlockPerIter

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::MPerBlockPerIter = kMPerBlock / MIterPerWarp
static constexpr

◆ MWarp

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::MWarp = config.template at<1>()
static constexpr

◆ MXFP4K_PerScaleK

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
int ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::MXFP4K_PerScaleK = MXFP4KPerWarp / ScaleKPerWarp
static constexpr

◆ MXFP4KPerWarp

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
int ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::MXFP4KPerWarp = KIterPerWarp / XDL_PerWeightK
static constexpr

◆ MXFP4PackedSize

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
int ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::MXFP4PackedSize = 2
static constexpr

◆ NFlatPerBlockPerIter

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::NFlatPerBlockPerIter = flatNPerWarp
static constexpr

◆ NIterPerWarp

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::NIterPerWarp = kNPerBlock / (NWarp * WG::kN)
static constexpr

◆ NumWaveGroups

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::NumWaveGroups = Problem::NumWaveGroups
static constexpr

◆ NWarp

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::NWarp = config.template at<2>()
static constexpr

◆ ScaleBload_K1

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::ScaleBload_K1 = ContinuousScaleNPerThread * ContinuousScaleKPerThread
static constexpr

◆ ScaleBload_num

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::ScaleBload_num
static constexpr
Initial value:
= [initializer expression not captured in this rendering; see mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp]
Symbols linked from the initializer (tooltip text; the list may be incomplete):
static constexpr index_t kNPerBlock
Definition mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp:80
static constexpr index_t WaveSize
Definition mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp:77
static constexpr index_t ScaleBload_K1
Definition mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp:169
static constexpr index_t NWarp
Definition mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp:109
static constexpr index_t kKPerBlock
Definition mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp:81

◆ ScaleKFlatPerWarp

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
int ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::ScaleKFlatPerWarp
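static constexpr
Initial value:
= [initializer expression not captured in this rendering; see mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp]
Symbols linked from the initializer (tooltip text; the list may be incomplete):
CK_TILE_HOST_DEVICE constexpr index_t get_warp_size()
Definition arch.hpp:63
static constexpr int ContinuousScaleNPerThread
Definition mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp:129
static constexpr int ContinuousScaleKPerThread
Definition mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp:130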

◆ ScaleKPerWarp

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
int ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::ScaleKPerWarp = KIterPerWarp / XDL_PerScaleK
static constexpr

◆ ScaleNPerWarp

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
int ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::ScaleNPerWarp = NIterPerWarp / XDL_PerScaleN
static constexpr

◆ TailNum

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
auto ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::TailNum = Problem::TailNum
static constexpr

◆ UsePersistentKernel

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
bool ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::UsePersistentKernel = Problem::Traits::UsePersistentKernel
static constexpr

◆ WaveSize

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
index_t ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::WaveSize = get_warp_size()
static constexpr

◆ XDL_PerScaleK

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
int ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::XDL_PerScaleK = XDL_PerWeightK * ContinuousScaleKPerThread
static constexpr

◆ XDL_PerScaleN

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
int ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::XDL_PerScaleN = ContinuousScaleNPerThread
static constexpr

◆ XDL_PerWeightK

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
int ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::XDL_PerWeightK = 4
static constexpr

◆ XDLK_PerThread

template<typename Problem, typename PipelinePolicy = F16xMXF4FlatmmPipelineAgBgCrPolicy>
int ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1< Problem, PipelinePolicy >::XDLK_PerThread
static constexpr
Initial value:
= WarpTile::at(I2) / (get_warp_size() / WarpTile::at(I1))
Symbols linked from the initializer (tooltip text):
static constexpr auto I1
Definition mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp:99
static constexpr auto I2
Definition mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp:100

The documentation for this struct was generated from the following file: