impl Namespace Reference

impl Namespace Reference

Composable Kernel: ck_tile::impl Namespace Reference
ck_tile::impl Namespace Reference

Classes

struct  buffer_load_trait
struct  buffer_load_trait< 16, T >
struct  buffer_load_trait< 8, T >
struct  buffer_load_trait< 4, T >
struct  buffer_load_trait< 2, T >
struct  buffer_load_trait< 1, T >
struct  smem_load_trait
struct  smem_load_trait< 16, T >
struct  smem_load_trait< 8, T >
struct  smem_load_trait< 4, T >
struct  smem_load_trait< 2, T >
struct  smem_load_trait< 1, T >
struct  __integer_sequence
struct  __integer_sequence< index_t, Ints... >
struct  seq_reverse
struct  seq_reverse< sequence< Ids... >, Ns... >
struct  reverse_slice_sequence_impl
struct  reverse_slice_sequence_impl< sequence< x, xs... >, sequence< m, ms... >, sequence< id, ids... >, SliceSize >
struct  reverse_slice_sequence_impl< sequence< x >, sequence< m >, sequence< id >, SliceSize >
struct  tuple_array_impl
struct  tuple_object
struct  tuple_object< idx, T, true >
struct  tuple_object< idx, T, false >
struct  tuple_base
struct  tuple_base< sequence< I... >, T... >
struct  tuple_array_impl< T, 0 >
struct  tuple_array_impl< T, 1 >
struct  ext_vector
struct  ext_vector< T_, N_, std::enable_if_t<!std::is_class_v< typename native_t< T_ >::type > > >
struct  ext_vector< T_, N_, std::enable_if_t< std::is_class_v< typename native_t< T_ >::type > > >
struct  ext_vector< V_, N_, std::enable_if_t<!std::is_class_v< typename native_t< V_ >::type > > >
struct  ext_vector< V_, N_, std::enable_if_t< std::is_class_v< typename native_t< V_ >::type > > >
struct  is_null_tile_window
struct  is_null_tile_window< null_tile_window< T > >
struct  sweep_tile_impl
struct  sweep_tile_impl< DistributedTensor, UnpacksPerXDim, sequence< I, Is... > >
struct  sweep_tile_impl< DistributedTensor, UnpacksPerXDim, sequence<> >
struct  sweep_tile_impl_0
struct  sweep_tile_impl_0< DistributedTensor, UnpacksPerXDim, sequence< I, Is... > >
struct  default_linear_bottom_dims_impl
struct  default_linear_bottom_dims_impl< address_space_enum::global, len_ >
struct  default_linear_bottom_dims_impl< address_space_enum::lds, len_ >
struct  static_counter_uniq_
struct  is_static_impl
struct  RawIntegerType_
struct  RawIntegerType_< 1 >
struct  RawIntegerType_< 2 >
struct  RawIntegerType_< 4 >
struct  RawIntegerType_< 8 >
struct  MaskName
struct  MaskName< false, false >
struct  MaskName< false, true >
struct  MaskName< true, false >
struct  MaskName< true, true >
struct  SimplifiedMaskName
struct  SimplifiedMaskName< false >
struct  SimplifiedMaskName< true >
struct  SimplifiedRatioMaskName
struct  SimplifiedRatioMaskName< false >
struct  SimplifiedRatioMaskName< true >
struct  WarpGemmDispatcher
struct  WarpGemmDispatcher< float, float, float, 16, 16, 4, false >
struct  WarpGemmDispatcher< float, float, float, 16, 16, 16, false >
struct  WarpGemmDispatcher< float, float, float, 16, 16, 16, true >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 32, 32, 8, false >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 32, 32, 8, true >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, true >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, false, false, WGAttrNumAccessEnum::Double >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, true, false, false, WGAttrNumAccessEnum::Double >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, true >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false, false, false, WGAttrNumAccessEnum::Double >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, true, false, false, WGAttrNumAccessEnum::Double >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 4, 64, 16, false >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 64, 4, 16, false >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, false >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, true >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 32, 32, 8, false, true >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, true >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 32, 32, 8, true, true >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, true, true >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, false, true >
struct  WarpGemmDispatcher< ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false, false, true >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 8, false >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 8, true >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, true >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false, false, false, WGAttrNumAccessEnum::Double >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, true, false, false, WGAttrNumAccessEnum::Double >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, false >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, true >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, false, false, false, WGAttrNumAccessEnum::Double >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, true, false, false, WGAttrNumAccessEnum::Double >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 4, 64, 16, false >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 64, 4, 16, false >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, false >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, true >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 8, false, true >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false, true >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 8, true, true >
struct  WarpGemmDispatcher< ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, true, true >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32, 16, false >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32, 32, false >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16, 32, false >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16, 64, false >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32, 16, true >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16, 32, true >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32, 16, false >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32, 16, true >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::bf8_t, float, 16, 16, 32, false >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32, 32, false >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32, 16, false >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32, 16, true >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32, 16, false >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32, 32, false >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16, 32, false >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16, 32, true >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16, 64, false >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32, 16, true >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16, 128, false >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::bf8_t, float, 16, 16, 128, false >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::fp8_t, float, 16, 16, 128, false >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16, 128, false >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16, 128, true >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::bf8_t, float, 16, 16, 128, true >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::fp8_t, float, 16, 16, 128, true >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16, 128, true >
struct  WarpGemmDispatcher< ck_tile::pk_fp4_t, ck_tile::pk_fp4_t, float, 16, 16, 128, false >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32, 64, false >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32, 64, false >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32, 64, false >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32, 64, false >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32, 64, false, false, false, WGAttrNumAccessEnum::Quad >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32, 64, false, false, false, WGAttrNumAccessEnum::Quad >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32, 64, false, false, false, WGAttrNumAccessEnum::Quad >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32, 64, false, false, false, WGAttrNumAccessEnum::Quad >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16, 128, false, false, false, WGAttrNumAccessEnum::Quad >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::bf8_t, float, 16, 16, 128, false, false, false, WGAttrNumAccessEnum::Quad >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::fp8_t, float, 16, 16, 128, false, false, false, WGAttrNumAccessEnum::Quad >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16, 128, false, false, false, WGAttrNumAccessEnum::Quad >
struct  WarpGemmDispatcher< ck_tile::pk_fp4_t, ck_tile::pk_fp4_t, float, 16, 16, 128, false, false, false, WGAttrNumAccessEnum::Quad >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16, 16, TransposeC, false >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::bf8_t, float, 16, 16, 16, TransposeC, false >
struct  WarpGemmDispatcher< ck_tile::fp8_t, ck_tile::bf8_t, float, 16, 16, 16, TransposeC, false >
struct  WarpGemmDispatcher< ck_tile::bf8_t, ck_tile::fp8_t, float, 16, 16, 16, TransposeC, false >
struct  WarpGemmDispatcher< ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 32, 32, 16, false >
struct  WarpGemmDispatcher< ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 32, 32, 16, true >
struct  WarpGemmDispatcher< ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 16, 16, 32, false >
struct  WarpGemmDispatcher< ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 16, 16, 32, true >
struct  WarpGemmDispatcher< ck_tile::int8_t, ck_tile::int8_t, int32_t, 16, 16, 16, TransposeC, false >

Typedefs

template<index_t I, typename... Ts>
using at_index_t = __type_pack_element<I, Ts...>
template<typename T>
using has_is_static = decltype(T::is_static())
template<typename T>
using RawIntegerType = typename RawIntegerType_<sizeof(T)>::type

Functions

template<index_t N>
CK_TILE_DEVICE void insert_dummy_dep_per_dword (array< float, N > &b)
template<>
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 2 > (array< float, 2 > &b)
template<>
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 3 > (array< float, 3 > &b)
template<>
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 4 > (array< float, 4 > &b)
template<>
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 8 > (array< float, 8 > &b)
template<>
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 16 > (array< float, 16 > &b)
template<>
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 32 > (array< float, 32 > &b)
CK_TILE_DEVICE void insert_dummy_dep ()
template<typename T>
CK_TILE_DEVICE void insert_dummy_dep (T &buffer)
template<typename Tx, typename... Ty>
CK_TILE_DEVICE void insert_dummy_dep (Tx &bx, Ty &... by)
template<index_t I, class T>
CK_TILE_HOST_DEVICE constexpr T getv (const tuple_object< I, T, true > &)
template<index_t I, class T>
CK_TILE_HOST_DEVICE constexpr const T & getv (const tuple_object< I, T, false > &x)
template<index_t I, class T>
CK_TILE_HOST_DEVICE constexpr T & getv (tuple_object< I, T, false > &x)
template<index_t I, class T>
CK_TILE_HOST_DEVICE constexpr T && getv (tuple_object< I, T, false > &&x)
template<typename SrcT, typename DstT, bool clip = true, bool stoch = false>
CK_TILE_HOST_DEVICE DstT run_cast_to_f8 (SrcT src, unsigned int rng=0)
template<typename SrcT, typename DstT, bool clip = true>
CK_TILE_HOST_DEVICE DstT run_cast_from_f8 (SrcT x)
template<typename X, typename Y, bool clip, bool stoch>
CK_TILE_HOST_DEVICE Y cast_to_f8 (X x, uint32_t rng)
template<typename OutDataType, typename InTensor>
CK_TILE_DEVICE auto cast_tile_pk_fp8_fp32 (const InTensor &in_dstr_tensors)
template<typename OutDataType, typename InTensor>
CK_TILE_DEVICE auto cast_tile_pk_fp16_fp32 (const InTensor &in_dstr_tensors)
CK_TILE_HOST_DEVICE index_t moe_sorting_mp_mesh_stride (index_t tokens)
CK_TILE_HOST index_t moe_sorting_mesh_byte_size (index_t tokens_, index_t, index_t topk_)
CK_TILE_HOST_DEVICE index_t moe_sorting_mp_mesh_smem_size (index_t tokens, index_t num_experts, index_t topk)
CK_TILE_HOST_DEVICE index_t moe_sorting_mp_cumsum_smem_size (index_t num_experts)
CK_TILE_HOST_DEVICE index_t moe_sorting_mp_sem_smem_size ()
template<typename T, typename F, index_t wave_size_ = get_warp_size()>
CK_TILE_DEVICE constexpr T moe_sorting_wave_reduce (T local, F reduce_f, number< wave_size_ >={})
template<typename data_t, int wave_size>
CK_TILE_DEVICE void moe_sorting_wave_cumsum (data_t &thread_data)
template<index_t kBlockSize = 256>
CK_TILE_DEVICE void moe_buf_set_zero_kernel (uint8x16_t *buf, long_index_t buf_bytes, index_t gid)
template<index_t kBlockSize = 256>
CK_TILE_DEVICE void moe_buf_set_zero_kernel_2d (void *buf, index_t row, index_t col, index_t elem_bytes, index_t gid, index_t blocks)
CK_TILE_HOST constexpr auto moe_sorting_get_smem_size_p23 (int num_experts_)

Typedef Documentation

◆ at_index_t

template<index_t I, typename... Ts>
using ck_tile::impl::at_index_t = __type_pack_element<I, Ts...>

◆ has_is_static

template<typename T>
using ck_tile::impl::has_is_static = decltype(T::is_static())

◆ RawIntegerType

template<typename T>
using ck_tile::impl::RawIntegerType = typename RawIntegerType_<sizeof(T)>::type

Function Documentation

◆ cast_tile_pk_fp16_fp32()

template<typename OutDataType, typename InTensor>
CK_TILE_DEVICE auto ck_tile::impl::cast_tile_pk_fp16_fp32 ( const InTensor & in_dstr_tensors)

◆ cast_tile_pk_fp8_fp32()

template<typename OutDataType, typename InTensor>
CK_TILE_DEVICE auto ck_tile::impl::cast_tile_pk_fp8_fp32 ( const InTensor & in_dstr_tensors)

◆ cast_to_f8()

template<typename X, typename Y, bool clip, bool stoch>
CK_TILE_HOST_DEVICE Y ck_tile::impl::cast_to_f8 ( X x,
uint32_t rng )

◆ getv() [1/4]

template<index_t I, class T>
CK_TILE_HOST_DEVICE constexpr const T & ck_tile::impl::getv ( const tuple_object< I, T, false > & x)
constexpr

◆ getv() [2/4]

template<index_t I, class T>
CK_TILE_HOST_DEVICE constexpr T ck_tile::impl::getv ( const tuple_object< I, T, true > & )
constexpr

◆ getv() [3/4]

template<index_t I, class T>
CK_TILE_HOST_DEVICE constexpr T && ck_tile::impl::getv ( tuple_object< I, T, false > && x)
constexpr

◆ getv() [4/4]

template<index_t I, class T>
CK_TILE_HOST_DEVICE constexpr T & ck_tile::impl::getv ( tuple_object< I, T, false > & x)
constexpr

◆ insert_dummy_dep() [1/3]

CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep ( )

◆ insert_dummy_dep() [2/3]

template<typename T>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep ( T & buffer)

◆ insert_dummy_dep() [3/3]

template<typename Tx, typename... Ty>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep ( Tx & bx,
Ty &... by )

◆ insert_dummy_dep_per_dword()

template<index_t N>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep_per_dword ( array< float, N > & b)

◆ insert_dummy_dep_per_dword< 16 >()

template<>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep_per_dword< 16 > ( array< float, 16 > & b)

◆ insert_dummy_dep_per_dword< 2 >()

template<>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep_per_dword< 2 > ( array< float, 2 > & b)

◆ insert_dummy_dep_per_dword< 3 >()

template<>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep_per_dword< 3 > ( array< float, 3 > & b)

◆ insert_dummy_dep_per_dword< 32 >()

template<>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep_per_dword< 32 > ( array< float, 32 > & b)

◆ insert_dummy_dep_per_dword< 4 >()

template<>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep_per_dword< 4 > ( array< float, 4 > & b)

◆ insert_dummy_dep_per_dword< 8 >()

template<>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep_per_dword< 8 > ( array< float, 8 > & b)

◆ moe_buf_set_zero_kernel()

template<index_t kBlockSize = 256>
CK_TILE_DEVICE void ck_tile::impl::moe_buf_set_zero_kernel ( uint8x16_t * buf,
long_index_t buf_bytes,
index_t gid )

◆ moe_buf_set_zero_kernel_2d()

template<index_t kBlockSize = 256>
CK_TILE_DEVICE void ck_tile::impl::moe_buf_set_zero_kernel_2d ( void * buf,
index_t row,
index_t col,
index_t elem_bytes,
index_t gid,
index_t blocks )

◆ moe_sorting_get_smem_size_p23()

CK_TILE_HOST constexpr auto ck_tile::impl::moe_sorting_get_smem_size_p23 ( int num_experts_)
constexpr

◆ moe_sorting_mesh_byte_size()

CK_TILE_HOST index_t ck_tile::impl::moe_sorting_mesh_byte_size ( index_t tokens_,
index_t ,
index_t topk_ )

◆ moe_sorting_mp_cumsum_smem_size()

CK_TILE_HOST_DEVICE index_t ck_tile::impl::moe_sorting_mp_cumsum_smem_size ( index_t num_experts)

◆ moe_sorting_mp_mesh_smem_size()

CK_TILE_HOST_DEVICE index_t ck_tile::impl::moe_sorting_mp_mesh_smem_size ( index_t tokens,
index_t num_experts,
index_t topk )

◆ moe_sorting_mp_mesh_stride()

CK_TILE_HOST_DEVICE index_t ck_tile::impl::moe_sorting_mp_mesh_stride ( index_t tokens)

◆ moe_sorting_mp_sem_smem_size()

CK_TILE_HOST_DEVICE index_t ck_tile::impl::moe_sorting_mp_sem_smem_size ( )

◆ moe_sorting_wave_cumsum()

template<typename data_t, int wave_size>
CK_TILE_DEVICE void ck_tile::impl::moe_sorting_wave_cumsum ( data_t & thread_data)

◆ moe_sorting_wave_reduce()

template<typename T, typename F, index_t wave_size_ = get_warp_size()>
CK_TILE_DEVICE constexpr T ck_tile::impl::moe_sorting_wave_reduce ( T local,
F reduce_f,
number< wave_size_ > = {} )
constexpr

◆ run_cast_from_f8()

template<typename SrcT, typename DstT, bool clip = true>
CK_TILE_HOST_DEVICE DstT ck_tile::impl::run_cast_from_f8 ( SrcT x)

◆ run_cast_to_f8()

template<typename SrcT, typename DstT, bool clip = true, bool stoch = false>
CK_TILE_HOST_DEVICE DstT ck_tile::impl::run_cast_to_f8 ( SrcT src,
unsigned int rng = 0 )