StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ > Struct Template Reference

StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ > Struct Template Reference

Composable Kernel: ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ > Struct Template Reference
ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ > Struct Template Reference

#include <streamk_gemm_kernel.hpp>

Classes

struct  StreamKKernelArgs
 ALayout and ADataType are expected to be scalars, not tuples. More...

Public Types

using UniversalGemmKernel
 Inject the UniversalGemmKernel base class to support execution of all necessary functions.
using TilePartitioner = remove_cvref_t<TilePartitioner_>
using GemmPipeline = remove_cvref_t<GemmPipeline_>
using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>
using ALayout = remove_cvref_t<typename GemmPipeline::ALayout>
 Specify the layout configurations for A, B, and C.
using BLayout = remove_cvref_t<typename GemmPipeline::BLayout>
using CLayout = remove_cvref_t<typename GemmPipeline::CLayout>
using ADataType = remove_cvref_t<typename GemmPipeline::ADataType>
 Specify the data type configurations for A, B, and C.
using BDataType = remove_cvref_t<typename GemmPipeline::BDataType>
using CDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>
using KernelArgs = StreamKKernelArgs
using Kernel = StreamKKernel<TilePartitioner, GemmPipeline, EpiloguePipeline>

Public Member Functions

CK_TILE_DEVICE void operator() (StreamKKernelArgs kargs) const
 Entry point for the Stream-K Kernel, performing the main Stream-K loop.

Static Public Member Functions

static CK_TILE_HOST const std::string GetName ()
static CK_TILE_HOST auto GridSize (const TilePartitioner &tile_partitioner) -> dim3
 Compute the grid size for the Stream K kernel using the tile_partitioner.
static CK_TILE_HOST auto MaxOccupancyGridSize (const stream_config &s) -> dim3
 Get the maximum occupancy grid size for the persistent kernel on the current device.
static CK_TILE_HOST constexpr auto BlockSize () -> dim3
static CK_TILE_HOST StreamKKernelArgs MakeKernelArgs (const StreamKHostArgs &host_args, int num_cu=NumCU(), int occupancy=Occupancy())
 Constructs kernel arguments for the Stream-K kernel.
template<bool UseDefaultScheduler = true>
static CK_TILE_DEVICE void RunGemm (const std::array< const ADataType *, UniversalGemmKernel::NumATensor > &as_ptr, const std::array< const BDataType *, UniversalGemmKernel::NumBTensor > &bs_ptr, const std::array< const void *, UniversalGemmKernel::NumDTensor > &ds_ptr, CDataType *c_ptr, void *smem_ptr_0, const typename UniversalGemmKernel::KernelArgs &kargs, const index_t num_loop, const index_t block_idx_m, const index_t block_idx_n, const index_t k_size)
static CK_TILE_HOST bool IsSupportedArgument (const StreamKKernelArgs &kargs)
static CK_TILE_HOST uint32_t GetWorkSpaceSize (const StreamKKernelArgs &kargs)
 Computes the buffer size needed to store accumulation results for Stream K.
static CK_TILE_HOST void SetWorkSpacePointer (StreamKKernelArgs &kargs, void *workspace_ptr)
 Sets the kargs' current workspace_ptr to the given workspace_ptr.

Static Public Attributes

static constexpr index_t kBlockSize = UniversalGemmKernel::kBlockSize

Member Typedef Documentation

◆ ADataType

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
using ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::ADataType = remove_cvref_t<typename GemmPipeline::ADataType>

Specify the data type configurations for A, B, and C.

◆ ALayout

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
using ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::ALayout = remove_cvref_t<typename GemmPipeline::ALayout>

Specify the layout configurations for A, B, and C.

◆ BDataType

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
using ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::BDataType = remove_cvref_t<typename GemmPipeline::BDataType>

◆ BLayout

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
using ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::BLayout = remove_cvref_t<typename GemmPipeline::BLayout>

◆ CDataType

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
using ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::CDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>

◆ CLayout

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
using ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::CLayout = remove_cvref_t<typename GemmPipeline::CLayout>

◆ EpiloguePipeline

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
using ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>

◆ GemmPipeline

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
using ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::GemmPipeline = remove_cvref_t<GemmPipeline_>

◆ Kernel

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
using ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::Kernel = StreamKKernel<TilePartitioner, GemmPipeline, EpiloguePipeline>

◆ KernelArgs

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
using ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::KernelArgs = StreamKKernelArgs

◆ TilePartitioner

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
using ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::TilePartitioner = remove_cvref_t<TilePartitioner_>

◆ UniversalGemmKernel

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
using ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::UniversalGemmKernel
Initial value:
UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ > UniversalGemmKernel
Inject the UniversalGemmKernel base class to support execution of all necessary functions.
Definition batched_gemm_kernel.hpp:65

Inject the UniversalGemmKernel base class to support execution of all necessary functions.

Member Function Documentation

◆ BlockSize()

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
CK_TILE_HOST constexpr auto ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::BlockSize ( ) ->dim3
inline static constexpr

◆ GetName()

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
CK_TILE_HOST const std::string ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::GetName ( )
inline static nodiscard

◆ GetWorkSpaceSize()

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
CK_TILE_HOST uint32_t ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::GetWorkSpaceSize ( const StreamKKernelArgs & kargs)
inline static

Computes the buffer size needed to store accumulation results for Stream K.

Returns
The buffer size needed.

◆ GridSize()

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
CK_TILE_HOST auto ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::GridSize ( const TilePartitioner & tile_partitioner) ->dim3
inline static

Compute the grid size for the Stream K kernel using the tile_partitioner.

Returns
The grid size.

◆ IsSupportedArgument()

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
CK_TILE_HOST bool ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::IsSupportedArgument ( const StreamKKernelArgs & kargs)
inline static

◆ MakeKernelArgs()

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
CK_TILE_HOST StreamKKernelArgs ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::MakeKernelArgs ( const StreamKHostArgs & host_args,
int num_cu = NumCU(),
int occupancy = Occupancy() )
inline static

Constructs kernel arguments for the Stream-K kernel.

Parameters
host_args: Stream-K host arguments.
num_cu: Number of compute units (CUs). The default is the number of CUs on the device. The caller may select their own to assist with test reproducibility, etc.
occupancy: The maximum number of active blocks per CU for this kernel. The caller may select their own to assist with test reproducibility, etc.
Returns
The kernel arguments for Stream-K.

◆ MaxOccupancyGridSize()

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
CK_TILE_HOST auto ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::MaxOccupancyGridSize ( const stream_config & s) ->dim3
inline static

Get the maximum occupancy grid size for the persistent kernel on the current device.

Returns
The maximum occupancy grid size.
Note
This function queries the maximum occupancy of the kernel using hipOccupancyMaxActiveBlocksPerMultiprocessor.

◆ operator()()

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
CK_TILE_DEVICE void ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::operator() ( StreamKKernelArgs kargs) const
inline

Entry point for the Stream-K Kernel, performing the main Stream-K loop.

◆ RunGemm()

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
template<bool UseDefaultScheduler = true>
CK_TILE_DEVICE void ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::RunGemm ( const std::array< const ADataType *, UniversalGemmKernel::NumATensor > & as_ptr,
const std::array< const BDataType *, UniversalGemmKernel::NumBTensor > & bs_ptr,
const std::array< const void *, UniversalGemmKernel::NumDTensor > & ds_ptr,
CDataType * c_ptr,
void * smem_ptr_0,
const typename UniversalGemmKernel::KernelArgs & kargs,
const index_t num_loop,
const index_t block_idx_m,
const index_t block_idx_n,
const index_t k_size )
inline static

◆ SetWorkSpacePointer()

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
CK_TILE_HOST void ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::SetWorkSpacePointer ( StreamKKernelArgs & kargs,
void * workspace_ptr )
inline static

Sets the kargs' current workspace_ptr to the given workspace_ptr.

Note
Assumes that the given workspace_ptr points to allocated device memory.

Member Data Documentation

◆ kBlockSize

template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
index_t ck_tile::StreamKKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::kBlockSize = UniversalGemmKernel::kBlockSize
static constexpr

The documentation for this struct was generated from the following file: