device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp File Reference

device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp File Reference#

Composable Kernel: device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp File Reference
device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp File Reference

Go to the source code of this file.

Classes

struct  ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, InLayout, WeiLayout, OutLayout, InDataType, WeiDataType, OutDataType, AccDataType, InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation, ConvBackwardWeightSpecialization, BlockSize, MPerBlock, NPerBlock, K0PerBlock, K1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, BBlockLdsAddExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CBlockTransferScalarPerVector_NWaveNPerXdl, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB >
struct  ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, InLayout, WeiLayout, OutLayout, InDataType, WeiDataType, OutDataType, AccDataType, InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation, ConvBackwardWeightSpecialization, BlockSize, MPerBlock, NPerBlock, K0PerBlock, K1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, BBlockLdsAddExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CBlockTransferScalarPerVector_NWaveNPerXdl, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB >::ActiveWorkgroupsPerCU
struct  ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, InLayout, WeiLayout, OutLayout, InDataType, WeiDataType, OutDataType, AccDataType, InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation, ConvBackwardWeightSpecialization, BlockSize, MPerBlock, NPerBlock, K0PerBlock, K1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, BBlockLdsAddExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CBlockTransferScalarPerVector_NWaveNPerXdl, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB >::Argument
struct  ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, InLayout, WeiLayout, OutLayout, InDataType, WeiDataType, OutDataType, AccDataType, InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation, ConvBackwardWeightSpecialization, BlockSize, MPerBlock, NPerBlock, K0PerBlock, K1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, BBlockLdsAddExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CBlockTransferScalarPerVector_NWaveNPerXdl, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB >::Invoker

Namespaces

namespace  ck
namespace  ck::tensor_operation
namespace  ck::tensor_operation::device

Functions

template<typename GridwiseGemm, typename AGridDesc_AK0_M_K1, typename BGridDesc_BK0_N_K1, typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, typename ComputePtrOffsetOfBatch, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::tensor_operation::device::kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3 (typename GridwiseGemm::Argument karg, const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1, const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const index_t num_k_per_block)
template<typename GridwiseGemm, typename AGridDesc_AK0_M_K1, typename BGridDesc_BK0_N_K1, typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, typename ComputePtrOffsetOfBatch, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::tensor_operation::device::kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3_2lds (typename GridwiseGemm::Argument karg, const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1, const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const index_t num_k_per_block)