20 typename ComputeDataType,
24 typename AMmaTileDesc,
25 typename BMmaTileDesc,
26 index_t ABlockTransferSrcScalarPerVector,
27 index_t BBlockTransferSrcScalarPerVector,
43 typename ComputeDataType,
47 typename AMmaTileDesc,
48 typename BMmaTileDesc,
49 index_t ABlockTransferSrcScalarPerVector,
50 index_t BBlockTransferSrcScalarPerVector,
71 ABlockTransferSrcScalarPerVector,
72 BBlockTransferSrcScalarPerVector,
90 ABlockTransferSrcScalarPerVector,
91 BBlockTransferSrcScalarPerVector,
111 ABlockTransferSrcScalarPerVector,
112 BBlockTransferSrcScalarPerVector,
155 template <
typename TileDesc_M0_M1_M2_K>
166 TileDesc_M0_M1_M2_K{},
193 constexpr auto num_ds_read_inst_a =
203 static_assert(num_buffer_load_inst_a == num_ds_write_inst_a);
208 constexpr auto ds_read_a_issue_cycle =
210 constexpr auto ds_read_a_mfma_rate =
216 constexpr auto num_total_stages = MRepeat;
220 constexpr auto num_mfma_perstage = num_mfma_inst / num_total_stages;
221 constexpr auto num_ds_read_a_perstage = num_ds_read_inst_a / num_total_stages;
223 constexpr auto num_ds_read_a_mfma_perstage =
226 constexpr auto num_ds_read_a_prefetch_stages = 2;
229 (num_buffer_load_inst_a + num_buffer_load_inst_b), (num_total_stages - 2));
231 (num_buffer_load_inst_a + num_buffer_load_inst_b), (num_total_stages - 2));
233 constexpr auto buffer_load_stages_more =
234 (num_buffer_load_inst_a + num_buffer_load_inst_b) -
236 (num_total_stages - 2)) *
237 ((num_total_stages - 2));
239 constexpr auto buffer_load_b_stages =
240 buffer_load_perstage_more * buffer_load_stages_more > num_buffer_load_inst_b
241 ? num_buffer_load_inst_b / buffer_load_perstage_more
242 : (buffer_load_stages_more +
243 (num_buffer_load_inst_b - buffer_load_perstage_more * buffer_load_stages_more) /
244 buffer_load_perstage_less);
246 constexpr auto buffer_load_a_stages =
247 num_total_stages - num_ds_read_a_prefetch_stages - buffer_load_b_stages;
249 constexpr auto buffer_load_issue_point_b = 0;
250 constexpr auto buffer_load_issue_point_interval_more =
251 num_mfma_perstage / buffer_load_perstage_more;
252 constexpr auto buffer_load_issue_point_interval_less =
253 num_mfma_perstage / buffer_load_perstage_less;
254 constexpr auto ds_write_issue_point = 0;
255 constexpr auto buffer_load_issue_point_a = num_mfma_perstage >= 3 ? 1 : 0;
260 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
262 if constexpr(((i < buffer_load_stages_more) &&
263 (imfma % buffer_load_issue_point_interval_more ==
264 buffer_load_issue_point_b)) ||
265 ((i >= buffer_load_stages_more) &&
266 (imfma % buffer_load_issue_point_interval_less ==
267 buffer_load_issue_point_b)))
269 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
272 if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
274 __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0);
282 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
283 if constexpr((((i + buffer_load_b_stages) < buffer_load_stages_more) &&
284 (imfma % buffer_load_issue_point_interval_more ==
285 ds_write_issue_point)) ||
286 (((i + buffer_load_b_stages) >= buffer_load_stages_more) &&
287 (imfma % buffer_load_issue_point_interval_less ==
288 ds_write_issue_point)))
290 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0);
292 if constexpr((((i + buffer_load_b_stages) < buffer_load_stages_more) &&
293 (imfma % buffer_load_issue_point_interval_more ==
294 buffer_load_issue_point_a)) ||
295 (((i + buffer_load_b_stages) >= buffer_load_stages_more) &&
296 (imfma % buffer_load_issue_point_interval_less ==
297 buffer_load_issue_point_a)))
299 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
301 if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
303 __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0);
312 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
313 if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
315 __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0);
321 template <
typename Stage>
326 constexpr auto num_buffer_load_inst_b =
331 constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
332 constexpr auto staged_num_mfma = num_mfma / MRepeat;
334 constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a;
336 if constexpr(stage.value == 0)
338 constexpr auto staged_num_buffer_load_b_per_ds_read_a =
339 num_buffer_load_inst_b / staged_num_ds_read_inst_a;
340 constexpr auto staged_num_mfma_per_buffer_load_b =
341 staged_num_mfma / num_buffer_load_inst_b;
348 __builtin_amdgcn_sched_group_barrier(
349 0x008, staged_num_mfma_per_buffer_load_b, 0);
350 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
353 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
354 __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);
355 __builtin_amdgcn_sched_group_barrier(
356 0x008, staged_num_mfma_per_buffer_load_b - 1, 0);
357 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
360 __builtin_amdgcn_sched_barrier(0);
362 else if constexpr(stage.value == 1)
364 constexpr auto staged_num_mfma_per_ds_write_a =
367 constexpr auto stage_more_mfma =
368 staged_num_mfma - (staged_num_mfma_per_ds_write_a - 1) * num_ds_write_inst_a;
372 if constexpr(i_inst.value < stage_more_mfma)
374 if(i_inst.value < staged_num_ds_read_inst_a)
376 __builtin_amdgcn_sched_group_barrier(
377 0x008, staged_num_mfma_per_ds_write_a - 1, 0);
378 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0);
379 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
380 __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);
384 __builtin_amdgcn_sched_group_barrier(
385 0x008, staged_num_mfma_per_ds_write_a, 0);
386 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0);
391 if(i_inst.value < staged_num_ds_read_inst_a)
393 __builtin_amdgcn_sched_group_barrier(
394 0x008, staged_num_mfma_per_ds_write_a - 2, 0);
395 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0);
396 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
397 __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);
401 __builtin_amdgcn_sched_group_barrier(
402 0x008, staged_num_mfma_per_ds_write_a - 1, 0);
403 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0);
407 __builtin_amdgcn_sched_barrier(0);
414 __builtin_amdgcn_sched_group_barrier(
415 0x008, staged_num_mfma_per_ds_read_a, 0);
416 __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);
419 __builtin_amdgcn_sched_barrier(0);
429 constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
430 constexpr auto staged_num_mfma = num_mfma / MRepeat;
432 constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a;
437 __builtin_amdgcn_sched_group_barrier(0x008, staged_num_mfma_per_ds_read_a, 0);
438 __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);
441 __builtin_amdgcn_sched_barrier(0);
444 template <
bool HasMainLoop,
448 typename ABlockTransfer,
449 typename AGridBuffer,
450 typename ABlockBuffer,
451 typename ABlockTransferStep,
453 typename BBlockTransfer,
454 typename BGridBuffer,
455 typename BBlockBuffer,
456 typename BBlockTransferStep,
457 typename CThreadBuffer>
458 __device__
void Run(
const AGridDesc& a_grid_desc,
459 const ABlockDesc& a_block_desc,
460 ABlockTransfer& a_blockwise_copy,
461 const AGridBuffer& a_grid_buf,
462 ABlockBuffer& a_block_buf,
463 const ABlockTransferStep& a_block_copy_step,
464 const BGridDesc& b_grid_desc,
465 BBlockTransfer& b_blockwise_copy,
466 BBlockTransfer& b_blockwise_copy_up,
467 const BGridBuffer& b_grid_buf,
468 const BGridBuffer& b_grid_buf_up,
469 BBlockBuffer& b_block_buf,
470 const BBlockTransferStep& b_block_copy_step,
471 CThreadBuffer& c_thread_buf,
472 CThreadBuffer& c_thread_buf_up,
476 __builtin_amdgcn_sched_barrier(0);
487 b_blockwise_copy.Run(b_grid_desc,
493 b_blockwise_copy_up.Run(b_grid_desc,
497 b_thread_bufs_up(
I0));
498 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
499 b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
501 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
502 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
503 __builtin_amdgcn_sched_barrier(0);
506 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(
I0));
509 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
510 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
528 c_thread_buf.Clear();
529 c_thread_buf_up.Clear();
531 __builtin_amdgcn_sched_barrier(0);
534 if constexpr(HasMainLoop)
539 auto LoopFunc = [&](
auto mfma_reg_buf,
auto local_read_buf) {
540 b_blockwise_copy.Run(b_grid_desc,
544 b_thread_bufs(local_read_buf));
545 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
546 b_blockwise_copy_up.Run(b_grid_desc,
550 b_thread_bufs_up(local_read_buf));
551 b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
553 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(local_read_buf));
554 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
555 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
564 a_thread_vec.template AsType<ComputeDataType>()(ik) =
573 b_thread_vec.template AsType<ComputeDataType>()(ik) =
574 b_thread_bufs[mfma_reg_buf]
578 b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
579 b_thread_bufs_up[mfma_reg_buf]
584 using mfma_input_type =
592 a_thread_vec.template AsType<mfma_input_type>(),
593 b_thread_vec.template AsType<mfma_input_type>(),
597 a_thread_vec.template AsType<mfma_input_type>(),
598 b_thread_vec_up.template AsType<mfma_input_type>(),
603 if constexpr(m0.value == MRepeat - 2)
617 a_block_buf.At(local_read_buf),
631 else if constexpr(m0.value == (MRepeat - 1))
643 a_block_buf.At(local_read_buf),
669 a_block_buf.At(mfma_reg_buf),
691 }
while(i < (num_loop - 2));
696 b_blockwise_copy.Run(b_grid_desc,
702 b_blockwise_copy_up.Run(b_grid_desc,
706 b_thread_bufs_up(
I1));
707 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(
I1));
716 a_thread_vec.template AsType<ComputeDataType>()(ik) =
719 b_thread_vec.template AsType<ComputeDataType>()(ik) =
723 b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
728 using mfma_input_type =
734 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
735 b_thread_vec.template AsType<mfma_input_type>(),
738 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
739 b_thread_vec_up.template AsType<mfma_input_type>(),
743 if constexpr(m0.value == (MRepeat - 2))
765 else if constexpr(m0.value == MRepeat - 1)
817 a_thread_vec.template AsType<ComputeDataType>()(ik) =
820 b_thread_vec.template AsType<ComputeDataType>()(ik) =
823 b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
828 using mfma_input_type =
834 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
835 b_thread_vec.template AsType<mfma_input_type>(),
838 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
839 b_thread_vec_up.template AsType<mfma_input_type>(),
844 if constexpr(m0.value < (MRepeat - 2))
880 a_thread_vec.template AsType<ComputeDataType>()(ik) =
883 b_thread_vec.template AsType<ComputeDataType>()(ik) =
886 b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
891 using mfma_input_type =
897 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
898 b_thread_vec.template AsType<mfma_input_type>(),
900 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
901 b_thread_vec_up.template AsType<mfma_input_type>(),
906 if constexpr(m0.value < (MRepeat - 2))
__host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
Definition utility/math.hpp:66
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
Definition utility/math.hpp:72
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
integral_constant< index_t, N > Number
Definition number.hpp:12
TailNumber
Definition blkgemmpipe_scheduler.hpp:31
@ Even
Definition blkgemmpipe_scheduler.hpp:34
@ Odd
Definition blkgemmpipe_scheduler.hpp:33
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
BlockGemmPipelineScheduler
Definition blkgemmpipe_scheduler.hpp:25
@ Intrawave
Definition blkgemmpipe_scheduler.hpp:26
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
__device__ void block_sync_lds()
Definition synchronization.hpp:16
__host__ __device__ constexpr auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition multi_index_transform_helper.hpp:90
__host__ __device__ BlockwiseGemmXdlops_pipeline_base(Tuple4 a_origin=CalculateAThreadOriginDataIndex(), Tuple4 b_origin=CalculateBThreadOriginDataIndex())
Constructor for BlockwiseGemmXdlops_pipeline_base.
Definition blockwise_gemm_pipeline_xdlops_base.hpp:222
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:280
static constexpr index_t MWaves
Definition blockwise_gemm_pipeline_xdlops_base.hpp:44
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:239
static constexpr auto c_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:378
static constexpr auto xdlops_gemm
Definition blockwise_gemm_pipeline_xdlops_base.hpp:54
static constexpr index_t KGroup
Definition blockwise_gemm_pipeline_xdlops_base.hpp:67
static constexpr auto I1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:37
__host__ static __device__ constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:266
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:294
static constexpr index_t AMmaKStride
Definition blockwise_gemm_pipeline_xdlops_base.hpp:60
static __device__ auto CalculateAThreadOriginDataIndex6D()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:136
static constexpr index_t WaveSize
Definition blockwise_gemm_pipeline_xdlops_base.hpp:46
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:253
static constexpr index_t B_K1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:51
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops > HotLoopInstList
Definition blockwise_gemm_pipeline_xdlops_base.hpp:82
__host__ __device__ constexpr auto & GetCThreadBuffer()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:111
static constexpr auto I0
Definition blockwise_gemm_pipeline_xdlops_base.hpp:36
static __device__ auto CalculateCThreadOriginDataIndex(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:160
static __device__ auto CalculateCThreadOriginDataIndex8D(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:189
static constexpr index_t KRepeat
Definition blockwise_gemm_pipeline_xdlops_base.hpp:64
static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:359
static constexpr auto I2
Definition blockwise_gemm_pipeline_xdlops_base.hpp:38
static constexpr index_t A_K1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:50
static constexpr index_t BMmaKStride
Definition blockwise_gemm_pipeline_xdlops_base.hpp:61
__host__ static __device__ constexpr auto MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N &c_grid_desc_g_m_n)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:341
__host__ static __device__ constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:307
__host__ static __device__ constexpr auto MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N &c_grid_desc_m_n)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:324
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MWaves static constexpr index_t MWaves
Definition blockwise_gemm_pipeline_xdlops_base.hpp:44
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Run __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, BBlockTransfer &b_blockwise_copy, BBlockTransfer &b_blockwise_copy_up, const BGridBuffer &b_grid_buf, const BGridBuffer &b_grid_buf_up, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, CThreadBuffer &c_thread_buf_up, index_t num_loop) const
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:458
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GlobalBufferNum static constexpr index_t GlobalBufferNum
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:152
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AThreadCopy ThreadwiseTensorSliceTransfer_v4< ADataType, ComputeDataType, decltype(a_block_desc_m0_m1_m2_k0_k1_k2), decltype(a_thread_desc_), Sequence< 1, 1, 1, 1, 1, KPack/KGroup >, Sequence< 0, 1, 2, 3, 4, 5 >, 5, A_K1, A_K1 > AThreadCopy
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:933
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:378
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::xdlops_gemm static constexpr auto xdlops_gemm
Definition blockwise_gemm_pipeline_xdlops_base.hpp:54
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotLoopScheduler static __device__ constexpr auto HotLoopScheduler()
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:189
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockLoopTailNum __host__ static __device__ constexpr TailNumber BlockLoopTailNum(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:184
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KGroup static constexpr index_t KGroup
Definition blockwise_gemm_pipeline_xdlops_base.hpp:67
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:945
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockHasHotloop __host__ static __device__ constexpr bool BlockHasHotloop(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:179
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I1 static constexpr auto I1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:37
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_block_desc_n0_n1_k0_k1 static constexpr BTileDesc b_block_desc_n0_n1_k0_k1
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:948
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotloopLocalBufSwitch static constexpr index_t HotloopLocalBufSwitch
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:153
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::WaveSize static constexpr index_t WaveSize
Definition blockwise_gemm_pipeline_xdlops_base.hpp:46
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MakeAGemmMmaTileDescriptor __host__ static __device__ constexpr auto MakeAGemmMmaTileDescriptor(const TileDesc_M0_M1_M2_K &)
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:156
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefetchStages static constexpr index_t PrefetchStages
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:150
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I0 static constexpr auto I0
Definition blockwise_gemm_pipeline_xdlops_base.hpp:36
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_k0_k1_k2 static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:176
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:930
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KRepeat static constexpr index_t KRepeat
Definition blockwise_gemm_pipeline_xdlops_base.hpp:64
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_k static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:359
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefillStages static constexpr index_t PrefillStages
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:151
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I2 static constexpr auto I2
Definition blockwise_gemm_pipeline_xdlops_base.hpp:38
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:943
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Base BlockwiseGemmXdlops_pipeline_base< BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Base
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:102
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::A_K1 static constexpr index_t A_K1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:50
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::EpilogueScheduler_1 static __device__ constexpr auto EpilogueScheduler_1(Stage stage)
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:322
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::EpilogueScheduler_2 static __device__ constexpr auto EpilogueScheduler_2()
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:423
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp:37
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::A_LDS_Read_Inst_Num static constexpr index_t A_LDS_Read_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:49
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::A_LDS_Read_Width static constexpr index_t A_LDS_Read_Width
Definition blkgemmpipe_scheduler.hpp:82
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::A_LDS_Write_Inst_Num static constexpr index_t A_LDS_Write_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:44
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::C_MFMA_Inst_Num static constexpr index_t C_MFMA_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:54
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::C_MFMA_Inst_Cycle static constexpr index_t C_MFMA_Inst_Cycle
Definition blkgemmpipe_scheduler.hpp:105
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::A_Buffer_Load_Inst_Num static constexpr index_t A_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:39
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::B_Buffer_Load_Inst_Num static constexpr index_t B_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:41
Definition utility/sequence.hpp:43
Definition threadwise_tensor_slice_transfer.hpp:1260
Definition functional2.hpp:33
Definition dtype_vector.hpp:10