include/ck_tile/host/fill.hpp

include/ck_tile/host/fill.hpp#

Composable Kernel: include/ck_tile/host/fill.hpp
include/ck_tile/host/fill.hpp

Functor for filling a range with randomly generated values from a uniform distribution.

Functor for filling a range with randomly generated values from a uniform distribution.This struct provides functionality to fill iterators or ranges with random values generated from a uniform distribution. It supports both single-threaded and multi-threaded operation.

Template Parameters
TThe target type for the generated values.
Note
The multi-threaded implementation is not guaranteed to provide perfectly distributed values across threads.
// Direct usage without creating a separate variable:
ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_host_tensor);
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <cmath>
#include <iterator>
#include <optional>
#include <random>
#include <stdexcept>
#include <type_traits>
#include <utility>
#include <unordered_set>
#include "ck_tile/core.hpp"
namespace ck_tile {
template <typename T>
struct FillUniformDistribution
{
float a_{-5.f};
float b_{5.f};
std::optional<uint32_t> seed_{11939};
// ATTENTION: Whether to use multi-threading (note: not guaranteed to be perfectly distributed
// across threads).
bool threaded = false;
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
{
uint32_t num_thread = std::thread::hardware_concurrency();
auto total = static_cast<std::size_t>(std::distance(first, last));
auto work_per_thread = static_cast<std::size_t>((total + num_thread - 1) / num_thread);
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end = std::min((it + 1) * work_per_thread, total);
auto thread_f = [this, total, iw_begin, iw_end, &first] {
if(iw_begin > total || iw_end > total)
return;
// need to make each thread unique, add an offset to current seed
std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin)
: std::random_device{}());
std::uniform_real_distribution<float> dis(a_, b_);
std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() {
if constexpr(numeric_traits<T>::PackedSize == 2)
return ck_tile::type_convert<T>(fp32x2_t{dis(gen), dis(gen)});
else
return ck_tile::type_convert<T>(dis(gen));
});
};
threads[it] = joinable_thread(thread_f);
}
}
else
{
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
std::uniform_real_distribution<float> dis(a_, b_);
std::generate(first, last, [&dis, &gen]() {
if constexpr(numeric_traits<T>::PackedSize == 2)
return ck_tile::type_convert<T>(fp32x2_t{dis(gen), dis(gen)});
else
return ck_tile::type_convert<T>(dis(gen));
});
}
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <>
struct FillUniformDistribution<ck_tile::pk_int4_t>
{
float a_{-8.f}; // same type as primary template so that
// `FillUniformDistribution<Type>{-5.0f, 5.0f}` works for all types
float b_{7.f};
std::optional<uint32_t> seed_{11939};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
{
throw std::runtime_error(
"a_ or b_ of FillUniformDistribution<ck_tile::pk_int4_t> is out of range.");
}
int min_value = static_cast<int>(a_);
int max_value = static_cast<int>(b_);
constexpr auto int4_array = std::array<uint8_t, 16>{0x88,
0x99,
0xaa,
0xbb,
0xcc,
0xdd,
0xee,
0xff,
0x00,
0x11,
0x22,
0x33,
0x44,
0x55,
0x66,
0x77};
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
std::uniform_int_distribution<std::int32_t> dis(0, max_value - min_value + 1);
while(first != last)
{
int randomInt = dis(gen);
*first = int4_array[randomInt + (min_value + 8)];
++first;
}
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
namespace impl {
// clang-format off
template<index_t bytes> struct RawIntegerType_ {};
template<> struct RawIntegerType_<1> { using type = uint8_t;};
template<> struct RawIntegerType_<2> { using type = uint16_t;};
template<> struct RawIntegerType_<4> { using type = uint32_t;};
template<> struct RawIntegerType_<8> { using type = uint64_t;};
// clang-format on
template <typename T>
using RawIntegerType = typename RawIntegerType_<sizeof(T)>::type;
} // namespace impl
// Note: this struct will have no const-ness will generate random
template <typename T>
struct FillUniformDistribution_Unique
{
float a_{-5.f};
float b_{5.f};
std::optional<uint32_t> seed_{11939};
std::mt19937 gen_{};
std::unordered_set<impl::RawIntegerType<T>> set_{};
float b = 5.f,
std::optional<uint32_t> seed = {11939})
: a_(a),
b_(b),
seed_(seed),
gen_{seed_.has_value() ? *seed_ : std::random_device{}()},
set_{}
{
}
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last)
{
std::mt19937& gen = gen_;
std::uniform_real_distribution<float> dis(a_, b_);
auto& set = set_;
std::generate(first, last, [&dis, &gen, &set]() {
T v = static_cast<T>(0);
do
{
v = ck_tile::type_convert<T>(dis(gen));
} while(set.count(bit_cast<impl::RawIntegerType<T>>(v)) == 1);
return v;
});
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range)
-> std::void_t<decltype(std::declval<FillUniformDistribution_Unique&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
void clear() { set_.clear(); }
};
template <typename T>
struct FillNormalDistribution
{
float mean_{0.f};
float variance_{1.f};
std::optional<uint32_t> seed_{11939};
// ATTENTION: threaded does not guarantee the distribution between thread
bool threaded = false;
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
{
uint32_t num_thread = std::thread::hardware_concurrency();
auto total = static_cast<std::size_t>(std::distance(first, last));
auto work_per_thread = static_cast<std::size_t>((total + num_thread - 1) / num_thread);
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end = std::min((it + 1) * work_per_thread, total);
auto thread_f = [this, total, iw_begin, iw_end, &first] {
if(iw_begin > total || iw_end > total)
return;
// need to make each thread unique, add an offset to current seed
std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin)
: std::random_device{}());
std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() {
return ck_tile::type_convert<T>(dis(gen));
});
};
threads[it] = joinable_thread(thread_f);
}
}
else
{
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
std::generate(
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
}
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillNormalDistribution&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
// However this produces segfaults in std::mt19937 which look like inifite loop.
// template <typename T>
// struct FillUniformDistributionIntegerValue
// {
// int a_{-5};
// int b_{5};
//
// template <typename ForwardIter>
// void operator()(ForwardIter first, ForwardIter last) const
// {
// std::mt19937 gen(11939);
// std::uniform_int_distribution<int> dis(a_, b_);
// std::generate(
// first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
// }
// };
// Workaround for uniform_int_distribution not working as expected. See note above.<
template <typename T>
struct FillUniformDistributionIntegerValue
{
float a_{-5.f};
float b_{5.f};
std::optional<uint32_t> seed_{11939};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
std::uniform_real_distribution<float> dis(a_, b_);
std::generate(
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(std::round(dis(gen))); });
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillUniformDistributionIntegerValue&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <typename T>
struct FillNormalDistributionIntegerValue
{
float mean_{0.f};
float variance_{1.f};
std::optional<uint32_t> seed_{11939};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
std::generate(
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(std::round(dis(gen))); });
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillNormalDistributionIntegerValue&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <typename T>
struct FillMonotonicSeq
{
T step_{1};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::generate(first, last, [=, *this, n = init_value_]() mutable {
auto tmp = n;
if constexpr(std::is_same_v<decltype(tmp), pk_int4_t>)
{
n.data += step_.data;
}
else
{
n += step_;
}
return tmp;
});
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillMonotonicSeq&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <typename T, bool IsAscending = true>
struct FillStepRange
{
float start_value_{0};
float end_value_{3};
float step_{1};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::generate(first, last, [=, *this, n = start_value_]() mutable {
auto tmp = n;
n += step_;
if constexpr(IsAscending)
{
if(n > end_value_)
}
else
{
if(n < end_value_)
}
return type_convert<T>(tmp);
});
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillStepRange&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <typename T>
struct FillConstant
{
T value_{0};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::fill(first, last, value_);
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillConstant&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
//----------------------------------------------------------------------------------------------
template <typename T>
struct AdjustToStructuredSparsity
{
size_t start{0};
// masks represent all valid 2:4 structured sparsity permutations
// clang-format off
static constexpr int32_t masks[] = {0, 0, 1, 1,
0, 1, 0, 1,
0, 1, 1, 0,
1, 0, 0, 1,
1, 0, 1, 0,
1, 1, 0, 0,
0, 0, 0, 1,
0, 0, 1, 0,
0, 1, 0, 0,
1, 0, 0, 0};
// clang-format on
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::transform(first, last, first, [=, *this, index = start](T val) mutable {
auto tmp = val * masks[index % (sizeof(masks) / sizeof(int32_t))];
index += 1;
return type_convert<T>(tmp);
});
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const AdjustToStructuredSparsity&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <typename T, bool UseCos = true, bool UseAbs = false>
struct FillTrigValue
{
template <typename T_, bool UseCos_ = true, bool UseAbs_ = false>
struct LinearTrigGen
{
int i{0};
auto operator()()
{
float v = 0;
if constexpr(UseCos_)
{
v = cos(i);
}
else
{
v = sin(i);
}
if constexpr(UseAbs_)
v = abs(v);
i++;
}
};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
LinearTrigGen<T, UseCos, UseAbs> gen;
std::generate(first, last, gen);
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillTrigValue&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
} // namespace ck_tile
typename RawIntegerType_< sizeof(T)>::type RawIntegerType
Definition tile/host/fill.hpp:168
Definition tile/core/algorithm/cluster_descriptor.hpp:13
@ set
Definition arch.hpp:57
CK_TILE_HOST T cos(T x)
Definition tile/core/numeric/math.hpp:752
CK_TILE_HOST_DEVICE constexpr Y bit_cast(const X &x)
Definition bit_cast.hpp:11
CK_TILE_HOST T sin(T x)
Definition tile/core/numeric/math.hpp:698
CK_TILE_HOST_DEVICE bfloat16_t abs(const bfloat16_t &x)
Definition bfloat16.hpp:400
float fp32x2_t
Definition pk_fp4.hpp:22
CK_TILE_HOST_DEVICE constexpr Y type_convert(X x)
Definition tile/core/numeric/type_convert.hpp:29
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
Definition pointer.h:1517
unsigned short uint16_t
Definition stdint.h:125
unsigned int uint32_t
Definition stdint.h:126
signed int int32_t
Definition stdint.h:123
unsigned char uint8_t
Definition stdint.h:124
unsigned __int64 uint64_t
Definition stdint.h:136
size_t start
Definition tile/host/fill.hpp:455
static constexpr int32_t masks[]
Definition tile/host/fill.hpp:458
void operator()(ForwardIter first, ForwardIter last) const
Definition tile/host/fill.hpp:471
T value_
Definition tile/host/fill.hpp:430
void operator()(ForwardIter first, ForwardIter last) const
Definition tile/host/fill.hpp:433
T init_value_
Definition tile/host/fill.hpp:357
T step_
Definition tile/host/fill.hpp:358
void operator()(ForwardIter first, ForwardIter last) const
Definition tile/host/fill.hpp:361
std::optional< uint32_t > seed_
Definition tile/host/fill.hpp:229
void operator()(ForwardIter first, ForwardIter last) const
Definition tile/host/fill.hpp:234
float variance_
Definition tile/host/fill.hpp:228
bool threaded
Definition tile/host/fill.hpp:231
float mean_
Definition tile/host/fill.hpp:227
void operator()(ForwardIter first, ForwardIter last) const
Definition tile/host/fill.hpp:335
float mean_
Definition tile/host/fill.hpp:330
float variance_
Definition tile/host/fill.hpp:331
std::optional< uint32_t > seed_
Definition tile/host/fill.hpp:332
float end_value_
Definition tile/host/fill.hpp:392
float start_value_
Definition tile/host/fill.hpp:391
float step_
Definition tile/host/fill.hpp:393
void operator()(ForwardIter first, ForwardIter last) const
Definition tile/host/fill.hpp:396
int i
Definition tile/host/fill.hpp:498
auto operator()()
Definition tile/host/fill.hpp:499
void operator()(ForwardIter first, ForwardIter last) const
Definition tile/host/fill.hpp:517
void operator()(ForwardIter first, ForwardIter last)
Definition tile/host/fill.hpp:194
std::optional< uint32_t > seed_
Definition tile/host/fill.hpp:177
float a_
Definition tile/host/fill.hpp:175
FillUniformDistribution_Unique(float a=-5.f, float b=5.f, std::optional< uint32_t > seed={11939})
Definition tile/host/fill.hpp:182
std::mt19937 gen_
Definition tile/host/fill.hpp:179
void clear()
Definition tile/host/fill.hpp:221
float b_
Definition tile/host/fill.hpp:176
std::unordered_set< impl::RawIntegerType< T > > set_
Definition tile/host/fill.hpp:180
float b_
Definition tile/host/fill.hpp:42
bool threaded
Definition tile/host/fill.hpp:46
void operator()(ForwardIter first, ForwardIter last) const
Definition tile/host/fill.hpp:49
float a_
Definition tile/host/fill.hpp:41
std::optional< uint32_t > seed_
Definition tile/host/fill.hpp:43
std::optional< uint32_t > seed_
Definition tile/host/fill.hpp:305
void operator()(ForwardIter first, ForwardIter last) const
Definition tile/host/fill.hpp:308
float b_
Definition tile/host/fill.hpp:304
float a_
Definition tile/host/fill.hpp:303
static constexpr int PackedSize
Definition tile/core/numeric/numeric.hpp:82
Definition library/utility/host_tensor.hpp:616