Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions cub/benchmarks/bench/segmented_topk/fixed/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <cub/detail/choose_offset.cuh>
#include <cub/device/dispatch/dispatch_batched_topk.cuh>

#include <cuda/__argument_>
#include <cuda/argument>
#include <cuda/iterator>

#include <nvbench_helper.cuh>
Expand Down Expand Up @@ -51,7 +51,7 @@ void fixed_seg_size_topk_keys(
const auto selected_elements = static_cast<::cuda::std::ptrdiff_t>(MaxNumSelected);
const auto num_segments = ::cuda::std::max<std::size_t>(1, (max_elements / segment_size));
const auto elements = num_segments * segment_size;
const auto total_num_items = ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(elements)};
const auto total_num_items = ::cuda::args::immediate{static_cast<::cuda::std::int64_t>(elements)};
const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));

// Skip workloads where k exceeds the segment size
Expand All @@ -68,9 +68,9 @@ void fixed_seg_size_topk_keys(
auto d_keys_in = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size);
auto d_keys_out = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_out_ptr), selected_elements);

auto segment_sizes = ::cuda::__argument::__constant<MaxSegmentSize>{};
auto k = ::cuda::__argument::__constant<MaxNumSelected>{};
auto select_direction = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
auto segment_sizes = ::cuda::args::constant<MaxSegmentSize>{};
auto k = ::cuda::args::constant<MaxNumSelected>{};
auto select_direction = ::cuda::args::constant<cub::detail::topk::select::max>{};

state.add_element_count(elements, "NumElements");
state.add_element_count(segment_size, "SegmentSize");
Expand Down Expand Up @@ -99,7 +99,7 @@ void fixed_seg_size_topk_keys(
segment_sizes,
k,
select_direction,
::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(num_segments)},
::cuda::args::immediate{static_cast<::cuda::std::int64_t>(num_segments)},
total_num_items,
env);
});
Expand Down
14 changes: 7 additions & 7 deletions cub/benchmarks/bench/segmented_topk/variable/indexed.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include <thrust/device_vector.h>
#include <thrust/reduce.h>

#include <cuda/__argument_>
#include <cuda/argument>
#include <cuda/iterator>
#include <cuda/std/cstdint>

Expand Down Expand Up @@ -36,18 +36,18 @@ void decode_style_variable_topk_indexed(
static_cast<cuda::std::int64_t>(MaxSegmentSize));
const auto input_elements = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end());
const auto output_elements = static_cast<std::size_t>(num_segments) * K;
const auto total_num_items = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(input_elements)};
const auto total_num_items = ::cuda::args::immediate{static_cast<cuda::std::int64_t>(input_elements)};

auto in_keys_buffer = gen_data<MaxSegmentSize, K>(
num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data()));
auto out_keys_buffer = thrust::device_vector<KeyT>(output_elements, thrust::no_init);
auto out_indices_buffer = thrust::device_vector<IndexT>(output_elements, thrust::no_init);

auto segment_sizes_param = ::cuda::__argument::__immediate_sequence{
thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::__argument::__bounds<1, MaxSegmentSize>()};
auto k_param = ::cuda::__argument::__constant<K>{};
auto select_direction = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
auto num_segments_param = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(num_segments)};
auto segment_sizes_param = ::cuda::args::__immediate_sequence{
thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::args::bounds<1, MaxSegmentSize>()};
auto k_param = ::cuda::args::constant<K>{};
auto select_direction = ::cuda::args::constant<cub::detail::topk::select::max>{};
auto num_segments_param = ::cuda::args::immediate{static_cast<cuda::std::int64_t>(num_segments)};

auto d_keys_in = cuda::make_strided_iterator(
cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())),
Expand Down
14 changes: 7 additions & 7 deletions cub/benchmarks/bench/segmented_topk/variable/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include <thrust/device_vector.h>
#include <thrust/reduce.h>

#include <cuda/__argument_>
#include <cuda/argument>
#include <cuda/iterator>

#include <nvbench_helper.cuh>
Expand All @@ -32,17 +32,17 @@ void decode_style_variable_topk_keys(
static_cast<cuda::std::int64_t>(MaxSegmentSize));
const auto input_elements = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end());
const auto output_elements = static_cast<std::size_t>(num_segments) * K;
const auto total_num_items = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(input_elements)};
const auto total_num_items = ::cuda::args::immediate{static_cast<cuda::std::int64_t>(input_elements)};

auto in_keys_buffer = gen_data<MaxSegmentSize, K>(
num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data()));
auto out_keys_buffer = thrust::device_vector<KeyT>(output_elements, thrust::no_init);

auto segment_sizes_param = ::cuda::__argument::__immediate_sequence{
thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::__argument::__bounds<1, MaxSegmentSize>()};
auto k_param = ::cuda::__argument::__constant<K>{};
auto select_direction = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
auto num_segments_param = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(num_segments)};
auto segment_sizes_param = ::cuda::args::__immediate_sequence{
thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::args::bounds<1, MaxSegmentSize>()};
auto k_param = ::cuda::args::constant<K>{};
auto select_direction = ::cuda::args::constant<cub::detail::topk::select::max>{};
auto num_segments_param = ::cuda::args::immediate{static_cast<cuda::std::int64_t>(num_segments)};

auto d_keys_in = cuda::make_strided_iterator(
cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())),
Expand Down
12 changes: 6 additions & 6 deletions cub/cub/agent/agent_batched_topk.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
#include <cub/device/dispatch/tuning/tuning_batched_topk.cuh>
#include <cub/util_type.cuh>

#include <cuda/__argument_>
#include <cuda/__cmath/ceil_div.h>
#include <cuda/argument>

CUB_NAMESPACE_BEGIN

Expand Down Expand Up @@ -73,8 +73,8 @@ struct agent_batched_topk_worker_per_segment
using key_t = it_value_t<key_it_t>;
using value_t = it_value_t<value_it_t>;

using segment_size_val_t = typename ::cuda::__argument::__traits<SegmentSizeParameterT>::element_type;
using num_segments_val_t = typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type;
using segment_size_val_t = typename ::cuda::args::__traits<SegmentSizeParameterT>::element_type;
using num_segments_val_t = typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type;
using counters_t = batched_topk_counters<num_segments_val_t>;

static constexpr auto policy = PolicyGetter{}();
Expand All @@ -95,7 +95,7 @@ struct agent_batched_topk_worker_per_segment
multi_worker_per_segment_policy.threads_per_block * multi_worker_per_segment_policy.items_per_thread;

// Check if there could be large segments present
static constexpr bool only_small_segments = ::cuda::__argument::__traits<SegmentSizeParameterT>::highest <= tile_size;
static constexpr bool only_small_segments = ::cuda::args::__traits<SegmentSizeParameterT>::highest <= tile_size;

// Check if we are dealing with keys-only or key-value pairs
static constexpr bool is_keys_only = ::cuda::std::is_same_v<value_t, cub::NullType>;
Expand Down Expand Up @@ -196,8 +196,8 @@ struct agent_batched_topk_worker_per_segment
return;
}

constexpr bool is_full_tile = ::cuda::__argument::__traits<SegmentSizeParameterT>::is_constant
&& ::cuda::__argument::__traits<SegmentSizeParameterT>::lowest == tile_size;
constexpr bool is_full_tile = ::cuda::args::__traits<SegmentSizeParameterT>::is_constant
&& ::cuda::args::__traits<SegmentSizeParameterT>::lowest == tile_size;

// Resolve Segment Parameters
const auto segment_size = params::get_param(segment_sizes, segment_id);
Expand Down
36 changes: 18 additions & 18 deletions cub/cub/detail/segmented_params.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# pragma system_header
#endif // no system header

#include <cuda/__argument_>
#include <cuda/argument>
#include <cuda/std/__type_traits/integral_constant.h>
#include <cuda/std/__type_traits/remove_cvref.h>
#include <cuda/std/__utility/forward.h>
Expand All @@ -33,10 +33,10 @@ namespace detail::params
//! @param[in] __index Segment index to read for sequence arguments.
//! @return The single argument value, or the sequence element at the given index.
_CCCL_TEMPLATE(class _Tp, class _SegmentIndexT)
_CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
_CCCL_REQUIRES((!::cuda::args::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(_Tp&& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
{
if constexpr (::cuda::__argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
if constexpr (::cuda::args::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
{
return __arg;
}
Expand All @@ -46,46 +46,46 @@ _CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<
}
}

template <auto _Value, class _SegmentIndexT>
template <auto _Value, class _Tp, class _SegmentIndexT>
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
get_param(const ::cuda::__argument::__constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
get_param(const ::cuda::args::constant<_Value, _Tp>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
{
return ::cuda::__argument::__unwrap(__arg);
return ::cuda::args::__unwrap(__arg);
}

template <auto _Value, class _SegmentIndexT>
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
get_param(const ::cuda::__argument::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
get_param(const ::cuda::args::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
{
return ::cuda::__argument::__unwrap(__arg)[__index];
return ::cuda::args::__unwrap(__arg)[__index];
}

template <class _Arg, class _StaticBounds, class _SegmentIndexT>
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
const ::cuda::__argument::__immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
get_param(const ::cuda::args::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
{
return ::cuda::__argument::__unwrap(__arg);
return ::cuda::args::__unwrap(__arg);
}

template <class _Arg, class _StaticBounds, class _SegmentIndexT>
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
get_param(const ::cuda::__argument::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
get_param(const ::cuda::args::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
{
return ::cuda::__argument::__unwrap(__arg)[__index];
return ::cuda::args::__unwrap(__arg)[__index];
}

template <class _Arg, class _StaticBounds, class _SegmentIndexT>
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
const ::cuda::__argument::__deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
get_param(const ::cuda::args::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
{
return ::cuda::__argument::__unwrap(__arg);
return ::cuda::args::__unwrap(__arg);
}

template <class _Arg, class _StaticBounds, class _SegmentIndexT>
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
get_param(const ::cuda::__argument::__deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
get_param(const ::cuda::args::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
{
return ::cuda::__argument::__unwrap(__arg)[__index];
return ::cuda::args::__unwrap(__arg)[__index];
}

// =====================================================================
Expand Down
28 changes: 14 additions & 14 deletions cub/cub/device/dispatch/dispatch_batched_topk.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <cuda/__argument_>
#include <cuda/__cmath/ceil_div.h>
#include <cuda/__iterator/counting_iterator.h>
#include <cuda/__iterator/transform_iterator.h>
#include <cuda/argument>
#include <cuda/std/__functional/operations.h>
#include <cuda/std/__type_traits/always_false.h>
#include <cuda/std/__type_traits/is_same.h>
Expand All @@ -50,28 +50,28 @@ namespace detail::batched_topk
// Internal: wrap the compile-time select direction into a discrete param for dispatch
// -----------------------------------------------------------------------------

// The selection direction is compile-time only: callers pass `::cuda::__argument::__constant<Dir>`, which maps to a
// The selection direction is compile-time only: callers pass `::cuda::args::constant<Dir>`, which maps to a
// value-less static_discrete_param. Because the direction is fixed at compile time and carries no runtime value, it
// can never disagree with its only supported option, so dispatch can never silently degrade to a no-op.
template <detail::topk::select Dir>
[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::__argument::__constant<Dir>)
template <detail::topk::select Dir, class _Tp>
[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::args::constant<Dir, _Tp>)
{
return params::static_discrete_param<detail::topk::select, Dir>{};
}

// The selection direction is intentionally a compile-time constant: only `::cuda::__argument::__constant<Dir>` is
// The selection direction is intentionally a compile-time constant: only `::cuda::args::constant<Dir>` is
// accepted (the overload above maps it to a value-less static_discrete_param). This catch-all documents that
// deliberate limitation and rejects anything else (e.g. a runtime `detail::topk::select` or a per-segment iterator of
// directions) with a clear diagnostic. It is an intent/documentation guard rather than a user-facing one: callers
// reach the algorithm through the min/max device entry points (DeviceBatchedTopK::{Max,Min}{Keys,Pairs}), which
// construct the matching `__constant<Dir>` internally, so `dispatch` is only ever invoked with a direction we create.
// construct the matching `constant<Dir>` internally, so `dispatch` is only ever invoked with a direction we create.
template <typename SelectDirectionT>
[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(SelectDirectionT)
{
static_assert(::cuda::std::__always_false_v<SelectDirectionT>,
"DeviceBatchedTopK currently supports only compile-time selection directions: the min/max entry "
"points (DeviceBatchedTopK::{Max,Min}{Keys,Pairs}) dispatch with a "
"::cuda::__argument::__constant<cub::detail::topk::select>; runtime or per-segment directions are "
"::cuda::args::constant<Dir>; runtime or per-segment directions are "
"intentionally not supported");
// Unreachable (the static_assert above always fires); keeps the return type well-formed so the only diagnostic is
// the message above.
Expand Down Expand Up @@ -131,7 +131,7 @@ template <typename KeyInputItItT,
typename PolicySelector = policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
it_value_t<it_value_t<ValueInputItItT>>,
::cuda::std::int64_t,
::cuda::__argument::__traits<KParameterT>::highest>>
::cuda::args::__traits<KParameterT>::highest>>
#if _CCCL_HAS_CONCEPTS()
requires batched_topk_policy_selector<PolicySelector>
#endif // _CCCL_HAS_CONCEPTS()
Expand All @@ -150,7 +150,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
cudaStream_t stream = nullptr,
[[maybe_unused]] PolicySelector policy_selector = {})
{
using large_segment_tile_offset_t = typename ::cuda::__argument::__traits<TotalNumItemsGuaranteeT>::element_type;
using large_segment_tile_offset_t = typename ::cuda::args::__traits<TotalNumItemsGuaranteeT>::element_type;

// Wrap the raw enum into the internal discrete param type
auto select_directions = wrap_select_direction(select_direction);
Expand All @@ -176,9 +176,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
static constexpr int worker_per_segment_tile_size =
worker_per_segment_policy.threads_per_block * worker_per_segment_policy.items_per_thread;
static constexpr bool any_small_segments =
::cuda::__argument::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
::cuda::args::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
static constexpr bool only_small_segments =
::cuda::__argument::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;
::cuda::args::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;

// Allocation layout:
// only_small_segments: [0] dummy.
Expand All @@ -188,7 +188,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
static constexpr int allocations_array_size = only_small_segments ? 1 : (any_small_segments ? 3 : 2);
size_t allocation_sizes[allocations_array_size] = {1};

using num_segments_val_t = typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type;
using num_segments_val_t = typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type;
using counters_t = batched_topk_counters<num_segments_val_t>;
using segment_size_scan_offset_t = detail::choose_offset_t<num_segments_val_t>;
using segment_size_scan_input_op_t =
Expand Down Expand Up @@ -244,7 +244,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(

// TODO (elstehle): support number of segments provided by device-accessible iterator
// Only uniform number of segments are supported (i.e., we need to resolve the number of segments on the host)
static_assert(::cuda::__argument::__traits<NumSegmentsParameterT>::is_single_value,
static_assert(::cuda::args::__traits<NumSegmentsParameterT>::is_single_value,
"Only uniform segment sizes are currently supported.");
Comment thread
coderabbitai[bot] marked this conversation as resolved.

if constexpr (any_small_segments)
Expand Down Expand Up @@ -346,7 +346,7 @@ template <typename KeyInputItItT,
policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
it_value_t<it_value_t<ValueInputItItT>>,
::cuda::std::int64_t,
::cuda::__argument::__traits<KParameterT>::highest>;
::cuda::args::__traits<KParameterT>::highest>;
return detail::dispatch_with_env_and_tuning<default_policy_selector>(
env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) {
return dispatch(
Expand Down
Loading
Loading