From 7eee8c0f5327f11256c12169b746317f885b653d Mon Sep 17 00:00:00 2001 From: Piotr Ciolkosz Date: Wed, 3 Jun 2026 22:57:33 -0700 Subject: [PATCH 1/6] Make argument wrappers construction public --- .../bench/segmented_topk/fixed/keys.cu | 12 +- .../bench/segmented_topk/variable/keys.cu | 14 +- cub/cub/agent/agent_batched_topk.cuh | 12 +- cub/cub/detail/segmented_params.cuh | 30 +- .../device/dispatch/dispatch_batched_topk.cuh | 26 +- .../dispatch/kernels/kernel_batched_topk.cuh | 10 +- .../catch2_test_device_segmented_topk_keys.cu | 33 +- ...catch2_test_device_segmented_topk_pairs.cu | 22 +- libcudacxx/include/cuda/__argument/argument.h | 283 +++++++++++++----- .../include/cuda/__argument/argument_bounds.h | 4 +- libcudacxx/include/cuda/argument | 26 ++ .../include/cuda/std/__internal/namespaces.h | 4 +- .../cuda/argument/argument_bounds.pass.cpp | 42 +-- .../cuda/argument/argument_traits.pass.cpp | 164 +++++----- .../cuda/argument/deferred_argument.pass.cpp | 110 +++---- .../cuda/argument/dynamic_argument.pass.cpp | 121 ++++---- .../cuda/argument/static_argument.pass.cpp | 50 ++-- .../static_bounds_conversion.fail.cpp | 6 +- .../static_bounds_type_mismatch.fail.cpp | 4 +- .../cuda/argument/usage_example.pass.cpp | 43 ++- 20 files changed, 587 insertions(+), 429 deletions(-) create mode 100644 libcudacxx/include/cuda/argument diff --git a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu index b8f13469dce..4178e7ea0d5 100644 --- a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu +++ b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include @@ -51,7 +51,7 @@ void fixed_seg_size_topk_keys( const auto selected_elements = static_cast<::cuda::std::ptrdiff_t>(MaxNumSelected); const auto num_segments = ::cuda::std::max(1, (max_elements / segment_size)); const auto elements = num_segments * segment_size; - const auto total_num_items = ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(elements)}; + const auto total_num_items = ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(elements)}; const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); // Skip workloads where k exceeds the segment size @@ -68,9 +68,9 @@ void fixed_seg_size_topk_keys( auto d_keys_in = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size); auto d_keys_out = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_out_ptr), selected_elements); - auto segment_sizes = ::cuda::__argument::__constant{}; - auto k = ::cuda::__argument::__constant{}; - auto select_direction = ::cuda::__argument::__constant{}; + auto segment_sizes = ::cuda::argument::constant{}; + auto k = ::cuda::argument::constant{}; + auto select_direction = ::cuda::argument::constant{}; state.add_element_count(elements, "NumElements"); state.add_element_count(segment_size, "SegmentSize"); @@ -99,7 +99,7 @@ void fixed_seg_size_topk_keys( segment_sizes, k, select_direction, - ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(num_segments)}, + ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(num_segments)}, total_num_items, env); }); diff --git a/cub/benchmarks/bench/segmented_topk/variable/keys.cu b/cub/benchmarks/bench/segmented_topk/variable/keys.cu index 001bbb4e258..0febcf507bd 100644 --- a/cub/benchmarks/bench/segmented_topk/variable/keys.cu +++ b/cub/benchmarks/bench/segmented_topk/variable/keys.cu @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include @@ -32,17 +32,17 @@ void decode_style_variable_topk_keys( static_cast(MaxSegmentSize)); const auto input_elements = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end()); const auto output_elements = static_cast(num_segments) * K; - const auto total_num_items = ::cuda::__argument::__immediate{static_cast(input_elements)}; + const auto total_num_items = ::cuda::argument::immediate{static_cast(input_elements)}; auto in_keys_buffer = gen_data( num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data())); auto out_keys_buffer = thrust::device_vector(output_elements, thrust::no_init); - auto segment_sizes_param = ::cuda::__argument::__immediate_sequence{ - thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::__argument::__bounds<1, MaxSegmentSize>()}; - auto k_param = ::cuda::__argument::__constant{}; - auto select_direction = ::cuda::__argument::__constant{}; - auto num_segments_param = ::cuda::__argument::__immediate{static_cast(num_segments)}; + auto segment_sizes_param = ::cuda::argument::immediate_sequence{ + thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::argument::bounds<1, MaxSegmentSize>()}; + auto k_param = ::cuda::argument::constant{}; + auto select_direction = ::cuda::argument::constant{}; + auto num_segments_param = ::cuda::argument::immediate{static_cast(num_segments)}; auto d_keys_in = cuda::make_strided_iterator( cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())), diff --git a/cub/cub/agent/agent_batched_topk.cuh b/cub/cub/agent/agent_batched_topk.cuh index c5ec20d26cd..bc10311d36d 100644 --- a/cub/cub/agent/agent_batched_topk.cuh +++ b/cub/cub/agent/agent_batched_topk.cuh @@ -23,8 +23,8 @@ #include #include -#include #include +#include CUB_NAMESPACE_BEGIN @@ -73,8 +73,8 @@ struct agent_batched_topk_worker_per_segment using key_t = it_value_t; using value_t = it_value_t; - using segment_size_val_t = typename ::cuda::__argument::__traits::element_type; - using num_segments_val_t = typename ::cuda::__argument::__traits::element_type; + using segment_size_val_t = typename ::cuda::argument::__traits::element_type; + using num_segments_val_t = typename ::cuda::argument::__traits::element_type; using counters_t = batched_topk_counters; static constexpr auto policy = PolicyGetter{}(); @@ -95,7 +95,7 @@ struct agent_batched_topk_worker_per_segment multi_worker_per_segment_policy.threads_per_block * multi_worker_per_segment_policy.items_per_thread; // Check if there could be large segments present - static constexpr bool only_small_segments = ::cuda::__argument::__traits::highest <= tile_size; + static constexpr bool only_small_segments = ::cuda::argument::__traits::highest <= tile_size; // Check if we are dealing with keys-only or key-value pairs static constexpr bool is_keys_only = ::cuda::std::is_same_v; @@ -196,8 +196,8 @@ struct agent_batched_topk_worker_per_segment return; } - constexpr bool is_full_tile = ::cuda::__argument::__traits::is_constant - && ::cuda::__argument::__traits::lowest == tile_size; + constexpr bool is_full_tile = ::cuda::argument::__traits::is_constant + && ::cuda::argument::__traits::lowest == tile_size; // Resolve Segment Parameters const auto segment_size = params::get_param(segment_sizes, segment_id); diff --git a/cub/cub/detail/segmented_params.cuh b/cub/cub/detail/segmented_params.cuh index 543f55b5036..1c4e6859d3e 100644 --- a/cub/cub/detail/segmented_params.cuh +++ b/cub/cub/detail/segmented_params.cuh @@ -13,7 +13,7 @@ # pragma system_header #endif // no system header -#include +#include #include #include #include @@ -33,10 +33,10 @@ namespace detail::params //! @param[in] __index Segment index to read for sequence arguments. //! @return The single argument value, or the sequence element at the given index. _CCCL_TEMPLATE(class _Tp, class _SegmentIndexT) -_CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) +_CCCL_REQUIRES((!::cuda::argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(_Tp&& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - if constexpr (::cuda::__argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value) + if constexpr (::cuda::argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value) { return __arg; } @@ -48,44 +48,44 @@ _CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t< template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::__argument::__constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept +get_param(const ::cuda::argument::constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg); + return ::cuda::argument::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::__argument::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::argument::constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg)[__index]; + return ::cuda::argument::__unwrap(__arg)[__index]; } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param( - const ::cuda::__argument::__immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept + const ::cuda::argument::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg); + return ::cuda::argument::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::__argument::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::argument::immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg)[__index]; + return ::cuda::argument::__unwrap(__arg)[__index]; } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param( - const ::cuda::__argument::__deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept + const ::cuda::argument::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg); + return ::cuda::argument::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::__argument::__deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::argument::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg)[__index]; + return ::cuda::argument::__unwrap(__arg)[__index]; } // ===================================================================== diff --git a/cub/cub/device/dispatch/dispatch_batched_topk.cuh b/cub/cub/device/dispatch/dispatch_batched_topk.cuh index 56d12268dc9..29d245c175e 100644 --- a/cub/cub/device/dispatch/dispatch_batched_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_batched_topk.cuh @@ -31,10 +31,10 @@ #include -#include #include #include #include +#include #include #include #include @@ -50,28 +50,28 @@ namespace detail::batched_topk // Internal: wrap the compile-time select direction into a discrete param for dispatch // ----------------------------------------------------------------------------- -// The selection direction is compile-time only: callers pass `::cuda::__argument::__constant`, which maps to a +// The selection direction is compile-time only: callers pass `::cuda::argument::constant`, which maps to a // value-less static_discrete_param. Because the direction is fixed at compile time and carries no runtime value, it // can never disagree with its only supported option, so dispatch can never silently degrade to a no-op. template -[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::__argument::__constant) +[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::argument::constant) { return params::static_discrete_param{}; } -// The selection direction is intentionally a compile-time constant: only `::cuda::__argument::__constant` is +// The selection direction is intentionally a compile-time constant: only `::cuda::argument::constant` is // accepted (the overload above maps it to a value-less static_discrete_param). This catch-all documents that // deliberate limitation and rejects anything else (e.g. a runtime `detail::topk::select` or a per-segment iterator of // directions) with a clear diagnostic. It is an intent/documentation guard rather than a user-facing one: callers // reach the algorithm through the min/max device entry points (DeviceBatchedTopK::{Max,Min}{Keys,Pairs}), which -// construct the matching `__constant` internally, so `dispatch` is only ever invoked with a direction we create. +// construct the matching `constant` internally, so `dispatch` is only ever invoked with a direction we create. template [[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(SelectDirectionT) { static_assert(::cuda::std::__always_false_v, "DeviceBatchedTopK currently supports only compile-time selection directions: the min/max entry " "points (DeviceBatchedTopK::{Max,Min}{Keys,Pairs}) dispatch with a " - "::cuda::__argument::__constant; runtime or per-segment directions are " + "::cuda::argument::constant; runtime or per-segment directions are " "intentionally not supported"); // Unreachable (the static_assert above always fires); keeps the return type well-formed so the only diagnostic is // the message above. @@ -131,7 +131,7 @@ template >, it_value_t>, ::cuda::std::int64_t, - ::cuda::__argument::__traits::highest>> + ::cuda::argument::__traits::highest>> #if _CCCL_HAS_CONCEPTS() requires batched_topk_policy_selector #endif // _CCCL_HAS_CONCEPTS() @@ -150,7 +150,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( cudaStream_t stream = nullptr, [[maybe_unused]] PolicySelector policy_selector = {}) { - using large_segment_tile_offset_t = typename ::cuda::__argument::__traits::element_type; + using large_segment_tile_offset_t = typename ::cuda::argument::__traits::element_type; // Wrap the raw enum into the internal discrete param type auto select_directions = wrap_select_direction(select_direction); @@ -176,9 +176,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( static constexpr int worker_per_segment_tile_size = worker_per_segment_policy.threads_per_block * worker_per_segment_policy.items_per_thread; static constexpr bool any_small_segments = - ::cuda::__argument::__traits::lowest <= worker_per_segment_tile_size; + ::cuda::argument::__traits::lowest <= worker_per_segment_tile_size; static constexpr bool only_small_segments = - ::cuda::__argument::__traits::highest <= worker_per_segment_tile_size; + ::cuda::argument::__traits::highest <= worker_per_segment_tile_size; // Allocation layout: // only_small_segments: [0] dummy. @@ -188,7 +188,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( static constexpr int allocations_array_size = only_small_segments ? 1 : (any_small_segments ? 3 : 2); size_t allocation_sizes[allocations_array_size] = {1}; - using num_segments_val_t = typename ::cuda::__argument::__traits::element_type; + using num_segments_val_t = typename ::cuda::argument::__traits::element_type; using counters_t = batched_topk_counters; using segment_size_scan_offset_t = detail::choose_offset_t; using segment_size_scan_input_op_t = @@ -244,7 +244,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( // TODO (elstehle): support number of segments provided by device-accessible iterator // Only uniform number of segments are supported (i.e., we need to resolve the number of segments on the host) - static_assert(::cuda::__argument::__traits::is_single_value, + static_assert(::cuda::argument::__traits::is_single_value, "Only uniform segment sizes are currently supported."); if constexpr (any_small_segments) @@ -346,7 +346,7 @@ template >, it_value_t>, ::cuda::std::int64_t, - ::cuda::__argument::__traits::highest>; + ::cuda::argument::__traits::highest>; return detail::dispatch_with_env_and_tuning( env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) { return dispatch( diff --git a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh index 1ff50dfaf67..9c904f95971 100644 --- a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh @@ -20,8 +20,8 @@ #include #include -#include #include +#include CUB_NAMESPACE_BEGIN @@ -39,7 +39,7 @@ private: worker_policy worker_per_segment_policy; multi_worker_policy multi_worker_per_segment_policy; }; - static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::__argument::__traits::highest; + static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::argument::__traits::highest; static constexpr batched_topk_policy active_policy = current_policy(); template @@ -133,8 +133,8 @@ __launch_bounds__(int( KParameterT k, SelectDirectionParameterT select_directions, NumSegmentsParameterT num_segments, - batched_topk_counters::element_type>* d_counters, - typename ::cuda::__argument::__traits::element_type* d_large_segments_ids, + batched_topk_counters::element_type>* d_counters, + typename ::cuda::argument::__traits::element_type* d_large_segments_ids, LargeSegmentTileOffsetT* d_large_segments_tile_offsets) { using agent_t = typename find_smallest_covering_policy< @@ -151,7 +151,7 @@ __launch_bounds__(int( LargeSegmentTileOffsetT>::agent_t; // Static Assertions (Constraints) - static_assert(agent_t::tile_size >= ::cuda::__argument::__traits::highest, + static_assert(agent_t::tile_size >= ::cuda::argument::__traits::highest, "Block size exceeds maximum segment size supported by SegmentSizeParameterT"); static_assert(sizeof(typename agent_t::TempStorage) <= max_smem_per_block, "Static shared memory per block must not exceed 48KB limit."); diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu index 3ef76bc2743..16f7d4322ca 100644 --- a/cub/test/catch2_test_device_segmented_topk_keys.cu +++ b/cub/test/catch2_test_device_segmented_topk_keys.cu @@ -156,11 +156,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments", batched_topk_keys( d_keys_in, d_keys_out, - ::cuda::__argument::__immediate{segment_size, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__constant{}, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_segments * segment_size}); + ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds()}, + ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, + ::cuda::argument::constant{}, + ::cuda::argument::immediate{num_segments}, + ::cuda::argument::immediate{num_segments * segment_size}); // Prepare expected results fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction); compact_sorted_keys_to_topk(expected_keys, segment_size, k); @@ -254,12 +254,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment batched_topk_keys( d_keys_in, d_keys_out, - ::cuda::__argument::__immediate_sequence{ - segment_size_it, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__constant{}, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_items}); + ::cuda::argument::immediate_sequence{ + segment_size_it, ::cuda::argument::bounds()}, + ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, + ::cuda::argument::constant{}, + ::cuda::argument::immediate{num_segments}, + ::cuda::argument::immediate{num_items}); // Verify keys are returned correctly: sort each segment of the expected input, then compact the top-k segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction); @@ -292,12 +292,11 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment batched_topk_keys( d_keys_in_it, d_keys_out_it, - ::cuda::__argument::__immediate{ - segment_size, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__constant{}, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_segments * segment_size}); + ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds()}, + ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, + ::cuda::argument::constant{}, + ::cuda::argument::immediate{num_segments}, + ::cuda::argument::immediate{num_segments * segment_size}); const int num_minus_zero = static_cast(thrust::count_if(d_keys_out.begin(), d_keys_out.end(), is_minus_zero{})); REQUIRE(num_minus_zero >= 1); diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu index b16a97b2472..49b2e48a1b6 100644 --- a/cub/test/catch2_test_device_segmented_topk_pairs.cu +++ b/cub/test/catch2_test_device_segmented_topk_pairs.cu @@ -225,11 +225,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small fixed-size segments" d_keys_out, d_values_in, d_values_out, - ::cuda::__argument::__immediate{segment_size, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__constant{}, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_segments * segment_size}); + ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds()}, + ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, + ::cuda::argument::constant{}, + ::cuda::argument::immediate{num_segments}, + ::cuda::argument::immediate{num_segments * segment_size}); // Verification: // - We verify correct top-k selection through the keys @@ -346,12 +346,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen d_keys_out, d_values_in, d_values_out, - ::cuda::__argument::__immediate_sequence{ - segment_size_it, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__constant{}, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_items}); + ::cuda::argument::immediate_sequence{ + segment_size_it, ::cuda::argument::bounds()}, + ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, + ::cuda::argument::constant{}, + ::cuda::argument::immediate{num_segments}, + ::cuda::argument::immediate{num_items}); // Verification: // - We verify correct top-k selection through the keys diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h index a1d55db5045..2df19b9ac5d 100644 --- a/libcudacxx/include/cuda/__argument/argument.h +++ b/libcudacxx/include/cuda/__argument/argument.h @@ -87,7 +87,7 @@ inline constexpr bool __is_sequence_v = || ::cuda::std::__has_random_access_traversal<_Tp>; // ===================================================================== -// __constant +// constant // ===================================================================== // Non-sequence wrappers intentionally do not reject types with a distinct element type. @@ -96,7 +96,7 @@ inline constexpr bool __is_sequence_v = //! @brief Wraps a compile-time constant argument value. template -struct __constant +struct constant { using value_type = ::cuda::std::remove_cvref_t; using __element_type = value_type; @@ -109,7 +109,7 @@ struct __constant //! @brief Wraps a compile-time constant argument sequence. template -struct __constant_sequence +struct constant_sequence { using value_type = ::cuda::std::remove_cvref_t; using __element_type = __element_type_of_t; @@ -122,15 +122,138 @@ struct __constant_sequence } }; +// __assert_in_range // ===================================================================== -// __immediate + +template +_CCCL_API constexpr void __assert_in_range([[maybe_unused]] _From __val) noexcept +{ + if constexpr (::cuda::std::__cccl_is_cv_integer_v<_To> && ::cuda::std::__cccl_is_cv_integer_v<_From>) + { + _CCCL_ASSERT(::cuda::std::in_range<::cuda::std::remove_cv_t<_To>>(__val), + "runtime bound value overflows the element type"); + } +} + +template +[[nodiscard]] _CCCL_API constexpr _To __runtime_bound_cast(_From __val) noexcept +{ + __assert_in_range<_To>(__val); + return static_cast<_To>(__val); +} + +template +_CCCL_API constexpr bool __static_bound_in_range() noexcept +{ + using _RawTo = ::cuda::std::remove_cv_t<_To>; + using _RawFrom = ::cuda::std::remove_cv_t; + + if constexpr (::cuda::std::__cccl_is_integer_v<_RawTo> && ::cuda::std::__cccl_is_integer_v<_RawFrom>) + { + return ::cuda::std::in_range<_RawTo>(_Value); + } + else if constexpr (::cuda::std::is_arithmetic_v<_RawTo> && ::cuda::std::is_arithmetic_v<_RawFrom>) + { + return static_cast<_RawFrom>(static_cast<_RawTo>(_Value)) == _Value; + } + else + { + return true; + } +} + +template +inline constexpr bool __valid_static_bounds_v = true; + +template +inline constexpr bool __valid_static_bounds_v<_ElementType, __static_bounds<_Lowest, _Highest>> = + __static_bound_in_range<_ElementType, _Lowest>() && __static_bound_in_range<_ElementType, _Highest>(); + +template +_CCCL_API constexpr _ElementType __wrapper_static_lowest() noexcept +{ + if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>) + { + return ::cuda::std::numeric_limits<_ElementType>::lowest(); + } + else + { + return static_cast<_ElementType>(_StaticBounds::lower()); + } +} + +template +_CCCL_API constexpr _ElementType __wrapper_static_highest() noexcept +{ + if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>) + { + return (::cuda::std::numeric_limits<_ElementType>::max)(); + } + else + { + return static_cast<_ElementType>(_StaticBounds::upper()); + } +} + +template +_CCCL_API constexpr _ElementType __effective_lowest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept +{ + auto __static_lowest = __wrapper_static_lowest<_ElementType, _StaticBounds>(); + return __static_lowest > __runtime_bounds.lower() ? __static_lowest : __runtime_bounds.lower(); +} + +template +_CCCL_API constexpr _ElementType __effective_highest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept +{ + auto __static_highest = __wrapper_static_highest<_ElementType, _StaticBounds>(); + return __static_highest < __runtime_bounds.upper() ? __static_highest : __runtime_bounds.upper(); +} + +template +_CCCL_API constexpr bool __has_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept +{ + return __effective_lowest<_ElementType, _StaticBounds>(__runtime_bounds) + <= __effective_highest<_ElementType, _StaticBounds>(__runtime_bounds); +} + +template +_CCCL_API constexpr void __validate_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept +{ + static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>, + "static argument bounds cannot be represented by the element type"); + _CCCL_VERIFY((__has_bounds_intersection<_ElementType, _StaticBounds>(__runtime_bounds)), + "static and runtime argument bounds do not intersect"); +} + +template +_CCCL_API constexpr void __validate_static_element_bounds([[maybe_unused]] const _ElementType& __val) noexcept +{ + if constexpr (!::cuda::std::is_same_v<_StaticBounds, __no_bounds>) + { + _CCCL_ASSERT((__val >= __wrapper_static_lowest<_ElementType, _StaticBounds>()), + "immediate argument value is below static lowest bound"); + _CCCL_ASSERT((__val <= __wrapper_static_highest<_ElementType, _StaticBounds>()), + "immediate argument value is above static highest bound"); + } +} + +template +_CCCL_API constexpr void __validate_runtime_element_bounds( + [[maybe_unused]] const _ElementType& __val, [[maybe_unused]] __runtime_bounds<_ElementType> __runtime_bounds) noexcept +{ + _CCCL_ASSERT((__val >= __runtime_bounds.lower()), "immediate argument value is below runtime lower bound"); + _CCCL_ASSERT((__val <= __runtime_bounds.upper()), "immediate argument value is above runtime upper bound"); +} + +// ===================================================================== +// immediate // ===================================================================== //! @brief Wraps a runtime argument value with optional bounds. //! //! The value is host-accessible at API call time. template -struct __immediate +struct immediate { using __element_type = __element_type_of_t<_Arg>; @@ -150,13 +273,13 @@ struct __immediate } public: - _CCCL_API constexpr __immediate(_Arg __arg) noexcept + _CCCL_API constexpr immediate(_Arg __arg) noexcept : __arg_{::cuda::std::move(__arg)} { __validate_value(); } - _CCCL_API constexpr __immediate(_Arg __arg, _StaticBounds) noexcept + _CCCL_API constexpr immediate(_Arg __arg, _StaticBounds) noexcept : __arg_{::cuda::std::move(__arg)} { __validate_value(); @@ -165,18 +288,18 @@ struct __immediate #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __immediate(_Arg, __static_bounds<_Lowest, _Highest>) - -> __immediate<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate(_Arg, __static_bounds<_Lowest, _Highest>) + -> immediate<_Arg, __static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== -// __immediate_sequence +// immediate_sequence // ===================================================================== //! @brief Wraps a runtime argument sequence with optional bounds. template -struct __immediate_sequence +struct immediate_sequence { using __element_type = __element_type_of_t<_Arg>; @@ -215,14 +338,14 @@ struct __immediate_sequence } public: - _CCCL_API constexpr __immediate_sequence(_Arg __arg) noexcept + _CCCL_API constexpr immediate_sequence(_Arg __arg) noexcept : __arg_{::cuda::std::move(__arg)} { __validate_bounds(); __validate_value(); } - _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds) noexcept + _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds) noexcept : __arg_{::cuda::std::move(__arg)} { __validate_bounds(); @@ -230,7 +353,7 @@ struct __immediate_sequence } template - _CCCL_API constexpr __immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -240,7 +363,7 @@ struct __immediate_sequence } template - _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -250,27 +373,27 @@ struct __immediate_sequence } template - _CCCL_API constexpr __immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept - : __immediate_sequence(::cuda::std::move(__arg), __sb, __rb) + _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept + : immediate_sequence(::cuda::std::move(__arg), __sb, __rb) {} }; #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>) - -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>) + -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) + -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) + -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== -// __deferred_base / __deferred / __deferred_sequence +// __deferred_base / deferred / deferred_sequence // ===================================================================== //! @brief Common base for deferred argument wrappers. @@ -324,57 +447,57 @@ struct __deferred_base //! @brief Wraps a reference to a single value that is potentially not available at API call time but will be available //! by the time the argument is consumed in stream order. template -struct __deferred : __deferred_base<_Arg, _StaticBounds> +struct deferred : __deferred_base<_Arg, _StaticBounds> { using __deferred_base<_Arg, _StaticBounds>::__deferred_base; }; #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __deferred(_Arg) -> __deferred<_Arg>; +_CCCL_HOST_DEVICE deferred(_Arg) -> deferred<_Arg>; template -_CCCL_HOST_DEVICE __deferred(_Arg, __static_bounds<_Lowest, _Highest>) - -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>) + -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __deferred(_Arg, __runtime_bounds<_Tp>) -> __deferred<_Arg>; +_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>) -> deferred<_Arg>; template -_CCCL_HOST_DEVICE __deferred(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) + -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __deferred(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) + -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED //! @brief Wraps a reference to a sequence of values that is potentially not available at API call time but will be //! available by the time the argument is consumed in stream order. template -struct __deferred_sequence : __deferred_base<_Arg, _StaticBounds> +struct deferred_sequence : __deferred_base<_Arg, _StaticBounds> { using __deferred_base<_Arg, _StaticBounds>::__deferred_base; }; #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg) -> __deferred_sequence<_Arg>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg) -> deferred_sequence<_Arg>; template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>) - -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>) + -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>) -> __deferred_sequence<_Arg>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>) -> deferred_sequence<_Arg>; template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) + -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) + -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== @@ -384,17 +507,17 @@ _CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_boun template inline constexpr bool __is_wrapper_v = false; template -inline constexpr bool __is_wrapper_v<__immediate<_Arg, _StaticBounds>> = true; +inline constexpr bool __is_wrapper_v> = true; template -inline constexpr bool __is_wrapper_v<__constant<_Value>> = true; +inline constexpr bool __is_wrapper_v> = true; template -inline constexpr bool __is_wrapper_v<__constant_sequence<_Value>> = true; +inline constexpr bool __is_wrapper_v> = true; template -inline constexpr bool __is_wrapper_v<__immediate_sequence<_Arg, _StaticBounds>> = true; +inline constexpr bool __is_wrapper_v> = true; template -inline constexpr bool __is_wrapper_v<__deferred<_Arg, _StaticBounds>> = true; +inline constexpr bool __is_wrapper_v> = true; template -inline constexpr bool __is_wrapper_v<__deferred_sequence<_Arg, _StaticBounds>> = true; +inline constexpr bool __is_wrapper_v> = true; _CCCL_TEMPLATE(class _Tp) _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) @@ -404,87 +527,87 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) } template -[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__immediate<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __immediate<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__immediate<_Arg, _StaticBounds>&& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate<_Arg, _StaticBounds>&& __arg) noexcept { return ::cuda::std::move(__arg.__arg_); } template [[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t -__unwrap(const __constant<_Value>&) noexcept +__unwrap(const constant<_Value>&) noexcept { return _Value; } template [[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t -__unwrap(const __constant_sequence<_Value>&) noexcept +__unwrap(const constant_sequence<_Value>&) noexcept { return _Value; } template -[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept { return ::cuda::std::move(__arg.__arg_); } template -[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__deferred<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __deferred<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__deferred<_Arg, _StaticBounds>&& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred<_Arg, _StaticBounds>&& __arg) noexcept { return ::cuda::std::move(__arg.__arg_); } template -[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__deferred_sequence<_Arg, _StaticBounds>&& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred_sequence<_Arg, _StaticBounds>&& __arg) noexcept { return ::cuda::std::move(__arg.__arg_); } @@ -550,7 +673,7 @@ struct __traits_impl }; template -struct __traits_impl<__constant<_Value>> +struct __traits_impl> { using value_type = ::cuda::std::remove_cvref_t; using element_type = value_type; @@ -562,7 +685,7 @@ struct __traits_impl<__constant<_Value>> }; template -struct __traits_impl<__immediate<_Arg, _StaticBounds>> +struct __traits_impl> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; @@ -577,7 +700,7 @@ struct __traits_impl<__immediate<_Arg, _StaticBounds>> }; template -struct __traits_impl<__constant_sequence<_Value>> +struct __traits_impl> { using value_type = ::cuda::std::remove_cvref_t; using element_type = __element_type_of_t; @@ -590,7 +713,7 @@ struct __traits_impl<__constant_sequence<_Value>> }; template -struct __traits_impl<__immediate_sequence<_Arg, _StaticBounds>> +struct __traits_impl> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; @@ -606,7 +729,7 @@ struct __traits_impl<__immediate_sequence<_Arg, _StaticBounds>> }; template -struct __traits_impl<__deferred<_Arg, _StaticBounds>> +struct __traits_impl> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; @@ -621,7 +744,7 @@ struct __traits_impl<__deferred<_Arg, _StaticBounds>> }; template -struct __traits_impl<__deferred_sequence<_Arg, _StaticBounds>> +struct __traits_impl> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; @@ -652,25 +775,25 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) ) } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__constant<_Value>) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant<_Value>) noexcept { return __constant_compute_lowest<_Value>(); } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__constant_sequence<_Value>) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant_sequence<_Value>) noexcept { return __constant_sequence_compute_lowest<_Value>(); } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__immediate<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate<_Arg, _StaticBounds> __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept { using _ET = __element_type_of_t<_Arg>; __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); @@ -678,7 +801,7 @@ template } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__deferred<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred<_Arg, _StaticBounds> __arg) noexcept { using _ET = __element_type_of_t<_Arg>; __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); @@ -686,7 +809,7 @@ template } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__deferred_sequence<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept { using _ET = __element_type_of_t<_Arg>; __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); @@ -702,25 +825,25 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) ) } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__constant<_Value>) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(constant<_Value>) noexcept { return __constant_compute_highest<_Value>(); } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__constant_sequence<_Value>) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(constant_sequence<_Value>) noexcept { return __constant_sequence_compute_highest<_Value>(); } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__immediate<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(immediate<_Arg, _StaticBounds> __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept { using _ET = __element_type_of_t<_Arg>; __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); @@ -728,7 +851,7 @@ template } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__deferred<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(deferred<_Arg, _StaticBounds> __arg) noexcept { using _ET = __element_type_of_t<_Arg>; __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); @@ -736,7 +859,7 @@ template } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__deferred_sequence<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept { using _ET = __element_type_of_t<_Arg>; __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); diff --git a/libcudacxx/include/cuda/__argument/argument_bounds.h b/libcudacxx/include/cuda/__argument/argument_bounds.h index 46dd22c370c..3e70b4d1893 100644 --- a/libcudacxx/include/cuda/__argument/argument_bounds.h +++ b/libcudacxx/include/cuda/__argument/argument_bounds.h @@ -122,7 +122,7 @@ inline constexpr bool __is_runtime_bounds_v<__runtime_bounds<_Tp>> = true; //! @tparam _Upper The static upper bound. //! @return A compile-time bounds object. template -[[nodiscard]] _CCCL_API constexpr __static_bounds<_Lower, _Upper> __bounds() noexcept +[[nodiscard]] _CCCL_API constexpr __static_bounds<_Lower, _Upper> bounds() noexcept { return {}; } @@ -133,7 +133,7 @@ template //! @param __upper The runtime upper bound. //! @return A runtime bounds object. template -[[nodiscard]] _CCCL_API constexpr __runtime_bounds<_Tp> __bounds(_Tp __lower, _Tp __upper) noexcept +[[nodiscard]] _CCCL_API constexpr __runtime_bounds<_Tp> bounds(_Tp __lower, _Tp __upper) noexcept { return {__lower, __upper}; } diff --git a/libcudacxx/include/cuda/argument b/libcudacxx/include/cuda/argument new file mode 100644 index 00000000000..bcb079e7e5e --- /dev/null +++ b/libcudacxx/include/cuda/argument @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_ARGUMENT_ +#define _CUDA_ARGUMENT_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#endif // _CUDA_ARGUMENT_ diff --git a/libcudacxx/include/cuda/std/__internal/namespaces.h b/libcudacxx/include/cuda/std/__internal/namespaces.h index 325830c17cf..f3cc191adfc 100644 --- a/libcudacxx/include/cuda/std/__internal/namespaces.h +++ b/libcudacxx/include/cuda/std/__internal/namespaces.h @@ -115,8 +115,8 @@ #define _CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION _CCCL_BEGIN_NAMESPACE(cuda::execution) #define _CCCL_END_NAMESPACE_CUDA_EXECUTION _CCCL_END_NAMESPACE(cuda::execution) -#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::__argument) -#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT _CCCL_END_NAMESPACE(cuda::__argument) +#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::argument) +#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT _CCCL_END_NAMESPACE(cuda::argument) // Namespace to avoid name collisions with CPOs on clang-16 (see // https://godbolt.org/z/9TadonrdM for example). MSVC's ancient parser also gets confused with diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp index d7f394a0d74..9a5717f32c3 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include @@ -20,27 +20,27 @@ TEST_FUNC constexpr bool test() // Basic static bounds { - constexpr auto b = cuda::__argument::__static_bounds<1, 4096>{}; + constexpr auto b = cuda::argument::__static_bounds<1, 4096>{}; static_assert(b.lower() == 1); static_assert(b.upper() == 4096); } // Exact static bounds { - constexpr auto b = cuda::__argument::__static_bounds<42, 42>{}; + constexpr auto b = cuda::argument::__static_bounds<42, 42>{}; static_assert(b.lower() == 42); static_assert(b.upper() == 42); } // Long type deduced from NTTPs { - static_assert(cuda::std::is_same_v::lower()), long>); + static_assert(cuda::std::is_same_v::lower()), long>); } #if TEST_HAS_CLASS_NTTP // Static bounds preserve their original NTTP types { - constexpr auto b = cuda::__argument::__bounds<1.0f, 8.0f>(); + constexpr auto b = cuda::argument::bounds<1.0f, 8.0f>(); static_assert(b.lower() == 1.0f); static_assert(b.upper() == 8); static_assert(cuda::std::is_same_v); @@ -52,9 +52,13 @@ TEST_FUNC constexpr bool test() // Basic runtime bounds { - auto b = cuda::__argument::__runtime_bounds{10, 100}; + auto b = cuda::argument::__runtime_bounds{10, 100}; assert(b.lower() == 10); assert(b.upper() == 100); + assert(b.__lower_ == 10); + assert(b.__upper_ == 100); + b.__upper_ = 90; + assert(b.upper() == 90); static_assert(cuda::std::is_same_v); } @@ -62,35 +66,35 @@ TEST_FUNC constexpr bool test() // Static via factory { - constexpr auto b = cuda::__argument::__bounds<1, 8>(); + constexpr auto b = cuda::argument::bounds<1, 8>(); static_assert(b.lower() == 1); static_assert(b.upper() == 8); - static_assert(cuda::__argument::__is_static_bounds_cv_v); - static_assert(!cuda::__argument::__is_runtime_bounds_cv_v); - static_assert(cuda::__argument::__is_bounds_v); + static_assert(cuda::argument::__is_static_bounds_cv_v); + static_assert(!cuda::argument::__is_runtime_bounds_cv_v); + static_assert(cuda::argument::__is_bounds_v); } // Runtime via factory { - auto b = cuda::__argument::__bounds(10, 100); + auto b = cuda::argument::bounds(10, 100); assert(b.lower() == 10); assert(b.upper() == 100); - static_assert(!cuda::__argument::__is_static_bounds_cv_v); - static_assert(cuda::__argument::__is_runtime_bounds_cv_v); - static_assert(cuda::__argument::__is_bounds_v); + static_assert(!cuda::argument::__is_static_bounds_cv_v); + static_assert(cuda::argument::__is_runtime_bounds_cv_v); + static_assert(cuda::argument::__is_bounds_v); } // Static and runtime bounds intersection { - static_assert(cuda::__argument::__valid_argument_bounds>( - cuda::__argument::__runtime_bounds{50, 200})); - static_assert(!cuda::__argument::__valid_argument_bounds>( - cuda::__argument::__runtime_bounds{0, 50})); + static_assert(cuda::argument::__has_bounds_intersection>( + cuda::argument::__runtime_bounds{50, 200})); + static_assert(!cuda::argument::__has_bounds_intersection>( + cuda::argument::__runtime_bounds{0, 50})); } // Non-bounds type { - static_assert(!cuda::__argument::__is_bounds_v); + static_assert(!cuda::argument::__is_bounds_v); } return true; diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp index 13753040f08..517893abb7d 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include #include @@ -54,138 +54,132 @@ struct non_sequence_value TEST_FUNC void test() { - // --- __is_sequence_v / __is_single_value_v --- + // --- __is_sequence_v --- // builtin and class type are not sequences - static_assert(!cuda::__argument::__is_sequence_v); - static_assert(!cuda::__argument::__is_sequence_v); - static_assert(!cuda::__argument::__is_sequence_v); - static_assert(!cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); + static_assert(!cuda::argument::__is_sequence_v); + static_assert(!cuda::argument::__is_sequence_v); + static_assert(!cuda::argument::__is_sequence_v); + static_assert(!cuda::argument::__is_sequence_v>); + static_assert(!cuda::argument::__is_sequence_v>); + static_assert(!cuda::argument::__is_sequence_v>); + static_assert(!cuda::argument::__is_sequence_v>); + static_assert(!cuda::argument::__is_sequence_v>); + static_assert(!cuda::argument::__is_sequence_v>); + static_assert(!cuda::argument::__is_sequence_v>); + static_assert(!cuda::argument::__is_sequence_v>); // iterators and pointers can be sequences if they are at least random access - static_assert(cuda::__argument::__is_sequence_v); - static_assert(cuda::__argument::__is_sequence_v); - static_assert(cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); + static_assert(cuda::argument::__is_sequence_v); + static_assert(cuda::argument::__is_sequence_v); + static_assert(cuda::argument::__is_sequence_v>); + static_assert(!cuda::argument::__is_sequence_v>); // ranges and arrays are sequences - static_assert(cuda::__argument::__is_sequence_v); - static_assert(cuda::__argument::__is_sequence_v); - static_assert(cuda::__argument::__is_sequence_v); - static_assert(cuda::__argument::__is_sequence_v); - static_assert(cuda::__argument::__is_sequence_v>); - static_assert(cuda::__argument::__is_sequence_v&>); - static_assert(cuda::__argument::__is_sequence_v>); - static_assert(cuda::__argument::__is_sequence_v>); + static_assert(cuda::argument::__is_sequence_v); + static_assert(cuda::argument::__is_sequence_v); + static_assert(cuda::argument::__is_sequence_v); + static_assert(cuda::argument::__is_sequence_v); + static_assert(cuda::argument::__is_sequence_v>); + static_assert(cuda::argument::__is_sequence_v&>); + static_assert(cuda::argument::__is_sequence_v>); + static_assert(cuda::argument::__is_sequence_v>); // --- __element_type_of_t --- - static_assert(cuda::std::is_same_v&>, int>); - static_assert(cuda::std::is_same_v, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v&>, int>); + static_assert(cuda::std::is_same_v, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); static_assert( - cuda::std::is_same_v>>, + cuda::std::is_same_v>>, int>); - static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); // --- argument_traits: is_deferred --- - static_assert(!cuda::__argument::__traits::is_deferred); - static_assert(!cuda::__argument::__traits>::is_deferred); - static_assert(!cuda::__argument::__traits>>::is_deferred); - static_assert(!cuda::__argument::__traits>::is_deferred); + static_assert(!cuda::argument::__traits::is_deferred); + static_assert(!cuda::argument::__traits>::is_deferred); + static_assert(!cuda::argument::__traits>>::is_deferred); + static_assert(!cuda::argument::__traits>::is_deferred); #if TEST_HAS_CLASS_NTTP static_assert( - !cuda::__argument::__traits{1, 2, 3}>>::is_deferred); + !cuda::argument::__traits{1, 2, 3}>>::is_deferred); #endif // TEST_HAS_CLASS_NTTP - static_assert(cuda::__argument::__traits>>::is_deferred); - static_assert(cuda::__argument::__traits>>::is_deferred); + static_assert(cuda::argument::__traits>>::is_deferred); + static_assert(cuda::argument::__traits>>::is_deferred); // --- argument_traits: is_single_value --- - static_assert(cuda::__argument::__traits::is_single_value); - static_assert(cuda::__argument::__traits::is_single_value); - static_assert(cuda::__argument::__traits>::is_single_value); - static_assert(cuda::__argument::__traits>::is_single_value); - static_assert( - cuda::__argument::__traits>>::is_single_value); - static_assert( - !cuda::__argument::__traits>>::is_single_value); - static_assert(cuda::__argument::__traits>::is_single_value); + static_assert(cuda::argument::__traits::is_single_value); + static_assert(cuda::argument::__traits::is_single_value); + static_assert(cuda::argument::__traits>::is_single_value); + static_assert(cuda::argument::__traits>::is_single_value); + static_assert(cuda::argument::__traits>>::is_single_value); + static_assert(!cuda::argument::__traits>>::is_single_value); + static_assert(cuda::argument::__traits>::is_single_value); #if TEST_HAS_CLASS_NTTP - static_assert(!cuda::__argument::__traits< - cuda::__argument::__constant_sequence{1, 2, 3}>>::is_single_value); -#endif // TEST_HAS_CLASS_NTTP - static_assert(cuda::__argument::__traits>::is_single_value); static_assert( - !cuda::__argument::__traits>>::is_single_value); + !cuda::argument::__traits{1, 2, 3}>>::is_single_value); +#endif // TEST_HAS_CLASS_NTTP + static_assert(cuda::argument::__traits>::is_single_value); + static_assert(!cuda::argument::__traits>>::is_single_value); // --- argument_traits: value_type --- - static_assert(cuda::std::is_same_v::value_type, int>); - static_assert(cuda::std::is_same_v>::value_type, int>); - static_assert(cuda::std::is_same_v< - cuda::__argument::__traits>>::value_type, - cuda::std::span>); - static_assert(cuda::std::is_same_v>::value_type, int>); + static_assert(cuda::std::is_same_v::value_type, int>); + static_assert(cuda::std::is_same_v>::value_type, int>); + static_assert( + cuda::std::is_same_v>>::value_type, + cuda::std::span>); + static_assert(cuda::std::is_same_v>::value_type, int>); #if TEST_HAS_CLASS_NTTP static_assert( cuda::std::is_same_v< - cuda::__argument::__traits{1, 2, 3}>>::value_type, + cuda::argument::__traits{1, 2, 3}>>::value_type, cuda::std::array>); #endif // TEST_HAS_CLASS_NTTP // --- argument_traits: lowest / highest --- - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); static_assert( - cuda::__argument::__traits>>::lowest - == 1); + cuda::argument::__traits>>::lowest == 1); static_assert( - cuda::__argument::__traits>&>::highest - == 8); + cuda::argument::__traits>&>::highest == 8); static_assert( - cuda::__argument::__traits< - cuda::__argument::__immediate_sequence, cuda::__argument::__static_bounds<1, 8>>>::highest + cuda::argument::__traits< + cuda::argument::immediate_sequence, cuda::argument::__static_bounds<1, 8>>>::highest == 8); #if TEST_HAS_CLASS_NTTP static_assert( - cuda::__argument::__traits{3, 1, 2}>>::lowest == 1); + cuda::argument::__traits{3, 1, 2}>>::lowest == 1); static_assert( - cuda::__argument::__traits{3, 1, 2}>>::highest == 3); + cuda::argument::__traits{3, 1, 2}>>::highest == 3); #endif // TEST_HAS_CLASS_NTTP // --- Free function bounds on plain values --- - static_assert(cuda::__argument::__lowest_(42) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__highest_(42) == (cuda::std::numeric_limits::max)()); - static_assert(cuda::__argument::__lowest_(1.0f) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__highest_(1.0f) == (cuda::std::numeric_limits::max)()); + static_assert(cuda::argument::__lowest_(42) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__highest_(42) == (cuda::std::numeric_limits::max)()); + static_assert(cuda::argument::__lowest_(1.0f) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__highest_(1.0f) == (cuda::std::numeric_limits::max)()); // --- Scalar and sequence wrappers expose distinct single-value traits --- - static_assert(cuda::__argument::__traits>::is_single_value); - static_assert(cuda::__argument::__traits>::is_single_value); - static_assert( - !cuda::__argument::__traits>>::is_single_value); + static_assert(cuda::argument::__traits>::is_single_value); + static_assert(cuda::argument::__traits>::is_single_value); + static_assert(!cuda::argument::__traits>>::is_single_value); #if TEST_HAS_CLASS_NTTP - static_assert(!cuda::__argument::__traits< - cuda::__argument::__constant_sequence{1, 2, 3}>>::is_single_value); + static_assert( + !cuda::argument::__traits{1, 2, 3}>>::is_single_value); #endif // TEST_HAS_CLASS_NTTP } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp index 9949e0013b4..f9bda950aa7 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include #include @@ -22,102 +22,110 @@ TEST_FUNC constexpr bool test() // Deferred single value via span { int val = 42; - auto def = cuda::__argument::__deferred{cuda::std::span{&val, 1}}; - assert(cuda::__argument::__unwrap(def)[0] == 42); - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); + auto def = cuda::argument::deferred{cuda::std::span{&val, 1}}; + assert(cuda::argument::__unwrap(def)[0] == 42); + assert(def.__arg_[0] == 42); + static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); } // Deferred single value with static bounds { int val = 42; - auto def = cuda::__argument::__deferred{cuda::std::span{&val, 1}, cuda::__argument::__bounds<1, 1000>()}; - assert(cuda::__argument::__unwrap(def)[0] == 42); - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 1000); + auto def = cuda::argument::deferred{cuda::std::span{&val, 1}, cuda::argument::bounds<1, 1000>()}; + assert(cuda::argument::__unwrap(def)[0] == 42); + static_assert(cuda::argument::__traits::lowest == 1); + static_assert(cuda::argument::__traits::highest == 1000); } // Deferred single value via pointer { int val = 42; - using def_t = cuda::__argument::__deferred>; - static_assert(cuda::__argument::__traits::lowest == 0); - static_assert(cuda::__argument::__traits::highest == 100); + using def_t = cuda::argument::deferred>; + static_assert(cuda::argument::__traits::lowest == 0); + static_assert(cuda::argument::__traits::highest == 100); // Also verify construction works - auto def = cuda::__argument::__deferred{&val, cuda::__argument::__bounds<0, 100>()}; - assert(cuda::__argument::__unwrap(def) == &val); + auto def = cuda::argument::deferred{&val, cuda::argument::bounds<0, 100>()}; + assert(cuda::argument::__unwrap(def) == &val); } // Deferred single value via fancy iterator { auto it = cuda::counting_iterator{42}; - auto def = cuda::__argument::__deferred{it, cuda::__argument::__bounds<0, 100>()}; - assert(cuda::__argument::__unwrap(def)[0] == 42); - static_assert(cuda::__argument::__traits::lowest == 0); - static_assert(cuda::__argument::__traits::highest == 100); - static_assert(cuda::__argument::__traits::is_single_value); + auto def = cuda::argument::deferred{it, cuda::argument::bounds<0, 100>()}; + assert(cuda::argument::__unwrap(def)[0] == 42); + static_assert(cuda::argument::__traits::lowest == 0); + static_assert(cuda::argument::__traits::highest == 100); + static_assert(cuda::argument::__traits::is_single_value); } // Deferred single value with both bounds, runtime bounds first { int val = 42; - auto def = cuda::__argument::__deferred{ - cuda::std::span{&val, 1}, cuda::__argument::__bounds(5, 100), cuda::__argument::__bounds<1, 256>()}; - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 256); - assert(cuda::__argument::__lowest_(def) == 5); - assert(cuda::__argument::__highest_(def) == 100); + auto def = cuda::argument::deferred{ + cuda::std::span{&val, 1}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 256>()}; + static_assert(cuda::argument::__traits::lowest == 1); + static_assert(cuda::argument::__traits::highest == 256); + assert(def.__runtime_bounds_.__lower_ == 5); + assert(def.__runtime_bounds_.__upper_ == 100); + assert(cuda::argument::__lowest_(def) == 5); + assert(cuda::argument::__highest_(def) == 100); + def.__runtime_bounds_.__upper_ = 90; + assert(cuda::argument::__highest_(def) == 90); } // Deferred sequence via fancy iterator { auto it = cuda::counting_iterator{10}; - auto def = cuda::__argument::__deferred_sequence{it, cuda::__argument::__bounds<0, 100>()}; - assert(cuda::__argument::__unwrap(def)[0] == 10); - assert(cuda::__argument::__unwrap(def)[2] == 12); - static_assert(cuda::__argument::__traits::lowest == 0); - static_assert(cuda::__argument::__traits::highest == 100); - static_assert(!cuda::__argument::__traits::is_single_value); + auto def = cuda::argument::deferred_sequence{it, cuda::argument::bounds<0, 100>()}; + assert(cuda::argument::__unwrap(def)[0] == 10); + assert(cuda::argument::__unwrap(def)[2] == 12); + static_assert(cuda::argument::__traits::lowest == 0); + static_assert(cuda::argument::__traits::highest == 100); + static_assert(!cuda::argument::__traits::is_single_value); } // Deferred sequence with both bounds { int arr[4] = {10, 20, 30, 40}; - auto def = cuda::__argument::__deferred_sequence{ - cuda::std::span{arr, 4}, cuda::__argument::__bounds<1, 4096>(), cuda::__argument::__bounds(5, 100)}; - static_assert(cuda::__argument::__traits::lowest == 1); - assert(cuda::__argument::__lowest_(def) == 5); - assert(cuda::__argument::__highest_(def) == 100); + auto def = cuda::argument::deferred_sequence{ + cuda::std::span{arr, 4}, cuda::argument::bounds<1, 4096>(), cuda::argument::bounds(5, 100)}; + assert(def.__arg_.size() == 4); + assert(def.__runtime_bounds_.__lower_ == 5); + assert(def.__runtime_bounds_.__upper_ == 100); + static_assert(cuda::argument::__traits::lowest == 1); + assert(cuda::argument::__lowest_(def) == 5); + assert(cuda::argument::__highest_(def) == 100); } // Deferred sequence with both bounds, runtime bounds first { int arr[4] = {10, 20, 30, 40}; - auto def = cuda::__argument::__deferred_sequence{ - cuda::std::span{arr, 4}, cuda::__argument::__bounds(5, 100), cuda::__argument::__bounds<1, 4096>()}; - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 4096); - assert(cuda::__argument::__lowest_(def) == 5); - assert(cuda::__argument::__highest_(def) == 100); + auto def = cuda::argument::deferred_sequence{ + cuda::std::span{arr, 4}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 4096>()}; + static_assert(cuda::argument::__traits::lowest == 1); + static_assert(cuda::argument::__traits::highest == 4096); + assert(cuda::argument::__lowest_(def) == 5); + assert(cuda::argument::__highest_(def) == 100); } // Traits: deferred is single value { - using traits = cuda::__argument::__traits>>; + using traits = cuda::argument::__traits>>; static_assert(traits::is_deferred); static_assert(traits::is_single_value); } // Traits: deferred with pointer is also single value { - using traits = cuda::__argument::__traits>; + using traits = cuda::argument::__traits>; static_assert(traits::is_deferred); static_assert(traits::is_single_value); } // Traits: deferred_sequence is not single value { - using traits = cuda::__argument::__traits>>; + using traits = cuda::argument::__traits>>; static_assert(traits::is_deferred); static_assert(!traits::is_single_value); } @@ -125,16 +133,16 @@ TEST_FUNC constexpr bool test() // Unwrap: deferred { int val = 99; - auto def = cuda::__argument::__deferred{cuda::std::span{&val, 1}}; - auto& v = cuda::__argument::__unwrap(def); + auto def = cuda::argument::deferred{cuda::std::span{&val, 1}}; + auto& v = cuda::argument::__unwrap(def); assert(v[0] == 99); } // Unwrap: deferred_sequence { int arr[3] = {10, 20, 30}; - auto def = cuda::__argument::__deferred_sequence{cuda::std::span{arr, 3}}; - const auto& v = cuda::__argument::__unwrap(def); + auto def = cuda::argument::deferred_sequence{cuda::std::span{arr, 3}}; + const auto& v = cuda::argument::__unwrap(def); assert(v.size() == 3); assert(v[1] == 20); } @@ -142,14 +150,14 @@ TEST_FUNC constexpr bool test() // Unwrap: rvalue deferred returns by value { int val = 99; - auto v = cuda::__argument::__unwrap(cuda::__argument::__deferred{cuda::std::span{&val, 1}}); + auto v = cuda::argument::__unwrap(cuda::argument::deferred{cuda::std::span{&val, 1}}); assert(v[0] == 99); } // Unwrap: rvalue deferred_sequence returns by value { int arr[3] = {10, 20, 30}; - auto v = cuda::__argument::__unwrap(cuda::__argument::__deferred_sequence{cuda::std::span{arr, 3}}); + auto v = cuda::argument::__unwrap(cuda::argument::deferred_sequence{cuda::std::span{arr, 3}}); assert(v.size() == 3); assert(v[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp index c9723304774..e969b9bd234 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include #include @@ -25,93 +25,99 @@ TEST_FUNC constexpr bool test() { // Uniform scalar via CTAD { - auto da = cuda::__argument::__immediate{5}; - assert(cuda::__argument::__unwrap(da) == 5); - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); - assert(cuda::__argument::__lowest_(da) == 5); - assert(cuda::__argument::__highest_(da) == 5); + auto da = cuda::argument::immediate{5}; + assert(cuda::argument::__unwrap(da) == 5); + assert(da.__arg_ == 5); + static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); + assert(cuda::argument::__lowest_(da) == 5); + assert(cuda::argument::__highest_(da) == 5); + da.__arg_ = 6; + assert(cuda::argument::__unwrap(da) == 6); } // Uniform scalar with static bounds { - auto da = cuda::__argument::__immediate{5, cuda::__argument::__bounds<1, 8>()}; - assert(cuda::__argument::__unwrap(da) == 5); - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 8); - assert(cuda::__argument::__lowest_(da) == 5); - assert(cuda::__argument::__highest_(da) == 5); + auto da = cuda::argument::immediate{5, cuda::argument::bounds<1, 8>()}; + assert(cuda::argument::__unwrap(da) == 5); + static_assert(cuda::argument::__traits::lowest == 1); + static_assert(cuda::argument::__traits::highest == 8); + assert(cuda::argument::__lowest_(da) == 5); + assert(cuda::argument::__highest_(da) == 5); } // Non-sequence values are accepted without scalar-only restrictions { - auto da = cuda::__argument::__immediate{non_sequence_value{7}}; - assert(cuda::__argument::__unwrap(da).payload == 7); + auto da = cuda::argument::immediate{non_sequence_value{7}}; + assert(cuda::argument::__unwrap(da).payload == 7); } // Pointer-like types can still represent a single value when explicitly wrapped that way { int value = 11; - auto da = cuda::__argument::__immediate{&value}; - static_assert(cuda::__argument::__traits::is_single_value); - assert(*cuda::__argument::__unwrap(da) == 11); + auto da = cuda::argument::immediate{&value}; + static_assert(cuda::argument::__traits::is_single_value); + assert(*cuda::argument::__unwrap(da) == 11); } // Per-segment span with runtime bounds { int arr[4] = {10, 20, 30, 40}; - auto da = - cuda::__argument::__immediate_sequence{cuda::std::span{arr, 4}, cuda::__argument::__bounds(1L, 100L)}; - assert(cuda::__argument::__unwrap(da).size() == 4); - assert(cuda::__argument::__lowest_(da) == 1); - assert(cuda::__argument::__highest_(da) == 100); + auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 4}, cuda::argument::bounds(1L, 100L)}; + assert(cuda::argument::__unwrap(da).size() == 4); + assert(da.__arg_.size() == 4); + assert(da.__runtime_bounds_.__lower_ == 1); + assert(da.__runtime_bounds_.__upper_ == 100); + assert(cuda::argument::__lowest_(da) == 1); + assert(cuda::argument::__highest_(da) == 100); + da.__runtime_bounds_.__upper_ = 90; + assert(cuda::argument::__highest_(da) == 90); } // Per-segment span with both bounds { int arr[4] = {10, 20, 30, 40}; - auto da = cuda::__argument::__immediate_sequence{ - cuda::std::span{arr, 4}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(10, 200)}; - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 256); - assert(cuda::__argument::__lowest_(da) == 10); - assert(cuda::__argument::__highest_(da) == 200); + auto da = cuda::argument::immediate_sequence{ + cuda::std::span{arr, 4}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(10, 200)}; + static_assert(cuda::argument::__traits::lowest == 1); + static_assert(cuda::argument::__traits::highest == 256); + assert(cuda::argument::__lowest_(da) == 10); + assert(cuda::argument::__highest_(da) == 200); } // Per-segment span with both bounds, runtime bounds first { int arr[4] = {10, 20, 30, 40}; - auto da = cuda::__argument::__immediate_sequence{ - cuda::std::span{arr, 4}, cuda::__argument::__bounds(10, 200), cuda::__argument::__bounds<1, 256>()}; - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 256); - assert(cuda::__argument::__lowest_(da) == 10); - assert(cuda::__argument::__highest_(da) == 200); + auto da = cuda::argument::immediate_sequence{ + cuda::std::span{arr, 4}, cuda::argument::bounds(10, 200), cuda::argument::bounds<1, 256>()}; + static_assert(cuda::argument::__traits::lowest == 1); + static_assert(cuda::argument::__traits::highest == 256); + assert(cuda::argument::__lowest_(da) == 10); + assert(cuda::argument::__highest_(da) == 200); } // Per-segment via span { int arr[4] = {1, 2, 3, 4}; - auto da = cuda::__argument::__immediate_sequence{cuda::std::span{arr, 4}}; - assert(cuda::__argument::__unwrap(da).size() == 4); - assert(cuda::__argument::__unwrap(da)[0] == 1); - assert(cuda::__argument::__unwrap(da)[3] == 4); + auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 4}}; + assert(cuda::argument::__unwrap(da).size() == 4); + assert(cuda::argument::__unwrap(da)[0] == 1); + assert(cuda::argument::__unwrap(da)[3] == 4); } // Per-segment with static bounds { int arr[4] = {10, 20, 30, 40}; - auto da = - cuda::__argument::__immediate_sequence{cuda::std::span{arr, 4}, cuda::__argument::__bounds<1, 100>()}; - assert(cuda::__argument::__unwrap(da).size() == 4); - assert(cuda::__argument::__unwrap(da)[2] == 30); - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 100); + auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 4}, cuda::argument::bounds<1, 100>()}; + assert(cuda::argument::__unwrap(da).size() == 4); + assert(cuda::argument::__unwrap(da)[2] == 30); + static_assert(cuda::argument::__traits::lowest == 1); + static_assert(cuda::argument::__traits::highest == 100); } // Traits { - using traits = cuda::__argument::__traits>; + using traits = cuda::argument::__traits>; static_assert(!traits::is_deferred); static_assert(traits::is_single_value); static_assert(cuda::std::is_same_v); @@ -119,48 +125,47 @@ TEST_FUNC constexpr bool test() // Sequence traits { - using traits = cuda::__argument::__traits>>; + using traits = cuda::argument::__traits>>; static_assert(!traits::is_deferred); static_assert(!traits::is_single_value); static_assert(cuda::std::is_same_v>); } - // __is_single_value_v on unwrapped types + // __is_sequence_v on unwrapped types { static_assert( - !cuda::__argument::__is_sequence_v>::value_type>); - static_assert( - !cuda::__argument::__traits>>::is_single_value); + !cuda::argument::__is_sequence_v>::value_type>); + static_assert(!cuda::argument::__traits>>::is_single_value); } // Unwrap: scalar { - auto da = cuda::__argument::__immediate{7}; - auto& v = cuda::__argument::__unwrap(da); + auto da = cuda::argument::immediate{7}; + auto& v = cuda::argument::__unwrap(da); assert(v == 7); v = 8; - assert(cuda::__argument::__unwrap(da) == 8); + assert(cuda::argument::__unwrap(da) == 8); } // Unwrap: span { int arr[3] = {10, 20, 30}; - auto da = cuda::__argument::__immediate_sequence{cuda::std::span{arr, 3}}; - const auto& v = cuda::__argument::__unwrap(da); + auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 3}}; + const auto& v = cuda::argument::__unwrap(da); assert(v.size() == 3); assert(v[1] == 20); } // Unwrap: rvalue scalar returns by value { - const auto& v = cuda::__argument::__unwrap(cuda::__argument::__immediate{7}); + const auto& v = cuda::argument::__unwrap(cuda::argument::immediate{7}); assert(v == 7); } // Unwrap: rvalue span returns by value { int arr[3] = {10, 20, 30}; - auto v = cuda::__argument::__unwrap(cuda::__argument::__immediate_sequence{cuda::std::span{arr, 3}}); + auto v = cuda::argument::__unwrap(cuda::argument::immediate_sequence{cuda::std::span{arr, 3}}); assert(v.size() == 3); assert(v[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp index 85ccaf1c8a0..cb6e78cade3 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include #include @@ -24,37 +24,37 @@ TEST_FUNC void test() { // Basic value { - constexpr auto sa = cuda::__argument::__constant<42>{}; + constexpr auto sa = cuda::argument::constant<42>{}; static_assert(sa.value() == 42); static_assert(cuda::std::is_same_v); } // Different types { - constexpr auto sa_long = cuda::__argument::__constant<100L>{}; + constexpr auto sa_long = cuda::argument::constant<100L>{}; static_assert(sa_long.value() == 100L); static_assert(cuda::std::is_same_v); } // Negative value { - constexpr auto sa_neg = cuda::__argument::__constant<-1>{}; + constexpr auto sa_neg = cuda::argument::constant<-1>{}; static_assert(sa_neg.value() == -1); } #if TEST_HAS_CLASS_NTTP // Non-sequence values are accepted without scalar-only restrictions { - constexpr auto sa = cuda::__argument::__constant{}; + constexpr auto sa = cuda::argument::constant{}; static_assert(sa.value().payload == 7); - static_assert(cuda::__argument::__unwrap(sa).payload == 7); + static_assert(cuda::argument::__unwrap(sa).payload == 7); } #endif // TEST_HAS_CLASS_NTTP #if TEST_HAS_CLASS_NTTP // Array sequence { - constexpr auto sa_arr = cuda::__argument::__constant_sequence{128, 256, 512}>{}; + constexpr auto sa_arr = cuda::argument::constant_sequence{128, 256, 512}>{}; static_assert(sa_arr.value()[0] == 128); static_assert(sa_arr.value()[1] == 256); static_assert(sa_arr.value()[2] == 512); @@ -64,32 +64,32 @@ TEST_FUNC void test() // Bounds: scalar { - constexpr auto sa = cuda::__argument::__constant<42>{}; - static_assert(cuda::__argument::__lowest_(sa) == 42); - static_assert(cuda::__argument::__highest_(sa) == 42); + constexpr auto sa = cuda::argument::constant<42>{}; + static_assert(cuda::argument::__lowest_(sa) == 42); + static_assert(cuda::argument::__highest_(sa) == 42); } #if TEST_HAS_CLASS_NTTP // Bounds: array sequence computes lowest/highest of elements { - constexpr auto sa = cuda::__argument::__constant_sequence{128, 256, 512}>{}; - static_assert(cuda::__argument::__lowest_(sa) == 128); - static_assert(cuda::__argument::__highest_(sa) == 512); + constexpr auto sa = cuda::argument::constant_sequence{128, 256, 512}>{}; + static_assert(cuda::argument::__lowest_(sa) == 128); + static_assert(cuda::argument::__highest_(sa) == 512); } #endif // TEST_HAS_CLASS_NTTP #if TEST_HAS_CLASS_NTTP // Bounds: empty array sequence has unconstrained element bounds { - constexpr auto sa = cuda::__argument::__constant_sequence{}>{}; - static_assert(cuda::__argument::__lowest_(sa) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__highest_(sa) == (cuda::std::numeric_limits::max)()); + constexpr auto sa = cuda::argument::constant_sequence{}>{}; + static_assert(cuda::argument::__lowest_(sa) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__highest_(sa) == (cuda::std::numeric_limits::max)()); } #endif // TEST_HAS_CLASS_NTTP // Traits { - using traits = cuda::__argument::__traits>; + using traits = cuda::argument::__traits>; static_assert(!traits::is_deferred); static_assert(traits::is_constant); static_assert(traits::is_single_value); @@ -101,7 +101,7 @@ TEST_FUNC void test() #if TEST_HAS_CLASS_NTTP // Sequence traits { - using traits = cuda::__argument::__traits{1, 2, 3}>>; + using traits = cuda::argument::__traits{1, 2, 3}>>; static_assert(traits::is_constant); static_assert(!traits::is_deferred); static_assert(!traits::is_single_value); @@ -113,25 +113,25 @@ TEST_FUNC void test() // Single value: scalar is single, sequence is not { static_assert( - !cuda::__argument::__is_sequence_v>::value_type>); + !cuda::argument::__is_sequence_v>::value_type>); #if TEST_HAS_CLASS_NTTP - static_assert(!cuda::__argument::__traits< - cuda::__argument::__constant_sequence{1, 2, 3}>>::is_single_value); + static_assert( + !cuda::argument::__traits{1, 2, 3}>>::is_single_value); #endif // TEST_HAS_CLASS_NTTP } // Unwrap: scalar { - constexpr auto sa = cuda::__argument::__constant<42>{}; - constexpr auto val = cuda::__argument::__unwrap(sa); + constexpr auto sa = cuda::argument::constant<42>{}; + constexpr auto val = cuda::argument::__unwrap(sa); static_assert(val == 42); } #if TEST_HAS_CLASS_NTTP // Unwrap: sequence { - constexpr auto sa = cuda::__argument::__constant_sequence{10, 20, 30}>{}; - constexpr auto val = cuda::__argument::__unwrap(sa); + constexpr auto sa = cuda::argument::constant_sequence{10, 20, 30}>{}; + constexpr auto val = cuda::argument::__unwrap(sa); static_assert(val[0] == 10); static_assert(val[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp index 19e475ef453..79a96a3366e 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp @@ -8,11 +8,11 @@ // //===----------------------------------------------------------------------===// -#include +#include -using arg_t = cuda::__argument::__immediate>; +using arg_t = cuda::argument::immediate>; -[[maybe_unused]] constexpr auto invalid_highest = cuda::__argument::__traits::highest; +[[maybe_unused]] constexpr auto invalid_highest = cuda::argument::__traits::highest; int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp index 8d240e0cf3e..d3a0499fc55 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp @@ -8,9 +8,9 @@ // //===----------------------------------------------------------------------===// -#include +#include -[[maybe_unused]] constexpr auto invalid_bounds = cuda::__argument::__static_bounds<0, 1L>{}; +[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::__static_bounds<0, 1L>{}; int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp index 8cc239585ac..b7aca57bbdf 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp @@ -13,7 +13,7 @@ // All argument types (plain values, static, dynamic, deferred) work uniformly // through the free functions. -#include +#include #include #include #include @@ -34,7 +34,7 @@ enum class algorithm_variant template TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg) { - if constexpr (cuda::__argument::__traits<_SegSizeArg>::highest <= shared_memory_capacity) + if constexpr (cuda::argument::__traits<_SegSizeArg>::highest <= shared_memory_capacity) { return algorithm_variant::shared_memory; } @@ -48,7 +48,7 @@ TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg) template TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_segments) { - auto __highest = cuda::std::min(default_max_segment_size, static_cast(cuda::__argument::__highest_(__seg_size))); + auto __highest = cuda::std::min(default_max_segment_size, static_cast(cuda::argument::__highest_(__seg_size))); return __highest * __num_segments; } @@ -56,9 +56,9 @@ TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_se template TEST_FUNC constexpr int process_segments(_SegSizeArg __seg_size) { - const auto& __val = cuda::__argument::__unwrap(__seg_size); + const auto& __val = cuda::argument::__unwrap(__seg_size); - if constexpr (cuda::__argument::__traits<_SegSizeArg>::is_single_value) + if constexpr (cuda::argument::__traits<_SegSizeArg>::is_single_value) { return static_cast(__val); } @@ -95,7 +95,7 @@ TEST_FUNC constexpr bool test() // static_argument: scalar, fits in shared memory, buffer = value { - constexpr auto seg_size = cuda::__argument::__constant<128>{}; + constexpr auto seg_size = cuda::argument::constant<128>{}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 128 * 4); assert(process_segments(seg_size) == 128); @@ -104,7 +104,7 @@ TEST_FUNC constexpr bool test() #if TEST_HAS_CLASS_NTTP // static_argument: array sequence, highest fits in shared memory { - constexpr auto seg_sizes = cuda::__argument::__constant_sequence{}; + constexpr auto seg_sizes = cuda::argument::constant_sequence{}; static_assert(select_variant(seg_sizes) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_sizes, 3) == 256 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 256); @@ -112,7 +112,7 @@ TEST_FUNC constexpr bool test() // static_argument: array sequence, highest exceeds shared memory, buffer clamped { - constexpr auto seg_sizes = cuda::__argument::__constant_sequence{}; + constexpr auto seg_sizes = cuda::argument::constant_sequence{}; static_assert(select_variant(seg_sizes) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_sizes, 3) == 512 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 512); @@ -121,7 +121,7 @@ TEST_FUNC constexpr bool test() // dynamic_argument: tight static bounds, shared memory, buffer = value { - constexpr auto seg_size = cuda::__argument::__immediate{100, cuda::__argument::__bounds<1, 256>()}; + constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 256>()}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); @@ -129,7 +129,7 @@ TEST_FUNC constexpr bool test() // dynamic_argument: wide static bounds, global memory, buffer = value { - constexpr auto seg_size = cuda::__argument::__immediate{100, cuda::__argument::__bounds<1, 4096>()}; + constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 4096>()}; static_assert(select_variant(seg_size) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); @@ -137,7 +137,7 @@ TEST_FUNC constexpr bool test() // dynamic_argument: no bounds, global memory, buffer = value { - constexpr auto seg_size = cuda::__argument::__immediate{100}; + constexpr auto seg_size = cuda::argument::immediate{100}; static_assert(select_variant(seg_size) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); @@ -145,9 +145,8 @@ TEST_FUNC constexpr bool test() // dynamic_argument: per-segment span with runtime bounds only { - int sizes[3] = {64, 128, 96}; - auto seg_sizes = - cuda::__argument::__immediate_sequence{cuda::std::span{sizes, 3}, cuda::__argument::__bounds(1, 200)}; + int sizes[3] = {64, 128, 96}; + auto seg_sizes = cuda::argument::immediate_sequence{cuda::std::span{sizes, 3}, cuda::argument::bounds(1, 200)}; assert(select_variant(seg_sizes) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_sizes, 3) == 200 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 96); @@ -156,9 +155,9 @@ TEST_FUNC constexpr bool test() // dynamic_argument: per-segment span with both bounds { int sizes[3] = {64, 128, 96}; - auto seg_sizes = cuda::__argument::__immediate_sequence{ - cuda::std::span{sizes, 3}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(1, 200)}; - static_assert(cuda::__argument::__traits::highest <= shared_memory_capacity); + auto seg_sizes = cuda::argument::immediate_sequence{ + cuda::std::span{sizes, 3}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)}; + static_assert(cuda::argument::__traits::highest <= shared_memory_capacity); assert(select_variant(seg_sizes) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_sizes, 3) == 200 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 96); @@ -167,9 +166,9 @@ TEST_FUNC constexpr bool test() // deferred_argument: uniform, bounds for decisions only { int val = 100; - auto seg_size = cuda::__argument::__deferred{ - cuda::std::span{&val, 1}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(1, 200)}; - static_assert(cuda::__argument::__traits::highest <= shared_memory_capacity); + auto seg_size = cuda::argument::deferred{ + cuda::std::span{&val, 1}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)}; + static_assert(cuda::argument::__traits::highest <= shared_memory_capacity); assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 200 * 4); } @@ -185,14 +184,14 @@ TEST_FUNC constexpr bool test() #if TEST_HAS_CLASS_NTTP // static_argument float (float NTTPs require C++20) { - constexpr auto seg_size = cuda::__argument::__constant<128.0f>{}; + constexpr auto seg_size = cuda::argument::constant<128.0f>{}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(process_segments(seg_size) == 128); } // dynamic_argument float with static bounds { - constexpr auto seg_size = cuda::__argument::__immediate{100.0f, cuda::__argument::__bounds<1.0f, 256.0f>()}; + constexpr auto seg_size = cuda::argument::immediate{100.0f, cuda::argument::bounds<1.0f, 256.0f>()}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(process_segments(seg_size) == 100); } From f6494eee191de70a10a4e68fc67962e0abe04647 Mon Sep 17 00:00:00 2001 From: Piotr Ciolkosz Date: Wed, 3 Jun 2026 23:35:26 -0700 Subject: [PATCH 2/6] Forgot about the bounds types --- libcudacxx/include/cuda/__argument/argument.h | 87 +++++++++---------- .../include/cuda/__argument/argument_bounds.h | 20 ++--- .../cuda/argument/argument_bounds.pass.cpp | 16 ++-- .../cuda/argument/argument_traits.pass.cpp | 6 +- .../cuda/argument/deferred_argument.pass.cpp | 2 +- .../static_bounds_conversion.fail.cpp | 2 +- .../static_bounds_type_mismatch.fail.cpp | 2 +- 7 files changed, 67 insertions(+), 68 deletions(-) diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h index 2df19b9ac5d..f823b27f6b1 100644 --- a/libcudacxx/include/cuda/__argument/argument.h +++ b/libcudacxx/include/cuda/__argument/argument.h @@ -166,13 +166,13 @@ template inline constexpr bool __valid_static_bounds_v = true; template -inline constexpr bool __valid_static_bounds_v<_ElementType, __static_bounds<_Lowest, _Highest>> = +inline constexpr bool __valid_static_bounds_v<_ElementType, static_bounds<_Lowest, _Highest>> = __static_bound_in_range<_ElementType, _Lowest>() && __static_bound_in_range<_ElementType, _Highest>(); template _CCCL_API constexpr _ElementType __wrapper_static_lowest() noexcept { - if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>) + if constexpr (::cuda::std::is_same_v<_StaticBounds, no_bounds>) { return ::cuda::std::numeric_limits<_ElementType>::lowest(); } @@ -185,7 +185,7 @@ _CCCL_API constexpr _ElementType __wrapper_static_lowest() noexcept template _CCCL_API constexpr _ElementType __wrapper_static_highest() noexcept { - if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>) + if constexpr (::cuda::std::is_same_v<_StaticBounds, no_bounds>) { return (::cuda::std::numeric_limits<_ElementType>::max)(); } @@ -196,28 +196,28 @@ _CCCL_API constexpr _ElementType __wrapper_static_highest() noexcept } template -_CCCL_API constexpr _ElementType __effective_lowest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept +_CCCL_API constexpr _ElementType __effective_lowest(runtime_bounds<_ElementType> __runtime_bounds) noexcept { auto __static_lowest = __wrapper_static_lowest<_ElementType, _StaticBounds>(); return __static_lowest > __runtime_bounds.lower() ? __static_lowest : __runtime_bounds.lower(); } template -_CCCL_API constexpr _ElementType __effective_highest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept +_CCCL_API constexpr _ElementType __effective_highest(runtime_bounds<_ElementType> __runtime_bounds) noexcept { auto __static_highest = __wrapper_static_highest<_ElementType, _StaticBounds>(); return __static_highest < __runtime_bounds.upper() ? __static_highest : __runtime_bounds.upper(); } template -_CCCL_API constexpr bool __has_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept +_CCCL_API constexpr bool __has_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept { return __effective_lowest<_ElementType, _StaticBounds>(__runtime_bounds) <= __effective_highest<_ElementType, _StaticBounds>(__runtime_bounds); } template -_CCCL_API constexpr void __validate_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept +_CCCL_API constexpr void __validate_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept { static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>, "static argument bounds cannot be represented by the element type"); @@ -228,7 +228,7 @@ _CCCL_API constexpr void __validate_bounds_intersection(__runtime_bounds<_Elemen template _CCCL_API constexpr void __validate_static_element_bounds([[maybe_unused]] const _ElementType& __val) noexcept { - if constexpr (!::cuda::std::is_same_v<_StaticBounds, __no_bounds>) + if constexpr (!::cuda::std::is_same_v<_StaticBounds, no_bounds>) { _CCCL_ASSERT((__val >= __wrapper_static_lowest<_ElementType, _StaticBounds>()), "immediate argument value is below static lowest bound"); @@ -239,7 +239,7 @@ _CCCL_API constexpr void __validate_static_element_bounds([[maybe_unused]] const template _CCCL_API constexpr void __validate_runtime_element_bounds( - [[maybe_unused]] const _ElementType& __val, [[maybe_unused]] __runtime_bounds<_ElementType> __runtime_bounds) noexcept + [[maybe_unused]] const _ElementType& __val, [[maybe_unused]] runtime_bounds<_ElementType> __runtime_bounds) noexcept { _CCCL_ASSERT((__val >= __runtime_bounds.lower()), "immediate argument value is below runtime lower bound"); _CCCL_ASSERT((__val <= __runtime_bounds.upper()), "immediate argument value is above runtime upper bound"); @@ -252,7 +252,7 @@ _CCCL_API constexpr void __validate_runtime_element_bounds( //! @brief Wraps a runtime argument value with optional bounds. //! //! The value is host-accessible at API call time. -template +template struct immediate { using __element_type = __element_type_of_t<_Arg>; @@ -288,8 +288,8 @@ struct immediate #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE immediate(_Arg, __static_bounds<_Lowest, _Highest>) - -> immediate<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate(_Arg, static_bounds<_Lowest, _Highest>) + -> immediate<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED @@ -298,7 +298,7 @@ _CCCL_HOST_DEVICE immediate(_Arg, __static_bounds<_Lowest, _Highest>) // ===================================================================== //! @brief Wraps a runtime argument sequence with optional bounds. -template +template struct immediate_sequence { using __element_type = __element_type_of_t<_Arg>; @@ -308,7 +308,7 @@ struct immediate_sequence "static argument bounds cannot be represented by the element type"); _Arg __arg_; - __runtime_bounds<__element_type> __runtime_bounds_{}; + runtime_bounds<__element_type> __runtime_bounds_{}; private: _CCCL_API constexpr void __validate_bounds() const noexcept @@ -353,7 +353,7 @@ struct immediate_sequence } template - _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -363,7 +363,7 @@ struct immediate_sequence } template - _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -373,23 +373,23 @@ struct immediate_sequence } template - _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept + _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept : immediate_sequence(::cuda::std::move(__arg), __sb, __rb) {} }; #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>) - -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>) + -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>) + -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>) + -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== @@ -397,7 +397,7 @@ _CCCL_HOST_DEVICE immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bound // ===================================================================== //! @brief Common base for deferred argument wrappers. -template +template struct __deferred_base { using __element_type = __element_type_of_t<_Arg>; @@ -406,7 +406,7 @@ struct __deferred_base "static argument bounds cannot be represented by the element type"); _Arg __arg_; - __runtime_bounds<__element_type> __runtime_bounds_{}; + runtime_bounds<__element_type> __runtime_bounds_{}; _CCCL_API constexpr __deferred_base(_Arg __arg) noexcept : __arg_{::cuda::std::move(__arg)} @@ -421,7 +421,7 @@ struct __deferred_base } template - _CCCL_API constexpr __deferred_base(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr __deferred_base(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -430,7 +430,7 @@ struct __deferred_base } template - _CCCL_API constexpr __deferred_base(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr __deferred_base(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -439,14 +439,14 @@ struct __deferred_base } template - _CCCL_API constexpr __deferred_base(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept + _CCCL_API constexpr __deferred_base(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept : __deferred_base(::cuda::std::move(__arg), __sb, __rb) {} }; //! @brief Wraps a reference to a single value that is potentially not available at API call time but will be available //! by the time the argument is consumed in stream order. -template +template struct deferred : __deferred_base<_Arg, _StaticBounds> { using __deferred_base<_Arg, _StaticBounds>::__deferred_base; @@ -457,24 +457,23 @@ template _CCCL_HOST_DEVICE deferred(_Arg) -> deferred<_Arg>; template -_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>) - -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, static_bounds<_Lowest, _Highest>) -> deferred<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>) -> deferred<_Arg>; +_CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>) -> deferred<_Arg>; template -_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>) + -> deferred<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>) + -> deferred<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED //! @brief Wraps a reference to a sequence of values that is potentially not available at API call time but will be //! available by the time the argument is consumed in stream order. -template +template struct deferred_sequence : __deferred_base<_Arg, _StaticBounds> { using __deferred_base<_Arg, _StaticBounds>::__deferred_base; @@ -485,19 +484,19 @@ template _CCCL_HOST_DEVICE deferred_sequence(_Arg) -> deferred_sequence<_Arg>; template -_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>) - -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, static_bounds<_Lowest, _Highest>) + -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>) -> deferred_sequence<_Arg>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, runtime_bounds<_Tp>) -> deferred_sequence<_Arg>; template -_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>) + -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>) + -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== diff --git a/libcudacxx/include/cuda/__argument/argument_bounds.h b/libcudacxx/include/cuda/__argument/argument_bounds.h index 3e70b4d1893..6d1943da32a 100644 --- a/libcudacxx/include/cuda/__argument/argument_bounds.h +++ b/libcudacxx/include/cuda/__argument/argument_bounds.h @@ -34,7 +34,7 @@ _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT //! @brief Sentinel type indicating no bounds are present. -struct __no_bounds +struct no_bounds {}; // ===================================================================== @@ -48,7 +48,7 @@ struct __no_bounds //! @tparam _Lower The static lower bound. //! @tparam _Upper The static upper bound. template -struct __static_bounds +struct static_bounds { static_assert(::cuda::std::is_same_v, "Static bounds endpoints must have the same type"); @@ -67,7 +67,7 @@ struct __static_bounds template inline constexpr bool __is_static_bounds_v = false; template -inline constexpr bool __is_static_bounds_v<__static_bounds<_Lower, _Upper>> = true; +inline constexpr bool __is_static_bounds_v> = true; // ===================================================================== // runtime_bounds @@ -77,14 +77,14 @@ inline constexpr bool __is_static_bounds_v<__static_bounds<_Lower, _Upper>> = tr //! //! @tparam _Tp The value type of the bounds. template -struct __runtime_bounds +struct runtime_bounds { _Tp __lower_ = ::cuda::std::numeric_limits<_Tp>::lowest(); _Tp __upper_ = (::cuda::std::numeric_limits<_Tp>::max)(); - constexpr __runtime_bounds() noexcept = default; + constexpr runtime_bounds() noexcept = default; - _CCCL_API constexpr __runtime_bounds(_Tp __lower, _Tp __upper) noexcept + _CCCL_API constexpr runtime_bounds(_Tp __lower, _Tp __upper) noexcept : __lower_(__lower) , __upper_(__upper) { @@ -104,13 +104,13 @@ struct __runtime_bounds #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __runtime_bounds(_Tp, _Tp) -> __runtime_bounds<_Tp>; +_CCCL_HOST_DEVICE runtime_bounds(_Tp, _Tp) -> runtime_bounds<_Tp>; #endif // _CCCL_DOXYGEN_INVOKED template inline constexpr bool __is_runtime_bounds_v = false; template -inline constexpr bool __is_runtime_bounds_v<__runtime_bounds<_Tp>> = true; +inline constexpr bool __is_runtime_bounds_v> = true; // ===================================================================== // bounds — factory functions @@ -122,7 +122,7 @@ inline constexpr bool __is_runtime_bounds_v<__runtime_bounds<_Tp>> = true; //! @tparam _Upper The static upper bound. //! @return A compile-time bounds object. template -[[nodiscard]] _CCCL_API constexpr __static_bounds<_Lower, _Upper> bounds() noexcept +[[nodiscard]] _CCCL_API constexpr static_bounds<_Lower, _Upper> bounds() noexcept { return {}; } @@ -133,7 +133,7 @@ template //! @param __upper The runtime upper bound. //! @return A runtime bounds object. template -[[nodiscard]] _CCCL_API constexpr __runtime_bounds<_Tp> bounds(_Tp __lower, _Tp __upper) noexcept +[[nodiscard]] _CCCL_API constexpr runtime_bounds<_Tp> bounds(_Tp __lower, _Tp __upper) noexcept { return {__lower, __upper}; } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp index 9a5717f32c3..4275c4aca9c 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp @@ -20,21 +20,21 @@ TEST_FUNC constexpr bool test() // Basic static bounds { - constexpr auto b = cuda::argument::__static_bounds<1, 4096>{}; + constexpr auto b = cuda::argument::static_bounds<1, 4096>{}; static_assert(b.lower() == 1); static_assert(b.upper() == 4096); } // Exact static bounds { - constexpr auto b = cuda::argument::__static_bounds<42, 42>{}; + constexpr auto b = cuda::argument::static_bounds<42, 42>{}; static_assert(b.lower() == 42); static_assert(b.upper() == 42); } // Long type deduced from NTTPs { - static_assert(cuda::std::is_same_v::lower()), long>); + static_assert(cuda::std::is_same_v::lower()), long>); } #if TEST_HAS_CLASS_NTTP @@ -52,7 +52,7 @@ TEST_FUNC constexpr bool test() // Basic runtime bounds { - auto b = cuda::argument::__runtime_bounds{10, 100}; + auto b = cuda::argument::runtime_bounds{10, 100}; assert(b.lower() == 10); assert(b.upper() == 100); assert(b.__lower_ == 10); @@ -86,10 +86,10 @@ TEST_FUNC constexpr bool test() // Static and runtime bounds intersection { - static_assert(cuda::argument::__has_bounds_intersection>( - cuda::argument::__runtime_bounds{50, 200})); - static_assert(!cuda::argument::__has_bounds_intersection>( - cuda::argument::__runtime_bounds{0, 50})); + static_assert(cuda::argument::__has_bounds_intersection>( + cuda::argument::runtime_bounds{50, 200})); + static_assert(!cuda::argument::__has_bounds_intersection>( + cuda::argument::runtime_bounds{0, 50})); } // Non-bounds type diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp index 517893abb7d..c86f1caa251 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp @@ -151,12 +151,12 @@ TEST_FUNC void test() static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); static_assert( - cuda::argument::__traits>>::lowest == 1); + cuda::argument::__traits>>::lowest == 1); static_assert( - cuda::argument::__traits>&>::highest == 8); + cuda::argument::__traits>&>::highest == 8); static_assert( cuda::argument::__traits< - cuda::argument::immediate_sequence, cuda::argument::__static_bounds<1, 8>>>::highest + cuda::argument::immediate_sequence, cuda::argument::static_bounds<1, 8>>>::highest == 8); #if TEST_HAS_CLASS_NTTP static_assert( diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp index f9bda950aa7..5dc2fb849c7 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp @@ -41,7 +41,7 @@ TEST_FUNC constexpr bool test() // Deferred single value via pointer { int val = 42; - using def_t = cuda::argument::deferred>; + using def_t = cuda::argument::deferred>; static_assert(cuda::argument::__traits::lowest == 0); static_assert(cuda::argument::__traits::highest == 100); // Also verify construction works diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp index 79a96a3366e..7f9902e50a2 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp @@ -10,7 +10,7 @@ #include -using arg_t = cuda::argument::immediate>; +using arg_t = cuda::argument::immediate>; [[maybe_unused]] constexpr auto invalid_highest = cuda::argument::__traits::highest; diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp index d3a0499fc55..8170fdd7ee5 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp @@ -10,7 +10,7 @@ #include -[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::__static_bounds<0, 1L>{}; +[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::static_bounds<0, 1L>{}; int main(int, char**) { From 0bb5139d04dd95d089f23ae61dc44b237eb22589 Mon Sep 17 00:00:00 2001 From: Piotr Ciolkosz Date: Thu, 4 Jun 2026 14:04:59 -0700 Subject: [PATCH 3/6] Tighten bounds validation --- libcudacxx/include/cuda/__argument/argument.h | 29 +++++++++++++------ .../cuda/argument/argument_bounds.pass.cpp | 8 +++++ .../static_argument_bounds_type.fail.cpp | 20 +++++++++++++ 3 files changed, 48 insertions(+), 9 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h index f823b27f6b1..db40be13993 100644 --- a/libcudacxx/include/cuda/__argument/argument.h +++ b/libcudacxx/include/cuda/__argument/argument.h @@ -163,7 +163,10 @@ _CCCL_API constexpr bool __static_bound_in_range() noexcept } template -inline constexpr bool __valid_static_bounds_v = true; +inline constexpr bool __valid_static_bounds_v = false; + +template +inline constexpr bool __valid_static_bounds_v<_ElementType, no_bounds> = true; template inline constexpr bool __valid_static_bounds_v<_ElementType, static_bounds<_Lowest, _Highest>> = @@ -220,7 +223,8 @@ template _CCCL_API constexpr void __validate_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept { static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); _CCCL_VERIFY((__has_bounds_intersection<_ElementType, _StaticBounds>(__runtime_bounds)), "static and runtime argument bounds do not intersect"); } @@ -258,7 +262,8 @@ struct immediate using __element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); _Arg __arg_; @@ -305,7 +310,8 @@ struct immediate_sequence static_assert(__is_sequence_v<_Arg>, "immediate sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); _Arg __arg_; runtime_bounds<__element_type> __runtime_bounds_{}; @@ -403,7 +409,8 @@ struct __deferred_base using __element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); _Arg __arg_; runtime_bounds<__element_type> __runtime_bounds_{}; @@ -689,7 +696,8 @@ struct __traits_impl> using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); static constexpr bool is_constant = false; static constexpr bool is_deferred = false; @@ -718,7 +726,8 @@ struct __traits_impl> using element_type = __element_type_of_t<_Arg>; static_assert(__is_sequence_v, "immediate sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); static constexpr bool is_constant = false; static constexpr bool is_deferred = false; @@ -733,7 +742,8 @@ struct __traits_impl> using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); static constexpr bool is_constant = false; static constexpr bool is_deferred = true; @@ -748,7 +758,8 @@ struct __traits_impl> using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); static constexpr bool is_constant = false; static constexpr bool is_deferred = true; diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp index 4275c4aca9c..6be3c0d925b 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp @@ -97,6 +97,14 @@ TEST_FUNC constexpr bool test() static_assert(!cuda::argument::__is_bounds_v); } + // Bounds types accepted by argument wrapper template parameters + { + static_assert(cuda::argument::__valid_static_bounds_v); + static_assert(cuda::argument::__valid_static_bounds_v>); + static_assert(!cuda::argument::__valid_static_bounds_v>); + static_assert(!cuda::argument::__valid_static_bounds_v); + } + return true; } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp new file mode 100644 index 00000000000..66a1828769b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using arg_t = cuda::argument::immediate>; + +[[maybe_unused]] arg_t invalid_arg{0}; + +int main(int, char**) +{ + return 0; +} From 9e66badb1155363548b07a19fd0f095e78997b0b Mon Sep 17 00:00:00 2001 From: Piotr Ciolkosz Date: Thu, 4 Jun 2026 14:12:53 -0700 Subject: [PATCH 4/6] Asssert deferred_sequence type is a sequence --- libcudacxx/include/cuda/__argument/argument.h | 3 +++ .../deferred_sequence_scalar.fail.cpp | 18 +++++++++++++++++ .../deferred_sequence_scalar_traits.fail.cpp | 20 +++++++++++++++++++ 3 files changed, 41 insertions(+) create mode 100644 libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h index db40be13993..81df64ab616 100644 --- a/libcudacxx/include/cuda/__argument/argument.h +++ b/libcudacxx/include/cuda/__argument/argument.h @@ -483,6 +483,8 @@ _CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Hi template struct deferred_sequence : __deferred_base<_Arg, _StaticBounds> { + static_assert(__is_sequence_v<_Arg>, "deferred sequence arguments must have a distinct element type"); + using __deferred_base<_Arg, _StaticBounds>::__deferred_base; }; @@ -757,6 +759,7 @@ struct __traits_impl> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; + static_assert(__is_sequence_v, "deferred sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v, "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " "values representable by the element type"); diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp new file mode 100644 index 00000000000..1ce371b84e5 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +[[maybe_unused]] cuda::argument::deferred_sequence invalid_arg{0}; + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp new file mode 100644 index 00000000000..41a9a2ae778 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using traits = cuda::argument::__traits>; + +[[maybe_unused]] constexpr bool invalid_traits = traits::is_deferred; + +int main(int, char**) +{ + return 0; +} From f4cb111bea422cae228250d27cc3aa4f72546a55 Mon Sep 17 00:00:00 2001 From: Piotr Ciolkosz Date: Fri, 5 Jun 2026 15:20:02 -0700 Subject: [PATCH 5/6] Feedback from review meeting --- .../bench/segmented_topk/fixed/keys.cu | 10 +- .../bench/segmented_topk/variable/indexed.cu | 14 +- .../bench/segmented_topk/variable/keys.cu | 12 +- cub/cub/agent/agent_batched_topk.cuh | 10 +- cub/cub/detail/segmented_params.cuh | 34 +- .../device/dispatch/dispatch_batched_topk.cuh | 24 +- .../dispatch/kernels/kernel_batched_topk.cuh | 8 +- .../catch2_test_device_segmented_topk_keys.cu | 32 +- ...catch2_test_device_segmented_topk_pairs.cu | 22 +- libcudacxx/include/cuda/__argument/argument.h | 317 ++++++++++++------ .../include/cuda/__argument/argument_bounds.h | 132 +------- .../include/cuda/std/__internal/namespaces.h | 4 +- .../cuda/argument/argument_bounds.pass.cpp | 52 ++- .../cuda/argument/argument_traits.pass.cpp | 159 +++++---- .../cuda/argument/deferred_argument.pass.cpp | 118 +++---- .../deferred_sequence_scalar.fail.cpp | 2 +- .../deferred_sequence_scalar_traits.fail.cpp | 2 +- .../cuda/argument/dynamic_argument.pass.cpp | 123 ++++--- .../cuda/argument/static_argument.pass.cpp | 85 +++-- .../static_argument_bounds_type.fail.cpp | 2 +- .../static_bounds_conversion.fail.cpp | 4 +- .../static_bounds_type_mismatch.fail.cpp | 2 +- .../cuda/argument/usage_example.pass.cpp | 71 ++-- 23 files changed, 622 insertions(+), 617 deletions(-) diff --git a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu index 4178e7ea0d5..77bd997b742 100644 --- a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu +++ b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu @@ -51,7 +51,7 @@ void fixed_seg_size_topk_keys( const auto selected_elements = static_cast<::cuda::std::ptrdiff_t>(MaxNumSelected); const auto num_segments = ::cuda::std::max(1, (max_elements / segment_size)); const auto elements = num_segments * segment_size; - const auto total_num_items = ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(elements)}; + const auto total_num_items = ::cuda::args::immediate{static_cast<::cuda::std::int64_t>(elements)}; const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); // Skip workloads where k exceeds the segment size @@ -68,9 +68,9 @@ void fixed_seg_size_topk_keys( auto d_keys_in = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size); auto d_keys_out = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_out_ptr), selected_elements); - auto segment_sizes = ::cuda::argument::constant{}; - auto k = ::cuda::argument::constant{}; - auto select_direction = ::cuda::argument::constant{}; + auto segment_sizes = ::cuda::args::constant{}; + auto k = ::cuda::args::constant{}; + auto select_direction = ::cuda::args::constant{}; state.add_element_count(elements, "NumElements"); state.add_element_count(segment_size, "SegmentSize"); @@ -99,7 +99,7 @@ void fixed_seg_size_topk_keys( segment_sizes, k, select_direction, - ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(num_segments)}, + ::cuda::args::immediate{static_cast<::cuda::std::int64_t>(num_segments)}, total_num_items, env); }); diff --git a/cub/benchmarks/bench/segmented_topk/variable/indexed.cu b/cub/benchmarks/bench/segmented_topk/variable/indexed.cu index f59b250e247..488d3aa4439 100644 --- a/cub/benchmarks/bench/segmented_topk/variable/indexed.cu +++ b/cub/benchmarks/bench/segmented_topk/variable/indexed.cu @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include @@ -36,18 +36,18 @@ void decode_style_variable_topk_indexed( static_cast(MaxSegmentSize)); const auto input_elements = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end()); const auto output_elements = static_cast(num_segments) * K; - const auto total_num_items = ::cuda::__argument::__immediate{static_cast(input_elements)}; + const auto total_num_items = ::cuda::args::immediate{static_cast(input_elements)}; auto in_keys_buffer = gen_data( num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data())); auto out_keys_buffer = thrust::device_vector(output_elements, thrust::no_init); auto out_indices_buffer = thrust::device_vector(output_elements, thrust::no_init); - auto segment_sizes_param = ::cuda::__argument::__immediate_sequence{ - thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::__argument::__bounds<1, MaxSegmentSize>()}; - auto k_param = ::cuda::__argument::__constant{}; - auto select_direction = ::cuda::__argument::__constant{}; - auto num_segments_param = ::cuda::__argument::__immediate{static_cast(num_segments)}; + auto segment_sizes_param = ::cuda::args::__immediate_sequence{ + thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::args::bounds<1, MaxSegmentSize>()}; + auto k_param = ::cuda::args::constant{}; + auto select_direction = ::cuda::args::constant{}; + auto num_segments_param = ::cuda::args::immediate{static_cast(num_segments)}; auto d_keys_in = cuda::make_strided_iterator( cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())), diff --git a/cub/benchmarks/bench/segmented_topk/variable/keys.cu b/cub/benchmarks/bench/segmented_topk/variable/keys.cu index 0febcf507bd..5a54ad1fbaa 100644 --- a/cub/benchmarks/bench/segmented_topk/variable/keys.cu +++ b/cub/benchmarks/bench/segmented_topk/variable/keys.cu @@ -32,17 +32,17 @@ void decode_style_variable_topk_keys( static_cast(MaxSegmentSize)); const auto input_elements = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end()); const auto output_elements = static_cast(num_segments) * K; - const auto total_num_items = ::cuda::argument::immediate{static_cast(input_elements)}; + const auto total_num_items = ::cuda::args::immediate{static_cast(input_elements)}; auto in_keys_buffer = gen_data( num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data())); auto out_keys_buffer = thrust::device_vector(output_elements, thrust::no_init); - auto segment_sizes_param = ::cuda::argument::immediate_sequence{ - thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::argument::bounds<1, MaxSegmentSize>()}; - auto k_param = ::cuda::argument::constant{}; - auto select_direction = ::cuda::argument::constant{}; - auto num_segments_param = ::cuda::argument::immediate{static_cast(num_segments)}; + auto segment_sizes_param = ::cuda::args::__immediate_sequence{ + thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::args::bounds<1, MaxSegmentSize>()}; + auto k_param = ::cuda::args::constant{}; + auto select_direction = ::cuda::args::constant{}; + auto num_segments_param = ::cuda::args::immediate{static_cast(num_segments)}; auto d_keys_in = cuda::make_strided_iterator( cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())), diff --git a/cub/cub/agent/agent_batched_topk.cuh b/cub/cub/agent/agent_batched_topk.cuh index bc10311d36d..d2a99cfc809 100644 --- a/cub/cub/agent/agent_batched_topk.cuh +++ b/cub/cub/agent/agent_batched_topk.cuh @@ -73,8 +73,8 @@ struct agent_batched_topk_worker_per_segment using key_t = it_value_t; using value_t = it_value_t; - using segment_size_val_t = typename ::cuda::argument::__traits::element_type; - using num_segments_val_t = typename ::cuda::argument::__traits::element_type; + using segment_size_val_t = typename ::cuda::args::__traits::element_type; + using num_segments_val_t = typename ::cuda::args::__traits::element_type; using counters_t = batched_topk_counters; static constexpr auto policy = PolicyGetter{}(); @@ -95,7 +95,7 @@ struct agent_batched_topk_worker_per_segment multi_worker_per_segment_policy.threads_per_block * multi_worker_per_segment_policy.items_per_thread; // Check if there could be large segments present - static constexpr bool only_small_segments = ::cuda::argument::__traits::highest <= tile_size; + static constexpr bool only_small_segments = ::cuda::args::__traits::highest <= tile_size; // Check if we are dealing with keys-only or key-value pairs static constexpr bool is_keys_only = ::cuda::std::is_same_v; @@ -196,8 +196,8 @@ struct agent_batched_topk_worker_per_segment return; } - constexpr bool is_full_tile = ::cuda::argument::__traits::is_constant - && ::cuda::argument::__traits::lowest == tile_size; + constexpr bool is_full_tile = ::cuda::args::__traits::is_constant + && ::cuda::args::__traits::lowest == tile_size; // Resolve Segment Parameters const auto segment_size = params::get_param(segment_sizes, segment_id); diff --git a/cub/cub/detail/segmented_params.cuh b/cub/cub/detail/segmented_params.cuh index 1c4e6859d3e..41ba334eaef 100644 --- a/cub/cub/detail/segmented_params.cuh +++ b/cub/cub/detail/segmented_params.cuh @@ -33,10 +33,10 @@ namespace detail::params //! @param[in] __index Segment index to read for sequence arguments. //! @return The single argument value, or the sequence element at the given index. _CCCL_TEMPLATE(class _Tp, class _SegmentIndexT) -_CCCL_REQUIRES((!::cuda::argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) +_CCCL_REQUIRES((!::cuda::args::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(_Tp&& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - if constexpr (::cuda::argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value) + if constexpr (::cuda::args::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value) { return __arg; } @@ -46,46 +46,46 @@ _CCCL_REQUIRES((!::cuda::argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_T } } -template +template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::argument::constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept +get_param(const ::cuda::args::constant<_Value, _Tp>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::argument::__unwrap(__arg); + return ::cuda::args::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::argument::constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::args::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::argument::__unwrap(__arg)[__index]; + return ::cuda::args::__unwrap(__arg)[__index]; } template -[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param( - const ::cuda::argument::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept +[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto +get_param(const ::cuda::args::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::argument::__unwrap(__arg); + return ::cuda::args::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::argument::immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::args::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::argument::__unwrap(__arg)[__index]; + return ::cuda::args::__unwrap(__arg)[__index]; } template -[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param( - const ::cuda::argument::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept +[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto +get_param(const ::cuda::args::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::argument::__unwrap(__arg); + return ::cuda::args::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::argument::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::args::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::argument::__unwrap(__arg)[__index]; + return ::cuda::args::__unwrap(__arg)[__index]; } // ===================================================================== diff --git a/cub/cub/device/dispatch/dispatch_batched_topk.cuh b/cub/cub/device/dispatch/dispatch_batched_topk.cuh index 29d245c175e..d3c0c651991 100644 --- a/cub/cub/device/dispatch/dispatch_batched_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_batched_topk.cuh @@ -50,16 +50,16 @@ namespace detail::batched_topk // Internal: wrap the compile-time select direction into a discrete param for dispatch // ----------------------------------------------------------------------------- -// The selection direction is compile-time only: callers pass `::cuda::argument::constant`, which maps to a +// The selection direction is compile-time only: callers pass `::cuda::args::constant`, which maps to a // value-less static_discrete_param. Because the direction is fixed at compile time and carries no runtime value, it // can never disagree with its only supported option, so dispatch can never silently degrade to a no-op. -template -[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::argument::constant) +template +[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::args::constant) { return params::static_discrete_param{}; } -// The selection direction is intentionally a compile-time constant: only `::cuda::argument::constant` is +// The selection direction is intentionally a compile-time constant: only `::cuda::args::constant` is // accepted (the overload above maps it to a value-less static_discrete_param). This catch-all documents that // deliberate limitation and rejects anything else (e.g. a runtime `detail::topk::select` or a per-segment iterator of // directions) with a clear diagnostic. It is an intent/documentation guard rather than a user-facing one: callers @@ -71,7 +71,7 @@ template static_assert(::cuda::std::__always_false_v, "DeviceBatchedTopK currently supports only compile-time selection directions: the min/max entry " "points (DeviceBatchedTopK::{Max,Min}{Keys,Pairs}) dispatch with a " - "::cuda::argument::constant; runtime or per-segment directions are " + "::cuda::args::constant; runtime or per-segment directions are " "intentionally not supported"); // Unreachable (the static_assert above always fires); keeps the return type well-formed so the only diagnostic is // the message above. @@ -131,7 +131,7 @@ template >, it_value_t>, ::cuda::std::int64_t, - ::cuda::argument::__traits::highest>> + ::cuda::args::__traits::highest>> #if _CCCL_HAS_CONCEPTS() requires batched_topk_policy_selector #endif // _CCCL_HAS_CONCEPTS() @@ -150,7 +150,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( cudaStream_t stream = nullptr, [[maybe_unused]] PolicySelector policy_selector = {}) { - using large_segment_tile_offset_t = typename ::cuda::argument::__traits::element_type; + using large_segment_tile_offset_t = typename ::cuda::args::__traits::element_type; // Wrap the raw enum into the internal discrete param type auto select_directions = wrap_select_direction(select_direction); @@ -176,9 +176,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( static constexpr int worker_per_segment_tile_size = worker_per_segment_policy.threads_per_block * worker_per_segment_policy.items_per_thread; static constexpr bool any_small_segments = - ::cuda::argument::__traits::lowest <= worker_per_segment_tile_size; + ::cuda::args::__traits::lowest <= worker_per_segment_tile_size; static constexpr bool only_small_segments = - ::cuda::argument::__traits::highest <= worker_per_segment_tile_size; + ::cuda::args::__traits::highest <= worker_per_segment_tile_size; // Allocation layout: // only_small_segments: [0] dummy. @@ -188,7 +188,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( static constexpr int allocations_array_size = only_small_segments ? 1 : (any_small_segments ? 3 : 2); size_t allocation_sizes[allocations_array_size] = {1}; - using num_segments_val_t = typename ::cuda::argument::__traits::element_type; + using num_segments_val_t = typename ::cuda::args::__traits::element_type; using counters_t = batched_topk_counters; using segment_size_scan_offset_t = detail::choose_offset_t; using segment_size_scan_input_op_t = @@ -244,7 +244,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( // TODO (elstehle): support number of segments provided by device-accessible iterator // Only uniform number of segments are supported (i.e., we need to resolve the number of segments on the host) - static_assert(::cuda::argument::__traits::is_single_value, + static_assert(::cuda::args::__traits::is_single_value, "Only uniform segment sizes are currently supported."); if constexpr (any_small_segments) @@ -346,7 +346,7 @@ template >, it_value_t>, ::cuda::std::int64_t, - ::cuda::argument::__traits::highest>; + ::cuda::args::__traits::highest>; return detail::dispatch_with_env_and_tuning( env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) { return dispatch( diff --git a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh index 9c904f95971..3412f40359a 100644 --- a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh @@ -39,7 +39,7 @@ private: worker_policy worker_per_segment_policy; multi_worker_policy multi_worker_per_segment_policy; }; - static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::argument::__traits::highest; + static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::args::__traits::highest; static constexpr batched_topk_policy active_policy = current_policy(); template @@ -133,8 +133,8 @@ __launch_bounds__(int( KParameterT k, SelectDirectionParameterT select_directions, NumSegmentsParameterT num_segments, - batched_topk_counters::element_type>* d_counters, - typename ::cuda::argument::__traits::element_type* d_large_segments_ids, + batched_topk_counters::element_type>* d_counters, + typename ::cuda::args::__traits::element_type* d_large_segments_ids, LargeSegmentTileOffsetT* d_large_segments_tile_offsets) { using agent_t = typename find_smallest_covering_policy< @@ -151,7 +151,7 @@ __launch_bounds__(int( LargeSegmentTileOffsetT>::agent_t; // Static Assertions (Constraints) - static_assert(agent_t::tile_size >= ::cuda::argument::__traits::highest, + static_assert(agent_t::tile_size >= ::cuda::args::__traits::highest, "Block size exceeds maximum segment size supported by SegmentSizeParameterT"); static_assert(sizeof(typename agent_t::TempStorage) <= max_smem_per_block, "Static shared memory per block must not exceed 48KB limit."); diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu index 16f7d4322ca..7e9ad93ca9e 100644 --- a/cub/test/catch2_test_device_segmented_topk_keys.cu +++ b/cub/test/catch2_test_device_segmented_topk_keys.cu @@ -156,11 +156,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments", batched_topk_keys( d_keys_in, d_keys_out, - ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds()}, - ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, - ::cuda::argument::constant{}, - ::cuda::argument::immediate{num_segments}, - ::cuda::argument::immediate{num_segments * segment_size}); + ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, + ::cuda::args::constant{}, + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_segments * segment_size}); // Prepare expected results fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction); compact_sorted_keys_to_topk(expected_keys, segment_size, k); @@ -254,12 +254,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment batched_topk_keys( d_keys_in, d_keys_out, - ::cuda::argument::immediate_sequence{ - segment_size_it, ::cuda::argument::bounds()}, - ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, - ::cuda::argument::constant{}, - ::cuda::argument::immediate{num_segments}, - ::cuda::argument::immediate{num_items}); + ::cuda::args::__immediate_sequence{ + segment_size_it, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, + ::cuda::args::constant{}, + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_items}); // Verify keys are returned correctly: sort each segment of the expected input, then compact the top-k segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction); @@ -292,11 +292,11 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment batched_topk_keys( d_keys_in_it, d_keys_out_it, - ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds()}, - ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, - ::cuda::argument::constant{}, - ::cuda::argument::immediate{num_segments}, - ::cuda::argument::immediate{num_segments * segment_size}); + ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, + ::cuda::args::constant{}, + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_segments * segment_size}); const int num_minus_zero = static_cast(thrust::count_if(d_keys_out.begin(), d_keys_out.end(), is_minus_zero{})); REQUIRE(num_minus_zero >= 1); diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu index 49b2e48a1b6..cef26687cbd 100644 --- a/cub/test/catch2_test_device_segmented_topk_pairs.cu +++ b/cub/test/catch2_test_device_segmented_topk_pairs.cu @@ -225,11 +225,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small fixed-size segments" d_keys_out, d_values_in, d_values_out, - ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds()}, - ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, - ::cuda::argument::constant{}, - ::cuda::argument::immediate{num_segments}, - ::cuda::argument::immediate{num_segments * segment_size}); + ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, + ::cuda::args::constant{}, + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_segments * segment_size}); // Verification: // - We verify correct top-k selection through the keys @@ -346,12 +346,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen d_keys_out, d_values_in, d_values_out, - ::cuda::argument::immediate_sequence{ - segment_size_it, ::cuda::argument::bounds()}, - ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, - ::cuda::argument::constant{}, - ::cuda::argument::immediate{num_segments}, - ::cuda::argument::immediate{num_items}); + ::cuda::args::__immediate_sequence{ + segment_size_it, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, + ::cuda::args::constant{}, + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_items}); // Verification: // - We verify correct top-k selection through the keys diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h index 81df64ab616..0877a8df6d1 100644 --- a/libcudacxx/include/cuda/__argument/argument.h +++ b/libcudacxx/include/cuda/__argument/argument.h @@ -47,6 +47,8 @@ _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT +struct __access; + // ===================================================================== // __element_type_of // ===================================================================== @@ -95,31 +97,28 @@ inline constexpr bool __is_sequence_v = // spelling carries that intent. //! @brief Wraps a compile-time constant argument value. -template -struct constant +template +class constant { - using value_type = ::cuda::std::remove_cvref_t; +public: + using value_type = ::cuda::std::remove_cvref_t<_Tp>; using __element_type = value_type; - [[nodiscard]] _CCCL_API static constexpr value_type value() noexcept + [[nodiscard]] _CCCL_API static constexpr value_type __get_value() noexcept { - return _Value; + return static_cast(_Value); } }; //! @brief Wraps a compile-time constant argument sequence. template -struct constant_sequence +class __constant_sequence { +public: using value_type = ::cuda::std::remove_cvref_t; using __element_type = __element_type_of_t; static_assert(__is_sequence_v, "The value type of __constant_sequence must be a sequence"); - - [[nodiscard]] _CCCL_API static constexpr value_type value() noexcept - { - return _Value; - } }; // __assert_in_range @@ -223,7 +222,7 @@ template _CCCL_API constexpr void __validate_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept { static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); _CCCL_VERIFY((__has_bounds_intersection<_ElementType, _StaticBounds>(__runtime_bounds)), "static and runtime argument bounds do not intersect"); @@ -257,17 +256,20 @@ _CCCL_API constexpr void __validate_runtime_element_bounds( //! //! The value is host-accessible at API call time. template -struct immediate +class immediate { +public: using __element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); +private: + friend struct __access; + _Arg __arg_; -private: _CCCL_API constexpr void __validate_value() const noexcept { if constexpr (::cuda::std::is_same_v<::cuda::std::remove_cvref_t<_Arg>, __element_type> @@ -299,24 +301,27 @@ _CCCL_HOST_DEVICE immediate(_Arg, static_bounds<_Lowest, _Highest>) #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== -// immediate_sequence +// __immediate_sequence // ===================================================================== //! @brief Wraps a runtime argument sequence with optional bounds. template -struct immediate_sequence +class __immediate_sequence { +public: using __element_type = __element_type_of_t<_Arg>; static_assert(__is_sequence_v<_Arg>, "immediate sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); +private: + friend struct __access; + _Arg __arg_; runtime_bounds<__element_type> __runtime_bounds_{}; -private: _CCCL_API constexpr void __validate_bounds() const noexcept { __validate_bounds_intersection<__element_type, _StaticBounds>(__runtime_bounds_); @@ -344,14 +349,14 @@ struct immediate_sequence } public: - _CCCL_API constexpr immediate_sequence(_Arg __arg) noexcept + _CCCL_API constexpr __immediate_sequence(_Arg __arg) noexcept : __arg_{::cuda::std::move(__arg)} { __validate_bounds(); __validate_value(); } - _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds) noexcept + _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds) noexcept : __arg_{::cuda::std::move(__arg)} { __validate_bounds(); @@ -359,7 +364,7 @@ struct immediate_sequence } template - _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr __immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -369,7 +374,7 @@ struct immediate_sequence } template - _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -379,23 +384,23 @@ struct immediate_sequence } template - _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept - : immediate_sequence(::cuda::std::move(__arg), __sb, __rb) + _CCCL_API constexpr __immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept + : __immediate_sequence(::cuda::std::move(__arg), __sb, __rb) {} }; #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>) - -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE __immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>) + -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>) - -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE __immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>) + -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>) - -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE __immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>) + -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== @@ -404,17 +409,22 @@ _CCCL_HOST_DEVICE immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_L //! @brief Common base for deferred argument wrappers. template -struct __deferred_base +class __deferred_base { +public: using __element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); +private: + friend struct __access; + _Arg __arg_; runtime_bounds<__element_type> __runtime_bounds_{}; +public: _CCCL_API constexpr __deferred_base(_Arg __arg) noexcept : __arg_{::cuda::std::move(__arg)} { @@ -454,8 +464,9 @@ struct __deferred_base //! @brief Wraps a reference to a single value that is potentially not available at API call time but will be available //! by the time the argument is consumed in stream order. template -struct deferred : __deferred_base<_Arg, _StaticBounds> +class deferred : public __deferred_base<_Arg, _StaticBounds> { +public: using __deferred_base<_Arg, _StaticBounds>::__deferred_base; }; @@ -481,8 +492,9 @@ _CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Hi //! @brief Wraps a reference to a sequence of values that is potentially not available at API call time but will be //! available by the time the argument is consumed in stream order. template -struct deferred_sequence : __deferred_base<_Arg, _StaticBounds> +class deferred_sequence : public __deferred_base<_Arg, _StaticBounds> { +public: static_assert(__is_sequence_v<_Arg>, "deferred sequence arguments must have a distinct element type"); using __deferred_base<_Arg, _StaticBounds>::__deferred_base; @@ -508,6 +520,97 @@ _CCCL_HOST_DEVICE deferred_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lo -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED +// ===================================================================== +// __access +// ===================================================================== + +struct __access +{ + template + [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(immediate<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const _Arg& __arg(const immediate<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(immediate<_Arg, _StaticBounds>&& __wrapper) noexcept + { + return ::cuda::std::move(__wrapper.__arg_); + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(__immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const _Arg& + __arg(const __immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(__immediate_sequence<_Arg, _StaticBounds>&& __wrapper) noexcept + { + return ::cuda::std::move(__wrapper.__arg_); + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(__deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const _Arg& + __arg(const __deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(__deferred_base<_Arg, _StaticBounds>&& __wrapper) noexcept + { + return ::cuda::std::move(__wrapper.__arg_); + } + + template + [[nodiscard]] _CCCL_API static constexpr runtime_bounds<__element_type_of_t<_Arg>>& + __runtime_bounds(__immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__runtime_bounds_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const runtime_bounds<__element_type_of_t<_Arg>>& + __runtime_bounds(const __immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__runtime_bounds_; + } + + template + [[nodiscard]] _CCCL_API static constexpr runtime_bounds<__element_type_of_t<_Arg>>& + __runtime_bounds(__deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__runtime_bounds_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const runtime_bounds<__element_type_of_t<_Arg>>& + __runtime_bounds(const __deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__runtime_bounds_; + } +}; + // ===================================================================== // __unwrap // ===================================================================== @@ -516,12 +619,12 @@ template inline constexpr bool __is_wrapper_v = false; template inline constexpr bool __is_wrapper_v> = true; +template +inline constexpr bool __is_wrapper_v> = true; template -inline constexpr bool __is_wrapper_v> = true; -template -inline constexpr bool __is_wrapper_v> = true; +inline constexpr bool __is_wrapper_v<__constant_sequence<_Value>> = true; template -inline constexpr bool __is_wrapper_v> = true; +inline constexpr bool __is_wrapper_v<__immediate_sequence<_Arg, _StaticBounds>> = true; template inline constexpr bool __is_wrapper_v> = true; template @@ -537,99 +640,99 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) template [[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate<_Arg, _StaticBounds>&& __arg) noexcept { - return ::cuda::std::move(__arg.__arg_); + return __access::__arg(::cuda::std::move(__arg)); } -template -[[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t -__unwrap(const constant<_Value>&) noexcept +template +[[nodiscard]] _CCCL_API constexpr typename constant<_Value, _Tp>::value_type +__unwrap(const constant<_Value, _Tp>&) noexcept { - return _Value; + return constant<_Value, _Tp>::__get_value(); } template [[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t -__unwrap(const constant_sequence<_Value>&) noexcept +__unwrap(const __constant_sequence<_Value>&) noexcept { return _Value; } template -[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template -[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template -[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept { - return ::cuda::std::move(__arg.__arg_); + return __access::__arg(::cuda::std::move(__arg)); } template [[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred<_Arg, _StaticBounds>&& __arg) noexcept { - return ::cuda::std::move(__arg.__arg_); + return __access::__arg(::cuda::std::move(__arg)); } template [[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred_sequence<_Arg, _StaticBounds>&& __arg) noexcept { - return ::cuda::std::move(__arg.__arg_); + return __access::__arg(::cuda::std::move(__arg)); } -template +template _CCCL_API constexpr auto __constant_compute_lowest() noexcept { - return _Value; + return constant<_Value, _Tp>::__get_value(); } -template +template _CCCL_API constexpr auto __constant_compute_highest() noexcept { - return _Value; + return constant<_Value, _Tp>::__get_value(); } template @@ -680,16 +783,16 @@ struct __traits_impl static constexpr element_type highest = (::cuda::std::numeric_limits::max)(); }; -template -struct __traits_impl> +template +struct __traits_impl> { - using value_type = ::cuda::std::remove_cvref_t; + using value_type = typename constant<_Value, _Tp>::value_type; using element_type = value_type; static constexpr bool is_constant = true; static constexpr bool is_deferred = false; static constexpr bool is_single_value = true; - static constexpr element_type lowest = __constant_compute_lowest<_Value>(); - static constexpr element_type highest = __constant_compute_highest<_Value>(); + static constexpr element_type lowest = __constant_compute_lowest<_Value, _Tp>(); + static constexpr element_type highest = __constant_compute_highest<_Value, _Tp>(); }; template @@ -698,7 +801,7 @@ struct __traits_impl> using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); static constexpr bool is_constant = false; @@ -709,7 +812,7 @@ struct __traits_impl> }; template -struct __traits_impl> +struct __traits_impl<__constant_sequence<_Value>> { using value_type = ::cuda::std::remove_cvref_t; using element_type = __element_type_of_t; @@ -722,13 +825,13 @@ struct __traits_impl> }; template -struct __traits_impl> +struct __traits_impl<__immediate_sequence<_Arg, _StaticBounds>> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; static_assert(__is_sequence_v, "immediate sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); static constexpr bool is_constant = false; @@ -744,7 +847,7 @@ struct __traits_impl> using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); static constexpr bool is_constant = false; @@ -761,7 +864,7 @@ struct __traits_impl> using element_type = __element_type_of_t<_Arg>; static_assert(__is_sequence_v, "deferred sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); static constexpr bool is_constant = false; @@ -787,14 +890,14 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) ) return ::cuda::std::numeric_limits<__element_type_of_t<_Tp>>::lowest(); } -template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant<_Value>) noexcept +template +[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant<_Value, _Tp>) noexcept { - return __constant_compute_lowest<_Value>(); + return __constant_compute_lowest<_Value, _Tp>(); } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant_sequence<_Value>) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(__constant_sequence<_Value>) noexcept { return __constant_sequence_compute_lowest<_Value>(); } @@ -802,31 +905,34 @@ template template [[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate<_Arg, _StaticBounds> __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds); } template [[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds); } template [[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds); } //! @brief Returns the effective highest bound, combining static and runtime bounds. @@ -837,14 +943,14 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) ) return (::cuda::std::numeric_limits<__element_type_of_t<_Tp>>::max)(); } -template -[[nodiscard]] _CCCL_API constexpr auto __highest_(constant<_Value>) noexcept +template +[[nodiscard]] _CCCL_API constexpr auto __highest_(constant<_Value, _Tp>) noexcept { - return __constant_compute_highest<_Value>(); + return __constant_compute_highest<_Value, _Tp>(); } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(constant_sequence<_Value>) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(__constant_sequence<_Value>) noexcept { return __constant_sequence_compute_highest<_Value>(); } @@ -852,31 +958,34 @@ template template [[nodiscard]] _CCCL_API constexpr auto __highest_(immediate<_Arg, _StaticBounds> __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_highest<_ET, _StaticBounds>(__runtime_bounds); } template [[nodiscard]] _CCCL_API constexpr auto __highest_(deferred<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_highest<_ET, _StaticBounds>(__runtime_bounds); } template [[nodiscard]] _CCCL_API constexpr auto __highest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_highest<_ET, _StaticBounds>(__runtime_bounds); } _CCCL_END_NAMESPACE_CUDA_ARGUMENT diff --git a/libcudacxx/include/cuda/__argument/argument_bounds.h b/libcudacxx/include/cuda/__argument/argument_bounds.h index 6d1943da32a..ce6d3a56743 100644 --- a/libcudacxx/include/cuda/__argument/argument_bounds.h +++ b/libcudacxx/include/cuda/__argument/argument_bounds.h @@ -22,11 +22,8 @@ #endif // no system header #include -#include #include -#include #include -#include #include #include @@ -48,8 +45,9 @@ struct no_bounds //! @tparam _Lower The static lower bound. //! @tparam _Upper The static upper bound. template -struct static_bounds +class static_bounds { +public: static_assert(::cuda::std::is_same_v, "Static bounds endpoints must have the same type"); static_assert(_Lower <= _Upper, "Lower bound must be <= upper bound"); @@ -77,11 +75,12 @@ inline constexpr bool __is_static_bounds_v> = true //! //! @tparam _Tp The value type of the bounds. template -struct runtime_bounds +class runtime_bounds { _Tp __lower_ = ::cuda::std::numeric_limits<_Tp>::lowest(); _Tp __upper_ = (::cuda::std::numeric_limits<_Tp>::max)(); +public: constexpr runtime_bounds() noexcept = default; _CCCL_API constexpr runtime_bounds(_Tp __lower, _Tp __upper) noexcept @@ -145,129 +144,6 @@ inline constexpr bool __is_runtime_bounds_cv_v = __is_runtime_bounds_v<::cuda::s template inline constexpr bool __is_bounds_v = __is_static_bounds_cv_v<_Tp> || __is_runtime_bounds_cv_v<_Tp>; -// ===================================================================== -// __assert_in_range -// ===================================================================== - -template -_CCCL_API constexpr void __assert_in_range([[maybe_unused]] _From __val) noexcept -{ - if constexpr (::cuda::std::__cccl_is_integer_v<_To> && ::cuda::std::__cccl_is_integer_v<_From>) - { - _CCCL_ASSERT(::cuda::std::in_range<_To>(__val), "runtime bound value overflows the element type"); - } -} - -template -[[nodiscard]] _CCCL_API constexpr _To __runtime_bound_cast(_From __val) noexcept -{ - __assert_in_range<_To>(__val); - return static_cast<_To>(__val); -} - -template -_CCCL_API constexpr bool __static_bound_in_range() noexcept -{ - using _From = decltype(_Value); - - if constexpr (::cuda::std::__cccl_is_integer_v<_To> && ::cuda::std::__cccl_is_integer_v<_From>) - { - return ::cuda::std::in_range<_To>(_Value); - } - else if constexpr (::cuda::std::is_arithmetic_v<_To> && ::cuda::std::is_arithmetic_v<_From>) - { - return static_cast<_From>(static_cast<_To>(_Value)) == _Value; - } - else - { - return true; - } -} - -template -inline constexpr bool __valid_static_bounds_v = true; - -template -inline constexpr bool __valid_static_bounds_v<_ElementType, __static_bounds<_Lowest, _Highest>> = - ::cuda::__argument::__static_bound_in_range<_ElementType, _Lowest>() - && ::cuda::__argument::__static_bound_in_range<_ElementType, _Highest>(); - -template -_CCCL_API constexpr _ElementType __wrapper_static_lowest() noexcept -{ - if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>) - { - return ::cuda::std::numeric_limits<_ElementType>::lowest(); - } - else - { - return static_cast<_ElementType>(_StaticBounds::lower()); - } -} - -template -_CCCL_API constexpr _ElementType __wrapper_static_highest() noexcept -{ - if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>) - { - return (::cuda::std::numeric_limits<_ElementType>::max)(); - } - else - { - return static_cast<_ElementType>(_StaticBounds::upper()); - } -} - -template -_CCCL_API constexpr _ElementType __effective_lowest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept -{ - auto __static_lowest = ::cuda::__argument::__wrapper_static_lowest<_ElementType, _StaticBounds>(); - return __static_lowest > __runtime_bounds.lower() ? __static_lowest : __runtime_bounds.lower(); -} - -template -_CCCL_API constexpr _ElementType __effective_highest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept -{ - auto __static_highest = ::cuda::__argument::__wrapper_static_highest<_ElementType, _StaticBounds>(); - return __static_highest < __runtime_bounds.upper() ? __static_highest : __runtime_bounds.upper(); -} - -template -_CCCL_API constexpr bool __valid_argument_bounds(__runtime_bounds<_ElementType> __runtime_bounds) noexcept -{ - return ::cuda::__argument::__effective_lowest<_ElementType, _StaticBounds>(__runtime_bounds) - <= ::cuda::__argument::__effective_highest<_ElementType, _StaticBounds>(__runtime_bounds); -} - -template -_CCCL_API constexpr void __validate_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept -{ - static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>, - "static argument bounds cannot be represented by the element type"); - _CCCL_VERIFY((::cuda::__argument::__valid_argument_bounds<_ElementType, _StaticBounds>(__runtime_bounds)), - "static and runtime argument bounds do not intersect"); -} - -template -_CCCL_API constexpr void __validate_static_element_bounds([[maybe_unused]] const _ElementType& __val) noexcept -{ - if constexpr (!::cuda::std::is_same_v<_StaticBounds, __no_bounds>) - { - _CCCL_ASSERT((__val >= ::cuda::__argument::__wrapper_static_lowest<_ElementType, _StaticBounds>()), - "immediate argument value is below static lowest bound"); - _CCCL_ASSERT((__val <= ::cuda::__argument::__wrapper_static_highest<_ElementType, _StaticBounds>()), - "immediate argument value is above static highest bound"); - } -} - -template -_CCCL_API constexpr void __validate_runtime_element_bounds( - [[maybe_unused]] const _ElementType& __val, [[maybe_unused]] __runtime_bounds<_ElementType> __runtime_bounds) noexcept -{ - _CCCL_ASSERT((__val >= __runtime_bounds.lower()), "immediate argument value is below runtime lower bound"); - _CCCL_ASSERT((__val <= __runtime_bounds.upper()), "immediate argument value is above runtime upper bound"); -} - _CCCL_END_NAMESPACE_CUDA_ARGUMENT #include diff --git a/libcudacxx/include/cuda/std/__internal/namespaces.h b/libcudacxx/include/cuda/std/__internal/namespaces.h index f3cc191adfc..4876dd6f449 100644 --- a/libcudacxx/include/cuda/std/__internal/namespaces.h +++ b/libcudacxx/include/cuda/std/__internal/namespaces.h @@ -115,8 +115,8 @@ #define _CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION _CCCL_BEGIN_NAMESPACE(cuda::execution) #define _CCCL_END_NAMESPACE_CUDA_EXECUTION _CCCL_END_NAMESPACE(cuda::execution) -#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::argument) -#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT _CCCL_END_NAMESPACE(cuda::argument) +#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::args) +#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT _CCCL_END_NAMESPACE(cuda::args) // Namespace to avoid name collisions with CPOs on clang-16 (see // https://godbolt.org/z/9TadonrdM for example). MSVC's ancient parser also gets confused with diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp index 6be3c0d925b..46070dbabbd 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp @@ -16,31 +16,31 @@ TEST_FUNC constexpr bool test() { - // --- static_argument_bounds --- + // --- static_bounds --- // Basic static bounds { - constexpr auto b = cuda::argument::static_bounds<1, 4096>{}; + constexpr auto b = cuda::args::static_bounds<1, 4096>{}; static_assert(b.lower() == 1); static_assert(b.upper() == 4096); } // Exact static bounds { - constexpr auto b = cuda::argument::static_bounds<42, 42>{}; + constexpr auto b = cuda::args::static_bounds<42, 42>{}; static_assert(b.lower() == 42); static_assert(b.upper() == 42); } // Long type deduced from NTTPs { - static_assert(cuda::std::is_same_v::lower()), long>); + static_assert(cuda::std::is_same_v::lower()), long>); } #if TEST_HAS_CLASS_NTTP // Static bounds preserve their original NTTP types { - constexpr auto b = cuda::argument::bounds<1.0f, 8.0f>(); + constexpr auto b = cuda::args::bounds<1.0f, 8.0f>(); static_assert(b.lower() == 1.0f); static_assert(b.upper() == 8); static_assert(cuda::std::is_same_v); @@ -48,17 +48,13 @@ TEST_FUNC constexpr bool test() } #endif // TEST_HAS_CLASS_NTTP - // --- runtime_argument_bounds --- + // --- runtime_bounds --- // Basic runtime bounds { - auto b = cuda::argument::runtime_bounds{10, 100}; + auto b = cuda::args::runtime_bounds{10, 100}; assert(b.lower() == 10); assert(b.upper() == 100); - assert(b.__lower_ == 10); - assert(b.__upper_ == 100); - b.__upper_ = 90; - assert(b.upper() == 90); static_assert(cuda::std::is_same_v); } @@ -66,43 +62,43 @@ TEST_FUNC constexpr bool test() // Static via factory { - constexpr auto b = cuda::argument::bounds<1, 8>(); + constexpr auto b = cuda::args::bounds<1, 8>(); static_assert(b.lower() == 1); static_assert(b.upper() == 8); - static_assert(cuda::argument::__is_static_bounds_cv_v); - static_assert(!cuda::argument::__is_runtime_bounds_cv_v); - static_assert(cuda::argument::__is_bounds_v); + static_assert(cuda::args::__is_static_bounds_cv_v); + static_assert(!cuda::args::__is_runtime_bounds_cv_v); + static_assert(cuda::args::__is_bounds_v); } // Runtime via factory { - auto b = cuda::argument::bounds(10, 100); + auto b = cuda::args::bounds(10, 100); assert(b.lower() == 10); assert(b.upper() == 100); - static_assert(!cuda::argument::__is_static_bounds_cv_v); - static_assert(cuda::argument::__is_runtime_bounds_cv_v); - static_assert(cuda::argument::__is_bounds_v); + static_assert(!cuda::args::__is_static_bounds_cv_v); + static_assert(cuda::args::__is_runtime_bounds_cv_v); + static_assert(cuda::args::__is_bounds_v); } // Static and runtime bounds intersection { - static_assert(cuda::argument::__has_bounds_intersection>( - cuda::argument::runtime_bounds{50, 200})); - static_assert(!cuda::argument::__has_bounds_intersection>( - cuda::argument::runtime_bounds{0, 50})); + static_assert(cuda::args::__has_bounds_intersection>( + cuda::args::runtime_bounds{50, 200})); + static_assert(!cuda::args::__has_bounds_intersection>( + cuda::args::runtime_bounds{0, 50})); } // Non-bounds type { - static_assert(!cuda::argument::__is_bounds_v); + static_assert(!cuda::args::__is_bounds_v); } // Bounds types accepted by argument wrapper template parameters { - static_assert(cuda::argument::__valid_static_bounds_v); - static_assert(cuda::argument::__valid_static_bounds_v>); - static_assert(!cuda::argument::__valid_static_bounds_v>); - static_assert(!cuda::argument::__valid_static_bounds_v); + static_assert(cuda::args::__valid_static_bounds_v); + static_assert(cuda::args::__valid_static_bounds_v>); + static_assert(!cuda::args::__valid_static_bounds_v>); + static_assert(!cuda::args::__valid_static_bounds_v); } return true; diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp index c86f1caa251..aaf57291e23 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp @@ -57,129 +57,124 @@ TEST_FUNC void test() // --- __is_sequence_v --- // builtin and class type are not sequences - static_assert(!cuda::argument::__is_sequence_v); - static_assert(!cuda::argument::__is_sequence_v); - static_assert(!cuda::argument::__is_sequence_v); - static_assert(!cuda::argument::__is_sequence_v>); - static_assert(!cuda::argument::__is_sequence_v>); - static_assert(!cuda::argument::__is_sequence_v>); - static_assert(!cuda::argument::__is_sequence_v>); - static_assert(!cuda::argument::__is_sequence_v>); - static_assert(!cuda::argument::__is_sequence_v>); - static_assert(!cuda::argument::__is_sequence_v>); - static_assert(!cuda::argument::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v); + static_assert(!cuda::args::__is_sequence_v); + static_assert(!cuda::args::__is_sequence_v); + static_assert(!cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); // iterators and pointers can be sequences if they are at least random access - static_assert(cuda::argument::__is_sequence_v); - static_assert(cuda::argument::__is_sequence_v); - static_assert(cuda::argument::__is_sequence_v>); - static_assert(!cuda::argument::__is_sequence_v>); + static_assert(cuda::args::__is_sequence_v); + static_assert(cuda::args::__is_sequence_v); + static_assert(cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); // ranges and arrays are sequences - static_assert(cuda::argument::__is_sequence_v); - static_assert(cuda::argument::__is_sequence_v); - static_assert(cuda::argument::__is_sequence_v); - static_assert(cuda::argument::__is_sequence_v); - static_assert(cuda::argument::__is_sequence_v>); - static_assert(cuda::argument::__is_sequence_v&>); - static_assert(cuda::argument::__is_sequence_v>); - static_assert(cuda::argument::__is_sequence_v>); + static_assert(cuda::args::__is_sequence_v); + static_assert(cuda::args::__is_sequence_v); + static_assert(cuda::args::__is_sequence_v); + static_assert(cuda::args::__is_sequence_v); + static_assert(cuda::args::__is_sequence_v>); + static_assert(cuda::args::__is_sequence_v&>); + static_assert(cuda::args::__is_sequence_v>); + static_assert(cuda::args::__is_sequence_v>); // --- __element_type_of_t --- - static_assert(cuda::std::is_same_v&>, int>); - static_assert(cuda::std::is_same_v, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v&>, int>); + static_assert(cuda::std::is_same_v, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); static_assert( - cuda::std::is_same_v>>, - int>); - static_assert(cuda::std::is_same_v>, int>); + cuda::std::is_same_v>>, int>); + static_assert(cuda::std::is_same_v>, int>); // --- argument_traits: is_deferred --- - static_assert(!cuda::argument::__traits::is_deferred); - static_assert(!cuda::argument::__traits>::is_deferred); - static_assert(!cuda::argument::__traits>>::is_deferred); - static_assert(!cuda::argument::__traits>::is_deferred); + static_assert(!cuda::args::__traits::is_deferred); + static_assert(!cuda::args::__traits>::is_deferred); + static_assert(!cuda::args::__traits>>::is_deferred); + static_assert(!cuda::args::__traits>::is_deferred); #if TEST_HAS_CLASS_NTTP - static_assert( - !cuda::argument::__traits{1, 2, 3}>>::is_deferred); + static_assert(!cuda::args::__traits{1, 2, 3}>>::is_deferred); #endif // TEST_HAS_CLASS_NTTP - static_assert(cuda::argument::__traits>>::is_deferred); - static_assert(cuda::argument::__traits>>::is_deferred); + static_assert(cuda::args::__traits>>::is_deferred); + static_assert(cuda::args::__traits>>::is_deferred); // --- argument_traits: is_single_value --- - static_assert(cuda::argument::__traits::is_single_value); - static_assert(cuda::argument::__traits::is_single_value); - static_assert(cuda::argument::__traits>::is_single_value); - static_assert(cuda::argument::__traits>::is_single_value); - static_assert(cuda::argument::__traits>>::is_single_value); - static_assert(!cuda::argument::__traits>>::is_single_value); - static_assert(cuda::argument::__traits>::is_single_value); + static_assert(cuda::args::__traits::is_single_value); + static_assert(cuda::args::__traits::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); + static_assert(cuda::args::__traits>>::is_single_value); + static_assert(!cuda::args::__traits>>::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); #if TEST_HAS_CLASS_NTTP static_assert( - !cuda::argument::__traits{1, 2, 3}>>::is_single_value); + !cuda::args::__traits{1, 2, 3}>>::is_single_value); #endif // TEST_HAS_CLASS_NTTP - static_assert(cuda::argument::__traits>::is_single_value); - static_assert(!cuda::argument::__traits>>::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); + static_assert(!cuda::args::__traits>>::is_single_value); // --- argument_traits: value_type --- - static_assert(cuda::std::is_same_v::value_type, int>); - static_assert(cuda::std::is_same_v>::value_type, int>); + static_assert(cuda::std::is_same_v::value_type, int>); + static_assert(cuda::std::is_same_v>::value_type, int>); static_assert( - cuda::std::is_same_v>>::value_type, + cuda::std::is_same_v>>::value_type, cuda::std::span>); - static_assert(cuda::std::is_same_v>::value_type, int>); + static_assert(cuda::std::is_same_v>::value_type, int>); + static_assert(cuda::std::is_same_v>::value_type, float>); #if TEST_HAS_CLASS_NTTP - static_assert( - cuda::std::is_same_v< - cuda::argument::__traits{1, 2, 3}>>::value_type, - cuda::std::array>); + static_assert(cuda::std::is_same_v< + cuda::args::__traits{1, 2, 3}>>::value_type, + cuda::std::array>); #endif // TEST_HAS_CLASS_NTTP // --- argument_traits: lowest / highest --- - static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); - static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); - static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); - static_assert( - cuda::argument::__traits>>::lowest == 1); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__traits>>::lowest == 1); + static_assert(cuda::args::__traits>&>::highest == 8); static_assert( - cuda::argument::__traits>&>::highest == 8); - static_assert( - cuda::argument::__traits< - cuda::argument::immediate_sequence, cuda::argument::static_bounds<1, 8>>>::highest + cuda::args::__traits, cuda::args::static_bounds<1, 8>>>::highest == 8); + static_assert(cuda::args::__traits>::lowest == 10.0f); + static_assert(cuda::args::__traits>::highest == 10.0f); #if TEST_HAS_CLASS_NTTP - static_assert( - cuda::argument::__traits{3, 1, 2}>>::lowest == 1); - static_assert( - cuda::argument::__traits{3, 1, 2}>>::highest == 3); + static_assert(cuda::args::__traits{3, 1, 2}>>::lowest == 1); + static_assert(cuda::args::__traits{3, 1, 2}>>::highest == 3); #endif // TEST_HAS_CLASS_NTTP // --- Free function bounds on plain values --- - static_assert(cuda::argument::__lowest_(42) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__highest_(42) == (cuda::std::numeric_limits::max)()); - static_assert(cuda::argument::__lowest_(1.0f) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__highest_(1.0f) == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__lowest_(42) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__highest_(42) == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__lowest_(1.0f) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__highest_(1.0f) == (cuda::std::numeric_limits::max)()); // --- Scalar and sequence wrappers expose distinct single-value traits --- - static_assert(cuda::argument::__traits>::is_single_value); - static_assert(cuda::argument::__traits>::is_single_value); - static_assert(!cuda::argument::__traits>>::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); + static_assert(!cuda::args::__traits>>::is_single_value); #if TEST_HAS_CLASS_NTTP static_assert( - !cuda::argument::__traits{1, 2, 3}>>::is_single_value); + !cuda::args::__traits{1, 2, 3}>>::is_single_value); #endif // TEST_HAS_CLASS_NTTP } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp index 5dc2fb849c7..21b200e8f0f 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp @@ -22,110 +22,110 @@ TEST_FUNC constexpr bool test() // Deferred single value via span { int val = 42; - auto def = cuda::argument::deferred{cuda::std::span{&val, 1}}; - assert(cuda::argument::__unwrap(def)[0] == 42); - assert(def.__arg_[0] == 42); - static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); + auto def = cuda::args::deferred{cuda::std::span{&val, 1}}; + assert(cuda::args::__unwrap(def)[0] == 42); + assert(cuda::args::__access::__arg(def)[0] == 42); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); } // Deferred single value with static bounds { int val = 42; - auto def = cuda::argument::deferred{cuda::std::span{&val, 1}, cuda::argument::bounds<1, 1000>()}; - assert(cuda::argument::__unwrap(def)[0] == 42); - static_assert(cuda::argument::__traits::lowest == 1); - static_assert(cuda::argument::__traits::highest == 1000); + auto def = cuda::args::deferred{cuda::std::span{&val, 1}, cuda::args::bounds<1, 1000>()}; + assert(cuda::args::__unwrap(def)[0] == 42); + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 1000); } // Deferred single value via pointer { int val = 42; - using def_t = cuda::argument::deferred>; - static_assert(cuda::argument::__traits::lowest == 0); - static_assert(cuda::argument::__traits::highest == 100); + using def_t = cuda::args::deferred>; + static_assert(cuda::args::__traits::lowest == 0); + static_assert(cuda::args::__traits::highest == 100); // Also verify construction works - auto def = cuda::argument::deferred{&val, cuda::argument::bounds<0, 100>()}; - assert(cuda::argument::__unwrap(def) == &val); + auto def = cuda::args::deferred{&val, cuda::args::bounds<0, 100>()}; + assert(cuda::args::__unwrap(def) == &val); } // Deferred single value via fancy iterator { auto it = cuda::counting_iterator{42}; - auto def = cuda::argument::deferred{it, cuda::argument::bounds<0, 100>()}; - assert(cuda::argument::__unwrap(def)[0] == 42); - static_assert(cuda::argument::__traits::lowest == 0); - static_assert(cuda::argument::__traits::highest == 100); - static_assert(cuda::argument::__traits::is_single_value); + auto def = cuda::args::deferred{it, cuda::args::bounds<0, 100>()}; + assert(cuda::args::__unwrap(def)[0] == 42); + static_assert(cuda::args::__traits::lowest == 0); + static_assert(cuda::args::__traits::highest == 100); + static_assert(cuda::args::__traits::is_single_value); } // Deferred single value with both bounds, runtime bounds first { - int val = 42; - auto def = cuda::argument::deferred{ - cuda::std::span{&val, 1}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 256>()}; - static_assert(cuda::argument::__traits::lowest == 1); - static_assert(cuda::argument::__traits::highest == 256); - assert(def.__runtime_bounds_.__lower_ == 5); - assert(def.__runtime_bounds_.__upper_ == 100); - assert(cuda::argument::__lowest_(def) == 5); - assert(cuda::argument::__highest_(def) == 100); - def.__runtime_bounds_.__upper_ = 90; - assert(cuda::argument::__highest_(def) == 90); + int val = 42; + auto def = + cuda::args::deferred{cuda::std::span{&val, 1}, cuda::args::bounds(5, 100), cuda::args::bounds<1, 256>()}; + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 256); + assert(cuda::args::__access::__runtime_bounds(def).lower() == 5); + assert(cuda::args::__access::__runtime_bounds(def).upper() == 100); + assert(cuda::args::__lowest_(def) == 5); + assert(cuda::args::__highest_(def) == 100); + cuda::args::__access::__runtime_bounds(def) = cuda::args::bounds(5, 90); + assert(cuda::args::__highest_(def) == 90); } // Deferred sequence via fancy iterator { auto it = cuda::counting_iterator{10}; - auto def = cuda::argument::deferred_sequence{it, cuda::argument::bounds<0, 100>()}; - assert(cuda::argument::__unwrap(def)[0] == 10); - assert(cuda::argument::__unwrap(def)[2] == 12); - static_assert(cuda::argument::__traits::lowest == 0); - static_assert(cuda::argument::__traits::highest == 100); - static_assert(!cuda::argument::__traits::is_single_value); + auto def = cuda::args::deferred_sequence{it, cuda::args::bounds<0, 100>()}; + assert(cuda::args::__unwrap(def)[0] == 10); + assert(cuda::args::__unwrap(def)[2] == 12); + static_assert(cuda::args::__traits::lowest == 0); + static_assert(cuda::args::__traits::highest == 100); + static_assert(!cuda::args::__traits::is_single_value); } // Deferred sequence with both bounds { int arr[4] = {10, 20, 30, 40}; - auto def = cuda::argument::deferred_sequence{ - cuda::std::span{arr, 4}, cuda::argument::bounds<1, 4096>(), cuda::argument::bounds(5, 100)}; - assert(def.__arg_.size() == 4); - assert(def.__runtime_bounds_.__lower_ == 5); - assert(def.__runtime_bounds_.__upper_ == 100); - static_assert(cuda::argument::__traits::lowest == 1); - assert(cuda::argument::__lowest_(def) == 5); - assert(cuda::argument::__highest_(def) == 100); + auto def = cuda::args::deferred_sequence{ + cuda::std::span{arr, 4}, cuda::args::bounds<1, 4096>(), cuda::args::bounds(5, 100)}; + assert(cuda::args::__access::__arg(def).size() == 4); + assert(cuda::args::__access::__runtime_bounds(def).lower() == 5); + assert(cuda::args::__access::__runtime_bounds(def).upper() == 100); + static_assert(cuda::args::__traits::lowest == 1); + assert(cuda::args::__lowest_(def) == 5); + assert(cuda::args::__highest_(def) == 100); } // Deferred sequence with both bounds, runtime bounds first { int arr[4] = {10, 20, 30, 40}; - auto def = cuda::argument::deferred_sequence{ - cuda::std::span{arr, 4}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 4096>()}; - static_assert(cuda::argument::__traits::lowest == 1); - static_assert(cuda::argument::__traits::highest == 4096); - assert(cuda::argument::__lowest_(def) == 5); - assert(cuda::argument::__highest_(def) == 100); + auto def = cuda::args::deferred_sequence{ + cuda::std::span{arr, 4}, cuda::args::bounds(5, 100), cuda::args::bounds<1, 4096>()}; + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 4096); + assert(cuda::args::__lowest_(def) == 5); + assert(cuda::args::__highest_(def) == 100); } // Traits: deferred is single value { - using traits = cuda::argument::__traits>>; + using traits = cuda::args::__traits>>; static_assert(traits::is_deferred); static_assert(traits::is_single_value); } // Traits: deferred with pointer is also single value { - using traits = cuda::argument::__traits>; + using traits = cuda::args::__traits>; static_assert(traits::is_deferred); static_assert(traits::is_single_value); } // Traits: deferred_sequence is not single value { - using traits = cuda::argument::__traits>>; + using traits = cuda::args::__traits>>; static_assert(traits::is_deferred); static_assert(!traits::is_single_value); } @@ -133,16 +133,16 @@ TEST_FUNC constexpr bool test() // Unwrap: deferred { int val = 99; - auto def = cuda::argument::deferred{cuda::std::span{&val, 1}}; - auto& v = cuda::argument::__unwrap(def); + auto def = cuda::args::deferred{cuda::std::span{&val, 1}}; + auto& v = cuda::args::__unwrap(def); assert(v[0] == 99); } // Unwrap: deferred_sequence { int arr[3] = {10, 20, 30}; - auto def = cuda::argument::deferred_sequence{cuda::std::span{arr, 3}}; - const auto& v = cuda::argument::__unwrap(def); + auto def = cuda::args::deferred_sequence{cuda::std::span{arr, 3}}; + const auto& v = cuda::args::__unwrap(def); assert(v.size() == 3); assert(v[1] == 20); } @@ -150,14 +150,14 @@ TEST_FUNC constexpr bool test() // Unwrap: rvalue deferred returns by value { int val = 99; - auto v = cuda::argument::__unwrap(cuda::argument::deferred{cuda::std::span{&val, 1}}); + auto v = cuda::args::__unwrap(cuda::args::deferred{cuda::std::span{&val, 1}}); assert(v[0] == 99); } // Unwrap: rvalue deferred_sequence returns by value { int arr[3] = {10, 20, 30}; - auto v = cuda::argument::__unwrap(cuda::argument::deferred_sequence{cuda::std::span{arr, 3}}); + auto v = cuda::args::__unwrap(cuda::args::deferred_sequence{cuda::std::span{arr, 3}}); assert(v.size() == 3); assert(v[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp index 1ce371b84e5..64bad620293 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp @@ -10,7 +10,7 @@ #include -[[maybe_unused]] cuda::argument::deferred_sequence invalid_arg{0}; +[[maybe_unused]] cuda::args::deferred_sequence invalid_arg{0}; int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp index 41a9a2ae778..111bc226ae5 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp @@ -10,7 +10,7 @@ #include -using traits = cuda::argument::__traits>; +using traits = cuda::args::__traits>; [[maybe_unused]] constexpr bool invalid_traits = traits::is_deferred; diff --git a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp index e969b9bd234..7970c50e2df 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp @@ -25,99 +25,99 @@ TEST_FUNC constexpr bool test() { // Uniform scalar via CTAD { - auto da = cuda::argument::immediate{5}; - assert(cuda::argument::__unwrap(da) == 5); - assert(da.__arg_ == 5); - static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); - assert(cuda::argument::__lowest_(da) == 5); - assert(cuda::argument::__highest_(da) == 5); - da.__arg_ = 6; - assert(cuda::argument::__unwrap(da) == 6); + auto da = cuda::args::immediate{5}; + assert(cuda::args::__unwrap(da) == 5); + assert(cuda::args::__access::__arg(da) == 5); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); + assert(cuda::args::__lowest_(da) == 5); + assert(cuda::args::__highest_(da) == 5); + cuda::args::__access::__arg(da) = 6; + assert(cuda::args::__unwrap(da) == 6); } // Uniform scalar with static bounds { - auto da = cuda::argument::immediate{5, cuda::argument::bounds<1, 8>()}; - assert(cuda::argument::__unwrap(da) == 5); - static_assert(cuda::argument::__traits::lowest == 1); - static_assert(cuda::argument::__traits::highest == 8); - assert(cuda::argument::__lowest_(da) == 5); - assert(cuda::argument::__highest_(da) == 5); + auto da = cuda::args::immediate{5, cuda::args::bounds<1, 8>()}; + assert(cuda::args::__unwrap(da) == 5); + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 8); + assert(cuda::args::__lowest_(da) == 5); + assert(cuda::args::__highest_(da) == 5); } // Non-sequence values are accepted without scalar-only restrictions { - auto da = cuda::argument::immediate{non_sequence_value{7}}; - assert(cuda::argument::__unwrap(da).payload == 7); + auto da = cuda::args::immediate{non_sequence_value{7}}; + assert(cuda::args::__unwrap(da).payload == 7); } // Pointer-like types can still represent a single value when explicitly wrapped that way { int value = 11; - auto da = cuda::argument::immediate{&value}; - static_assert(cuda::argument::__traits::is_single_value); - assert(*cuda::argument::__unwrap(da) == 11); + auto da = cuda::args::immediate{&value}; + static_assert(cuda::args::__traits::is_single_value); + assert(*cuda::args::__unwrap(da) == 11); } // Per-segment span with runtime bounds { int arr[4] = {10, 20, 30, 40}; - auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 4}, cuda::argument::bounds(1L, 100L)}; - assert(cuda::argument::__unwrap(da).size() == 4); - assert(da.__arg_.size() == 4); - assert(da.__runtime_bounds_.__lower_ == 1); - assert(da.__runtime_bounds_.__upper_ == 100); - assert(cuda::argument::__lowest_(da) == 1); - assert(cuda::argument::__highest_(da) == 100); - da.__runtime_bounds_.__upper_ = 90; - assert(cuda::argument::__highest_(da) == 90); + auto da = cuda::args::__immediate_sequence{cuda::std::span{arr, 4}, cuda::args::bounds(1L, 100L)}; + assert(cuda::args::__unwrap(da).size() == 4); + assert(cuda::args::__access::__arg(da).size() == 4); + assert(cuda::args::__access::__runtime_bounds(da).lower() == 1); + assert(cuda::args::__access::__runtime_bounds(da).upper() == 100); + assert(cuda::args::__lowest_(da) == 1); + assert(cuda::args::__highest_(da) == 100); + cuda::args::__access::__runtime_bounds(da) = cuda::args::bounds(1, 90); + assert(cuda::args::__highest_(da) == 90); } // Per-segment span with both bounds { int arr[4] = {10, 20, 30, 40}; - auto da = cuda::argument::immediate_sequence{ - cuda::std::span{arr, 4}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(10, 200)}; - static_assert(cuda::argument::__traits::lowest == 1); - static_assert(cuda::argument::__traits::highest == 256); - assert(cuda::argument::__lowest_(da) == 10); - assert(cuda::argument::__highest_(da) == 200); + auto da = cuda::args::__immediate_sequence{ + cuda::std::span{arr, 4}, cuda::args::bounds<1, 256>(), cuda::args::bounds(10, 200)}; + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 256); + assert(cuda::args::__lowest_(da) == 10); + assert(cuda::args::__highest_(da) == 200); } // Per-segment span with both bounds, runtime bounds first { int arr[4] = {10, 20, 30, 40}; - auto da = cuda::argument::immediate_sequence{ - cuda::std::span{arr, 4}, cuda::argument::bounds(10, 200), cuda::argument::bounds<1, 256>()}; - static_assert(cuda::argument::__traits::lowest == 1); - static_assert(cuda::argument::__traits::highest == 256); - assert(cuda::argument::__lowest_(da) == 10); - assert(cuda::argument::__highest_(da) == 200); + auto da = cuda::args::__immediate_sequence{ + cuda::std::span{arr, 4}, cuda::args::bounds(10, 200), cuda::args::bounds<1, 256>()}; + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 256); + assert(cuda::args::__lowest_(da) == 10); + assert(cuda::args::__highest_(da) == 200); } // Per-segment via span { int arr[4] = {1, 2, 3, 4}; - auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 4}}; - assert(cuda::argument::__unwrap(da).size() == 4); - assert(cuda::argument::__unwrap(da)[0] == 1); - assert(cuda::argument::__unwrap(da)[3] == 4); + auto da = cuda::args::__immediate_sequence{cuda::std::span{arr, 4}}; + assert(cuda::args::__unwrap(da).size() == 4); + assert(cuda::args::__unwrap(da)[0] == 1); + assert(cuda::args::__unwrap(da)[3] == 4); } // Per-segment with static bounds { int arr[4] = {10, 20, 30, 40}; - auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 4}, cuda::argument::bounds<1, 100>()}; - assert(cuda::argument::__unwrap(da).size() == 4); - assert(cuda::argument::__unwrap(da)[2] == 30); - static_assert(cuda::argument::__traits::lowest == 1); - static_assert(cuda::argument::__traits::highest == 100); + auto da = cuda::args::__immediate_sequence{cuda::std::span{arr, 4}, cuda::args::bounds<1, 100>()}; + assert(cuda::args::__unwrap(da).size() == 4); + assert(cuda::args::__unwrap(da)[2] == 30); + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 100); } // Traits { - using traits = cuda::argument::__traits>; + using traits = cuda::args::__traits>; static_assert(!traits::is_deferred); static_assert(traits::is_single_value); static_assert(cuda::std::is_same_v); @@ -125,7 +125,7 @@ TEST_FUNC constexpr bool test() // Sequence traits { - using traits = cuda::argument::__traits>>; + using traits = cuda::args::__traits>>; static_assert(!traits::is_deferred); static_assert(!traits::is_single_value); static_assert(cuda::std::is_same_v>); @@ -133,39 +133,38 @@ TEST_FUNC constexpr bool test() // __is_sequence_v on unwrapped types { - static_assert( - !cuda::argument::__is_sequence_v>::value_type>); - static_assert(!cuda::argument::__traits>>::is_single_value); + static_assert(!cuda::args::__is_sequence_v>::value_type>); + static_assert(!cuda::args::__traits>>::is_single_value); } // Unwrap: scalar { - auto da = cuda::argument::immediate{7}; - auto& v = cuda::argument::__unwrap(da); + auto da = cuda::args::immediate{7}; + auto& v = cuda::args::__unwrap(da); assert(v == 7); v = 8; - assert(cuda::argument::__unwrap(da) == 8); + assert(cuda::args::__unwrap(da) == 8); } // Unwrap: span { int arr[3] = {10, 20, 30}; - auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 3}}; - const auto& v = cuda::argument::__unwrap(da); + auto da = cuda::args::__immediate_sequence{cuda::std::span{arr, 3}}; + const auto& v = cuda::args::__unwrap(da); assert(v.size() == 3); assert(v[1] == 20); } // Unwrap: rvalue scalar returns by value { - const auto& v = cuda::argument::__unwrap(cuda::argument::immediate{7}); + const auto& v = cuda::args::__unwrap(cuda::args::immediate{7}); assert(v == 7); } // Unwrap: rvalue span returns by value { int arr[3] = {10, 20, 30}; - auto v = cuda::argument::__unwrap(cuda::argument::immediate_sequence{cuda::std::span{arr, 3}}); + auto v = cuda::args::__unwrap(cuda::args::__immediate_sequence{cuda::std::span{arr, 3}}); assert(v.size() == 3); assert(v[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp index cb6e78cade3..f3cc7a2a993 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp @@ -24,72 +24,76 @@ TEST_FUNC void test() { // Basic value { - constexpr auto sa = cuda::argument::constant<42>{}; - static_assert(sa.value() == 42); + constexpr auto sa = cuda::args::constant<42>{}; + static_assert(cuda::args::__unwrap(sa) == 42); static_assert(cuda::std::is_same_v); } // Different types { - constexpr auto sa_long = cuda::argument::constant<100L>{}; - static_assert(sa_long.value() == 100L); + constexpr auto sa_long = cuda::args::constant<100L>{}; + static_assert(cuda::args::__unwrap(sa_long) == 100L); static_assert(cuda::std::is_same_v); + + constexpr auto sa_float = cuda::args::constant<10, float>{}; + static_assert(cuda::args::__unwrap(sa_float) == 10.0f); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); } // Negative value { - constexpr auto sa_neg = cuda::argument::constant<-1>{}; - static_assert(sa_neg.value() == -1); + constexpr auto sa_neg = cuda::args::constant<-1>{}; + static_assert(cuda::args::__unwrap(sa_neg) == -1); } #if TEST_HAS_CLASS_NTTP // Non-sequence values are accepted without scalar-only restrictions { - constexpr auto sa = cuda::argument::constant{}; - static_assert(sa.value().payload == 7); - static_assert(cuda::argument::__unwrap(sa).payload == 7); + constexpr auto sa = cuda::args::constant{}; + static_assert(cuda::args::__unwrap(sa).payload == 7); } #endif // TEST_HAS_CLASS_NTTP #if TEST_HAS_CLASS_NTTP // Array sequence { - constexpr auto sa_arr = cuda::argument::constant_sequence{128, 256, 512}>{}; - static_assert(sa_arr.value()[0] == 128); - static_assert(sa_arr.value()[1] == 256); - static_assert(sa_arr.value()[2] == 512); + constexpr auto sa_arr = cuda::args::__constant_sequence{128, 256, 512}>{}; + static_assert(cuda::args::__unwrap(sa_arr)[0] == 128); + static_assert(cuda::args::__unwrap(sa_arr)[1] == 256); + static_assert(cuda::args::__unwrap(sa_arr)[2] == 512); static_assert(cuda::std::is_same_v>); } #endif // TEST_HAS_CLASS_NTTP // Bounds: scalar { - constexpr auto sa = cuda::argument::constant<42>{}; - static_assert(cuda::argument::__lowest_(sa) == 42); - static_assert(cuda::argument::__highest_(sa) == 42); + constexpr auto sa = cuda::args::constant<42>{}; + static_assert(cuda::args::__lowest_(sa) == 42); + static_assert(cuda::args::__highest_(sa) == 42); } #if TEST_HAS_CLASS_NTTP // Bounds: array sequence computes lowest/highest of elements { - constexpr auto sa = cuda::argument::constant_sequence{128, 256, 512}>{}; - static_assert(cuda::argument::__lowest_(sa) == 128); - static_assert(cuda::argument::__highest_(sa) == 512); + constexpr auto sa = cuda::args::__constant_sequence{128, 256, 512}>{}; + static_assert(cuda::args::__lowest_(sa) == 128); + static_assert(cuda::args::__highest_(sa) == 512); } #endif // TEST_HAS_CLASS_NTTP #if TEST_HAS_CLASS_NTTP // Bounds: empty array sequence has unconstrained element bounds { - constexpr auto sa = cuda::argument::constant_sequence{}>{}; - static_assert(cuda::argument::__lowest_(sa) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__highest_(sa) == (cuda::std::numeric_limits::max)()); + constexpr auto sa = cuda::args::__constant_sequence{}>{}; + static_assert(cuda::args::__lowest_(sa) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__highest_(sa) == (cuda::std::numeric_limits::max)()); } #endif // TEST_HAS_CLASS_NTTP // Traits { - using traits = cuda::argument::__traits>; + using traits = cuda::args::__traits>; static_assert(!traits::is_deferred); static_assert(traits::is_constant); static_assert(traits::is_single_value); @@ -98,10 +102,22 @@ TEST_FUNC void test() static_assert(traits::highest == 42); } + // Traits: explicit constant value type + { + using traits = cuda::args::__traits>; + static_assert(!traits::is_deferred); + static_assert(traits::is_constant); + static_assert(traits::is_single_value); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(traits::lowest == 10.0f); + static_assert(traits::highest == 10.0f); + } + #if TEST_HAS_CLASS_NTTP // Sequence traits { - using traits = cuda::argument::__traits{1, 2, 3}>>; + using traits = cuda::args::__traits{1, 2, 3}>>; static_assert(traits::is_constant); static_assert(!traits::is_deferred); static_assert(!traits::is_single_value); @@ -112,26 +128,33 @@ TEST_FUNC void test() // Single value: scalar is single, sequence is not { - static_assert( - !cuda::argument::__is_sequence_v>::value_type>); + static_assert(!cuda::args::__is_sequence_v>::value_type>); #if TEST_HAS_CLASS_NTTP static_assert( - !cuda::argument::__traits{1, 2, 3}>>::is_single_value); + !cuda::args::__traits{1, 2, 3}>>::is_single_value); #endif // TEST_HAS_CLASS_NTTP } // Unwrap: scalar { - constexpr auto sa = cuda::argument::constant<42>{}; - constexpr auto val = cuda::argument::__unwrap(sa); + constexpr auto sa = cuda::args::constant<42>{}; + constexpr auto val = cuda::args::__unwrap(sa); static_assert(val == 42); } + // Unwrap: scalar with explicit value type + { + constexpr auto sa = cuda::args::constant<10, float>{}; + constexpr auto val = cuda::args::__unwrap(sa); + static_assert(val == 10.0f); + static_assert(cuda::std::is_same_v); + } + #if TEST_HAS_CLASS_NTTP // Unwrap: sequence { - constexpr auto sa = cuda::argument::constant_sequence{10, 20, 30}>{}; - constexpr auto val = cuda::argument::__unwrap(sa); + constexpr auto sa = cuda::args::__constant_sequence{10, 20, 30}>{}; + constexpr auto val = cuda::args::__unwrap(sa); static_assert(val[0] == 10); static_assert(val[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp index 66a1828769b..b59d41fd7a1 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp @@ -10,7 +10,7 @@ #include -using arg_t = cuda::argument::immediate>; +using arg_t = cuda::args::immediate>; [[maybe_unused]] arg_t invalid_arg{0}; diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp index 7f9902e50a2..5212c8a1f9c 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp @@ -10,9 +10,9 @@ #include -using arg_t = cuda::argument::immediate>; +using arg_t = cuda::args::immediate>; -[[maybe_unused]] constexpr auto invalid_highest = cuda::argument::__traits::highest; +[[maybe_unused]] constexpr auto invalid_highest = cuda::args::__traits::highest; int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp index 8170fdd7ee5..02ba7ecfe96 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp @@ -10,7 +10,7 @@ #include -[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::static_bounds<0, 1L>{}; +[[maybe_unused]] constexpr auto invalid_bounds = cuda::args::static_bounds<0, 1L>{}; int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp index b7aca57bbdf..eada29e23de 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp @@ -10,7 +10,7 @@ // Integration test: demonstrates how an algorithm consumes argument wrappers // to make compile-time and runtime resource decisions. -// All argument types (plain values, static, dynamic, deferred) work uniformly +// All argument types (plain values, constants, immediate values, deferred values) work uniformly // through the free functions. #include @@ -34,7 +34,7 @@ enum class algorithm_variant template TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg) { - if constexpr (cuda::argument::__traits<_SegSizeArg>::highest <= shared_memory_capacity) + if constexpr (cuda::args::__traits<_SegSizeArg>::highest <= shared_memory_capacity) { return algorithm_variant::shared_memory; } @@ -48,7 +48,7 @@ TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg) template TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_segments) { - auto __highest = cuda::std::min(default_max_segment_size, static_cast(cuda::argument::__highest_(__seg_size))); + auto __highest = cuda::std::min(default_max_segment_size, static_cast(cuda::args::__highest_(__seg_size))); return __highest * __num_segments; } @@ -56,9 +56,9 @@ TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_se template TEST_FUNC constexpr int process_segments(_SegSizeArg __seg_size) { - const auto& __val = cuda::argument::__unwrap(__seg_size); + const auto& __val = cuda::args::__unwrap(__seg_size); - if constexpr (cuda::argument::__traits<_SegSizeArg>::is_single_value) + if constexpr (cuda::args::__traits<_SegSizeArg>::is_single_value) { return static_cast(__val); } @@ -93,82 +93,82 @@ TEST_FUNC constexpr bool test() } #endif - // static_argument: scalar, fits in shared memory, buffer = value + // constant: scalar, fits in shared memory, buffer = value { - constexpr auto seg_size = cuda::argument::constant<128>{}; + constexpr auto seg_size = cuda::args::constant<128>{}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 128 * 4); assert(process_segments(seg_size) == 128); } #if TEST_HAS_CLASS_NTTP - // static_argument: array sequence, highest fits in shared memory + // __constant_sequence: array sequence, highest fits in shared memory { - constexpr auto seg_sizes = cuda::argument::constant_sequence{}; + constexpr auto seg_sizes = cuda::args::__constant_sequence{}; static_assert(select_variant(seg_sizes) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_sizes, 3) == 256 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 256); } - // static_argument: array sequence, highest exceeds shared memory, buffer clamped + // __constant_sequence: array sequence, highest exceeds shared memory, buffer clamped { - constexpr auto seg_sizes = cuda::argument::constant_sequence{}; + constexpr auto seg_sizes = cuda::args::__constant_sequence{}; static_assert(select_variant(seg_sizes) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_sizes, 3) == 512 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 512); } #endif // TEST_HAS_CLASS_NTTP - // dynamic_argument: tight static bounds, shared memory, buffer = value + // immediate: tight static bounds, shared memory, buffer = value { - constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 256>()}; + constexpr auto seg_size = cuda::args::immediate{100, cuda::args::bounds<1, 256>()}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); } - // dynamic_argument: wide static bounds, global memory, buffer = value + // immediate: wide static bounds, global memory, buffer = value { - constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 4096>()}; + constexpr auto seg_size = cuda::args::immediate{100, cuda::args::bounds<1, 4096>()}; static_assert(select_variant(seg_size) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); } - // dynamic_argument: no bounds, global memory, buffer = value + // immediate: no bounds, global memory, buffer = value { - constexpr auto seg_size = cuda::argument::immediate{100}; + constexpr auto seg_size = cuda::args::immediate{100}; static_assert(select_variant(seg_size) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); } - // dynamic_argument: per-segment span with runtime bounds only + // __immediate_sequence: per-segment span with runtime bounds only { int sizes[3] = {64, 128, 96}; - auto seg_sizes = cuda::argument::immediate_sequence{cuda::std::span{sizes, 3}, cuda::argument::bounds(1, 200)}; + auto seg_sizes = cuda::args::__immediate_sequence{cuda::std::span{sizes, 3}, cuda::args::bounds(1, 200)}; assert(select_variant(seg_sizes) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_sizes, 3) == 200 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 96); } - // dynamic_argument: per-segment span with both bounds + // __immediate_sequence: per-segment span with both bounds { int sizes[3] = {64, 128, 96}; - auto seg_sizes = cuda::argument::immediate_sequence{ - cuda::std::span{sizes, 3}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)}; - static_assert(cuda::argument::__traits::highest <= shared_memory_capacity); + auto seg_sizes = cuda::args::__immediate_sequence{ + cuda::std::span{sizes, 3}, cuda::args::bounds<1, 256>(), cuda::args::bounds(1, 200)}; + static_assert(cuda::args::__traits::highest <= shared_memory_capacity); assert(select_variant(seg_sizes) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_sizes, 3) == 200 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 96); } - // deferred_argument: uniform, bounds for decisions only + // deferred: uniform, bounds for decisions only { - int val = 100; - auto seg_size = cuda::argument::deferred{ - cuda::std::span{&val, 1}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)}; - static_assert(cuda::argument::__traits::highest <= shared_memory_capacity); + int val = 100; + auto seg_size = + cuda::args::deferred{cuda::std::span{&val, 1}, cuda::args::bounds<1, 256>(), cuda::args::bounds(1, 200)}; + static_assert(cuda::args::__traits::highest <= shared_memory_capacity); assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 200 * 4); } @@ -181,17 +181,24 @@ TEST_FUNC constexpr bool test() assert(process_segments(1.0f) == 1); } + // constant float using an integer NTTP and explicit value type + { + constexpr auto seg_size = cuda::args::constant<128, float>{}; + static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); + assert(process_segments(seg_size) == 128); + } + #if TEST_HAS_CLASS_NTTP - // static_argument float (float NTTPs require C++20) + // constant float (float NTTPs require C++20) { - constexpr auto seg_size = cuda::argument::constant<128.0f>{}; + constexpr auto seg_size = cuda::args::constant<128.0f>{}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(process_segments(seg_size) == 128); } - // dynamic_argument float with static bounds + // immediate float with static bounds { - constexpr auto seg_size = cuda::argument::immediate{100.0f, cuda::argument::bounds<1.0f, 256.0f>()}; + constexpr auto seg_size = cuda::args::immediate{100.0f, cuda::args::bounds<1.0f, 256.0f>()}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(process_segments(seg_size) == 100); } From b870d3aa4d6db61f5f44221d4b669ce3c29314c6 Mon Sep 17 00:00:00 2001 From: Piotr Ciolkosz Date: Tue, 9 Jun 2026 17:08:45 -0700 Subject: [PATCH 6/6] Chnage the test dispatch for now --- .../catch2_test_device_segmented_topk_keys.cu | 19 ++++---- ...catch2_test_device_segmented_topk_pairs.cu | 46 +++++++++++++++++-- 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu index 7e9ad93ca9e..fe3e97188ac 100644 --- a/cub/test/catch2_test_device_segmented_topk_keys.cu +++ b/cub/test/catch2_test_device_segmented_topk_keys.cu @@ -27,11 +27,11 @@ struct is_minus_zero } }; -template CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys( @@ -41,7 +41,6 @@ CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys( KeyOutputItItT d_key_segments_out_it, SegmentSizeParamT segment_sizes, KParamT k, - SelectDirectionT select_direction, NumSegmentsParameterT num_segments, TotalNumItemsGuaranteeT total_num_items_guarantee, cudaStream_t stream = nullptr) @@ -56,14 +55,15 @@ CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys( values_it, segment_sizes, k, - select_direction, + ::cuda::args::constant{}, num_segments, total_num_items_guarantee, stream); } // %PARAM% TEST_LAUNCH lid 0:1:2 -DECLARE_LAUNCH_WRAPPER(dispatch_batched_topk_keys, batched_topk_keys); +DECLARE_TMPL_LAUNCH_WRAPPER( + dispatch_batched_topk_keys, batched_topk_keys, cub::detail::topk::select Direction, Direction); // Total segment size using max_segment_size_list = c2h::enum_type_list; @@ -153,12 +153,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments", c2h::device_vector expected_keys(keys_in_buffer); // Run the top-k algorithm - batched_topk_keys( + batched_topk_keys( d_keys_in, d_keys_out, ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, ::cuda::args::immediate{k, ::cuda::args::bounds()}, - ::cuda::args::constant{}, ::cuda::args::immediate{num_segments}, ::cuda::args::immediate{num_segments * segment_size}); // Prepare expected results @@ -251,13 +250,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment c2h::device_vector expected_keys(keys_in_buffer); // Run the top-k algorithm - batched_topk_keys( + batched_topk_keys( d_keys_in, d_keys_out, ::cuda::args::__immediate_sequence{ segment_size_it, ::cuda::args::bounds()}, ::cuda::args::immediate{k, ::cuda::args::bounds()}, - ::cuda::args::constant{}, ::cuda::args::immediate{num_segments}, ::cuda::args::immediate{num_items}); @@ -289,12 +287,11 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment auto d_keys_out_it = cuda::make_strided_iterator(cuda::make_counting_iterator(thrust::raw_pointer_cast(d_keys_out.data())), k); - batched_topk_keys( + batched_topk_keys( d_keys_in_it, d_keys_out_it, ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, ::cuda::args::immediate{k, ::cuda::args::bounds()}, - ::cuda::args::constant{}, ::cuda::args::immediate{num_segments}, ::cuda::args::immediate{num_segments * segment_size}); diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu index cef26687cbd..6ba474daf81 100644 --- a/cub/test/catch2_test_device_segmented_topk_pairs.cu +++ b/cub/test/catch2_test_device_segmented_topk_pairs.cu @@ -47,8 +47,46 @@ struct flag_intra_segment_duplicates template flag_intra_segment_duplicates(ItemItT, SegIdItT) -> flag_intra_segment_duplicates; +template +CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_pairs( + void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputItItT d_key_segments_it, + KeyOutputItItT d_key_segments_out_it, + ValueInputItItT d_value_segments_it, + ValueOutputItItT d_value_segments_out_it, + SegmentSizeParameterT segment_sizes, + KParameterT k, + NumSegmentsParameterT num_segments, + TotalNumItemsGuaranteeT total_num_items_guarantee, + cudaStream_t stream = nullptr) +{ + return cub::detail::batched_topk::dispatch( + d_temp_storage, + temp_storage_bytes, + d_key_segments_it, + d_key_segments_out_it, + d_value_segments_it, + d_value_segments_out_it, + segment_sizes, + k, + ::cuda::args::constant{}, + num_segments, + total_num_items_guarantee, + stream); +} + // %PARAM% TEST_LAUNCH lid 0:1:2 -DECLARE_LAUNCH_WRAPPER(cub::detail::batched_topk::dispatch, batched_topk_pairs); +DECLARE_TMPL_LAUNCH_WRAPPER( + dispatch_batched_topk_pairs, batched_topk_pairs, cub::detail::topk::select Direction, Direction); // Total segment size using max_segment_size_list = c2h::enum_type_list; @@ -220,14 +258,13 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small fixed-size segments" c2h::device_vector expected_keys(keys_in_buffer); // Run the top-k algorithm - batched_topk_pairs( + batched_topk_pairs( d_keys_in, d_keys_out, d_values_in, d_values_out, ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, ::cuda::args::immediate{k, ::cuda::args::bounds()}, - ::cuda::args::constant{}, ::cuda::args::immediate{num_segments}, ::cuda::args::immediate{num_segments * segment_size}); @@ -341,7 +378,7 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen c2h::device_vector expected_keys(keys_in_buffer); // Run the top-k algorithm - batched_topk_pairs( + batched_topk_pairs( d_keys_in, d_keys_out, d_values_in, @@ -349,7 +386,6 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen ::cuda::args::__immediate_sequence{ segment_size_it, ::cuda::args::bounds()}, ::cuda::args::immediate{k, ::cuda::args::bounds()}, - ::cuda::args::constant{}, ::cuda::args::immediate{num_segments}, ::cuda::args::immediate{num_items});