diff --git a/cub/benchmarks/bench/segmented_topk/variable/indexed.cu b/cub/benchmarks/bench/segmented_topk/variable/indexed.cu index 488d3aa4439..8cb72ce26f7 100644 --- a/cub/benchmarks/bench/segmented_topk/variable/indexed.cu +++ b/cub/benchmarks/bench/segmented_topk/variable/indexed.cu @@ -36,18 +36,18 @@ void decode_style_variable_topk_indexed( static_cast(MaxSegmentSize)); const auto input_elements = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end()); const auto output_elements = static_cast(num_segments) * K; - const auto total_num_items = ::cuda::args::immediate{static_cast(input_elements)}; + const auto total_num_items = cuda::args::immediate{static_cast(input_elements)}; auto in_keys_buffer = gen_data( num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data())); auto out_keys_buffer = thrust::device_vector(output_elements, thrust::no_init); auto out_indices_buffer = thrust::device_vector(output_elements, thrust::no_init); - auto segment_sizes_param = ::cuda::args::__immediate_sequence{ - thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::args::bounds<1, MaxSegmentSize>()}; - auto k_param = ::cuda::args::constant{}; - auto select_direction = ::cuda::args::constant{}; - auto num_segments_param = ::cuda::args::immediate{static_cast(num_segments)}; + auto segment_sizes_param = cuda::args::deferred_sequence{ + thrust::raw_pointer_cast(d_segment_sizes.data()), cuda::args::bounds<1, MaxSegmentSize>()}; + auto k_param = cuda::args::constant{}; + auto select_direction = cuda::args::constant{}; + auto num_segments_param = cuda::args::immediate{static_cast(num_segments)}; auto d_keys_in = cuda::make_strided_iterator( cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())), diff --git a/cub/benchmarks/bench/segmented_topk/variable/keys.cu b/cub/benchmarks/bench/segmented_topk/variable/keys.cu index 5a54ad1fbaa..f5c6e4e4949 100644 --- a/cub/benchmarks/bench/segmented_topk/variable/keys.cu +++ b/cub/benchmarks/bench/segmented_topk/variable/keys.cu @@ -32,17 +32,17 @@ void decode_style_variable_topk_keys( static_cast(MaxSegmentSize)); const auto input_elements = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end()); const auto output_elements = static_cast(num_segments) * K; - const auto total_num_items = ::cuda::args::immediate{static_cast(input_elements)}; + const auto total_num_items = cuda::args::immediate{static_cast(input_elements)}; auto in_keys_buffer = gen_data( num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data())); auto out_keys_buffer = thrust::device_vector(output_elements, thrust::no_init); - auto segment_sizes_param = ::cuda::args::__immediate_sequence{ - thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::args::bounds<1, MaxSegmentSize>()}; - auto k_param = ::cuda::args::constant{}; - auto select_direction = ::cuda::args::constant{}; - auto num_segments_param = ::cuda::args::immediate{static_cast(num_segments)}; + auto segment_sizes_param = cuda::args::deferred_sequence{ + thrust::raw_pointer_cast(d_segment_sizes.data()), cuda::args::bounds<1, MaxSegmentSize>()}; + auto k_param = cuda::args::constant{}; + auto select_direction = cuda::args::constant{}; + auto num_segments_param = cuda::args::immediate{static_cast(num_segments)}; auto d_keys_in = cuda::make_strided_iterator( cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())), diff --git a/cub/cub/detail/segmented_params.cuh b/cub/cub/detail/segmented_params.cuh index 41ba334eaef..0ece8092d0c 100644 --- a/cub/cub/detail/segmented_params.cuh +++ b/cub/cub/detail/segmented_params.cuh @@ -53,13 +53,6 @@ get_param(const ::cuda::args::constant<_Value, _Tp>& __arg, [[maybe_unused]] _Se return ::cuda::args::__unwrap(__arg); } -template -[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::args::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept -{ - return ::cuda::args::__unwrap(__arg)[__index]; -} - template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(const ::cuda::args::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept @@ -67,13 +60,6 @@ get_param(const ::cuda::args::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unu return ::cuda::args::__unwrap(__arg); } -template -[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::args::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept -{ - return ::cuda::args::__unwrap(__arg)[__index]; -} - template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(const ::cuda::args::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu index fe3e97188ac..0523c1135cd 100644 --- a/cub/test/catch2_test_device_segmented_topk_keys.cu +++ b/cub/test/catch2_test_device_segmented_topk_keys.cu @@ -55,7 +55,7 @@ CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys( values_it, segment_sizes, k, - ::cuda::args::constant{}, + cuda::args::constant{}, num_segments, total_num_items_guarantee, stream); @@ -71,19 +71,29 @@ using max_segment_size_list = c2h::enum_type_list; // Segment size: static, uniform using max_num_k_list = c2h::enum_type_list; +// %PARAM% TEST_TYPES types 0:1:2 + +#if TEST_TYPES == 0 +using key_types = + c2h::type_list; +// clang-format on +#elif TEST_TYPES == 1 +using key_types = c2h::type_list; +#elif TEST_TYPES == 2 using key_types = - c2h::type_list; + #if TEST_BF_T() + , bfloat16_t + #endif // TEST_BF_T() + >; // clang-format on +#endif // Selection direction is a compile-time option; cover both as a static test axis. using select_direction_list = @@ -156,10 +166,10 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments", batched_topk_keys( d_keys_in, d_keys_out, - ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, - ::cuda::args::immediate{k, ::cuda::args::bounds()}, - ::cuda::args::immediate{num_segments}, - ::cuda::args::immediate{num_segments * segment_size}); + cuda::args::immediate{segment_size, cuda::args::bounds()}, + cuda::args::immediate{k, cuda::args::bounds()}, + cuda::args::immediate{num_segments}, + cuda::args::immediate{num_segments * segment_size}); // Prepare expected results fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction); compact_sorted_keys_to_topk(expected_keys, segment_size, k); @@ -228,7 +238,7 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment // Each output segment holds exactly min(k, segment_size[i]) items, tightly packed. auto compacted_output_sizes_it = cuda::make_transform_iterator( cuda::make_counting_iterator(segment_index_t{0}), - get_output_size_op{segment_offsets.cbegin(), cuda::constant_iterator(k)}); + get_output_size_op{segment_offsets.cbegin(), cuda::constant_iterator(k), num_segments}); c2h::device_vector compacted_offsets(num_segments + 1, thrust::no_init); thrust::exclusive_scan( compacted_output_sizes_it, compacted_output_sizes_it + num_segments + 1, compacted_offsets.begin()); @@ -253,11 +263,10 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment batched_topk_keys( d_keys_in, d_keys_out, - ::cuda::args::__immediate_sequence{ - segment_size_it, ::cuda::args::bounds()}, - ::cuda::args::immediate{k, ::cuda::args::bounds()}, - ::cuda::args::immediate{num_segments}, - ::cuda::args::immediate{num_items}); + cuda::args::deferred_sequence{segment_size_it, cuda::args::bounds()}, + cuda::args::immediate{k, cuda::args::bounds()}, + cuda::args::immediate{num_segments}, + cuda::args::immediate{num_items}); // Verify keys are returned correctly: sort each segment of the expected input, then compact the top-k segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction); @@ -270,6 +279,203 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment REQUIRE(expected_keys == keys_out_buffer); } +C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with fixed-size segments and per-segment k", + "[keys][segmented][topk][device]", + key_types, + max_segment_size_list, + max_num_k_list, + select_direction_list) +{ + using segment_size_t = cuda::std::int64_t; + using segment_index_t = cuda::std::int64_t; + + using key_t = c2h::get<0, TestType>; + + // Statically constrained maximum segment size and k + constexpr segment_size_t static_max_segment_size = c2h::get<1, TestType>::value; + constexpr segment_size_t static_max_k = c2h::get<2, TestType>::value; + + // Selection direction comes from the compile-time test axis. + constexpr auto direction = c2h::get<3, TestType>::value; + + // Generate the (uniform) input segment size. Unlike the uniform-k tests, k still varies per segment below. + constexpr segment_size_t min_segment_size = 1; + constexpr auto max_segment_size = static_max_segment_size; + const segment_size_t segment_size = GENERATE_COPY(values({min_segment_size, segment_size_t{3}, max_segment_size}), + take(2, random(min_segment_size, max_segment_size))); + + // Skip invalid combinations + if (segment_size > max_segment_size) + { + SKIP("The given segment size may not exceed the maximum segment size, we statically constrained the algorithm on."); + } + + // Generate number of segments + const segment_index_t num_segments = GENERATE_COPY( + values({segment_index_t{1}, segment_index_t{42}}), take(2, random(segment_index_t{1}, segment_index_t{1000}))); + + // Generate a per-segment k in [1, static_max_k] + c2h::device_vector segment_k(num_segments, thrust::no_init); + c2h::gen(C2H_SEED(1), segment_k, segment_size_t{1}, static_max_k); + + // Capture test parameters + CAPTURE(c2h::type_name(), + c2h::type_name(), + c2h::type_name(), + static_max_segment_size, + static_max_k, + segment_size, + num_segments, + direction); + + // Materialize fixed-size input offsets: [0, segment_size, 2 * segment_size, ...] + auto fixed_offsets_it = cuda::make_strided_iterator(cuda::make_counting_iterator(0), segment_size); + c2h::device_vector segment_offsets(num_segments + 1, thrust::no_init); + thrust::copy(fixed_offsets_it, fixed_offsets_it + (num_segments + 1), segment_offsets.begin()); + + // Compute compacted output offsets: each output segment holds exactly min(k[i], segment_size) items, tightly packed. + auto compacted_output_sizes_it = cuda::make_transform_iterator( + cuda::make_counting_iterator(segment_index_t{0}), + get_output_size_op{segment_offsets.cbegin(), segment_k.cbegin(), num_segments}); + c2h::device_vector compacted_offsets(num_segments + 1, thrust::no_init); + thrust::exclusive_scan( + compacted_output_sizes_it, compacted_output_sizes_it + num_segments + 1, compacted_offsets.begin()); + segment_size_t total_output_size = compacted_offsets.back(); + + // Prepare input & output. Input segments are fixed-size (strided); output segments are compacted (variable). + c2h::device_vector keys_in_buffer(num_segments * segment_size, thrust::no_init); + c2h::device_vector keys_out_buffer(total_output_size, thrust::no_init); + const int num_key_seeds = 1; + c2h::gen(C2H_SEED(num_key_seeds), keys_in_buffer); + auto d_keys_in_ptr = thrust::raw_pointer_cast(keys_in_buffer.data()); + auto d_keys_out_ptr = thrust::raw_pointer_cast(keys_out_buffer.data()); + auto d_keys_in = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size); + auto d_keys_out = + cuda::make_permutation_iterator(cuda::make_counting_iterator(d_keys_out_ptr), compacted_offsets.cbegin()); + + // Copy input for verification + c2h::device_vector expected_keys(keys_in_buffer); + + // Run the top-k algorithm with a per-segment k passed as an immediate sequence + batched_topk_keys( + d_keys_in, + d_keys_out, + cuda::args::immediate{segment_size, cuda::args::bounds()}, + cuda::args::deferred_sequence{ + thrust::raw_pointer_cast(segment_k.data()), cuda::args::bounds()}, + cuda::args::immediate{num_segments}, + cuda::args::immediate{num_segments * segment_size}); + + // Prepare expected results: sort each fixed-size input segment, then compact each to its per-segment top-k. + fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction); + expected_keys = compact_to_topk_batched(expected_keys, segment_offsets, segment_k.cbegin()); + + // Since the results of top-k are unordered, sort compacted output segments before comparison. + segmented_sort_keys( + keys_out_buffer, num_segments, compacted_offsets.cbegin(), compacted_offsets.cbegin() + 1, direction); + + REQUIRE(expected_keys == keys_out_buffer); +} + +C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with variable-size segments and per-segment k", + "[keys][segmented][topk][device]", + key_types, + max_segment_size_list, + max_num_k_list, + select_direction_list) +{ + using segment_size_t = cuda::std::int64_t; + using segment_index_t = cuda::std::int64_t; + + using key_t = c2h::get<0, TestType>; + + // Statically constrained maximum segment size and k + constexpr segment_size_t static_max_segment_size = c2h::get<1, TestType>::value; + constexpr segment_size_t static_max_k = c2h::get<2, TestType>::value; + + // Selection direction comes from the compile-time test axis. + constexpr auto direction = c2h::get<3, TestType>::value; + + constexpr segment_size_t min_items = 1; + constexpr segment_size_t max_items = 1'000'000; + + // Number of items + const segment_size_t num_items = GENERATE_COPY( + take(2, random(min_items, max_items)), + values({ + min_items, + max_items, + })); + + // Generate segment sizes + constexpr segment_size_t min_segment_size = 1; + constexpr auto max_segment_size = static_max_segment_size; + c2h::device_vector segment_offsets = + c2h::gen_uniform_offsets(C2H_SEED(3), num_items, min_segment_size, max_segment_size); + const segment_index_t num_segments = static_cast(segment_offsets.size() - 1); + auto segment_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); + auto segment_size_it = cuda::make_transform_iterator( + cuda::make_counting_iterator(segment_index_t{0}), segment_size_op{segment_offsets_it}); + + // Generate a per-segment k in [1, static_max_k] + c2h::device_vector segment_k(num_segments, thrust::no_init); + c2h::gen(C2H_SEED(1), segment_k, segment_size_t{1}, static_max_k); + + // Capture test parameters + CAPTURE(c2h::type_name(), + c2h::type_name(), + c2h::type_name(), + static_max_segment_size, + static_max_k, + num_segments, + direction); + + // Compute compacted output offsets: + // Each output segment holds exactly min(k[i], segment_size[i]) items, tightly packed. + auto compacted_output_sizes_it = cuda::make_transform_iterator( + cuda::make_counting_iterator(segment_index_t{0}), + get_output_size_op{segment_offsets.cbegin(), segment_k.cbegin(), num_segments}); + c2h::device_vector compacted_offsets(num_segments + 1, thrust::no_init); + thrust::exclusive_scan( + compacted_output_sizes_it, compacted_output_sizes_it + num_segments + 1, compacted_offsets.begin()); + segment_size_t total_output_size = compacted_offsets.back(); + + // Prepare keys input & output + c2h::device_vector keys_in_buffer(num_items, thrust::no_init); + c2h::device_vector keys_out_buffer(total_output_size, thrust::no_init); + const int num_key_seeds = 1; + c2h::gen(C2H_SEED(num_key_seeds), keys_in_buffer); + auto d_keys_in_ptr = thrust::raw_pointer_cast(keys_in_buffer.data()); + auto d_keys_out_ptr = thrust::raw_pointer_cast(keys_out_buffer.data()); + auto d_keys_in = + cuda::make_permutation_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_offsets.cbegin()); + auto d_keys_out = + cuda::make_permutation_iterator(cuda::make_counting_iterator(d_keys_out_ptr), compacted_offsets.cbegin()); + + // Copy input for verification + c2h::device_vector expected_keys(keys_in_buffer); + + // Run the top-k algorithm with a per-segment k passed as an immediate sequence + batched_topk_keys( + d_keys_in, + d_keys_out, + cuda::args::deferred_sequence{segment_size_it, cuda::args::bounds()}, + cuda::args::deferred_sequence{ + thrust::raw_pointer_cast(segment_k.data()), cuda::args::bounds()}, + cuda::args::immediate{num_segments}, + cuda::args::immediate{num_items}); + + // Verify keys are returned correctly: sort each segment of the expected input, then compact the per-segment top-k + segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction); + expected_keys = compact_to_topk_batched(expected_keys, segment_offsets, segment_k.cbegin()); + + // Since the results of top-k are unordered, sort compacted output segments before comparison + segmented_sort_keys( + keys_out_buffer, num_segments, compacted_offsets.cbegin(), compacted_offsets.cbegin() + 1, direction); + + REQUIRE(expected_keys == keys_out_buffer); +} + // Regression test: top-k must preserve -0.0f in the output (not normalize to +0.0f). C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segmented][topk][device][float]") { @@ -290,10 +496,10 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment batched_topk_keys( d_keys_in_it, d_keys_out_it, - ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, - ::cuda::args::immediate{k, ::cuda::args::bounds()}, - ::cuda::args::immediate{num_segments}, - ::cuda::args::immediate{num_segments * segment_size}); + cuda::args::immediate{segment_size, cuda::args::bounds()}, + cuda::args::immediate{k, cuda::args::bounds()}, + cuda::args::immediate{num_segments}, + cuda::args::immediate{num_segments * segment_size}); const int num_minus_zero = static_cast(thrust::count_if(d_keys_out.begin(), d_keys_out.end(), is_minus_zero{})); REQUIRE(num_minus_zero >= 1); diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu index 6ba474daf81..da729ed14be 100644 --- a/cub/test/catch2_test_device_segmented_topk_pairs.cu +++ b/cub/test/catch2_test_device_segmented_topk_pairs.cu @@ -78,7 +78,7 @@ CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_pairs( d_value_segments_out_it, segment_sizes, k, - ::cuda::args::constant{}, + cuda::args::constant{}, num_segments, total_num_items_guarantee, stream); @@ -103,15 +103,19 @@ using key_types = #if TEST_HALF_T() , half_t #endif // TEST_HALF_T() - #if TEST_BF_T() - , bfloat16_t - #endif // TEST_BF_T() >; // clang-format on #elif TEST_TYPES == 1 using key_types = c2h::type_list; #elif TEST_TYPES == 2 -using key_types = c2h::type_list; +using key_types = + c2h::type_list; +// clang-format on #endif // Unsigned integer types used for the radix-pass boundary distribution test @@ -263,10 +267,10 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small fixed-size segments" d_keys_out, d_values_in, d_values_out, - ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, - ::cuda::args::immediate{k, ::cuda::args::bounds()}, - ::cuda::args::immediate{num_segments}, - ::cuda::args::immediate{num_segments * segment_size}); + cuda::args::immediate{segment_size, cuda::args::bounds()}, + cuda::args::immediate{k, cuda::args::bounds()}, + cuda::args::immediate{num_segments}, + cuda::args::immediate{num_segments * segment_size}); // Verification: // - We verify correct top-k selection through the keys @@ -347,7 +351,7 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen // Each output segment holds exactly min(k, segment_size[i]) items, tightly packed. auto compacted_output_sizes_it = cuda::make_transform_iterator( cuda::make_counting_iterator(segment_index_t{0}), - get_output_size_op{segment_offsets.cbegin(), cuda::constant_iterator(k)}); + get_output_size_op{segment_offsets.cbegin(), cuda::constant_iterator(k), num_segments}); c2h::device_vector compacted_offsets(num_segments + 1, thrust::no_init); thrust::exclusive_scan( compacted_output_sizes_it, compacted_output_sizes_it + num_segments + 1, compacted_offsets.begin()); @@ -383,11 +387,10 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen d_keys_out, d_values_in, d_values_out, - ::cuda::args::__immediate_sequence{ - segment_size_it, ::cuda::args::bounds()}, - ::cuda::args::immediate{k, ::cuda::args::bounds()}, - ::cuda::args::immediate{num_segments}, - ::cuda::args::immediate{num_items}); + cuda::args::deferred_sequence{segment_size_it, cuda::args::bounds()}, + cuda::args::immediate{k, cuda::args::bounds()}, + cuda::args::immediate{num_segments}, + cuda::args::immediate{num_items}); // Verification: // - We verify correct top-k selection through the keys @@ -408,3 +411,241 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen REQUIRE(expected_keys == keys_out_buffer); } + +C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with fixed-size segments and per-segment k", + "[pairs][segmented][topk][device]", + key_types, + max_segment_size_list, + max_num_k_list, + select_direction_list) +{ + using segment_size_t = cuda::std::int64_t; + using segment_index_t = cuda::std::int64_t; + + using key_t = c2h::get<0, TestType>; + using val_t = cuda::std::int32_t; + + // Statically constrained maximum segment size and k + constexpr segment_size_t static_max_segment_size = c2h::get<1, TestType>::value; + constexpr segment_size_t static_max_k = c2h::get<2, TestType>::value; + + // Selection direction comes from the compile-time test axis. + constexpr auto direction = c2h::get<3, TestType>::value; + + // Generate the (uniform) input segment size. Unlike the uniform-k tests, k still varies per segment below. + constexpr segment_size_t min_segment_size = 1; + constexpr auto max_segment_size = static_max_segment_size; + const segment_size_t segment_size = GENERATE_COPY(values({min_segment_size, segment_size_t{3}, max_segment_size}), + take(1, random(min_segment_size, max_segment_size))); + + // Skip invalid combinations + if (segment_size > max_segment_size) + { + SKIP("The given segment size may not exceed the maximum segment size, we statically constrained the algorithm on."); + } + + // Generate number of segments + const segment_index_t num_segments = GENERATE_COPY( + values({segment_index_t{1}, segment_index_t{42}}), take(1, random(segment_index_t{1}, segment_index_t{1000}))); + + // Generate a per-segment k in [1, static_max_k] + c2h::device_vector segment_k(num_segments, thrust::no_init); + c2h::gen(C2H_SEED(1), segment_k, segment_size_t{1}, static_max_k); + + // Capture test parameters + CAPTURE(c2h::type_name(), + c2h::type_name(), + c2h::type_name(), + static_max_segment_size, + static_max_k, + segment_size, + num_segments, + direction); + + // Materialize fixed-size input offsets: [0, segment_size, 2 * segment_size, ...] + auto fixed_offsets_it = cuda::make_strided_iterator(cuda::make_counting_iterator(0), segment_size); + c2h::device_vector segment_offsets(num_segments + 1, thrust::no_init); + thrust::copy(fixed_offsets_it, fixed_offsets_it + (num_segments + 1), segment_offsets.begin()); + + // Compute compacted output offsets: each output segment holds exactly min(k[i], segment_size) items, tightly packed. + auto compacted_output_sizes_it = cuda::make_transform_iterator( + cuda::make_counting_iterator(segment_index_t{0}), + get_output_size_op{segment_offsets.cbegin(), segment_k.cbegin(), num_segments}); + c2h::device_vector compacted_offsets(num_segments + 1, thrust::no_init); + thrust::exclusive_scan( + compacted_output_sizes_it, compacted_output_sizes_it + num_segments + 1, compacted_offsets.begin()); + segment_size_t total_output_size = compacted_offsets.back(); + + // Prepare keys input & output. Input segments are fixed-size (strided); output segments are compacted (variable). + c2h::device_vector keys_in_buffer(num_segments * segment_size, thrust::no_init); + c2h::device_vector keys_out_buffer(total_output_size, thrust::no_init); + const int num_key_seeds = 1; + c2h::gen(C2H_SEED(num_key_seeds), keys_in_buffer); + auto d_keys_in_ptr = thrust::raw_pointer_cast(keys_in_buffer.data()); + auto d_keys_out_ptr = thrust::raw_pointer_cast(keys_out_buffer.data()); + auto d_keys_in = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size); + auto d_keys_out = + cuda::make_permutation_iterator(cuda::make_counting_iterator(d_keys_out_ptr), compacted_offsets.cbegin()); + + // Prepare values input & output + auto values_in_it = cuda::make_counting_iterator(val_t{0}); + c2h::device_vector values_out_buffer(total_output_size, thrust::no_init); + auto d_values_out_ptr = thrust::raw_pointer_cast(values_out_buffer.data()); + auto d_values_in = cuda::make_strided_iterator(cuda::make_counting_iterator(values_in_it), segment_size); + auto d_values_out = + cuda::make_permutation_iterator(cuda::make_counting_iterator(d_values_out_ptr), compacted_offsets.cbegin()); + + // Copy input for verification + c2h::device_vector expected_keys(keys_in_buffer); + + // Run the top-k algorithm with a per-segment k passed as an immediate sequence + batched_topk_pairs( + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + cuda::args::immediate{segment_size, cuda::args::bounds()}, + cuda::args::deferred_sequence{ + thrust::raw_pointer_cast(segment_k.data()), cuda::args::bounds()}, + cuda::args::immediate{num_segments}, + cuda::args::immediate{num_segments * segment_size}); + + // Verification: + // - We verify correct top-k selection through the keys + // - We verify that values were permuted along correctly by making sure values remain associated with their keys and + // making sure we do not duplicate values + REQUIRE(verify_pairs_consistency(expected_keys, keys_out_buffer, values_out_buffer) == true); + + // Verify values don't appear more than once in the returned results + REQUIRE(verify_unique_indices(values_out_buffer, compacted_offsets, num_segments) == true); + + // Verify keys are returned correctly: sort each fixed-size input segment, then compact each to its per-segment top-k. + fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction); + expected_keys = compact_to_topk_batched(expected_keys, segment_offsets, segment_k.cbegin()); + + // Since the results of top-k are unordered, sort compacted output segments before comparison. + segmented_sort_keys( + keys_out_buffer, num_segments, compacted_offsets.cbegin(), compacted_offsets.cbegin() + 1, direction); + + REQUIRE(expected_keys == keys_out_buffer); +} + +C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with variable-size segments and per-segment k", + "[pairs][segmented][topk][device]", + key_types, + max_segment_size_list, + max_num_k_list, + select_direction_list) +{ + using segment_size_t = cuda::std::int64_t; + using segment_index_t = cuda::std::int64_t; + + using key_t = c2h::get<0, TestType>; + using val_t = cuda::std::int32_t; + + // Statically constrained maximum segment size and k + constexpr segment_size_t static_max_segment_size = c2h::get<1, TestType>::value; + constexpr segment_size_t static_max_k = c2h::get<2, TestType>::value; + + // Selection direction comes from the compile-time test axis. + constexpr auto direction = c2h::get<3, TestType>::value; + + constexpr segment_size_t min_items = 1; + constexpr segment_size_t max_items = 1'000'000; + + // Number of items + const segment_size_t num_items = GENERATE_COPY( + take(2, random(min_items, max_items)), + values({ + min_items, + max_items, + })); + + // Generate segment sizes + constexpr segment_size_t min_segment_size = 1; + constexpr auto max_segment_size = static_max_segment_size; + c2h::device_vector segment_offsets = + c2h::gen_uniform_offsets(C2H_SEED(3), num_items, min_segment_size, max_segment_size); + const segment_index_t num_segments = static_cast(segment_offsets.size() - 1); + auto segment_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); + auto segment_size_it = cuda::make_transform_iterator( + cuda::make_counting_iterator(segment_index_t{0}), segment_size_op{segment_offsets_it}); + + // Generate a per-segment k in [1, static_max_k] + c2h::device_vector segment_k(num_segments, thrust::no_init); + c2h::gen(C2H_SEED(1), segment_k, segment_size_t{1}, static_max_k); + + // Capture test parameters + CAPTURE(c2h::type_name(), + c2h::type_name(), + c2h::type_name(), + static_max_segment_size, + static_max_k, + num_segments, + direction); + + // Compute compacted output offsets: + // Each output segment holds exactly min(k[i], segment_size[i]) items, tightly packed. + auto compacted_output_sizes_it = cuda::make_transform_iterator( + cuda::make_counting_iterator(segment_index_t{0}), + get_output_size_op{segment_offsets.cbegin(), segment_k.cbegin(), num_segments}); + c2h::device_vector compacted_offsets(num_segments + 1, thrust::no_init); + thrust::exclusive_scan( + compacted_output_sizes_it, compacted_output_sizes_it + num_segments + 1, compacted_offsets.begin()); + segment_size_t total_output_size = compacted_offsets.back(); + + // Prepare keys input & output + c2h::device_vector keys_in_buffer(num_items, thrust::no_init); + c2h::device_vector keys_out_buffer(total_output_size, thrust::no_init); + const int num_key_seeds = 1; + c2h::gen(C2H_SEED(num_key_seeds), keys_in_buffer); + auto d_keys_in_ptr = thrust::raw_pointer_cast(keys_in_buffer.data()); + auto d_keys_out_ptr = thrust::raw_pointer_cast(keys_out_buffer.data()); + auto d_keys_in = + cuda::make_permutation_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_offsets.cbegin()); + auto d_keys_out = + cuda::make_permutation_iterator(cuda::make_counting_iterator(d_keys_out_ptr), compacted_offsets.cbegin()); + + // Prepare values input & output + auto values_in_it = cuda::make_counting_iterator(val_t{0}); + c2h::device_vector values_out_buffer(total_output_size, thrust::no_init); + auto d_values_out_ptr = thrust::raw_pointer_cast(values_out_buffer.data()); + auto d_values_in = + cuda::make_permutation_iterator(cuda::make_counting_iterator(values_in_it), segment_offsets.cbegin()); + auto d_values_out = + cuda::make_permutation_iterator(cuda::make_counting_iterator(d_values_out_ptr), compacted_offsets.cbegin()); + + // Copy input for verification + c2h::device_vector expected_keys(keys_in_buffer); + + // Run the top-k algorithm with a per-segment k passed as an immediate sequence + batched_topk_pairs( + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + cuda::args::deferred_sequence{segment_size_it, cuda::args::bounds()}, + cuda::args::deferred_sequence{ + thrust::raw_pointer_cast(segment_k.data()), cuda::args::bounds()}, + cuda::args::immediate{num_segments}, + cuda::args::immediate{num_items}); + + // Verification: + // - We verify correct top-k selection through the keys + // - We verify that values were permuted along correctly by making sure values remain associated with their keys and + // making sure we do not duplicate values + REQUIRE(verify_pairs_consistency(expected_keys, keys_out_buffer, values_out_buffer) == true); + + // Verify values don't appear more than once in the returned results + REQUIRE(verify_unique_indices(values_out_buffer, compacted_offsets, num_segments) == true); + + // Verify keys are returned correctly: sort each segment of the expected input, then compact the per-segment top-k + segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction); + expected_keys = compact_to_topk_batched(expected_keys, segment_offsets, segment_k.cbegin()); + + // Since the results of top-k are unordered, sort compacted output segments before comparison + segmented_sort_keys( + keys_out_buffer, num_segments, compacted_offsets.cbegin(), compacted_offsets.cbegin() + 1, direction); + + REQUIRE(expected_keys == keys_out_buffer); +} diff --git a/cub/test/catch2_test_device_topk_common.cuh b/cub/test/catch2_test_device_topk_common.cuh index aebf2ce7fdd..3aec567a782 100644 --- a/cub/test/catch2_test_device_topk_common.cuh +++ b/cub/test/catch2_test_device_topk_common.cuh @@ -11,6 +11,7 @@ #include #include +#include #include @@ -61,16 +62,24 @@ struct get_output_size_op { OffsetItT offset_it; KSizesItT k_it; + cuda::std::int64_t num_segments; __device__ __forceinline__ cuda::std::int64_t operator()(cuda::std::int64_t segment_id) const { + // Building the `num_segments + 1` compacted offsets via an exclusive scan invokes this functor once past the last + // segment (segment_id == num_segments). Return 0 there to avoid reading `offset_it[num_segments + 1]` and + // `k_it[num_segments]` out of bounds; that extra element never contributes to an exclusive-scan output. + if (segment_id >= num_segments) + { + return 0; + } const auto segment_size = offset_it[segment_id + 1] - offset_it[segment_id]; return (cuda::std::min) (static_cast(k_it[segment_id]), segment_size); } }; template -get_output_size_op(OffsetItT, KSizesItT) -> get_output_size_op; +get_output_size_op(OffsetItT, KSizesItT, cuda::std::int64_t) -> get_output_size_op; template struct offset_iterator_op @@ -283,10 +292,15 @@ void compact_sorted_keys_to_topk( d_keys_in.resize(new_end - d_keys_in.begin()); } -// Stream-compacts each segment to only contain the top-k elements -template +// Stream-compacts each segment to only contain its top-k elements, where the number of elements to keep is provided +// per segment by `k_it` (k_it[segment_id] -> k for that segment). Each output segment holds exactly +// min(k_it[segment_id], segment_size[segment_id]) items, tightly packed. +template , int> = 0> c2h::device_vector compact_to_topk_batched( - c2h::device_vector& d_keys_in, const c2h::device_vector& d_offsets, cuda::std::int64_t k) + c2h::device_vector& d_keys_in, const c2h::device_vector& d_offsets, KSizesItT k_it) { // Expects d_offsets includes the number of items at the end const auto num_segments = d_offsets.size() - 1; @@ -297,7 +311,8 @@ c2h::device_vector compact_to_topk_batched( // Calculates the output sizes (if segment size is smaller than k, then output size is segment size, otherwise k) auto copy_sizes_it = cuda::make_transform_iterator( - cuda::make_counting_iterator(0), get_output_size_op{d_offsets.cbegin(), cuda::constant_iterator(k)}); + cuda::make_counting_iterator(0), + get_output_size_op{d_offsets.cbegin(), k_it, static_cast(num_segments)}); // Calculate destination offsets via prefix sum c2h::device_vector d_output_offsets(num_segments + 1, thrust::no_init); @@ -323,6 +338,14 @@ c2h::device_vector compact_to_topk_batched( return d_keys_out; } +// Stream-compacts each segment to only contain the top-k elements, using a single uniform k across all segments. +template +c2h::device_vector compact_to_topk_batched( + c2h::device_vector& d_keys_in, const c2h::device_vector& d_offsets, cuda::std::int64_t k) +{ + return compact_to_topk_batched(d_keys_in, d_offsets, cuda::constant_iterator(k)); +} + template void segmented_sort_keys( c2h::device_vector& d_keys_in, @@ -331,6 +354,8 @@ void segmented_sort_keys( OffsetItT d_segment_offsets_end_it, cub::detail::topk::select direction) { + // TODO: switch this reference sort to cub::DeviceSegmentedRadixSort in a follow-up PR: it compiles ~30% faster + // than cub::DeviceSegmentedSort at negligible runtime cost. cuda::std::int64_t num_items = d_keys_in.size(); // Prepare alternate buffer for double buffering