From 478203cadf532d924534199c6c61d0dc503b43e3 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 12 Jun 2026 17:14:42 +0200 Subject: [PATCH] [CUB] Refactor `DeviceAdjacentDifference::SubtractLeft` to always take an environment We want to be able to pass tunings to the APIs that take user provided memory. --- cub/cub/device/device_adjacent_difference.cuh | 102 ++++++---- ...vice_adjacent_difference_substract_left.cu | 179 ++++++++++++++++++ 2 files changed, 240 insertions(+), 41 deletions(-) diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh index e3752fd61f1..1f828154719 100644 --- a/cub/cub/device/device_adjacent_difference.cuh +++ b/cub/cub/device/device_adjacent_difference.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -185,6 +186,9 @@ struct DeviceAdjacentDifference //! @tparam NumItemsT //! **[inferred]** Type of num_items //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! //! @param[in] d_temp_storage //! @devicestorage //! @@ -203,14 +207,15 @@ struct DeviceAdjacentDifference //! @param[in] difference_op //! The binary function used to compute differences //! - //! @param[in] stream + //! @param[in] env //! @rst - //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0` + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst template , - typename NumItemsT = uint32_t> + typename NumItemsT = uint32_t, + typename EnvT = ::cuda::std::execution::env<>> static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeftCopy( void* d_temp_storage, size_t& temp_storage_bytes, @@ -218,12 +223,21 @@ struct DeviceAdjacentDifference OutputIteratorT d_output, NumItemsT num_items, DifferenceOpT difference_op = {}, - cudaStream_t stream = nullptr) + const EnvT& env = {}) { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractLeftCopy"); - using OffsetT = detail::choose_offset_t; - return detail::adjacent_difference::dispatch( - d_temp_storage, temp_storage_bytes, d_input, d_output, static_cast(num_items), difference_op, stream); + + using OffsetT = detail::choose_offset_t; + using default_policy_selector = detail::adjacent_difference::policy_selector_from_types; + + return detail::dispatch_with_env_and_tuning( + d_temp_storage, + temp_storage_bytes, + env, + [&](auto policy_selector, void* storage, size_t& bytes, cudaStream_t stream) { + return detail::adjacent_difference::dispatch( + storage, bytes, d_input, d_output, static_cast(num_items), difference_op, stream, policy_selector); + }); } //! @rst @@ -295,6 +309,9 @@ struct DeviceAdjacentDifference //! @tparam NumItemsT //! **[inferred]** Type of `num_items` //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``. + //! //! @param[in] d_temp_storage //! @devicestorage //! @@ -310,23 +327,36 @@ struct DeviceAdjacentDifference //! @param[in] difference_op //! The binary function used to compute differences //! - //! @param[in] stream + //! @param[in] env //! @rst - //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst - template , typename NumItemsT = uint32_t> + template , + typename NumItemsT = uint32_t, + typename EnvT = ::cuda::std::execution::env<>> static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeft( void* d_temp_storage, size_t& temp_storage_bytes, RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op = {}, - cudaStream_t stream = nullptr) + const EnvT& env = {}) { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractLeft"); + using OffsetT = detail::choose_offset_t; - return detail::adjacent_difference::dispatch( - d_temp_storage, temp_storage_bytes, d_input, d_input, static_cast(num_items), difference_op, stream); + using default_policy_selector = + detail::adjacent_difference::policy_selector_from_types; + + return detail::dispatch_with_env_and_tuning( + d_temp_storage, + temp_storage_bytes, + env, + [&](auto policy_selector, void* storage, size_t& bytes, cudaStream_t stream) { + return detail::adjacent_difference::dispatch( + storage, bytes, d_input, d_input, static_cast(num_items), difference_op, stream, policy_selector); + }); } //! @rst @@ -607,18 +637,20 @@ struct DeviceAdjacentDifference //! @rst //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst - template , - typename NumItemsT = uint32_t, - typename EnvT = ::cuda::std::execution::env<>, - ::cuda::std::enable_if_t<::cuda::std::is_integral_v, int> = 0> + template < + typename InputIteratorT, + typename OutputIteratorT, + typename DifferenceOpT = ::cuda::std::minus<>, + typename NumItemsT = uint32_t, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t<::cuda::std::__indirectly_binary_invocable, + int> = 0> [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t SubtractLeftCopy( InputIteratorT d_input, OutputIteratorT d_output, NumItemsT num_items, DifferenceOpT difference_op = {}, - EnvT env = {}) + const EnvT& env = {}) { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceAdjacentDifference::SubtractLeftCopy"); @@ -626,16 +658,9 @@ struct DeviceAdjacentDifference using default_policy_selector = detail::adjacent_difference::policy_selector_from_types; return detail::dispatch_with_env_and_tuning( - env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) { + env, [&](auto policy_selector, void* storage, size_t& bytes, cudaStream_t stream) { return detail::adjacent_difference::dispatch( - d_temp_storage, - temp_storage_bytes, - d_input, - d_output, - static_cast(num_items), - difference_op, - stream, - policy_selector); + storage, bytes, d_input, d_output, static_cast(num_items), difference_op, stream, policy_selector); }); } @@ -702,9 +727,11 @@ struct DeviceAdjacentDifference typename DifferenceOpT = ::cuda::std::minus<>, typename NumItemsT = uint32_t, typename EnvT = ::cuda::std::execution::env<>, - ::cuda::std::enable_if_t<::cuda::std::is_integral_v && !::cuda::std::is_integral_v, int> = 0> - [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t - SubtractLeft(RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op = {}, EnvT env = {}) + ::cuda::std::enable_if_t< + ::cuda::std::__indirectly_binary_invocable, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t SubtractLeft( + RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op = {}, const EnvT& env = {}) { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceAdjacentDifference::SubtractLeft"); @@ -713,16 +740,9 @@ struct DeviceAdjacentDifference detail::adjacent_difference::policy_selector_from_types; return detail::dispatch_with_env_and_tuning( - env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) { + env, [&](auto policy_selector, void* storage, size_t& bytes, cudaStream_t stream) { return detail::adjacent_difference::dispatch( - d_temp_storage, - temp_storage_bytes, - d_input, - d_input, - static_cast(num_items), - difference_op, - stream, - policy_selector); + storage, bytes, d_input, d_input, static_cast(num_items), difference_op, stream, policy_selector); }); } diff --git a/cub/test/catch2_test_device_adjacent_difference_substract_left.cu b/cub/test/catch2_test_device_adjacent_difference_substract_left.cu index e2b54917ec7..4e36b2cb99c 100644 --- a/cub/test/catch2_test_device_adjacent_difference_substract_left.cu +++ b/cub/test/catch2_test_device_adjacent_difference_substract_left.cu @@ -5,7 +5,9 @@ #include +#include #include +#include #include #include @@ -64,6 +66,94 @@ C2H_TEST("DeviceAdjacentDifference::SubtractLeftCopy does not change the input", REQUIRE(reference == in); } +#if TEST_LAUNCH == 0 +C2H_TEST("DeviceAdjacentDifference::SubtractLeft works with user provided memory and environment", + "[device][adjacent_difference]", + types) +{ + using type = typename c2h::get<0, TestType>; + + const int num_items = GENERATE_COPY(take(2, random(1, 1000000))); + c2h::device_vector in(num_items, thrust::default_init); + c2h::gen(C2H_SEED(2), in); + + c2h::host_vector h_in = in; + c2h::host_vector reference(num_items, thrust::default_init); + std::adjacent_difference(h_in.begin(), h_in.end(), reference.begin(), std::minus{}); + + size_t expected_allocation_size = 0; + auto error = cub::DeviceAdjacentDifference::SubtractLeft( + static_cast(nullptr), expected_allocation_size, in.begin(), num_items, cuda::std::minus<>{}); + REQUIRE(error == cudaSuccess); + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); + + auto d_temp = c2h::device_vector(expected_allocation_size, thrust::no_init); + void* temp_storage = thrust::raw_pointer_cast(d_temp.data()); + + auto test_subtract_left = [&](const auto& env) { + size_t num_bytes = 0; + error = cub::DeviceAdjacentDifference::SubtractLeft( + static_cast(nullptr), num_bytes, in.begin(), num_items, cuda::std::minus<>{}, env); + REQUIRE(error == cudaSuccess); + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); + REQUIRE(expected_allocation_size == num_bytes); + + error = cub::DeviceAdjacentDifference::SubtractLeft( + temp_storage, num_bytes, in.begin(), num_items, cuda::std::minus<>{}, env); + REQUIRE(error == cudaSuccess); + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); + + // Verify result + REQUIRE(reference == in); + }; + + int current_device; + error = cudaGetDevice(¤t_device); + REQUIRE(error == cudaSuccess); + + SECTION("DeviceAdjacentDifference::SubtractLeft works with cudaStream_t") + { + cuda::stream stream{cuda::devices[current_device]}; + test_subtract_left(stream.get()); + } + + SECTION("DeviceAdjacentDifference::SubtractLeft works with cuda::stream") + { + cuda::stream stream{cuda::devices[current_device]}; + test_subtract_left(stream); + } + + SECTION("DeviceAdjacentDifference::SubtractLeft works with cuda::stream_ref") + { + cuda::stream stream{cuda::devices[current_device]}; + cuda::stream_ref stream_ref{stream}; + test_subtract_left(stream_ref); + } + + SECTION("DeviceAdjacentDifference::SubtractLeft works with cuda::std::execution::env") + { + cuda::std::execution::env env{}; + test_subtract_left(env); + } + + SECTION("DeviceAdjacentDifference::SubtractLeft works with cuda::execution::gpu") + { + const auto policy = cuda::execution::gpu; + test_subtract_left(policy); + } + + SECTION("DeviceAdjacentDifference::SubtractLeft works with cuda::execution::gpu with stream") + { + cuda::stream stream{cuda::devices[current_device]}; + const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream); + test_subtract_left(policy); + } +} +#endif // TEST_LAUNCH == 0 + C2H_TEST("DeviceAdjacentDifference::SubtractLeft works with iterators", "[device][adjacent_difference]", types) { using type = typename c2h::get<0, TestType>; @@ -81,6 +171,95 @@ C2H_TEST("DeviceAdjacentDifference::SubtractLeft works with iterators", "[device REQUIRE(reference == in); } +#if TEST_LAUNCH == 0 +C2H_TEST("DeviceAdjacentDifference::SubtractLeftCopy works with user provided memory and environment", + "[device][adjacent_difference]", + types) +{ + using type = typename c2h::get<0, TestType>; + + const int num_items = GENERATE_COPY(take(2, random(1, 1000000))); + c2h::device_vector in(num_items); + c2h::device_vector out(num_items); + c2h::gen(C2H_SEED(2), in); + + c2h::host_vector h_in = in; + c2h::host_vector reference(num_items); + std::adjacent_difference(h_in.begin(), h_in.end(), reference.begin(), std::minus{}); + + size_t expected_allocation_size = 0; + auto error = cub::DeviceAdjacentDifference::SubtractLeftCopy( + static_cast(nullptr), expected_allocation_size, in.begin(), out.begin(), num_items, cuda::std::minus<>{}); + REQUIRE(error == cudaSuccess); + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); + + auto d_temp = c2h::device_vector(expected_allocation_size, thrust::no_init); + void* temp_storage = thrust::raw_pointer_cast(d_temp.data()); + + auto test_subtract_left_copy = [&](const auto& env) { + size_t num_bytes = 0; + error = cub::DeviceAdjacentDifference::SubtractLeftCopy( + static_cast(nullptr), num_bytes, in.begin(), out.begin(), num_items, cuda::std::minus<>{}, env); + REQUIRE(error == cudaSuccess); + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); + REQUIRE(expected_allocation_size == num_bytes); + + error = cub::DeviceAdjacentDifference::SubtractLeftCopy( + temp_storage, num_bytes, in.begin(), out.begin(), num_items, cuda::std::minus<>{}, env); + REQUIRE(error == cudaSuccess); + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); + + // Verify result + REQUIRE(reference == out); + }; + + int current_device; + error = cudaGetDevice(¤t_device); + REQUIRE(error == cudaSuccess); + + SECTION("DeviceAdjacentDifference::SubtractLeftCopy works with cudaStream_t") + { + cuda::stream stream{cuda::devices[current_device]}; + test_subtract_left_copy(stream.get()); + } + + SECTION("DeviceAdjacentDifference::SubtractLeftCopy works with cuda::stream") + { + cuda::stream stream{cuda::devices[current_device]}; + test_subtract_left_copy(stream); + } + + SECTION("DeviceAdjacentDifference::SubtractLeftCopy works with cuda::stream_ref") + { + cuda::stream stream{cuda::devices[current_device]}; + cuda::stream_ref stream_ref{stream}; + test_subtract_left_copy(stream_ref); + } + + SECTION("DeviceAdjacentDifference::SubtractLeftCopy works with cuda::std::execution::env") + { + cuda::std::execution::env env{}; + test_subtract_left_copy(env); + } + + SECTION("DeviceAdjacentDifference::SubtractLeftCopy works with cuda::execution::gpu") + { + const auto policy = cuda::execution::gpu; + test_subtract_left_copy(policy); + } + + SECTION("DeviceAdjacentDifference::SubtractLeftCopy works with cuda::execution::gpu with stream") + { + cuda::stream stream{cuda::devices[current_device]}; + const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream); + test_subtract_left_copy(policy); + } +} +#endif // TEST_LAUNCH == 0 + C2H_TEST("DeviceAdjacentDifference::SubtractLeftCopy works with iterators", "[device][adjacent_difference]", types) { using type = typename c2h::get<0, TestType>;