Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 61 additions & 41 deletions cub/cub/device/device_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cuda/__functional/call_or.h>
#include <cuda/__stream/get_stream.h>
#include <cuda/std/__execution/env.h>
#include <cuda/std/__iterator/concepts.h>
#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__type_traits/is_integral.h>
#include <cuda/std/cstdint>
Expand Down Expand Up @@ -185,6 +186,9 @@ struct DeviceAdjacentDifference
//! @tparam NumItemsT
//! **[inferred]** Type of num_items
//!
//! @tparam EnvT
//! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
//!
//! @param[in] d_temp_storage
//! @devicestorage
//!
Expand All @@ -203,27 +207,37 @@ struct DeviceAdjacentDifference
//! @param[in] difference_op
//! The binary function used to compute differences
//!
//! @param[in] stream
//! @param[in] env
//! @rst
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//! @endrst
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = ::cuda::std::minus<>,
typename NumItemsT = uint32_t>
typename NumItemsT = uint32_t,
typename EnvT = ::cuda::std::execution::env<>>
static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeftCopy(
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
NumItemsT num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = nullptr)
const EnvT& env = {})
{
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractLeftCopy");
using OffsetT = detail::choose_offset_t<NumItemsT>;
return detail::adjacent_difference::dispatch<MayAlias::No, ReadOption::Left>(
d_temp_storage, temp_storage_bytes, d_input, d_output, static_cast<OffsetT>(num_items), difference_op, stream);

using OffsetT = detail::choose_offset_t<NumItemsT>;
using default_policy_selector = detail::adjacent_difference::policy_selector_from_types<InputIteratorT, false>;

return detail::dispatch_with_env_and_tuning<default_policy_selector>(
d_temp_storage,
temp_storage_bytes,
env,
[&](auto policy_selector, void* storage, size_t& bytes, cudaStream_t stream) {
return detail::adjacent_difference::dispatch<MayAlias::No, ReadOption::Left>(
storage, bytes, d_input, d_output, static_cast<OffsetT>(num_items), difference_op, stream, policy_selector);
});
}

//! @rst
Expand Down Expand Up @@ -295,6 +309,9 @@ struct DeviceAdjacentDifference
//! @tparam NumItemsT
//! **[inferred]** Type of `num_items`
//!
//! @tparam EnvT
//! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
//!
//! @param[in] d_temp_storage
//! @devicestorage
//!
Expand All @@ -310,23 +327,36 @@ struct DeviceAdjacentDifference
//! @param[in] difference_op
//! The binary function used to compute differences
//!
//! @param[in] stream
//! @param[in] env
//! @rst
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//! @endrst
template <typename RandomAccessIteratorT, typename DifferenceOpT = ::cuda::std::minus<>, typename NumItemsT = uint32_t>
template <typename RandomAccessIteratorT,
typename DifferenceOpT = ::cuda::std::minus<>,
typename NumItemsT = uint32_t,
typename EnvT = ::cuda::std::execution::env<>>
static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeft(
void* d_temp_storage,
size_t& temp_storage_bytes,
RandomAccessIteratorT d_input,
NumItemsT num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = nullptr)
const EnvT& env = {})
{
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractLeft");

using OffsetT = detail::choose_offset_t<NumItemsT>;
return detail::adjacent_difference::dispatch<MayAlias::Yes, ReadOption::Left>(
d_temp_storage, temp_storage_bytes, d_input, d_input, static_cast<OffsetT>(num_items), difference_op, stream);
using default_policy_selector =
detail::adjacent_difference::policy_selector_from_types<RandomAccessIteratorT, true>;

return detail::dispatch_with_env_and_tuning<default_policy_selector>(
d_temp_storage,
temp_storage_bytes,
env,
[&](auto policy_selector, void* storage, size_t& bytes, cudaStream_t stream) {
return detail::adjacent_difference::dispatch<MayAlias::Yes, ReadOption::Left>(
storage, bytes, d_input, d_input, static_cast<OffsetT>(num_items), difference_op, stream, policy_selector);
});
}

//! @rst
Expand Down Expand Up @@ -607,35 +637,30 @@ struct DeviceAdjacentDifference
//! @rst
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//! @endrst
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = ::cuda::std::minus<>,
typename NumItemsT = uint32_t,
typename EnvT = ::cuda::std::execution::env<>,
::cuda::std::enable_if_t<::cuda::std::is_integral_v<NumItemsT>, int> = 0>
template <
typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = ::cuda::std::minus<>,
typename NumItemsT = uint32_t,
typename EnvT = ::cuda::std::execution::env<>,
::cuda::std::enable_if_t<::cuda::std::__indirectly_binary_invocable<DifferenceOpT, InputIteratorT, InputIteratorT>,
int> = 0>
Comment thread
bernhardmgruber marked this conversation as resolved.
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t SubtractLeftCopy(
InputIteratorT d_input,
OutputIteratorT d_output,
NumItemsT num_items,
DifferenceOpT difference_op = {},
EnvT env = {})
const EnvT& env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceAdjacentDifference::SubtractLeftCopy");

using OffsetT = detail::choose_offset_t<NumItemsT>;
using default_policy_selector = detail::adjacent_difference::policy_selector_from_types<InputIteratorT, false>;

return detail::dispatch_with_env_and_tuning<default_policy_selector>(
env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) {
env, [&](auto policy_selector, void* storage, size_t& bytes, cudaStream_t stream) {
Comment thread
miscco marked this conversation as resolved.
return detail::adjacent_difference::dispatch<MayAlias::No, ReadOption::Left>(
d_temp_storage,
temp_storage_bytes,
d_input,
d_output,
static_cast<OffsetT>(num_items),
difference_op,
stream,
policy_selector);
storage, bytes, d_input, d_output, static_cast<OffsetT>(num_items), difference_op, stream, policy_selector);
});
}

Expand Down Expand Up @@ -702,9 +727,11 @@ struct DeviceAdjacentDifference
typename DifferenceOpT = ::cuda::std::minus<>,
typename NumItemsT = uint32_t,
typename EnvT = ::cuda::std::execution::env<>,
::cuda::std::enable_if_t<::cuda::std::is_integral_v<NumItemsT> && !::cuda::std::is_integral_v<EnvT>, int> = 0>
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
SubtractLeft(RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op = {}, EnvT env = {})
::cuda::std::enable_if_t<
::cuda::std::__indirectly_binary_invocable<DifferenceOpT, RandomAccessIteratorT, RandomAccessIteratorT>,
int> = 0>
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t SubtractLeft(
RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op = {}, const EnvT& env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceAdjacentDifference::SubtractLeft");

Expand All @@ -713,16 +740,9 @@ struct DeviceAdjacentDifference
detail::adjacent_difference::policy_selector_from_types<RandomAccessIteratorT, true>;

return detail::dispatch_with_env_and_tuning<default_policy_selector>(
env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) {
env, [&](auto policy_selector, void* storage, size_t& bytes, cudaStream_t stream) {
return detail::adjacent_difference::dispatch<MayAlias::Yes, ReadOption::Left>(
d_temp_storage,
temp_storage_bytes,
d_input,
d_input,
static_cast<OffsetT>(num_items),
difference_op,
stream,
policy_selector);
storage, bytes, d_input, d_input, static_cast<OffsetT>(num_items), difference_op, stream, policy_selector);
});
}

Expand Down
179 changes: 179 additions & 0 deletions cub/test/catch2_test_device_adjacent_difference_substract_left.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@

#include <cub/device/device_adjacent_difference.cuh>

#include <cuda/devices>
#include <cuda/iterator>
#include <cuda/std/execution>

#include <algorithm>
#include <numeric>
Expand Down Expand Up @@ -64,6 +66,94 @@ C2H_TEST("DeviceAdjacentDifference::SubtractLeftCopy does not change the input",
REQUIRE(reference == in);
}

#if TEST_LAUNCH == 0
C2H_TEST("DeviceAdjacentDifference::SubtractLeft works with user provided memory and environment",
"[device][adjacent_difference]",
types)
{
using type = typename c2h::get<0, TestType>;

const int num_items = GENERATE_COPY(take(2, random(1, 1000000)));
c2h::device_vector<type> in(num_items, thrust::default_init);
c2h::gen(C2H_SEED(2), in);

c2h::host_vector<type> h_in = in;
c2h::host_vector<type> reference(num_items, thrust::default_init);
std::adjacent_difference(h_in.begin(), h_in.end(), reference.begin(), std::minus<type>{});
Comment thread
miscco marked this conversation as resolved.

size_t expected_allocation_size = 0;
auto error = cub::DeviceAdjacentDifference::SubtractLeft(
static_cast<void*>(nullptr), expected_allocation_size, in.begin(), num_items, cuda::std::minus<>{});
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());
Comment thread
miscco marked this conversation as resolved.

auto d_temp = c2h::device_vector<uint8_t>(expected_allocation_size, thrust::no_init);
void* temp_storage = thrust::raw_pointer_cast(d_temp.data());

auto test_subtract_left = [&](const auto& env) {
size_t num_bytes = 0;
error = cub::DeviceAdjacentDifference::SubtractLeft(
static_cast<void*>(nullptr), num_bytes, in.begin(), num_items, cuda::std::minus<>{}, env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());
REQUIRE(expected_allocation_size == num_bytes);

error = cub::DeviceAdjacentDifference::SubtractLeft(
temp_storage, num_bytes, in.begin(), num_items, cuda::std::minus<>{}, env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());

// Verify result
REQUIRE(reference == in);
};
Comment thread
coderabbitai[bot] marked this conversation as resolved.

int current_device;
error = cudaGetDevice(&current_device);
REQUIRE(error == cudaSuccess);

SECTION("DeviceAdjacentDifference::SubtractLeft works with cudaStream_t")
{
cuda::stream stream{cuda::devices[current_device]};
test_subtract_left(stream.get());
}

SECTION("DeviceAdjacentDifference::SubtractLeft works with cuda::stream")
{
cuda::stream stream{cuda::devices[current_device]};
test_subtract_left(stream);
}

SECTION("DeviceAdjacentDifference::SubtractLeft works with cuda::stream_ref")
{
cuda::stream stream{cuda::devices[current_device]};
cuda::stream_ref stream_ref{stream};
test_subtract_left(stream_ref);
}

SECTION("DeviceAdjacentDifference::SubtractLeft works with cuda::std::execution::env")
{
cuda::std::execution::env env{};
test_subtract_left(env);
}

SECTION("DeviceAdjacentDifference::SubtractLeft works with cuda::execution::gpu")
{
const auto policy = cuda::execution::gpu;
test_subtract_left(policy);
}

SECTION("DeviceAdjacentDifference::SubtractLeft works with cuda::execution::gpu with stream")
{
cuda::stream stream{cuda::devices[current_device]};
const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);
test_subtract_left(policy);
}
}
#endif // TEST_LAUNCH == 0

C2H_TEST("DeviceAdjacentDifference::SubtractLeft works with iterators", "[device][adjacent_difference]", types)
{
using type = typename c2h::get<0, TestType>;
Expand All @@ -81,6 +171,95 @@ C2H_TEST("DeviceAdjacentDifference::SubtractLeft works with iterators", "[device
REQUIRE(reference == in);
}

#if TEST_LAUNCH == 0
C2H_TEST("DeviceAdjacentDifference::SubtractLeftCopy works with user provided memory and environment",
"[device][adjacent_difference]",
types)
{
using type = typename c2h::get<0, TestType>;

const int num_items = GENERATE_COPY(take(2, random(1, 1000000)));
c2h::device_vector<type> in(num_items);
c2h::device_vector<type> out(num_items);
c2h::gen(C2H_SEED(2), in);

c2h::host_vector<type> h_in = in;
c2h::host_vector<type> reference(num_items);
std::adjacent_difference(h_in.begin(), h_in.end(), reference.begin(), std::minus<type>{});

size_t expected_allocation_size = 0;
auto error = cub::DeviceAdjacentDifference::SubtractLeftCopy(
static_cast<void*>(nullptr), expected_allocation_size, in.begin(), out.begin(), num_items, cuda::std::minus<>{});
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());

auto d_temp = c2h::device_vector<uint8_t>(expected_allocation_size, thrust::no_init);
void* temp_storage = thrust::raw_pointer_cast(d_temp.data());

auto test_subtract_left_copy = [&](const auto& env) {
size_t num_bytes = 0;
error = cub::DeviceAdjacentDifference::SubtractLeftCopy(
static_cast<void*>(nullptr), num_bytes, in.begin(), out.begin(), num_items, cuda::std::minus<>{}, env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());
REQUIRE(expected_allocation_size == num_bytes);

error = cub::DeviceAdjacentDifference::SubtractLeftCopy(
temp_storage, num_bytes, in.begin(), out.begin(), num_items, cuda::std::minus<>{}, env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());

// Verify result
REQUIRE(reference == out);
};
Comment thread
miscco marked this conversation as resolved.

int current_device;
error = cudaGetDevice(&current_device);
REQUIRE(error == cudaSuccess);

SECTION("DeviceAdjacentDifference::SubtractLeftCopy works with cudaStream_t")
{
cuda::stream stream{cuda::devices[current_device]};
test_subtract_left_copy(stream.get());
}

SECTION("DeviceAdjacentDifference::SubtractLeftCopy works with cuda::stream")
{
cuda::stream stream{cuda::devices[current_device]};
test_subtract_left_copy(stream);
}

SECTION("DeviceAdjacentDifference::SubtractLeftCopy works with cuda::stream_ref")
{
cuda::stream stream{cuda::devices[current_device]};
cuda::stream_ref stream_ref{stream};
test_subtract_left_copy(stream_ref);
}

SECTION("DeviceAdjacentDifference::SubtractLeftCopy works with cuda::std::execution::env")
{
cuda::std::execution::env env{};
test_subtract_left_copy(env);
}

SECTION("DeviceAdjacentDifference::SubtractLeftCopy works with cuda::execution::gpu")
{
const auto policy = cuda::execution::gpu;
test_subtract_left_copy(policy);
}

SECTION("DeviceAdjacentDifference::SubtractLeftCopy works with cuda::execution::gpu with stream")
{
cuda::stream stream{cuda::devices[current_device]};
const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);
test_subtract_left_copy(policy);
}
}
#endif // TEST_LAUNCH == 0

C2H_TEST("DeviceAdjacentDifference::SubtractLeftCopy works with iterators", "[device][adjacent_difference]", types)
{
using type = typename c2h::get<0, TestType>;
Expand Down
Loading