From 7eee8c0f5327f11256c12169b746317f885b653d Mon Sep 17 00:00:00 2001
From: Piotr Ciolkosz <pciolkosz@nvidia.com>
Date: Wed, 3 Jun 2026 22:57:33 -0700
Subject: [PATCH 1/6] Make argument wrappers construction public

---
 .../bench/segmented_topk/fixed/keys.cu        |  12 +-
 .../bench/segmented_topk/variable/keys.cu     |  14 +-
 cub/cub/agent/agent_batched_topk.cuh          |  12 +-
 cub/cub/detail/segmented_params.cuh           |  30 +-
 .../device/dispatch/dispatch_batched_topk.cuh |  26 +-
 .../dispatch/kernels/kernel_batched_topk.cuh  |  10 +-
 .../catch2_test_device_segmented_topk_keys.cu |  33 +-
 ...catch2_test_device_segmented_topk_pairs.cu |  22 +-
 libcudacxx/include/cuda/__argument/argument.h | 283 +++++++++++++-----
 .../include/cuda/__argument/argument_bounds.h |   4 +-
 libcudacxx/include/cuda/argument              |  26 ++
 .../include/cuda/std/__internal/namespaces.h  |   4 +-
 .../cuda/argument/argument_bounds.pass.cpp    |  42 +--
 .../cuda/argument/argument_traits.pass.cpp    | 164 +++++-----
 .../cuda/argument/deferred_argument.pass.cpp  | 110 +++----
 .../cuda/argument/dynamic_argument.pass.cpp   | 121 ++++----
 .../cuda/argument/static_argument.pass.cpp    |  50 ++--
 .../static_bounds_conversion.fail.cpp         |   6 +-
 .../static_bounds_type_mismatch.fail.cpp      |   4 +-
 .../cuda/argument/usage_example.pass.cpp      |  43 ++-
 20 files changed, 587 insertions(+), 429 deletions(-)
 create mode 100644 libcudacxx/include/cuda/argument
diff --git a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu
index b8f13469dce..4178e7ea0d5 100644
--- a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu
+++ b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu
@@ -4,7 +4,7 @@
 #include <cub/detail/choose_offset.cuh>
 #include <cub/device/dispatch/dispatch_batched_topk.cuh>
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/iterator>
 
 #include <nvbench_helper.cuh>
@@ -51,7 +51,7 @@ void fixed_seg_size_topk_keys(
   const auto selected_elements = static_cast<::cuda::std::ptrdiff_t>(MaxNumSelected);
   const auto num_segments      = ::cuda::std::max<std::size_t>(1, (max_elements / segment_size));
   const auto elements          = num_segments * segment_size;
-  const auto total_num_items   = ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(elements)};
+  const auto total_num_items   = ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(elements)};
   const bit_entropy entropy    = str_to_entropy(state.get_string("Entropy"));
 
   // Skip workloads where k exceeds the segment size
@@ -68,9 +68,9 @@ void fixed_seg_size_topk_keys(
   auto d_keys_in      = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size);
   auto d_keys_out     = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_out_ptr), selected_elements);
 
-  auto segment_sizes    = ::cuda::__argument::__constant<MaxSegmentSize>{};
-  auto k                = ::cuda::__argument::__constant<MaxNumSelected>{};
-  auto select_direction = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
+  auto segment_sizes    = ::cuda::argument::constant<MaxSegmentSize>{};
+  auto k                = ::cuda::argument::constant<MaxNumSelected>{};
+  auto select_direction = ::cuda::argument::constant<cub::detail::topk::select::max>{};
 
   state.add_element_count(elements, "NumElements");
   state.add_element_count(segment_size, "SegmentSize");
@@ -99,7 +99,7 @@ void fixed_seg_size_topk_keys(
       segment_sizes,
       k,
       select_direction,
-      ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(num_segments)},
+      ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(num_segments)},
       total_num_items,
       env);
   });
diff --git a/cub/benchmarks/bench/segmented_topk/variable/keys.cu b/cub/benchmarks/bench/segmented_topk/variable/keys.cu
index 001bbb4e258..0febcf507bd 100644
--- a/cub/benchmarks/bench/segmented_topk/variable/keys.cu
+++ b/cub/benchmarks/bench/segmented_topk/variable/keys.cu
@@ -7,7 +7,7 @@
 #include <thrust/device_vector.h>
 #include <thrust/reduce.h>
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/iterator>
 
 #include <nvbench_helper.cuh>
@@ -32,17 +32,17 @@ void decode_style_variable_topk_keys(
     static_cast<cuda::std::int64_t>(MaxSegmentSize));
   const auto input_elements  = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end());
   const auto output_elements = static_cast<std::size_t>(num_segments) * K;
-  const auto total_num_items = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(input_elements)};
+  const auto total_num_items = ::cuda::argument::immediate{static_cast<cuda::std::int64_t>(input_elements)};
 
   auto in_keys_buffer = gen_data<MaxSegmentSize, K>(
     num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data()));
   auto out_keys_buffer = thrust::device_vector<KeyT>(output_elements, thrust::no_init);
 
-  auto segment_sizes_param = ::cuda::__argument::__immediate_sequence{
-    thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::__argument::__bounds<1, MaxSegmentSize>()};
-  auto k_param            = ::cuda::__argument::__constant<K>{};
-  auto select_direction   = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
-  auto num_segments_param = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(num_segments)};
+  auto segment_sizes_param = ::cuda::argument::immediate_sequence{
+    thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::argument::bounds<1, MaxSegmentSize>()};
+  auto k_param            = ::cuda::argument::constant<K>{};
+  auto select_direction   = ::cuda::argument::constant<cub::detail::topk::select::max>{};
+  auto num_segments_param = ::cuda::argument::immediate{static_cast<cuda::std::int64_t>(num_segments)};
 
   auto d_keys_in = cuda::make_strided_iterator(
     cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())),
diff --git a/cub/cub/agent/agent_batched_topk.cuh b/cub/cub/agent/agent_batched_topk.cuh
index c5ec20d26cd..bc10311d36d 100644
--- a/cub/cub/agent/agent_batched_topk.cuh
+++ b/cub/cub/agent/agent_batched_topk.cuh
@@ -23,8 +23,8 @@
 #include <cub/device/dispatch/tuning/tuning_batched_topk.cuh>
 #include <cub/util_type.cuh>
 
-#include <cuda/__argument_>
 #include <cuda/__cmath/ceil_div.h>
+#include <cuda/argument>
 
 CUB_NAMESPACE_BEGIN
 
@@ -73,8 +73,8 @@ struct agent_batched_topk_worker_per_segment
   using key_t   = it_value_t<key_it_t>;
   using value_t = it_value_t<value_it_t>;
 
-  using segment_size_val_t = typename ::cuda::__argument::__traits<SegmentSizeParameterT>::element_type;
-  using num_segments_val_t = typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type;
+  using segment_size_val_t = typename ::cuda::argument::__traits<SegmentSizeParameterT>::element_type;
+  using num_segments_val_t = typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type;
   using counters_t         = batched_topk_counters<num_segments_val_t>;
 
   static constexpr auto policy                 = PolicyGetter{}();
@@ -95,7 +95,7 @@ struct agent_batched_topk_worker_per_segment
     multi_worker_per_segment_policy.threads_per_block * multi_worker_per_segment_policy.items_per_thread;
 
   // Check if there could be large segments present
-  static constexpr bool only_small_segments = ::cuda::__argument::__traits<SegmentSizeParameterT>::highest <= tile_size;
+  static constexpr bool only_small_segments = ::cuda::argument::__traits<SegmentSizeParameterT>::highest <= tile_size;
 
   // Check if we are dealing with keys-only or key-value pairs
   static constexpr bool is_keys_only = ::cuda::std::is_same_v<value_t, cub::NullType>;
@@ -196,8 +196,8 @@ struct agent_batched_topk_worker_per_segment
       return;
     }
 
-    constexpr bool is_full_tile = ::cuda::__argument::__traits<SegmentSizeParameterT>::is_constant
-                               && ::cuda::__argument::__traits<SegmentSizeParameterT>::lowest == tile_size;
+    constexpr bool is_full_tile = ::cuda::argument::__traits<SegmentSizeParameterT>::is_constant
+                               && ::cuda::argument::__traits<SegmentSizeParameterT>::lowest == tile_size;
 
     // Resolve Segment Parameters
     const auto segment_size = params::get_param(segment_sizes, segment_id);
diff --git a/cub/cub/detail/segmented_params.cuh b/cub/cub/detail/segmented_params.cuh
index 543f55b5036..1c4e6859d3e 100644
--- a/cub/cub/detail/segmented_params.cuh
+++ b/cub/cub/detail/segmented_params.cuh
@@ -13,7 +13,7 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/std/__type_traits/integral_constant.h>
 #include <cuda/std/__type_traits/remove_cvref.h>
 #include <cuda/std/__utility/forward.h>
@@ -33,10 +33,10 @@ namespace detail::params
 //! @param[in] __index Segment index to read for sequence arguments.
 //! @return The single argument value, or the sequence element at the given index.
 _CCCL_TEMPLATE(class _Tp, class _SegmentIndexT)
-_CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
+_CCCL_REQUIRES((!::cuda::argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(_Tp&& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  if constexpr (::cuda::__argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
+  if constexpr (::cuda::argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
   {
     return __arg;
   }
@@ -48,44 +48,44 @@ _CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<
 
 template <auto _Value, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::__argument::__constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+get_param(const ::cuda::argument::constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg);
+  return ::cuda::argument::__unwrap(__arg);
 }
 
 template <auto _Value, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::__argument::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::argument::constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg)[__index];
+  return ::cuda::argument::__unwrap(__arg)[__index];
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
-  const ::cuda::__argument::__immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+  const ::cuda::argument::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg);
+  return ::cuda::argument::__unwrap(__arg);
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::__argument::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::argument::immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg)[__index];
+  return ::cuda::argument::__unwrap(__arg)[__index];
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
-  const ::cuda::__argument::__deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+  const ::cuda::argument::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg);
+  return ::cuda::argument::__unwrap(__arg);
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::__argument::__deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::argument::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg)[__index];
+  return ::cuda::argument::__unwrap(__arg)[__index];
 }
 
 // =====================================================================
diff --git a/cub/cub/device/dispatch/dispatch_batched_topk.cuh b/cub/cub/device/dispatch/dispatch_batched_topk.cuh
index 56d12268dc9..29d245c175e 100644
--- a/cub/cub/device/dispatch/dispatch_batched_topk.cuh
+++ b/cub/cub/device/dispatch/dispatch_batched_topk.cuh
@@ -31,10 +31,10 @@
 
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
-#include <cuda/__argument_>
 #include <cuda/__cmath/ceil_div.h>
 #include <cuda/__iterator/counting_iterator.h>
 #include <cuda/__iterator/transform_iterator.h>
+#include <cuda/argument>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/always_false.h>
 #include <cuda/std/__type_traits/is_same.h>
@@ -50,28 +50,28 @@ namespace detail::batched_topk
 // Internal: wrap the compile-time select direction into a discrete param for dispatch
 // -----------------------------------------------------------------------------
 
-// The selection direction is compile-time only: callers pass `::cuda::__argument::__constant<Dir>`, which maps to a
+// The selection direction is compile-time only: callers pass `::cuda::argument::constant<Dir>`, which maps to a
 // value-less static_discrete_param. Because the direction is fixed at compile time and carries no runtime value, it
 // can never disagree with its only supported option, so dispatch can never silently degrade to a no-op.
 template <detail::topk::select Dir>
-[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::__argument::__constant<Dir>)
+[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::argument::constant<Dir>)
 {
   return params::static_discrete_param<detail::topk::select, Dir>{};
 }
 
-// The selection direction is intentionally a compile-time constant: only `::cuda::__argument::__constant<Dir>` is
+// The selection direction is intentionally a compile-time constant: only `::cuda::argument::constant<Dir>` is
 // accepted (the overload above maps it to a value-less static_discrete_param). This catch-all documents that
 // deliberate limitation and rejects anything else (e.g. a runtime `detail::topk::select` or a per-segment iterator of
 // directions) with a clear diagnostic. It is an intent/documentation guard rather than a user-facing one: callers
 // reach the algorithm through the min/max device entry points (DeviceBatchedTopK::{Max,Min}{Keys,Pairs}), which
-// construct the matching `__constant<Dir>` internally, so `dispatch` is only ever invoked with a direction we create.
+// construct the matching `constant<Dir>` internally, so `dispatch` is only ever invoked with a direction we create.
 template <typename SelectDirectionT>
 [[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(SelectDirectionT)
 {
   static_assert(::cuda::std::__always_false_v<SelectDirectionT>,
                 "DeviceBatchedTopK currently supports only compile-time selection directions: the min/max entry "
                 "points (DeviceBatchedTopK::{Max,Min}{Keys,Pairs}) dispatch with a "
-                "::cuda::__argument::__constant<cub::detail::topk::select>; runtime or per-segment directions are "
+                "::cuda::argument::constant<Dir>; runtime or per-segment directions are "
                 "intentionally not supported");
   // Unreachable (the static_assert above always fires); keeps the return type well-formed so the only diagnostic is
   // the message above.
@@ -131,7 +131,7 @@ template <typename KeyInputItItT,
           typename PolicySelector = policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
                                                                it_value_t<it_value_t<ValueInputItItT>>,
                                                                ::cuda::std::int64_t,
-                                                               ::cuda::__argument::__traits<KParameterT>::highest>>
+                                                               ::cuda::argument::__traits<KParameterT>::highest>>
 #if _CCCL_HAS_CONCEPTS()
   requires batched_topk_policy_selector<PolicySelector>
 #endif // _CCCL_HAS_CONCEPTS()
@@ -150,7 +150,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   cudaStream_t stream                             = nullptr,
   [[maybe_unused]] PolicySelector policy_selector = {})
 {
-  using large_segment_tile_offset_t = typename ::cuda::__argument::__traits<TotalNumItemsGuaranteeT>::element_type;
+  using large_segment_tile_offset_t = typename ::cuda::argument::__traits<TotalNumItemsGuaranteeT>::element_type;
 
   // Wrap the raw enum into the internal discrete param type
   auto select_directions          = wrap_select_direction(select_direction);
@@ -176,9 +176,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   static constexpr int worker_per_segment_tile_size =
     worker_per_segment_policy.threads_per_block * worker_per_segment_policy.items_per_thread;
   static constexpr bool any_small_segments =
-    ::cuda::__argument::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
+    ::cuda::argument::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
   static constexpr bool only_small_segments =
-    ::cuda::__argument::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;
+    ::cuda::argument::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;
 
   // Allocation layout:
   //   only_small_segments: [0] dummy.
@@ -188,7 +188,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   static constexpr int allocations_array_size     = only_small_segments ? 1 : (any_small_segments ? 3 : 2);
   size_t allocation_sizes[allocations_array_size] = {1};
 
-  using num_segments_val_t         = typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type;
+  using num_segments_val_t         = typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type;
   using counters_t                 = batched_topk_counters<num_segments_val_t>;
   using segment_size_scan_offset_t = detail::choose_offset_t<num_segments_val_t>;
   using segment_size_scan_input_op_t =
@@ -244,7 +244,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
 
   // TODO (elstehle): support number of segments provided by device-accessible iterator
   // Only uniform number of segments are supported (i.e., we need to resolve the number of segments on the host)
-  static_assert(::cuda::__argument::__traits<NumSegmentsParameterT>::is_single_value,
+  static_assert(::cuda::argument::__traits<NumSegmentsParameterT>::is_single_value,
                 "Only uniform segment sizes are currently supported.");
 
   if constexpr (any_small_segments)
@@ -346,7 +346,7 @@ template <typename KeyInputItItT,
     policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
                                it_value_t<it_value_t<ValueInputItItT>>,
                                ::cuda::std::int64_t,
-                               ::cuda::__argument::__traits<KParameterT>::highest>;
+                               ::cuda::argument::__traits<KParameterT>::highest>;
   return detail::dispatch_with_env_and_tuning<default_policy_selector>(
     env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) {
       return dispatch(
diff --git a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh
index 1ff50dfaf67..9c904f95971 100644
--- a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh
@@ -20,8 +20,8 @@
 #include <cub/device/dispatch/tuning/tuning_batched_topk.cuh>
 #include <cub/util_arch.cuh>
 
-#include <cuda/__argument_>
 #include <cuda/__device/compute_capability.h>
+#include <cuda/argument>
 
 CUB_NAMESPACE_BEGIN
 
@@ -39,7 +39,7 @@ private:
     worker_policy worker_per_segment_policy;
     multi_worker_policy multi_worker_per_segment_policy;
   };
-  static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::__argument::__traits<SegmentSizeParameterT>::highest;
+  static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::argument::__traits<SegmentSizeParameterT>::highest;
   static constexpr batched_topk_policy active_policy     = current_policy<PolicySelector>();
 
   template <int Index>
@@ -133,8 +133,8 @@ __launch_bounds__(int(
     KParameterT k,
     SelectDirectionParameterT select_directions,
     NumSegmentsParameterT num_segments,
-    batched_topk_counters<typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type>* d_counters,
-    typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type* d_large_segments_ids,
+    batched_topk_counters<typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type>* d_counters,
+    typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type* d_large_segments_ids,
     LargeSegmentTileOffsetT* d_large_segments_tile_offsets)
 {
   using agent_t = typename find_smallest_covering_policy<
@@ -151,7 +151,7 @@ __launch_bounds__(int(
     LargeSegmentTileOffsetT>::agent_t;
 
   // Static Assertions (Constraints)
-  static_assert(agent_t::tile_size >= ::cuda::__argument::__traits<SegmentSizeParameterT>::highest,
+  static_assert(agent_t::tile_size >= ::cuda::argument::__traits<SegmentSizeParameterT>::highest,
                 "Block size exceeds maximum segment size supported by SegmentSizeParameterT");
   static_assert(sizeof(typename agent_t::TempStorage) <= max_smem_per_block,
                 "Static shared memory per block must not exceed 48KB limit.");
diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu
index 3ef76bc2743..16f7d4322ca 100644
--- a/cub/test/catch2_test_device_segmented_topk_keys.cu
+++ b/cub/test/catch2_test_device_segmented_topk_keys.cu
@@ -156,11 +156,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments",
   batched_topk_keys(
     d_keys_in,
     d_keys_out,
-    ::cuda::__argument::__immediate{segment_size, ::cuda::__argument::__bounds<segment_size_t{1}, max_segment_size>()},
-    ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_k>()},
-    ::cuda::__argument::__constant<direction>{},
-    ::cuda::__argument::__immediate{num_segments},
-    ::cuda::__argument::__immediate{num_segments * segment_size});
+    ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds<segment_size_t{1}, max_segment_size>()},
+    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::argument::constant<direction>{},
+    ::cuda::argument::immediate{num_segments},
+    ::cuda::argument::immediate{num_segments * segment_size});
   // Prepare expected results
   fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction);
   compact_sorted_keys_to_topk(expected_keys, segment_size, k);
@@ -254,12 +254,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment
   batched_topk_keys(
     d_keys_in,
     d_keys_out,
-    ::cuda::__argument::__immediate_sequence{
-      segment_size_it, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_segment_size>()},
-    ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_k>()},
-    ::cuda::__argument::__constant<direction>{},
-    ::cuda::__argument::__immediate{num_segments},
-    ::cuda::__argument::__immediate{num_items});
+    ::cuda::argument::immediate_sequence{
+      segment_size_it, ::cuda::argument::bounds<segment_size_t{1}, static_max_segment_size>()},
+    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::argument::constant<direction>{},
+    ::cuda::argument::immediate{num_segments},
+    ::cuda::argument::immediate{num_items});
 
   // Verify keys are returned correctly: sort each segment of the expected input, then compact the top-k
   segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction);
@@ -292,12 +292,11 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment
   batched_topk_keys(
     d_keys_in_it,
     d_keys_out_it,
-    ::cuda::__argument::__immediate{
-      segment_size, ::cuda::__argument::__bounds<cuda::std::int64_t{1}, max_segment_size>()},
-    ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<cuda::std::int64_t{1}, k>()},
-    ::cuda::__argument::__constant<cub::detail::topk::select::min>{},
-    ::cuda::__argument::__immediate{num_segments},
-    ::cuda::__argument::__immediate{num_segments * segment_size});
+    ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds<cuda::std::int64_t{1}, max_segment_size>()},
+    ::cuda::argument::immediate{k, ::cuda::argument::bounds<cuda::std::int64_t{1}, k>()},
+    ::cuda::argument::constant<cub::detail::topk::select::min>{},
+    ::cuda::argument::immediate{num_segments},
+    ::cuda::argument::immediate{num_segments * segment_size});
 
   const int num_minus_zero = static_cast<int>(thrust::count_if(d_keys_out.begin(), d_keys_out.end(), is_minus_zero{}));
   REQUIRE(num_minus_zero >= 1);
diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu
index b16a97b2472..49b2e48a1b6 100644
--- a/cub/test/catch2_test_device_segmented_topk_pairs.cu
+++ b/cub/test/catch2_test_device_segmented_topk_pairs.cu
@@ -225,11 +225,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small fixed-size segments"
     d_keys_out,
     d_values_in,
     d_values_out,
-    ::cuda::__argument::__immediate{segment_size, ::cuda::__argument::__bounds<segment_size_t{1}, max_segment_size>()},
-    ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_k>()},
-    ::cuda::__argument::__constant<direction>{},
-    ::cuda::__argument::__immediate{num_segments},
-    ::cuda::__argument::__immediate{num_segments * segment_size});
+    ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds<segment_size_t{1}, max_segment_size>()},
+    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::argument::constant<direction>{},
+    ::cuda::argument::immediate{num_segments},
+    ::cuda::argument::immediate{num_segments * segment_size});
 
   // Verification:
   // - We verify correct top-k selection through the keys
@@ -346,12 +346,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen
     d_keys_out,
     d_values_in,
     d_values_out,
-    ::cuda::__argument::__immediate_sequence{
-      segment_size_it, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_segment_size>()},
-    ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_k>()},
-    ::cuda::__argument::__constant<direction>{},
-    ::cuda::__argument::__immediate{num_segments},
-    ::cuda::__argument::__immediate{num_items});
+    ::cuda::argument::immediate_sequence{
+      segment_size_it, ::cuda::argument::bounds<segment_size_t{1}, static_max_segment_size>()},
+    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::argument::constant<direction>{},
+    ::cuda::argument::immediate{num_segments},
+    ::cuda::argument::immediate{num_items});
 
   // Verification:
   // - We verify correct top-k selection through the keys
diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h
index a1d55db5045..2df19b9ac5d 100644
--- a/libcudacxx/include/cuda/__argument/argument.h
+++ b/libcudacxx/include/cuda/__argument/argument.h
@@ -87,7 +87,7 @@ inline constexpr bool __is_sequence_v =
   || ::cuda::std::__has_random_access_traversal<_Tp>;
 
 // =====================================================================
-// __constant
+// constant
 // =====================================================================
 
 // Non-sequence wrappers intentionally do not reject types with a distinct element type.
@@ -96,7 +96,7 @@ inline constexpr bool __is_sequence_v =
 
 //! @brief Wraps a compile-time constant argument value.
 template <auto _Value>
-struct __constant
+struct constant
 {
   using value_type     = ::cuda::std::remove_cvref_t<decltype(_Value)>;
   using __element_type = value_type;
@@ -109,7 +109,7 @@ struct __constant
 
 //! @brief Wraps a compile-time constant argument sequence.
 template <auto _Value>
-struct __constant_sequence
+struct constant_sequence
 {
   using value_type     = ::cuda::std::remove_cvref_t<decltype(_Value)>;
   using __element_type = __element_type_of_t<value_type>;
@@ -122,15 +122,138 @@ struct __constant_sequence
   }
 };
 
+// __assert_in_range
 // =====================================================================
-// __immediate
+
+template <class _To, class _From>
+_CCCL_API constexpr void __assert_in_range([[maybe_unused]] _From __val) noexcept
+{
+  if constexpr (::cuda::std::__cccl_is_cv_integer_v<_To> && ::cuda::std::__cccl_is_cv_integer_v<_From>)
+  {
+    _CCCL_ASSERT(::cuda::std::in_range<::cuda::std::remove_cv_t<_To>>(__val),
+                 "runtime bound value overflows the element type");
+  }
+}
+
+template <class _To, class _From>
+[[nodiscard]] _CCCL_API constexpr _To __runtime_bound_cast(_From __val) noexcept
+{
+  __assert_in_range<_To>(__val);
+  return static_cast<_To>(__val);
+}
+
+template <class _To, auto _Value>
+_CCCL_API constexpr bool __static_bound_in_range() noexcept
+{
+  using _RawTo   = ::cuda::std::remove_cv_t<_To>;
+  using _RawFrom = ::cuda::std::remove_cv_t<decltype(_Value)>;
+
+  if constexpr (::cuda::std::__cccl_is_integer_v<_RawTo> && ::cuda::std::__cccl_is_integer_v<_RawFrom>)
+  {
+    return ::cuda::std::in_range<_RawTo>(_Value);
+  }
+  else if constexpr (::cuda::std::is_arithmetic_v<_RawTo> && ::cuda::std::is_arithmetic_v<_RawFrom>)
+  {
+    return static_cast<_RawFrom>(static_cast<_RawTo>(_Value)) == _Value;
+  }
+  else
+  {
+    return true;
+  }
+}
+
+template <class _ElementType, class _StaticBounds>
+inline constexpr bool __valid_static_bounds_v = true;
+
+template <class _ElementType, auto _Lowest, auto _Highest>
+inline constexpr bool __valid_static_bounds_v<_ElementType, __static_bounds<_Lowest, _Highest>> =
+  __static_bound_in_range<_ElementType, _Lowest>() && __static_bound_in_range<_ElementType, _Highest>();
+
+template <class _ElementType, class _StaticBounds>
+_CCCL_API constexpr _ElementType __wrapper_static_lowest() noexcept
+{
+  if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>)
+  {
+    return ::cuda::std::numeric_limits<_ElementType>::lowest();
+  }
+  else
+  {
+    return static_cast<_ElementType>(_StaticBounds::lower());
+  }
+}
+
+template <class _ElementType, class _StaticBounds>
+_CCCL_API constexpr _ElementType __wrapper_static_highest() noexcept
+{
+  if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>)
+  {
+    return (::cuda::std::numeric_limits<_ElementType>::max)();
+  }
+  else
+  {
+    return static_cast<_ElementType>(_StaticBounds::upper());
+  }
+}
+
+template <class _ElementType, class _StaticBounds>
+_CCCL_API constexpr _ElementType __effective_lowest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
+{
+  auto __static_lowest = __wrapper_static_lowest<_ElementType, _StaticBounds>();
+  return __static_lowest > __runtime_bounds.lower() ? __static_lowest : __runtime_bounds.lower();
+}
+
+template <class _ElementType, class _StaticBounds>
+_CCCL_API constexpr _ElementType __effective_highest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
+{
+  auto __static_highest = __wrapper_static_highest<_ElementType, _StaticBounds>();
+  return __static_highest < __runtime_bounds.upper() ? __static_highest : __runtime_bounds.upper();
+}
+
+template <class _ElementType, class _StaticBounds>
+_CCCL_API constexpr bool __has_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
+{
+  return __effective_lowest<_ElementType, _StaticBounds>(__runtime_bounds)
+      <= __effective_highest<_ElementType, _StaticBounds>(__runtime_bounds);
+}
+
+template <class _ElementType, class _StaticBounds>
+_CCCL_API constexpr void __validate_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
+{
+  static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>,
+                "static argument bounds cannot be represented by the element type");
+  _CCCL_VERIFY((__has_bounds_intersection<_ElementType, _StaticBounds>(__runtime_bounds)),
+               "static and runtime argument bounds do not intersect");
+}
+
+template <class _ElementType, class _StaticBounds>
+_CCCL_API constexpr void __validate_static_element_bounds([[maybe_unused]] const _ElementType& __val) noexcept
+{
+  if constexpr (!::cuda::std::is_same_v<_StaticBounds, __no_bounds>)
+  {
+    _CCCL_ASSERT((__val >= __wrapper_static_lowest<_ElementType, _StaticBounds>()),
+                 "immediate argument value is below static lowest bound");
+    _CCCL_ASSERT((__val <= __wrapper_static_highest<_ElementType, _StaticBounds>()),
+                 "immediate argument value is above static highest bound");
+  }
+}
+
+template <class _ElementType>
+_CCCL_API constexpr void __validate_runtime_element_bounds(
+  [[maybe_unused]] const _ElementType& __val, [[maybe_unused]] __runtime_bounds<_ElementType> __runtime_bounds) noexcept
+{
+  _CCCL_ASSERT((__val >= __runtime_bounds.lower()), "immediate argument value is below runtime lower bound");
+  _CCCL_ASSERT((__val <= __runtime_bounds.upper()), "immediate argument value is above runtime upper bound");
+}
+
+// =====================================================================
+// immediate
 // =====================================================================
 
 //! @brief Wraps a runtime argument value with optional bounds.
 //!
 //! The value is host-accessible at API call time.
 template <class _Arg, class _StaticBounds = __no_bounds>
-struct __immediate
+struct immediate
 {
   using __element_type = __element_type_of_t<_Arg>;
 
@@ -150,13 +273,13 @@ struct __immediate
   }
 
 public:
-  _CCCL_API constexpr __immediate(_Arg __arg) noexcept
+  _CCCL_API constexpr immediate(_Arg __arg) noexcept
       : __arg_{::cuda::std::move(__arg)}
   {
     __validate_value();
   }
 
-  _CCCL_API constexpr __immediate(_Arg __arg, _StaticBounds) noexcept
+  _CCCL_API constexpr immediate(_Arg __arg, _StaticBounds) noexcept
       : __arg_{::cuda::std::move(__arg)}
   {
     __validate_value();
@@ -165,18 +288,18 @@ struct __immediate
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE __immediate(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> __immediate<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate(_Arg, __static_bounds<_Lowest, _Highest>)
+  -> immediate<_Arg, __static_bounds<_Lowest, _Highest>>;
 
 #endif // _CCCL_DOXYGEN_INVOKED
 
 // =====================================================================
-// __immediate_sequence
+// immediate_sequence
 // =====================================================================
 
 //! @brief Wraps a runtime argument sequence with optional bounds.
 template <class _Arg, class _StaticBounds = __no_bounds>
-struct __immediate_sequence
+struct immediate_sequence
 {
   using __element_type = __element_type_of_t<_Arg>;
 
@@ -215,14 +338,14 @@ struct __immediate_sequence
   }
 
 public:
-  _CCCL_API constexpr __immediate_sequence(_Arg __arg) noexcept
+  _CCCL_API constexpr immediate_sequence(_Arg __arg) noexcept
       : __arg_{::cuda::std::move(__arg)}
   {
     __validate_bounds();
     __validate_value();
   }
 
-  _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds) noexcept
+  _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds) noexcept
       : __arg_{::cuda::std::move(__arg)}
   {
     __validate_bounds();
@@ -230,7 +353,7 @@ struct __immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr __immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -240,7 +363,7 @@ struct __immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -250,27 +373,27 @@ struct __immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr __immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
-      : __immediate_sequence(::cuda::std::move(__arg), __sb, __rb)
+  _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
+      : immediate_sequence(::cuda::std::move(__arg), __sb, __rb)
   {}
 };
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>)
+  -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, auto _Lowest, auto _Highest, class _Tp>
-_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
-  -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
+  -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
-  -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
+  -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 // =====================================================================
-// __deferred_base / __deferred / __deferred_sequence
+// __deferred_base / deferred / deferred_sequence
 // =====================================================================
 
 //! @brief Common base for deferred argument wrappers.
@@ -324,57 +447,57 @@ struct __deferred_base
 //! @brief Wraps a reference to a single value that is potentially not available at API call time but will be available
 //! by the time the argument is consumed in stream order.
 template <class _Arg, class _StaticBounds = __no_bounds>
-struct __deferred : __deferred_base<_Arg, _StaticBounds>
+struct deferred : __deferred_base<_Arg, _StaticBounds>
 {
   using __deferred_base<_Arg, _StaticBounds>::__deferred_base;
 };
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Arg>
-_CCCL_HOST_DEVICE __deferred(_Arg) -> __deferred<_Arg>;
+_CCCL_HOST_DEVICE deferred(_Arg) -> deferred<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE __deferred(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>)
+  -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp>
-_CCCL_HOST_DEVICE __deferred(_Arg, __runtime_bounds<_Tp>) -> __deferred<_Arg>;
+_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>) -> deferred<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest, class _Tp>
-_CCCL_HOST_DEVICE __deferred(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
-  -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
+  -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE __deferred(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
-  -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
+  -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 //! @brief Wraps a reference to a sequence of values that is potentially not available at API call time but will be
 //! available by the time the argument is consumed in stream order.
 template <class _Arg, class _StaticBounds = __no_bounds>
-struct __deferred_sequence : __deferred_base<_Arg, _StaticBounds>
+struct deferred_sequence : __deferred_base<_Arg, _StaticBounds>
 {
   using __deferred_base<_Arg, _StaticBounds>::__deferred_base;
 };
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Arg>
-_CCCL_HOST_DEVICE __deferred_sequence(_Arg) -> __deferred_sequence<_Arg>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg) -> deferred_sequence<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>)
+  -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp>
-_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>) -> __deferred_sequence<_Arg>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>) -> deferred_sequence<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest, class _Tp>
-_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
-  -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
+  -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
-  -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
+  -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 // =====================================================================
@@ -384,17 +507,17 @@ _CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_boun
 template <class _Tp>
 inline constexpr bool __is_wrapper_v = false;
 template <class _Arg, class _StaticBounds>
-inline constexpr bool __is_wrapper_v<__immediate<_Arg, _StaticBounds>> = true;
+inline constexpr bool __is_wrapper_v<immediate<_Arg, _StaticBounds>> = true;
 template <auto _Value>
-inline constexpr bool __is_wrapper_v<__constant<_Value>> = true;
+inline constexpr bool __is_wrapper_v<constant<_Value>> = true;
 template <auto _Value>
-inline constexpr bool __is_wrapper_v<__constant_sequence<_Value>> = true;
+inline constexpr bool __is_wrapper_v<constant_sequence<_Value>> = true;
 template <class _Arg, class _StaticBounds>
-inline constexpr bool __is_wrapper_v<__immediate_sequence<_Arg, _StaticBounds>> = true;
+inline constexpr bool __is_wrapper_v<immediate_sequence<_Arg, _StaticBounds>> = true;
 template <class _Arg, class _StaticBounds>
-inline constexpr bool __is_wrapper_v<__deferred<_Arg, _StaticBounds>> = true;
+inline constexpr bool __is_wrapper_v<deferred<_Arg, _StaticBounds>> = true;
 template <class _Arg, class _StaticBounds>
-inline constexpr bool __is_wrapper_v<__deferred_sequence<_Arg, _StaticBounds>> = true;
+inline constexpr bool __is_wrapper_v<deferred_sequence<_Arg, _StaticBounds>> = true;
 
 _CCCL_TEMPLATE(class _Tp)
 _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
@@ -404,87 +527,87 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__immediate<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __immediate<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__immediate<_Arg, _StaticBounds>&& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate<_Arg, _StaticBounds>&& __arg) noexcept
 {
   return ::cuda::std::move(__arg.__arg_);
 }
 
 template <auto _Value>
 [[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t<decltype(_Value)>
-__unwrap(const __constant<_Value>&) noexcept
+__unwrap(const constant<_Value>&) noexcept
 {
   return _Value;
 }
 
 template <auto _Value>
 [[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t<decltype(_Value)>
-__unwrap(const __constant_sequence<_Value>&) noexcept
+__unwrap(const constant_sequence<_Value>&) noexcept
 {
   return _Value;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept
 {
   return ::cuda::std::move(__arg.__arg_);
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__deferred<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __deferred<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__deferred<_Arg, _StaticBounds>&& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred<_Arg, _StaticBounds>&& __arg) noexcept
 {
   return ::cuda::std::move(__arg.__arg_);
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__deferred_sequence<_Arg, _StaticBounds>&& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred_sequence<_Arg, _StaticBounds>&& __arg) noexcept
 {
   return ::cuda::std::move(__arg.__arg_);
 }
@@ -550,7 +673,7 @@ struct __traits_impl
 };
 
 template <auto _Value>
-struct __traits_impl<__constant<_Value>>
+struct __traits_impl<constant<_Value>>
 {
   using value_type                      = ::cuda::std::remove_cvref_t<decltype(_Value)>;
   using element_type                    = value_type;
@@ -562,7 +685,7 @@ struct __traits_impl<__constant<_Value>>
 };
 
 template <class _Arg, class _StaticBounds>
-struct __traits_impl<__immediate<_Arg, _StaticBounds>>
+struct __traits_impl<immediate<_Arg, _StaticBounds>>
 {
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
@@ -577,7 +700,7 @@ struct __traits_impl<__immediate<_Arg, _StaticBounds>>
 };
 
 template <auto _Value>
-struct __traits_impl<__constant_sequence<_Value>>
+struct __traits_impl<constant_sequence<_Value>>
 {
   using value_type   = ::cuda::std::remove_cvref_t<decltype(_Value)>;
   using element_type = __element_type_of_t<value_type>;
@@ -590,7 +713,7 @@ struct __traits_impl<__constant_sequence<_Value>>
 };
 
 template <class _Arg, class _StaticBounds>
-struct __traits_impl<__immediate_sequence<_Arg, _StaticBounds>>
+struct __traits_impl<immediate_sequence<_Arg, _StaticBounds>>
 {
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
@@ -606,7 +729,7 @@ struct __traits_impl<__immediate_sequence<_Arg, _StaticBounds>>
 };
 
 template <class _Arg, class _StaticBounds>
-struct __traits_impl<__deferred<_Arg, _StaticBounds>>
+struct __traits_impl<deferred<_Arg, _StaticBounds>>
 {
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
@@ -621,7 +744,7 @@ struct __traits_impl<__deferred<_Arg, _StaticBounds>>
 };
 
 template <class _Arg, class _StaticBounds>
-struct __traits_impl<__deferred_sequence<_Arg, _StaticBounds>>
+struct __traits_impl<deferred_sequence<_Arg, _StaticBounds>>
 {
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
@@ -652,25 +775,25 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) )
 }
 
 template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(__constant<_Value>) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant<_Value>) noexcept
 {
   return __constant_compute_lowest<_Value>();
 }
 
 template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(__constant_sequence<_Value>) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant_sequence<_Value>) noexcept
 {
   return __constant_sequence_compute_lowest<_Value>();
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(__immediate<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate<_Arg, _StaticBounds> __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
   using _ET = __element_type_of_t<_Arg>;
   __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
@@ -678,7 +801,7 @@ template <class _Arg, class _StaticBounds>
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(__deferred<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred<_Arg, _StaticBounds> __arg) noexcept
 {
   using _ET = __element_type_of_t<_Arg>;
   __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
@@ -686,7 +809,7 @@ template <class _Arg, class _StaticBounds>
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(__deferred_sequence<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
   using _ET = __element_type_of_t<_Arg>;
   __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
@@ -702,25 +825,25 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) )
 }
 
 template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(__constant<_Value>) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(constant<_Value>) noexcept
 {
   return __constant_compute_highest<_Value>();
 }
 
 template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(__constant_sequence<_Value>) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(constant_sequence<_Value>) noexcept
 {
   return __constant_sequence_compute_highest<_Value>();
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(__immediate<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(immediate<_Arg, _StaticBounds> __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
   using _ET = __element_type_of_t<_Arg>;
   __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
@@ -728,7 +851,7 @@ template <class _Arg, class _StaticBounds>
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(__deferred<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(deferred<_Arg, _StaticBounds> __arg) noexcept
 {
   using _ET = __element_type_of_t<_Arg>;
   __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
@@ -736,7 +859,7 @@ template <class _Arg, class _StaticBounds>
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(__deferred_sequence<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
   using _ET = __element_type_of_t<_Arg>;
   __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
diff --git a/libcudacxx/include/cuda/__argument/argument_bounds.h b/libcudacxx/include/cuda/__argument/argument_bounds.h
index 46dd22c370c..3e70b4d1893 100644
--- a/libcudacxx/include/cuda/__argument/argument_bounds.h
+++ b/libcudacxx/include/cuda/__argument/argument_bounds.h
@@ -122,7 +122,7 @@ inline constexpr bool __is_runtime_bounds_v<__runtime_bounds<_Tp>> = true;
 //! @tparam _Upper The static upper bound.
 //! @return A compile-time bounds object.
 template <auto _Lower, auto _Upper>
-[[nodiscard]] _CCCL_API constexpr __static_bounds<_Lower, _Upper> __bounds() noexcept
+[[nodiscard]] _CCCL_API constexpr __static_bounds<_Lower, _Upper> bounds() noexcept
 {
   return {};
 }
@@ -133,7 +133,7 @@ template <auto _Lower, auto _Upper>
 //! @param __upper The runtime upper bound.
 //! @return A runtime bounds object.
 template <class _Tp>
-[[nodiscard]] _CCCL_API constexpr __runtime_bounds<_Tp> __bounds(_Tp __lower, _Tp __upper) noexcept
+[[nodiscard]] _CCCL_API constexpr __runtime_bounds<_Tp> bounds(_Tp __lower, _Tp __upper) noexcept
 {
   return {__lower, __upper};
 }
diff --git a/libcudacxx/include/cuda/argument b/libcudacxx/include/cuda/argument
new file mode 100644
index 00000000000..bcb079e7e5e
--- /dev/null
+++ b/libcudacxx/include/cuda/argument
@@ -0,0 +1,26 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_ARGUMENT_
+#define _CUDA_ARGUMENT_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__argument/argument.h>
+
+#endif // _CUDA_ARGUMENT_
diff --git a/libcudacxx/include/cuda/std/__internal/namespaces.h b/libcudacxx/include/cuda/std/__internal/namespaces.h
index 325830c17cf..f3cc191adfc 100644
--- a/libcudacxx/include/cuda/std/__internal/namespaces.h
+++ b/libcudacxx/include/cuda/std/__internal/namespaces.h
@@ -115,8 +115,8 @@
 #define _CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION _CCCL_BEGIN_NAMESPACE(cuda::execution)
 #define _CCCL_END_NAMESPACE_CUDA_EXECUTION   _CCCL_END_NAMESPACE(cuda::execution)
 
-#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::__argument)
-#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT   _CCCL_END_NAMESPACE(cuda::__argument)
+#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::argument)
+#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT   _CCCL_END_NAMESPACE(cuda::argument)
 
 // Namespace to avoid name collisions with CPOs on clang-16 (see
 // https://godbolt.org/z/9TadonrdM for example). MSVC's ancient parser also gets confused with
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
index d7f394a0d74..9a5717f32c3 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
@@ -8,7 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/std/cassert>
 #include <cuda/std/type_traits>
 
@@ -20,27 +20,27 @@ TEST_FUNC constexpr bool test()
 
   // Basic static bounds
   {
-    constexpr auto b = cuda::__argument::__static_bounds<1, 4096>{};
+    constexpr auto b = cuda::argument::__static_bounds<1, 4096>{};
     static_assert(b.lower() == 1);
     static_assert(b.upper() == 4096);
   }
 
   // Exact static bounds
   {
-    constexpr auto b = cuda::__argument::__static_bounds<42, 42>{};
+    constexpr auto b = cuda::argument::__static_bounds<42, 42>{};
     static_assert(b.lower() == 42);
     static_assert(b.upper() == 42);
   }
 
   // Long type deduced from NTTPs
   {
-    static_assert(cuda::std::is_same_v<decltype(cuda::__argument::__static_bounds<0L, 1000L>::lower()), long>);
+    static_assert(cuda::std::is_same_v<decltype(cuda::argument::__static_bounds<0L, 1000L>::lower()), long>);
   }
 
 #if TEST_HAS_CLASS_NTTP
   // Static bounds preserve their original NTTP types
   {
-    constexpr auto b = cuda::__argument::__bounds<1.0f, 8.0f>();
+    constexpr auto b = cuda::argument::bounds<1.0f, 8.0f>();
     static_assert(b.lower() == 1.0f);
     static_assert(b.upper() == 8);
     static_assert(cuda::std::is_same_v<decltype(b.lower()), float>);
@@ -52,9 +52,13 @@ TEST_FUNC constexpr bool test()
 
   // Basic runtime bounds
   {
-    auto b = cuda::__argument::__runtime_bounds{10, 100};
+    auto b = cuda::argument::__runtime_bounds{10, 100};
     assert(b.lower() == 10);
     assert(b.upper() == 100);
+    assert(b.__lower_ == 10);
+    assert(b.__upper_ == 100);
+    b.__upper_ = 90;
+    assert(b.upper() == 90);
     static_assert(cuda::std::is_same_v<decltype(b.lower()), int>);
   }
 
@@ -62,35 +66,35 @@ TEST_FUNC constexpr bool test()
 
   // Static via factory
   {
-    constexpr auto b = cuda::__argument::__bounds<1, 8>();
+    constexpr auto b = cuda::argument::bounds<1, 8>();
     static_assert(b.lower() == 1);
     static_assert(b.upper() == 8);
-    static_assert(cuda::__argument::__is_static_bounds_cv_v<decltype(b)>);
-    static_assert(!cuda::__argument::__is_runtime_bounds_cv_v<decltype(b)>);
-    static_assert(cuda::__argument::__is_bounds_v<decltype(b)>);
+    static_assert(cuda::argument::__is_static_bounds_cv_v<decltype(b)>);
+    static_assert(!cuda::argument::__is_runtime_bounds_cv_v<decltype(b)>);
+    static_assert(cuda::argument::__is_bounds_v<decltype(b)>);
   }
 
   // Runtime via factory
   {
-    auto b = cuda::__argument::__bounds(10, 100);
+    auto b = cuda::argument::bounds(10, 100);
     assert(b.lower() == 10);
     assert(b.upper() == 100);
-    static_assert(!cuda::__argument::__is_static_bounds_cv_v<decltype(b)>);
-    static_assert(cuda::__argument::__is_runtime_bounds_cv_v<decltype(b)>);
-    static_assert(cuda::__argument::__is_bounds_v<decltype(b)>);
+    static_assert(!cuda::argument::__is_static_bounds_cv_v<decltype(b)>);
+    static_assert(cuda::argument::__is_runtime_bounds_cv_v<decltype(b)>);
+    static_assert(cuda::argument::__is_bounds_v<decltype(b)>);
   }
 
   // Static and runtime bounds intersection
   {
-    static_assert(cuda::__argument::__valid_argument_bounds<int, cuda::__argument::__static_bounds<1, 100>>(
-      cuda::__argument::__runtime_bounds<int>{50, 200}));
-    static_assert(!cuda::__argument::__valid_argument_bounds<int, cuda::__argument::__static_bounds<100, 200>>(
-      cuda::__argument::__runtime_bounds<int>{0, 50}));
+    static_assert(cuda::argument::__has_bounds_intersection<int, cuda::argument::__static_bounds<1, 100>>(
+      cuda::argument::__runtime_bounds<int>{50, 200}));
+    static_assert(!cuda::argument::__has_bounds_intersection<int, cuda::argument::__static_bounds<100, 200>>(
+      cuda::argument::__runtime_bounds<int>{0, 50}));
   }
 
   // Non-bounds type
   {
-    static_assert(!cuda::__argument::__is_bounds_v<int>);
+    static_assert(!cuda::argument::__is_bounds_v<int>);
   }
 
   return true;
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
index 13753040f08..517893abb7d 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
@@ -8,7 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/iterator>
 #include <cuda/std/array>
 #include <cuda/std/complex>
@@ -54,138 +54,132 @@ struct non_sequence_value
 
 TEST_FUNC void test()
 {
-  // --- __is_sequence_v / __is_single_value_v ---
+  // --- __is_sequence_v ---
 
   // builtin and class type are not sequences
-  static_assert(!cuda::__argument::__is_sequence_v<int>);
-  static_assert(!cuda::__argument::__is_sequence_v<color>);
-  static_assert(!cuda::__argument::__is_sequence_v<non_sequence_value>);
-  static_assert(!cuda::__argument::__is_sequence_v<range_like<int>>);
-  static_assert(!cuda::__argument::__is_sequence_v<element_type_like<int>>);
-  static_assert(!cuda::__argument::__is_sequence_v<value_type_like<int>>);
-  static_assert(!cuda::__argument::__is_sequence_v<cuda::std::complex<float>>);
-  static_assert(!cuda::__argument::__is_sequence_v<cuda::std::pair<float, int>>);
-  static_assert(!cuda::__argument::__is_sequence_v<cuda::std::tuple<float, int>>);
-  static_assert(!cuda::__argument::__is_sequence_v<cuda::std::optional<int>>);
-  static_assert(!cuda::__argument::__is_sequence_v<cuda::std::expected<int, int>>);
+  static_assert(!cuda::argument::__is_sequence_v<int>);
+  static_assert(!cuda::argument::__is_sequence_v<color>);
+  static_assert(!cuda::argument::__is_sequence_v<non_sequence_value>);
+  static_assert(!cuda::argument::__is_sequence_v<range_like<int>>);
+  static_assert(!cuda::argument::__is_sequence_v<element_type_like<int>>);
+  static_assert(!cuda::argument::__is_sequence_v<value_type_like<int>>);
+  static_assert(!cuda::argument::__is_sequence_v<cuda::std::complex<float>>);
+  static_assert(!cuda::argument::__is_sequence_v<cuda::std::pair<float, int>>);
+  static_assert(!cuda::argument::__is_sequence_v<cuda::std::tuple<float, int>>);
+  static_assert(!cuda::argument::__is_sequence_v<cuda::std::optional<int>>);
+  static_assert(!cuda::argument::__is_sequence_v<cuda::std::expected<int, int>>);
 
   // iterators and pointers can be sequences if they are at least random access
-  static_assert(cuda::__argument::__is_sequence_v<int*>);
-  static_assert(cuda::__argument::__is_sequence_v<const int*>);
-  static_assert(cuda::__argument::__is_sequence_v<cuda::counting_iterator<int>>);
-  static_assert(!cuda::__argument::__is_sequence_v<bidirectional_iterator<int*>>);
+  static_assert(cuda::argument::__is_sequence_v<int*>);
+  static_assert(cuda::argument::__is_sequence_v<const int*>);
+  static_assert(cuda::argument::__is_sequence_v<cuda::counting_iterator<int>>);
+  static_assert(!cuda::argument::__is_sequence_v<bidirectional_iterator<int*>>);
 
   // ranges and arrays are sequences
-  static_assert(cuda::__argument::__is_sequence_v<int[]>);
-  static_assert(cuda::__argument::__is_sequence_v<const int[]>);
-  static_assert(cuda::__argument::__is_sequence_v<int[42]>);
-  static_assert(cuda::__argument::__is_sequence_v<const int[42]>);
-  static_assert(cuda::__argument::__is_sequence_v<cuda::std::span<int, 1>>);
-  static_assert(cuda::__argument::__is_sequence_v<const cuda::std::span<int, 1>&>);
-  static_assert(cuda::__argument::__is_sequence_v<cuda::std::span<int>>);
-  static_assert(cuda::__argument::__is_sequence_v<cuda::std::array<int, 3>>);
+  static_assert(cuda::argument::__is_sequence_v<int[]>);
+  static_assert(cuda::argument::__is_sequence_v<const int[]>);
+  static_assert(cuda::argument::__is_sequence_v<int[42]>);
+  static_assert(cuda::argument::__is_sequence_v<const int[42]>);
+  static_assert(cuda::argument::__is_sequence_v<cuda::std::span<int, 1>>);
+  static_assert(cuda::argument::__is_sequence_v<const cuda::std::span<int, 1>&>);
+  static_assert(cuda::argument::__is_sequence_v<cuda::std::span<int>>);
+  static_assert(cuda::argument::__is_sequence_v<cuda::std::array<int, 3>>);
 
   // --- __element_type_of_t ---
 
-  static_assert(cuda::std::is_same_v<cuda::__argument::__element_type_of_t<const cuda::std::span<int, 1>&>, int>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__element_type_of_t<int*>, int>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__element_type_of_t<cuda::counting_iterator<int>>, int>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__element_type_of_t<cuda::std::array<int, 3>>, int>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__element_type_of_t<range_like<int>>, int>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__element_type_of_t<element_type_like<int>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<const cuda::std::span<int, 1>&>, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<int*>, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<cuda::counting_iterator<int>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<cuda::std::array<int, 3>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<range_like<int>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<element_type_like<int>>, int>);
   static_assert(
-    cuda::std::is_same_v<cuda::__argument::__element_type_of_t<cuda::std::mdspan<const int, cuda::std::extents<int, 1>>>,
+    cuda::std::is_same_v<cuda::argument::__element_type_of_t<cuda::std::mdspan<const int, cuda::std::extents<int, 1>>>,
                          int>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__element_type_of_t<value_type_like<int>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<value_type_like<int>>, int>);
 
   // --- argument_traits: is_deferred ---
 
-  static_assert(!cuda::__argument::__traits<int>::is_deferred);
-  static_assert(!cuda::__argument::__traits<cuda::__argument::__immediate<int>>::is_deferred);
-  static_assert(!cuda::__argument::__traits<cuda::__argument::__immediate_sequence<cuda::std::span<int>>>::is_deferred);
-  static_assert(!cuda::__argument::__traits<cuda::__argument::__constant<42>>::is_deferred);
+  static_assert(!cuda::argument::__traits<int>::is_deferred);
+  static_assert(!cuda::argument::__traits<cuda::argument::immediate<int>>::is_deferred);
+  static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_deferred);
+  static_assert(!cuda::argument::__traits<cuda::argument::constant<42>>::is_deferred);
 #if TEST_HAS_CLASS_NTTP
   static_assert(
-    !cuda::__argument::__traits<cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_deferred);
+    !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_deferred);
 #endif // TEST_HAS_CLASS_NTTP
-  static_assert(cuda::__argument::__traits<cuda::__argument::__deferred<cuda::std::span<int, 1>>>::is_deferred);
-  static_assert(cuda::__argument::__traits<cuda::__argument::__deferred_sequence<cuda::std::span<int>>>::is_deferred);
+  static_assert(cuda::argument::__traits<cuda::argument::deferred<cuda::std::span<int, 1>>>::is_deferred);
+  static_assert(cuda::argument::__traits<cuda::argument::deferred_sequence<cuda::std::span<int>>>::is_deferred);
 
   // --- argument_traits: is_single_value ---
 
-  static_assert(cuda::__argument::__traits<int>::is_single_value);
-  static_assert(cuda::__argument::__traits<int*>::is_single_value);
-  static_assert(cuda::__argument::__traits<cuda::__argument::__immediate<int>>::is_single_value);
-  static_assert(cuda::__argument::__traits<cuda::__argument::__immediate<int*>>::is_single_value);
-  static_assert(
-    cuda::__argument::__traits<cuda::__argument::__immediate<cuda::counting_iterator<int>>>::is_single_value);
-  static_assert(
-    !cuda::__argument::__traits<cuda::__argument::__immediate_sequence<cuda::std::span<int>>>::is_single_value);
-  static_assert(cuda::__argument::__traits<cuda::__argument::__constant<42>>::is_single_value);
+  static_assert(cuda::argument::__traits<int>::is_single_value);
+  static_assert(cuda::argument::__traits<int*>::is_single_value);
+  static_assert(cuda::argument::__traits<cuda::argument::immediate<int>>::is_single_value);
+  static_assert(cuda::argument::__traits<cuda::argument::immediate<int*>>::is_single_value);
+  static_assert(cuda::argument::__traits<cuda::argument::immediate<cuda::counting_iterator<int>>>::is_single_value);
+  static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_single_value);
+  static_assert(cuda::argument::__traits<cuda::argument::constant<42>>::is_single_value);
 #if TEST_HAS_CLASS_NTTP
-  static_assert(!cuda::__argument::__traits<
-                cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
-#endif // TEST_HAS_CLASS_NTTP
-  static_assert(cuda::__argument::__traits<cuda::__argument::__deferred<int*>>::is_single_value);
   static_assert(
-    !cuda::__argument::__traits<cuda::__argument::__deferred_sequence<cuda::std::span<int>>>::is_single_value);
+    !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
+#endif // TEST_HAS_CLASS_NTTP
+  static_assert(cuda::argument::__traits<cuda::argument::deferred<int*>>::is_single_value);
+  static_assert(!cuda::argument::__traits<cuda::argument::deferred_sequence<cuda::std::span<int>>>::is_single_value);
 
   // --- argument_traits: value_type ---
 
-  static_assert(cuda::std::is_same_v<cuda::__argument::__traits<int>::value_type, int>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__traits<cuda::__argument::__immediate<int>>::value_type, int>);
-  static_assert(cuda::std::is_same_v<
-                cuda::__argument::__traits<cuda::__argument::__immediate_sequence<cuda::std::span<int>>>::value_type,
-                cuda::std::span<int>>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__traits<cuda::__argument::__constant<42>>::value_type, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__traits<int>::value_type, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__traits<cuda::argument::immediate<int>>::value_type, int>);
+  static_assert(
+    cuda::std::is_same_v<cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::value_type,
+                         cuda::std::span<int>>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__traits<cuda::argument::constant<42>>::value_type, int>);
 #if TEST_HAS_CLASS_NTTP
   static_assert(
     cuda::std::is_same_v<
-      cuda::__argument::__traits<cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::value_type,
+      cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::value_type,
       cuda::std::array<int, 3>>);
 #endif // TEST_HAS_CLASS_NTTP
 
   // --- argument_traits: lowest / highest ---
 
-  static_assert(cuda::__argument::__traits<int>::lowest == cuda::std::numeric_limits<int>::lowest());
-  static_assert(cuda::__argument::__traits<int>::highest == (cuda::std::numeric_limits<int>::max)());
-  static_assert(cuda::__argument::__traits<const int>::lowest == cuda::std::numeric_limits<int>::lowest());
-  static_assert(cuda::__argument::__traits<int&>::highest == (cuda::std::numeric_limits<int>::max)());
-  static_assert(cuda::__argument::__traits<float>::lowest == cuda::std::numeric_limits<float>::lowest());
-  static_assert(cuda::__argument::__traits<float>::highest == (cuda::std::numeric_limits<float>::max)());
+  static_assert(cuda::argument::__traits<int>::lowest == cuda::std::numeric_limits<int>::lowest());
+  static_assert(cuda::argument::__traits<int>::highest == (cuda::std::numeric_limits<int>::max)());
+  static_assert(cuda::argument::__traits<const int>::lowest == cuda::std::numeric_limits<int>::lowest());
+  static_assert(cuda::argument::__traits<int&>::highest == (cuda::std::numeric_limits<int>::max)());
+  static_assert(cuda::argument::__traits<float>::lowest == cuda::std::numeric_limits<float>::lowest());
+  static_assert(cuda::argument::__traits<float>::highest == (cuda::std::numeric_limits<float>::max)());
   static_assert(
-    cuda::__argument::__traits<const cuda::__argument::__immediate<int, cuda::__argument::__static_bounds<1, 8>>>::lowest
-    == 1);
+    cuda::argument::__traits<const cuda::argument::immediate<int, cuda::argument::__static_bounds<1, 8>>>::lowest == 1);
   static_assert(
-    cuda::__argument::__traits<cuda::__argument::__immediate<int, cuda::__argument::__static_bounds<1, 8>>&>::highest
-    == 8);
+    cuda::argument::__traits<cuda::argument::immediate<int, cuda::argument::__static_bounds<1, 8>>&>::highest == 8);
   static_assert(
-    cuda::__argument::__traits<
-      cuda::__argument::__immediate_sequence<cuda::std::span<int>, cuda::__argument::__static_bounds<1, 8>>>::highest
+    cuda::argument::__traits<
+      cuda::argument::immediate_sequence<cuda::std::span<int>, cuda::argument::__static_bounds<1, 8>>>::highest
     == 8);
 #if TEST_HAS_CLASS_NTTP
   static_assert(
-    cuda::__argument::__traits<cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::lowest == 1);
+    cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::lowest == 1);
   static_assert(
-    cuda::__argument::__traits<cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::highest == 3);
+    cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::highest == 3);
 #endif // TEST_HAS_CLASS_NTTP
 
   // --- Free function bounds on plain values ---
 
-  static_assert(cuda::__argument::__lowest_(42) == cuda::std::numeric_limits<int>::lowest());
-  static_assert(cuda::__argument::__highest_(42) == (cuda::std::numeric_limits<int>::max)());
-  static_assert(cuda::__argument::__lowest_(1.0f) == cuda::std::numeric_limits<float>::lowest());
-  static_assert(cuda::__argument::__highest_(1.0f) == (cuda::std::numeric_limits<float>::max)());
+  static_assert(cuda::argument::__lowest_(42) == cuda::std::numeric_limits<int>::lowest());
+  static_assert(cuda::argument::__highest_(42) == (cuda::std::numeric_limits<int>::max)());
+  static_assert(cuda::argument::__lowest_(1.0f) == cuda::std::numeric_limits<float>::lowest());
+  static_assert(cuda::argument::__highest_(1.0f) == (cuda::std::numeric_limits<float>::max)());
 
   // --- Scalar and sequence wrappers expose distinct single-value traits ---
 
-  static_assert(cuda::__argument::__traits<cuda::__argument::__constant<42>>::is_single_value);
-  static_assert(cuda::__argument::__traits<cuda::__argument::__immediate<int>>::is_single_value);
-  static_assert(
-    !cuda::__argument::__traits<cuda::__argument::__immediate_sequence<cuda::std::span<int>>>::is_single_value);
+  static_assert(cuda::argument::__traits<cuda::argument::constant<42>>::is_single_value);
+  static_assert(cuda::argument::__traits<cuda::argument::immediate<int>>::is_single_value);
+  static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_single_value);
 #if TEST_HAS_CLASS_NTTP
-  static_assert(!cuda::__argument::__traits<
-                cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
+  static_assert(
+    !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
 #endif // TEST_HAS_CLASS_NTTP
 }
 
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
index 9949e0013b4..f9bda950aa7 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
@@ -8,7 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/iterator>
 #include <cuda/std/cassert>
 #include <cuda/std/limits>
@@ -22,102 +22,110 @@ TEST_FUNC constexpr bool test()
   // Deferred single value via span<T, 1>
   {
     int val  = 42;
-    auto def = cuda::__argument::__deferred{cuda::std::span<int, 1>{&val, 1}};
-    assert(cuda::__argument::__unwrap(def)[0] == 42);
-    static_assert(cuda::__argument::__traits<decltype(def)>::lowest == cuda::std::numeric_limits<int>::lowest());
-    static_assert(cuda::__argument::__traits<decltype(def)>::highest == (cuda::std::numeric_limits<int>::max)());
+    auto def = cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}};
+    assert(cuda::argument::__unwrap(def)[0] == 42);
+    assert(def.__arg_[0] == 42);
+    static_assert(cuda::argument::__traits<decltype(def)>::lowest == cuda::std::numeric_limits<int>::lowest());
+    static_assert(cuda::argument::__traits<decltype(def)>::highest == (cuda::std::numeric_limits<int>::max)());
   }
 
   // Deferred single value with static bounds
   {
     int val  = 42;
-    auto def = cuda::__argument::__deferred{cuda::std::span<int, 1>{&val, 1}, cuda::__argument::__bounds<1, 1000>()};
-    assert(cuda::__argument::__unwrap(def)[0] == 42);
-    static_assert(cuda::__argument::__traits<decltype(def)>::lowest == 1);
-    static_assert(cuda::__argument::__traits<decltype(def)>::highest == 1000);
+    auto def = cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}, cuda::argument::bounds<1, 1000>()};
+    assert(cuda::argument::__unwrap(def)[0] == 42);
+    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
+    static_assert(cuda::argument::__traits<decltype(def)>::highest == 1000);
   }
 
   // Deferred single value via pointer
   {
     int val     = 42;
-    using def_t = cuda::__argument::__deferred<int*, cuda::__argument::__static_bounds<0, 100>>;
-    static_assert(cuda::__argument::__traits<def_t>::lowest == 0);
-    static_assert(cuda::__argument::__traits<def_t>::highest == 100);
+    using def_t = cuda::argument::deferred<int*, cuda::argument::__static_bounds<0, 100>>;
+    static_assert(cuda::argument::__traits<def_t>::lowest == 0);
+    static_assert(cuda::argument::__traits<def_t>::highest == 100);
     // Also verify construction works
-    auto def = cuda::__argument::__deferred{&val, cuda::__argument::__bounds<0, 100>()};
-    assert(cuda::__argument::__unwrap(def) == &val);
+    auto def = cuda::argument::deferred{&val, cuda::argument::bounds<0, 100>()};
+    assert(cuda::argument::__unwrap(def) == &val);
   }
 
   // Deferred single value via fancy iterator
   {
     auto it  = cuda::counting_iterator<int>{42};
-    auto def = cuda::__argument::__deferred{it, cuda::__argument::__bounds<0, 100>()};
-    assert(cuda::__argument::__unwrap(def)[0] == 42);
-    static_assert(cuda::__argument::__traits<decltype(def)>::lowest == 0);
-    static_assert(cuda::__argument::__traits<decltype(def)>::highest == 100);
-    static_assert(cuda::__argument::__traits<decltype(def)>::is_single_value);
+    auto def = cuda::argument::deferred{it, cuda::argument::bounds<0, 100>()};
+    assert(cuda::argument::__unwrap(def)[0] == 42);
+    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 0);
+    static_assert(cuda::argument::__traits<decltype(def)>::highest == 100);
+    static_assert(cuda::argument::__traits<decltype(def)>::is_single_value);
   }
 
   // Deferred single value with both bounds, runtime bounds first
   {
     int val  = 42;
-    auto def = cuda::__argument::__deferred{
-      cuda::std::span<int, 1>{&val, 1}, cuda::__argument::__bounds(5, 100), cuda::__argument::__bounds<1, 256>()};
-    static_assert(cuda::__argument::__traits<decltype(def)>::lowest == 1);
-    static_assert(cuda::__argument::__traits<decltype(def)>::highest == 256);
-    assert(cuda::__argument::__lowest_(def) == 5);
-    assert(cuda::__argument::__highest_(def) == 100);
+    auto def = cuda::argument::deferred{
+      cuda::std::span<int, 1>{&val, 1}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 256>()};
+    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
+    static_assert(cuda::argument::__traits<decltype(def)>::highest == 256);
+    assert(def.__runtime_bounds_.__lower_ == 5);
+    assert(def.__runtime_bounds_.__upper_ == 100);
+    assert(cuda::argument::__lowest_(def) == 5);
+    assert(cuda::argument::__highest_(def) == 100);
+    def.__runtime_bounds_.__upper_ = 90;
+    assert(cuda::argument::__highest_(def) == 90);
   }
 
   // Deferred sequence via fancy iterator
   {
     auto it  = cuda::counting_iterator<int>{10};
-    auto def = cuda::__argument::__deferred_sequence{it, cuda::__argument::__bounds<0, 100>()};
-    assert(cuda::__argument::__unwrap(def)[0] == 10);
-    assert(cuda::__argument::__unwrap(def)[2] == 12);
-    static_assert(cuda::__argument::__traits<decltype(def)>::lowest == 0);
-    static_assert(cuda::__argument::__traits<decltype(def)>::highest == 100);
-    static_assert(!cuda::__argument::__traits<decltype(def)>::is_single_value);
+    auto def = cuda::argument::deferred_sequence{it, cuda::argument::bounds<0, 100>()};
+    assert(cuda::argument::__unwrap(def)[0] == 10);
+    assert(cuda::argument::__unwrap(def)[2] == 12);
+    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 0);
+    static_assert(cuda::argument::__traits<decltype(def)>::highest == 100);
+    static_assert(!cuda::argument::__traits<decltype(def)>::is_single_value);
   }
 
   // Deferred sequence with both bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto def   = cuda::__argument::__deferred_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::__argument::__bounds<1, 4096>(), cuda::__argument::__bounds(5, 100)};
-    static_assert(cuda::__argument::__traits<decltype(def)>::lowest == 1);
-    assert(cuda::__argument::__lowest_(def) == 5);
-    assert(cuda::__argument::__highest_(def) == 100);
+    auto def   = cuda::argument::deferred_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::argument::bounds<1, 4096>(), cuda::argument::bounds(5, 100)};
+    assert(def.__arg_.size() == 4);
+    assert(def.__runtime_bounds_.__lower_ == 5);
+    assert(def.__runtime_bounds_.__upper_ == 100);
+    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
+    assert(cuda::argument::__lowest_(def) == 5);
+    assert(cuda::argument::__highest_(def) == 100);
   }
 
   // Deferred sequence with both bounds, runtime bounds first
   {
     int arr[4] = {10, 20, 30, 40};
-    auto def   = cuda::__argument::__deferred_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::__argument::__bounds(5, 100), cuda::__argument::__bounds<1, 4096>()};
-    static_assert(cuda::__argument::__traits<decltype(def)>::lowest == 1);
-    static_assert(cuda::__argument::__traits<decltype(def)>::highest == 4096);
-    assert(cuda::__argument::__lowest_(def) == 5);
-    assert(cuda::__argument::__highest_(def) == 100);
+    auto def   = cuda::argument::deferred_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 4096>()};
+    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
+    static_assert(cuda::argument::__traits<decltype(def)>::highest == 4096);
+    assert(cuda::argument::__lowest_(def) == 5);
+    assert(cuda::argument::__highest_(def) == 100);
   }
 
   // Traits: deferred is single value
   {
-    using traits = cuda::__argument::__traits<cuda::__argument::__deferred<cuda::std::span<int, 1>>>;
+    using traits = cuda::argument::__traits<cuda::argument::deferred<cuda::std::span<int, 1>>>;
     static_assert(traits::is_deferred);
     static_assert(traits::is_single_value);
   }
 
   // Traits: deferred with pointer is also single value
   {
-    using traits = cuda::__argument::__traits<cuda::__argument::__deferred<int*>>;
+    using traits = cuda::argument::__traits<cuda::argument::deferred<int*>>;
     static_assert(traits::is_deferred);
     static_assert(traits::is_single_value);
   }
 
   // Traits: deferred_sequence is not single value
   {
-    using traits = cuda::__argument::__traits<cuda::__argument::__deferred_sequence<cuda::std::span<int>>>;
+    using traits = cuda::argument::__traits<cuda::argument::deferred_sequence<cuda::std::span<int>>>;
     static_assert(traits::is_deferred);
     static_assert(!traits::is_single_value);
   }
@@ -125,16 +133,16 @@ TEST_FUNC constexpr bool test()
   // Unwrap: deferred
   {
     int val  = 99;
-    auto def = cuda::__argument::__deferred{cuda::std::span<int, 1>{&val, 1}};
-    auto& v  = cuda::__argument::__unwrap(def);
+    auto def = cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}};
+    auto& v  = cuda::argument::__unwrap(def);
     assert(v[0] == 99);
   }
 
   // Unwrap: deferred_sequence
   {
     int arr[3]    = {10, 20, 30};
-    auto def      = cuda::__argument::__deferred_sequence{cuda::std::span<int>{arr, 3}};
-    const auto& v = cuda::__argument::__unwrap(def);
+    auto def      = cuda::argument::deferred_sequence{cuda::std::span<int>{arr, 3}};
+    const auto& v = cuda::argument::__unwrap(def);
     assert(v.size() == 3);
     assert(v[1] == 20);
   }
@@ -142,14 +150,14 @@ TEST_FUNC constexpr bool test()
   // Unwrap: rvalue deferred returns by value
   {
     int val = 99;
-    auto v  = cuda::__argument::__unwrap(cuda::__argument::__deferred{cuda::std::span<int, 1>{&val, 1}});
+    auto v  = cuda::argument::__unwrap(cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}});
     assert(v[0] == 99);
   }
 
   // Unwrap: rvalue deferred_sequence returns by value
   {
     int arr[3] = {10, 20, 30};
-    auto v     = cuda::__argument::__unwrap(cuda::__argument::__deferred_sequence{cuda::std::span<int>{arr, 3}});
+    auto v     = cuda::argument::__unwrap(cuda::argument::deferred_sequence{cuda::std::span<int>{arr, 3}});
     assert(v.size() == 3);
     assert(v[2] == 30);
   }
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp
index c9723304774..e969b9bd234 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp
@@ -8,7 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/std/cassert>
 #include <cuda/std/limits>
 #include <cuda/std/span>
@@ -25,93 +25,99 @@ TEST_FUNC constexpr bool test()
 {
   // Uniform scalar via CTAD
   {
-    auto da = cuda::__argument::__immediate{5};
-    assert(cuda::__argument::__unwrap(da) == 5);
-    static_assert(cuda::__argument::__traits<decltype(da)>::lowest == cuda::std::numeric_limits<int>::lowest());
-    static_assert(cuda::__argument::__traits<decltype(da)>::highest == (cuda::std::numeric_limits<int>::max)());
-    assert(cuda::__argument::__lowest_(da) == 5);
-    assert(cuda::__argument::__highest_(da) == 5);
+    auto da = cuda::argument::immediate{5};
+    assert(cuda::argument::__unwrap(da) == 5);
+    assert(da.__arg_ == 5);
+    static_assert(cuda::argument::__traits<decltype(da)>::lowest == cuda::std::numeric_limits<int>::lowest());
+    static_assert(cuda::argument::__traits<decltype(da)>::highest == (cuda::std::numeric_limits<int>::max)());
+    assert(cuda::argument::__lowest_(da) == 5);
+    assert(cuda::argument::__highest_(da) == 5);
+    da.__arg_ = 6;
+    assert(cuda::argument::__unwrap(da) == 6);
   }
 
   // Uniform scalar with static bounds
   {
-    auto da = cuda::__argument::__immediate{5, cuda::__argument::__bounds<1, 8>()};
-    assert(cuda::__argument::__unwrap(da) == 5);
-    static_assert(cuda::__argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::__argument::__traits<decltype(da)>::highest == 8);
-    assert(cuda::__argument::__lowest_(da) == 5);
-    assert(cuda::__argument::__highest_(da) == 5);
+    auto da = cuda::argument::immediate{5, cuda::argument::bounds<1, 8>()};
+    assert(cuda::argument::__unwrap(da) == 5);
+    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::argument::__traits<decltype(da)>::highest == 8);
+    assert(cuda::argument::__lowest_(da) == 5);
+    assert(cuda::argument::__highest_(da) == 5);
   }
 
   // Non-sequence values are accepted without scalar-only restrictions
   {
-    auto da = cuda::__argument::__immediate{non_sequence_value{7}};
-    assert(cuda::__argument::__unwrap(da).payload == 7);
+    auto da = cuda::argument::immediate{non_sequence_value{7}};
+    assert(cuda::argument::__unwrap(da).payload == 7);
   }
 
   // Pointer-like types can still represent a single value when explicitly wrapped that way
   {
     int value = 11;
-    auto da   = cuda::__argument::__immediate{&value};
-    static_assert(cuda::__argument::__traits<decltype(da)>::is_single_value);
-    assert(*cuda::__argument::__unwrap(da) == 11);
+    auto da   = cuda::argument::immediate{&value};
+    static_assert(cuda::argument::__traits<decltype(da)>::is_single_value);
+    assert(*cuda::argument::__unwrap(da) == 11);
   }
 
   // Per-segment span with runtime bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da =
-      cuda::__argument::__immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::__argument::__bounds(1L, 100L)};
-    assert(cuda::__argument::__unwrap(da).size() == 4);
-    assert(cuda::__argument::__lowest_(da) == 1);
-    assert(cuda::__argument::__highest_(da) == 100);
+    auto da    = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::argument::bounds(1L, 100L)};
+    assert(cuda::argument::__unwrap(da).size() == 4);
+    assert(da.__arg_.size() == 4);
+    assert(da.__runtime_bounds_.__lower_ == 1);
+    assert(da.__runtime_bounds_.__upper_ == 100);
+    assert(cuda::argument::__lowest_(da) == 1);
+    assert(cuda::argument::__highest_(da) == 100);
+    da.__runtime_bounds_.__upper_ = 90;
+    assert(cuda::argument::__highest_(da) == 90);
   }
 
   // Per-segment span with both bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da    = cuda::__argument::__immediate_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(10, 200)};
-    static_assert(cuda::__argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::__argument::__traits<decltype(da)>::highest == 256);
-    assert(cuda::__argument::__lowest_(da) == 10);
-    assert(cuda::__argument::__highest_(da) == 200);
+    auto da    = cuda::argument::immediate_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(10, 200)};
+    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::argument::__traits<decltype(da)>::highest == 256);
+    assert(cuda::argument::__lowest_(da) == 10);
+    assert(cuda::argument::__highest_(da) == 200);
   }
 
   // Per-segment span with both bounds, runtime bounds first
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da    = cuda::__argument::__immediate_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::__argument::__bounds(10, 200), cuda::__argument::__bounds<1, 256>()};
-    static_assert(cuda::__argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::__argument::__traits<decltype(da)>::highest == 256);
-    assert(cuda::__argument::__lowest_(da) == 10);
-    assert(cuda::__argument::__highest_(da) == 200);
+    auto da    = cuda::argument::immediate_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::argument::bounds(10, 200), cuda::argument::bounds<1, 256>()};
+    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::argument::__traits<decltype(da)>::highest == 256);
+    assert(cuda::argument::__lowest_(da) == 10);
+    assert(cuda::argument::__highest_(da) == 200);
   }
 
   // Per-segment via span
   {
     int arr[4] = {1, 2, 3, 4};
-    auto da    = cuda::__argument::__immediate_sequence{cuda::std::span<int>{arr, 4}};
-    assert(cuda::__argument::__unwrap(da).size() == 4);
-    assert(cuda::__argument::__unwrap(da)[0] == 1);
-    assert(cuda::__argument::__unwrap(da)[3] == 4);
+    auto da    = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 4}};
+    assert(cuda::argument::__unwrap(da).size() == 4);
+    assert(cuda::argument::__unwrap(da)[0] == 1);
+    assert(cuda::argument::__unwrap(da)[3] == 4);
   }
 
   // Per-segment with static bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da =
-      cuda::__argument::__immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::__argument::__bounds<1, 100>()};
-    assert(cuda::__argument::__unwrap(da).size() == 4);
-    assert(cuda::__argument::__unwrap(da)[2] == 30);
-    static_assert(cuda::__argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::__argument::__traits<decltype(da)>::highest == 100);
+    auto da    = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::argument::bounds<1, 100>()};
+    assert(cuda::argument::__unwrap(da).size() == 4);
+    assert(cuda::argument::__unwrap(da)[2] == 30);
+    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::argument::__traits<decltype(da)>::highest == 100);
   }
 
   // Traits
   {
-    using traits = cuda::__argument::__traits<cuda::__argument::__immediate<int>>;
+    using traits = cuda::argument::__traits<cuda::argument::immediate<int>>;
     static_assert(!traits::is_deferred);
     static_assert(traits::is_single_value);
     static_assert(cuda::std::is_same_v<traits::value_type, int>);
@@ -119,48 +125,47 @@ TEST_FUNC constexpr bool test()
 
   // Sequence traits
   {
-    using traits = cuda::__argument::__traits<cuda::__argument::__immediate_sequence<cuda::std::span<int>>>;
+    using traits = cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>;
     static_assert(!traits::is_deferred);
     static_assert(!traits::is_single_value);
     static_assert(cuda::std::is_same_v<traits::value_type, cuda::std::span<int>>);
   }
 
-  // __is_single_value_v on unwrapped types
+  // __is_sequence_v on unwrapped types
   {
     static_assert(
-      !cuda::__argument::__is_sequence_v<cuda::__argument::__traits<cuda::__argument::__immediate<int>>::value_type>);
-    static_assert(
-      !cuda::__argument::__traits<cuda::__argument::__immediate_sequence<cuda::std::span<int>>>::is_single_value);
+      !cuda::argument::__is_sequence_v<cuda::argument::__traits<cuda::argument::immediate<int>>::value_type>);
+    static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_single_value);
   }
 
   // Unwrap: scalar
   {
-    auto da = cuda::__argument::__immediate{7};
-    auto& v = cuda::__argument::__unwrap(da);
+    auto da = cuda::argument::immediate{7};
+    auto& v = cuda::argument::__unwrap(da);
     assert(v == 7);
     v = 8;
-    assert(cuda::__argument::__unwrap(da) == 8);
+    assert(cuda::argument::__unwrap(da) == 8);
   }
 
   // Unwrap: span
   {
     int arr[3]    = {10, 20, 30};
-    auto da       = cuda::__argument::__immediate_sequence{cuda::std::span<int>{arr, 3}};
-    const auto& v = cuda::__argument::__unwrap(da);
+    auto da       = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 3}};
+    const auto& v = cuda::argument::__unwrap(da);
     assert(v.size() == 3);
     assert(v[1] == 20);
   }
 
   // Unwrap: rvalue scalar returns by value
   {
-    const auto& v = cuda::__argument::__unwrap(cuda::__argument::__immediate{7});
+    const auto& v = cuda::argument::__unwrap(cuda::argument::immediate{7});
     assert(v == 7);
   }
 
   // Unwrap: rvalue span returns by value
   {
     int arr[3] = {10, 20, 30};
-    auto v     = cuda::__argument::__unwrap(cuda::__argument::__immediate_sequence{cuda::std::span<int>{arr, 3}});
+    auto v     = cuda::argument::__unwrap(cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 3}});
     assert(v.size() == 3);
     assert(v[2] == 30);
   }
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp
index 85ccaf1c8a0..cb6e78cade3 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp
@@ -8,7 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/std/array>
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
@@ -24,37 +24,37 @@ TEST_FUNC void test()
 {
   // Basic value
   {
-    constexpr auto sa = cuda::__argument::__constant<42>{};
+    constexpr auto sa = cuda::argument::constant<42>{};
     static_assert(sa.value() == 42);
     static_assert(cuda::std::is_same_v<decltype(sa)::value_type, int>);
   }
 
   // Different types
   {
-    constexpr auto sa_long = cuda::__argument::__constant<100L>{};
+    constexpr auto sa_long = cuda::argument::constant<100L>{};
     static_assert(sa_long.value() == 100L);
     static_assert(cuda::std::is_same_v<decltype(sa_long)::value_type, long>);
   }
 
   // Negative value
   {
-    constexpr auto sa_neg = cuda::__argument::__constant<-1>{};
+    constexpr auto sa_neg = cuda::argument::constant<-1>{};
     static_assert(sa_neg.value() == -1);
   }
 
 #if TEST_HAS_CLASS_NTTP
   // Non-sequence values are accepted without scalar-only restrictions
   {
-    constexpr auto sa = cuda::__argument::__constant<non_sequence_value{7}>{};
+    constexpr auto sa = cuda::argument::constant<non_sequence_value{7}>{};
     static_assert(sa.value().payload == 7);
-    static_assert(cuda::__argument::__unwrap(sa).payload == 7);
+    static_assert(cuda::argument::__unwrap(sa).payload == 7);
   }
 #endif // TEST_HAS_CLASS_NTTP
 
 #if TEST_HAS_CLASS_NTTP
   // Array sequence
   {
-    constexpr auto sa_arr = cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
+    constexpr auto sa_arr = cuda::argument::constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
     static_assert(sa_arr.value()[0] == 128);
     static_assert(sa_arr.value()[1] == 256);
     static_assert(sa_arr.value()[2] == 512);
@@ -64,32 +64,32 @@ TEST_FUNC void test()
 
   // Bounds: scalar
   {
-    constexpr auto sa = cuda::__argument::__constant<42>{};
-    static_assert(cuda::__argument::__lowest_(sa) == 42);
-    static_assert(cuda::__argument::__highest_(sa) == 42);
+    constexpr auto sa = cuda::argument::constant<42>{};
+    static_assert(cuda::argument::__lowest_(sa) == 42);
+    static_assert(cuda::argument::__highest_(sa) == 42);
   }
 
 #if TEST_HAS_CLASS_NTTP
   // Bounds: array sequence computes lowest/highest of elements
   {
-    constexpr auto sa = cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
-    static_assert(cuda::__argument::__lowest_(sa) == 128);
-    static_assert(cuda::__argument::__highest_(sa) == 512);
+    constexpr auto sa = cuda::argument::constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
+    static_assert(cuda::argument::__lowest_(sa) == 128);
+    static_assert(cuda::argument::__highest_(sa) == 512);
   }
 #endif // TEST_HAS_CLASS_NTTP
 
 #if TEST_HAS_CLASS_NTTP
   // Bounds: empty array sequence has unconstrained element bounds
   {
-    constexpr auto sa = cuda::__argument::__constant_sequence<cuda::std::array<int, 0>{}>{};
-    static_assert(cuda::__argument::__lowest_(sa) == cuda::std::numeric_limits<int>::lowest());
-    static_assert(cuda::__argument::__highest_(sa) == (cuda::std::numeric_limits<int>::max)());
+    constexpr auto sa = cuda::argument::constant_sequence<cuda::std::array<int, 0>{}>{};
+    static_assert(cuda::argument::__lowest_(sa) == cuda::std::numeric_limits<int>::lowest());
+    static_assert(cuda::argument::__highest_(sa) == (cuda::std::numeric_limits<int>::max)());
   }
 #endif // TEST_HAS_CLASS_NTTP
 
   // Traits
   {
-    using traits = cuda::__argument::__traits<cuda::__argument::__constant<42>>;
+    using traits = cuda::argument::__traits<cuda::argument::constant<42>>;
     static_assert(!traits::is_deferred);
     static_assert(traits::is_constant);
     static_assert(traits::is_single_value);
@@ -101,7 +101,7 @@ TEST_FUNC void test()
 #if TEST_HAS_CLASS_NTTP
   // Sequence traits
   {
-    using traits = cuda::__argument::__traits<cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>;
+    using traits = cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>;
     static_assert(traits::is_constant);
     static_assert(!traits::is_deferred);
     static_assert(!traits::is_single_value);
@@ -113,25 +113,25 @@ TEST_FUNC void test()
   // Single value: scalar is single, sequence is not
   {
     static_assert(
-      !cuda::__argument::__is_sequence_v<cuda::__argument::__traits<cuda::__argument::__constant<42>>::value_type>);
+      !cuda::argument::__is_sequence_v<cuda::argument::__traits<cuda::argument::constant<42>>::value_type>);
 #if TEST_HAS_CLASS_NTTP
-    static_assert(!cuda::__argument::__traits<
-                  cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
+    static_assert(
+      !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
 #endif // TEST_HAS_CLASS_NTTP
   }
 
   // Unwrap: scalar
   {
-    constexpr auto sa  = cuda::__argument::__constant<42>{};
-    constexpr auto val = cuda::__argument::__unwrap(sa);
+    constexpr auto sa  = cuda::argument::constant<42>{};
+    constexpr auto val = cuda::argument::__unwrap(sa);
     static_assert(val == 42);
   }
 
 #if TEST_HAS_CLASS_NTTP
   // Unwrap: sequence
   {
-    constexpr auto sa  = cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{10, 20, 30}>{};
-    constexpr auto val = cuda::__argument::__unwrap(sa);
+    constexpr auto sa  = cuda::argument::constant_sequence<cuda::std::array<int, 3>{10, 20, 30}>{};
+    constexpr auto val = cuda::argument::__unwrap(sa);
     static_assert(val[0] == 10);
     static_assert(val[2] == 30);
   }
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
index 19e475ef453..79a96a3366e 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
@@ -8,11 +8,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 
-using arg_t = cuda::__argument::__immediate<unsigned char, cuda::__argument::__static_bounds<0, 1000>>;
+using arg_t = cuda::argument::immediate<unsigned char, cuda::argument::__static_bounds<0, 1000>>;
 
-[[maybe_unused]] constexpr auto invalid_highest = cuda::__argument::__traits<arg_t>::highest;
+[[maybe_unused]] constexpr auto invalid_highest = cuda::argument::__traits<arg_t>::highest;
 
 int main(int, char**)
 {
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
index 8d240e0cf3e..d3a0499fc55 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
@@ -8,9 +8,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 
-[[maybe_unused]] constexpr auto invalid_bounds = cuda::__argument::__static_bounds<0, 1L>{};
+[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::__static_bounds<0, 1L>{};
 
 int main(int, char**)
 {
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp
index 8cc239585ac..b7aca57bbdf 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp
@@ -13,7 +13,7 @@
 // All argument types (plain values, static, dynamic, deferred) work uniformly
 // through the free functions.
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/std/algorithm>
 #include <cuda/std/array>
 #include <cuda/std/cassert>
@@ -34,7 +34,7 @@ enum class algorithm_variant
 template <class _SegSizeArg>
 TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg)
 {
-  if constexpr (cuda::__argument::__traits<_SegSizeArg>::highest <= shared_memory_capacity)
+  if constexpr (cuda::argument::__traits<_SegSizeArg>::highest <= shared_memory_capacity)
   {
     return algorithm_variant::shared_memory;
   }
@@ -48,7 +48,7 @@ TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg)
 template <class _SegSizeArg>
 TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_segments)
 {
-  auto __highest = cuda::std::min(default_max_segment_size, static_cast<int>(cuda::__argument::__highest_(__seg_size)));
+  auto __highest = cuda::std::min(default_max_segment_size, static_cast<int>(cuda::argument::__highest_(__seg_size)));
   return __highest * __num_segments;
 }
 
@@ -56,9 +56,9 @@ TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_se
 template <class _SegSizeArg>
 TEST_FUNC constexpr int process_segments(_SegSizeArg __seg_size)
 {
-  const auto& __val = cuda::__argument::__unwrap(__seg_size);
+  const auto& __val = cuda::argument::__unwrap(__seg_size);
 
-  if constexpr (cuda::__argument::__traits<_SegSizeArg>::is_single_value)
+  if constexpr (cuda::argument::__traits<_SegSizeArg>::is_single_value)
   {
     return static_cast<int>(__val);
   }
@@ -95,7 +95,7 @@ TEST_FUNC constexpr bool test()
 
   // static_argument: scalar, fits in shared memory, buffer = value
   {
-    constexpr auto seg_size = cuda::__argument::__constant<128>{};
+    constexpr auto seg_size = cuda::argument::constant<128>{};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_size, 4) == 128 * 4);
     assert(process_segments(seg_size) == 128);
@@ -104,7 +104,7 @@ TEST_FUNC constexpr bool test()
 #if TEST_HAS_CLASS_NTTP
   // static_argument: array sequence, highest fits in shared memory
   {
-    constexpr auto seg_sizes = cuda::__argument::__constant_sequence<cuda::std::array{64, 128, 256}>{};
+    constexpr auto seg_sizes = cuda::argument::constant_sequence<cuda::std::array{64, 128, 256}>{};
     static_assert(select_variant(seg_sizes) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 256 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 256);
@@ -112,7 +112,7 @@ TEST_FUNC constexpr bool test()
 
   // static_argument: array sequence, highest exceeds shared memory, buffer clamped
   {
-    constexpr auto seg_sizes = cuda::__argument::__constant_sequence<cuda::std::array{64, 128, 512}>{};
+    constexpr auto seg_sizes = cuda::argument::constant_sequence<cuda::std::array{64, 128, 512}>{};
     static_assert(select_variant(seg_sizes) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 512 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 512);
@@ -121,7 +121,7 @@ TEST_FUNC constexpr bool test()
 
   // dynamic_argument: tight static bounds, shared memory, buffer = value
   {
-    constexpr auto seg_size = cuda::__argument::__immediate{100, cuda::__argument::__bounds<1, 256>()};
+    constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 256>()};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_size, 4) == 100 * 4);
     assert(process_segments(seg_size) == 100);
@@ -129,7 +129,7 @@ TEST_FUNC constexpr bool test()
 
   // dynamic_argument: wide static bounds, global memory, buffer = value
   {
-    constexpr auto seg_size = cuda::__argument::__immediate{100, cuda::__argument::__bounds<1, 4096>()};
+    constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 4096>()};
     static_assert(select_variant(seg_size) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_size, 4) == 100 * 4);
     assert(process_segments(seg_size) == 100);
@@ -137,7 +137,7 @@ TEST_FUNC constexpr bool test()
 
   // dynamic_argument: no bounds, global memory, buffer = value
   {
-    constexpr auto seg_size = cuda::__argument::__immediate{100};
+    constexpr auto seg_size = cuda::argument::immediate{100};
     static_assert(select_variant(seg_size) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_size, 4) == 100 * 4);
     assert(process_segments(seg_size) == 100);
@@ -145,9 +145,8 @@ TEST_FUNC constexpr bool test()
 
   // dynamic_argument: per-segment span with runtime bounds only
   {
-    int sizes[3] = {64, 128, 96};
-    auto seg_sizes =
-      cuda::__argument::__immediate_sequence{cuda::std::span<int>{sizes, 3}, cuda::__argument::__bounds(1, 200)};
+    int sizes[3]   = {64, 128, 96};
+    auto seg_sizes = cuda::argument::immediate_sequence{cuda::std::span<int>{sizes, 3}, cuda::argument::bounds(1, 200)};
     assert(select_variant(seg_sizes) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 200 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 96);
@@ -156,9 +155,9 @@ TEST_FUNC constexpr bool test()
   // dynamic_argument: per-segment span with both bounds
   {
     int sizes[3]   = {64, 128, 96};
-    auto seg_sizes = cuda::__argument::__immediate_sequence{
-      cuda::std::span<int>{sizes, 3}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(1, 200)};
-    static_assert(cuda::__argument::__traits<decltype(seg_sizes)>::highest <= shared_memory_capacity);
+    auto seg_sizes = cuda::argument::immediate_sequence{
+      cuda::std::span<int>{sizes, 3}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)};
+    static_assert(cuda::argument::__traits<decltype(seg_sizes)>::highest <= shared_memory_capacity);
     assert(select_variant(seg_sizes) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 200 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 96);
@@ -167,9 +166,9 @@ TEST_FUNC constexpr bool test()
   // deferred_argument: uniform, bounds for decisions only
   {
     int val       = 100;
-    auto seg_size = cuda::__argument::__deferred{
-      cuda::std::span<int, 1>{&val, 1}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(1, 200)};
-    static_assert(cuda::__argument::__traits<decltype(seg_size)>::highest <= shared_memory_capacity);
+    auto seg_size = cuda::argument::deferred{
+      cuda::std::span<int, 1>{&val, 1}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)};
+    static_assert(cuda::argument::__traits<decltype(seg_size)>::highest <= shared_memory_capacity);
     assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_size, 4) == 200 * 4);
   }
@@ -185,14 +184,14 @@ TEST_FUNC constexpr bool test()
 #if TEST_HAS_CLASS_NTTP
   // static_argument float (float NTTPs require C++20)
   {
-    constexpr auto seg_size = cuda::__argument::__constant<128.0f>{};
+    constexpr auto seg_size = cuda::argument::constant<128.0f>{};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(process_segments(seg_size) == 128);
   }
 
   // dynamic_argument float with static bounds
   {
-    constexpr auto seg_size = cuda::__argument::__immediate{100.0f, cuda::__argument::__bounds<1.0f, 256.0f>()};
+    constexpr auto seg_size = cuda::argument::immediate{100.0f, cuda::argument::bounds<1.0f, 256.0f>()};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(process_segments(seg_size) == 100);
   }

From f6494eee191de70a10a4e68fc67962e0abe04647 Mon Sep 17 00:00:00 2001
From: Piotr Ciolkosz <pciolkosz@nvidia.com>
Date: Wed, 3 Jun 2026 23:35:26 -0700
Subject: [PATCH 2/6] Forgot about the bounds types

---
 libcudacxx/include/cuda/__argument/argument.h | 87 +++++++++----------
 .../include/cuda/__argument/argument_bounds.h | 20 ++---
 .../cuda/argument/argument_bounds.pass.cpp    | 16 ++--
 .../cuda/argument/argument_traits.pass.cpp    |  6 +-
 .../cuda/argument/deferred_argument.pass.cpp  |  2 +-
 .../static_bounds_conversion.fail.cpp         |  2 +-
 .../static_bounds_type_mismatch.fail.cpp      |  2 +-
 7 files changed, 67 insertions(+), 68 deletions(-)

diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h
index 2df19b9ac5d..f823b27f6b1 100644
--- a/libcudacxx/include/cuda/__argument/argument.h
+++ b/libcudacxx/include/cuda/__argument/argument.h
@@ -166,13 +166,13 @@ template <class _ElementType, class _StaticBounds>
 inline constexpr bool __valid_static_bounds_v = true;
 
 template <class _ElementType, auto _Lowest, auto _Highest>
-inline constexpr bool __valid_static_bounds_v<_ElementType, __static_bounds<_Lowest, _Highest>> =
+inline constexpr bool __valid_static_bounds_v<_ElementType, static_bounds<_Lowest, _Highest>> =
   __static_bound_in_range<_ElementType, _Lowest>() && __static_bound_in_range<_ElementType, _Highest>();
 
 template <class _ElementType, class _StaticBounds>
 _CCCL_API constexpr _ElementType __wrapper_static_lowest() noexcept
 {
-  if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>)
+  if constexpr (::cuda::std::is_same_v<_StaticBounds, no_bounds>)
   {
     return ::cuda::std::numeric_limits<_ElementType>::lowest();
   }
@@ -185,7 +185,7 @@ _CCCL_API constexpr _ElementType __wrapper_static_lowest() noexcept
 template <class _ElementType, class _StaticBounds>
 _CCCL_API constexpr _ElementType __wrapper_static_highest() noexcept
 {
-  if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>)
+  if constexpr (::cuda::std::is_same_v<_StaticBounds, no_bounds>)
   {
     return (::cuda::std::numeric_limits<_ElementType>::max)();
   }
@@ -196,28 +196,28 @@ _CCCL_API constexpr _ElementType __wrapper_static_highest() noexcept
 }
 
 template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr _ElementType __effective_lowest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
+_CCCL_API constexpr _ElementType __effective_lowest(runtime_bounds<_ElementType> __runtime_bounds) noexcept
 {
   auto __static_lowest = __wrapper_static_lowest<_ElementType, _StaticBounds>();
   return __static_lowest > __runtime_bounds.lower() ? __static_lowest : __runtime_bounds.lower();
 }
 
 template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr _ElementType __effective_highest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
+_CCCL_API constexpr _ElementType __effective_highest(runtime_bounds<_ElementType> __runtime_bounds) noexcept
 {
   auto __static_highest = __wrapper_static_highest<_ElementType, _StaticBounds>();
   return __static_highest < __runtime_bounds.upper() ? __static_highest : __runtime_bounds.upper();
 }
 
 template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr bool __has_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
+_CCCL_API constexpr bool __has_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept
 {
   return __effective_lowest<_ElementType, _StaticBounds>(__runtime_bounds)
       <= __effective_highest<_ElementType, _StaticBounds>(__runtime_bounds);
 }
 
 template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr void __validate_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
+_CCCL_API constexpr void __validate_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept
 {
   static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>,
                 "static argument bounds cannot be represented by the element type");
@@ -228,7 +228,7 @@ _CCCL_API constexpr void __validate_bounds_intersection(__runtime_bounds<_Elemen
 template <class _ElementType, class _StaticBounds>
 _CCCL_API constexpr void __validate_static_element_bounds([[maybe_unused]] const _ElementType& __val) noexcept
 {
-  if constexpr (!::cuda::std::is_same_v<_StaticBounds, __no_bounds>)
+  if constexpr (!::cuda::std::is_same_v<_StaticBounds, no_bounds>)
   {
     _CCCL_ASSERT((__val >= __wrapper_static_lowest<_ElementType, _StaticBounds>()),
                  "immediate argument value is below static lowest bound");
@@ -239,7 +239,7 @@ _CCCL_API constexpr void __validate_static_element_bounds([[maybe_unused]] const
 
 template <class _ElementType>
 _CCCL_API constexpr void __validate_runtime_element_bounds(
-  [[maybe_unused]] const _ElementType& __val, [[maybe_unused]] __runtime_bounds<_ElementType> __runtime_bounds) noexcept
+  [[maybe_unused]] const _ElementType& __val, [[maybe_unused]] runtime_bounds<_ElementType> __runtime_bounds) noexcept
 {
   _CCCL_ASSERT((__val >= __runtime_bounds.lower()), "immediate argument value is below runtime lower bound");
   _CCCL_ASSERT((__val <= __runtime_bounds.upper()), "immediate argument value is above runtime upper bound");
@@ -252,7 +252,7 @@ _CCCL_API constexpr void __validate_runtime_element_bounds(
 //! @brief Wraps a runtime argument value with optional bounds.
 //!
 //! The value is host-accessible at API call time.
-template <class _Arg, class _StaticBounds = __no_bounds>
+template <class _Arg, class _StaticBounds = no_bounds>
 struct immediate
 {
   using __element_type = __element_type_of_t<_Arg>;
@@ -288,8 +288,8 @@ struct immediate
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE immediate(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> immediate<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate(_Arg, static_bounds<_Lowest, _Highest>)
+  -> immediate<_Arg, static_bounds<_Lowest, _Highest>>;
 
 #endif // _CCCL_DOXYGEN_INVOKED
 
@@ -298,7 +298,7 @@ _CCCL_HOST_DEVICE immediate(_Arg, __static_bounds<_Lowest, _Highest>)
 // =====================================================================
 
 //! @brief Wraps a runtime argument sequence with optional bounds.
-template <class _Arg, class _StaticBounds = __no_bounds>
+template <class _Arg, class _StaticBounds = no_bounds>
 struct immediate_sequence
 {
   using __element_type = __element_type_of_t<_Arg>;
@@ -308,7 +308,7 @@ struct immediate_sequence
                 "static argument bounds cannot be represented by the element type");
 
   _Arg __arg_;
-  __runtime_bounds<__element_type> __runtime_bounds_{};
+  runtime_bounds<__element_type> __runtime_bounds_{};
 
 private:
   _CCCL_API constexpr void __validate_bounds() const noexcept
@@ -353,7 +353,7 @@ struct immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -363,7 +363,7 @@ struct immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -373,23 +373,23 @@ struct immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
+  _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
       : immediate_sequence(::cuda::std::move(__arg), __sb, __rb)
   {}
 };
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>)
+  -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, auto _Lowest, auto _Highest, class _Tp>
-_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
-  -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>)
+  -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
-  -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>)
+  -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 // =====================================================================
@@ -397,7 +397,7 @@ _CCCL_HOST_DEVICE immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bound
 // =====================================================================
 
 //! @brief Common base for deferred argument wrappers.
-template <class _Arg, class _StaticBounds = __no_bounds>
+template <class _Arg, class _StaticBounds = no_bounds>
 struct __deferred_base
 {
   using __element_type = __element_type_of_t<_Arg>;
@@ -406,7 +406,7 @@ struct __deferred_base
                 "static argument bounds cannot be represented by the element type");
 
   _Arg __arg_;
-  __runtime_bounds<__element_type> __runtime_bounds_{};
+  runtime_bounds<__element_type> __runtime_bounds_{};
 
   _CCCL_API constexpr __deferred_base(_Arg __arg) noexcept
       : __arg_{::cuda::std::move(__arg)}
@@ -421,7 +421,7 @@ struct __deferred_base
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr __deferred_base(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr __deferred_base(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -430,7 +430,7 @@ struct __deferred_base
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr __deferred_base(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr __deferred_base(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -439,14 +439,14 @@ struct __deferred_base
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr __deferred_base(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
+  _CCCL_API constexpr __deferred_base(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
       : __deferred_base(::cuda::std::move(__arg), __sb, __rb)
   {}
 };
 
 //! @brief Wraps a reference to a single value that is potentially not available at API call time but will be available
 //! by the time the argument is consumed in stream order.
-template <class _Arg, class _StaticBounds = __no_bounds>
+template <class _Arg, class _StaticBounds = no_bounds>
 struct deferred : __deferred_base<_Arg, _StaticBounds>
 {
   using __deferred_base<_Arg, _StaticBounds>::__deferred_base;
@@ -457,24 +457,23 @@ template <class _Arg>
 _CCCL_HOST_DEVICE deferred(_Arg) -> deferred<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred(_Arg, static_bounds<_Lowest, _Highest>) -> deferred<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp>
-_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>) -> deferred<_Arg>;
+_CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>) -> deferred<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest, class _Tp>
-_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
-  -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>)
+  -> deferred<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
-  -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>)
+  -> deferred<_Arg, static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 //! @brief Wraps a reference to a sequence of values that is potentially not available at API call time but will be
 //! available by the time the argument is consumed in stream order.
-template <class _Arg, class _StaticBounds = __no_bounds>
+template <class _Arg, class _StaticBounds = no_bounds>
 struct deferred_sequence : __deferred_base<_Arg, _StaticBounds>
 {
   using __deferred_base<_Arg, _StaticBounds>::__deferred_base;
@@ -485,19 +484,19 @@ template <class _Arg>
 _CCCL_HOST_DEVICE deferred_sequence(_Arg) -> deferred_sequence<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, static_bounds<_Lowest, _Highest>)
+  -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp>
-_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>) -> deferred_sequence<_Arg>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, runtime_bounds<_Tp>) -> deferred_sequence<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest, class _Tp>
-_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
-  -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>)
+  -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
-  -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>)
+  -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 // =====================================================================
diff --git a/libcudacxx/include/cuda/__argument/argument_bounds.h b/libcudacxx/include/cuda/__argument/argument_bounds.h
index 3e70b4d1893..6d1943da32a 100644
--- a/libcudacxx/include/cuda/__argument/argument_bounds.h
+++ b/libcudacxx/include/cuda/__argument/argument_bounds.h
@@ -34,7 +34,7 @@
 _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT
 
 //! @brief Sentinel type indicating no bounds are present.
-struct __no_bounds
+struct no_bounds
 {};
 
 // =====================================================================
@@ -48,7 +48,7 @@ struct __no_bounds
 //! @tparam _Lower The static lower bound.
 //! @tparam _Upper The static upper bound.
 template <auto _Lower, auto _Upper>
-struct __static_bounds
+struct static_bounds
 {
   static_assert(::cuda::std::is_same_v<decltype(_Lower), decltype(_Upper)>,
                 "Static bounds endpoints must have the same type");
@@ -67,7 +67,7 @@ struct __static_bounds
 template <class _Tp>
 inline constexpr bool __is_static_bounds_v = false;
 template <auto _Lower, auto _Upper>
-inline constexpr bool __is_static_bounds_v<__static_bounds<_Lower, _Upper>> = true;
+inline constexpr bool __is_static_bounds_v<static_bounds<_Lower, _Upper>> = true;
 
 // =====================================================================
 // runtime_bounds
@@ -77,14 +77,14 @@ inline constexpr bool __is_static_bounds_v<__static_bounds<_Lower, _Upper>> = tr
 //!
 //! @tparam _Tp The value type of the bounds.
 template <class _Tp>
-struct __runtime_bounds
+struct runtime_bounds
 {
   _Tp __lower_ = ::cuda::std::numeric_limits<_Tp>::lowest();
   _Tp __upper_ = (::cuda::std::numeric_limits<_Tp>::max)();
 
-  constexpr __runtime_bounds() noexcept = default;
+  constexpr runtime_bounds() noexcept = default;
 
-  _CCCL_API constexpr __runtime_bounds(_Tp __lower, _Tp __upper) noexcept
+  _CCCL_API constexpr runtime_bounds(_Tp __lower, _Tp __upper) noexcept
       : __lower_(__lower)
       , __upper_(__upper)
   {
@@ -104,13 +104,13 @@ struct __runtime_bounds
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Tp>
-_CCCL_HOST_DEVICE __runtime_bounds(_Tp, _Tp) -> __runtime_bounds<_Tp>;
+_CCCL_HOST_DEVICE runtime_bounds(_Tp, _Tp) -> runtime_bounds<_Tp>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 template <class _Tp>
 inline constexpr bool __is_runtime_bounds_v = false;
 template <class _Tp>
-inline constexpr bool __is_runtime_bounds_v<__runtime_bounds<_Tp>> = true;
+inline constexpr bool __is_runtime_bounds_v<runtime_bounds<_Tp>> = true;
 
 // =====================================================================
 // bounds — factory functions
@@ -122,7 +122,7 @@ inline constexpr bool __is_runtime_bounds_v<__runtime_bounds<_Tp>> = true;
 //! @tparam _Upper The static upper bound.
 //! @return A compile-time bounds object.
 template <auto _Lower, auto _Upper>
-[[nodiscard]] _CCCL_API constexpr __static_bounds<_Lower, _Upper> bounds() noexcept
+[[nodiscard]] _CCCL_API constexpr static_bounds<_Lower, _Upper> bounds() noexcept
 {
   return {};
 }
@@ -133,7 +133,7 @@ template <auto _Lower, auto _Upper>
 //! @param __upper The runtime upper bound.
 //! @return A runtime bounds object.
 template <class _Tp>
-[[nodiscard]] _CCCL_API constexpr __runtime_bounds<_Tp> bounds(_Tp __lower, _Tp __upper) noexcept
+[[nodiscard]] _CCCL_API constexpr runtime_bounds<_Tp> bounds(_Tp __lower, _Tp __upper) noexcept
 {
   return {__lower, __upper};
 }
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
index 9a5717f32c3..4275c4aca9c 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
@@ -20,21 +20,21 @@ TEST_FUNC constexpr bool test()
 
   // Basic static bounds
   {
-    constexpr auto b = cuda::argument::__static_bounds<1, 4096>{};
+    constexpr auto b = cuda::argument::static_bounds<1, 4096>{};
     static_assert(b.lower() == 1);
     static_assert(b.upper() == 4096);
   }
 
   // Exact static bounds
   {
-    constexpr auto b = cuda::argument::__static_bounds<42, 42>{};
+    constexpr auto b = cuda::argument::static_bounds<42, 42>{};
     static_assert(b.lower() == 42);
     static_assert(b.upper() == 42);
   }
 
   // Long type deduced from NTTPs
   {
-    static_assert(cuda::std::is_same_v<decltype(cuda::argument::__static_bounds<0L, 1000L>::lower()), long>);
+    static_assert(cuda::std::is_same_v<decltype(cuda::argument::static_bounds<0L, 1000L>::lower()), long>);
   }
 
 #if TEST_HAS_CLASS_NTTP
@@ -52,7 +52,7 @@ TEST_FUNC constexpr bool test()
 
   // Basic runtime bounds
   {
-    auto b = cuda::argument::__runtime_bounds{10, 100};
+    auto b = cuda::argument::runtime_bounds{10, 100};
     assert(b.lower() == 10);
     assert(b.upper() == 100);
     assert(b.__lower_ == 10);
@@ -86,10 +86,10 @@ TEST_FUNC constexpr bool test()
 
   // Static and runtime bounds intersection
   {
-    static_assert(cuda::argument::__has_bounds_intersection<int, cuda::argument::__static_bounds<1, 100>>(
-      cuda::argument::__runtime_bounds<int>{50, 200}));
-    static_assert(!cuda::argument::__has_bounds_intersection<int, cuda::argument::__static_bounds<100, 200>>(
-      cuda::argument::__runtime_bounds<int>{0, 50}));
+    static_assert(cuda::argument::__has_bounds_intersection<int, cuda::argument::static_bounds<1, 100>>(
+      cuda::argument::runtime_bounds<int>{50, 200}));
+    static_assert(!cuda::argument::__has_bounds_intersection<int, cuda::argument::static_bounds<100, 200>>(
+      cuda::argument::runtime_bounds<int>{0, 50}));
   }
 
   // Non-bounds type
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
index 517893abb7d..c86f1caa251 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
@@ -151,12 +151,12 @@ TEST_FUNC void test()
   static_assert(cuda::argument::__traits<float>::lowest == cuda::std::numeric_limits<float>::lowest());
   static_assert(cuda::argument::__traits<float>::highest == (cuda::std::numeric_limits<float>::max)());
   static_assert(
-    cuda::argument::__traits<const cuda::argument::immediate<int, cuda::argument::__static_bounds<1, 8>>>::lowest == 1);
+    cuda::argument::__traits<const cuda::argument::immediate<int, cuda::argument::static_bounds<1, 8>>>::lowest == 1);
   static_assert(
-    cuda::argument::__traits<cuda::argument::immediate<int, cuda::argument::__static_bounds<1, 8>>&>::highest == 8);
+    cuda::argument::__traits<cuda::argument::immediate<int, cuda::argument::static_bounds<1, 8>>&>::highest == 8);
   static_assert(
     cuda::argument::__traits<
-      cuda::argument::immediate_sequence<cuda::std::span<int>, cuda::argument::__static_bounds<1, 8>>>::highest
+      cuda::argument::immediate_sequence<cuda::std::span<int>, cuda::argument::static_bounds<1, 8>>>::highest
     == 8);
 #if TEST_HAS_CLASS_NTTP
   static_assert(
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
index f9bda950aa7..5dc2fb849c7 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
@@ -41,7 +41,7 @@ TEST_FUNC constexpr bool test()
   // Deferred single value via pointer
   {
     int val     = 42;
-    using def_t = cuda::argument::deferred<int*, cuda::argument::__static_bounds<0, 100>>;
+    using def_t = cuda::argument::deferred<int*, cuda::argument::static_bounds<0, 100>>;
     static_assert(cuda::argument::__traits<def_t>::lowest == 0);
     static_assert(cuda::argument::__traits<def_t>::highest == 100);
     // Also verify construction works
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
index 79a96a3366e..7f9902e50a2 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
@@ -10,7 +10,7 @@
 
 #include <cuda/argument>
 
-using arg_t = cuda::argument::immediate<unsigned char, cuda::argument::__static_bounds<0, 1000>>;
+using arg_t = cuda::argument::immediate<unsigned char, cuda::argument::static_bounds<0, 1000>>;
 
 [[maybe_unused]] constexpr auto invalid_highest = cuda::argument::__traits<arg_t>::highest;
 
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
index d3a0499fc55..8170fdd7ee5 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
@@ -10,7 +10,7 @@
 
 #include <cuda/argument>
 
-[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::__static_bounds<0, 1L>{};
+[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::static_bounds<0, 1L>{};
 
 int main(int, char**)
 {

From 0bb5139d04dd95d089f23ae61dc44b237eb22589 Mon Sep 17 00:00:00 2001
From: Piotr Ciolkosz <pciolkosz@nvidia.com>
Date: Thu, 4 Jun 2026 14:04:59 -0700
Subject: [PATCH 3/6] Tighten bounds validation

---
 libcudacxx/include/cuda/__argument/argument.h | 29 +++++++++++++------
 .../cuda/argument/argument_bounds.pass.cpp    |  8 +++++
 .../static_argument_bounds_type.fail.cpp      | 20 +++++++++++++
 3 files changed, 48 insertions(+), 9 deletions(-)
 create mode 100644 libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp

diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h
index f823b27f6b1..db40be13993 100644
--- a/libcudacxx/include/cuda/__argument/argument.h
+++ b/libcudacxx/include/cuda/__argument/argument.h
@@ -163,7 +163,10 @@ _CCCL_API constexpr bool __static_bound_in_range() noexcept
 }
 
 template <class _ElementType, class _StaticBounds>
-inline constexpr bool __valid_static_bounds_v = true;
+inline constexpr bool __valid_static_bounds_v = false;
+
+template <class _ElementType>
+inline constexpr bool __valid_static_bounds_v<_ElementType, no_bounds> = true;
 
 template <class _ElementType, auto _Lowest, auto _Highest>
 inline constexpr bool __valid_static_bounds_v<_ElementType, static_bounds<_Lowest, _Highest>> =
@@ -220,7 +223,8 @@ template <class _ElementType, class _StaticBounds>
 _CCCL_API constexpr void __validate_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept
 {
   static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
   _CCCL_VERIFY((__has_bounds_intersection<_ElementType, _StaticBounds>(__runtime_bounds)),
                "static and runtime argument bounds do not intersect");
 }
@@ -258,7 +262,8 @@ struct immediate
   using __element_type = __element_type_of_t<_Arg>;
 
   static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
 
   _Arg __arg_;
 
@@ -305,7 +310,8 @@ struct immediate_sequence
 
   static_assert(__is_sequence_v<_Arg>, "immediate sequence arguments must have a distinct element type");
   static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
 
   _Arg __arg_;
   runtime_bounds<__element_type> __runtime_bounds_{};
@@ -403,7 +409,8 @@ struct __deferred_base
   using __element_type = __element_type_of_t<_Arg>;
 
   static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
 
   _Arg __arg_;
   runtime_bounds<__element_type> __runtime_bounds_{};
@@ -689,7 +696,8 @@ struct __traits_impl<immediate<_Arg, _StaticBounds>>
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
 
   static constexpr bool is_constant     = false;
   static constexpr bool is_deferred     = false;
@@ -718,7 +726,8 @@ struct __traits_impl<immediate_sequence<_Arg, _StaticBounds>>
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__is_sequence_v<value_type>, "immediate sequence arguments must have a distinct element type");
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
 
   static constexpr bool is_constant     = false;
   static constexpr bool is_deferred     = false;
@@ -733,7 +742,8 @@ struct __traits_impl<deferred<_Arg, _StaticBounds>>
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
 
   static constexpr bool is_constant     = false;
   static constexpr bool is_deferred     = true;
@@ -748,7 +758,8 @@ struct __traits_impl<deferred_sequence<_Arg, _StaticBounds>>
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
 
   static constexpr bool is_constant     = false;
   static constexpr bool is_deferred     = true;
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
index 4275c4aca9c..6be3c0d925b 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
@@ -97,6 +97,14 @@ TEST_FUNC constexpr bool test()
     static_assert(!cuda::argument::__is_bounds_v<int>);
   }
 
+  // Bounds types accepted by argument wrapper template parameters
+  {
+    static_assert(cuda::argument::__valid_static_bounds_v<int, cuda::argument::no_bounds>);
+    static_assert(cuda::argument::__valid_static_bounds_v<int, cuda::argument::static_bounds<1, 8>>);
+    static_assert(!cuda::argument::__valid_static_bounds_v<int, cuda::argument::runtime_bounds<int>>);
+    static_assert(!cuda::argument::__valid_static_bounds_v<int, int>);
+  }
+
   return true;
 }
 
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp
new file mode 100644
index 00000000000..66a1828769b
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/argument>
+
+using arg_t = cuda::argument::immediate<int, cuda::argument::runtime_bounds<int>>;
+
+[[maybe_unused]] arg_t invalid_arg{0};
+
+int main(int, char**)
+{
+  return 0;
+}

From 9e66badb1155363548b07a19fd0f095e78997b0b Mon Sep 17 00:00:00 2001
From: Piotr Ciolkosz <pciolkosz@nvidia.com>
Date: Thu, 4 Jun 2026 14:12:53 -0700
Subject: [PATCH 4/6] Asssert deferred_sequence type is a sequence

---
 libcudacxx/include/cuda/__argument/argument.h |  3 +++
 .../deferred_sequence_scalar.fail.cpp         | 18 +++++++++++++++++
 .../deferred_sequence_scalar_traits.fail.cpp  | 20 +++++++++++++++++++
 3 files changed, 41 insertions(+)
 create mode 100644 libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp

diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h
index db40be13993..81df64ab616 100644
--- a/libcudacxx/include/cuda/__argument/argument.h
+++ b/libcudacxx/include/cuda/__argument/argument.h
@@ -483,6 +483,8 @@ _CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Hi
 template <class _Arg, class _StaticBounds = no_bounds>
 struct deferred_sequence : __deferred_base<_Arg, _StaticBounds>
 {
+  static_assert(__is_sequence_v<_Arg>, "deferred sequence arguments must have a distinct element type");
+
   using __deferred_base<_Arg, _StaticBounds>::__deferred_base;
 };
 
@@ -757,6 +759,7 @@ struct __traits_impl<deferred_sequence<_Arg, _StaticBounds>>
 {
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
+  static_assert(__is_sequence_v<value_type>, "deferred sequence arguments must have a distinct element type");
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
                 "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
                 "values representable by the element type");
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp
new file mode 100644
index 00000000000..1ce371b84e5
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp
@@ -0,0 +1,18 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/argument>
+
+[[maybe_unused]] cuda::argument::deferred_sequence<int> invalid_arg{0};
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp
new file mode 100644
index 00000000000..41a9a2ae778
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/argument>
+
+using traits = cuda::argument::__traits<cuda::argument::deferred_sequence<int>>;
+
+[[maybe_unused]] constexpr bool invalid_traits = traits::is_deferred;
+
+int main(int, char**)
+{
+  return 0;
+}

From f4cb111bea422cae228250d27cc3aa4f72546a55 Mon Sep 17 00:00:00 2001
From: Piotr Ciolkosz <pciolkosz@nvidia.com>
Date: Fri, 5 Jun 2026 15:20:02 -0700
Subject: [PATCH 5/6] Feedback from review meeting

---
 .../bench/segmented_topk/fixed/keys.cu        |  10 +-
 .../bench/segmented_topk/variable/indexed.cu  |  14 +-
 .../bench/segmented_topk/variable/keys.cu     |  12 +-
 cub/cub/agent/agent_batched_topk.cuh          |  10 +-
 cub/cub/detail/segmented_params.cuh           |  34 +-
 .../device/dispatch/dispatch_batched_topk.cuh |  24 +-
 .../dispatch/kernels/kernel_batched_topk.cuh  |   8 +-
 .../catch2_test_device_segmented_topk_keys.cu |  32 +-
 ...catch2_test_device_segmented_topk_pairs.cu |  22 +-
 libcudacxx/include/cuda/__argument/argument.h | 317 ++++++++++++------
 .../include/cuda/__argument/argument_bounds.h | 132 +-------
 .../include/cuda/std/__internal/namespaces.h  |   4 +-
 .../cuda/argument/argument_bounds.pass.cpp    |  52 ++-
 .../cuda/argument/argument_traits.pass.cpp    | 159 +++++----
 .../cuda/argument/deferred_argument.pass.cpp  | 118 +++----
 .../deferred_sequence_scalar.fail.cpp         |   2 +-
 .../deferred_sequence_scalar_traits.fail.cpp  |   2 +-
 .../cuda/argument/dynamic_argument.pass.cpp   | 123 ++++---
 .../cuda/argument/static_argument.pass.cpp    |  85 +++--
 .../static_argument_bounds_type.fail.cpp      |   2 +-
 .../static_bounds_conversion.fail.cpp         |   4 +-
 .../static_bounds_type_mismatch.fail.cpp      |   2 +-
 .../cuda/argument/usage_example.pass.cpp      |  71 ++--
 23 files changed, 622 insertions(+), 617 deletions(-)

diff --git a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu
index 4178e7ea0d5..77bd997b742 100644
--- a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu
+++ b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu
@@ -51,7 +51,7 @@ void fixed_seg_size_topk_keys(
   const auto selected_elements = static_cast<::cuda::std::ptrdiff_t>(MaxNumSelected);
   const auto num_segments      = ::cuda::std::max<std::size_t>(1, (max_elements / segment_size));
   const auto elements          = num_segments * segment_size;
-  const auto total_num_items   = ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(elements)};
+  const auto total_num_items   = ::cuda::args::immediate{static_cast<::cuda::std::int64_t>(elements)};
   const bit_entropy entropy    = str_to_entropy(state.get_string("Entropy"));
 
   // Skip workloads where k exceeds the segment size
@@ -68,9 +68,9 @@ void fixed_seg_size_topk_keys(
   auto d_keys_in      = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size);
   auto d_keys_out     = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_out_ptr), selected_elements);
 
-  auto segment_sizes    = ::cuda::argument::constant<MaxSegmentSize>{};
-  auto k                = ::cuda::argument::constant<MaxNumSelected>{};
-  auto select_direction = ::cuda::argument::constant<cub::detail::topk::select::max>{};
+  auto segment_sizes    = ::cuda::args::constant<MaxSegmentSize>{};
+  auto k                = ::cuda::args::constant<MaxNumSelected>{};
+  auto select_direction = ::cuda::args::constant<cub::detail::topk::select::max>{};
 
   state.add_element_count(elements, "NumElements");
   state.add_element_count(segment_size, "SegmentSize");
@@ -99,7 +99,7 @@ void fixed_seg_size_topk_keys(
       segment_sizes,
       k,
       select_direction,
-      ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(num_segments)},
+      ::cuda::args::immediate{static_cast<::cuda::std::int64_t>(num_segments)},
       total_num_items,
       env);
   });
diff --git a/cub/benchmarks/bench/segmented_topk/variable/indexed.cu b/cub/benchmarks/bench/segmented_topk/variable/indexed.cu
index f59b250e247..488d3aa4439 100644
--- a/cub/benchmarks/bench/segmented_topk/variable/indexed.cu
+++ b/cub/benchmarks/bench/segmented_topk/variable/indexed.cu
@@ -7,7 +7,7 @@
 #include <thrust/device_vector.h>
 #include <thrust/reduce.h>
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/iterator>
 #include <cuda/std/cstdint>
 
@@ -36,18 +36,18 @@ void decode_style_variable_topk_indexed(
     static_cast<cuda::std::int64_t>(MaxSegmentSize));
   const auto input_elements  = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end());
   const auto output_elements = static_cast<std::size_t>(num_segments) * K;
-  const auto total_num_items = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(input_elements)};
+  const auto total_num_items = ::cuda::args::immediate{static_cast<cuda::std::int64_t>(input_elements)};
 
   auto in_keys_buffer = gen_data<MaxSegmentSize, K>(
     num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data()));
   auto out_keys_buffer    = thrust::device_vector<KeyT>(output_elements, thrust::no_init);
   auto out_indices_buffer = thrust::device_vector<IndexT>(output_elements, thrust::no_init);
 
-  auto segment_sizes_param = ::cuda::__argument::__immediate_sequence{
-    thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::__argument::__bounds<1, MaxSegmentSize>()};
-  auto k_param            = ::cuda::__argument::__constant<K>{};
-  auto select_direction   = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
-  auto num_segments_param = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(num_segments)};
+  auto segment_sizes_param = ::cuda::args::__immediate_sequence{
+    thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::args::bounds<1, MaxSegmentSize>()};
+  auto k_param            = ::cuda::args::constant<K>{};
+  auto select_direction   = ::cuda::args::constant<cub::detail::topk::select::max>{};
+  auto num_segments_param = ::cuda::args::immediate{static_cast<cuda::std::int64_t>(num_segments)};
 
   auto d_keys_in = cuda::make_strided_iterator(
     cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())),
diff --git a/cub/benchmarks/bench/segmented_topk/variable/keys.cu b/cub/benchmarks/bench/segmented_topk/variable/keys.cu
index 0febcf507bd..5a54ad1fbaa 100644
--- a/cub/benchmarks/bench/segmented_topk/variable/keys.cu
+++ b/cub/benchmarks/bench/segmented_topk/variable/keys.cu
@@ -32,17 +32,17 @@ void decode_style_variable_topk_keys(
     static_cast<cuda::std::int64_t>(MaxSegmentSize));
   const auto input_elements  = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end());
   const auto output_elements = static_cast<std::size_t>(num_segments) * K;
-  const auto total_num_items = ::cuda::argument::immediate{static_cast<cuda::std::int64_t>(input_elements)};
+  const auto total_num_items = ::cuda::args::immediate{static_cast<cuda::std::int64_t>(input_elements)};
 
   auto in_keys_buffer = gen_data<MaxSegmentSize, K>(
     num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data()));
   auto out_keys_buffer = thrust::device_vector<KeyT>(output_elements, thrust::no_init);
 
-  auto segment_sizes_param = ::cuda::argument::immediate_sequence{
-    thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::argument::bounds<1, MaxSegmentSize>()};
-  auto k_param            = ::cuda::argument::constant<K>{};
-  auto select_direction   = ::cuda::argument::constant<cub::detail::topk::select::max>{};
-  auto num_segments_param = ::cuda::argument::immediate{static_cast<cuda::std::int64_t>(num_segments)};
+  auto segment_sizes_param = ::cuda::args::__immediate_sequence{
+    thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::args::bounds<1, MaxSegmentSize>()};
+  auto k_param            = ::cuda::args::constant<K>{};
+  auto select_direction   = ::cuda::args::constant<cub::detail::topk::select::max>{};
+  auto num_segments_param = ::cuda::args::immediate{static_cast<cuda::std::int64_t>(num_segments)};
 
   auto d_keys_in = cuda::make_strided_iterator(
     cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())),
diff --git a/cub/cub/agent/agent_batched_topk.cuh b/cub/cub/agent/agent_batched_topk.cuh
index bc10311d36d..d2a99cfc809 100644
--- a/cub/cub/agent/agent_batched_topk.cuh
+++ b/cub/cub/agent/agent_batched_topk.cuh
@@ -73,8 +73,8 @@ struct agent_batched_topk_worker_per_segment
   using key_t   = it_value_t<key_it_t>;
   using value_t = it_value_t<value_it_t>;
 
-  using segment_size_val_t = typename ::cuda::argument::__traits<SegmentSizeParameterT>::element_type;
-  using num_segments_val_t = typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type;
+  using segment_size_val_t = typename ::cuda::args::__traits<SegmentSizeParameterT>::element_type;
+  using num_segments_val_t = typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type;
   using counters_t         = batched_topk_counters<num_segments_val_t>;
 
   static constexpr auto policy                 = PolicyGetter{}();
@@ -95,7 +95,7 @@ struct agent_batched_topk_worker_per_segment
     multi_worker_per_segment_policy.threads_per_block * multi_worker_per_segment_policy.items_per_thread;
 
   // Check if there could be large segments present
-  static constexpr bool only_small_segments = ::cuda::argument::__traits<SegmentSizeParameterT>::highest <= tile_size;
+  static constexpr bool only_small_segments = ::cuda::args::__traits<SegmentSizeParameterT>::highest <= tile_size;
 
   // Check if we are dealing with keys-only or key-value pairs
   static constexpr bool is_keys_only = ::cuda::std::is_same_v<value_t, cub::NullType>;
@@ -196,8 +196,8 @@ struct agent_batched_topk_worker_per_segment
       return;
     }
 
-    constexpr bool is_full_tile = ::cuda::argument::__traits<SegmentSizeParameterT>::is_constant
-                               && ::cuda::argument::__traits<SegmentSizeParameterT>::lowest == tile_size;
+    constexpr bool is_full_tile = ::cuda::args::__traits<SegmentSizeParameterT>::is_constant
+                               && ::cuda::args::__traits<SegmentSizeParameterT>::lowest == tile_size;
 
     // Resolve Segment Parameters
     const auto segment_size = params::get_param(segment_sizes, segment_id);
diff --git a/cub/cub/detail/segmented_params.cuh b/cub/cub/detail/segmented_params.cuh
index 1c4e6859d3e..41ba334eaef 100644
--- a/cub/cub/detail/segmented_params.cuh
+++ b/cub/cub/detail/segmented_params.cuh
@@ -33,10 +33,10 @@ namespace detail::params
 //! @param[in] __index Segment index to read for sequence arguments.
 //! @return The single argument value, or the sequence element at the given index.
 _CCCL_TEMPLATE(class _Tp, class _SegmentIndexT)
-_CCCL_REQUIRES((!::cuda::argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
+_CCCL_REQUIRES((!::cuda::args::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(_Tp&& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  if constexpr (::cuda::argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
+  if constexpr (::cuda::args::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
   {
     return __arg;
   }
@@ -46,46 +46,46 @@ _CCCL_REQUIRES((!::cuda::argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_T
   }
 }
 
-template <auto _Value, class _SegmentIndexT>
+template <auto _Value, class _Tp, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::argument::constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+get_param(const ::cuda::args::constant<_Value, _Tp>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::argument::__unwrap(__arg);
+  return ::cuda::args::__unwrap(__arg);
 }
 
 template <auto _Value, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::argument::constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::args::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::argument::__unwrap(__arg)[__index];
+  return ::cuda::args::__unwrap(__arg)[__index];
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
-[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
-  const ::cuda::argument::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
+get_param(const ::cuda::args::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::argument::__unwrap(__arg);
+  return ::cuda::args::__unwrap(__arg);
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::argument::immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::args::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::argument::__unwrap(__arg)[__index];
+  return ::cuda::args::__unwrap(__arg)[__index];
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
-[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
-  const ::cuda::argument::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
+get_param(const ::cuda::args::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::argument::__unwrap(__arg);
+  return ::cuda::args::__unwrap(__arg);
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::argument::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::args::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::argument::__unwrap(__arg)[__index];
+  return ::cuda::args::__unwrap(__arg)[__index];
 }
 
 // =====================================================================
diff --git a/cub/cub/device/dispatch/dispatch_batched_topk.cuh b/cub/cub/device/dispatch/dispatch_batched_topk.cuh
index 29d245c175e..d3c0c651991 100644
--- a/cub/cub/device/dispatch/dispatch_batched_topk.cuh
+++ b/cub/cub/device/dispatch/dispatch_batched_topk.cuh
@@ -50,16 +50,16 @@ namespace detail::batched_topk
 // Internal: wrap the compile-time select direction into a discrete param for dispatch
 // -----------------------------------------------------------------------------
 
-// The selection direction is compile-time only: callers pass `::cuda::argument::constant<Dir>`, which maps to a
+// The selection direction is compile-time only: callers pass `::cuda::args::constant<Dir>`, which maps to a
 // value-less static_discrete_param. Because the direction is fixed at compile time and carries no runtime value, it
 // can never disagree with its only supported option, so dispatch can never silently degrade to a no-op.
-template <detail::topk::select Dir>
-[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::argument::constant<Dir>)
+template <detail::topk::select Dir, class _Tp>
+[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::args::constant<Dir, _Tp>)
 {
   return params::static_discrete_param<detail::topk::select, Dir>{};
 }
 
-// The selection direction is intentionally a compile-time constant: only `::cuda::argument::constant<Dir>` is
+// The selection direction is intentionally a compile-time constant: only `::cuda::args::constant<Dir>` is
 // accepted (the overload above maps it to a value-less static_discrete_param). This catch-all documents that
 // deliberate limitation and rejects anything else (e.g. a runtime `detail::topk::select` or a per-segment iterator of
 // directions) with a clear diagnostic. It is an intent/documentation guard rather than a user-facing one: callers
@@ -71,7 +71,7 @@ template <typename SelectDirectionT>
   static_assert(::cuda::std::__always_false_v<SelectDirectionT>,
                 "DeviceBatchedTopK currently supports only compile-time selection directions: the min/max entry "
                 "points (DeviceBatchedTopK::{Max,Min}{Keys,Pairs}) dispatch with a "
-                "::cuda::argument::constant<Dir>; runtime or per-segment directions are "
+                "::cuda::args::constant<Dir>; runtime or per-segment directions are "
                 "intentionally not supported");
   // Unreachable (the static_assert above always fires); keeps the return type well-formed so the only diagnostic is
   // the message above.
@@ -131,7 +131,7 @@ template <typename KeyInputItItT,
           typename PolicySelector = policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
                                                                it_value_t<it_value_t<ValueInputItItT>>,
                                                                ::cuda::std::int64_t,
-                                                               ::cuda::argument::__traits<KParameterT>::highest>>
+                                                               ::cuda::args::__traits<KParameterT>::highest>>
 #if _CCCL_HAS_CONCEPTS()
   requires batched_topk_policy_selector<PolicySelector>
 #endif // _CCCL_HAS_CONCEPTS()
@@ -150,7 +150,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   cudaStream_t stream                             = nullptr,
   [[maybe_unused]] PolicySelector policy_selector = {})
 {
-  using large_segment_tile_offset_t = typename ::cuda::argument::__traits<TotalNumItemsGuaranteeT>::element_type;
+  using large_segment_tile_offset_t = typename ::cuda::args::__traits<TotalNumItemsGuaranteeT>::element_type;
 
   // Wrap the raw enum into the internal discrete param type
   auto select_directions          = wrap_select_direction(select_direction);
@@ -176,9 +176,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   static constexpr int worker_per_segment_tile_size =
     worker_per_segment_policy.threads_per_block * worker_per_segment_policy.items_per_thread;
   static constexpr bool any_small_segments =
-    ::cuda::argument::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
+    ::cuda::args::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
   static constexpr bool only_small_segments =
-    ::cuda::argument::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;
+    ::cuda::args::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;
 
   // Allocation layout:
   //   only_small_segments: [0] dummy.
@@ -188,7 +188,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   static constexpr int allocations_array_size     = only_small_segments ? 1 : (any_small_segments ? 3 : 2);
   size_t allocation_sizes[allocations_array_size] = {1};
 
-  using num_segments_val_t         = typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type;
+  using num_segments_val_t         = typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type;
   using counters_t                 = batched_topk_counters<num_segments_val_t>;
   using segment_size_scan_offset_t = detail::choose_offset_t<num_segments_val_t>;
   using segment_size_scan_input_op_t =
@@ -244,7 +244,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
 
   // TODO (elstehle): support number of segments provided by device-accessible iterator
   // Only uniform number of segments are supported (i.e., we need to resolve the number of segments on the host)
-  static_assert(::cuda::argument::__traits<NumSegmentsParameterT>::is_single_value,
+  static_assert(::cuda::args::__traits<NumSegmentsParameterT>::is_single_value,
                 "Only uniform segment sizes are currently supported.");
 
   if constexpr (any_small_segments)
@@ -346,7 +346,7 @@ template <typename KeyInputItItT,
     policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
                                it_value_t<it_value_t<ValueInputItItT>>,
                                ::cuda::std::int64_t,
-                               ::cuda::argument::__traits<KParameterT>::highest>;
+                               ::cuda::args::__traits<KParameterT>::highest>;
   return detail::dispatch_with_env_and_tuning<default_policy_selector>(
     env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) {
       return dispatch(
diff --git a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh
index 9c904f95971..3412f40359a 100644
--- a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh
@@ -39,7 +39,7 @@ private:
     worker_policy worker_per_segment_policy;
     multi_worker_policy multi_worker_per_segment_policy;
   };
-  static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::argument::__traits<SegmentSizeParameterT>::highest;
+  static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::args::__traits<SegmentSizeParameterT>::highest;
   static constexpr batched_topk_policy active_policy     = current_policy<PolicySelector>();
 
   template <int Index>
@@ -133,8 +133,8 @@ __launch_bounds__(int(
     KParameterT k,
     SelectDirectionParameterT select_directions,
     NumSegmentsParameterT num_segments,
-    batched_topk_counters<typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type>* d_counters,
-    typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type* d_large_segments_ids,
+    batched_topk_counters<typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type>* d_counters,
+    typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type* d_large_segments_ids,
     LargeSegmentTileOffsetT* d_large_segments_tile_offsets)
 {
   using agent_t = typename find_smallest_covering_policy<
@@ -151,7 +151,7 @@ __launch_bounds__(int(
     LargeSegmentTileOffsetT>::agent_t;
 
   // Static Assertions (Constraints)
-  static_assert(agent_t::tile_size >= ::cuda::argument::__traits<SegmentSizeParameterT>::highest,
+  static_assert(agent_t::tile_size >= ::cuda::args::__traits<SegmentSizeParameterT>::highest,
                 "Block size exceeds maximum segment size supported by SegmentSizeParameterT");
   static_assert(sizeof(typename agent_t::TempStorage) <= max_smem_per_block,
                 "Static shared memory per block must not exceed 48KB limit.");
diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu
index 16f7d4322ca..7e9ad93ca9e 100644
--- a/cub/test/catch2_test_device_segmented_topk_keys.cu
+++ b/cub/test/catch2_test_device_segmented_topk_keys.cu
@@ -156,11 +156,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments",
   batched_topk_keys(
     d_keys_in,
     d_keys_out,
-    ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds<segment_size_t{1}, max_segment_size>()},
-    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
-    ::cuda::argument::constant<direction>{},
-    ::cuda::argument::immediate{num_segments},
-    ::cuda::argument::immediate{num_segments * segment_size});
+    ::cuda::args::immediate{segment_size, ::cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
+    ::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::args::constant<direction>{},
+    ::cuda::args::immediate{num_segments},
+    ::cuda::args::immediate{num_segments * segment_size});
   // Prepare expected results
   fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction);
   compact_sorted_keys_to_topk(expected_keys, segment_size, k);
@@ -254,12 +254,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment
   batched_topk_keys(
     d_keys_in,
     d_keys_out,
-    ::cuda::argument::immediate_sequence{
-      segment_size_it, ::cuda::argument::bounds<segment_size_t{1}, static_max_segment_size>()},
-    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
-    ::cuda::argument::constant<direction>{},
-    ::cuda::argument::immediate{num_segments},
-    ::cuda::argument::immediate{num_items});
+    ::cuda::args::__immediate_sequence{
+      segment_size_it, ::cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
+    ::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::args::constant<direction>{},
+    ::cuda::args::immediate{num_segments},
+    ::cuda::args::immediate{num_items});
 
   // Verify keys are returned correctly: sort each segment of the expected input, then compact the top-k
   segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction);
@@ -292,11 +292,11 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment
   batched_topk_keys(
     d_keys_in_it,
     d_keys_out_it,
-    ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds<cuda::std::int64_t{1}, max_segment_size>()},
-    ::cuda::argument::immediate{k, ::cuda::argument::bounds<cuda::std::int64_t{1}, k>()},
-    ::cuda::argument::constant<cub::detail::topk::select::min>{},
-    ::cuda::argument::immediate{num_segments},
-    ::cuda::argument::immediate{num_segments * segment_size});
+    ::cuda::args::immediate{segment_size, ::cuda::args::bounds<cuda::std::int64_t{1}, max_segment_size>()},
+    ::cuda::args::immediate{k, ::cuda::args::bounds<cuda::std::int64_t{1}, k>()},
+    ::cuda::args::constant<cub::detail::topk::select::min>{},
+    ::cuda::args::immediate{num_segments},
+    ::cuda::args::immediate{num_segments * segment_size});
 
   const int num_minus_zero = static_cast<int>(thrust::count_if(d_keys_out.begin(), d_keys_out.end(), is_minus_zero{}));
   REQUIRE(num_minus_zero >= 1);
diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu
index 49b2e48a1b6..cef26687cbd 100644
--- a/cub/test/catch2_test_device_segmented_topk_pairs.cu
+++ b/cub/test/catch2_test_device_segmented_topk_pairs.cu
@@ -225,11 +225,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small fixed-size segments"
     d_keys_out,
     d_values_in,
     d_values_out,
-    ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds<segment_size_t{1}, max_segment_size>()},
-    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
-    ::cuda::argument::constant<direction>{},
-    ::cuda::argument::immediate{num_segments},
-    ::cuda::argument::immediate{num_segments * segment_size});
+    ::cuda::args::immediate{segment_size, ::cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
+    ::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::args::constant<direction>{},
+    ::cuda::args::immediate{num_segments},
+    ::cuda::args::immediate{num_segments * segment_size});
 
   // Verification:
   // - We verify correct top-k selection through the keys
@@ -346,12 +346,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen
     d_keys_out,
     d_values_in,
     d_values_out,
-    ::cuda::argument::immediate_sequence{
-      segment_size_it, ::cuda::argument::bounds<segment_size_t{1}, static_max_segment_size>()},
-    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
-    ::cuda::argument::constant<direction>{},
-    ::cuda::argument::immediate{num_segments},
-    ::cuda::argument::immediate{num_items});
+    ::cuda::args::__immediate_sequence{
+      segment_size_it, ::cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
+    ::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::args::constant<direction>{},
+    ::cuda::args::immediate{num_segments},
+    ::cuda::args::immediate{num_items});
 
   // Verification:
   // - We verify correct top-k selection through the keys
diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h
index 81df64ab616..0877a8df6d1 100644
--- a/libcudacxx/include/cuda/__argument/argument.h
+++ b/libcudacxx/include/cuda/__argument/argument.h
@@ -47,6 +47,8 @@
 
 _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT
 
+struct __access;
+
 // =====================================================================
 // __element_type_of
 // =====================================================================
@@ -95,31 +97,28 @@ inline constexpr bool __is_sequence_v =
 // spelling carries that intent.
 
 //! @brief Wraps a compile-time constant argument value.
-template <auto _Value>
-struct constant
+template <auto _Value, class _Tp = decltype(_Value)>
+class constant
 {
-  using value_type     = ::cuda::std::remove_cvref_t<decltype(_Value)>;
+public:
+  using value_type     = ::cuda::std::remove_cvref_t<_Tp>;
   using __element_type = value_type;
 
-  [[nodiscard]] _CCCL_API static constexpr value_type value() noexcept
+  [[nodiscard]] _CCCL_API static constexpr value_type __get_value() noexcept
   {
-    return _Value;
+    return static_cast<value_type>(_Value);
   }
 };
 
 //! @brief Wraps a compile-time constant argument sequence.
 template <auto _Value>
-struct constant_sequence
+class __constant_sequence
 {
+public:
   using value_type     = ::cuda::std::remove_cvref_t<decltype(_Value)>;
   using __element_type = __element_type_of_t<value_type>;
 
   static_assert(__is_sequence_v<value_type>, "The value type of __constant_sequence must be a sequence");
-
-  [[nodiscard]] _CCCL_API static constexpr value_type value() noexcept
-  {
-    return _Value;
-  }
 };
 
 // __assert_in_range
@@ -223,7 +222,7 @@ template <class _ElementType, class _StaticBounds>
 _CCCL_API constexpr void __validate_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept
 {
   static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
   _CCCL_VERIFY((__has_bounds_intersection<_ElementType, _StaticBounds>(__runtime_bounds)),
                "static and runtime argument bounds do not intersect");
@@ -257,17 +256,20 @@ _CCCL_API constexpr void __validate_runtime_element_bounds(
 //!
 //! The value is host-accessible at API call time.
 template <class _Arg, class _StaticBounds = no_bounds>
-struct immediate
+class immediate
 {
+public:
   using __element_type = __element_type_of_t<_Arg>;
 
   static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
 
+private:
+  friend struct __access;
+
   _Arg __arg_;
 
-private:
   _CCCL_API constexpr void __validate_value() const noexcept
   {
     if constexpr (::cuda::std::is_same_v<::cuda::std::remove_cvref_t<_Arg>, __element_type>
@@ -299,24 +301,27 @@ _CCCL_HOST_DEVICE immediate(_Arg, static_bounds<_Lowest, _Highest>)
 #endif // _CCCL_DOXYGEN_INVOKED
 
 // =====================================================================
-// immediate_sequence
+// __immediate_sequence
 // =====================================================================
 
 //! @brief Wraps a runtime argument sequence with optional bounds.
 template <class _Arg, class _StaticBounds = no_bounds>
-struct immediate_sequence
+class __immediate_sequence
 {
+public:
   using __element_type = __element_type_of_t<_Arg>;
 
   static_assert(__is_sequence_v<_Arg>, "immediate sequence arguments must have a distinct element type");
   static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
 
+private:
+  friend struct __access;
+
   _Arg __arg_;
   runtime_bounds<__element_type> __runtime_bounds_{};
 
-private:
   _CCCL_API constexpr void __validate_bounds() const noexcept
   {
     __validate_bounds_intersection<__element_type, _StaticBounds>(__runtime_bounds_);
@@ -344,14 +349,14 @@ struct immediate_sequence
   }
 
 public:
-  _CCCL_API constexpr immediate_sequence(_Arg __arg) noexcept
+  _CCCL_API constexpr __immediate_sequence(_Arg __arg) noexcept
       : __arg_{::cuda::std::move(__arg)}
   {
     __validate_bounds();
     __validate_value();
   }
 
-  _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds) noexcept
+  _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds) noexcept
       : __arg_{::cuda::std::move(__arg)}
   {
     __validate_bounds();
@@ -359,7 +364,7 @@ struct immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr __immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -369,7 +374,7 @@ struct immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -379,23 +384,23 @@ struct immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
-      : immediate_sequence(::cuda::std::move(__arg), __sb, __rb)
+  _CCCL_API constexpr __immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
+      : __immediate_sequence(::cuda::std::move(__arg), __sb, __rb)
   {}
 };
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>)
-  -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE __immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>)
+  -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, auto _Lowest, auto _Highest, class _Tp>
-_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>)
-  -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE __immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>)
+  -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>)
-  -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE __immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>)
+  -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 // =====================================================================
@@ -404,17 +409,22 @@ _CCCL_HOST_DEVICE immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_L
 
 //! @brief Common base for deferred argument wrappers.
 template <class _Arg, class _StaticBounds = no_bounds>
-struct __deferred_base
+class __deferred_base
 {
+public:
   using __element_type = __element_type_of_t<_Arg>;
 
   static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
 
+private:
+  friend struct __access;
+
   _Arg __arg_;
   runtime_bounds<__element_type> __runtime_bounds_{};
 
+public:
   _CCCL_API constexpr __deferred_base(_Arg __arg) noexcept
       : __arg_{::cuda::std::move(__arg)}
   {
@@ -454,8 +464,9 @@ struct __deferred_base
 //! @brief Wraps a reference to a single value that is potentially not available at API call time but will be available
 //! by the time the argument is consumed in stream order.
 template <class _Arg, class _StaticBounds = no_bounds>
-struct deferred : __deferred_base<_Arg, _StaticBounds>
+class deferred : public __deferred_base<_Arg, _StaticBounds>
 {
+public:
   using __deferred_base<_Arg, _StaticBounds>::__deferred_base;
 };
 
@@ -481,8 +492,9 @@ _CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Hi
 //! @brief Wraps a reference to a sequence of values that is potentially not available at API call time but will be
 //! available by the time the argument is consumed in stream order.
 template <class _Arg, class _StaticBounds = no_bounds>
-struct deferred_sequence : __deferred_base<_Arg, _StaticBounds>
+class deferred_sequence : public __deferred_base<_Arg, _StaticBounds>
 {
+public:
   static_assert(__is_sequence_v<_Arg>, "deferred sequence arguments must have a distinct element type");
 
   using __deferred_base<_Arg, _StaticBounds>::__deferred_base;
@@ -508,6 +520,97 @@ _CCCL_HOST_DEVICE deferred_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lo
   -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
+// =====================================================================
+// __access
+// =====================================================================
+
+struct __access
+{
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(immediate<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__arg_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr const _Arg& __arg(const immediate<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__arg_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(immediate<_Arg, _StaticBounds>&& __wrapper) noexcept
+  {
+    return ::cuda::std::move(__wrapper.__arg_);
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(__immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__arg_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr const _Arg&
+  __arg(const __immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__arg_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(__immediate_sequence<_Arg, _StaticBounds>&& __wrapper) noexcept
+  {
+    return ::cuda::std::move(__wrapper.__arg_);
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(__deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__arg_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr const _Arg&
+  __arg(const __deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__arg_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(__deferred_base<_Arg, _StaticBounds>&& __wrapper) noexcept
+  {
+    return ::cuda::std::move(__wrapper.__arg_);
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr runtime_bounds<__element_type_of_t<_Arg>>&
+  __runtime_bounds(__immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__runtime_bounds_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr const runtime_bounds<__element_type_of_t<_Arg>>&
+  __runtime_bounds(const __immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__runtime_bounds_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr runtime_bounds<__element_type_of_t<_Arg>>&
+  __runtime_bounds(__deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__runtime_bounds_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr const runtime_bounds<__element_type_of_t<_Arg>>&
+  __runtime_bounds(const __deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__runtime_bounds_;
+  }
+};
+
 // =====================================================================
 // __unwrap
 // =====================================================================
@@ -516,12 +619,12 @@ template <class _Tp>
 inline constexpr bool __is_wrapper_v = false;
 template <class _Arg, class _StaticBounds>
 inline constexpr bool __is_wrapper_v<immediate<_Arg, _StaticBounds>> = true;
+template <auto _Value, class _Tp>
+inline constexpr bool __is_wrapper_v<constant<_Value, _Tp>> = true;
 template <auto _Value>
-inline constexpr bool __is_wrapper_v<constant<_Value>> = true;
-template <auto _Value>
-inline constexpr bool __is_wrapper_v<constant_sequence<_Value>> = true;
+inline constexpr bool __is_wrapper_v<__constant_sequence<_Value>> = true;
 template <class _Arg, class _StaticBounds>
-inline constexpr bool __is_wrapper_v<immediate_sequence<_Arg, _StaticBounds>> = true;
+inline constexpr bool __is_wrapper_v<__immediate_sequence<_Arg, _StaticBounds>> = true;
 template <class _Arg, class _StaticBounds>
 inline constexpr bool __is_wrapper_v<deferred<_Arg, _StaticBounds>> = true;
 template <class _Arg, class _StaticBounds>
@@ -537,99 +640,99 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate<_Arg, _StaticBounds>&& __arg) noexcept
 {
-  return ::cuda::std::move(__arg.__arg_);
+  return __access::__arg(::cuda::std::move(__arg));
 }
 
-template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t<decltype(_Value)>
-__unwrap(const constant<_Value>&) noexcept
+template <auto _Value, class _Tp>
+[[nodiscard]] _CCCL_API constexpr typename constant<_Value, _Tp>::value_type
+__unwrap(const constant<_Value, _Tp>&) noexcept
 {
-  return _Value;
+  return constant<_Value, _Tp>::__get_value();
 }
 
 template <auto _Value>
 [[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t<decltype(_Value)>
-__unwrap(const constant_sequence<_Value>&) noexcept
+__unwrap(const __constant_sequence<_Value>&) noexcept
 {
   return _Value;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept
 {
-  return ::cuda::std::move(__arg.__arg_);
+  return __access::__arg(::cuda::std::move(__arg));
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred<_Arg, _StaticBounds>&& __arg) noexcept
 {
-  return ::cuda::std::move(__arg.__arg_);
+  return __access::__arg(::cuda::std::move(__arg));
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred_sequence<_Arg, _StaticBounds>&& __arg) noexcept
 {
-  return ::cuda::std::move(__arg.__arg_);
+  return __access::__arg(::cuda::std::move(__arg));
 }
 
-template <auto _Value>
+template <auto _Value, class _Tp>
 _CCCL_API constexpr auto __constant_compute_lowest() noexcept
 {
-  return _Value;
+  return constant<_Value, _Tp>::__get_value();
 }
 
-template <auto _Value>
+template <auto _Value, class _Tp>
 _CCCL_API constexpr auto __constant_compute_highest() noexcept
 {
-  return _Value;
+  return constant<_Value, _Tp>::__get_value();
 }
 
 template <auto _Value>
@@ -680,16 +783,16 @@ struct __traits_impl
   static constexpr element_type highest = (::cuda::std::numeric_limits<element_type>::max)();
 };
 
-template <auto _Value>
-struct __traits_impl<constant<_Value>>
+template <auto _Value, class _Tp>
+struct __traits_impl<constant<_Value, _Tp>>
 {
-  using value_type                      = ::cuda::std::remove_cvref_t<decltype(_Value)>;
+  using value_type                      = typename constant<_Value, _Tp>::value_type;
   using element_type                    = value_type;
   static constexpr bool is_constant     = true;
   static constexpr bool is_deferred     = false;
   static constexpr bool is_single_value = true;
-  static constexpr element_type lowest  = __constant_compute_lowest<_Value>();
-  static constexpr element_type highest = __constant_compute_highest<_Value>();
+  static constexpr element_type lowest  = __constant_compute_lowest<_Value, _Tp>();
+  static constexpr element_type highest = __constant_compute_highest<_Value, _Tp>();
 };
 
 template <class _Arg, class _StaticBounds>
@@ -698,7 +801,7 @@ struct __traits_impl<immediate<_Arg, _StaticBounds>>
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
 
   static constexpr bool is_constant     = false;
@@ -709,7 +812,7 @@ struct __traits_impl<immediate<_Arg, _StaticBounds>>
 };
 
 template <auto _Value>
-struct __traits_impl<constant_sequence<_Value>>
+struct __traits_impl<__constant_sequence<_Value>>
 {
   using value_type   = ::cuda::std::remove_cvref_t<decltype(_Value)>;
   using element_type = __element_type_of_t<value_type>;
@@ -722,13 +825,13 @@ struct __traits_impl<constant_sequence<_Value>>
 };
 
 template <class _Arg, class _StaticBounds>
-struct __traits_impl<immediate_sequence<_Arg, _StaticBounds>>
+struct __traits_impl<__immediate_sequence<_Arg, _StaticBounds>>
 {
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__is_sequence_v<value_type>, "immediate sequence arguments must have a distinct element type");
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
 
   static constexpr bool is_constant     = false;
@@ -744,7 +847,7 @@ struct __traits_impl<deferred<_Arg, _StaticBounds>>
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
 
   static constexpr bool is_constant     = false;
@@ -761,7 +864,7 @@ struct __traits_impl<deferred_sequence<_Arg, _StaticBounds>>
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__is_sequence_v<value_type>, "deferred sequence arguments must have a distinct element type");
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
 
   static constexpr bool is_constant     = false;
@@ -787,14 +890,14 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) )
   return ::cuda::std::numeric_limits<__element_type_of_t<_Tp>>::lowest();
 }
 
-template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant<_Value>) noexcept
+template <auto _Value, class _Tp>
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant<_Value, _Tp>) noexcept
 {
-  return __constant_compute_lowest<_Value>();
+  return __constant_compute_lowest<_Value, _Tp>();
 }
 
 template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant_sequence<_Value>) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(__constant_sequence<_Value>) noexcept
 {
   return __constant_sequence_compute_lowest<_Value>();
 }
@@ -802,31 +905,34 @@ template <auto _Value>
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate<_Arg, _StaticBounds> __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
-  using _ET = __element_type_of_t<_Arg>;
-  __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
-  return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_);
+  using _ET                    = __element_type_of_t<_Arg>;
+  const auto& __runtime_bounds = __access::__runtime_bounds(__arg);
+  __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds);
+  return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred<_Arg, _StaticBounds> __arg) noexcept
 {
-  using _ET = __element_type_of_t<_Arg>;
-  __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
-  return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_);
+  using _ET                    = __element_type_of_t<_Arg>;
+  const auto& __runtime_bounds = __access::__runtime_bounds(__arg);
+  __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds);
+  return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
-  using _ET = __element_type_of_t<_Arg>;
-  __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
-  return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_);
+  using _ET                    = __element_type_of_t<_Arg>;
+  const auto& __runtime_bounds = __access::__runtime_bounds(__arg);
+  __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds);
+  return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds);
 }
 
 //! @brief Returns the effective highest bound, combining static and runtime bounds.
@@ -837,14 +943,14 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) )
   return (::cuda::std::numeric_limits<__element_type_of_t<_Tp>>::max)();
 }
 
-template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(constant<_Value>) noexcept
+template <auto _Value, class _Tp>
+[[nodiscard]] _CCCL_API constexpr auto __highest_(constant<_Value, _Tp>) noexcept
 {
-  return __constant_compute_highest<_Value>();
+  return __constant_compute_highest<_Value, _Tp>();
 }
 
 template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(constant_sequence<_Value>) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(__constant_sequence<_Value>) noexcept
 {
   return __constant_sequence_compute_highest<_Value>();
 }
@@ -852,31 +958,34 @@ template <auto _Value>
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr auto __highest_(immediate<_Arg, _StaticBounds> __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
-  using _ET = __element_type_of_t<_Arg>;
-  __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
-  return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_);
+  using _ET                    = __element_type_of_t<_Arg>;
+  const auto& __runtime_bounds = __access::__runtime_bounds(__arg);
+  __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds);
+  return __effective_highest<_ET, _StaticBounds>(__runtime_bounds);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr auto __highest_(deferred<_Arg, _StaticBounds> __arg) noexcept
 {
-  using _ET = __element_type_of_t<_Arg>;
-  __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
-  return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_);
+  using _ET                    = __element_type_of_t<_Arg>;
+  const auto& __runtime_bounds = __access::__runtime_bounds(__arg);
+  __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds);
+  return __effective_highest<_ET, _StaticBounds>(__runtime_bounds);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr auto __highest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
-  using _ET = __element_type_of_t<_Arg>;
-  __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
-  return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_);
+  using _ET                    = __element_type_of_t<_Arg>;
+  const auto& __runtime_bounds = __access::__runtime_bounds(__arg);
+  __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds);
+  return __effective_highest<_ET, _StaticBounds>(__runtime_bounds);
 }
 
 _CCCL_END_NAMESPACE_CUDA_ARGUMENT
diff --git a/libcudacxx/include/cuda/__argument/argument_bounds.h b/libcudacxx/include/cuda/__argument/argument_bounds.h
index 6d1943da32a..ce6d3a56743 100644
--- a/libcudacxx/include/cuda/__argument/argument_bounds.h
+++ b/libcudacxx/include/cuda/__argument/argument_bounds.h
@@ -22,11 +22,8 @@
 #endif // no system header
 
 #include <cuda/std/__cccl/assert.h>
-#include <cuda/std/__type_traits/is_extended_arithmetic.h>
 #include <cuda/std/__type_traits/is_same.h>
-#include <cuda/std/__type_traits/remove_cv.h>
 #include <cuda/std/__type_traits/remove_cvref.h>
-#include <cuda/std/__utility/cmp.h>
 #include <cuda/std/limits>
 
 #include <cuda/std/__cccl/prologue.h>
@@ -48,8 +45,9 @@ struct no_bounds
 //! @tparam _Lower The static lower bound.
 //! @tparam _Upper The static upper bound.
 template <auto _Lower, auto _Upper>
-struct static_bounds
+class static_bounds
 {
+public:
   static_assert(::cuda::std::is_same_v<decltype(_Lower), decltype(_Upper)>,
                 "Static bounds endpoints must have the same type");
   static_assert(_Lower <= _Upper, "Lower bound must be <= upper bound");
@@ -77,11 +75,12 @@ inline constexpr bool __is_static_bounds_v<static_bounds<_Lower, _Upper>> = true
 //!
 //! @tparam _Tp The value type of the bounds.
 template <class _Tp>
-struct runtime_bounds
+class runtime_bounds
 {
   _Tp __lower_ = ::cuda::std::numeric_limits<_Tp>::lowest();
   _Tp __upper_ = (::cuda::std::numeric_limits<_Tp>::max)();
 
+public:
   constexpr runtime_bounds() noexcept = default;
 
   _CCCL_API constexpr runtime_bounds(_Tp __lower, _Tp __upper) noexcept
@@ -145,129 +144,6 @@ inline constexpr bool __is_runtime_bounds_cv_v = __is_runtime_bounds_v<::cuda::s
 template <class _Tp>
 inline constexpr bool __is_bounds_v = __is_static_bounds_cv_v<_Tp> || __is_runtime_bounds_cv_v<_Tp>;
 
-// =====================================================================
-// __assert_in_range
-// =====================================================================
-
-template <class _To, class _From>
-_CCCL_API constexpr void __assert_in_range([[maybe_unused]] _From __val) noexcept
-{
-  if constexpr (::cuda::std::__cccl_is_integer_v<_To> && ::cuda::std::__cccl_is_integer_v<_From>)
-  {
-    _CCCL_ASSERT(::cuda::std::in_range<_To>(__val), "runtime bound value overflows the element type");
-  }
-}
-
-template <class _To, class _From>
-[[nodiscard]] _CCCL_API constexpr _To __runtime_bound_cast(_From __val) noexcept
-{
-  __assert_in_range<_To>(__val);
-  return static_cast<_To>(__val);
-}
-
-template <class _To, auto _Value>
-_CCCL_API constexpr bool __static_bound_in_range() noexcept
-{
-  using _From = decltype(_Value);
-
-  if constexpr (::cuda::std::__cccl_is_integer_v<_To> && ::cuda::std::__cccl_is_integer_v<_From>)
-  {
-    return ::cuda::std::in_range<_To>(_Value);
-  }
-  else if constexpr (::cuda::std::is_arithmetic_v<_To> && ::cuda::std::is_arithmetic_v<_From>)
-  {
-    return static_cast<_From>(static_cast<_To>(_Value)) == _Value;
-  }
-  else
-  {
-    return true;
-  }
-}
-
-template <class _ElementType, class _StaticBounds>
-inline constexpr bool __valid_static_bounds_v = true;
-
-template <class _ElementType, auto _Lowest, auto _Highest>
-inline constexpr bool __valid_static_bounds_v<_ElementType, __static_bounds<_Lowest, _Highest>> =
-  ::cuda::__argument::__static_bound_in_range<_ElementType, _Lowest>()
-  && ::cuda::__argument::__static_bound_in_range<_ElementType, _Highest>();
-
-template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr _ElementType __wrapper_static_lowest() noexcept
-{
-  if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>)
-  {
-    return ::cuda::std::numeric_limits<_ElementType>::lowest();
-  }
-  else
-  {
-    return static_cast<_ElementType>(_StaticBounds::lower());
-  }
-}
-
-template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr _ElementType __wrapper_static_highest() noexcept
-{
-  if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>)
-  {
-    return (::cuda::std::numeric_limits<_ElementType>::max)();
-  }
-  else
-  {
-    return static_cast<_ElementType>(_StaticBounds::upper());
-  }
-}
-
-template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr _ElementType __effective_lowest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
-{
-  auto __static_lowest = ::cuda::__argument::__wrapper_static_lowest<_ElementType, _StaticBounds>();
-  return __static_lowest > __runtime_bounds.lower() ? __static_lowest : __runtime_bounds.lower();
-}
-
-template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr _ElementType __effective_highest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
-{
-  auto __static_highest = ::cuda::__argument::__wrapper_static_highest<_ElementType, _StaticBounds>();
-  return __static_highest < __runtime_bounds.upper() ? __static_highest : __runtime_bounds.upper();
-}
-
-template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr bool __valid_argument_bounds(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
-{
-  return ::cuda::__argument::__effective_lowest<_ElementType, _StaticBounds>(__runtime_bounds)
-      <= ::cuda::__argument::__effective_highest<_ElementType, _StaticBounds>(__runtime_bounds);
-}
-
-template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr void __validate_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
-{
-  static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
-  _CCCL_VERIFY((::cuda::__argument::__valid_argument_bounds<_ElementType, _StaticBounds>(__runtime_bounds)),
-               "static and runtime argument bounds do not intersect");
-}
-
-template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr void __validate_static_element_bounds([[maybe_unused]] const _ElementType& __val) noexcept
-{
-  if constexpr (!::cuda::std::is_same_v<_StaticBounds, __no_bounds>)
-  {
-    _CCCL_ASSERT((__val >= ::cuda::__argument::__wrapper_static_lowest<_ElementType, _StaticBounds>()),
-                 "immediate argument value is below static lowest bound");
-    _CCCL_ASSERT((__val <= ::cuda::__argument::__wrapper_static_highest<_ElementType, _StaticBounds>()),
-                 "immediate argument value is above static highest bound");
-  }
-}
-
-template <class _ElementType>
-_CCCL_API constexpr void __validate_runtime_element_bounds(
-  [[maybe_unused]] const _ElementType& __val, [[maybe_unused]] __runtime_bounds<_ElementType> __runtime_bounds) noexcept
-{
-  _CCCL_ASSERT((__val >= __runtime_bounds.lower()), "immediate argument value is below runtime lower bound");
-  _CCCL_ASSERT((__val <= __runtime_bounds.upper()), "immediate argument value is above runtime upper bound");
-}
-
 _CCCL_END_NAMESPACE_CUDA_ARGUMENT
 
 #include <cuda/std/__cccl/epilogue.h>
diff --git a/libcudacxx/include/cuda/std/__internal/namespaces.h b/libcudacxx/include/cuda/std/__internal/namespaces.h
index f3cc191adfc..4876dd6f449 100644
--- a/libcudacxx/include/cuda/std/__internal/namespaces.h
+++ b/libcudacxx/include/cuda/std/__internal/namespaces.h
@@ -115,8 +115,8 @@
 #define _CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION _CCCL_BEGIN_NAMESPACE(cuda::execution)
 #define _CCCL_END_NAMESPACE_CUDA_EXECUTION   _CCCL_END_NAMESPACE(cuda::execution)
 
-#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::argument)
-#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT   _CCCL_END_NAMESPACE(cuda::argument)
+#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::args)
+#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT   _CCCL_END_NAMESPACE(cuda::args)
 
 // Namespace to avoid name collisions with CPOs on clang-16 (see
 // https://godbolt.org/z/9TadonrdM for example). MSVC's ancient parser also gets confused with
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
index 6be3c0d925b..46070dbabbd 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
@@ -16,31 +16,31 @@
 
 TEST_FUNC constexpr bool test()
 {
-  // --- static_argument_bounds ---
+  // --- static_bounds ---
 
   // Basic static bounds
   {
-    constexpr auto b = cuda::argument::static_bounds<1, 4096>{};
+    constexpr auto b = cuda::args::static_bounds<1, 4096>{};
     static_assert(b.lower() == 1);
     static_assert(b.upper() == 4096);
   }
 
   // Exact static bounds
   {
-    constexpr auto b = cuda::argument::static_bounds<42, 42>{};
+    constexpr auto b = cuda::args::static_bounds<42, 42>{};
     static_assert(b.lower() == 42);
     static_assert(b.upper() == 42);
   }
 
   // Long type deduced from NTTPs
   {
-    static_assert(cuda::std::is_same_v<decltype(cuda::argument::static_bounds<0L, 1000L>::lower()), long>);
+    static_assert(cuda::std::is_same_v<decltype(cuda::args::static_bounds<0L, 1000L>::lower()), long>);
   }
 
 #if TEST_HAS_CLASS_NTTP
   // Static bounds preserve their original NTTP types
   {
-    constexpr auto b = cuda::argument::bounds<1.0f, 8.0f>();
+    constexpr auto b = cuda::args::bounds<1.0f, 8.0f>();
     static_assert(b.lower() == 1.0f);
     static_assert(b.upper() == 8);
     static_assert(cuda::std::is_same_v<decltype(b.lower()), float>);
@@ -48,17 +48,13 @@ TEST_FUNC constexpr bool test()
   }
 #endif // TEST_HAS_CLASS_NTTP
 
-  // --- runtime_argument_bounds ---
+  // --- runtime_bounds ---
 
   // Basic runtime bounds
   {
-    auto b = cuda::argument::runtime_bounds{10, 100};
+    auto b = cuda::args::runtime_bounds{10, 100};
     assert(b.lower() == 10);
     assert(b.upper() == 100);
-    assert(b.__lower_ == 10);
-    assert(b.__upper_ == 100);
-    b.__upper_ = 90;
-    assert(b.upper() == 90);
     static_assert(cuda::std::is_same_v<decltype(b.lower()), int>);
   }
 
@@ -66,43 +62,43 @@ TEST_FUNC constexpr bool test()
 
   // Static via factory
   {
-    constexpr auto b = cuda::argument::bounds<1, 8>();
+    constexpr auto b = cuda::args::bounds<1, 8>();
     static_assert(b.lower() == 1);
     static_assert(b.upper() == 8);
-    static_assert(cuda::argument::__is_static_bounds_cv_v<decltype(b)>);
-    static_assert(!cuda::argument::__is_runtime_bounds_cv_v<decltype(b)>);
-    static_assert(cuda::argument::__is_bounds_v<decltype(b)>);
+    static_assert(cuda::args::__is_static_bounds_cv_v<decltype(b)>);
+    static_assert(!cuda::args::__is_runtime_bounds_cv_v<decltype(b)>);
+    static_assert(cuda::args::__is_bounds_v<decltype(b)>);
   }
 
   // Runtime via factory
   {
-    auto b = cuda::argument::bounds(10, 100);
+    auto b = cuda::args::bounds(10, 100);
     assert(b.lower() == 10);
     assert(b.upper() == 100);
-    static_assert(!cuda::argument::__is_static_bounds_cv_v<decltype(b)>);
-    static_assert(cuda::argument::__is_runtime_bounds_cv_v<decltype(b)>);
-    static_assert(cuda::argument::__is_bounds_v<decltype(b)>);
+    static_assert(!cuda::args::__is_static_bounds_cv_v<decltype(b)>);
+    static_assert(cuda::args::__is_runtime_bounds_cv_v<decltype(b)>);
+    static_assert(cuda::args::__is_bounds_v<decltype(b)>);
   }
 
   // Static and runtime bounds intersection
   {
-    static_assert(cuda::argument::__has_bounds_intersection<int, cuda::argument::static_bounds<1, 100>>(
-      cuda::argument::runtime_bounds<int>{50, 200}));
-    static_assert(!cuda::argument::__has_bounds_intersection<int, cuda::argument::static_bounds<100, 200>>(
-      cuda::argument::runtime_bounds<int>{0, 50}));
+    static_assert(cuda::args::__has_bounds_intersection<int, cuda::args::static_bounds<1, 100>>(
+      cuda::args::runtime_bounds<int>{50, 200}));
+    static_assert(!cuda::args::__has_bounds_intersection<int, cuda::args::static_bounds<100, 200>>(
+      cuda::args::runtime_bounds<int>{0, 50}));
   }
 
   // Non-bounds type
   {
-    static_assert(!cuda::argument::__is_bounds_v<int>);
+    static_assert(!cuda::args::__is_bounds_v<int>);
   }
 
   // Bounds types accepted by argument wrapper template parameters
   {
-    static_assert(cuda::argument::__valid_static_bounds_v<int, cuda::argument::no_bounds>);
-    static_assert(cuda::argument::__valid_static_bounds_v<int, cuda::argument::static_bounds<1, 8>>);
-    static_assert(!cuda::argument::__valid_static_bounds_v<int, cuda::argument::runtime_bounds<int>>);
-    static_assert(!cuda::argument::__valid_static_bounds_v<int, int>);
+    static_assert(cuda::args::__valid_static_bounds_v<int, cuda::args::no_bounds>);
+    static_assert(cuda::args::__valid_static_bounds_v<int, cuda::args::static_bounds<1, 8>>);
+    static_assert(!cuda::args::__valid_static_bounds_v<int, cuda::args::runtime_bounds<int>>);
+    static_assert(!cuda::args::__valid_static_bounds_v<int, int>);
   }
 
   return true;
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
index c86f1caa251..aaf57291e23 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
@@ -57,129 +57,124 @@ TEST_FUNC void test()
   // --- __is_sequence_v ---
 
   // builtin and class type are not sequences
-  static_assert(!cuda::argument::__is_sequence_v<int>);
-  static_assert(!cuda::argument::__is_sequence_v<color>);
-  static_assert(!cuda::argument::__is_sequence_v<non_sequence_value>);
-  static_assert(!cuda::argument::__is_sequence_v<range_like<int>>);
-  static_assert(!cuda::argument::__is_sequence_v<element_type_like<int>>);
-  static_assert(!cuda::argument::__is_sequence_v<value_type_like<int>>);
-  static_assert(!cuda::argument::__is_sequence_v<cuda::std::complex<float>>);
-  static_assert(!cuda::argument::__is_sequence_v<cuda::std::pair<float, int>>);
-  static_assert(!cuda::argument::__is_sequence_v<cuda::std::tuple<float, int>>);
-  static_assert(!cuda::argument::__is_sequence_v<cuda::std::optional<int>>);
-  static_assert(!cuda::argument::__is_sequence_v<cuda::std::expected<int, int>>);
+  static_assert(!cuda::args::__is_sequence_v<int>);
+  static_assert(!cuda::args::__is_sequence_v<color>);
+  static_assert(!cuda::args::__is_sequence_v<non_sequence_value>);
+  static_assert(!cuda::args::__is_sequence_v<range_like<int>>);
+  static_assert(!cuda::args::__is_sequence_v<element_type_like<int>>);
+  static_assert(!cuda::args::__is_sequence_v<value_type_like<int>>);
+  static_assert(!cuda::args::__is_sequence_v<cuda::std::complex<float>>);
+  static_assert(!cuda::args::__is_sequence_v<cuda::std::pair<float, int>>);
+  static_assert(!cuda::args::__is_sequence_v<cuda::std::tuple<float, int>>);
+  static_assert(!cuda::args::__is_sequence_v<cuda::std::optional<int>>);
+  static_assert(!cuda::args::__is_sequence_v<cuda::std::expected<int, int>>);
 
   // iterators and pointers can be sequences if they are at least random access
-  static_assert(cuda::argument::__is_sequence_v<int*>);
-  static_assert(cuda::argument::__is_sequence_v<const int*>);
-  static_assert(cuda::argument::__is_sequence_v<cuda::counting_iterator<int>>);
-  static_assert(!cuda::argument::__is_sequence_v<bidirectional_iterator<int*>>);
+  static_assert(cuda::args::__is_sequence_v<int*>);
+  static_assert(cuda::args::__is_sequence_v<const int*>);
+  static_assert(cuda::args::__is_sequence_v<cuda::counting_iterator<int>>);
+  static_assert(!cuda::args::__is_sequence_v<bidirectional_iterator<int*>>);
 
   // ranges and arrays are sequences
-  static_assert(cuda::argument::__is_sequence_v<int[]>);
-  static_assert(cuda::argument::__is_sequence_v<const int[]>);
-  static_assert(cuda::argument::__is_sequence_v<int[42]>);
-  static_assert(cuda::argument::__is_sequence_v<const int[42]>);
-  static_assert(cuda::argument::__is_sequence_v<cuda::std::span<int, 1>>);
-  static_assert(cuda::argument::__is_sequence_v<const cuda::std::span<int, 1>&>);
-  static_assert(cuda::argument::__is_sequence_v<cuda::std::span<int>>);
-  static_assert(cuda::argument::__is_sequence_v<cuda::std::array<int, 3>>);
+  static_assert(cuda::args::__is_sequence_v<int[]>);
+  static_assert(cuda::args::__is_sequence_v<const int[]>);
+  static_assert(cuda::args::__is_sequence_v<int[42]>);
+  static_assert(cuda::args::__is_sequence_v<const int[42]>);
+  static_assert(cuda::args::__is_sequence_v<cuda::std::span<int, 1>>);
+  static_assert(cuda::args::__is_sequence_v<const cuda::std::span<int, 1>&>);
+  static_assert(cuda::args::__is_sequence_v<cuda::std::span<int>>);
+  static_assert(cuda::args::__is_sequence_v<cuda::std::array<int, 3>>);
 
   // --- __element_type_of_t ---
 
-  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<const cuda::std::span<int, 1>&>, int>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<int*>, int>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<cuda::counting_iterator<int>>, int>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<cuda::std::array<int, 3>>, int>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<range_like<int>>, int>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<element_type_like<int>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__element_type_of_t<const cuda::std::span<int, 1>&>, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__element_type_of_t<int*>, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__element_type_of_t<cuda::counting_iterator<int>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__element_type_of_t<cuda::std::array<int, 3>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__element_type_of_t<range_like<int>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__element_type_of_t<element_type_like<int>>, int>);
   static_assert(
-    cuda::std::is_same_v<cuda::argument::__element_type_of_t<cuda::std::mdspan<const int, cuda::std::extents<int, 1>>>,
-                         int>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<value_type_like<int>>, int>);
+    cuda::std::is_same_v<cuda::args::__element_type_of_t<cuda::std::mdspan<const int, cuda::std::extents<int, 1>>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__element_type_of_t<value_type_like<int>>, int>);
 
   // --- argument_traits: is_deferred ---
 
-  static_assert(!cuda::argument::__traits<int>::is_deferred);
-  static_assert(!cuda::argument::__traits<cuda::argument::immediate<int>>::is_deferred);
-  static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_deferred);
-  static_assert(!cuda::argument::__traits<cuda::argument::constant<42>>::is_deferred);
+  static_assert(!cuda::args::__traits<int>::is_deferred);
+  static_assert(!cuda::args::__traits<cuda::args::immediate<int>>::is_deferred);
+  static_assert(!cuda::args::__traits<cuda::args::__immediate_sequence<cuda::std::span<int>>>::is_deferred);
+  static_assert(!cuda::args::__traits<cuda::args::constant<42>>::is_deferred);
 #if TEST_HAS_CLASS_NTTP
-  static_assert(
-    !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_deferred);
+  static_assert(!cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_deferred);
 #endif // TEST_HAS_CLASS_NTTP
-  static_assert(cuda::argument::__traits<cuda::argument::deferred<cuda::std::span<int, 1>>>::is_deferred);
-  static_assert(cuda::argument::__traits<cuda::argument::deferred_sequence<cuda::std::span<int>>>::is_deferred);
+  static_assert(cuda::args::__traits<cuda::args::deferred<cuda::std::span<int, 1>>>::is_deferred);
+  static_assert(cuda::args::__traits<cuda::args::deferred_sequence<cuda::std::span<int>>>::is_deferred);
 
   // --- argument_traits: is_single_value ---
 
-  static_assert(cuda::argument::__traits<int>::is_single_value);
-  static_assert(cuda::argument::__traits<int*>::is_single_value);
-  static_assert(cuda::argument::__traits<cuda::argument::immediate<int>>::is_single_value);
-  static_assert(cuda::argument::__traits<cuda::argument::immediate<int*>>::is_single_value);
-  static_assert(cuda::argument::__traits<cuda::argument::immediate<cuda::counting_iterator<int>>>::is_single_value);
-  static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_single_value);
-  static_assert(cuda::argument::__traits<cuda::argument::constant<42>>::is_single_value);
+  static_assert(cuda::args::__traits<int>::is_single_value);
+  static_assert(cuda::args::__traits<int*>::is_single_value);
+  static_assert(cuda::args::__traits<cuda::args::immediate<int>>::is_single_value);
+  static_assert(cuda::args::__traits<cuda::args::immediate<int*>>::is_single_value);
+  static_assert(cuda::args::__traits<cuda::args::immediate<cuda::counting_iterator<int>>>::is_single_value);
+  static_assert(!cuda::args::__traits<cuda::args::__immediate_sequence<cuda::std::span<int>>>::is_single_value);
+  static_assert(cuda::args::__traits<cuda::args::constant<42>>::is_single_value);
 #if TEST_HAS_CLASS_NTTP
   static_assert(
-    !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
+    !cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
 #endif // TEST_HAS_CLASS_NTTP
-  static_assert(cuda::argument::__traits<cuda::argument::deferred<int*>>::is_single_value);
-  static_assert(!cuda::argument::__traits<cuda::argument::deferred_sequence<cuda::std::span<int>>>::is_single_value);
+  static_assert(cuda::args::__traits<cuda::args::deferred<int*>>::is_single_value);
+  static_assert(!cuda::args::__traits<cuda::args::deferred_sequence<cuda::std::span<int>>>::is_single_value);
 
   // --- argument_traits: value_type ---
 
-  static_assert(cuda::std::is_same_v<cuda::argument::__traits<int>::value_type, int>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__traits<cuda::argument::immediate<int>>::value_type, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__traits<int>::value_type, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__traits<cuda::args::immediate<int>>::value_type, int>);
   static_assert(
-    cuda::std::is_same_v<cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::value_type,
+    cuda::std::is_same_v<cuda::args::__traits<cuda::args::__immediate_sequence<cuda::std::span<int>>>::value_type,
                          cuda::std::span<int>>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__traits<cuda::argument::constant<42>>::value_type, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__traits<cuda::args::constant<42>>::value_type, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__traits<cuda::args::constant<10, float>>::value_type, float>);
 #if TEST_HAS_CLASS_NTTP
-  static_assert(
-    cuda::std::is_same_v<
-      cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::value_type,
-      cuda::std::array<int, 3>>);
+  static_assert(cuda::std::is_same_v<
+                cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::value_type,
+                cuda::std::array<int, 3>>);
 #endif // TEST_HAS_CLASS_NTTP
 
   // --- argument_traits: lowest / highest ---
 
-  static_assert(cuda::argument::__traits<int>::lowest == cuda::std::numeric_limits<int>::lowest());
-  static_assert(cuda::argument::__traits<int>::highest == (cuda::std::numeric_limits<int>::max)());
-  static_assert(cuda::argument::__traits<const int>::lowest == cuda::std::numeric_limits<int>::lowest());
-  static_assert(cuda::argument::__traits<int&>::highest == (cuda::std::numeric_limits<int>::max)());
-  static_assert(cuda::argument::__traits<float>::lowest == cuda::std::numeric_limits<float>::lowest());
-  static_assert(cuda::argument::__traits<float>::highest == (cuda::std::numeric_limits<float>::max)());
-  static_assert(
-    cuda::argument::__traits<const cuda::argument::immediate<int, cuda::argument::static_bounds<1, 8>>>::lowest == 1);
+  static_assert(cuda::args::__traits<int>::lowest == cuda::std::numeric_limits<int>::lowest());
+  static_assert(cuda::args::__traits<int>::highest == (cuda::std::numeric_limits<int>::max)());
+  static_assert(cuda::args::__traits<const int>::lowest == cuda::std::numeric_limits<int>::lowest());
+  static_assert(cuda::args::__traits<int&>::highest == (cuda::std::numeric_limits<int>::max)());
+  static_assert(cuda::args::__traits<float>::lowest == cuda::std::numeric_limits<float>::lowest());
+  static_assert(cuda::args::__traits<float>::highest == (cuda::std::numeric_limits<float>::max)());
+  static_assert(cuda::args::__traits<const cuda::args::immediate<int, cuda::args::static_bounds<1, 8>>>::lowest == 1);
+  static_assert(cuda::args::__traits<cuda::args::immediate<int, cuda::args::static_bounds<1, 8>>&>::highest == 8);
   static_assert(
-    cuda::argument::__traits<cuda::argument::immediate<int, cuda::argument::static_bounds<1, 8>>&>::highest == 8);
-  static_assert(
-    cuda::argument::__traits<
-      cuda::argument::immediate_sequence<cuda::std::span<int>, cuda::argument::static_bounds<1, 8>>>::highest
+    cuda::args::__traits<cuda::args::__immediate_sequence<cuda::std::span<int>, cuda::args::static_bounds<1, 8>>>::highest
     == 8);
+  static_assert(cuda::args::__traits<cuda::args::constant<10, float>>::lowest == 10.0f);
+  static_assert(cuda::args::__traits<cuda::args::constant<10, float>>::highest == 10.0f);
 #if TEST_HAS_CLASS_NTTP
-  static_assert(
-    cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::lowest == 1);
-  static_assert(
-    cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::highest == 3);
+  static_assert(cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::lowest == 1);
+  static_assert(cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::highest == 3);
 #endif // TEST_HAS_CLASS_NTTP
 
   // --- Free function bounds on plain values ---
 
-  static_assert(cuda::argument::__lowest_(42) == cuda::std::numeric_limits<int>::lowest());
-  static_assert(cuda::argument::__highest_(42) == (cuda::std::numeric_limits<int>::max)());
-  static_assert(cuda::argument::__lowest_(1.0f) == cuda::std::numeric_limits<float>::lowest());
-  static_assert(cuda::argument::__highest_(1.0f) == (cuda::std::numeric_limits<float>::max)());
+  static_assert(cuda::args::__lowest_(42) == cuda::std::numeric_limits<int>::lowest());
+  static_assert(cuda::args::__highest_(42) == (cuda::std::numeric_limits<int>::max)());
+  static_assert(cuda::args::__lowest_(1.0f) == cuda::std::numeric_limits<float>::lowest());
+  static_assert(cuda::args::__highest_(1.0f) == (cuda::std::numeric_limits<float>::max)());
 
   // --- Scalar and sequence wrappers expose distinct single-value traits ---
 
-  static_assert(cuda::argument::__traits<cuda::argument::constant<42>>::is_single_value);
-  static_assert(cuda::argument::__traits<cuda::argument::immediate<int>>::is_single_value);
-  static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_single_value);
+  static_assert(cuda::args::__traits<cuda::args::constant<42>>::is_single_value);
+  static_assert(cuda::args::__traits<cuda::args::immediate<int>>::is_single_value);
+  static_assert(!cuda::args::__traits<cuda::args::__immediate_sequence<cuda::std::span<int>>>::is_single_value);
 #if TEST_HAS_CLASS_NTTP
   static_assert(
-    !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
+    !cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
 #endif // TEST_HAS_CLASS_NTTP
 }
 
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
index 5dc2fb849c7..21b200e8f0f 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
@@ -22,110 +22,110 @@ TEST_FUNC constexpr bool test()
   // Deferred single value via span<T, 1>
   {
     int val  = 42;
-    auto def = cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}};
-    assert(cuda::argument::__unwrap(def)[0] == 42);
-    assert(def.__arg_[0] == 42);
-    static_assert(cuda::argument::__traits<decltype(def)>::lowest == cuda::std::numeric_limits<int>::lowest());
-    static_assert(cuda::argument::__traits<decltype(def)>::highest == (cuda::std::numeric_limits<int>::max)());
+    auto def = cuda::args::deferred{cuda::std::span<int, 1>{&val, 1}};
+    assert(cuda::args::__unwrap(def)[0] == 42);
+    assert(cuda::args::__access::__arg(def)[0] == 42);
+    static_assert(cuda::args::__traits<decltype(def)>::lowest == cuda::std::numeric_limits<int>::lowest());
+    static_assert(cuda::args::__traits<decltype(def)>::highest == (cuda::std::numeric_limits<int>::max)());
   }
 
   // Deferred single value with static bounds
   {
     int val  = 42;
-    auto def = cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}, cuda::argument::bounds<1, 1000>()};
-    assert(cuda::argument::__unwrap(def)[0] == 42);
-    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
-    static_assert(cuda::argument::__traits<decltype(def)>::highest == 1000);
+    auto def = cuda::args::deferred{cuda::std::span<int, 1>{&val, 1}, cuda::args::bounds<1, 1000>()};
+    assert(cuda::args::__unwrap(def)[0] == 42);
+    static_assert(cuda::args::__traits<decltype(def)>::lowest == 1);
+    static_assert(cuda::args::__traits<decltype(def)>::highest == 1000);
   }
 
   // Deferred single value via pointer
   {
     int val     = 42;
-    using def_t = cuda::argument::deferred<int*, cuda::argument::static_bounds<0, 100>>;
-    static_assert(cuda::argument::__traits<def_t>::lowest == 0);
-    static_assert(cuda::argument::__traits<def_t>::highest == 100);
+    using def_t = cuda::args::deferred<int*, cuda::args::static_bounds<0, 100>>;
+    static_assert(cuda::args::__traits<def_t>::lowest == 0);
+    static_assert(cuda::args::__traits<def_t>::highest == 100);
     // Also verify construction works
-    auto def = cuda::argument::deferred{&val, cuda::argument::bounds<0, 100>()};
-    assert(cuda::argument::__unwrap(def) == &val);
+    auto def = cuda::args::deferred{&val, cuda::args::bounds<0, 100>()};
+    assert(cuda::args::__unwrap(def) == &val);
   }
 
   // Deferred single value via fancy iterator
   {
     auto it  = cuda::counting_iterator<int>{42};
-    auto def = cuda::argument::deferred{it, cuda::argument::bounds<0, 100>()};
-    assert(cuda::argument::__unwrap(def)[0] == 42);
-    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 0);
-    static_assert(cuda::argument::__traits<decltype(def)>::highest == 100);
-    static_assert(cuda::argument::__traits<decltype(def)>::is_single_value);
+    auto def = cuda::args::deferred{it, cuda::args::bounds<0, 100>()};
+    assert(cuda::args::__unwrap(def)[0] == 42);
+    static_assert(cuda::args::__traits<decltype(def)>::lowest == 0);
+    static_assert(cuda::args::__traits<decltype(def)>::highest == 100);
+    static_assert(cuda::args::__traits<decltype(def)>::is_single_value);
   }
 
   // Deferred single value with both bounds, runtime bounds first
   {
-    int val  = 42;
-    auto def = cuda::argument::deferred{
-      cuda::std::span<int, 1>{&val, 1}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 256>()};
-    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
-    static_assert(cuda::argument::__traits<decltype(def)>::highest == 256);
-    assert(def.__runtime_bounds_.__lower_ == 5);
-    assert(def.__runtime_bounds_.__upper_ == 100);
-    assert(cuda::argument::__lowest_(def) == 5);
-    assert(cuda::argument::__highest_(def) == 100);
-    def.__runtime_bounds_.__upper_ = 90;
-    assert(cuda::argument::__highest_(def) == 90);
+    int val = 42;
+    auto def =
+      cuda::args::deferred{cuda::std::span<int, 1>{&val, 1}, cuda::args::bounds(5, 100), cuda::args::bounds<1, 256>()};
+    static_assert(cuda::args::__traits<decltype(def)>::lowest == 1);
+    static_assert(cuda::args::__traits<decltype(def)>::highest == 256);
+    assert(cuda::args::__access::__runtime_bounds(def).lower() == 5);
+    assert(cuda::args::__access::__runtime_bounds(def).upper() == 100);
+    assert(cuda::args::__lowest_(def) == 5);
+    assert(cuda::args::__highest_(def) == 100);
+    cuda::args::__access::__runtime_bounds(def) = cuda::args::bounds(5, 90);
+    assert(cuda::args::__highest_(def) == 90);
   }
 
   // Deferred sequence via fancy iterator
   {
     auto it  = cuda::counting_iterator<int>{10};
-    auto def = cuda::argument::deferred_sequence{it, cuda::argument::bounds<0, 100>()};
-    assert(cuda::argument::__unwrap(def)[0] == 10);
-    assert(cuda::argument::__unwrap(def)[2] == 12);
-    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 0);
-    static_assert(cuda::argument::__traits<decltype(def)>::highest == 100);
-    static_assert(!cuda::argument::__traits<decltype(def)>::is_single_value);
+    auto def = cuda::args::deferred_sequence{it, cuda::args::bounds<0, 100>()};
+    assert(cuda::args::__unwrap(def)[0] == 10);
+    assert(cuda::args::__unwrap(def)[2] == 12);
+    static_assert(cuda::args::__traits<decltype(def)>::lowest == 0);
+    static_assert(cuda::args::__traits<decltype(def)>::highest == 100);
+    static_assert(!cuda::args::__traits<decltype(def)>::is_single_value);
   }
 
   // Deferred sequence with both bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto def   = cuda::argument::deferred_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::argument::bounds<1, 4096>(), cuda::argument::bounds(5, 100)};
-    assert(def.__arg_.size() == 4);
-    assert(def.__runtime_bounds_.__lower_ == 5);
-    assert(def.__runtime_bounds_.__upper_ == 100);
-    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
-    assert(cuda::argument::__lowest_(def) == 5);
-    assert(cuda::argument::__highest_(def) == 100);
+    auto def   = cuda::args::deferred_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::args::bounds<1, 4096>(), cuda::args::bounds(5, 100)};
+    assert(cuda::args::__access::__arg(def).size() == 4);
+    assert(cuda::args::__access::__runtime_bounds(def).lower() == 5);
+    assert(cuda::args::__access::__runtime_bounds(def).upper() == 100);
+    static_assert(cuda::args::__traits<decltype(def)>::lowest == 1);
+    assert(cuda::args::__lowest_(def) == 5);
+    assert(cuda::args::__highest_(def) == 100);
   }
 
   // Deferred sequence with both bounds, runtime bounds first
   {
     int arr[4] = {10, 20, 30, 40};
-    auto def   = cuda::argument::deferred_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 4096>()};
-    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
-    static_assert(cuda::argument::__traits<decltype(def)>::highest == 4096);
-    assert(cuda::argument::__lowest_(def) == 5);
-    assert(cuda::argument::__highest_(def) == 100);
+    auto def   = cuda::args::deferred_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::args::bounds(5, 100), cuda::args::bounds<1, 4096>()};
+    static_assert(cuda::args::__traits<decltype(def)>::lowest == 1);
+    static_assert(cuda::args::__traits<decltype(def)>::highest == 4096);
+    assert(cuda::args::__lowest_(def) == 5);
+    assert(cuda::args::__highest_(def) == 100);
   }
 
   // Traits: deferred is single value
   {
-    using traits = cuda::argument::__traits<cuda::argument::deferred<cuda::std::span<int, 1>>>;
+    using traits = cuda::args::__traits<cuda::args::deferred<cuda::std::span<int, 1>>>;
     static_assert(traits::is_deferred);
     static_assert(traits::is_single_value);
   }
 
   // Traits: deferred with pointer is also single value
   {
-    using traits = cuda::argument::__traits<cuda::argument::deferred<int*>>;
+    using traits = cuda::args::__traits<cuda::args::deferred<int*>>;
     static_assert(traits::is_deferred);
     static_assert(traits::is_single_value);
   }
 
   // Traits: deferred_sequence is not single value
   {
-    using traits = cuda::argument::__traits<cuda::argument::deferred_sequence<cuda::std::span<int>>>;
+    using traits = cuda::args::__traits<cuda::args::deferred_sequence<cuda::std::span<int>>>;
     static_assert(traits::is_deferred);
     static_assert(!traits::is_single_value);
   }
@@ -133,16 +133,16 @@ TEST_FUNC constexpr bool test()
   // Unwrap: deferred
   {
     int val  = 99;
-    auto def = cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}};
-    auto& v  = cuda::argument::__unwrap(def);
+    auto def = cuda::args::deferred{cuda::std::span<int, 1>{&val, 1}};
+    auto& v  = cuda::args::__unwrap(def);
     assert(v[0] == 99);
   }
 
   // Unwrap: deferred_sequence
   {
     int arr[3]    = {10, 20, 30};
-    auto def      = cuda::argument::deferred_sequence{cuda::std::span<int>{arr, 3}};
-    const auto& v = cuda::argument::__unwrap(def);
+    auto def      = cuda::args::deferred_sequence{cuda::std::span<int>{arr, 3}};
+    const auto& v = cuda::args::__unwrap(def);
     assert(v.size() == 3);
     assert(v[1] == 20);
   }
@@ -150,14 +150,14 @@ TEST_FUNC constexpr bool test()
   // Unwrap: rvalue deferred returns by value
   {
     int val = 99;
-    auto v  = cuda::argument::__unwrap(cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}});
+    auto v  = cuda::args::__unwrap(cuda::args::deferred{cuda::std::span<int, 1>{&val, 1}});
     assert(v[0] == 99);
   }
 
   // Unwrap: rvalue deferred_sequence returns by value
   {
     int arr[3] = {10, 20, 30};
-    auto v     = cuda::argument::__unwrap(cuda::argument::deferred_sequence{cuda::std::span<int>{arr, 3}});
+    auto v     = cuda::args::__unwrap(cuda::args::deferred_sequence{cuda::std::span<int>{arr, 3}});
     assert(v.size() == 3);
     assert(v[2] == 30);
   }
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp
index 1ce371b84e5..64bad620293 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp
@@ -10,7 +10,7 @@
 
 #include <cuda/argument>
 
-[[maybe_unused]] cuda::argument::deferred_sequence<int> invalid_arg{0};
+[[maybe_unused]] cuda::args::deferred_sequence<int> invalid_arg{0};
 
 int main(int, char**)
 {
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp
index 41a9a2ae778..111bc226ae5 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp
@@ -10,7 +10,7 @@
 
 #include <cuda/argument>
 
-using traits = cuda::argument::__traits<cuda::argument::deferred_sequence<int>>;
+using traits = cuda::args::__traits<cuda::args::deferred_sequence<int>>;
 
 [[maybe_unused]] constexpr bool invalid_traits = traits::is_deferred;
 
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp
index e969b9bd234..7970c50e2df 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp
@@ -25,99 +25,99 @@ TEST_FUNC constexpr bool test()
 {
   // Uniform scalar via CTAD
   {
-    auto da = cuda::argument::immediate{5};
-    assert(cuda::argument::__unwrap(da) == 5);
-    assert(da.__arg_ == 5);
-    static_assert(cuda::argument::__traits<decltype(da)>::lowest == cuda::std::numeric_limits<int>::lowest());
-    static_assert(cuda::argument::__traits<decltype(da)>::highest == (cuda::std::numeric_limits<int>::max)());
-    assert(cuda::argument::__lowest_(da) == 5);
-    assert(cuda::argument::__highest_(da) == 5);
-    da.__arg_ = 6;
-    assert(cuda::argument::__unwrap(da) == 6);
+    auto da = cuda::args::immediate{5};
+    assert(cuda::args::__unwrap(da) == 5);
+    assert(cuda::args::__access::__arg(da) == 5);
+    static_assert(cuda::args::__traits<decltype(da)>::lowest == cuda::std::numeric_limits<int>::lowest());
+    static_assert(cuda::args::__traits<decltype(da)>::highest == (cuda::std::numeric_limits<int>::max)());
+    assert(cuda::args::__lowest_(da) == 5);
+    assert(cuda::args::__highest_(da) == 5);
+    cuda::args::__access::__arg(da) = 6;
+    assert(cuda::args::__unwrap(da) == 6);
   }
 
   // Uniform scalar with static bounds
   {
-    auto da = cuda::argument::immediate{5, cuda::argument::bounds<1, 8>()};
-    assert(cuda::argument::__unwrap(da) == 5);
-    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::argument::__traits<decltype(da)>::highest == 8);
-    assert(cuda::argument::__lowest_(da) == 5);
-    assert(cuda::argument::__highest_(da) == 5);
+    auto da = cuda::args::immediate{5, cuda::args::bounds<1, 8>()};
+    assert(cuda::args::__unwrap(da) == 5);
+    static_assert(cuda::args::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::args::__traits<decltype(da)>::highest == 8);
+    assert(cuda::args::__lowest_(da) == 5);
+    assert(cuda::args::__highest_(da) == 5);
   }
 
   // Non-sequence values are accepted without scalar-only restrictions
   {
-    auto da = cuda::argument::immediate{non_sequence_value{7}};
-    assert(cuda::argument::__unwrap(da).payload == 7);
+    auto da = cuda::args::immediate{non_sequence_value{7}};
+    assert(cuda::args::__unwrap(da).payload == 7);
   }
 
   // Pointer-like types can still represent a single value when explicitly wrapped that way
   {
     int value = 11;
-    auto da   = cuda::argument::immediate{&value};
-    static_assert(cuda::argument::__traits<decltype(da)>::is_single_value);
-    assert(*cuda::argument::__unwrap(da) == 11);
+    auto da   = cuda::args::immediate{&value};
+    static_assert(cuda::args::__traits<decltype(da)>::is_single_value);
+    assert(*cuda::args::__unwrap(da) == 11);
   }
 
   // Per-segment span with runtime bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da    = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::argument::bounds(1L, 100L)};
-    assert(cuda::argument::__unwrap(da).size() == 4);
-    assert(da.__arg_.size() == 4);
-    assert(da.__runtime_bounds_.__lower_ == 1);
-    assert(da.__runtime_bounds_.__upper_ == 100);
-    assert(cuda::argument::__lowest_(da) == 1);
-    assert(cuda::argument::__highest_(da) == 100);
-    da.__runtime_bounds_.__upper_ = 90;
-    assert(cuda::argument::__highest_(da) == 90);
+    auto da    = cuda::args::__immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::args::bounds(1L, 100L)};
+    assert(cuda::args::__unwrap(da).size() == 4);
+    assert(cuda::args::__access::__arg(da).size() == 4);
+    assert(cuda::args::__access::__runtime_bounds(da).lower() == 1);
+    assert(cuda::args::__access::__runtime_bounds(da).upper() == 100);
+    assert(cuda::args::__lowest_(da) == 1);
+    assert(cuda::args::__highest_(da) == 100);
+    cuda::args::__access::__runtime_bounds(da) = cuda::args::bounds(1, 90);
+    assert(cuda::args::__highest_(da) == 90);
   }
 
   // Per-segment span with both bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da    = cuda::argument::immediate_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(10, 200)};
-    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::argument::__traits<decltype(da)>::highest == 256);
-    assert(cuda::argument::__lowest_(da) == 10);
-    assert(cuda::argument::__highest_(da) == 200);
+    auto da    = cuda::args::__immediate_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::args::bounds<1, 256>(), cuda::args::bounds(10, 200)};
+    static_assert(cuda::args::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::args::__traits<decltype(da)>::highest == 256);
+    assert(cuda::args::__lowest_(da) == 10);
+    assert(cuda::args::__highest_(da) == 200);
   }
 
   // Per-segment span with both bounds, runtime bounds first
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da    = cuda::argument::immediate_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::argument::bounds(10, 200), cuda::argument::bounds<1, 256>()};
-    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::argument::__traits<decltype(da)>::highest == 256);
-    assert(cuda::argument::__lowest_(da) == 10);
-    assert(cuda::argument::__highest_(da) == 200);
+    auto da    = cuda::args::__immediate_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::args::bounds(10, 200), cuda::args::bounds<1, 256>()};
+    static_assert(cuda::args::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::args::__traits<decltype(da)>::highest == 256);
+    assert(cuda::args::__lowest_(da) == 10);
+    assert(cuda::args::__highest_(da) == 200);
   }
 
   // Per-segment via span
   {
     int arr[4] = {1, 2, 3, 4};
-    auto da    = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 4}};
-    assert(cuda::argument::__unwrap(da).size() == 4);
-    assert(cuda::argument::__unwrap(da)[0] == 1);
-    assert(cuda::argument::__unwrap(da)[3] == 4);
+    auto da    = cuda::args::__immediate_sequence{cuda::std::span<int>{arr, 4}};
+    assert(cuda::args::__unwrap(da).size() == 4);
+    assert(cuda::args::__unwrap(da)[0] == 1);
+    assert(cuda::args::__unwrap(da)[3] == 4);
   }
 
   // Per-segment with static bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da    = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::argument::bounds<1, 100>()};
-    assert(cuda::argument::__unwrap(da).size() == 4);
-    assert(cuda::argument::__unwrap(da)[2] == 30);
-    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::argument::__traits<decltype(da)>::highest == 100);
+    auto da    = cuda::args::__immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::args::bounds<1, 100>()};
+    assert(cuda::args::__unwrap(da).size() == 4);
+    assert(cuda::args::__unwrap(da)[2] == 30);
+    static_assert(cuda::args::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::args::__traits<decltype(da)>::highest == 100);
   }
 
   // Traits
   {
-    using traits = cuda::argument::__traits<cuda::argument::immediate<int>>;
+    using traits = cuda::args::__traits<cuda::args::immediate<int>>;
     static_assert(!traits::is_deferred);
     static_assert(traits::is_single_value);
     static_assert(cuda::std::is_same_v<traits::value_type, int>);
@@ -125,7 +125,7 @@ TEST_FUNC constexpr bool test()
 
   // Sequence traits
   {
-    using traits = cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>;
+    using traits = cuda::args::__traits<cuda::args::__immediate_sequence<cuda::std::span<int>>>;
     static_assert(!traits::is_deferred);
     static_assert(!traits::is_single_value);
     static_assert(cuda::std::is_same_v<traits::value_type, cuda::std::span<int>>);
@@ -133,39 +133,38 @@ TEST_FUNC constexpr bool test()
 
   // __is_sequence_v on unwrapped types
   {
-    static_assert(
-      !cuda::argument::__is_sequence_v<cuda::argument::__traits<cuda::argument::immediate<int>>::value_type>);
-    static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_single_value);
+    static_assert(!cuda::args::__is_sequence_v<cuda::args::__traits<cuda::args::immediate<int>>::value_type>);
+    static_assert(!cuda::args::__traits<cuda::args::__immediate_sequence<cuda::std::span<int>>>::is_single_value);
   }
 
   // Unwrap: scalar
   {
-    auto da = cuda::argument::immediate{7};
-    auto& v = cuda::argument::__unwrap(da);
+    auto da = cuda::args::immediate{7};
+    auto& v = cuda::args::__unwrap(da);
     assert(v == 7);
     v = 8;
-    assert(cuda::argument::__unwrap(da) == 8);
+    assert(cuda::args::__unwrap(da) == 8);
   }
 
   // Unwrap: span
   {
     int arr[3]    = {10, 20, 30};
-    auto da       = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 3}};
-    const auto& v = cuda::argument::__unwrap(da);
+    auto da       = cuda::args::__immediate_sequence{cuda::std::span<int>{arr, 3}};
+    const auto& v = cuda::args::__unwrap(da);
     assert(v.size() == 3);
     assert(v[1] == 20);
   }
 
   // Unwrap: rvalue scalar returns by value
   {
-    const auto& v = cuda::argument::__unwrap(cuda::argument::immediate{7});
+    const auto& v = cuda::args::__unwrap(cuda::args::immediate{7});
     assert(v == 7);
   }
 
   // Unwrap: rvalue span returns by value
   {
     int arr[3] = {10, 20, 30};
-    auto v     = cuda::argument::__unwrap(cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 3}});
+    auto v     = cuda::args::__unwrap(cuda::args::__immediate_sequence{cuda::std::span<int>{arr, 3}});
     assert(v.size() == 3);
     assert(v[2] == 30);
   }
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp
index cb6e78cade3..f3cc7a2a993 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp
@@ -24,72 +24,76 @@ TEST_FUNC void test()
 {
   // Basic value
   {
-    constexpr auto sa = cuda::argument::constant<42>{};
-    static_assert(sa.value() == 42);
+    constexpr auto sa = cuda::args::constant<42>{};
+    static_assert(cuda::args::__unwrap(sa) == 42);
     static_assert(cuda::std::is_same_v<decltype(sa)::value_type, int>);
   }
 
   // Different types
   {
-    constexpr auto sa_long = cuda::argument::constant<100L>{};
-    static_assert(sa_long.value() == 100L);
+    constexpr auto sa_long = cuda::args::constant<100L>{};
+    static_assert(cuda::args::__unwrap(sa_long) == 100L);
     static_assert(cuda::std::is_same_v<decltype(sa_long)::value_type, long>);
+
+    constexpr auto sa_float = cuda::args::constant<10, float>{};
+    static_assert(cuda::args::__unwrap(sa_float) == 10.0f);
+    static_assert(cuda::std::is_same_v<decltype(sa_float)::value_type, float>);
+    static_assert(cuda::std::is_same_v<decltype(cuda::args::__unwrap(sa_float)), float>);
   }
 
   // Negative value
   {
-    constexpr auto sa_neg = cuda::argument::constant<-1>{};
-    static_assert(sa_neg.value() == -1);
+    constexpr auto sa_neg = cuda::args::constant<-1>{};
+    static_assert(cuda::args::__unwrap(sa_neg) == -1);
   }
 
 #if TEST_HAS_CLASS_NTTP
   // Non-sequence values are accepted without scalar-only restrictions
   {
-    constexpr auto sa = cuda::argument::constant<non_sequence_value{7}>{};
-    static_assert(sa.value().payload == 7);
-    static_assert(cuda::argument::__unwrap(sa).payload == 7);
+    constexpr auto sa = cuda::args::constant<non_sequence_value{7}>{};
+    static_assert(cuda::args::__unwrap(sa).payload == 7);
   }
 #endif // TEST_HAS_CLASS_NTTP
 
 #if TEST_HAS_CLASS_NTTP
   // Array sequence
   {
-    constexpr auto sa_arr = cuda::argument::constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
-    static_assert(sa_arr.value()[0] == 128);
-    static_assert(sa_arr.value()[1] == 256);
-    static_assert(sa_arr.value()[2] == 512);
+    constexpr auto sa_arr = cuda::args::__constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
+    static_assert(cuda::args::__unwrap(sa_arr)[0] == 128);
+    static_assert(cuda::args::__unwrap(sa_arr)[1] == 256);
+    static_assert(cuda::args::__unwrap(sa_arr)[2] == 512);
     static_assert(cuda::std::is_same_v<decltype(sa_arr)::value_type, cuda::std::array<int, 3>>);
   }
 #endif // TEST_HAS_CLASS_NTTP
 
   // Bounds: scalar
   {
-    constexpr auto sa = cuda::argument::constant<42>{};
-    static_assert(cuda::argument::__lowest_(sa) == 42);
-    static_assert(cuda::argument::__highest_(sa) == 42);
+    constexpr auto sa = cuda::args::constant<42>{};
+    static_assert(cuda::args::__lowest_(sa) == 42);
+    static_assert(cuda::args::__highest_(sa) == 42);
   }
 
 #if TEST_HAS_CLASS_NTTP
   // Bounds: array sequence computes lowest/highest of elements
   {
-    constexpr auto sa = cuda::argument::constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
-    static_assert(cuda::argument::__lowest_(sa) == 128);
-    static_assert(cuda::argument::__highest_(sa) == 512);
+    constexpr auto sa = cuda::args::__constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
+    static_assert(cuda::args::__lowest_(sa) == 128);
+    static_assert(cuda::args::__highest_(sa) == 512);
   }
 #endif // TEST_HAS_CLASS_NTTP
 
 #if TEST_HAS_CLASS_NTTP
   // Bounds: empty array sequence has unconstrained element bounds
   {
-    constexpr auto sa = cuda::argument::constant_sequence<cuda::std::array<int, 0>{}>{};
-    static_assert(cuda::argument::__lowest_(sa) == cuda::std::numeric_limits<int>::lowest());
-    static_assert(cuda::argument::__highest_(sa) == (cuda::std::numeric_limits<int>::max)());
+    constexpr auto sa = cuda::args::__constant_sequence<cuda::std::array<int, 0>{}>{};
+    static_assert(cuda::args::__lowest_(sa) == cuda::std::numeric_limits<int>::lowest());
+    static_assert(cuda::args::__highest_(sa) == (cuda::std::numeric_limits<int>::max)());
   }
 #endif // TEST_HAS_CLASS_NTTP
 
   // Traits
   {
-    using traits = cuda::argument::__traits<cuda::argument::constant<42>>;
+    using traits = cuda::args::__traits<cuda::args::constant<42>>;
     static_assert(!traits::is_deferred);
     static_assert(traits::is_constant);
     static_assert(traits::is_single_value);
@@ -98,10 +102,22 @@ TEST_FUNC void test()
     static_assert(traits::highest == 42);
   }
 
+  // Traits: explicit constant value type
+  {
+    using traits = cuda::args::__traits<cuda::args::constant<10, float>>;
+    static_assert(!traits::is_deferred);
+    static_assert(traits::is_constant);
+    static_assert(traits::is_single_value);
+    static_assert(cuda::std::is_same_v<traits::value_type, float>);
+    static_assert(cuda::std::is_same_v<traits::element_type, float>);
+    static_assert(traits::lowest == 10.0f);
+    static_assert(traits::highest == 10.0f);
+  }
+
 #if TEST_HAS_CLASS_NTTP
   // Sequence traits
   {
-    using traits = cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>;
+    using traits = cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>;
     static_assert(traits::is_constant);
     static_assert(!traits::is_deferred);
     static_assert(!traits::is_single_value);
@@ -112,26 +128,33 @@ TEST_FUNC void test()
 
   // Single value: scalar is single, sequence is not
   {
-    static_assert(
-      !cuda::argument::__is_sequence_v<cuda::argument::__traits<cuda::argument::constant<42>>::value_type>);
+    static_assert(!cuda::args::__is_sequence_v<cuda::args::__traits<cuda::args::constant<42>>::value_type>);
 #if TEST_HAS_CLASS_NTTP
     static_assert(
-      !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
+      !cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
 #endif // TEST_HAS_CLASS_NTTP
   }
 
   // Unwrap: scalar
   {
-    constexpr auto sa  = cuda::argument::constant<42>{};
-    constexpr auto val = cuda::argument::__unwrap(sa);
+    constexpr auto sa  = cuda::args::constant<42>{};
+    constexpr auto val = cuda::args::__unwrap(sa);
     static_assert(val == 42);
   }
 
+  // Unwrap: scalar with explicit value type
+  {
+    constexpr auto sa  = cuda::args::constant<10, float>{};
+    constexpr auto val = cuda::args::__unwrap(sa);
+    static_assert(val == 10.0f);
+    static_assert(cuda::std::is_same_v<decltype(val), const float>);
+  }
+
 #if TEST_HAS_CLASS_NTTP
   // Unwrap: sequence
   {
-    constexpr auto sa  = cuda::argument::constant_sequence<cuda::std::array<int, 3>{10, 20, 30}>{};
-    constexpr auto val = cuda::argument::__unwrap(sa);
+    constexpr auto sa  = cuda::args::__constant_sequence<cuda::std::array<int, 3>{10, 20, 30}>{};
+    constexpr auto val = cuda::args::__unwrap(sa);
     static_assert(val[0] == 10);
     static_assert(val[2] == 30);
   }
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp
index 66a1828769b..b59d41fd7a1 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp
@@ -10,7 +10,7 @@
 
 #include <cuda/argument>
 
-using arg_t = cuda::argument::immediate<int, cuda::argument::runtime_bounds<int>>;
+using arg_t = cuda::args::immediate<int, cuda::args::runtime_bounds<int>>;
 
 [[maybe_unused]] arg_t invalid_arg{0};
 
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
index 7f9902e50a2..5212c8a1f9c 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
@@ -10,9 +10,9 @@
 
 #include <cuda/argument>
 
-using arg_t = cuda::argument::immediate<unsigned char, cuda::argument::static_bounds<0, 1000>>;
+using arg_t = cuda::args::immediate<unsigned char, cuda::args::static_bounds<0, 1000>>;
 
-[[maybe_unused]] constexpr auto invalid_highest = cuda::argument::__traits<arg_t>::highest;
+[[maybe_unused]] constexpr auto invalid_highest = cuda::args::__traits<arg_t>::highest;
 
 int main(int, char**)
 {
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
index 8170fdd7ee5..02ba7ecfe96 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
@@ -10,7 +10,7 @@
 
 #include <cuda/argument>
 
-[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::static_bounds<0, 1L>{};
+[[maybe_unused]] constexpr auto invalid_bounds = cuda::args::static_bounds<0, 1L>{};
 
 int main(int, char**)
 {
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp
index b7aca57bbdf..eada29e23de 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp
@@ -10,7 +10,7 @@
 
 // Integration test: demonstrates how an algorithm consumes argument wrappers
 // to make compile-time and runtime resource decisions.
-// All argument types (plain values, static, dynamic, deferred) work uniformly
+// All argument types (plain values, constants, immediate values, deferred values) work uniformly
 // through the free functions.
 
 #include <cuda/argument>
@@ -34,7 +34,7 @@ enum class algorithm_variant
 template <class _SegSizeArg>
 TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg)
 {
-  if constexpr (cuda::argument::__traits<_SegSizeArg>::highest <= shared_memory_capacity)
+  if constexpr (cuda::args::__traits<_SegSizeArg>::highest <= shared_memory_capacity)
   {
     return algorithm_variant::shared_memory;
   }
@@ -48,7 +48,7 @@ TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg)
 template <class _SegSizeArg>
 TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_segments)
 {
-  auto __highest = cuda::std::min(default_max_segment_size, static_cast<int>(cuda::argument::__highest_(__seg_size)));
+  auto __highest = cuda::std::min(default_max_segment_size, static_cast<int>(cuda::args::__highest_(__seg_size)));
   return __highest * __num_segments;
 }
 
@@ -56,9 +56,9 @@ TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_se
 template <class _SegSizeArg>
 TEST_FUNC constexpr int process_segments(_SegSizeArg __seg_size)
 {
-  const auto& __val = cuda::argument::__unwrap(__seg_size);
+  const auto& __val = cuda::args::__unwrap(__seg_size);
 
-  if constexpr (cuda::argument::__traits<_SegSizeArg>::is_single_value)
+  if constexpr (cuda::args::__traits<_SegSizeArg>::is_single_value)
   {
     return static_cast<int>(__val);
   }
@@ -93,82 +93,82 @@ TEST_FUNC constexpr bool test()
   }
 #endif
 
-  // static_argument: scalar, fits in shared memory, buffer = value
+  // constant: scalar, fits in shared memory, buffer = value
   {
-    constexpr auto seg_size = cuda::argument::constant<128>{};
+    constexpr auto seg_size = cuda::args::constant<128>{};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_size, 4) == 128 * 4);
     assert(process_segments(seg_size) == 128);
   }
 
 #if TEST_HAS_CLASS_NTTP
-  // static_argument: array sequence, highest fits in shared memory
+  // __constant_sequence: array sequence, highest fits in shared memory
   {
-    constexpr auto seg_sizes = cuda::argument::constant_sequence<cuda::std::array{64, 128, 256}>{};
+    constexpr auto seg_sizes = cuda::args::__constant_sequence<cuda::std::array{64, 128, 256}>{};
     static_assert(select_variant(seg_sizes) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 256 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 256);
   }
 
-  // static_argument: array sequence, highest exceeds shared memory, buffer clamped
+  // __constant_sequence: array sequence, highest exceeds shared memory, buffer clamped
   {
-    constexpr auto seg_sizes = cuda::argument::constant_sequence<cuda::std::array{64, 128, 512}>{};
+    constexpr auto seg_sizes = cuda::args::__constant_sequence<cuda::std::array{64, 128, 512}>{};
     static_assert(select_variant(seg_sizes) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 512 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 512);
   }
 #endif // TEST_HAS_CLASS_NTTP
 
-  // dynamic_argument: tight static bounds, shared memory, buffer = value
+  // immediate: tight static bounds, shared memory, buffer = value
   {
-    constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 256>()};
+    constexpr auto seg_size = cuda::args::immediate{100, cuda::args::bounds<1, 256>()};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_size, 4) == 100 * 4);
     assert(process_segments(seg_size) == 100);
   }
 
-  // dynamic_argument: wide static bounds, global memory, buffer = value
+  // immediate: wide static bounds, global memory, buffer = value
   {
-    constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 4096>()};
+    constexpr auto seg_size = cuda::args::immediate{100, cuda::args::bounds<1, 4096>()};
     static_assert(select_variant(seg_size) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_size, 4) == 100 * 4);
     assert(process_segments(seg_size) == 100);
   }
 
-  // dynamic_argument: no bounds, global memory, buffer = value
+  // immediate: no bounds, global memory, buffer = value
   {
-    constexpr auto seg_size = cuda::argument::immediate{100};
+    constexpr auto seg_size = cuda::args::immediate{100};
     static_assert(select_variant(seg_size) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_size, 4) == 100 * 4);
     assert(process_segments(seg_size) == 100);
   }
 
-  // dynamic_argument: per-segment span with runtime bounds only
+  // __immediate_sequence: per-segment span with runtime bounds only
   {
     int sizes[3]   = {64, 128, 96};
-    auto seg_sizes = cuda::argument::immediate_sequence{cuda::std::span<int>{sizes, 3}, cuda::argument::bounds(1, 200)};
+    auto seg_sizes = cuda::args::__immediate_sequence{cuda::std::span<int>{sizes, 3}, cuda::args::bounds(1, 200)};
     assert(select_variant(seg_sizes) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 200 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 96);
   }
 
-  // dynamic_argument: per-segment span with both bounds
+  // __immediate_sequence: per-segment span with both bounds
   {
     int sizes[3]   = {64, 128, 96};
-    auto seg_sizes = cuda::argument::immediate_sequence{
-      cuda::std::span<int>{sizes, 3}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)};
-    static_assert(cuda::argument::__traits<decltype(seg_sizes)>::highest <= shared_memory_capacity);
+    auto seg_sizes = cuda::args::__immediate_sequence{
+      cuda::std::span<int>{sizes, 3}, cuda::args::bounds<1, 256>(), cuda::args::bounds(1, 200)};
+    static_assert(cuda::args::__traits<decltype(seg_sizes)>::highest <= shared_memory_capacity);
     assert(select_variant(seg_sizes) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 200 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 96);
   }
 
-  // deferred_argument: uniform, bounds for decisions only
+  // deferred: uniform, bounds for decisions only
   {
-    int val       = 100;
-    auto seg_size = cuda::argument::deferred{
-      cuda::std::span<int, 1>{&val, 1}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)};
-    static_assert(cuda::argument::__traits<decltype(seg_size)>::highest <= shared_memory_capacity);
+    int val = 100;
+    auto seg_size =
+      cuda::args::deferred{cuda::std::span<int, 1>{&val, 1}, cuda::args::bounds<1, 256>(), cuda::args::bounds(1, 200)};
+    static_assert(cuda::args::__traits<decltype(seg_size)>::highest <= shared_memory_capacity);
     assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_size, 4) == 200 * 4);
   }
@@ -181,17 +181,24 @@ TEST_FUNC constexpr bool test()
     assert(process_segments(1.0f) == 1);
   }
 
+  // constant float using an integer NTTP and explicit value type
+  {
+    constexpr auto seg_size = cuda::args::constant<128, float>{};
+    static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
+    assert(process_segments(seg_size) == 128);
+  }
+
 #if TEST_HAS_CLASS_NTTP
-  // static_argument float (float NTTPs require C++20)
+  // constant float (float NTTPs require C++20)
   {
-    constexpr auto seg_size = cuda::argument::constant<128.0f>{};
+    constexpr auto seg_size = cuda::args::constant<128.0f>{};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(process_segments(seg_size) == 128);
   }
 
-  // dynamic_argument float with static bounds
+  // immediate float with static bounds
   {
-    constexpr auto seg_size = cuda::argument::immediate{100.0f, cuda::argument::bounds<1.0f, 256.0f>()};
+    constexpr auto seg_size = cuda::args::immediate{100.0f, cuda::args::bounds<1.0f, 256.0f>()};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(process_segments(seg_size) == 100);
   }

From b870d3aa4d6db61f5f44221d4b669ce3c29314c6 Mon Sep 17 00:00:00 2001
From: Piotr Ciolkosz <pciolkosz@nvidia.com>
Date: Tue, 9 Jun 2026 17:08:45 -0700
Subject: [PATCH 6/6] Chnage the test dispatch for now

---
 .../catch2_test_device_segmented_topk_keys.cu | 19 ++++----
 ...catch2_test_device_segmented_topk_pairs.cu | 46 +++++++++++++++++--
 2 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu
index 7e9ad93ca9e..fe3e97188ac 100644
--- a/cub/test/catch2_test_device_segmented_topk_keys.cu
+++ b/cub/test/catch2_test_device_segmented_topk_keys.cu
@@ -27,11 +27,11 @@ struct is_minus_zero
   }
 };
 
-template <typename KeyInputItItT,
+template <cub::detail::topk::select Direction,
+          typename KeyInputItItT,
           typename KeyOutputItItT,
           typename SegmentSizeParamT,
           typename KParamT,
-          typename SelectDirectionT,
           typename NumSegmentsParameterT,
           typename TotalNumItemsGuaranteeT>
 CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys(
@@ -41,7 +41,6 @@ CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys(
   KeyOutputItItT d_key_segments_out_it,
   SegmentSizeParamT segment_sizes,
   KParamT k,
-  SelectDirectionT select_direction,
   NumSegmentsParameterT num_segments,
   TotalNumItemsGuaranteeT total_num_items_guarantee,
   cudaStream_t stream = nullptr)
@@ -56,14 +55,15 @@ CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys(
     values_it,
     segment_sizes,
     k,
-    select_direction,
+    ::cuda::args::constant<Direction>{},
     num_segments,
     total_num_items_guarantee,
     stream);
 }
 
 // %PARAM% TEST_LAUNCH lid 0:1:2
-DECLARE_LAUNCH_WRAPPER(dispatch_batched_topk_keys, batched_topk_keys);
+DECLARE_TMPL_LAUNCH_WRAPPER(
+  dispatch_batched_topk_keys, batched_topk_keys, cub::detail::topk::select Direction, Direction);
 
 // Total segment size
 using max_segment_size_list = c2h::enum_type_list<cuda::std::size_t, 4 * 1024>;
@@ -153,12 +153,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments",
   c2h::device_vector<key_t> expected_keys(keys_in_buffer);
 
   // Run the top-k algorithm
-  batched_topk_keys(
+  batched_topk_keys<direction>(
     d_keys_in,
     d_keys_out,
     ::cuda::args::immediate{segment_size, ::cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
     ::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
-    ::cuda::args::constant<direction>{},
     ::cuda::args::immediate{num_segments},
     ::cuda::args::immediate{num_segments * segment_size});
   // Prepare expected results
@@ -251,13 +250,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment
   c2h::device_vector<key_t> expected_keys(keys_in_buffer);
 
   // Run the top-k algorithm
-  batched_topk_keys(
+  batched_topk_keys<direction>(
     d_keys_in,
     d_keys_out,
     ::cuda::args::__immediate_sequence{
       segment_size_it, ::cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
     ::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
-    ::cuda::args::constant<direction>{},
     ::cuda::args::immediate{num_segments},
     ::cuda::args::immediate{num_items});
 
@@ -289,12 +287,11 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment
   auto d_keys_out_it =
     cuda::make_strided_iterator(cuda::make_counting_iterator(thrust::raw_pointer_cast(d_keys_out.data())), k);
 
-  batched_topk_keys(
+  batched_topk_keys<cub::detail::topk::select::min>(
     d_keys_in_it,
     d_keys_out_it,
     ::cuda::args::immediate{segment_size, ::cuda::args::bounds<cuda::std::int64_t{1}, max_segment_size>()},
     ::cuda::args::immediate{k, ::cuda::args::bounds<cuda::std::int64_t{1}, k>()},
-    ::cuda::args::constant<cub::detail::topk::select::min>{},
     ::cuda::args::immediate{num_segments},
     ::cuda::args::immediate{num_segments * segment_size});
 
diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu
index cef26687cbd..6ba474daf81 100644
--- a/cub/test/catch2_test_device_segmented_topk_pairs.cu
+++ b/cub/test/catch2_test_device_segmented_topk_pairs.cu
@@ -47,8 +47,46 @@ struct flag_intra_segment_duplicates
 template <typename ItemItT, typename SegIdItT>
 flag_intra_segment_duplicates(ItemItT, SegIdItT) -> flag_intra_segment_duplicates<ItemItT, SegIdItT>;
 
+template <cub::detail::topk::select Direction,
+          typename KeyInputItItT,
+          typename KeyOutputItItT,
+          typename ValueInputItItT,
+          typename ValueOutputItItT,
+          typename SegmentSizeParameterT,
+          typename KParameterT,
+          typename NumSegmentsParameterT,
+          typename TotalNumItemsGuaranteeT>
+CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_pairs(
+  void* d_temp_storage,
+  size_t& temp_storage_bytes,
+  KeyInputItItT d_key_segments_it,
+  KeyOutputItItT d_key_segments_out_it,
+  ValueInputItItT d_value_segments_it,
+  ValueOutputItItT d_value_segments_out_it,
+  SegmentSizeParameterT segment_sizes,
+  KParameterT k,
+  NumSegmentsParameterT num_segments,
+  TotalNumItemsGuaranteeT total_num_items_guarantee,
+  cudaStream_t stream = nullptr)
+{
+  return cub::detail::batched_topk::dispatch(
+    d_temp_storage,
+    temp_storage_bytes,
+    d_key_segments_it,
+    d_key_segments_out_it,
+    d_value_segments_it,
+    d_value_segments_out_it,
+    segment_sizes,
+    k,
+    ::cuda::args::constant<Direction>{},
+    num_segments,
+    total_num_items_guarantee,
+    stream);
+}
+
 // %PARAM% TEST_LAUNCH lid 0:1:2
-DECLARE_LAUNCH_WRAPPER(cub::detail::batched_topk::dispatch, batched_topk_pairs);
+DECLARE_TMPL_LAUNCH_WRAPPER(
+  dispatch_batched_topk_pairs, batched_topk_pairs, cub::detail::topk::select Direction, Direction);
 
 // Total segment size
 using max_segment_size_list = c2h::enum_type_list<cuda::std::size_t, 4 * 1024>;
@@ -220,14 +258,13 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small fixed-size segments"
   c2h::device_vector<key_t> expected_keys(keys_in_buffer);
 
   // Run the top-k algorithm
-  batched_topk_pairs(
+  batched_topk_pairs<direction>(
     d_keys_in,
     d_keys_out,
     d_values_in,
     d_values_out,
     ::cuda::args::immediate{segment_size, ::cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
     ::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
-    ::cuda::args::constant<direction>{},
     ::cuda::args::immediate{num_segments},
     ::cuda::args::immediate{num_segments * segment_size});
 
@@ -341,7 +378,7 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen
   c2h::device_vector<key_t> expected_keys(keys_in_buffer);
 
   // Run the top-k algorithm
-  batched_topk_pairs(
+  batched_topk_pairs<direction>(
     d_keys_in,
     d_keys_out,
     d_values_in,
@@ -349,7 +386,6 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen
     ::cuda::args::__immediate_sequence{
       segment_size_it, ::cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
     ::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
-    ::cuda::args::constant<direction>{},
     ::cuda::args::immediate{num_segments},
     ::cuda::args::immediate{num_items});