PTX shfl_sync #3241

Open — wants to merge 16 commits into base: main
13 changes: 7 additions & 6 deletions docs/libcudacxx/ptx/instructions.rst
@@ -9,23 +9,24 @@ PTX Instructions
instructions/barrier_cluster
instructions/cp_async_bulk
instructions/cp_async_bulk_commit_group
instructions/cp_async_bulk_wait_group
instructions/cp_async_bulk_tensor
instructions/cp_async_bulk_wait_group
instructions/cp_reduce_async_bulk
instructions/cp_reduce_async_bulk_tensor
instructions/fence
instructions/getctarank
instructions/mapa
instructions/mbarrier_init
instructions/mbarrier_arrive
instructions/mbarrier_expect_tx
instructions/mbarrier_init
instructions/mbarrier_test_wait
instructions/mbarrier_try_wait
instructions/red_async
instructions/shfl_sync
instructions/special_registers
instructions/st_async
instructions/tensormap_replace
instructions/tensormap_cp_fenceproxy
instructions/special_registers
instructions/tensormap_replace


Instructions by section
@@ -232,8 +233,8 @@ Instructions by section
- No
* - `shfl <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated>`__
- No
* - `shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
- No
* - `shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
- Yes, CCCL 2.9.0 / CUDA 12.9
* - `prmt <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt>`__
- No
* - `ld <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld>`__
77 changes: 77 additions & 0 deletions docs/libcudacxx/ptx/instructions/generated/shfl_sync.rst
@@ -0,0 +1,77 @@

shfl.sync
^^^^^^^^^

.. code:: cuda

// PTX ISA 6.0
// shfl.sync.mode.b32 d[|p], a, b, c, membermask;
// .mode = { .up, .down, .bfly, .idx };

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_idx(T data,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_idx(T data,
bool& pred,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_up(T data,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_up(T data,
bool& pred,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_down(T data,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_down(T data,
bool& pred,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_bfly(T data,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_bfly(T data,
bool& pred,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

**Constraints and checks**

- ``T`` must have 32-bit size (compile-time)
- ``lane_idx_offset`` must be less than the warp size (debug mode)
- ``clamp_segmask`` must use the bit positions [0:4] and [8:12] (debug mode)
- ``lane_mask`` must be a subset of the active mask (debug mode)
- The destination lane must be a member of the ``lane_mask`` (debug mode)
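The ``clamp_segmask`` packing and the lane-selection rule these wrappers expose come straight from the PTX ISA pseudocode for ``shfl.sync``. The following is a host-side C++ sketch of that rule for illustration only — the enum, function name, and field decoding are ours, not part of libcu++ — showing how the clamp value in bits [0:4] and the segment mask in bits [8:12] determine the source lane and the optional predicate:

```cpp
#include <cassert>
#include <cstdint>

// Illustrative host-side model of the shfl.sync lane-selection rule,
// following the PTX ISA pseudocode. Not a libcu++ API.
enum class ShflMode { Up, Down, Bfly, Idx };

// Returns the lane index the calling `lane` reads from; `pval` receives
// the predicate that the bool& overloads of the wrappers would report.
inline uint32_t shfl_source_lane(ShflMode mode, uint32_t lane, uint32_t bval,
                                 uint32_t clamp_segmask, bool& pval) {
    uint32_t cval    = clamp_segmask & 0x1f;        // clamp value, bits [0:4]
    uint32_t segmask = (clamp_segmask >> 8) & 0x1f; // segment mask, bits [8:12]
    uint32_t max_lane = (lane & segmask) | (cval & ~segmask & 0x1f);
    uint32_t min_lane = lane & segmask;

    int32_t j = 0;
    switch (mode) {
        case ShflMode::Up:
            j = int32_t(lane) - int32_t(bval);
            pval = j >= int32_t(max_lane);
            break;
        case ShflMode::Down:
            j = int32_t(lane + bval);
            pval = uint32_t(j) <= max_lane;
            break;
        case ShflMode::Bfly:
            j = int32_t(lane ^ bval);
            pval = uint32_t(j) <= max_lane;
            break;
        case ShflMode::Idx:
            j = int32_t(min_lane | (bval & ~segmask & 0x1f));
            pval = uint32_t(j) <= max_lane;
            break;
    }
    if (!pval) j = int32_t(lane); // out of range: thread keeps its own value
    return uint32_t(j);
}
```

For a full-warp shuffle, ``clamp_segmask`` is ``0x1f`` for ``.down``, ``.bfly``, and ``.idx`` (segment mask 0, clamp 31) and ``0`` for ``.up``; a narrower segment width ``w`` sets the segment-mask field to ``32 - w``, which is why only bit positions [0:4] and [8:12] are meaningful.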
10 changes: 10 additions & 0 deletions docs/libcudacxx/ptx/instructions/shfl_sync.rst
@@ -0,0 +1,10 @@

.. _libcudacxx-ptx-instructions-shfl_sync:

shfl.sync
=========

- PTX ISA:
`shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__

.. include:: generated/shfl_sync.rst