template<typename InputIterator, int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockLoadPolicy POLICY = BLOCK_LOAD_DIRECT, PtxLoadModifier MODIFIER = PTX_LOAD_NONE>
class cub::BlockLoad< InputIterator, BLOCK_THREADS, ITEMS_PER_THREAD, POLICY, MODIFIER >
BlockLoad provides data movement operations for reading block-arranged data from global memory.
.
BlockLoad provides a single tile-loading abstraction whose performance behavior can be statically tuned. In particular, BlockLoad implements alternative cub::BlockLoadPolicy strategies catering to different granularity sizes (i.e., number of items per thread).
- Template Parameters
-
- Algorithm
- BlockLoad can be (optionally) configured to use one of three alternative methods:
- cub::BLOCK_LOAD_DIRECT. A blocked arrangement of data is read directly from memory. More...
- cub::BLOCK_LOAD_VECTORIZE. A blocked arrangement of data is read directly from memory using CUDA's built-in vectorized loads as a coalescing optimization. More...
- cub::BLOCK_LOAD_TRANSPOSE. A striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement. More...
- Usage Considerations
- After any operation, a subsequent __syncthreads() barrier is required if the supplied BlockLoad::SmemStorage is to be reused or repurposed by the threadblock.
- Performance Considerations
-
- Examples
- Example 1. Have a 128-thread threadblock directly load a blocked arrangement of four consecutive integers per thread.
#include <cub.cuh>
__global__ void SomeKernel(int *d_in, ...)
{
int data[4];
...
- Example 2. Have a threadblock load a blocked arrangement of ITEMS_PER_THREAD consecutive integers per thread using vectorized loads and global-only caching:
#include <cub.cuh>
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD>
__global__ void SomeKernel(int *d_in, ...)
{
int data[ITEMS_PER_THREAD];
BlockLoad::Load(smem_storage, d_in + blockIdx.x * BLOCK_THREADS * ITEMS_PER_THREAD, data);
...
|
|
typedef _SmemStorage | SmemStorage |
| | The operations exposed by BlockLoad require shared memory of this type. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated shared memory or union'd with other types to facilitate shared memory reuse.
|
| |
|
static __device__
__forceinline__ void | Load (SmemStorage &smem_storage, InputIterator block_itr, T(&items)[ITEMS_PER_THREAD]) |
| | Load a tile of items across a threadblock. More...
|
| |
| template<typename SizeT > |
static __device__
__forceinline__ void | Load (SmemStorage &smem_storage, InputIterator block_itr, const SizeT &guarded_items, T(&items)[ITEMS_PER_THREAD]) |
| | Load a tile of items across a threadblock, guarded by range. More...
|
| |
template<typename InputIterator , int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockLoadPolicy POLICY = BLOCK_LOAD_DIRECT, PtxLoadModifier MODIFIER = PTX_LOAD_NONE>
| static __device__ __forceinline__ void cub::BlockLoad< InputIterator, BLOCK_THREADS, ITEMS_PER_THREAD, POLICY, MODIFIER >::Load |
( |
SmemStorage & |
smem_storage, |
|
|
InputIterator |
block_itr, |
|
|
T(&) |
items[ITEMS_PER_THREAD] |
|
) |
| |
|
inline static |
Load a tile of items across a threadblock.
- Parameters
-
| [in] | smem_storage | Shared reference to opaque SmemStorage layout |
| [in] | block_itr | The threadblock's base input iterator for loading from |
| [out] | items | Destination array that receives the loaded items |
template<typename InputIterator , int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockLoadPolicy POLICY = BLOCK_LOAD_DIRECT, PtxLoadModifier MODIFIER = PTX_LOAD_NONE>
template<typename SizeT >
| static __device__ __forceinline__ void cub::BlockLoad< InputIterator, BLOCK_THREADS, ITEMS_PER_THREAD, POLICY, MODIFIER >::Load |
( |
SmemStorage & |
smem_storage, |
|
|
InputIterator |
block_itr, |
|
|
const SizeT & |
guarded_items, |
|
|
T(&) |
items[ITEMS_PER_THREAD] |
|
) |
| |
|
inline static |
Load a tile of items across a threadblock, guarded by range.
- Template Parameters
-
| SizeT | [inferred] Integer type for offsets |
- Parameters
-
| [in] | smem_storage | Shared reference to opaque SmemStorage layout |
| [in] | block_itr | The threadblock's base input iterator for loading from |
| [in] | guarded_items | Number of valid items in the tile |
| [out] | items | Destination array that receives the loaded items |
The documentation for this class was generated from the following file: