template<typename OutputIterator, int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockStorePolicy POLICY = BLOCK_STORE_DIRECT, PtxStoreModifier MODIFIER = PTX_STORE_NONE>
class cub::BlockStore< OutputIterator, BLOCK_THREADS, ITEMS_PER_THREAD, POLICY, MODIFIER >
BlockStore provides data movement operations for writing a blocked arrangement of data to global memory.
BlockStore provides a single tile-storing abstraction whose performance behavior can be statically tuned. In particular, BlockStore implements several alternative cub::BlockStorePolicy strategies catering to different granularity sizes (i.e., number of items per thread).
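- Template Parameters
  - OutputIterator: The output iterator type used for storing.
  - BLOCK_THREADS: The number of threads in the threadblock.
  - ITEMS_PER_THREAD: The number of consecutive items held by each thread.
  - POLICY: [optional] cub::BlockStorePolicy strategy to use. Default = BLOCK_STORE_DIRECT.
  - MODIFIER: [optional] PtxStoreModifier applied to the stores. Default = PTX_STORE_NONE.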
- Algorithm
- BlockStore can be (optionally) configured to use one of three alternative methods:
- cub::BLOCK_STORE_DIRECT. A blocked arrangement of data is written directly to memory. More...
- cub::BLOCK_STORE_VECTORIZE. A blocked arrangement of data is written directly to memory using CUDA's built-in vectorized stores as a coalescing optimization. More...
- cub::BLOCK_STORE_TRANSPOSE. A blocked arrangement is locally transposed into a striped arrangement which is then written to memory. More...
- Usage Considerations
- After any operation, a subsequent __syncthreads() barrier is required if the supplied BlockStore::SmemStorage is to be reused or repurposed by the threadblock.
- Performance Considerations
-
- Examples
- Example 1. Have a 128-thread threadblock directly store a blocked arrangement of four consecutive integers per thread.
#include <cub.cuh>
template <int BLOCK_THREADS>
__global__ void SomeKernel(int *d_out, ...)
{
int data[4];
...
}
Example 2. Have a threadblock store a blocked arrangement of ITEMS_PER_THREAD consecutive integers per thread using vectorized stores and global-only caching:
#include <cub.cuh>
template <int BLOCK_THREADS>
__global__ void SomeKernel(int *d_out, ...)
{
const int ITEMS_PER_THREAD = 4;
int data[4];
...
}
- static __device__ __forceinline__ void Store (SmemStorage &smem_storage, OutputIterator block_itr, T(&items)[ITEMS_PER_THREAD])
  Store a tile of items across a threadblock. More...
- template<typename SizeT> static __device__ __forceinline__ void Store (SmemStorage &smem_storage, OutputIterator block_itr, const SizeT &guarded_items, T(&items)[ITEMS_PER_THREAD])
  Store a tile of items across a threadblock, guarded by range. More...
template<typename OutputIterator , int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockStorePolicy POLICY = BLOCK_STORE_DIRECT, PtxStoreModifier MODIFIER = PTX_STORE_NONE>
template<typename SizeT >
static __device__ __forceinline__ void cub::BlockStore< OutputIterator, BLOCK_THREADS, ITEMS_PER_THREAD, POLICY, MODIFIER >::Store (SmemStorage &smem_storage, OutputIterator block_itr, const SizeT &guarded_items, T(&items)[ITEMS_PER_THREAD])
inline static
Store a tile of items across a threadblock, guarded by range.
- Template Parameters
-
| SizeT | [inferred] Integer type for offsets |
- Parameters
-
| [in] | smem_storage | Shared reference to opaque SmemStorage layout |
| [in] | block_itr | The threadblock's base output iterator for storing to |
| [in] | guarded_items | Number of valid items in the tile |
| [in] | items | Data to store |