Created
August 6, 2016 18:48
-
-
Save allanmac/bf28f61a88be14693e594653e2c280fd to your computer and use it in GitHub Desktop.
Inclusive vs. exclusive warp scan
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdint.h> | |
#define WARP_SIZE 32 | |
// | |
// | |
// | |
__device__ | |
uint32_t | |
warp_scan_inc_up_add_shuffled(uint32_t v, uint32_t const width=WARP_SIZE) | |
{ | |
asm("{ \n\t" | |
" .reg .u32 t; \n\t" | |
" .reg .pred p; \n\t"); | |
#pragma unroll | |
for (uint32_t d = 1; d < width; d *= 2) | |
asm("shfl.up.b32 t|p, %0, %1, 0x0; \n\t" | |
"@p add.u32 %0, %0, t; \n\t" : "+r"(v) : "r"(d)); | |
asm("}"); | |
return v; | |
} | |
__device__ | |
uint32_t | |
warp_scan_exc_up_add_shuffled(uint32_t v, uint32_t const width=WARP_SIZE) | |
{ | |
#if 1 | |
asm("{ \n\t" | |
" .reg .u32 t; \n\t" | |
" .reg .pred p; \n\t" | |
" \n\t" | |
" shfl.up.b32 t|p, %0, 1, 0x0; \n\t" | |
" selp.b32 %0, t, 0, p; \n\t" : "+r"(v)); | |
#pragma unroll | |
for (uint32_t d = 1; d < width; d *= 2) | |
asm("shfl.up.b32 t|p, %0, %1, 0x0; \n\t" | |
"@p add.u32 %0, %0, t; \n\t" : "+r"(v) : "r"(d)); | |
asm("}"); | |
return v; | |
#else | |
return warp_scan_inc_up_add_shuffled(v,width) - v; | |
#endif | |
} | |
// | |
// | |
// | |
extern "C" | |
__global__ | |
void | |
warp_scan_exc_up_add_shuffled_kernel(uint32_t const * const vin, uint32_t * const vout) | |
{ | |
vout[threadIdx.x] = warp_scan_exc_up_add_shuffled(vin[threadIdx.x]); | |
} | |
extern "C" | |
__global__ | |
void | |
warp_scan_inc_up_add_shuffled_kernel(uint32_t const * const vin, uint32_t * const vout) | |
{ | |
vout[threadIdx.x] = warp_scan_inc_up_add_shuffled(vin[threadIdx.x]); | |
} | |
// | |
// | |
// | |
void | |
print_scan(char const * const msg, uint32_t const warp[WARP_SIZE]) | |
{ | |
printf("%6s: ",msg); | |
for (int ii=0; ii<WARP_SIZE; ii++) | |
printf("%2d ",warp[ii]); | |
printf("\n"); | |
} | |
// | |
// | |
// | |
int | |
main(int argc, char** argv) | |
{ | |
// | |
// warp_scan [device] [0=exclusive] -- otherwise defaults to inclusive | |
// | |
const int device = (argc >= 2) ? atoi(argv[1]) : 0; | |
const bool inclusive = (argc == 3) ? atoi(argv[2]) != 0 : true; | |
cudaDeviceProp props; | |
cudaGetDeviceProperties(&props,device); | |
printf("%s (%2d)\n",props.name,props.multiProcessorCount); | |
printf("%s scan ...\n",inclusive ? "inclusive" : "exclusive"); | |
cudaSetDevice(device); | |
// | |
// | |
// | |
size_t const size = sizeof(uint32_t) * WARP_SIZE; | |
uint32_t *vin_d, *vout_d; | |
cudaMalloc(&vin_d, size); | |
cudaMalloc(&vout_d,size); | |
// | |
// | |
// | |
uint32_t const vin_h[32] = { 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, | |
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1 }; | |
cudaMemcpy(vin_d,vin_h,size,cudaMemcpyHostToDevice); | |
// | |
// LAUNCH KERNEL | |
// | |
if (inclusive) | |
warp_scan_inc_up_add_shuffled_kernel<<<1,WARP_SIZE>>>(vin_d,vout_d); | |
else | |
warp_scan_exc_up_add_shuffled_kernel<<<1,WARP_SIZE>>>(vin_d,vout_d); | |
cudaDeviceSynchronize(); | |
// | |
// | |
// | |
uint32_t vout_h[32]; | |
cudaMemcpy(vout_h,vout_d,size,cudaMemcpyDeviceToHost); | |
print_scan("warp",vin_h); | |
print_scan("scan",vout_h); | |
// | |
// | |
// | |
cudaFree(vin_d); | |
cudaFree(vout_d); | |
cudaDeviceReset(); | |
return 0; | |
} | |
// | |
// | |
// |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment