Last active
July 1, 2016 15:32
-
-
Save kalj/0d279d405b214d5304559583b0c3bb9c to your computer and use it in GitHub Desktop.
Register-hungry CUDA code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define ELEM_DEGREE 4 | |
enum Direction { X, Y, Z}; | |
enum Transpose { TR, NOTR}; | |
template <Direction dir, unsigned int n, | |
bool add, bool inplace,typename Number> | |
__device__ void reduce(Number * __restrict__ dst, const Number * __restrict__ src, const Number *myphi) | |
{ | |
const unsigned int q = threadIdx.x; // new index | |
const unsigned int i = threadIdx.y; // two unchanged | |
const unsigned int j = threadIdx.z%n; // indices | |
Number tmp = 0; | |
#pragma unroll | |
for(int k = 0; k < n; ++k) { | |
const unsigned int srcidx = | |
(dir==X) ? (k + n*(i + n*j)) | |
: (dir==Y) ? (i + n*(k + n*j)) | |
: (i + n*(j + n*k)); | |
tmp += myphi[k] * (inplace ? dst[srcidx] : src[srcidx]); | |
} | |
if(inplace) __syncthreads(); | |
const unsigned int dstidx = | |
(dir==X) ? (q + n*(i + n*j)) | |
: (dir==Y) ? (i + n*(q + n*j)) | |
: (i + n*(j + n*q)); | |
if(add) | |
dst[dstidx] += tmp; | |
else | |
dst[dstidx] = tmp; | |
} | |
template <int dim, int n, typename Number> | |
class Phi { | |
const static int npts=n*n*n; | |
const int tid = threadIdx.x+n*(threadIdx.y + n*threadIdx.z); | |
const unsigned int cell = blockIdx.x+gridDim.x*blockIdx.y; | |
const int ncells; | |
public: | |
__device__ Phi(int nc) | |
: ncells(nc) { } | |
__device__ unsigned int global_q() const { return ROWLENGTH*cell+tid; } | |
__device__ void read_values(Number *__restrict__ values, const Number *__restrict__ src, | |
const unsigned int * __restrict__ loc2glob) | |
{ | |
values[tid] = __ldg(&src[loc2glob[ROWLENGTH*cell+tid]]); | |
__syncthreads(); | |
} | |
__device__ void write_values(Number *__restrict__ dst, const Number *__restrict__ values, | |
const unsigned int * __restrict__ loc2glob) | |
{ | |
dst[loc2glob[cell*ROWLENGTH+tid]] += values[tid]; | |
} | |
__device__ void get_grad(GpuArray<dim,Number> &grad, const Number *__restrict__ gradients, | |
const Number * __restrict__ jac) const | |
{ | |
#pragma unroll | |
for(int d1=0; d1<dim; d1++) { | |
Number tmp = 0; | |
#pragma unroll | |
for(int d2=0; d2<dim; d2++) { | |
tmp += jac[((dim*d2+d1)*ncells+cell)*ROWLENGTH+tid]*gradients[d2*npts+tid]; | |
} | |
grad[d1] = tmp; | |
} | |
} | |
__device__ void submit_grad(Number *__restrict__ gradients, const GpuArray<dim,Number> &grad, | |
const Number * __restrict__ jac, | |
const Number * __restrict__ jxw) const | |
{ | |
#pragma unroll | |
for(int d1=0; d1<dim; d1++) { | |
Number tmp = 0; | |
#pragma unroll | |
for(int d2=0; d2<dim; d2++) { | |
tmp += jac[((dim*d1+d2)*ncells+cell)*ROWLENGTH+tid]*grad[d2]; | |
} | |
gradients[d1*npts+tid] = tmp*jxw[cell*ROWLENGTH+tid]; | |
} | |
__syncthreads(); | |
} | |
__device__ void evaluate(Number *__restrict__ gradients, const Number *__restrict__ values) | |
{ | |
/////////////////////////////////////////////////////////////// | |
// Stage PHI and DPHI in registers: | |
Number my_phi[n]; | |
Number my_dphi[n]; | |
#pragma unroll | |
for(int i = 0; i < n; i++) { | |
my_phi[i] = shape_values[threadIdx.x + n*i]; | |
my_dphi[i] = shape_gradient[threadIdx.x + n*i]; | |
} | |
/////////////////////////////////////////////////////////////// | |
// reduce along x / i / q - direction | |
reduce<X,n,false,false> (&gradients[0],values,my_dphi); | |
reduce<X,n,false,false> (&gradients[1*npts],values,my_phi); | |
reduce<X,n,false,false> (&gradients[2*npts],values,my_phi); | |
__syncthreads(); | |
// reduce along y / j / r - direction | |
reduce<Y,n,false,true> (&gradients[0],&gradients[0],my_phi); | |
reduce<Y,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_dphi); | |
reduce<Y,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_phi); | |
__syncthreads(); | |
// reduce along z / k / s - direction | |
reduce<Z,n,false,true> (&gradients[0],&gradients[0],my_phi); | |
reduce<Z,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_phi); | |
reduce<Z,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_dphi); | |
__syncthreads(); | |
// now we should have values at quadrature points | |
// no synch is necessary since we are only working on local data. | |
} | |
__device__ void integrate(Number *__restrict__ gradients, Number *__restrict__ values) | |
{ | |
/////////////////////////////////////////////////////////////// | |
// Stage transpose of PHI and DPHI in registers: | |
Number my_phi[n]; | |
Number my_dphi[n]; | |
#pragma unroll | |
for(int i = 0; i < n; i++) { | |
my_phi[i] = shape_values[threadIdx.x*n + i]; | |
my_dphi[i] = shape_gradient[threadIdx.x*n + i]; | |
} | |
/////////////////////////////////////////////////////////////// | |
__syncthreads(); | |
reduce<X,n,false,true> (&gradients[0],&gradients[0],my_dphi); | |
reduce<X,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_phi); | |
reduce<X,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_phi); | |
__syncthreads(); | |
// reduce along y / j / r - direction | |
reduce<Y,n,false,true> (&gradients[0],&gradients[0],my_phi); | |
reduce<Y,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_dphi); | |
reduce<Y,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_phi); | |
__syncthreads(); | |
// reduce along z / k / s - direction | |
reduce<Z,n,false,false> (values,&gradients[0],my_phi); | |
__syncthreads(); | |
reduce<Z,n,true,false> (values,&gradients[1*npts],my_phi); | |
__syncthreads(); | |
reduce<Z,n,true,false> (values,&gradients[2*npts],my_dphi); | |
__syncthreads(); | |
// no synch is necessary since we are only working on local data. | |
} | |
}; | |
template <int dim, unsigned int n, typename Number> | |
__global__ void kernel_grad(Number *__restrict__ dst, const Number *__restrict__ src, | |
const unsigned int *__restrict__ loc2glob, | |
const Number *__restrict__ coeff, const Number *__restrict__ jac, | |
const Number *__restrict__ jxw, const unsigned int n_cells) | |
{ | |
const unsigned int nqpts=n*n*n; | |
__shared__ Number values[nqpts]; | |
__shared__ Number gradients[3*nqpts]; | |
const unsigned int cell = blockIdx.x+gridDim.x*blockIdx.y; | |
if(cell < n_cells) { | |
// USER CODE BEGIN HERE | |
Phi<dim,n,Number> phi(n_cells); | |
phi.read_values(values,src,loc2glob); | |
phi.evaluate(gradients,values); | |
GpuArray<dim,Number> grad; | |
phi.get_grad(grad,gradients,jac); | |
grad *= coeff[phi.global_q()]; | |
phi.submit_grad(gradients,grad,jac,jxw); | |
phi.integrate(gradients,values); | |
phi.write_values(dst,values,loc2glob); | |
// END USER CODE | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define ELEM_DEGREE 4 | |
enum Direction { X, Y, Z}; | |
enum Transpose { TR, NOTR}; | |
template <Direction dir, unsigned int n, | |
bool add, bool inplace,typename Number> | |
__device__ void reduce(Number * __restrict__ dst, const Number * __restrict__ src, const Number *myphi) | |
{ | |
const unsigned int q = threadIdx.x; // new index | |
const unsigned int i = threadIdx.y; // two unchanged | |
const unsigned int j = threadIdx.z%n; // indices | |
Number tmp = 0; | |
#pragma unroll | |
for(int k = 0; k < n; ++k) { | |
const unsigned int srcidx = | |
(dir==X) ? (k + n*(i + n*j)) | |
: (dir==Y) ? (i + n*(k + n*j)) | |
: (i + n*(j + n*k)); | |
tmp += myphi[k] * (inplace ? dst[srcidx] : src[srcidx]); | |
} | |
if(inplace) __syncthreads(); | |
const unsigned int dstidx = | |
(dir==X) ? (q + n*(i + n*j)) | |
: (dir==Y) ? (i + n*(q + n*j)) | |
: (i + n*(j + n*q)); | |
if(add) | |
dst[dstidx] += tmp; | |
else | |
dst[dstidx] = tmp; | |
} | |
template <int dim, int n, typename Number> | |
class Phi { | |
typedef typename MatrixFreeGpu<dim,Number>::GpuData GData; | |
const unsigned int * __restrict__ loc2glob; | |
const Number * __restrict__ jxw; | |
const Number * __restrict__ jac; | |
Number *__restrict__ values; | |
Number *__restrict__ gradients; | |
const static int npts=n*n*n; | |
const int tid = threadIdx.x+n*(threadIdx.y + n*threadIdx.z); | |
const unsigned int cell; | |
const int ncells; | |
Number my_phi[n]; | |
Number my_dphi[n]; | |
public: | |
__device__ Phi(Number * __restrict__ v, Number * __restrict__ g,const GData *data, unsigned int c) | |
: values(v),gradients(g), | |
loc2glob(data->loc2glob+data->rowlength*c), | |
jxw(data->JxW+data->rowlength*c), | |
jac(data->inv_jac+data->rowlength*c), | |
cell(c), | |
ncells(data->n_cells) | |
{ } | |
__device__ unsigned int global_q() const { return ROWLENGTH*cell+tid; } | |
__device__ void read_values(const Number *__restrict__ src) | |
{ | |
values[tid] = __ldg(&src[loc2glob[tid]]); | |
__syncthreads(); | |
} | |
__device__ void write_values(Number *__restrict__ dst) const | |
{ | |
dst[loc2glob[tid]] += values[tid]; | |
} | |
__device__ GpuArray<dim,Number> get_grad() const | |
{ | |
GpuArray<dim,Number> grad; | |
#pragma unroll | |
for(int d1=0; d1<dim; d1++) { | |
Number tmp = 0; | |
#pragma unroll | |
for(int d2=0; d2<dim; d2++) { | |
tmp += jac[((dim*d2+d1)*ncells)*ROWLENGTH+tid]*gradients[d2*npts+tid]; | |
} | |
grad[d1] = tmp; | |
} | |
return grad; | |
} | |
__device__ void submit_grad(const GpuArray<dim,Number> &grad) | |
{ | |
#pragma unroll | |
for(int d1=0; d1<dim; d1++) { | |
Number tmp = 0; | |
#pragma unroll | |
for(int d2=0; d2<dim; d2++) { | |
tmp += jac[((dim*d1+d2)*ncells)*ROWLENGTH+tid]*grad[d2]; | |
} | |
gradients[d1*npts+tid] = tmp*jxw[tid]; | |
} | |
__syncthreads(); | |
} | |
__device__ void evaluate() | |
{ | |
/////////////////////////////////////////////////////////////// | |
// Stage PHI and DPHI in registers: | |
#pragma unroll | |
for(int i = 0; i < n; i++) { | |
my_phi[i] = shape_values[threadIdx.x + n*i]; | |
my_dphi[i] = shape_gradient[threadIdx.x + n*i]; | |
} | |
/////////////////////////////////////////////////////////////// | |
// reduce along x / i / q - direction | |
reduce<X,n,false,false> (&gradients[0],values,my_dphi); | |
reduce<X,n,false,false> (&gradients[1*npts],values,my_phi); | |
reduce<X,n,false,false> (&gradients[2*npts],values,my_phi); | |
__syncthreads(); | |
// reduce along y / j / r - direction | |
reduce<Y,n,false,true> (&gradients[0],&gradients[0],my_phi); | |
reduce<Y,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_dphi); | |
reduce<Y,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_phi); | |
__syncthreads(); | |
// reduce along z / k / s - direction | |
reduce<Z,n,false,true> (&gradients[0],&gradients[0],my_phi); | |
reduce<Z,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_phi); | |
reduce<Z,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_dphi); | |
__syncthreads(); | |
// now we should have values at quadrature points | |
// no synch is necessary since we are only working on local data. | |
} | |
__device__ void integrate() | |
{ | |
/////////////////////////////////////////////////////////////// | |
// Stage transpose of PHI and DPHI in registers: | |
#pragma unroll | |
for(int i = 0; i < n; i++) { | |
my_phi[i] = shape_values[threadIdx.x*n + i]; | |
my_dphi[i] = shape_gradient[threadIdx.x*n + i]; | |
} | |
/////////////////////////////////////////////////////////////// | |
__syncthreads(); | |
reduce<X,n,false,true> (&gradients[0],&gradients[0],my_dphi); | |
reduce<X,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_phi); | |
reduce<X,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_phi); | |
__syncthreads(); | |
// reduce along y / j / r - direction | |
reduce<Y,n,false,true> (&gradients[0],&gradients[0],my_phi); | |
reduce<Y,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_dphi); | |
reduce<Y,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_phi); | |
__syncthreads(); | |
// reduce along z / k / s - direction | |
reduce<Z,n,false,false> (values,&gradients[0],my_phi); | |
__syncthreads(); | |
reduce<Z,n,true,false> (values,&gradients[1*npts],my_phi); | |
__syncthreads(); | |
reduce<Z,n,true,false> (values,&gradients[2*npts],my_dphi); | |
__syncthreads(); | |
// no synch is necessary since we are only working on local data. | |
} | |
}; | |
template <int dim, unsigned int n, typename Number> | |
__global__ void kernel_grad(Number *__restrict__ dst, const Number *__restrict__ src, | |
const typename MatrixFreeGpu<dim,Number>::GpuData gpu_data, | |
const Number *__restrict__ coeff) | |
{ | |
const unsigned int nqpts=n*n*n; | |
__shared__ Number values[nqpts]; | |
__shared__ Number gradients[3*nqpts]; | |
const unsigned int cell = blockIdx.x+gridDim.x*blockIdx.y; | |
if(cell < gpu_data.n_cells) { | |
// USER CODE BEGIN HERE | |
Phi<dim,n,Number> phi(values,gradients,&gpu_data,cell); | |
phi.read_values(src); | |
phi.evaluate(); | |
phi.submit_grad((coeff[phi.global_q()] * | |
phi.get_grad())); | |
phi.integrate(); | |
phi.write_values(dst); | |
// END USER CODE | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment