kalj · July 1, 2016 15:32
diff --git a/46.cu b/46.cu
 #define ELEM_DEGREE 4


 enum Direction { X, Y, Z};
 enum Transpose { TR, NOTR};

 template <Direction dir, unsigned int n,
          bool add, bool inplace,typename Number>
 __device__ void reduce(Number * __restrict__ dst, const Number * __restrict__ src, const Number *myphi)
 {
  const unsigned int q = threadIdx.x; // new index
  const unsigned int i = threadIdx.y; // two unchanged
  const unsigned int j = threadIdx.z%n; // indices

  Number tmp = 0;

 #pragma unroll
  for(int k = 0; k < n; ++k) {

    const unsigned int srcidx =
        (dir==X) ? (k + n*(i + n*j))
        : (dir==Y) ? (i + n*(k + n*j))
        : (i + n*(j + n*k));

      tmp += myphi[k] * (inplace ? dst[srcidx] : src[srcidx]);
  }

  if(inplace) __syncthreads();

  const unsigned int dstidx =
      (dir==X) ? (q + n*(i + n*j))
      : (dir==Y) ? (i + n*(q + n*j))
      : (i + n*(j + n*q));

  if(add)
    dst[dstidx] += tmp;
  else
    dst[dstidx] = tmp;
 }

 template <int dim, int n, typename Number>
 class Phi {
  const static int npts=n*n*n;
  const int tid = threadIdx.x+n*(threadIdx.y + n*threadIdx.z);
  const unsigned int cell = blockIdx.x+gridDim.x*blockIdx.y;
  const int ncells;
 public:
  __device__ Phi(int nc)
    : ncells(nc) { }

  __device__ unsigned int global_q() const { return ROWLENGTH*cell+tid; }

  __device__ void read_values(Number *__restrict__ values, const Number *__restrict__ src,
                              const unsigned int * __restrict__ loc2glob)
  {
    values[tid] = __ldg(&src[loc2glob[ROWLENGTH*cell+tid]]);
    __syncthreads();
  }

  __device__ void write_values(Number *__restrict__ dst, const Number *__restrict__ values,
                               const unsigned int * __restrict__ loc2glob)
  {
    dst[loc2glob[cell*ROWLENGTH+tid]] += values[tid];
  }

  __device__ void get_grad(GpuArray<dim,Number> &grad, const Number *__restrict__ gradients,
                                           const Number * __restrict__ jac) const
  {

 #pragma unroll
    for(int d1=0; d1<dim; d1++) {
      Number tmp = 0;
 #pragma unroll
      for(int d2=0; d2<dim; d2++) {
        tmp += jac[((dim*d2+d1)*ncells+cell)*ROWLENGTH+tid]*gradients[d2*npts+tid];
      }
      grad[d1] = tmp;
    }
  }

  __device__ void submit_grad(Number *__restrict__ gradients, const GpuArray<dim,Number> &grad,
                              const Number * __restrict__ jac,
                              const Number * __restrict__ jxw) const
  {

 #pragma unroll
    for(int d1=0; d1<dim; d1++) {
      Number tmp = 0;
 #pragma unroll
      for(int d2=0; d2<dim; d2++) {
        tmp += jac[((dim*d1+d2)*ncells+cell)*ROWLENGTH+tid]*grad[d2];
      }
      gradients[d1*npts+tid] = tmp*jxw[cell*ROWLENGTH+tid];
    }

    __syncthreads();
  }

  __device__ void evaluate(Number *__restrict__ gradients, const Number *__restrict__ values)
  {
    ///////////////////////////////////////////////////////////////
    // Stage PHI and DPHI in registers:
    Number my_phi[n];
    Number my_dphi[n];
 #pragma unroll
    for(int i = 0; i < n; i++) {
      my_phi[i] = shape_values[threadIdx.x + n*i];
      my_dphi[i] = shape_gradient[threadIdx.x + n*i];
    }
    ///////////////////////////////////////////////////////////////

    // reduce along x / i / q - direction
    reduce<X,n,false,false> (&gradients[0],values,my_dphi);
    reduce<X,n,false,false> (&gradients[1*npts],values,my_phi);
    reduce<X,n,false,false> (&gradients[2*npts],values,my_phi);
    __syncthreads();

    // reduce along y / j / r - direction
    reduce<Y,n,false,true> (&gradients[0],&gradients[0],my_phi);
    reduce<Y,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_dphi);
    reduce<Y,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_phi);
    __syncthreads();

    // reduce along z / k / s - direction
    reduce<Z,n,false,true> (&gradients[0],&gradients[0],my_phi);
    reduce<Z,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_phi);
    reduce<Z,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_dphi);

    __syncthreads();
    // now we should have values at quadrature points
    // no synch is necessary since we are only working on local data.

  }

  __device__ void integrate(Number *__restrict__ gradients, Number *__restrict__ values)
  {
    ///////////////////////////////////////////////////////////////
    // Stage transpose of PHI and DPHI in registers:
    Number my_phi[n];
    Number my_dphi[n];
 #pragma unroll
    for(int i = 0; i < n; i++) {
      my_phi[i] = shape_values[threadIdx.x*n + i];
      my_dphi[i] = shape_gradient[threadIdx.x*n + i];
    }
    ///////////////////////////////////////////////////////////////

    __syncthreads();


    reduce<X,n,false,true>  (&gradients[0],&gradients[0],my_dphi);
    reduce<X,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_phi);
    reduce<X,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_phi);
    __syncthreads();

    // reduce along y / j / r - direction
    reduce<Y,n,false,true> (&gradients[0],&gradients[0],my_phi);
    reduce<Y,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_dphi);
    reduce<Y,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_phi);
    __syncthreads();

    // reduce along z / k / s - direction
    reduce<Z,n,false,false> (values,&gradients[0],my_phi);
    __syncthreads();
    reduce<Z,n,true,false>  (values,&gradients[1*npts],my_phi);
    __syncthreads();
    reduce<Z,n,true,false>  (values,&gradients[2*npts],my_dphi);

    __syncthreads();
    // no synch is necessary since we are only working on local data.

  }
 };

 template <int dim, unsigned int n, typename Number>
 __global__ void kernel_grad(Number *__restrict__ dst, const Number *__restrict__ src,
                            const unsigned int *__restrict__ loc2glob,
                            const Number *__restrict__ coeff, const Number *__restrict__ jac,
                            const Number *__restrict__ jxw, const unsigned int n_cells)
 {
  const unsigned int nqpts=n*n*n;

  __shared__ Number values[nqpts];
  __shared__ Number gradients[3*nqpts];
  const unsigned int cell = blockIdx.x+gridDim.x*blockIdx.y;
  if(cell < n_cells) {

    // USER CODE BEGIN HERE

    Phi<dim,n,Number> phi(n_cells);

    phi.read_values(values,src,loc2glob);

    phi.evaluate(gradients,values);

    GpuArray<dim,Number> grad;
    phi.get_grad(grad,gradients,jac);
    grad *= coeff[phi.global_q()];
    phi.submit_grad(gradients,grad,jac,jxw);

    phi.integrate(gradients,values);

    phi.write_values(dst,values,loc2glob);
    // END USER CODE
  }

 }
diff --git a/56.cu b/56.cu
 #define ELEM_DEGREE 4


 enum Direction { X, Y, Z};
 enum Transpose { TR, NOTR};

 template <Direction dir, unsigned int n,
          bool add, bool inplace,typename Number>
 __device__ void reduce(Number * __restrict__ dst, const Number * __restrict__ src, const Number *myphi)
 {
  const unsigned int q = threadIdx.x; // new index
  const unsigned int i = threadIdx.y; // two unchanged
  const unsigned int j = threadIdx.z%n; // indices

  Number tmp = 0;

 #pragma unroll
  for(int k = 0; k < n; ++k) {

    const unsigned int srcidx =
        (dir==X) ? (k + n*(i + n*j))
        : (dir==Y) ? (i + n*(k + n*j))
        : (i + n*(j + n*k));

      tmp += myphi[k] * (inplace ? dst[srcidx] : src[srcidx]);
  }

  if(inplace) __syncthreads();

  const unsigned int dstidx =
      (dir==X) ? (q + n*(i + n*j))
      : (dir==Y) ? (i + n*(q + n*j))
      : (i + n*(j + n*q));

  if(add)
    dst[dstidx] += tmp;
  else
    dst[dstidx] = tmp;
 }

 template <int dim, int n, typename Number>
 class Phi {
  typedef typename MatrixFreeGpu<dim,Number>::GpuData GData;
  const unsigned int  * __restrict__ loc2glob;
  const Number  * __restrict__ jxw;
  const Number  * __restrict__ jac;
  Number *__restrict__ values;
  Number *__restrict__ gradients;
  const static int npts=n*n*n;
  const int tid = threadIdx.x+n*(threadIdx.y + n*threadIdx.z);
  const unsigned int cell;
  const int ncells;
  Number my_phi[n];
  Number my_dphi[n];
 public:

  __device__ Phi(Number * __restrict__ v, Number * __restrict__ g,const GData *data, unsigned int c)
    : values(v),gradients(g),
      loc2glob(data->loc2glob+data->rowlength*c),
      jxw(data->JxW+data->rowlength*c),
      jac(data->inv_jac+data->rowlength*c),
      cell(c),
      ncells(data->n_cells)
  { }

  __device__ unsigned int global_q() const { return ROWLENGTH*cell+tid; }

  __device__ void read_values(const Number *__restrict__ src)
  {
    values[tid] = __ldg(&src[loc2glob[tid]]);
    __syncthreads();
  }

  __device__ void write_values(Number *__restrict__ dst) const
  {
    dst[loc2glob[tid]] += values[tid];
  }

  __device__ GpuArray<dim,Number> get_grad() const
  {
    GpuArray<dim,Number> grad;

 #pragma unroll
    for(int d1=0; d1<dim; d1++) {
      Number tmp = 0;
 #pragma unroll
      for(int d2=0; d2<dim; d2++) {
        tmp += jac[((dim*d2+d1)*ncells)*ROWLENGTH+tid]*gradients[d2*npts+tid];
      }
      grad[d1] = tmp;
    }
    return grad;
  }

  __device__ void submit_grad(const GpuArray<dim,Number> &grad)
  {

 #pragma unroll
    for(int d1=0; d1<dim; d1++) {
      Number tmp = 0;
 #pragma unroll
      for(int d2=0; d2<dim; d2++) {
        tmp += jac[((dim*d1+d2)*ncells)*ROWLENGTH+tid]*grad[d2];
      }
      gradients[d1*npts+tid] = tmp*jxw[tid];
    }

    __syncthreads();
  }

  __device__ void evaluate()
  {
    ///////////////////////////////////////////////////////////////
    // Stage PHI and DPHI in registers:
 #pragma unroll
    for(int i = 0; i < n; i++) {
      my_phi[i] = shape_values[threadIdx.x + n*i];
      my_dphi[i] = shape_gradient[threadIdx.x + n*i];
    }
    ///////////////////////////////////////////////////////////////

    // reduce along x / i / q - direction
    reduce<X,n,false,false> (&gradients[0],values,my_dphi);
    reduce<X,n,false,false> (&gradients[1*npts],values,my_phi);
    reduce<X,n,false,false> (&gradients[2*npts],values,my_phi);
    __syncthreads();

    // reduce along y / j / r - direction
    reduce<Y,n,false,true> (&gradients[0],&gradients[0],my_phi);
    reduce<Y,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_dphi);
    reduce<Y,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_phi);
    __syncthreads();

    // reduce along z / k / s - direction
    reduce<Z,n,false,true> (&gradients[0],&gradients[0],my_phi);
    reduce<Z,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_phi);
    reduce<Z,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_dphi);

    __syncthreads();
    // now we should have values at quadrature points
    // no synch is necessary since we are only working on local data.

  }

  __device__ void integrate()
  {
    ///////////////////////////////////////////////////////////////
    // Stage transpose of PHI and DPHI in registers:
 #pragma unroll
    for(int i = 0; i < n; i++) {
      my_phi[i] = shape_values[threadIdx.x*n + i];
      my_dphi[i] = shape_gradient[threadIdx.x*n + i];
    }
    ///////////////////////////////////////////////////////////////

    __syncthreads();


    reduce<X,n,false,true>  (&gradients[0],&gradients[0],my_dphi);
    reduce<X,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_phi);
    reduce<X,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_phi);
    __syncthreads();

    // reduce along y / j / r - direction
    reduce<Y,n,false,true> (&gradients[0],&gradients[0],my_phi);
    reduce<Y,n,false,true> (&gradients[1*npts],&gradients[1*npts],my_dphi);
    reduce<Y,n,false,true> (&gradients[2*npts],&gradients[2*npts],my_phi);
    __syncthreads();

    // reduce along z / k / s - direction
    reduce<Z,n,false,false> (values,&gradients[0],my_phi);
    __syncthreads();
    reduce<Z,n,true,false>  (values,&gradients[1*npts],my_phi);
    __syncthreads();
    reduce<Z,n,true,false>  (values,&gradients[2*npts],my_dphi);

    __syncthreads();
    // no synch is necessary since we are only working on local data.

  }
 };

 template <int dim, unsigned int n, typename Number>
 __global__ void kernel_grad(Number *__restrict__ dst, const Number *__restrict__ src,
                            const typename MatrixFreeGpu<dim,Number>::GpuData gpu_data,
                            const Number *__restrict__ coeff)
 {
  const unsigned int nqpts=n*n*n;

  __shared__ Number values[nqpts];
  __shared__ Number gradients[3*nqpts];
  const unsigned int cell = blockIdx.x+gridDim.x*blockIdx.y;

  if(cell < gpu_data.n_cells) {
    // USER CODE BEGIN HERE
    Phi<dim,n,Number> phi(values,gradients,&gpu_data,cell);

    phi.read_values(src);

    phi.evaluate();

    phi.submit_grad((coeff[phi.global_q()] *
                     phi.get_grad()));

    phi.integrate();

    phi.write_values(dst);
    // END USER CODE
  }

 }
	#define ELEM_DEGREE 4


	enum Direction { X, Y, Z};
	enum Transpose { TR, NOTR};

	template <Direction dir, unsigned int n,
	bool add, bool inplace,typename Number>
	__device__ void reduce(Number * __restrict__ dst, const Number * __restrict__ src, const Number *myphi)
	{
	const unsigned int q = threadIdx.x; // new index
	const unsigned int i = threadIdx.y; // two unchanged
	const unsigned int j = threadIdx.z%n; // indices

	Number tmp = 0;

	#pragma unroll
	for(int k = 0; k < n; ++k) {

	const unsigned int srcidx =
	(dir==X) ? (k + n(i + nj))
	: (dir==Y) ? (i + n(k + nj))
	: (i + n(j + nk));

	tmp += myphi[k] * (inplace ? dst[srcidx] : src[srcidx]);
	}

	if(inplace) __syncthreads();

	const unsigned int dstidx =
	(dir==X) ? (q + n(i + nj))
	: (dir==Y) ? (i + n(q + nj))
	: (i + n(j + nq));

	if(add)
	dst[dstidx] += tmp;
	else
	dst[dstidx] = tmp;
	}

	template <int dim, int n, typename Number>
	class Phi {
	const static int npts=nnn;
	const int tid = threadIdx.x+n(threadIdx.y + nthreadIdx.z);
	const unsigned int cell = blockIdx.x+gridDim.x*blockIdx.y;
	const int ncells;
	public:
	__device__ Phi(int nc)
	: ncells(nc) { }

	__device__ unsigned int global_q() const { return ROWLENGTH*cell+tid; }

	__device__ void read_values(Number __restrict__ values, const Number __restrict__ src,
	const unsigned int * __restrict__ loc2glob)
	{
	values[tid] = __ldg(&src[loc2glob[ROWLENGTH*cell+tid]]);
	__syncthreads();
	}

	__device__ void write_values(Number __restrict__ dst, const Number __restrict__ values,
	const unsigned int * __restrict__ loc2glob)
	{
	dst[loc2glob[cell*ROWLENGTH+tid]] += values[tid];
	}

	__device__ void get_grad(GpuArray<dim,Number> &grad, const Number *__restrict__ gradients,
	const Number * __restrict__ jac) const
	{

	#pragma unroll
	for(int d1=0; d1<dim; d1++) {
	Number tmp = 0;
	#pragma unroll
	for(int d2=0; d2<dim; d2++) {
	tmp += jac[((dimd2+d1)ncells+cell)ROWLENGTH+tid]gradients[d2*npts+tid];
	}
	grad[d1] = tmp;
	}
	}

	__device__ void submit_grad(Number *__restrict__ gradients, const GpuArray<dim,Number> &grad,
	const Number * __restrict__ jac,
	const Number * __restrict__ jxw) const
	{

	#pragma unroll
	for(int d1=0; d1<dim; d1++) {
	Number tmp = 0;
	#pragma unroll
	for(int d2=0; d2<dim; d2++) {
	tmp += jac[((dimd1+d2)ncells+cell)ROWLENGTH+tid]grad[d2];
	}
	gradients[d1npts+tid] = tmpjxw[cell*ROWLENGTH+tid];
	}

	__syncthreads();
	}

	__device__ void evaluate(Number __restrict__ gradients, const Number __restrict__ values)
	{
	///////////////////////////////////////////////////////////////
	// Stage PHI and DPHI in registers:
	Number my_phi[n];
	Number my_dphi[n];
	#pragma unroll
	for(int i = 0; i < n; i++) {
	my_phi[i] = shape_values[threadIdx.x + n*i];
	my_dphi[i] = shape_gradient[threadIdx.x + n*i];
	}
	///////////////////////////////////////////////////////////////

	// reduce along x / i / q - direction
	reduce<X,n,false,false> (&gradients[0],values,my_dphi);
	reduce<X,n,false,false> (&gradients[1*npts],values,my_phi);
	reduce<X,n,false,false> (&gradients[2*npts],values,my_phi);
	__syncthreads();

	// reduce along y / j / r - direction
	reduce<Y,n,false,true> (&gradients[0],&gradients[0],my_phi);
	reduce<Y,n,false,true> (&gradients[1npts],&gradients[1npts],my_dphi);
	reduce<Y,n,false,true> (&gradients[2npts],&gradients[2npts],my_phi);
	__syncthreads();

	// reduce along z / k / s - direction
	reduce<Z,n,false,true> (&gradients[0],&gradients[0],my_phi);
	reduce<Z,n,false,true> (&gradients[1npts],&gradients[1npts],my_phi);
	reduce<Z,n,false,true> (&gradients[2npts],&gradients[2npts],my_dphi);

	__syncthreads();
	// now we should have values at quadrature points
	// no synch is necessary since we are only working on local data.

	}

	__device__ void integrate(Number __restrict__ gradients, Number __restrict__ values)
	{
	///////////////////////////////////////////////////////////////
	// Stage transpose of PHI and DPHI in registers:
	Number my_phi[n];
	Number my_dphi[n];
	#pragma unroll
	for(int i = 0; i < n; i++) {
	my_phi[i] = shape_values[threadIdx.x*n + i];
	my_dphi[i] = shape_gradient[threadIdx.x*n + i];
	}
	///////////////////////////////////////////////////////////////

	__syncthreads();


	reduce<X,n,false,true> (&gradients[0],&gradients[0],my_dphi);
	reduce<X,n,false,true> (&gradients[1npts],&gradients[1npts],my_phi);
	reduce<X,n,false,true> (&gradients[2npts],&gradients[2npts],my_phi);
	__syncthreads();

	// reduce along y / j / r - direction
	reduce<Y,n,false,true> (&gradients[0],&gradients[0],my_phi);
	reduce<Y,n,false,true> (&gradients[1npts],&gradients[1npts],my_dphi);
	reduce<Y,n,false,true> (&gradients[2npts],&gradients[2npts],my_phi);
	__syncthreads();

	// reduce along z / k / s - direction
	reduce<Z,n,false,false> (values,&gradients[0],my_phi);
	__syncthreads();
	reduce<Z,n,true,false> (values,&gradients[1*npts],my_phi);
	__syncthreads();
	reduce<Z,n,true,false> (values,&gradients[2*npts],my_dphi);

	__syncthreads();
	// no synch is necessary since we are only working on local data.

	}
	};

	template <int dim, unsigned int n, typename Number>
	__global__ void kernel_grad(Number __restrict__ dst, const Number __restrict__ src,
	const unsigned int *__restrict__ loc2glob,
	const Number __restrict__ coeff, const Number __restrict__ jac,
	const Number *__restrict__ jxw, const unsigned int n_cells)
	{
	const unsigned int nqpts=nnn;

	__shared__ Number values[nqpts];
	__shared__ Number gradients[3*nqpts];
	const unsigned int cell = blockIdx.x+gridDim.x*blockIdx.y;
	if(cell < n_cells) {

	// USER CODE BEGIN HERE

	Phi<dim,n,Number> phi(n_cells);

	phi.read_values(values,src,loc2glob);

	phi.evaluate(gradients,values);

	GpuArray<dim,Number> grad;
	phi.get_grad(grad,gradients,jac);
	grad *= coeff[phi.global_q()];
	phi.submit_grad(gradients,grad,jac,jxw);

	phi.integrate(gradients,values);

	phi.write_values(dst,values,loc2glob);
	// END USER CODE
	}

	}