Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save cmdr2/57127084280cf19174ca9ae053322fc5 to your computer and use it in GitHub Desktop.
Save cmdr2/57127084280cf19174ca9ae053322fc5 to your computer and use it in GitHub Desktop.
#include <cuda_runtime.h>

#include <chrono>
#include <cstdlib>
#include <iostream>
#include <vector>
// Abort with file/line context and the CUDA error string if a runtime call
// fails. Wrapped in do/while(0) so the macro behaves as a single statement
// (safe inside unbraced if/else); evaluates `call` exactly once.
#define CHECK_CUDA(call) \
do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \
                  << ": " << cudaGetErrorString(err_) << std::endl; \
        std::exit(1); \
    } \
} while (0)
// Problem configuration (compile-time so the kernel's shared-memory tile
// dimensions can be computed as constants and loops can be unrolled).
constexpr int BATCH = 1;
constexpr int CHANNELS = 1;
constexpr int H = 512;          // input height
constexpr int W = 512;          // input width
constexpr int K = 3; // Kernel size
constexpr int STRIDE_Y = 1;     // convolution stride (rows)
constexpr int STRIDE_X = 1;     // convolution stride (cols)
constexpr int PAD_Y = 0;        // zero-padding (rows)
constexpr int PAD_X = 0;        // zero-padding (cols)
constexpr int DIL_Y = 1;        // kernel dilation (rows)
constexpr int DIL_X = 1;        // kernel dilation (cols)
// Thread-block shape; each block computes a BLOCK_DIM_Y x BLOCK_DIM_X
// tile of the output.
constexpr int BLOCK_DIM_Y = 16;
constexpr int BLOCK_DIM_X = 16;
// Direct 2-D convolution using a shared-memory input tile. Each z-slice of
// the grid handles one (batch, channel) plane; each channel is convolved
// with its own K x K filter (no cross-channel accumulation).
//
// Expected launch configuration:
//   block = (BLOCK_DIM_X, BLOCK_DIM_Y)
//   grid  = (ceil(outW/BLOCK_DIM_X), ceil(outH/BLOCK_DIM_Y), BATCH*CHANNELS)
//   dynamic shared memory = sharedH * sharedW * sizeof(float)  (see below)
//
// Precondition: the runtime parameters kSize/strideY/strideX/dilY/dilX must
// equal the compile-time K/STRIDE_Y/STRIDE_X/DIL_Y/DIL_X — the shared tile
// is sized from the constants so the host can compute the allocation.
__global__ void conv2d_general_kernel(
const float* __restrict__ input,
const float* __restrict__ kernel,
float* __restrict__ output,
int H, int W,
int outH, int outW,
int kSize, int strideY, int strideX,
int padY, int padX,
int dilY, int dilX)
{
const int b = blockIdx.z / CHANNELS;
const int c = blockIdx.z % CHANNELS;
const int out_y = blockIdx.y * blockDim.y + threadIdx.y;
const int out_x = blockIdx.x * blockDim.x + threadIdx.x;
// Input-space footprint of one output tile. The (BLOCK_DIM-1)*STRIDE term
// is what makes this correct for strides > 1: consecutive output columns
// are STRIDE apart in input space, plus the dilated kernel extent.
// (For stride 1 this reduces to BLOCK_DIM + (K-1)*DIL, matching the host's
// original allocation.)
constexpr int sharedH = (BLOCK_DIM_Y - 1) * STRIDE_Y + (K - 1) * DIL_Y + 1;
constexpr int sharedW = (BLOCK_DIM_X - 1) * STRIDE_X + (K - 1) * DIL_X + 1;
// Top-left input coordinate covered by this block's tile. Indexing the tile
// relative to the BLOCK origin — not each thread's own in_y/in_x — is what
// keeps the cooperative load and the compute phase consistent when the
// stride is not 1 (with per-thread offsets, a thread at stride 2 would read
// slots loaded from the wrong input rows).
const int tile_origin_y = (blockIdx.y * blockDim.y) * strideY - padY;
const int tile_origin_x = (blockIdx.x * blockDim.x) * strideX - padX;
extern __shared__ float tile[];
// Zero-padded read of the (b, c) input plane.
auto get_input = [&](int y, int x) -> float {
    return (y >= 0 && y < H && x >= 0 && x < W)
        ? input[b * CHANNELS * H * W + c * H * W + y * W + x]
        : 0.0f;
};
// Cooperative load: the block sweeps the tile in blockDim-sized steps so
// each tile element is written exactly once. All threads participate —
// including those past the output edge — because the __syncthreads() below
// must be reached by the whole block.
for (int s_y = threadIdx.y; s_y < sharedH; s_y += BLOCK_DIM_Y) {
    for (int s_x = threadIdx.x; s_x < sharedW; s_x += BLOCK_DIM_X) {
        tile[s_y * sharedW + s_x] =
            get_input(tile_origin_y + s_y, tile_origin_x + s_x);
    }
}
__syncthreads();
// Out-of-range threads exit only after the barrier.
if (out_y >= outH || out_x >= outW) return;
// Accumulate the dilated kernel window from shared memory. A thread's
// window starts at its output coordinate scaled back into tile space.
float sum = 0.0f;
for (int ky = 0; ky < kSize; ++ky) {
    for (int kx = 0; kx < kSize; ++kx) {
        const int s_y = threadIdx.y * strideY + ky * dilY;
        const int s_x = threadIdx.x * strideX + kx * dilX;
        sum += tile[s_y * sharedW + s_x]
             * kernel[c * kSize * kSize + ky * kSize + kx];
    }
}
output[b * CHANNELS * outH * outW + c * outH * outW + out_y * outW + out_x] = sum;
}
// Host driver: fills an input plane and a per-channel filter with synthetic
// data, launches the tiled convolution kernel 10000 times, and reports the
// elapsed time and device memory usage.
int main() {
    // Standard convolution output-size formula:
    // floor((in + 2*pad - dil*(k-1) - 1) / stride) + 1
    const int outH = (H + 2 * PAD_Y - DIL_Y * (K - 1) - 1) / STRIDE_Y + 1;
    const int outW = (W + 2 * PAD_X - DIL_X * (K - 1) - 1) / STRIDE_X + 1;

    const size_t in_elems = static_cast<size_t>(BATCH) * CHANNELS * H * W;
    const size_t out_elems = static_cast<size_t>(BATCH) * CHANNELS * outH * outW;
    const size_t kernel_elems = static_cast<size_t>(CHANNELS) * K * K;

    // Host buffers: std::vector so every CHECK_CUDA early-exit path frees
    // them automatically (raw new[] leaked on error).
    std::vector<float> in_host(in_elems);
    std::vector<float> out_host(out_elems);
    std::vector<float> kernel_host(kernel_elems);
    for (size_t i = 0; i < in_elems; ++i) in_host[i] = 1.0f * (i + 1);
    for (size_t i = 0; i < kernel_elems; ++i) kernel_host[i] = 10.0f * (i + 1);

    float *in_dev = nullptr, *out_dev = nullptr, *kernel_dev = nullptr;
    CHECK_CUDA(cudaMalloc(&in_dev, in_elems * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&out_dev, out_elems * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&kernel_dev, kernel_elems * sizeof(float)));
    CHECK_CUDA(cudaMemcpy(in_dev, in_host.data(), in_elems * sizeof(float),
                          cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(kernel_dev, kernel_host.data(), kernel_elems * sizeof(float),
                          cudaMemcpyHostToDevice));

    dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y,
              BATCH * CHANNELS);
    // Dynamic shared memory per block: the input footprint of one output
    // tile, (blockSpan-1)*stride + dilated kernel extent + 1 per axis.
    // (Identical to BLOCK + (K-1)*DIL when the stride is 1.)
    const size_t shared_mem =
        static_cast<size_t>((BLOCK_DIM_Y - 1) * STRIDE_Y + (K - 1) * DIL_Y + 1) *
        static_cast<size_t>((BLOCK_DIM_X - 1) * STRIDE_X + (K - 1) * DIL_X + 1) *
        sizeof(float);

    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < 10000; ++i) {
        conv2d_general_kernel<<<grid, block, shared_mem>>>(
            in_dev, kernel_dev, out_dev,
            H, W, outH, outW,
            K, STRIDE_Y, STRIDE_X,
            PAD_Y, PAD_X, DIL_Y, DIL_X);
    }
    // Launch-configuration errors (bad grid/block/shared size) only surface
    // via cudaGetLastError(); execution errors surface at the sync.
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());
    // Stop the clock after the sync but BEFORE the D2H copy, so the report
    // measures kernel time rather than kernel + transfer time.
    auto end = std::chrono::high_resolution_clock::now();
    CHECK_CUDA(cudaMemcpy(out_host.data(), out_dev, out_elems * sizeof(float),
                          cudaMemcpyDeviceToHost));

    double time_ms = std::chrono::duration<double, std::milli>(end - start).count();
    std::cout << "Execution time over 10000 runs: " << time_ms << " ms\n";

    size_t free_mem, total_mem;
    CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    std::cout << "GPU Memory Used: " << (total_mem - free_mem) / (1024.0 * 1024.0) << " MB\n";
    std::cout << "Output ("<<outW<<", "<<outH<<")\n";
    // for (int i = 0; i < outH; ++i) {
    // for (int j = 0; j < outW; ++j) {
    // std::cout << out_host[i * outW + j] << " ";
    // }
    // std::cout << "\n";
    // }

    CHECK_CUDA(cudaFree(in_dev));
    CHECK_CUDA(cudaFree(out_dev));
    CHECK_CUDA(cudaFree(kernel_dev));
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment