Skip to content

Instantly share code, notes, and snippets.

@cmdr2
Created April 8, 2025 08:37
Show Gist options
  • Save cmdr2/80671adadd54112e68a471eb02021528 to your computer and use it in GitHub Desktop.
Save cmdr2/80671adadd54112e68a471eb02021528 to your computer and use it in GitHub Desktop.
#include <cuda_runtime.h>
#include <iostream>
#include <chrono>
// Abort-on-error wrapper for CUDA runtime API calls.
// Reports the failing call site (file:line) plus the CUDA error string,
// then terminates — an unhandled sticky error would make every later
// CUDA call fail mysteriously.
#define CHECK_CUDA(call) \
do { \
cudaError_t err = (call); /* parenthesized: 'call' may be any expression */ \
if (err != cudaSuccess) { \
std::cerr << "CUDA Error at " << __FILE__ << ":" << __LINE__ \
<< ": " << cudaGetErrorString(err) << "\n"; \
exit(EXIT_FAILURE); \
} \
} while (0)
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16
constexpr int BATCH = 1;
constexpr int CHANNELS = 1;
constexpr int HEIGHT = 512;
constexpr int WIDTH = 512;
constexpr int STRIDE = 1;
constexpr int PADDING = 0; // 0 for "valid", 1 for "same" (with a 3x3 kernel)
constexpr int KERNEL_SIZE = 3; // conv2d_kernel hard-codes a 3x3 filter
// Standard convolution output size: (in + 2*pad - kernel) / stride + 1.
// The previous form, (in + 2*pad - 2) / stride, gives the same value for
// stride == 1 but is off by one for some stride/size combinations
// (e.g. HEIGHT=513, STRIDE=3: 170 instead of the correct 171).
constexpr int OUT_H = (HEIGHT + 2 * PADDING - KERNEL_SIZE) / STRIDE + 1;
constexpr int OUT_W = (WIDTH + 2 * PADDING - KERNEL_SIZE) / STRIDE + 1;
// 3x3 convolution over a BATCH x CHANNELS x H x W input (NCHW, zero padding).
// Launch: block = (BLOCK_DIM_X, BLOCK_DIM_Y), grid.z = BATCH * CHANNELS;
// output is the compile-time OUT_H x OUT_W per (batch, channel) plane.
//
// The shared-memory fast path stages a (BLOCK_DIM_Y+2) x (BLOCK_DIM_X+2)
// tile with a 1-pixel halo. That layout is only valid when stride == 1
// (adjacent threads then hold adjacent input pixels); the original code
// used it unconditionally and silently produced wrong results for any
// other stride, so stride != 1 now falls back to direct global reads.
__global__ void conv2d_kernel(const float* __restrict__ input,
const float* __restrict__ kernel,
float* __restrict__ output,
int H, int W, int stride, int pad)
{
const int b = blockIdx.z / CHANNELS;
const int c = blockIdx.z % CHANNELS;
const int out_y = blockIdx.y * blockDim.y + threadIdx.y;
const int out_x = blockIdx.x * blockDim.x + threadIdx.x;
const int in_y = out_y * stride - pad;
const int in_x = out_x * stride - pad;
// Zero-padded read of one pixel from this (batch, channel) plane.
auto get_input = [&](int y, int x) -> float {
return (y >= 0 && y < H && x >= 0 && x < W)
? input[(b * CHANNELS + c) * H * W + y * W + x]
: 0.0f;
};
float sum = 0.0f;
if (stride == 1) {
// Fast path: cooperative tile + halo load. 'stride' is uniform across
// the block, so every thread takes this branch together and the
// __syncthreads() below is safe.
const int tile_y = threadIdx.y + 1;
const int tile_x = threadIdx.x + 1;
__shared__ float tile[BLOCK_DIM_Y + 2][BLOCK_DIM_X + 2];
tile[tile_y][tile_x] = get_input(in_y, in_x);
// Edge threads also fill the halo; corner cells are written by the
// threads sitting on two edges at once.
if (threadIdx.y == 0) {
tile[0][tile_x] = get_input(in_y - 1, in_x);
if (threadIdx.x == 0) tile[0][0] = get_input(in_y - 1, in_x - 1);
if (threadIdx.x == blockDim.x - 1) tile[0][tile_x + 1] = get_input(in_y - 1, in_x + 1);
}
if (threadIdx.y == blockDim.y - 1) {
tile[tile_y + 1][tile_x] = get_input(in_y + 1, in_x);
if (threadIdx.x == 0) tile[tile_y + 1][0] = get_input(in_y + 1, in_x - 1);
if (threadIdx.x == blockDim.x - 1) tile[tile_y + 1][tile_x + 1] = get_input(in_y + 1, in_x + 1);
}
if (threadIdx.x == 0) tile[tile_y][0] = get_input(in_y, in_x - 1);
if (threadIdx.x == blockDim.x - 1) tile[tile_y][tile_x + 1] = get_input(in_y, in_x + 1);
// Barrier before any tile read; kept ahead of the bounds check so all
// threads in the block reach it (out-of-range threads still loaded halo).
__syncthreads();
if (out_y >= OUT_H || out_x >= OUT_W) return;
#pragma unroll
for (int ky = -1; ky <= 1; ++ky)
#pragma unroll
for (int kx = -1; kx <= 1; ++kx)
sum += kernel[(ky + 1) * 3 + (kx + 1)] *
tile[tile_y + ky][tile_x + kx];
} else {
// Strided path: the halo tile layout does not apply; read global memory
// directly (get_input handles the zero padding).
if (out_y >= OUT_H || out_x >= OUT_W) return;
#pragma unroll
for (int ky = -1; ky <= 1; ++ky)
#pragma unroll
for (int kx = -1; kx <= 1; ++kx)
sum += kernel[(ky + 1) * 3 + (kx + 1)] *
get_input(in_y + ky, in_x + kx);
}
output[(b * CHANNELS + c) * OUT_H * OUT_W + out_y * OUT_W + out_x] = sum;
}
// Benchmarks 10000 launches of conv2d_kernel on synthetic data, then
// reports wall-clock time and GPU memory usage.
int main() {
constexpr int IN_SIZE = BATCH * CHANNELS * HEIGHT * WIDTH;
constexpr int OUT_SIZE = BATCH * CHANNELS * OUT_H * OUT_W;
float* h_input = new float[IN_SIZE];
float* h_output = new float[OUT_SIZE];
// 3x3 Laplacian (edge-detection) filter.
float h_kernel[9] = {
0, 1, 0,
1,-4, 1,
0, 1, 0
};
// Deterministic synthetic input in [0, 1).
for (int i = 0; i < IN_SIZE; ++i)
h_input[i] = static_cast<float>(i % 255) / 255.0f;
float *d_input, *d_kernel, *d_output;
CHECK_CUDA(cudaMalloc(&d_input, IN_SIZE * sizeof(float)));
CHECK_CUDA(cudaMalloc(&d_kernel, 9 * sizeof(float)));
CHECK_CUDA(cudaMalloc(&d_output, OUT_SIZE * sizeof(float)));
CHECK_CUDA(cudaMemcpy(d_input, h_input, IN_SIZE * sizeof(float), cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(d_kernel, h_kernel, 9 * sizeof(float), cudaMemcpyHostToDevice));
dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 grid((OUT_W + BLOCK_DIM_X - 1) / BLOCK_DIM_X, // ceil-div grid sizing
(OUT_H + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y,
BATCH * CHANNELS);
using clock = std::chrono::high_resolution_clock;
auto start = clock::now();
for (int i = 0; i < 10000; ++i) {
conv2d_kernel<<<grid, block>>>(
d_input, d_kernel, d_output, HEIGHT, WIDTH, STRIDE, PADDING);
}
// Launch-configuration errors do not surface from <<<>>> itself; the
// original never checked for them and would report a bogus timing.
CHECK_CUDA(cudaGetLastError());
CHECK_CUDA(cudaDeviceSynchronize()); // wait for all queued kernels
auto end = clock::now();
std::chrono::duration<double, std::milli> duration_ms = end - start;
std::cout << "Execution time for 10000 runs with stride " << STRIDE << ": " << duration_ms.count() << " ms\n";
// Copy the result back so h_output is actually populated (it was
// allocated but never filled before) and execution errors surface.
CHECK_CUDA(cudaMemcpy(h_output, d_output, OUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost));
size_t free_mem, total_mem;
CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
std::cout << "GPU memory used: "
<< (total_mem - free_mem) / (1024.0 * 1024.0) << " MB\n";
// Check frees too — a sticky error here would indicate an earlier fault.
CHECK_CUDA(cudaFree(d_input));
CHECK_CUDA(cudaFree(d_kernel));
CHECK_CUDA(cudaFree(d_output));
delete[] h_input;
delete[] h_output;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment