Skip to content

Instantly share code, notes, and snippets.

@cmdr2
Created April 8, 2025 08:39
Show Gist options
  • Save cmdr2/424bc537e4ccf5c66114c2e088345407 to your computer and use it in GitHub Desktop.
#include <cuda_runtime.h>
#include <iostream>
#include <chrono>
// Abort with a diagnostic on any failed CUDA runtime call.
// Reports the failing expression, source location, and the CUDA error string.
// Note: kernel launches do not return cudaError_t — check those separately
// with cudaGetLastError() after the launch.
#define CHECK_CUDA(call) \
do { \
    cudaError_t err_ = (call); /* underscore avoids shadowing at call sites */ \
    if (err_ != cudaSuccess) { \
        std::cerr << "CUDA Error at " << __FILE__ << ":" << __LINE__ \
                  << " in `" #call "`: " << cudaGetErrorString(err_) << "\n"; \
        exit(EXIT_FAILURE); \
    } \
} while (0)
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16
// Problem dimensions. Layout is NCHW; one 3x3 kernel is applied per
// (batch, channel) slice.
constexpr int BATCH = 1;
constexpr int CHANNELS = 1;
constexpr int HEIGHT = 512;
constexpr int WIDTH = 512;
constexpr int KERNEL_SIZE = 3;
constexpr int RADIUS = KERNEL_SIZE / 2;  // halo width on each side
constexpr int STRIDE = 1;
// "Valid" convolution output extent: the kernel writes positions
// y = RADIUS .. HEIGHT-RADIUS-1 sampled every STRIDE, i.e.
// floor((HEIGHT - 2*RADIUS - 1) / STRIDE) + 1 rows. The previous form,
// (HEIGHT - 2) / STRIDE, hard-coded the radius and undercounts for some
// strides (e.g. STRIDE = 4 gives 127 here while the kernel writes 128 rows,
// overflowing the output buffer). For STRIDE = 1 both forms give 510, so
// existing behavior is unchanged.
constexpr int OUT_H = (HEIGHT - 2 * RADIUS - 1) / STRIDE + 1;
constexpr int OUT_W = (WIDTH - 2 * RADIUS - 1) / STRIDE + 1;
// 3x3 "valid" convolution over an NCHW tensor, one (batch, channel) slice per
// blockIdx.z. Each BLOCK_DIM_X x BLOCK_DIM_Y block stages its input tile plus
// a 1-pixel halo in shared memory, then each interior thread computes one
// output pixel. Expects a 2D block of exactly (BLOCK_DIM_X, BLOCK_DIM_Y)
// threads and gridDim.z == BATCH * CHANNELS.
// NOTE(review): despite taking H/W/stride as parameters, the taps are
// hard-coded to 3x3 (ky/kx in -1..1, `kernel[(ky+1)*3 + ...]`) and the output
// extent comes from the compile-time OUT_H/OUT_W — KERNEL_SIZE must be 3 for
// this kernel to be correct.
__global__ void conv2d_kernel_batched(const float* __restrict__ input,
const float* __restrict__ kernel,
float* __restrict__ output,
int H, int W, int stride)
{
// Decode (batch, channel) from the flattened z dimension of the grid.
int b = blockIdx.z / CHANNELS;
int c = blockIdx.z % CHANNELS;
// Global pixel coordinates for this thread (one thread per input pixel).
int y = blockIdx.y * blockDim.y + threadIdx.y;
int x = blockIdx.x * blockDim.x + threadIdx.x;
// Input tile with a 1-pixel halo ring on every side.
__shared__ float tile[BLOCK_DIM_Y + 2][BLOCK_DIM_X + 2];
if (y < H && x < W) {
// +1 offsets skip the halo ring; each thread loads its center pixel,
// and threads on the block's edges/corners additionally load the halo.
int tile_y = threadIdx.y + 1;
int tile_x = threadIdx.x + 1;
// Flat NCHW index of this thread's input pixel.
int idx = b * CHANNELS * H * W + c * H * W + y * W + x;
tile[tile_y][tile_x] = input[idx];
// Edge halos: top, bottom, left, right rows/columns (skipped at image
// borders, where the compute guard below also skips those outputs).
if (threadIdx.y == 0 && y > 0)
tile[0][tile_x] = input[idx - W];
if (threadIdx.y == blockDim.y - 1 && y < H - 1)
tile[tile_y + 1][tile_x] = input[idx + W];
if (threadIdx.x == 0 && x > 0)
tile[tile_y][0] = input[idx - 1];
if (threadIdx.x == blockDim.x - 1 && x < W - 1)
tile[tile_y][tile_x + 1] = input[idx + 1];
// Corner halos, one per block corner.
if (threadIdx.x == 0 && threadIdx.y == 0 && x > 0 && y > 0)
tile[0][0] = input[idx - W - 1];
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0 && x < W - 1 && y > 0)
tile[0][tile_x + 1] = input[idx - W + 1];
if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1 && x > 0 && y < H - 1)
tile[tile_y + 1][0] = input[idx + W - 1];
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1 && x < W - 1 && y < H - 1)
tile[tile_y + 1][tile_x + 1] = input[idx + W + 1];
}
// Barrier is outside the conditional, so every thread in the block reaches
// it (required: __syncthreads() in divergent flow is undefined behavior).
__syncthreads();
// Compute only where the 3x3 window fits entirely inside the image and the
// position lands on the stride grid.
if (y >= RADIUS && x >= RADIUS &&
y <= H - RADIUS - 1 && x <= W - RADIUS - 1 &&
((y - RADIUS) % stride == 0) &&
((x - RADIUS) % stride == 0)) {
float sum = 0.0f;
// Hard-coded 3x3 tap loop reading from the shared tile (+1 re-centers
// past the halo ring).
for (int ky = -1; ky <= 1; ++ky)
for (int kx = -1; kx <= 1; ++kx)
sum += kernel[(ky + 1) * 3 + (kx + 1)] *
tile[threadIdx.y + ky + 1][threadIdx.x + kx + 1];
// Output index in the strided, valid-region coordinate system.
int out_y = (y - RADIUS) / stride;
int out_x = (x - RADIUS) / stride;
int out_idx = b * CHANNELS * OUT_H * OUT_W + c * OUT_H * OUT_W + out_y * OUT_W + out_x;
output[out_idx] = sum;
}
}
// Benchmark driver: allocates one NCHW input, uploads it with a 3x3 Laplacian
// kernel, launches the convolution 10000 times, and reports wall time and GPU
// memory usage.
int main() {
    constexpr int IN_SIZE = BATCH * CHANNELS * HEIGHT * WIDTH;
    constexpr int OUT_SIZE = BATCH * CHANNELS * OUT_H * OUT_W;

    // Host buffers; input is a repeating ramp in [0, 1).
    float* h_input = new float[IN_SIZE];
    float* h_output = new float[OUT_SIZE];
    // 3x3 Laplacian (edge-detection) kernel.
    float h_kernel[9] = {
        0, 1, 0,
        1,-4, 1,
        0, 1, 0
    };
    for (int i = 0; i < IN_SIZE; ++i)
        h_input[i] = static_cast<float>(i % 255) / 255.0f;

    float *d_input, *d_kernel, *d_output;
    CHECK_CUDA(cudaMalloc(&d_input, IN_SIZE * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&d_kernel, 9 * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&d_output, OUT_SIZE * sizeof(float)));
    CHECK_CUDA(cudaMemcpy(d_input, h_input, IN_SIZE * sizeof(float), cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_kernel, h_kernel, 9 * sizeof(float), cudaMemcpyHostToDevice));

    // One thread per input pixel; blockIdx.z enumerates (batch, channel).
    dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    dim3 grid((WIDTH + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
              (HEIGHT + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y,
              BATCH * CHANNELS);

    // Warm-up launch so context/module initialization is excluded from the
    // timed region, plus an explicit launch-configuration check (kernel
    // launches do not return cudaError_t).
    conv2d_kernel_batched<<<grid, block>>>(
        d_input, d_kernel, d_output, HEIGHT, WIDTH, STRIDE);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());

    using clock = std::chrono::high_resolution_clock;
    auto start = clock::now();
    for (int i = 0; i < 10000; ++i) {
        conv2d_kernel_batched<<<grid, block>>>(
            d_input, d_kernel, d_output, HEIGHT, WIDTH, STRIDE);
    }
    CHECK_CUDA(cudaGetLastError());       // launch errors from the loop (sticky)
    CHECK_CUDA(cudaDeviceSynchronize());  // wait for all queued kernels
    auto end = clock::now();

    std::chrono::duration<double, std::milli> duration_ms = end - start;
    std::cout << "Execution time for 10000 runs with stride " << STRIDE << ": " << duration_ms.count() << " ms\n";

    // Retrieve the result so the computation is actually observable on the
    // host (previously h_output was allocated but never filled).
    CHECK_CUDA(cudaMemcpy(h_output, d_output, OUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost));

    size_t free_mem, total_mem;
    CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    std::cout << "GPU memory used: "
              << (total_mem - free_mem) / (1024.0 * 1024.0) << " MB\n";

    CHECK_CUDA(cudaFree(d_input));
    CHECK_CUDA(cudaFree(d_kernel));
    CHECK_CUDA(cudaFree(d_output));
    delete[] h_input;
    delete[] h_output;
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment