#include <cuda_runtime.h>
#include <iostream>
#include <chrono>
#include <cstdlib>   // exit, EXIT_FAILURE
#define CHECK_CUDA(call)                                                        \
    do {                                                                        \
        cudaError_t err = call;                                                 \
        if (err != cudaSuccess) {                                               \
            std::cerr << "CUDA Error: " << cudaGetErrorString(err) << "\n";     \
            exit(EXIT_FAILURE);                                                 \
        }                                                                       \
    } while (0)
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16

constexpr int BATCH = 1;
constexpr int CHANNELS = 1;
constexpr int HEIGHT = 512;
constexpr int WIDTH = 512;
constexpr int STRIDE = 1;
constexpr int PADDING = 0; // 0 for "valid", 1 for "same"

// Output size for a 3x3 kernel: (in + 2 * pad - 3) / stride + 1.
constexpr int OUT_H = (HEIGHT + 2 * PADDING - 3) / STRIDE + 1;
constexpr int OUT_W = (WIDTH + 2 * PADDING - 3) / STRIDE + 1;
// 3x3 convolution over one (batch, channel) slice per blockIdx.z.
// Note: the shared-memory halo below assumes stride == 1, so that
// neighbouring threads load neighbouring input pixels.
__global__ void conv2d_kernel(const float* __restrict__ input,
                              const float* __restrict__ kernel,
                              float* __restrict__ output,
                              int H, int W, int stride, int pad)
{
    const int b = blockIdx.z / CHANNELS;
    const int c = blockIdx.z % CHANNELS;

    const int out_y = blockIdx.y * blockDim.y + threadIdx.y;
    const int out_x = blockIdx.x * blockDim.x + threadIdx.x;

    // Centre of the 3x3 window in input coordinates (+1 is the kernel radius).
    const int in_y = out_y * stride + 1 - pad;
    const int in_x = out_x * stride + 1 - pad;

    const int tile_y = threadIdx.y + 1;
    const int tile_x = threadIdx.x + 1;

    // Block-sized tile plus a one-pixel halo on every side.
    __shared__ float tile[BLOCK_DIM_Y + 2][BLOCK_DIM_X + 2];

    // Zero-padded read of a single input pixel.
    auto get_input = [&](int y, int x) -> float {
        return (y >= 0 && y < H && x >= 0 && x < W)
                   ? input[b * CHANNELS * H * W + c * H * W + y * W + x]
                   : 0.0f;
    };

    // Each thread loads its centre element; edge threads also fill the halo.
    tile[tile_y][tile_x] = get_input(in_y, in_x);
    if (threadIdx.y == 0) {
        tile[0][tile_x] = get_input(in_y - 1, in_x);
        if (threadIdx.x == 0)              tile[0][0] = get_input(in_y - 1, in_x - 1);
        if (threadIdx.x == blockDim.x - 1) tile[0][tile_x + 1] = get_input(in_y - 1, in_x + 1);
    }
    if (threadIdx.y == blockDim.y - 1) {
        tile[tile_y + 1][tile_x] = get_input(in_y + 1, in_x);
        if (threadIdx.x == 0)              tile[tile_y + 1][0] = get_input(in_y + 1, in_x - 1);
        if (threadIdx.x == blockDim.x - 1) tile[tile_y + 1][tile_x + 1] = get_input(in_y + 1, in_x + 1);
    }
    if (threadIdx.x == 0)              tile[tile_y][0] = get_input(in_y, in_x - 1);
    if (threadIdx.x == blockDim.x - 1) tile[tile_y][tile_x + 1] = get_input(in_y, in_x + 1);

    __syncthreads();

    if (out_y >= OUT_H || out_x >= OUT_W) return;

    // Accumulate the 3x3 window from shared memory.
    float sum = 0.0f;
#pragma unroll
    for (int ky = -1; ky <= 1; ++ky)
#pragma unroll
        for (int kx = -1; kx <= 1; ++kx)
            sum += kernel[(ky + 1) * 3 + (kx + 1)] *
                   tile[tile_y + ky][tile_x + kx];

    int out_idx = b * CHANNELS * OUT_H * OUT_W + c * OUT_H * OUT_W + out_y * OUT_W + out_x;
    output[out_idx] = sum;
}
int main() {
    constexpr int IN_SIZE = BATCH * CHANNELS * HEIGHT * WIDTH;
    constexpr int OUT_SIZE = BATCH * CHANNELS * OUT_H * OUT_W;

    float* h_input = new float[IN_SIZE];
    float* h_output = new float[OUT_SIZE];

    // 3x3 Laplacian kernel.
    float h_kernel[9] = {
        0,  1, 0,
        1, -4, 1,
        0,  1, 0
    };

    // Fill the input with a simple repeating ramp in [0, 1).
    for (int i = 0; i < IN_SIZE; ++i)
        h_input[i] = static_cast<float>(i % 255) / 255.0f;

    float *d_input, *d_kernel, *d_output;
    CHECK_CUDA(cudaMalloc(&d_input, IN_SIZE * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&d_kernel, 9 * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&d_output, OUT_SIZE * sizeof(float)));

    CHECK_CUDA(cudaMemcpy(d_input, h_input, IN_SIZE * sizeof(float), cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_kernel, h_kernel, 9 * sizeof(float), cudaMemcpyHostToDevice));

    // One thread per output pixel; one (batch, channel) slice per grid z-layer.
    dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    dim3 grid((OUT_W + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
              (OUT_H + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y,
              BATCH * CHANNELS);

    // Time 10000 back-to-back launches (includes kernel launch overhead).
    using clock = std::chrono::high_resolution_clock;
    auto start = clock::now();
    for (int i = 0; i < 10000; ++i) {
        conv2d_kernel<<<grid, block>>>(
            d_input, d_kernel, d_output, HEIGHT, WIDTH, STRIDE, PADDING);
    }
    CHECK_CUDA(cudaGetLastError());       // catch launch-configuration errors
    CHECK_CUDA(cudaDeviceSynchronize());  // wait for all launches to finish
    auto end = clock::now();

    std::chrono::duration<double, std::milli> duration_ms = end - start;
    std::cout << "Execution time for 10000 runs with stride " << STRIDE << ": "
              << duration_ms.count() << " ms\n";

    size_t free_mem, total_mem;
    CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    std::cout << "GPU memory used: "
              << (total_mem - free_mem) / (1024.0 * 1024.0) << " MB\n";

    cudaFree(d_input);
    cudaFree(d_kernel);
    cudaFree(d_output);
    delete[] h_input;
    delete[] h_output;
    return 0;
}
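
As a quick sanity check, the device result can be compared against a naive host-side 3x3 convolution. The sketch below is not part of the original gist: it assumes the same constants, memory layout, and zero-padding convention as conv2d_kernel above, and the function name check_against_reference and the epsilon tolerance are illustrative choices. The caller is expected to copy d_output back into h_output first.

// Minimal host-side reference check (a sketch, not from the original gist).
// Assumes the constants (BATCH, CHANNELS, HEIGHT, WIDTH, STRIDE, PADDING,
// OUT_H, OUT_W) and the zero-padding convention used by conv2d_kernel.
#include <cmath>   // std::fabs

bool check_against_reference(const float* h_input, const float* h_kernel,
                             const float* h_output, float epsilon = 1e-4f) {
    for (int b = 0; b < BATCH; ++b)
        for (int c = 0; c < CHANNELS; ++c)
            for (int oy = 0; oy < OUT_H; ++oy)
                for (int ox = 0; ox < OUT_W; ++ox) {
                    float ref = 0.0f;
                    for (int ky = -1; ky <= 1; ++ky)
                        for (int kx = -1; kx <= 1; ++kx) {
                            const int y = oy * STRIDE + 1 - PADDING + ky;
                            const int x = ox * STRIDE + 1 - PADDING + kx;
                            const float v = (y >= 0 && y < HEIGHT && x >= 0 && x < WIDTH)
                                ? h_input[b * CHANNELS * HEIGHT * WIDTH + c * HEIGHT * WIDTH + y * WIDTH + x]
                                : 0.0f;
                            ref += h_kernel[(ky + 1) * 3 + (kx + 1)] * v;
                        }
                    const int idx = b * CHANNELS * OUT_H * OUT_W + c * OUT_H * OUT_W + oy * OUT_W + ox;
                    if (std::fabs(ref - h_output[idx]) > epsilon) return false;
                }
    return true;
}

// Possible usage after the timing loop in main(), once the result is copied back:
//   CHECK_CUDA(cudaMemcpy(h_output, d_output, OUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost));
//   std::cout << (check_against_reference(h_input, h_kernel, h_output) ? "OK\n" : "MISMATCH\n");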