#include <cuda_runtime.h>
#include <iostream>
#include <chrono>

#define CHECK_CUDA(call)                                                        \
    do {                                                                        \
        cudaError_t err = call;                                                 \
        if (err != cudaSuccess) {                                               \
            std::cerr << "CUDA Error: " << cudaGetErrorString(err) << "\n";     \
            exit(EXIT_FAILURE);                                                 \
        }                                                                       \
    } while (0)
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16

constexpr int BATCH = 1;
constexpr int CHANNELS = 1;
constexpr int HEIGHT = 512;
constexpr int WIDTH = 512;
constexpr int KERNEL_SIZE = 3;
constexpr int RADIUS = KERNEL_SIZE / 2;
constexpr int STRIDE = 1;

// "Valid" (no padding) output size: (in - kernel) / stride + 1.
constexpr int OUT_H = (HEIGHT - KERNEL_SIZE) / STRIDE + 1;
constexpr int OUT_W = (WIDTH - KERNEL_SIZE) / STRIDE + 1;
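
// Tiled 2D convolution. Each block stages a (BLOCK_DIM_Y + 2) x (BLOCK_DIM_X + 2)
// patch of the input, including a one-pixel halo, in shared memory; every thread
// whose 3x3 neighbourhood lies fully inside the image then computes one output
// pixel from the staged tile. blockIdx.z enumerates (batch, channel) pairs.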
__global__ void conv2d_kernel_batched(const float* __restrict__ input,
                                      const float* __restrict__ kernel,
                                      float* __restrict__ output,
                                      int H, int W, int stride)
{
    int b = blockIdx.z / CHANNELS;   // batch index
    int c = blockIdx.z % CHANNELS;   // channel index
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int x = blockIdx.x * blockDim.x + threadIdx.x;

    __shared__ float tile[BLOCK_DIM_Y + 2][BLOCK_DIM_X + 2];

    if (y < H && x < W) {
        int tile_y = threadIdx.y + 1;
        int tile_x = threadIdx.x + 1;
        int idx = b * CHANNELS * H * W + c * H * W + y * W + x;

        // Each thread loads the centre element of its tile slot.
        tile[tile_y][tile_x] = input[idx];
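
        // Halo rows/columns: threads on the block border also fetch the pixels
        // just outside the tile, when those pixels exist in the image.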
        if (threadIdx.y == 0 && y > 0)
            tile[0][tile_x] = input[idx - W];
        if (threadIdx.y == blockDim.y - 1 && y < H - 1)
            tile[tile_y + 1][tile_x] = input[idx + W];
        if (threadIdx.x == 0 && x > 0)
            tile[tile_y][0] = input[idx - 1];
        if (threadIdx.x == blockDim.x - 1 && x < W - 1)
            tile[tile_y][tile_x + 1] = input[idx + 1];
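
        // The four corner threads of the block fetch the diagonal halo pixels.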
        if (threadIdx.x == 0 && threadIdx.y == 0 && x > 0 && y > 0)
            tile[0][0] = input[idx - W - 1];
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0 && x < W - 1 && y > 0)
            tile[0][tile_x + 1] = input[idx - W + 1];
        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1 && x > 0 && y < H - 1)
            tile[tile_y + 1][0] = input[idx + W - 1];
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1 && x < W - 1 && y < H - 1)
            tile[tile_y + 1][tile_x + 1] = input[idx + W + 1];
    }
    __syncthreads();
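
    // Compute one output pixel per thread, but only for positions whose full
    // 3x3 neighbourhood lies inside the image and which fall on the stride grid.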
    if (y >= RADIUS && x >= RADIUS &&
        y <= H - RADIUS - 1 && x <= W - RADIUS - 1 &&
        ((y - RADIUS) % stride == 0) &&
        ((x - RADIUS) % stride == 0)) {
        float sum = 0.0f;
        for (int ky = -RADIUS; ky <= RADIUS; ++ky)
            for (int kx = -RADIUS; kx <= RADIUS; ++kx)
                sum += kernel[(ky + RADIUS) * KERNEL_SIZE + (kx + RADIUS)] *
                       tile[threadIdx.y + ky + 1][threadIdx.x + kx + 1];

        int out_y = (y - RADIUS) / stride;
        int out_x = (x - RADIUS) / stride;
        int out_idx = b * CHANNELS * OUT_H * OUT_W + c * OUT_H * OUT_W + out_y * OUT_W + out_x;
        output[out_idx] = sum;
    }
}
int main() {
    constexpr int IN_SIZE = BATCH * CHANNELS * HEIGHT * WIDTH;
    constexpr int OUT_SIZE = BATCH * CHANNELS * OUT_H * OUT_W;

    float* h_input = new float[IN_SIZE];
    float* h_output = new float[OUT_SIZE];
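
    // 3x3 Laplacian (edge-detection) filter.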
    float h_kernel[9] = {
        0,  1, 0,
        1, -4, 1,
        0,  1, 0
    };
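
    // Fill the input with a repeating ramp of values in [0, 1).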
    for (int i = 0; i < IN_SIZE; ++i)
        h_input[i] = static_cast<float>(i % 255) / 255.0f;

    float *d_input, *d_kernel, *d_output;
    CHECK_CUDA(cudaMalloc(&d_input, IN_SIZE * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&d_kernel, 9 * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&d_output, OUT_SIZE * sizeof(float)));

    CHECK_CUDA(cudaMemcpy(d_input, h_input, IN_SIZE * sizeof(float), cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_kernel, h_kernel, 9 * sizeof(float), cudaMemcpyHostToDevice));
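
    // One thread per input pixel; the z dimension of the grid walks over
    // (batch, channel) pairs.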
    dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    dim3 grid((WIDTH + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
              (HEIGHT + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y,
              BATCH * CHANNELS);
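
    // Time 10000 back-to-back launches. Kernel launches are asynchronous, so the
    // explicit cudaDeviceSynchronize() is what makes the end timestamp meaningful.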
    using clock = std::chrono::high_resolution_clock;
    auto start = clock::now();
    for (int i = 0; i < 10000; ++i) {
        conv2d_kernel_batched<<<grid, block>>>(
            d_input, d_kernel, d_output, HEIGHT, WIDTH, STRIDE);
    }
    CHECK_CUDA(cudaGetLastError());      // surface any launch-configuration error
    CHECK_CUDA(cudaDeviceSynchronize());
    auto end = clock::now();

    std::chrono::duration<double, std::milli> duration_ms = end - start;
    std::cout << "Execution time for 10000 runs with stride " << STRIDE << ": "
              << duration_ms.count() << " ms\n";
    size_t free_mem, total_mem;
    CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    std::cout << "GPU memory used: "
              << (total_mem - free_mem) / (1024.0 * 1024.0) << " MB\n";

    cudaFree(d_input);
    cudaFree(d_kernel);
    cudaFree(d_output);
    delete[] h_input;
    delete[] h_output;
    return 0;
}