#include <cuda_runtime.h>
#include <iostream>
#include <chrono>

// Abort with file/line and the CUDA error string on any failed runtime call.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \
                      << ": " << cudaGetErrorString(err_) << std::endl; \
            std::exit(1); \
        } \
    } while (0)
// Problem size and convolution parameters (compile-time for this benchmark).
constexpr int BATCH = 1;
constexpr int CHANNELS = 1;
constexpr int H = 512;          // Input height
constexpr int W = 512;          // Input width
constexpr int K = 3;            // Kernel size (K x K)
constexpr int STRIDE_Y = 1;
constexpr int STRIDE_X = 1;
constexpr int PAD_Y = 0;
constexpr int PAD_X = 0;
constexpr int DIL_Y = 1;        // Dilation
constexpr int DIL_X = 1;
constexpr int BLOCK_DIM_Y = 16; // Thread-block tile
constexpr int BLOCK_DIM_X = 16;

// Depthwise 2D convolution: one K x K filter per channel, applied per
// (batch, channel) slice. Each block computes a BLOCK_DIM_Y x BLOCK_DIM_X
// output tile from a shared-memory input tile. Note: the cooperative tile
// load below is only consistent across a block when stride == 1, which
// holds for the constants used here.
__global__ void conv2d_general_kernel(
    const float* __restrict__ input,
    const float* __restrict__ kernel,
    float* __restrict__ output,
    int H, int W,
    int outH, int outW,
    int kSize, int strideY, int strideX,
    int padY, int padX,
    int dilY, int dilX)
{
    const int b = blockIdx.z / CHANNELS;
    const int c = blockIdx.z % CHANNELS;
    const int out_y = blockIdx.y * blockDim.y + threadIdx.y;
    const int out_x = blockIdx.x * blockDim.x + threadIdx.x;

    // Top-left input coordinate read by this thread's output element.
    const int in_y = out_y * strideY - padY;
    const int in_x = out_x * strideX - padX;

    // Shared tile: the block's output footprint plus the dilated-kernel halo.
    // Sized with the compile-time constants; the host passes matching values.
    extern __shared__ float tile[];
    const int sharedH = BLOCK_DIM_Y + (K - 1) * DIL_Y;
    const int sharedW = BLOCK_DIM_X + (K - 1) * DIL_X;
    const int tile_y = threadIdx.y;
    const int tile_x = threadIdx.x;

    // Zero-padded read from global memory.
    auto get_input = [&](int y, int x) -> float {
        return (y >= 0 && y < H && x >= 0 && x < W)
            ? input[b * CHANNELS * H * W + c * H * W + y * W + x]
            : 0.0f;
    };

    // Cooperatively stage the tile; each thread loads one or more elements.
    for (int dy = 0; dy < sharedH; dy += BLOCK_DIM_Y) {
        for (int dx = 0; dx < sharedW; dx += BLOCK_DIM_X) {
            int s_y = tile_y + dy;
            int s_x = tile_x + dx;
            if (s_y < sharedH && s_x < sharedW)
                tile[s_y * sharedW + s_x] = get_input(in_y + dy, in_x + dx);
        }
    }
    __syncthreads();

    // Out-of-range threads must still reach the barrier above before exiting.
    if (out_y >= outH || out_x >= outW) return;

    float sum = 0.0f;
    for (int ky = 0; ky < kSize; ++ky) {
        for (int kx = 0; kx < kSize; ++kx) {
            float val  = tile[(tile_y + ky * dilY) * sharedW + (tile_x + kx * dilX)];
            float kval = kernel[c * kSize * kSize + ky * kSize + kx];
            sum += val * kval;
        }
    }
    output[b * CHANNELS * outH * outW + c * outH * outW + out_y * outW + out_x] = sum;
}

int main() {
    // Standard output-size formula:
    // out = floor((in + 2*pad - dil*(K-1) - 1) / stride) + 1
    int outH = (H + 2 * PAD_Y - DIL_Y * (K - 1) - 1) / STRIDE_Y + 1;
    int outW = (W + 2 * PAD_X - DIL_X * (K - 1) - 1) / STRIDE_X + 1;

    size_t in_size = BATCH * CHANNELS * H * W * sizeof(float);
    size_t out_size = BATCH * CHANNELS * outH * outW * sizeof(float);
    size_t kernel_size = CHANNELS * K * K * sizeof(float);

    float *in_host = new float[in_size / sizeof(float)];
    float *out_host = new float[out_size / sizeof(float)];
    float *kernel_host = new float[kernel_size / sizeof(float)];
    for (size_t i = 0; i < in_size / sizeof(float); ++i) in_host[i] = 1.0f * (i + 1);
    for (size_t i = 0; i < kernel_size / sizeof(float); ++i) kernel_host[i] = 10.0f * (i + 1);

    float *in_dev, *out_dev, *kernel_dev;
    CHECK_CUDA(cudaMalloc(&in_dev, in_size));
    CHECK_CUDA(cudaMalloc(&out_dev, out_size));
    CHECK_CUDA(cudaMalloc(&kernel_dev, kernel_size));
    CHECK_CUDA(cudaMemcpy(in_dev, in_host, in_size, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(kernel_dev, kernel_host, kernel_size, cudaMemcpyHostToDevice));

    // One grid z-slice per (batch, channel) pair.
    dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    dim3 grid((outW + block.x - 1) / block.x, (outH + block.y - 1) / block.y, BATCH * CHANNELS);
    size_t shared_mem = (BLOCK_DIM_Y + (K - 1) * DIL_Y) * (BLOCK_DIM_X + (K - 1) * DIL_X) * sizeof(float);

    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < 10000; ++i) {
        conv2d_general_kernel<<<grid, block, shared_mem>>>(
            in_dev, kernel_dev, out_dev,
            H, W, outH, outW,
            K, STRIDE_Y, STRIDE_X,
            PAD_Y, PAD_X, DIL_Y, DIL_X);
    }
    CHECK_CUDA(cudaGetLastError()); // Catch launch-configuration errors.
    CHECK_CUDA(cudaDeviceSynchronize());
    // Stop the clock before the device-to-host copy so only the launches are timed.
    auto end = std::chrono::high_resolution_clock::now();
    double time_ms = std::chrono::duration<double, std::milli>(end - start).count();
    std::cout << "Execution time over 10000 runs: " << time_ms << " ms\n";

    CHECK_CUDA(cudaMemcpy(out_host, out_dev, out_size, cudaMemcpyDeviceToHost));

    size_t free_mem, total_mem;
    CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    std::cout << "GPU Memory Used: " << (total_mem - free_mem) / (1024.0 * 1024.0) << " MB\n";

    std::cout << "Output (" << outW << ", " << outH << ")\n";
    // for (int i = 0; i < outH; ++i) {
    //     for (int j = 0; j < outW; ++j) {
    //         std::cout << out_host[i * outW + j] << " ";
    //     }
    //     std::cout << "\n";
    // }

    CHECK_CUDA(cudaFree(in_dev));
    CHECK_CUDA(cudaFree(out_dev));
    CHECK_CUDA(cudaFree(kernel_dev));
    delete[] in_host;
    delete[] out_host;
    delete[] kernel_host;
    return 0;
}
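
To build and run (a typical nvcc invocation; the file name conv2d.cu is illustrative, not from the gist):

nvcc -O2 -o conv2d conv2d.cu
./conv2d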