Skip to content

Instantly share code, notes, and snippets.

@cmdr2
Created April 8, 2025 08:37
Show Gist options
  • Save cmdr2/80671adadd54112e68a471eb02021528 to your computer and use it in GitHub Desktop.
Save cmdr2/80671adadd54112e68a471eb02021528 to your computer and use it in GitHub Desktop.
#include <cuda_runtime.h>
#include <iostream>
#include <chrono>
// Abort-on-error wrapper for CUDA runtime API calls.
// Reports the failing call site (file:line) plus the CUDA error string,
// then terminates — an unhandled sticky error would make every later
// CUDA call fail mysteriously.
#define CHECK_CUDA(call) \
do { \
cudaError_t err = (call); /* parenthesized: 'call' may be any expression */ \
if (err != cudaSuccess) { \
std::cerr << "CUDA Error at " << __FILE__ << ":" << __LINE__ \
<< ": " << cudaGetErrorString(err) << "\n"; \
exit(EXIT_FAILURE); \
} \
} while (0)
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16
constexpr int BATCH = 1;
constexpr int CHANNELS = 1;
constexpr int HEIGHT = 512;
constexpr int WIDTH = 512;
constexpr int STRIDE = 1;
constexpr int PADDING = 0; // 0 for "valid", 1 for "same" (with a 3x3 kernel)
constexpr int KERNEL_SIZE = 3; // conv2d_kernel hard-codes a 3x3 filter
// Standard convolution output size: (in + 2*pad - kernel) / stride + 1.
// The previous form, (in + 2*pad - 2) / stride, gives the same value for
// stride == 1 but is off by one for some stride/size combinations
// (e.g. HEIGHT=513, STRIDE=3: 170 instead of the correct 171).
constexpr int OUT_H = (HEIGHT + 2 * PADDING - KERNEL_SIZE) / STRIDE + 1;
constexpr int OUT_W = (WIDTH + 2 * PADDING - KERNEL_SIZE) / STRIDE + 1;
// 3x3 convolution over a BATCH x CHANNELS x H x W input (NCHW, zero padding).
// Launch: block = (BLOCK_DIM_X, BLOCK_DIM_Y), grid.z = BATCH * CHANNELS;
// output is the compile-time OUT_H x OUT_W per (batch, channel) plane.
//
// The shared-memory fast path stages a (BLOCK_DIM_Y+2) x (BLOCK_DIM_X+2)
// tile with a 1-pixel halo. That layout is only valid when stride == 1
// (adjacent threads then hold adjacent input pixels); the original code
// used it unconditionally and silently produced wrong results for any
// other stride, so stride != 1 now falls back to direct global reads.
__global__ void conv2d_kernel(const float* __restrict__ input,
const float* __restrict__ kernel,
float* __restrict__ output,
int H, int W, int stride, int pad)
{
const int b = blockIdx.z / CHANNELS;
const int c = blockIdx.z % CHANNELS;
const int out_y = blockIdx.y * blockDim.y + threadIdx.y;
const int out_x = blockIdx.x * blockDim.x + threadIdx.x;
const int in_y = out_y * stride - pad;
const int in_x = out_x * stride - pad;
// Zero-padded read of one pixel from this (batch, channel) plane.
auto get_input = [&](int y, int x) -> float {
return (y >= 0 && y < H && x >= 0 && x < W)
? input[(b * CHANNELS + c) * H * W + y * W + x]
: 0.0f;
};
float sum = 0.0f;
if (stride == 1) {
// Fast path: cooperative tile + halo load. 'stride' is uniform across
// the block, so every thread takes this branch together and the
// __syncthreads() below is safe.
const int tile_y = threadIdx.y + 1;
const int tile_x = threadIdx.x + 1;
__shared__ float tile[BLOCK_DIM_Y + 2][BLOCK_DIM_X + 2];
tile[tile_y][tile_x] = get_input(in_y, in_x);
// Edge threads also fill the halo; corner cells are written by the
// threads sitting on two edges at once.
if (threadIdx.y == 0) {
tile[0][tile_x] = get_input(in_y - 1, in_x);
if (threadIdx.x == 0) tile[0][0] = get_input(in_y - 1, in_x - 1);
if (threadIdx.x == blockDim.x - 1) tile[0][tile_x + 1] = get_input(in_y - 1, in_x + 1);
}
if (threadIdx.y == blockDim.y - 1) {
tile[tile_y + 1][tile_x] = get_input(in_y + 1, in_x);
if (threadIdx.x == 0) tile[tile_y + 1][0] = get_input(in_y + 1, in_x - 1);
if (threadIdx.x == blockDim.x - 1) tile[tile_y + 1][tile_x + 1] = get_input(in_y + 1, in_x + 1);
}
if (threadIdx.x == 0) tile[tile_y][0] = get_input(in_y, in_x - 1);
if (threadIdx.x == blockDim.x - 1) tile[tile_y][tile_x + 1] = get_input(in_y, in_x + 1);
// Barrier before any tile read; kept ahead of the bounds check so all
// threads in the block reach it (out-of-range threads still loaded halo).
__syncthreads();
if (out_y >= OUT_H || out_x >= OUT_W) return;
#pragma unroll
for (int ky = -1; ky <= 1; ++ky)
#pragma unroll
for (int kx = -1; kx <= 1; ++kx)
sum += kernel[(ky + 1) * 3 + (kx + 1)] *
tile[tile_y + ky][tile_x + kx];
} else {
// Strided path: the halo tile layout does not apply; read global memory
// directly (get_input handles the zero padding).
if (out_y >= OUT_H || out_x >= OUT_W) return;
#pragma unroll
for (int ky = -1; ky <= 1; ++ky)
#pragma unroll
for (int kx = -1; kx <= 1; ++kx)
sum += kernel[(ky + 1) * 3 + (kx + 1)] *
get_input(in_y + ky, in_x + kx);
}
output[(b * CHANNELS + c) * OUT_H * OUT_W + out_y * OUT_W + out_x] = sum;
}
// Benchmarks 10000 launches of conv2d_kernel on synthetic data, then
// reports wall-clock time and GPU memory usage.
int main() {
constexpr int IN_SIZE = BATCH * CHANNELS * HEIGHT * WIDTH;
constexpr int OUT_SIZE = BATCH * CHANNELS * OUT_H * OUT_W;
float* h_input = new float[IN_SIZE];
float* h_output = new float[OUT_SIZE];
// 3x3 Laplacian (edge-detection) filter.
float h_kernel[9] = {
0, 1, 0,
1,-4, 1,
0, 1, 0
};
// Deterministic synthetic input in [0, 1).
for (int i = 0; i < IN_SIZE; ++i)
h_input[i] = static_cast<float>(i % 255) / 255.0f;
float *d_input, *d_kernel, *d_output;
CHECK_CUDA(cudaMalloc(&d_input, IN_SIZE * sizeof(float)));
CHECK_CUDA(cudaMalloc(&d_kernel, 9 * sizeof(float)));
CHECK_CUDA(cudaMalloc(&d_output, OUT_SIZE * sizeof(float)));
CHECK_CUDA(cudaMemcpy(d_input, h_input, IN_SIZE * sizeof(float), cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(d_kernel, h_kernel, 9 * sizeof(float), cudaMemcpyHostToDevice));
dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 grid((OUT_W + BLOCK_DIM_X - 1) / BLOCK_DIM_X, // ceil-div grid sizing
(OUT_H + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y,
BATCH * CHANNELS);
using clock = std::chrono::high_resolution_clock;
auto start = clock::now();
for (int i = 0; i < 10000; ++i) {
conv2d_kernel<<<grid, block>>>(
d_input, d_kernel, d_output, HEIGHT, WIDTH, STRIDE, PADDING);
}
// Launch-configuration errors do not surface from <<<>>> itself; the
// original never checked for them and would report a bogus timing.
CHECK_CUDA(cudaGetLastError());
CHECK_CUDA(cudaDeviceSynchronize()); // wait for all queued kernels
auto end = clock::now();
std::chrono::duration<double, std::milli> duration_ms = end - start;
std::cout << "Execution time for 10000 runs with stride " << STRIDE << ": " << duration_ms.count() << " ms\n";
// Copy the result back so h_output is actually populated (it was
// allocated but never filled before) and execution errors surface.
CHECK_CUDA(cudaMemcpy(h_output, d_output, OUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost));
size_t free_mem, total_mem;
CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
std::cout << "GPU memory used: "
<< (total_mem - free_mem) / (1024.0 * 1024.0) << " MB\n";
// Check frees too — a sticky error here would indicate an earlier fault.
CHECK_CUDA(cudaFree(d_input));
CHECK_CUDA(cudaFree(d_kernel));
CHECK_CUDA(cudaFree(d_output));
delete[] h_input;
delete[] h_output;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment