Skip to content

Instantly share code, notes, and snippets.

@cmdr2
Created April 8, 2025 08:39
Show Gist options
  • Save cmdr2/424bc537e4ccf5c66114c2e088345407 to your computer and use it in GitHub Desktop.
#include <cuda_runtime.h>
#include <iostream>
#include <chrono>
// Abort with a diagnostic on any failed CUDA runtime call.
// Reports the failing expression, source location, and the CUDA error string.
// Note: kernel launches do not return cudaError_t — check those separately
// with cudaGetLastError() after the launch.
#define CHECK_CUDA(call) \
do { \
    cudaError_t err_ = (call); /* underscore avoids shadowing at call sites */ \
    if (err_ != cudaSuccess) { \
        std::cerr << "CUDA Error at " << __FILE__ << ":" << __LINE__ \
                  << " in `" #call "`: " << cudaGetErrorString(err_) << "\n"; \
        exit(EXIT_FAILURE); \
    } \
} while (0)
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16
// Problem dimensions. Layout is NCHW; one 3x3 kernel is applied per
// (batch, channel) slice.
constexpr int BATCH = 1;
constexpr int CHANNELS = 1;
constexpr int HEIGHT = 512;
constexpr int WIDTH = 512;
constexpr int KERNEL_SIZE = 3;
constexpr int RADIUS = KERNEL_SIZE / 2;  // halo width on each side
constexpr int STRIDE = 1;
// "Valid" convolution output extent: the kernel writes positions
// y = RADIUS .. HEIGHT-RADIUS-1 sampled every STRIDE, i.e.
// floor((HEIGHT - 2*RADIUS - 1) / STRIDE) + 1 rows. The previous form,
// (HEIGHT - 2) / STRIDE, hard-coded the radius and undercounts for some
// strides (e.g. STRIDE = 4 gives 127 here while the kernel writes 128 rows,
// overflowing the output buffer). For STRIDE = 1 both forms give 510, so
// existing behavior is unchanged.
constexpr int OUT_H = (HEIGHT - 2 * RADIUS - 1) / STRIDE + 1;
constexpr int OUT_W = (WIDTH - 2 * RADIUS - 1) / STRIDE + 1;
// 3x3 "valid" convolution over an NCHW tensor, one (batch, channel) slice per
// blockIdx.z. Each BLOCK_DIM_X x BLOCK_DIM_Y block stages its input tile plus
// a 1-pixel halo in shared memory, then each interior thread computes one
// output pixel. Expects a 2D block of exactly (BLOCK_DIM_X, BLOCK_DIM_Y)
// threads and gridDim.z == BATCH * CHANNELS.
// NOTE(review): despite taking H/W/stride as parameters, the taps are
// hard-coded to 3x3 (ky/kx in -1..1, `kernel[(ky+1)*3 + ...]`) and the output
// extent comes from the compile-time OUT_H/OUT_W — KERNEL_SIZE must be 3 for
// this kernel to be correct.
__global__ void conv2d_kernel_batched(const float* __restrict__ input,
const float* __restrict__ kernel,
float* __restrict__ output,
int H, int W, int stride)
{
// Decode (batch, channel) from the flattened z dimension of the grid.
int b = blockIdx.z / CHANNELS;
int c = blockIdx.z % CHANNELS;
// Global pixel coordinates for this thread (one thread per input pixel).
int y = blockIdx.y * blockDim.y + threadIdx.y;
int x = blockIdx.x * blockDim.x + threadIdx.x;
// Input tile with a 1-pixel halo ring on every side.
__shared__ float tile[BLOCK_DIM_Y + 2][BLOCK_DIM_X + 2];
if (y < H && x < W) {
// +1 offsets skip the halo ring; each thread loads its center pixel,
// and threads on the block's edges/corners additionally load the halo.
int tile_y = threadIdx.y + 1;
int tile_x = threadIdx.x + 1;
// Flat NCHW index of this thread's input pixel.
int idx = b * CHANNELS * H * W + c * H * W + y * W + x;
tile[tile_y][tile_x] = input[idx];
// Edge halos: top, bottom, left, right rows/columns (skipped at image
// borders, where the compute guard below also skips those outputs).
if (threadIdx.y == 0 && y > 0)
tile[0][tile_x] = input[idx - W];
if (threadIdx.y == blockDim.y - 1 && y < H - 1)
tile[tile_y + 1][tile_x] = input[idx + W];
if (threadIdx.x == 0 && x > 0)
tile[tile_y][0] = input[idx - 1];
if (threadIdx.x == blockDim.x - 1 && x < W - 1)
tile[tile_y][tile_x + 1] = input[idx + 1];
// Corner halos, one per block corner.
if (threadIdx.x == 0 && threadIdx.y == 0 && x > 0 && y > 0)
tile[0][0] = input[idx - W - 1];
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0 && x < W - 1 && y > 0)
tile[0][tile_x + 1] = input[idx - W + 1];
if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1 && x > 0 && y < H - 1)
tile[tile_y + 1][0] = input[idx + W - 1];
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1 && x < W - 1 && y < H - 1)
tile[tile_y + 1][tile_x + 1] = input[idx + W + 1];
}
// Barrier is outside the conditional, so every thread in the block reaches
// it (required: __syncthreads() in divergent flow is undefined behavior).
__syncthreads();
// Compute only where the 3x3 window fits entirely inside the image and the
// position lands on the stride grid.
if (y >= RADIUS && x >= RADIUS &&
y <= H - RADIUS - 1 && x <= W - RADIUS - 1 &&
((y - RADIUS) % stride == 0) &&
((x - RADIUS) % stride == 0)) {
float sum = 0.0f;
// Hard-coded 3x3 tap loop reading from the shared tile (+1 re-centers
// past the halo ring).
for (int ky = -1; ky <= 1; ++ky)
for (int kx = -1; kx <= 1; ++kx)
sum += kernel[(ky + 1) * 3 + (kx + 1)] *
tile[threadIdx.y + ky + 1][threadIdx.x + kx + 1];
// Output index in the strided, valid-region coordinate system.
int out_y = (y - RADIUS) / stride;
int out_x = (x - RADIUS) / stride;
int out_idx = b * CHANNELS * OUT_H * OUT_W + c * OUT_H * OUT_W + out_y * OUT_W + out_x;
output[out_idx] = sum;
}
}
// Benchmark driver: allocates one NCHW input, uploads it with a 3x3 Laplacian
// kernel, launches the convolution 10000 times, and reports wall time and GPU
// memory usage.
int main() {
    constexpr int IN_SIZE = BATCH * CHANNELS * HEIGHT * WIDTH;
    constexpr int OUT_SIZE = BATCH * CHANNELS * OUT_H * OUT_W;

    // Host buffers; input is a repeating ramp in [0, 1).
    float* h_input = new float[IN_SIZE];
    float* h_output = new float[OUT_SIZE];
    // 3x3 Laplacian (edge-detection) kernel.
    float h_kernel[9] = {
        0, 1, 0,
        1,-4, 1,
        0, 1, 0
    };
    for (int i = 0; i < IN_SIZE; ++i)
        h_input[i] = static_cast<float>(i % 255) / 255.0f;

    float *d_input, *d_kernel, *d_output;
    CHECK_CUDA(cudaMalloc(&d_input, IN_SIZE * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&d_kernel, 9 * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&d_output, OUT_SIZE * sizeof(float)));
    CHECK_CUDA(cudaMemcpy(d_input, h_input, IN_SIZE * sizeof(float), cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_kernel, h_kernel, 9 * sizeof(float), cudaMemcpyHostToDevice));

    // One thread per input pixel; blockIdx.z enumerates (batch, channel).
    dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    dim3 grid((WIDTH + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
              (HEIGHT + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y,
              BATCH * CHANNELS);

    // Warm-up launch so context/module initialization is excluded from the
    // timed region, plus an explicit launch-configuration check (kernel
    // launches do not return cudaError_t).
    conv2d_kernel_batched<<<grid, block>>>(
        d_input, d_kernel, d_output, HEIGHT, WIDTH, STRIDE);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());

    using clock = std::chrono::high_resolution_clock;
    auto start = clock::now();
    for (int i = 0; i < 10000; ++i) {
        conv2d_kernel_batched<<<grid, block>>>(
            d_input, d_kernel, d_output, HEIGHT, WIDTH, STRIDE);
    }
    CHECK_CUDA(cudaGetLastError());       // launch errors from the loop (sticky)
    CHECK_CUDA(cudaDeviceSynchronize());  // wait for all queued kernels
    auto end = clock::now();

    std::chrono::duration<double, std::milli> duration_ms = end - start;
    std::cout << "Execution time for 10000 runs with stride " << STRIDE << ": " << duration_ms.count() << " ms\n";

    // Retrieve the result so the computation is actually observable on the
    // host (previously h_output was allocated but never filled).
    CHECK_CUDA(cudaMemcpy(h_output, d_output, OUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost));

    size_t free_mem, total_mem;
    CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    std::cout << "GPU memory used: "
              << (total_mem - free_mem) / (1024.0 * 1024.0) << " MB\n";

    CHECK_CUDA(cudaFree(d_input));
    CHECK_CUDA(cudaFree(d_kernel));
    CHECK_CUDA(cudaFree(d_output));
    delete[] h_input;
    delete[] h_output;
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment