Skip to content

Instantly share code, notes, and snippets.

@bugparty
Created June 16, 2025 03:38
Show Gist options
  • Save bugparty/7123923d24b809fe5e3699ee686635b5 to your computer and use it in GitHub Desktop.
Save bugparty/7123923d24b809fe5e3699ee686635b5 to your computer and use it in GitHub Desktop.
Steam Deck ROCm HIP GPU bandwidth test
#include <hip/hip_runtime.h>
#include <iostream>
#include <chrono>
#include <hip/hip_fp16.h>
#include "utils.hpp"
// Number of int elements in the test buffer (an element count, not a size in MB).
// 2047 * 1024 * 1024 = 2,146,435,072, which still fits in a 32-bit signed int;
// with a 4-byte int that is roughly 8 GiB of data.
#define N (2047 * 1024 * 1024)
// Not referenced anywhere in the visible code; the launches pass literal block sizes.
#define THREADS_PER_BLOCK 256
/**
 * Warm-up read kernel: every thread walks `data` with a stride equal to the
 * total number of launched threads, sums the elements it visits, and the
 * per-block total is added into `*sum_out`.
 *
 * The result must be published: without an observable write the compiler is
 * free to delete every load, and the kernel would read nothing at all.
 *
 * Launch requirements: 1-D grid and 1-D block (only the .x components are used).
 *
 * @param data     input buffer holding at least `count` ints
 * @param sum_out  accumulator; the kernel only adds to it, so the caller must
 *                 initialise it before the launch
 * @param count    number of elements of `data` to read
 */
__global__ void read_kernel(const int* __restrict__ data, long long* __restrict__ sum_out, size_t count) {
    // Per-block accumulator so that only one global atomic is issued per block.
    __shared__ unsigned long long block_sum;
    if (threadIdx.x == 0) {
        block_sum = 0;
    }
    __syncthreads();

    // Widen before multiplying: the 32-bit product can wrap for large launches.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    long long sum = 0;
    for (size_t i = first; i < count; i += stride) {
        sum += data[i];  // accumulate so that every load contributes to the result
    }

    // Unsigned two's-complement addition yields the same bit pattern as signed
    // addition, and unsigned long long is the portable 64-bit atomicAdd overload.
    atomicAdd(&block_sum, static_cast<unsigned long long>(sum));
    __syncthreads();  // reached by every thread: not inside divergent control flow

    if (threadIdx.x == 0) {
        atomicAdd(reinterpret_cast<unsigned long long*>(sum_out), block_sum);
    }
}
/**
 * Timed read kernel: every thread walks `data` with a stride equal to the
 * total number of launched threads, sums the elements it visits, and the
 * per-block total is added into `*sum_out`.
 *
 * Publishing the sum is what keeps the loads alive; a sum that is never
 * stored lets the compiler remove the memory traffic being measured.
 *
 * Launch requirements: 1-D grid and 1-D block (only the .x components are used).
 *
 * @param data     input buffer holding at least `count` ints
 * @param sum_out  accumulator; the kernel only adds to it, so the caller must
 *                 initialise it before the launch
 * @param count    number of elements of `data` to read
 */
__global__ void read_kernel2(const int* __restrict__ data, long long* __restrict__ sum_out, size_t count) {
    // Per-block accumulator so that only one global atomic is issued per block.
    __shared__ unsigned long long block_total;
    if (threadIdx.x == 0) {
        block_total = 0;
    }
    __syncthreads();

    // Widen before multiplying: the 32-bit product can wrap for large launches.
    const size_t total_threads = static_cast<size_t>(gridDim.x) * blockDim.x;
    size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    long long sum = 0;
    while (i < count) {
        sum += data[i];
        i += total_threads;
    }

    // Unsigned two's-complement addition yields the same bit pattern as signed
    // addition, and unsigned long long is the portable 64-bit atomicAdd overload.
    atomicAdd(&block_total, static_cast<unsigned long long>(sum));
    __syncthreads();  // reached by every thread: not inside divergent control flow

    if (threadIdx.x == 0) {
        atomicAdd(reinterpret_cast<unsigned long long*>(sum_out), block_total);
    }
}
int main() {
int* data = nullptr;
long long* sum = nullptr;
hipError_t err = hipSuccess;
std::cout << "Allocating managed memory..." << std::endl;
HIP_CHECK(hipMallocManaged(&data, N * sizeof(int)));
HIP_CHECK(hipMallocManaged(&sum, sizeof(long long)));
*sum = 0;
std::cout << "Initializing data..." << std::endl;
for (size_t i = 0; i < N; ++i) data[i] = i % 13;
hipDeviceSynchronize(); // ensure init done
std::cout << "Prefetching to GPU..." << std::endl;
int device = 0;
hipGetDevice(&device);
hipMemPrefetchAsync(data, N * sizeof(int), device);
hipDeviceSynchronize();
std::cout << "Running warm-up bandwitdh read." << std::endl;
read_kernel<<<(N + 32 - 1) / 32, 32>>>(data, sum, N);
hipDeviceSynchronize();
std::vector<int> block_sizes = {8,16, 32,64,128};
for (int block_size : block_sizes) {
std::cout << "Launching kernel..." << std::endl;
auto start = std::chrono::high_resolution_clock::now();
int blocks = (N + block_size - 1) / block_size;
read_kernel2<<<blocks, block_size>>>(data, sum, N);
hipDeviceSynchronize();
auto end = std::chrono::high_resolution_clock::now();
double duration = std::chrono::duration<double>(end - start).count();
double gb = (N * sizeof(int)) / 1e9;
std::cout << "wrap size: " << block_size << std::endl;
std::cout << "Read BW: " << (gb / duration) << " GB/s" << std::endl;
}
hipFree(data);
hipFree(sum);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment