Last active
January 15, 2024 03:15
-
-
Save fengyuentau/20af905a3ee0dbf5da93c9befc6a9841 to your computer and use it in GitHub Desktop.
OpenCL Benchmark C++
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build script for the CLBlast GEMM benchmark executable (main.cpp).
cmake_minimum_required(VERSION 3.13)
project("CLBlast performance test")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# main.cpp calls the OpenCL C API directly (clWaitForEvents, clReleaseEvent) and
# includes CL/opencl.hpp, so OpenCL is mandatory and must be linked explicitly.
find_package(OpenCL REQUIRED)

# The HINTS below are machine-specific install prefixes; on other machines point
# CMake at the packages with -DCLBlast_DIR=... / -DOpenCV_DIR=... instead.
find_package(CLBlast HINTS "/home/opencv-cn/Workspace/others/CLBlast/build/install")
message(STATUS "CLBlast_FOUND=${CLBlast_FOUND}, CLBlast_INCLUDE_DIRS=${CLBlast_INCLUDE_DIRS}, CLBlast_LIBS=${CLBlast_LIBS}")

# Fail at configure time rather than with an obscure compile/link error later.
find_package(OpenCV 4.9.0 REQUIRED HINTS "/home/opencv-cn/Workspace/opencv/build/pre-4.9.0/install")
message(STATUS "OpenCV_FOUND=${OpenCV_FOUND}, OpenCV_INCLUDE_DIRS=${OpenCV_INCLUDE_DIRS}, OpenCV_LIBS=${OpenCV_LIBS}")

add_executable(main main.cpp)
target_compile_features(main PRIVATE cxx_std_11)
target_include_directories(main PRIVATE ${OpenCV_INCLUDE_DIRS})

if(NOT CLBlast_FOUND)
  # No CLBlast package config was found: keep the original behaviour of linking
  # the library by name and taking the headers from the known install prefix.
  target_include_directories(main PRIVATE "/home/opencv-cn/Workspace/others/CLBlast/build/install/include")
endif()

target_link_libraries(main PRIVATE clblast ${OpenCV_LIBS} OpenCL::OpenCL)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <numeric>
#include <string>
#include <utility>
#include <vector>
#include "opencv2/opencv.hpp"
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_TARGET_OPENCL_VERSION 120
#include "CL/opencl.hpp"
#include "clblast.h"
// Dimension list of one matrix operand, e.g. {rows, cols}.
using Shape = std::vector<int>;

// Describes one GEMM benchmark case: the shapes of the operands and whether
// A and/or B are to be treated as transposed.
struct TestGemmParam {
    Shape a;       // shape of operand A
    Shape b;       // shape of operand B
    Shape c;       // shape of operand C; empty when not specified
    bool trans_a;  // true if A is transposed
    bool trans_b;  // true if B is transposed

    // Deliberately not `explicit`: the test_configs table builds its entries
    // with copy-list-initialization ({ {..}, {..} }), which needs an implicit
    // constructor. The by-value shapes are moved into the members so each
    // argument is copied at most once.
    TestGemmParam(Shape A, Shape B, Shape C = {}, bool transA = false, bool transB = false)
        : a(std::move(A)), b(std::move(B)), c(std::move(C)), trans_a(transA), trans_b(transB) {}
};
// Benchmark cases, one entry per GEMM problem. The first shape is operand A,
// the second is operand B, and the optional third is the `c` shape.
// Transpose flags are left at their defaults (false) for every entry.
static const TestGemmParam test_configs[] = {
    // Square problems, with a `c` shape supplied.
    TestGemmParam(Shape{768, 768}, Shape{768, 768}, Shape{768}),
    TestGemmParam(Shape{1024, 1024}, Shape{1024, 1024}, Shape{1024}),
    // Rectangular problems, no `c` shape.
    TestGemmParam(Shape{50, 768}, Shape{768, 2304}),
    TestGemmParam(Shape{197, 768}, Shape{768, 2304}),
    TestGemmParam(Shape{50, 1024}, Shape{1024, 3072}),
    TestGemmParam(Shape{197, 1024}, Shape{1024, 3072}),
};
int main() { | |
// OpenCL platform | |
auto platforms = std::vector<cl::Platform>(); | |
cl::Platform::get(&platforms); | |
if (platforms.size() == 0) { | |
std::cerr << "Cannot get OpenCL platforms" << std::endl; | |
return 1; | |
} | |
for (size_t i = 0; i < platforms.size(); i++) { | |
std::string platform_name; | |
auto error = platforms[i].getInfo(CL_PLATFORM_NAME, &platform_name); | |
std::cout << platform_name << std::endl; | |
} | |
auto platform = platforms[2]; | |
// OpenCL device | |
auto devices = std::vector<cl::Device>(); | |
platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); | |
if (devices.size() == 0) { | |
std::cerr << "Cannot get OpenCL devices" << std::endl; | |
return 1; | |
} | |
auto device = devices[0]; | |
// OpenCL context, queue | |
auto context = cl::Context(std::vector<cl::Device>{device}); | |
auto queue = cl::CommandQueue(context, device); | |
for (auto config : test_configs) { | |
Shape a_shape = config.a; | |
Shape b_shape = config.b; | |
Shape c_shape = config.c; | |
bool trans_a = config.trans_a; | |
bool trans_b = config.trans_b; | |
int M = trans_a ? a_shape.back() : a_shape[0], | |
N = trans_b ? b_shape[0] : b_shape.back(), | |
K = trans_b ? b_shape.back() : b_shape[0], | |
lda = a_shape.back(), | |
ldb = b_shape.back(), | |
ldc = N; | |
cv::Mat A(a_shape, CV_32FC1), | |
B(b_shape, CV_32FC1); | |
auto C = c_shape.empty() ? cv::Mat::zeros(M, N, CV_32FC1) : cv::Mat(c_shape, CV_32FC1); | |
auto Y = cv::Mat(std::vector<int>{M, N}, CV_32FC1); | |
std::memset(Y.ptr<float>(), 0, Y.total() * sizeof(float)); | |
cv::randn(A, 0.f, 1.f); | |
cv::randn(B, 0.f, 1.f); | |
if (!c_shape.empty()) { | |
cv::randn(C, 0.f, 1.f); | |
} | |
// Copy cv::Mat to device | |
auto A_device = cl::Buffer(context, CL_MEM_READ_WRITE, A.total() * sizeof(float)); | |
auto B_device = cl::Buffer(context, CL_MEM_READ_WRITE, B.total() * sizeof(float)); | |
// auto C_device = cl::Buffer(context, CL_MEM_READ_WRITE, C.total() * sizeof(float)); | |
auto Y_device = cl::Buffer(context, CL_MEM_READ_WRITE, Y.total() * sizeof(float)); | |
queue.enqueueWriteBuffer(A_device, CL_TRUE, 0, A.total() * sizeof(float), A.ptr<const float>()); | |
queue.enqueueWriteBuffer(B_device, CL_TRUE, 0, B.total() * sizeof(float), B.ptr<const float>()); | |
// queue.enqueueWriteBuffer(C_device, CL_TRUE, 0, C.total() * sizeof(float), C.ptr<const float>()); | |
queue.enqueueWriteBuffer(Y_device, CL_TRUE, 0, Y.total() * sizeof(float), Y.ptr<const float>()); | |
auto event = cl_event{nullptr}; | |
// Warmup | |
auto queue_plain = queue(); | |
auto status = clblast::Gemm(clblast::Layout::kRowMajor, | |
clblast::Transpose::kNo, | |
clblast::Transpose::kNo, | |
M, N, K, | |
1.f, // alpha | |
A_device(), 0, lda, | |
B_device(), 0, ldb, | |
0.f, // beta | |
Y_device(), 0, ldc, | |
&queue_plain, &event); | |
if (status == clblast::StatusCode::kSuccess) { | |
clWaitForEvents(1, &event); | |
} | |
// Benchmark | |
std::vector<double> times; | |
cv::TickMeter meter; | |
for (int i = 0; i < 10; i++) { | |
meter.reset(); | |
meter.start(); | |
auto status = clblast::Gemm(clblast::Layout::kRowMajor, | |
clblast::Transpose::kNo, | |
clblast::Transpose::kNo, | |
M, N, K, | |
1.f, // alpha | |
A_device(), 0, lda, | |
B_device(), 0, ldb, | |
0.f, // beta | |
Y_device(), 0, ldc, | |
&queue_plain, &event); | |
if (status == clblast::StatusCode::kSuccess) { | |
clWaitForEvents(1, &event); | |
} | |
meter.stop(); | |
times.push_back(meter.getTimeMilli()); | |
} | |
meter.reset(); | |
clReleaseEvent(event); | |
clReleaseMemObject(A_device.get()); | |
clReleaseMemObject(B_device.get()); | |
clReleaseMemObject(Y_device.get()); | |
// Handle results | |
std::sort(times.begin(), times.end()); | |
double mean = std::accumulate(times.begin(), times.end(), decltype(times)::value_type(0)) / times.size(); | |
double median = (times[4] + times[5]) / 2; | |
double minimum = times[0]; | |
std::string str_a_shape = cv::format("[%d, %d]", a_shape[0], a_shape[1]); | |
std::string str_b_shape = cv::format("[%d, %d]", b_shape[0], b_shape[1]); | |
std::cout << cv::format("A=%s, B=%s, mean=%.2f, median=%.2f, min=%.2f\n", str_a_shape.c_str(), str_b_shape.c_str(), mean, median, minimum); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment