@cmdr2
Created February 24, 2025 07:44
Add two float16 tensors using ggml. Each tensor takes 1 GB of memory.

#include "ggml.h"
#include "ggml-cpu.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#include <vector>
#include <iostream>
#include <chrono>

ggml_backend_t backend = NULL;
ggml_gallocr_t allocr = NULL;

using namespace std::chrono;

void init_backend() {
#ifdef GGML_USE_CUDA
    fprintf(stderr, "%s: using CUDA backend\n", __func__);
    backend = ggml_backend_cuda_init(0); // init device 0
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
    }
#endif

    // fall back to the CPU backend if CUDA is unavailable or failed to initialize
    if (!backend) {
        backend = ggml_backend_cpu_init();
    }
}

void init_mem_allocator() {
    // graph allocator: places tensor data in the backend's default buffer type (VRAM for CUDA, RAM for CPU)
    allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
}

void predict() {
#ifdef GGML_USE_CUDA
    // Record the initial VRAM state (this query is only available with the CUDA backend)
    size_t free_mem_start, total_mem;
    ggml_backend_cuda_get_device_memory(0, &free_mem_start, &total_mem);
#endif

    // create a context (holds only tensor/graph metadata, since no_alloc is true)
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context* ctx = ggml_init(params);

    const int N = 1024 * 1024 * 500; // 500M fp16 elements per tensor => 1000 MB each

    // 1. Define the tensor variables
    struct ggml_tensor* a = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, N);
    struct ggml_tensor* b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, N);

    // 2. Define the computation graph
    struct ggml_tensor* result = ggml_add(ctx, a, b);

    struct ggml_cgraph* gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, result);

    // 3. Allocate memory for the tensor variables in the backend buffer, and assign the data
    ggml_gallocr_alloc_graph(allocr, gf);

    // Convert float data to ggml_fp16_t
    std::vector<ggml_fp16_t> a_data(N);
    std::vector<ggml_fp16_t> b_data(N);
    for (int i = 0; i < N; ++i) {
        a_data[i] = ggml_fp32_to_fp16(1.5f);
        b_data[i] = ggml_fp32_to_fp16(2.5f);
    }

    size_t size = N * ggml_type_size(GGML_TYPE_F16);
    printf("host mem usage: %zu MB\n", 2 * size / (1024 * 1024)); // one per array

    // copy the host data into the backend tensors
    ggml_backend_tensor_set(a, a_data.data(), 0, ggml_nbytes(a));
    ggml_backend_tensor_set(b, b_data.data(), 0, ggml_nbytes(b));

    // 4. Run the computation, and read the result
    auto start = high_resolution_clock::now();

    ggml_backend_graph_compute(backend, gf);

    auto stop = high_resolution_clock::now();
    std::cout << "Time taken: " << duration_cast<milliseconds>(stop - start).count() << " ms" << std::endl;

#ifdef GGML_USE_CUDA
    // Record the final VRAM state and report the peak usage (CUDA only)
    size_t free_mem_end;
    ggml_backend_cuda_get_device_memory(0, &free_mem_end, &total_mem);

    size_t peak_usage = free_mem_start - free_mem_end;
    printf("Peak VRAM usage: %f MB\n", peak_usage / (1024.0 * 1024.0));
#endif

    struct ggml_tensor* result_node = ggml_graph_node(gf, -1); // get the last node in the graph
    std::cout << "Output type: " << ggml_type_name(result_node->type) << std::endl;

    // create an array to store the result data
    int64_t n = ggml_nelements(result_node);
    std::vector<ggml_fp16_t> result_data(n);

    // copy the data from the backend memory into the result array
    ggml_backend_tensor_get(result_node, result_data.data(), 0, ggml_nbytes(result_node));

    // print the first few values (each should be 1.5 + 2.5 = 4)
    for (int i = 0; i < 10; i++) {
        std::cout << ggml_fp16_to_fp32(result_data[i]) << ", ";
    }
    std::cout << std::endl;
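
    // Optional: a minimal verification sketch (not part of the original gist).
    // Since 1.5, 2.5, and 4.0 are all exactly representable in fp16, every
    // element of the result should round-trip to exactly 4.0f.
    int mismatches = 0;
    for (int64_t i = 0; i < n; ++i) {
        if (ggml_fp16_to_fp32(result_data[i]) != 4.0f) {
            mismatches++;
        }
    }
    std::cout << "mismatches: " << mismatches << std::endl;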

    // free the context (the backend buffers are managed by the allocator and the backend)
    ggml_free(ctx);
}

int main(int argc, char* argv[]) {
    init_backend();
    init_mem_allocator();

    predict();

    // free the resources
    ggml_gallocr_free(allocr);
    ggml_backend_free(backend);

    return 0;
}