Add two float16 tensors using ggml. Each tensor takes 1 GB of memory.
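Every element of a is set to 1.5 and every element of b to 2.5, so each printed element of the result should be 4.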
#include "ggml.h" | |
#include "ggml-cpu.h" | |
#ifdef GGML_USE_CUDA | |
#include "ggml-cuda.h" | |
#endif | |
#include <vector> | |
#include <iostream> | |
#include <chrono> | |
ggml_backend_t backend = NULL; | |
ggml_gallocr_t allocr = NULL; | |
using namespace std::chrono; | |
void init_backend() {
#ifdef GGML_USE_CUDA
    fprintf(stderr, "%s: using CUDA backend\n", __func__);
    backend = ggml_backend_cuda_init(0); // init device 0
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
    }
#endif

    // fall back to the CPU backend if CUDA is unavailable
    if (!backend) {
        backend = ggml_backend_cpu_init();
    }
}
// the graph allocator places tensor data in the backend's default buffer type
// (device memory for CUDA, regular RAM for the CPU backend)
void init_mem_allocator() {
    allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
}
void predict() {
    // Record initial VRAM state (only available on the CUDA backend)
    size_t free_mem_start = 0, total_mem = 0;
#ifdef GGML_USE_CUDA
    ggml_backend_cuda_get_device_memory(0, &free_mem_start, &total_mem);
#endif

    // create a context
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
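    // with no_alloc = true the context only stores tensor and graph metadata;
    // the tensor data itself is allocated later by the graph allocator on the backend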
    struct ggml_context* ctx = ggml_init(params);

    const int N = 1024 * 1024 * 500;
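    // 500 * 1024 * 1024 fp16 elements * 2 bytes each = 1000 MiB, i.e. roughly 1 GB per tensor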

    // 1. Define the tensor variables
    struct ggml_tensor* a = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, N);
    struct ggml_tensor* b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, N);

    // 2. Define the computation graph
    struct ggml_tensor* result = ggml_add(ctx, a, b);

    struct ggml_cgraph* gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, result);

    // 3. Allocate memory for the tensor variables, and assign the data
    ggml_gallocr_alloc_graph(allocr, gf);
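    // this allocates backend buffers (device memory on CUDA) for the tensors in the
    // graph; the allocator may reuse buffers between nodes where it can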

    // Convert float data to ggml_fp16_t
    std::vector<ggml_fp16_t> a_data(N);
    std::vector<ggml_fp16_t> b_data(N);
    for (int i = 0; i < N; ++i) {
        a_data[i] = ggml_fp32_to_fp16(1.5f);
        b_data[i] = ggml_fp32_to_fp16(2.5f);
    }

    size_t size = N * ggml_type_size(GGML_TYPE_F16);
    printf("host mem usage: %zu MB\n", 2 * size / (1024 * 1024)); // one per array

    ggml_backend_tensor_set(a, a_data.data(), 0, ggml_nbytes(a));
    ggml_backend_tensor_set(b, b_data.data(), 0, ggml_nbytes(b));
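    // ggml_backend_tensor_set copies the staging vectors from host memory into the
    // backend buffers (a host-to-device upload when the CUDA backend is active)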

    // 4. Run the computation, and read the result
    auto start = high_resolution_clock::now();

    ggml_backend_graph_compute(backend, gf);

    auto stop = high_resolution_clock::now();
    std::cout << "Time taken: " << duration_cast<milliseconds>(stop - start).count() << " ms" << std::endl;

#ifdef GGML_USE_CUDA
    // Record final VRAM state and report peak usage
    size_t free_mem_end = 0;
    ggml_backend_cuda_get_device_memory(0, &free_mem_end, &total_mem);

    size_t peak_usage = free_mem_start - free_mem_end;
    printf("Peak VRAM usage: %f MB\n", peak_usage / (1024.0 * 1024.0));
#endif

    struct ggml_tensor* result_node = ggml_graph_node(gf, -1); // get the last node in the graph

    std::cout << "Output type: " << result_node->type << std::endl;

    // create an array to store the result data
    int n = ggml_nelements(result_node);
    std::vector<ggml_fp16_t> result_data(n);

    // copy the data from the backend memory into the result array
    ggml_backend_tensor_get(result_node, result_data.data(), 0, ggml_nbytes(result_node));

    // print the data
    for (int i = 0; i < 10; i++) {
        std::cout << ggml_fp16_to_fp32(result_data[i]) << ", ";
    }
    std::cout << std::endl;

    // free the resources
    ggml_free(ctx);
}

int main(int argc, char* argv[]) {
    init_backend();
    init_mem_allocator();

    predict();

    // free the resources
    ggml_gallocr_free(allocr);
    ggml_backend_free(backend);

    return 0;
}