Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions src/llama-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3493,11 +3493,11 @@ void llama_perf_context_reset(llama_context * ctx) {
ctx->perf_reset();
}

void llama_memory_breakdown_print(const struct llama_context * ctx) {
const auto & devices = ctx->get_model().devices;

std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();

void llama_memory_breakdown_print_impl(
const struct llama_context * ctx,
const std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> & memory_breakdown) {
const auto & devices = ctx->get_model().devices;
std::vector<std::array<std::string, 9>> table_data;
table_data.reserve(devices.size());
const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
Expand Down Expand Up @@ -3629,6 +3629,11 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
}
}

// Public entry point: computes the per-buffer-type memory breakdown for the
// context and forwards it to the shared printing implementation.
void llama_memory_breakdown_print(const struct llama_context * ctx) {
    llama_memory_breakdown_print_impl(ctx, ctx->memory_breakdown());
}

//
// training
//
Expand Down
4 changes: 4 additions & 0 deletions src/llama-context.h
Original file line number Diff line number Diff line change
Expand Up @@ -357,3 +357,7 @@ struct llama_context {

mutable int32_t n_reused = 0; // number of times the previous graph was reused
};

// Prints a memory-breakdown table for `ctx` using an already-computed
// per-buffer-type breakdown, so callers that have the data on hand
// (e.g. llama_get_device_memory_data in llama.cpp) can print it without
// querying the context again; llama_memory_breakdown_print() is the
// convenience wrapper that computes the breakdown itself.
void llama_memory_breakdown_print_impl(
    const struct llama_context * ctx,
    const std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> & memory_breakdown);
2 changes: 1 addition & 1 deletion src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
hp_n_ctx_train = model->hparams.n_ctx_train;
hp_n_expert = model->hparams.n_expert;

llama_memory_breakdown_print(ctx); // goes to debug log
llama_memory_breakdown_print_impl(ctx, memory_breakdown); // goes to debug log

llama_free(ctx);
llama_model_free(model);
Expand Down