From 84c2a6d0d1934a0db8ed8c0a7bae1d59b320a22c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 Dec 2025 21:13:07 +0200 Subject: [PATCH 1/6] examples : add idle --- examples/CMakeLists.txt | 1 + examples/idle/CMakeLists.txt | 5 ++ examples/idle/README.md | 3 + examples/idle/idle.cpp | 110 +++++++++++++++++++++++++++++++++++ 4 files changed, 119 insertions(+) create mode 100644 examples/idle/CMakeLists.txt create mode 100644 examples/idle/README.md create mode 100644 examples/idle/idle.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index dab795fb90a..91797cf78a9 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,6 +20,7 @@ else() add_subdirectory(gguf-hash) add_subdirectory(gguf) + add_subdirectory(idle) add_subdirectory(lookahead) add_subdirectory(lookup) add_subdirectory(parallel) diff --git a/examples/idle/CMakeLists.txt b/examples/idle/CMakeLists.txt new file mode 100644 index 00000000000..d5018fec4b7 --- /dev/null +++ b/examples/idle/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-idle) +add_executable(${TARGET} idle.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/idle/README.md b/examples/idle/README.md new file mode 100644 index 00000000000..0aa3625f21f --- /dev/null +++ b/examples/idle/README.md @@ -0,0 +1,3 @@ +# llama.cpp/example/idle + + diff --git a/examples/idle/idle.cpp b/examples/idle/idle.cpp new file mode 100644 index 00000000000..8afa16289a0 --- /dev/null +++ b/examples/idle/idle.cpp @@ -0,0 +1,110 @@ +#include "arg.h" +#include "common.h" +#include "log.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include + +static void print_usage(int /*argc*/, char ** argv) { + printf("\nexample usage:\n"); + printf("\n %s -m model.gguf [-ngl n_gpu_layers]\n", argv[0]); + printf("\n"); +} + +int main(int argc, char ** argv) { + 
common_params params; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { + return 1; + } + + common_init(); + + // init LLM + + llama_backend_init(); + llama_numa_init(params.numa); + + // initialize the model + + llama_model_params model_params = common_model_params_to_llama(params); + + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); + + if (model == NULL) { + LOG_ERR("%s: error: unable to load model\n" , __func__); + return 1; + } + + const llama_vocab * vocab = llama_model_get_vocab(model); + + // we need just a dummy token to evaluate + std::vector prompt_tokens(1, llama_vocab_bos(vocab)); + + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = 512; + ctx_params.n_batch = 512; + ctx_params.no_perf = false; + + llama_context * ctx = llama_init_from_model(model, ctx_params); + if (ctx == NULL) { + fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + return 1; + } + + llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); + + const int n_iters = 10; + + // warm-up + llama_decode(ctx, batch); + llama_memory_clear(llama_get_memory(ctx), true); + llama_synchronize(ctx); + + for (int64_t t_pause_ms = 200; t_pause_ms <= 1800; t_pause_ms += 200) { + double t_sum_us = 0.0; + double t_sum2_us = 0.0; + + for (int i = 0; i < n_iters; i++) { + // this pause is important - it simulates "idle GPU" + std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms)); + + const int64_t t_start_us = llama_time_us(); + + // this should take constant time + llama_decode(ctx, batch); + llama_synchronize(ctx); + + const int64_t t_end_us = llama_time_us(); + + const double t_cur_us = t_end_us - t_start_us; + +#if 1 + // print individual decode times + printf(" - decode time: %8.2f ms\n", t_cur_us / 1000); +#endif + + t_sum_us += t_cur_us; + t_sum2_us += t_cur_us * t_cur_us; + + 
llama_memory_clear(llama_get_memory(ctx), true); + llama_synchronize(ctx); // just in case + } + + const double t_avg_us = t_sum_us / n_iters; + const double t_dev_us = sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1)); + + printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000); + fflush(stdout); + } + + llama_free(ctx); + llama_model_free(model); + + return 0; +} From a103ca1ec5248d4f093e45b91666d96f4d6995d4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 Dec 2025 21:11:02 +0200 Subject: [PATCH 2/6] metal : attach residency sets to queue --- ggml/src/ggml-metal/ggml-metal-device.m | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 0d5a9814c7c..f322418d02a 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1233,6 +1233,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, return NULL; } + [res->queue addResidencySet:res->rset]; + //ggml_metal_log_allocated_size(device, size_aligned); return res; @@ -1329,10 +1331,14 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s return NULL; } + [res->queue addResidencySet:res->rset]; + return res; } void ggml_metal_buffer_free(ggml_metal_buffer_t buf) { + [buf->queue removeResidencySet:buf->rset]; + for (int i = 0; i < buf->n_buffers; i++) { [buf->buffers[i].metal release]; } From 5074a2140e1d12a3dab572af4dba52fa6d142d68 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 Dec 2025 21:33:38 +0200 Subject: [PATCH 3/6] idle : add link --- examples/idle/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/idle/README.md b/examples/idle/README.md index 0aa3625f21f..d38e607994d 100644 --- a/examples/idle/README.md +++ b/examples/idle/README.md @@ -1,3 +1,3 @@ # 
llama.cpp/example/idle - +https://github.com/ggml-org/llama.cpp/pull/17766 From 5f52ca569f9684ccb03d4c9c65928407c0657ea9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 Dec 2025 10:29:24 +0200 Subject: [PATCH 4/6] idle : adjust intervals --- examples/idle/idle.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/idle/idle.cpp b/examples/idle/idle.cpp index 8afa16289a0..000427143db 100644 --- a/examples/idle/idle.cpp +++ b/examples/idle/idle.cpp @@ -59,14 +59,14 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); - const int n_iters = 10; + const int n_iters = 3; // warm-up llama_decode(ctx, batch); llama_memory_clear(llama_get_memory(ctx), true); llama_synchronize(ctx); - for (int64_t t_pause_ms = 200; t_pause_ms <= 1800; t_pause_ms += 200) { + for (int64_t t_pause_ms = 0; t_pause_ms <= 4000; t_pause_ms += 800) { double t_sum_us = 0.0; double t_sum2_us = 0.0; From b2d7fa663b9c86837ef177727f2c1c06e8168033 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 Dec 2025 10:29:34 +0200 Subject: [PATCH 5/6] metal : add residency sets keep-alive heartbeat --- ggml/src/ggml-metal/ggml-metal-context.m | 42 +++-- ggml/src/ggml-metal/ggml-metal-device.h | 15 ++ ggml/src/ggml-metal/ggml-metal-device.m | 187 +++++++++++++++++++---- 3 files changed, 198 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m index e66646284db..42a35736eea 100644 --- a/ggml/src/ggml-metal/ggml-metal-context.m +++ b/ggml/src/ggml-metal/ggml-metal-context.m @@ -24,9 +24,6 @@ }; struct ggml_metal { - id device; - id queue; // currently a pointer to the device queue, but might become separate queue [TAG_QUEUE_PER_BACKEND] - ggml_metal_device_t dev; ggml_metal_library_t lib; @@ -91,15 +88,15 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) { // init context ggml_metal_t res = calloc(1, sizeof(struct 
ggml_metal)); - res->device = ggml_metal_device_get_obj(dev); + id device = ggml_metal_device_get_obj(dev); - GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[res->device name] UTF8String]); + GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); // TODO: would it be better to have one queue for the backend and one queue for the device? // the graph encoders and async ops would use the backend queue while the sync ops would use the device queue? //res->queue = [device newCommandQueue]; [TAG_QUEUE_PER_BACKEND] - res->queue = ggml_metal_device_get_queue(dev); - if (res->queue == nil) { + id queue = ggml_metal_device_get_queue(dev); + if (queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; } @@ -274,7 +271,8 @@ static struct ggml_metal_buffer_id ggml_metal_get_buffer_id(const struct ggml_te void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @autoreleasepool { // wrap the source data into a Metal buffer - id buf_src = [ctx->device newBufferWithBytes:data + id device = ggml_metal_device_get_obj(ctx->dev); + id buf_src = [device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared]; @@ -289,7 +287,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, // queue the copy operation into the queue of the Metal context // this will be queued at the end, after any currently ongoing GPU operations - id cmd_buf = [ctx->queue commandBuffer]; + id queue = ggml_metal_device_get_queue(ctx->dev); + id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [encoder copyFromBuffer:buf_src @@ -315,7 +314,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { @autoreleasepool { - id 
buf_dst = [ctx->device newBufferWithBytesNoCopy:data + id device = ggml_metal_device_get_obj(ctx->dev); + id buf_dst = [device newBufferWithBytesNoCopy:data length:size options:MTLResourceStorageModeShared deallocator:nil]; @@ -331,7 +331,8 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te // queue the copy operation into the queue of the Metal context // this will be queued at the end, after any currently ongoing GPU operations - id cmd_buf = [ctx->queue commandBuffer]; + id queue = ggml_metal_device_get_queue(ctx->dev); + id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [encoder copyFromBuffer:bid_src.metal @@ -362,6 +363,9 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * // number of threads in addition to the main thread const int n_cb = ctx->n_cb; + // keep the memory wired + ggml_metal_device_rsets_keep_alive(ctx->dev); + // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes @@ -389,7 +393,8 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * if (!ctx->capture_started) { // create capture scope - ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device]; + id device = ggml_metal_device_get_obj(ctx->dev); + ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:device]; MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new]; descriptor.captureObject = ctx->capture_scope; @@ -406,10 +411,13 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * } } + // short-hand + id queue = ggml_metal_device_get_queue(ctx->dev); + // the main thread commits the first few commands immediately // 
cmd_buf[n_cb] { - id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + id cmd_buf = [queue commandBufferWithUnretainedReferences]; [cmd_buf retain]; if (ctx->cmd_bufs[n_cb].obj) { @@ -428,7 +436,7 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * // prepare the rest of the command buffers asynchronously (optional) // cmd_buf[0.. n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + id cmd_buf = [queue commandBufferWithUnretainedReferences]; [cmd_buf retain]; if (ctx->cmd_bufs[cb_idx].obj) { @@ -589,9 +597,11 @@ void ggml_metal_set_abort_callback(ggml_metal_t ctx, ggml_abort_callback abort_c } bool ggml_metal_supports_family(ggml_metal_t ctx, int family) { - GGML_ASSERT(ctx->device != nil); + GGML_ASSERT(ctx->dev != nil); + + id device = ggml_metal_device_get_obj(ctx->dev); - return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; + return [device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; } void ggml_metal_capture_next_compute(ggml_metal_t ctx) { diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index 4be0432ea76..77f2e98cfe8 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -186,6 +186,16 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_att int32_t dv, int32_t nwg); +// MTLResidencySet wrapper + +typedef void * ggml_metal_rset_t; + +// a collection of residency sets (non-owning) +typedef struct ggml_metal_rsets * ggml_metal_rsets_t; + +ggml_metal_rsets_t ggml_metal_rsets_init(void); +void ggml_metal_rsets_free(ggml_metal_rsets_t rsets); + // // device // @@ -219,6 +229,11 @@ void * ggml_metal_device_get_queue(ggml_metal_device_t dev); // id @@ -519,11 +518,101 @@ void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder) { // ref: https://github.com/ggml-org/llama.cpp/pull/15906 id mtl_queue; 
+ ggml_metal_rsets_t rsets; + ggml_metal_library_t library; struct ggml_metal_device_props props; }; +// +// MTLResidencySet wrapper +// + +struct ggml_metal_rsets { + NSLock * lock; + + NSMutableArray * data; + + // number of seconds since the last graph computation + // keep the residency sets wired for that amount of time to avoid being collected by the OS + int keep_alive_s; + + // background heartbeat thread to keep the residency sets alive + atomic_bool d_stop; + atomic_int d_loop; + + dispatch_group_t d_group; +}; + +ggml_metal_rsets_t ggml_metal_rsets_init(void) { + ggml_metal_rsets_t res = calloc(1, sizeof(struct ggml_metal_rsets)); + + res->lock = [[NSLock alloc] init]; + res->data = [[NSMutableArray alloc] init]; + + // by default keep the memory wired for half an hour + res->keep_alive_s = 30*60; + + const char * GGML_METAL_RESIDENCY_KEEP_ALIVE_S = getenv("GGML_METAL_RESIDENCY_KEEP_ALIVE_S"); + if (GGML_METAL_RESIDENCY_KEEP_ALIVE_S) { + res->keep_alive_s = atoi(GGML_METAL_RESIDENCY_KEEP_ALIVE_S); + } + + if (res->keep_alive_s <= 0) { + res->keep_alive_s = 30*60; + } + + GGML_LOG_INFO("%s: creating a residency set collection (keep_alive = %d s)\n", __func__, res->keep_alive_s); + + atomic_store_explicit(&res->d_stop, false, memory_order_relaxed); + atomic_store_explicit(&res->d_loop, 2*res->keep_alive_s, memory_order_relaxed); + + res->d_group = dispatch_group_create(); + + // start a background thread that periodically requests residency for all the currently active sets in the collection + // the requests stop after a certain amount of time (keep_alive_s) of inactivity + dispatch_queue_t d_queue = dispatch_get_global_queue(QOS_CLASS_DEFAULT, 0); + dispatch_group_async(res->d_group, d_queue, ^{ + while (!atomic_load_explicit(&res->d_stop, memory_order_relaxed)) { + if (atomic_load_explicit(&res->d_loop, memory_order_relaxed) > 0) { + [res->lock lock]; + + for (int i = 0; i < (int) res->data.count; ++i) { + [res->data[i] requestResidency]; + } + + 
atomic_fetch_sub_explicit(&res->d_loop, 1, memory_order_relaxed); + + [res->lock unlock]; + } + + // half a second + usleep(500 * 1000); + } + }); + + return res; +} + +void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) { + if (rsets == NULL) { + return; + } + + GGML_ASSERT([rsets->data count] == 0); + + atomic_store_explicit(&rsets->d_stop, true, memory_order_relaxed); + + dispatch_group_wait(rsets->d_group, DISPATCH_TIME_FOREVER); + dispatch_release(rsets->d_group); + + [rsets->data release]; + [rsets->lock release]; + + free(rsets); +} + ggml_metal_device_t ggml_metal_device_init(void) { ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device)); @@ -692,6 +781,13 @@ ggml_metal_device_t ggml_metal_device_init(void) { GGML_LOG_ERROR("%s: error: failed to create library\n", __func__); } + if (dev->props.use_residency_sets) { + dev->rsets = ggml_metal_rsets_init(); + } else { + dev->rsets = nil; + } + + // -------------------------------------------------- // print MTL GPU family: @@ -745,6 +841,8 @@ ggml_metal_device_t ggml_metal_device_init(void) { void ggml_metal_device_free(ggml_metal_device_t dev) { assert(dev != NULL); + ggml_metal_rsets_free(dev->rsets); + ggml_metal_library_free(dev->library); dev->library = NULL; @@ -773,6 +871,42 @@ ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev) { return dev->library; } +void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset) { + if (rset == nil) { + return; + } + + GGML_ASSERT(dev->rsets); + + [dev->rsets->lock lock]; + + [dev->rsets->data addObject:rset]; + + [dev->rsets->lock unlock]; +} + +void ggml_metal_device_rsets_rm(ggml_metal_device_t dev, ggml_metal_rset_t rset) { + if (rset == nil) { + return; + } + + GGML_ASSERT(dev->rsets); + + [dev->rsets->lock lock]; + + [dev->rsets->data removeObject:rset]; + + [dev->rsets->lock unlock]; +} + +void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) { + if (dev->rsets == NULL) { + return; + } + + 
atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed); +} + void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) { if (@available(macOS 10.12, iOS 16.0, *)) { *total = dev->mtl_device.recommendedMaxWorkingSetSize; @@ -1066,9 +1200,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te // note: cannot use explicity "id" here because it is not available on certain OSes id rset; - // pointers to global device objects - id device; - id queue; + // pointers to global device + ggml_metal_device_t dev; }; static void ggml_metal_log_allocated_size(id device, size_t size_aligned) { @@ -1111,7 +1244,7 @@ static bool ggml_metal_buffer_rset_init(ggml_metal_buffer_t buf) { desc.initialCapacity = buf->n_buffers; NSError * error; - buf->rset = [buf->device newResidencySetWithDescriptor:desc error:&error]; + buf->rset = [buf->dev->mtl_device newResidencySetWithDescriptor:desc error:&error]; if (error) { GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); [desc release]; @@ -1172,6 +1305,8 @@ static void ggml_metal_buffer_rset_free(ggml_metal_buffer_t buf) { ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) { ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer)); + res->dev = dev; + const size_t size_page = sysconf(_SC_PAGESIZE); size_t size_aligned = size; @@ -1196,9 +1331,6 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, res->owned = true; - res->device = ggml_metal_device_get_obj(dev); - res->queue = ggml_metal_device_get_queue(dev); - res->n_buffers = 1; if (res->all_data != NULL) { @@ -1207,12 +1339,12 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, if (size_aligned > 0) { if (props_dev->use_shared_buffers && shared) { - res->buffers[0].metal = [res->device newBufferWithBytesNoCopy:res->all_data + 
res->buffers[0].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:res->all_data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; } else { - res->buffers[0].metal = [res->device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate]; + res->buffers[0].metal = [res->dev->mtl_device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate]; } } @@ -1233,7 +1365,7 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, return NULL; } - [res->queue addResidencySet:res->rset]; + ggml_metal_device_rsets_add(dev, res->rset); //ggml_metal_log_allocated_size(device, size_aligned); @@ -1243,6 +1375,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) { ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer)); + res->dev = dev; + res->all_data = ptr; res->all_size = size; @@ -1265,9 +1399,6 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s size_aligned += (size_page - (size_aligned % size_page)); } - res->device = ggml_metal_device_get_obj(dev); - res->queue = ggml_metal_device_get_queue(dev); - const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev); // the buffer fits into the max buffer size allowed by the device @@ -1277,7 +1408,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s res->buffers[res->n_buffers].metal = nil; if (size_aligned > 0) { - res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; + res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (res->buffers[res->n_buffers].metal == nil) { GGML_LOG_ERROR("%s: 
error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); @@ -1286,7 +1417,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s } } - ggml_metal_log_allocated_size(res->device, size_aligned); + ggml_metal_log_allocated_size(res->dev->mtl_device, size_aligned); ++res->n_buffers; } else { @@ -1304,7 +1435,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s res->buffers[res->n_buffers].metal = nil; if (size_step_aligned > 0) { - res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; + res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (res->buffers[res->n_buffers].metal == nil) { GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); @@ -1313,7 +1444,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s } } - ggml_metal_log_allocated_size(res->device, size_step_aligned); + ggml_metal_log_allocated_size(res->dev->mtl_device, size_step_aligned); if (i + size_step < size) { GGML_LOG_INFO("\n"); @@ -1331,13 +1462,13 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s return NULL; } - [res->queue addResidencySet:res->rset]; + ggml_metal_device_rsets_add(dev, res->rset); return res; } void ggml_metal_buffer_free(ggml_metal_buffer_t buf) { - [buf->queue removeResidencySet:buf->rset]; + ggml_metal_device_rsets_rm(buf->dev, buf->rset); for (int i = 0; i < buf->n_buffers; i++) { [buf->buffers[i].metal release]; @@ -1375,8 +1506,7 @@ void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor struct ggml_metal_buffer_id bid_dst = 
ggml_metal_buffer_get_id(buf, tensor); bid_dst.offs += offset; - id queue = buf->queue; - id cmd_buf = [queue commandBufferWithUnretainedReferences]; + id cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences]; { id encoder = [cmd_buf blitCommandEncoder]; @@ -1402,7 +1532,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * @autoreleasepool { // src void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data - id buf_src = [buf->device newBufferWithBytesNoCopy:data_ptr + id buf_src = [buf->dev->mtl_device newBufferWithBytesNoCopy:data_ptr length:size options:MTLResourceStorageModeShared deallocator:nil]; @@ -1417,8 +1547,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * // this is alternative to waitUntilCompleted, which should be faster, but don't seem to make much difference dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0); - id queue = buf->queue; - id cmd_buf = [queue commandBufferWithUnretainedReferences]; + id cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences]; { id encoder = [cmd_buf blitCommandEncoder]; @@ -1460,15 +1589,14 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten bid_src.offs += offset; // dst - id buf_dst = [buf->device newBufferWithBytesNoCopy:data + id buf_dst = [buf->dev->mtl_device newBufferWithBytesNoCopy:data length:size options:MTLResourceStorageModeShared deallocator:nil]; GGML_ASSERT(buf_dst); - id queue = buf->queue; - id cmd_buf = [queue commandBufferWithUnretainedReferences]; + id cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences]; { id encoder = [cmd_buf blitCommandEncoder]; @@ -1494,8 +1622,7 @@ void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) { } @autoreleasepool { - id queue = buf->queue; - id cmd_buf = [queue commandBufferWithUnretainedReferences]; + id cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences]; 
{ id encoder = [cmd_buf blitCommandEncoder]; From df4f195b5388ec86e8f41bbd7ef119edb76bcf89 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 Dec 2025 12:15:16 +0200 Subject: [PATCH 6/6] cont : adjust default keep-alive time --- ggml/src/ggml-metal/ggml-metal-device.m | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index f57801e0f7d..c53ec506cd1 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -551,8 +551,8 @@ ggml_metal_rsets_t ggml_metal_rsets_init(void) { res->lock = [[NSLock alloc] init]; res->data = [[NSMutableArray alloc] init]; - // by default keep the memory wired for half an hour - res->keep_alive_s = 30*60; + // by default keep the memory wired for 3 minutes + res->keep_alive_s = 3*60; const char * GGML_METAL_RESIDENCY_KEEP_ALIVE_S = getenv("GGML_METAL_RESIDENCY_KEEP_ALIVE_S"); if (GGML_METAL_RESIDENCY_KEEP_ALIVE_S) { @@ -560,7 +560,7 @@ ggml_metal_rsets_t ggml_metal_rsets_init(void) { } if (res->keep_alive_s <= 0) { - res->keep_alive_s = 30*60; + res->keep_alive_s = 3*60; } GGML_LOG_INFO("%s: creating a residency set collection (keep_alive = %d s)\n", __func__, res->keep_alive_s);