1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -20,6 +20,7 @@ else()

    add_subdirectory(gguf-hash)
    add_subdirectory(gguf)
    add_subdirectory(idle)
    add_subdirectory(lookahead)
    add_subdirectory(lookup)
    add_subdirectory(parallel)
5 changes: 5 additions & 0 deletions examples/idle/CMakeLists.txt
@@ -0,0 +1,5 @@
set(TARGET llama-idle)
add_executable(${TARGET} idle.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
3 changes: 3 additions & 0 deletions examples/idle/README.md
@@ -0,0 +1,3 @@
# llama.cpp/example/idle

https://github.com/ggml-org/llama.cpp/pull/17766
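In brief (an editorial summary of the code below, since the README itself only links to the PR): the example loads a model, then repeatedly times a single-token `llama_decode` after the GPU has been left idle for a growing pause (0 to 4000 ms in 800 ms steps) and prints the average decode time for each pause length, exposing any slowdown caused by the GPU dropping out of its high-performance state while idle. Per the CMake target above, the binary is installed as `llama-idle`, and its usage message is `llama-idle -m model.gguf [-ngl n_gpu_layers]`.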
110 changes: 110 additions & 0 deletions examples/idle/idle.cpp
@@ -0,0 +1,110 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"

#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <string>
#include <thread>
#include <vector>

static void print_usage(int /*argc*/, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n %s -m model.gguf [-ngl n_gpu_layers]\n", argv[0]);
    printf("\n");
}

int main(int argc, char ** argv) {
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

    common_init();

    // init LLM

    llama_backend_init();
    llama_numa_init(params.numa);

    // initialize the model

    llama_model_params model_params = common_model_params_to_llama(params);

    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

    if (model == NULL) {
        LOG_ERR("%s: error: unable to load model\n" , __func__);
        return 1;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    // we need just a dummy token to evaluate
    std::vector<llama_token> prompt_tokens(1, llama_vocab_bos(vocab));

    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx = 512;
    ctx_params.n_batch = 512;
    ctx_params.no_perf = false;

    llama_context * ctx = llama_init_from_model(model, ctx_params);
    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());

    const int n_iters = 3;

    // warm-up
    llama_decode(ctx, batch);
    llama_memory_clear(llama_get_memory(ctx), true);
    llama_synchronize(ctx);

    for (int64_t t_pause_ms = 0; t_pause_ms <= 4000; t_pause_ms += 800) {
        double t_sum_us = 0.0;
        double t_sum2_us = 0.0;

        for (int i = 0; i < n_iters; i++) {
            // this pause is important - it simulates "idle GPU"
            std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms));

            const int64_t t_start_us = llama_time_us();

            // this should take constant time
            llama_decode(ctx, batch);
            llama_synchronize(ctx);

            const int64_t t_end_us = llama_time_us();

            const double t_cur_us = t_end_us - t_start_us;

#if 1
            // print individual decode times
            printf(" - decode time: %8.2f ms\n", t_cur_us / 1000);
#endif

            t_sum_us += t_cur_us;
            t_sum2_us += t_cur_us * t_cur_us;

            llama_memory_clear(llama_get_memory(ctx), true);
            llama_synchronize(ctx); // just in case
        }

        const double t_avg_us = t_sum_us / n_iters;
        const double t_dev_us = sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1));

        printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000);
        fflush(stdout);
    }

    llama_free(ctx);
    llama_model_free(model);

    return 0;
}
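For reference (an editorial note, not part of the PR): the `t_avg_us`/`t_dev_us` values above are the sample mean and sample standard deviation computed from the running sums. With \(N\) = `n_iters` decode times \(t_i\),

$$
\bar t = \frac{1}{N}\sum_{i=1}^{N} t_i,
\qquad
s = \sqrt{\frac{\sum_{i=1}^{N} t_i^2 \;-\; N\,\bar t^{\,2}}{N-1}},
$$

which is exactly what `sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1))` evaluates.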
42 changes: 26 additions & 16 deletions ggml/src/ggml-metal/ggml-metal-context.m
@@ -24,9 +24,6 @@
};

struct ggml_metal {
id<MTLDevice> device;
id<MTLCommandQueue> queue; // currently a pointer to the device queue, but might become separate queue [TAG_QUEUE_PER_BACKEND]

ggml_metal_device_t dev;
ggml_metal_library_t lib;

@@ -91,15 +88,15 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
// init context
ggml_metal_t res = calloc(1, sizeof(struct ggml_metal));

res->device = ggml_metal_device_get_obj(dev);
id<MTLDevice> device = ggml_metal_device_get_obj(dev);

GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[res->device name] UTF8String]);
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);

// TODO: would it be better to have one queue for the backend and one queue for the device?
// the graph encoders and async ops would use the backend queue while the sync ops would use the device queue?
//res->queue = [device newCommandQueue]; [TAG_QUEUE_PER_BACKEND]
res->queue = ggml_metal_device_get_queue(dev);
if (res->queue == nil) {
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(dev);
if (queue == nil) {
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
return NULL;
}
@@ -274,7 +271,8 @@ static struct ggml_metal_buffer_id ggml_metal_get_buffer_id(const struct ggml_te
void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@autoreleasepool {
// wrap the source data into a Metal buffer
id<MTLBuffer> buf_src = [ctx->device newBufferWithBytes:data
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
id<MTLBuffer> buf_src = [device newBufferWithBytes:data
length:size
options:MTLResourceStorageModeShared];

@@ -289,7 +287,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,

// queue the copy operation into the queue of the Metal context
// this will be queued at the end, after any currently ongoing GPU operations
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

[encoder copyFromBuffer:buf_src
@@ -315,7 +314,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,

void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@autoreleasepool {
id<MTLBuffer> buf_dst = [ctx->device newBufferWithBytesNoCopy:data
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
length:size
options:MTLResourceStorageModeShared
deallocator:nil];
@@ -331,7 +331,8 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te

// queue the copy operation into the queue of the Metal context
// this will be queued at the end, after any currently ongoing GPU operations
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

[encoder copyFromBuffer:bid_src.metal
@@ -362,6 +363,9 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
// number of threads in addition to the main thread
const int n_cb = ctx->n_cb;

// keep the memory wired
ggml_metal_device_rsets_keep_alive(ctx->dev);

// submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
// the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
// while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
@@ -389,7 +393,8 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *

if (!ctx->capture_started) {
// create capture scope
ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device];
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:device];

MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
descriptor.captureObject = ctx->capture_scope;
@@ -406,10 +411,13 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
}
}

// short-hand
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);

// the main thread commits the first few commands immediately
// cmd_buf[n_cb]
{
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
[cmd_buf retain];

if (ctx->cmd_bufs[n_cb].obj) {
@@ -428,7 +436,7 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
// prepare the rest of the command buffers asynchronously (optional)
// cmd_buf[0.. n_cb)
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
[cmd_buf retain];

if (ctx->cmd_bufs[cb_idx].obj) {
@@ -589,9 +597,11 @@ void ggml_metal_set_abort_callback(ggml_metal_t ctx, ggml_abort_callback abort_c
}

bool ggml_metal_supports_family(ggml_metal_t ctx, int family) {
GGML_ASSERT(ctx->device != nil);
GGML_ASSERT(ctx->dev != nil);

id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);

return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
return [device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
}

void ggml_metal_capture_next_compute(ggml_metal_t ctx) {
15 changes: 15 additions & 0 deletions ggml/src/ggml-metal/ggml-metal-device.h
@@ -186,6 +186,16 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_att
int32_t dv,
int32_t nwg);

// MTLResidencySet wrapper

typedef void * ggml_metal_rset_t;

// a collection of residency sets (non-owning)
typedef struct ggml_metal_rsets * ggml_metal_rsets_t;

ggml_metal_rsets_t ggml_metal_rsets_init(void);
void ggml_metal_rsets_free(ggml_metal_rsets_t rsets);

//
// device
//
@@ -219,6 +229,11 @@ void * ggml_metal_device_get_queue(ggml_metal_device_t dev); // id<MTLCommandQue

ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev);

void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset);
void ggml_metal_device_rsets_rm (ggml_metal_device_t dev, ggml_metal_rset_t rset);

void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev);

void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total);
bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op);
