From 403652bf7a683363579fbc0fd0e2e9f3446f77ae Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Mon, 1 Dec 2025 19:25:52 -0600 Subject: [PATCH 1/2] vulkan: perf_logger improvements - Move perf_logger from device to ctx. - Add an env var to control the frequency we dump the stats. If you set a very large value, it just dumps when the ctx is destroyed. - Add a fusion info string to the tracking, only log one item per fused op. - Fix MUL_MAT_ID flops calculation. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 120 +++++++++++++++++++-------- 1 file changed, 85 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index f917a745d5..b1d2b4c0d3 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -769,11 +769,6 @@ struct vk_device_struct { std::unique_ptr memory_logger; #endif - // for GGML_VK_PERF_LOGGER - std::unique_ptr perf_logger; - vk::QueryPool query_pool; - int32_t num_queries; - ~vk_device_struct() { VK_LOG_DEBUG("destroy device " << name); @@ -1564,12 +1559,21 @@ class vk_memory_logger { #define VK_LOG_MEMORY(msg) ((void) 0) #endif // GGML_VULKAN_MEMORY_DEBUG +static bool vk_perf_logger_enabled = false; +// number of calls between perf logger prints +static uint32_t vk_perf_logger_frequency = 1; + class vk_perf_logger { public: - void print_timings() { + void print_timings(bool force = false) { if (timings.empty()) { return; } + print_count++; + if ((print_count % vk_perf_logger_frequency) != 0 && !force) { + return; + } + print_count = 0; uint64_t total_all_op_times = 0; std::cerr << "----------------\nVulkan Timings:" << std::endl; for (const auto & t : timings) { @@ -1606,16 +1610,20 @@ class vk_perf_logger { flops.clear(); } - void log_timing(const ggml_tensor * node, uint64_t time) { + void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) { + std::string fusion_str; + if (fusion_name) { + fusion_str = fusion_name + std::string(" "); + } if (node->op == GGML_OP_UNARY) { - timings[ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time); + timings[fusion_str + ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time); return; } if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) { - const uint64_t m = node->src[0]->ne[1]; - const uint64_t n = (node->op == GGML_OP_MUL_MAT) ? node->ne[1] : node->ne[2]; + const uint64_t m = node->ne[0]; + const uint64_t n = node->ne[1]; const uint64_t k = node->src[1]->ne[0]; - const uint64_t batch = node->src[1]->ne[2] * node->src[1]->ne[3]; + const uint64_t batch = node->ne[2] * node->ne[3]; std::string name = ggml_op_name(node->op); if ((node->op == GGML_OP_MUL_MAT && n <= mul_mat_vec_max_cols) || (node->op == GGML_OP_MUL_MAT_ID && node->src[2]->ne[1] == 1)) { @@ -1624,9 +1632,13 @@ class vk_perf_logger { name += " "; name += ggml_type_name(node->src[0]->type); name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k); + if (node->op == GGML_OP_MUL_MAT_ID) { + name += " n_expert=" + std::to_string(node->src[0]->ne[2]); + } if (batch > 1) { name += " batch=" + std::to_string(batch); } + name = fusion_str + name; timings[name].push_back(time); flops[name].push_back(m * n * (k + (k - 1)) * batch); return; @@ -1648,6 +1660,7 @@ class vk_perf_logger { uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1)); name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) + ", N=N*OW*OH=" + std::to_string(size_N); + name = fusion_str + name; flops[name].push_back(n_flops); timings[name].push_back(time); return; @@ -1655,6 +1668,7 @@ class vk_perf_logger { if (node->op == GGML_OP_RMS_NORM) { std::string name = ggml_op_name(node->op); name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")"; + name = fusion_str + name; timings[name].push_back(time); return; } @@ -1665,6 +1679,7 @@ class vk_perf_logger { const ggml_tensor * v = node->src[2]; const ggml_tensor * m = node->src[3]; std::stringstream name; + name << fusion_str; name << ggml_op_name(node->op) << " dst(" << dst->ne[0] << "," << dst->ne[1] << "," << dst->ne[2] << "," << dst->ne[3] << "), " << " q(" << q->ne[0] << "," << q->ne[1] << "," << q->ne[2] << "," << q->ne[3] << "), " << @@ -1676,17 +1691,19 @@ class vk_perf_logger { } if (node->op == GGML_OP_TOP_K) { std::stringstream name; + name << fusion_str; name << ggml_op_name(node->op) << " K=" << node->ne[0] << " (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")"; timings[name.str()].push_back(time); return; } - timings[ggml_op_name(node->op)].push_back(time); + timings[fusion_str + ggml_op_name(node->op)].push_back(time); } private: std::map> timings; std::map> flops; + uint32_t print_count {}; }; struct ggml_backend_vk_context { @@ -1740,6 +1757,14 @@ struct ggml_backend_vk_context { // Bit 'i' means nodes[start_of_fusion + i] writes to memory. // If there's no fusion, bit 0 is still set. int fused_ops_write_mask {}; + + // for GGML_VK_PERF_LOGGER + std::unique_ptr perf_logger; + vk::QueryPool query_pool; + std::vector query_fusion_names; + std::vector query_nodes; + int32_t num_queries {}; + int32_t query_idx {}; }; static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT @@ -1865,8 +1890,6 @@ struct vk_instance_t { static bool vk_instance_initialized = false; static vk_instance_t vk_instance; -static bool vk_perf_logger_enabled = false; - #ifdef GGML_VULKAN_CHECK_RESULTS static size_t vk_skip_checks; static size_t vk_output_tensor; @@ -4258,9 +4281,6 @@ static vk_device ggml_vk_get_device(size_t idx) { #ifdef GGML_VULKAN_MEMORY_DEBUG device->memory_logger = std::unique_ptr(new vk_memory_logger()); #endif - if (vk_perf_logger_enabled) { - device->perf_logger = std::unique_ptr(new vk_perf_logger()); - } size_t dev_num = vk_instance.device_indices[idx]; @@ -5195,6 +5215,11 @@ static void ggml_vk_instance_init() { } vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr; + const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY"); + + if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) { + vk_perf_logger_frequency = std::stoul(GGML_VK_PERF_LOGGER_FREQUENCY); + } // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers- VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance); @@ -5372,6 +5397,10 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue); ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue); + if (vk_perf_logger_enabled) { + ctx->perf_logger = std::unique_ptr(new vk_perf_logger()); + } + #ifdef GGML_VULKAN_CHECK_RESULTS const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS"); vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks)); @@ -12351,6 +12380,9 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ctx->compute_cmd_pool.destroy(ctx->device->device); ctx->transfer_cmd_pool.destroy(ctx->device->device); + if (vk_perf_logger_enabled) { + ctx->perf_logger->print_timings(true); + } } static int ggml_vk_get_device_count() { @@ -13149,24 +13181,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg vk_context compute_ctx; if (vk_perf_logger_enabled) { // allocate/resize the query pool - if (ctx->device->num_queries < cgraph->n_nodes + 1) { - if (ctx->device->query_pool) { - ctx->device->device.destroyQueryPool(ctx->device->query_pool); + if (ctx->num_queries < cgraph->n_nodes + 1) { + if (ctx->query_pool) { + ctx->device->device.destroyQueryPool(ctx->query_pool); } vk::QueryPoolCreateInfo query_create_info; query_create_info.queryType = vk::QueryType::eTimestamp; query_create_info.queryCount = cgraph->n_nodes + 100; - ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info); - ctx->device->num_queries = query_create_info.queryCount; + ctx->query_pool = ctx->device->device.createQueryPool(query_create_info); + ctx->num_queries = query_create_info.queryCount; + ctx->query_fusion_names.resize(cgraph->n_nodes); + ctx->query_nodes.resize(cgraph->n_nodes); } - ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1); + ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1); + std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr); + std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr); GGML_ASSERT(ctx->compute_ctx.expired()); compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ctx->compute_ctx = compute_ctx; ggml_vk_ctx_begin(ctx->device, compute_ctx); - compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0); + ctx->query_idx = 0; + compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); } ctx->prealloc_y_last_pipeline_used = nullptr; @@ -13207,52 +13244,66 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg total_mul_mat_bytes += bytes; } + const char *fusion_string {}; if (!ctx->device->disable_fusion) { uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i); if (num_adds) { ctx->num_additional_fused_ops = num_adds - 1; + fusion_string = "MULTI_ADD"; } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_ADD })) { ctx->num_additional_fused_ops = 2; + fusion_string = "MUL_MAT_ADD_ADD"; } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) { ctx->num_additional_fused_ops = 1; + fusion_string = "MUL_MAT_ADD"; } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL })) { ctx->num_additional_fused_ops = 2; + fusion_string = "MUL_MAT_ID_ADD_ID_MUL"; } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) { ctx->num_additional_fused_ops = 1; + fusion_string = "MUL_MAT_ID_ADD_ID"; } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_MUL })) { ctx->num_additional_fused_ops = 1; + fusion_string = "MUL_MAT_ID_MUL"; } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 4 }) && ggml_check_edges(cgraph, i, rms_norm_mul_rope_view_set_rows_edges) && ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i) && ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i + 2)) { ctx->num_additional_fused_ops = 4; + fusion_string = "RMS_NORM_MUL_ROPE_VIEW_SET_ROWS"; } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE })&& ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i)) { ctx->num_additional_fused_ops = 2; + fusion_string = "RMS_NORM_MUL_ROPE"; } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { ctx->num_additional_fused_ops = 1; + fusion_string = "RMS_NORM_MUL"; } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) && ggml_check_edges(cgraph, i, rope_view_set_rows_edges) && ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) { ctx->num_additional_fused_ops = 2; + fusion_string = "ROPE_VIEW_SET_ROWS"; } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) && ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) && ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) { ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1; // view of argsort writes to memory ctx->fused_ops_write_mask |= 1 << 3; + fusion_string = "TOPK_MOE_EARLY_SOFTMAX_NORM"; } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) && ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) && ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) { ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1; // view of argsort writes to memory ctx->fused_ops_write_mask |= 1 << 3; + fusion_string = "TOPK_MOE_EARLY_SOFTMAX"; } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) && ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) && ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) { ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1; // view of argsort writes to memory ctx->fused_ops_write_mask |= 1 << 1; + fusion_string = "TOPK_MOE_LATE_SOFTMAX"; } } ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops; @@ -13266,7 +13317,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit); - if (vk_perf_logger_enabled) { + if (vk_perf_logger_enabled && enqueued) { if (ctx->compute_ctx.expired()) { compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ctx->compute_ctx = compute_ctx; @@ -13274,10 +13325,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg } else { compute_ctx = ctx->compute_ctx.lock(); } - // If there are fused ops, just write out timestamps for all nodes to keep the accounting simple - for (int j = 0; j < ctx->num_additional_fused_ops + 1; ++j) { - compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+j+1); - } + ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i]; + ctx->query_fusion_names[ctx->query_idx] = fusion_string; + compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); } if (enqueued) { @@ -13318,14 +13368,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // Get the results and pass them to the logger std::vector timestamps(cgraph->n_nodes + 1); - VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results"); - for (int i = 0; i < cgraph->n_nodes; i++) { - if (!ggml_vk_is_empty(cgraph->nodes[i])) { - ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod)); - } + VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results"); + for (int i = 1; i < ctx->query_idx; i++) { + auto node = ctx->query_nodes[i]; + auto name = ctx->query_fusion_names[i]; + ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); } - ctx->device->perf_logger->print_timings(); + ctx->perf_logger->print_timings(); } if (!ctx->device->support_async) { From e3f771bd98c6886e4aac85257dafb5ccb30c58bf Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 4 Dec 2025 10:02:07 -0600 Subject: [PATCH 2/2] fix vector sizes --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index b1d2b4c0d3..f4ac833b2b 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -13190,8 +13190,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg query_create_info.queryCount = cgraph->n_nodes + 100; ctx->query_pool = ctx->device->device.createQueryPool(query_create_info); ctx->num_queries = query_create_info.queryCount; - ctx->query_fusion_names.resize(cgraph->n_nodes); - ctx->query_nodes.resize(cgraph->n_nodes); + ctx->query_fusion_names.resize(ctx->num_queries); + ctx->query_nodes.resize(ctx->num_queries); } ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1);