@@ -777,11 +777,6 @@ struct vk_device_struct {
777777 std::unique_ptr<vk_memory_logger> memory_logger;
778778#endif
779779
780- // for GGML_VK_PERF_LOGGER
781- std::unique_ptr<vk_perf_logger> perf_logger;
782- vk::QueryPool query_pool;
783- int32_t num_queries;
784-
785780 ~vk_device_struct() {
786781 VK_LOG_DEBUG("destroy device " << name);
787782
@@ -1523,12 +1518,21 @@ class vk_memory_logger {
15231518#define VK_LOG_MEMORY(msg) ((void) 0)
15241519#endif // GGML_VULKAN_MEMORY_DEBUG
15251520
1521+ static bool vk_perf_logger_enabled = false;
1522+ // print timings once every N print_timings() calls (N from GGML_VK_PERF_LOGGER_FREQUENCY; must be non-zero, it is used as a modulus)
1523+ static uint32_t vk_perf_logger_frequency = 1;
1524+
15261525class vk_perf_logger {
15271526 public:
1528- void print_timings() {
1527+ void print_timings(bool force = false ) {
15291528 if (timings.empty()) {
15301529 return;
15311530 }
1531+ print_count++;
1532+ if ((print_count % vk_perf_logger_frequency) != 0 && !force) {
1533+ return;
1534+ }
1535+ print_count = 0;
15321536 uint64_t total_all_op_times = 0;
15331537 std::cerr << "----------------\nVulkan Timings:" << std::endl;
15341538 for (const auto & t : timings) {
@@ -1565,16 +1569,20 @@ class vk_perf_logger {
15651569 flops.clear();
15661570 }
15671571
1568- void log_timing(const ggml_tensor * node, uint64_t time) {
1572+ void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
1573+ std::string fusion_str;
1574+ if (fusion_name) {
1575+ fusion_str = fusion_name + std::string(" ");
1576+ }
15691577 if (node->op == GGML_OP_UNARY) {
1570- timings[ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
1578+ timings[fusion_str + ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
15711579 return;
15721580 }
15731581 if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
1574- const uint64_t m = node->src[0]-> ne[1 ];
1575- const uint64_t n = ( node->op == GGML_OP_MUL_MAT) ? node-> ne[1] : node->ne[2 ];
1582+ const uint64_t m = node->ne[0 ];
1583+ const uint64_t n = node->ne[1];
15761584 const uint64_t k = node->src[1]->ne[0];
1577- const uint64_t batch = node->src[1]-> ne[2] * node->src[1] ->ne[3];
1585+ const uint64_t batch = node->ne[2] * node->ne[3];
15781586 std::string name = ggml_op_name(node->op);
15791587 if ((node->op == GGML_OP_MUL_MAT && n <= mul_mat_vec_max_cols) ||
15801588 (node->op == GGML_OP_MUL_MAT_ID && node->src[2]->ne[1] == 1)) {
@@ -1583,9 +1591,13 @@ class vk_perf_logger {
15831591 name += " ";
15841592 name += ggml_type_name(node->src[0]->type);
15851593 name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
1594+ if (node->op == GGML_OP_MUL_MAT_ID) {
1595+ name += " n_expert=" + std::to_string(node->src[0]->ne[2]);
1596+ }
15861597 if (batch > 1) {
15871598 name += " batch=" + std::to_string(batch);
15881599 }
1600+ name = fusion_str + name;
15891601 timings[name].push_back(time);
15901602 flops[name].push_back(m * n * (k + (k - 1)) * batch);
15911603 return;
@@ -1607,13 +1619,15 @@ class vk_perf_logger {
16071619 uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
16081620 name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
16091621 ", N=N*OW*OH=" + std::to_string(size_N);
1622+ name = fusion_str + name;
16101623 flops[name].push_back(n_flops);
16111624 timings[name].push_back(time);
16121625 return;
16131626 }
16141627 if (node->op == GGML_OP_RMS_NORM) {
16151628 std::string name = ggml_op_name(node->op);
16161629 name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")";
1630+ name = fusion_str + name;
16171631 timings[name].push_back(time);
16181632 return;
16191633 }
@@ -1624,6 +1638,7 @@ class vk_perf_logger {
16241638 const ggml_tensor * v = node->src[2];
16251639 const ggml_tensor * m = node->src[3];
16261640 std::stringstream name;
1641+ name << fusion_str;
16271642 name << ggml_op_name(node->op) <<
16281643 " dst(" << dst->ne[0] << "," << dst->ne[1] << "," << dst->ne[2] << "," << dst->ne[3] << "), " <<
16291644 " q(" << q->ne[0] << "," << q->ne[1] << "," << q->ne[2] << "," << q->ne[3] << "), " <<
@@ -1635,17 +1650,19 @@ class vk_perf_logger {
16351650 }
16361651 if (node->op == GGML_OP_TOP_K) {
16371652 std::stringstream name;
1653+ name << fusion_str;
16381654 name << ggml_op_name(node->op) <<
16391655 " K=" << node->ne[0] <<
16401656 " (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")";
16411657 timings[name.str()].push_back(time);
16421658 return;
16431659 }
1644- timings[ggml_op_name(node->op)].push_back(time);
1660+ timings[fusion_str + ggml_op_name(node->op)].push_back(time);
16451661 }
16461662 private:
16471663 std::map<std::string, std::vector<uint64_t>> timings;
16481664 std::map<std::string, std::vector<uint64_t>> flops;
1665+ uint32_t print_count {};
16491666};
16501667
16511668struct ggml_backend_vk_context {
@@ -1699,6 +1716,14 @@ struct ggml_backend_vk_context {
16991716 // Bit 'i' means nodes[start_of_fusion + i] writes to memory.
17001717 // If there's no fusion, bit 0 is still set.
17011718 int fused_ops_write_mask {};
1719+
1720+ // for GGML_VK_PERF_LOGGER
1721+ std::unique_ptr<vk_perf_logger> perf_logger;
1722+ vk::QueryPool query_pool;
1723+ std::vector<const char *> query_fusion_names;
1724+ std::vector<ggml_tensor *> query_nodes;
1725+ int32_t num_queries {};
1726+ int32_t query_idx {};
17021727};
17031728
17041729static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
@@ -1824,8 +1849,6 @@ struct vk_instance_t {
18241849static bool vk_instance_initialized = false;
18251850static vk_instance_t vk_instance;
18261851
1827- static bool vk_perf_logger_enabled = false;
1828-
18291852#ifdef GGML_VULKAN_CHECK_RESULTS
18301853static size_t vk_skip_checks;
18311854static size_t vk_output_tensor;
@@ -4205,9 +4228,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
42054228#ifdef GGML_VULKAN_MEMORY_DEBUG
42064229 device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
42074230#endif
4208- if (vk_perf_logger_enabled) {
4209- device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
4210- }
42114231
42124232 size_t dev_num = vk_instance.device_indices[idx];
42134233
@@ -5153,6 +5173,11 @@ static void ggml_vk_instance_init() {
51535173 }
51545174
51555175 vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
5176+ const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY");
5177+
5178+ if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) {
5179+ vk_perf_logger_frequency = std::stoul(GGML_VK_PERF_LOGGER_FREQUENCY);
5180+ }
51565181
51575182 // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
51585183 VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance);
@@ -5330,6 +5355,10 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
53305355 ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
53315356 ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
53325357
5358+ if (vk_perf_logger_enabled) {
5359+ ctx->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
5360+ }
5361+
53335362#ifdef GGML_VULKAN_CHECK_RESULTS
53345363 const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
53355364 vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
@@ -12205,6 +12234,9 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
1220512234
1220612235 ctx->compute_cmd_pool.destroy(ctx->device->device);
1220712236 ctx->transfer_cmd_pool.destroy(ctx->device->device);
12237+ if (vk_perf_logger_enabled) {
12238+ ctx->perf_logger->print_timings(true);
12239+ }
1220812240}
1220912241
1221012242static int ggml_vk_get_device_count() {
@@ -13003,24 +13035,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1300313035 vk_context compute_ctx;
1300413036 if (vk_perf_logger_enabled) {
1300513037 // allocate/resize the query pool
13006- if (ctx->device-> num_queries < cgraph->n_nodes + 1) {
13007- if (ctx->device-> query_pool) {
13008- ctx->device->device.destroyQueryPool(ctx->device-> query_pool);
13038+ if (ctx->num_queries < cgraph->n_nodes + 1) {
13039+ if (ctx->query_pool) {
13040+ ctx->device->device.destroyQueryPool(ctx->query_pool);
1300913041 }
1301013042 vk::QueryPoolCreateInfo query_create_info;
1301113043 query_create_info.queryType = vk::QueryType::eTimestamp;
1301213044 query_create_info.queryCount = cgraph->n_nodes + 100;
13013- ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
13014- ctx->device->num_queries = query_create_info.queryCount;
13045+ ctx->query_pool = ctx->device->device.createQueryPool(query_create_info);
13046+ ctx->num_queries = query_create_info.queryCount;
13047+ ctx->query_fusion_names.resize(ctx->num_queries);
13048+ ctx->query_nodes.resize(ctx->num_queries);
1301513049 }
1301613050
13017- ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
13051+ ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1);
13052+ std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr);
13053+ std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr);
1301813054
1301913055 GGML_ASSERT(ctx->compute_ctx.expired());
1302013056 compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
1302113057 ctx->compute_ctx = compute_ctx;
1302213058 ggml_vk_ctx_begin(ctx->device, compute_ctx);
13023- compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
13059+ ctx->query_idx = 0;
13060+ compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
1302413061 }
1302513062
1302613063 ctx->prealloc_y_last_pipeline_used = nullptr;
@@ -13061,52 +13098,66 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1306113098 total_mul_mat_bytes += bytes;
1306213099 }
1306313100
13101+ const char *fusion_string {};
1306413102 if (!ctx->device->disable_fusion) {
1306513103 uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i);
1306613104 if (num_adds) {
1306713105 ctx->num_additional_fused_ops = num_adds - 1;
13106+ fusion_string = "MULTI_ADD";
1306813107 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_ADD })) {
1306913108 ctx->num_additional_fused_ops = 2;
13109+ fusion_string = "MUL_MAT_ADD_ADD";
1307013110 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) {
1307113111 ctx->num_additional_fused_ops = 1;
13112+ fusion_string = "MUL_MAT_ADD";
1307213113 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL })) {
1307313114 ctx->num_additional_fused_ops = 2;
13115+ fusion_string = "MUL_MAT_ID_ADD_ID_MUL";
1307413116 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) {
1307513117 ctx->num_additional_fused_ops = 1;
13118+ fusion_string = "MUL_MAT_ID_ADD_ID";
1307613119 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_MUL })) {
1307713120 ctx->num_additional_fused_ops = 1;
13121+ fusion_string = "MUL_MAT_ID_MUL";
1307813122 } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 4 }) &&
1307913123 ggml_check_edges(cgraph, i, rms_norm_mul_rope_view_set_rows_edges) &&
1308013124 ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i) &&
1308113125 ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i + 2)) {
1308213126 ctx->num_additional_fused_ops = 4;
13127+ fusion_string = "RMS_NORM_MUL_ROPE_VIEW_SET_ROWS";
1308313128 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE })&&
1308413129 ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i)) {
1308513130 ctx->num_additional_fused_ops = 2;
13131+ fusion_string = "RMS_NORM_MUL_ROPE";
1308613132 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
1308713133 ctx->num_additional_fused_ops = 1;
13134+ fusion_string = "RMS_NORM_MUL";
1308813135 } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) &&
1308913136 ggml_check_edges(cgraph, i, rope_view_set_rows_edges) &&
1309013137 ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) {
1309113138 ctx->num_additional_fused_ops = 2;
13139+ fusion_string = "ROPE_VIEW_SET_ROWS";
1309213140 } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
1309313141 ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
1309413142 ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
1309513143 ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
1309613144 // view of argsort writes to memory
1309713145 ctx->fused_ops_write_mask |= 1 << 3;
13146+ fusion_string = "TOPK_MOE_EARLY_SOFTMAX_NORM";
1309813147 } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
1309913148 ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
1310013149 ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
1310113150 ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
1310213151 // view of argsort writes to memory
1310313152 ctx->fused_ops_write_mask |= 1 << 3;
13153+ fusion_string = "TOPK_MOE_EARLY_SOFTMAX";
1310413154 } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
1310513155 ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
1310613156 ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
1310713157 ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
1310813158 // view of argsort writes to memory
1310913159 ctx->fused_ops_write_mask |= 1 << 1;
13160+ fusion_string = "TOPK_MOE_LATE_SOFTMAX";
1311013161 }
1311113162 }
1311213163 ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops;
@@ -13120,18 +13171,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1312013171
1312113172 bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
1312213173
13123- if (vk_perf_logger_enabled) {
13174+ if (vk_perf_logger_enabled && enqueued ) {
1312413175 if (ctx->compute_ctx.expired()) {
1312513176 compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
1312613177 ctx->compute_ctx = compute_ctx;
1312713178 ggml_vk_ctx_begin(ctx->device, compute_ctx);
1312813179 } else {
1312913180 compute_ctx = ctx->compute_ctx.lock();
1313013181 }
13131- // If there are fused ops, just write out timestamps for all nodes to keep the accounting simple
13132- for (int j = 0; j < ctx->num_additional_fused_ops + 1; ++j) {
13133- compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+j+1);
13134- }
13182+ ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
13183+ ctx->query_fusion_names[ctx->query_idx] = fusion_string;
13184+ compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
1313513185 }
1313613186
1313713187 if (enqueued) {
@@ -13172,14 +13222,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1317213222
1317313223 // Get the results and pass them to the logger
1317413224 std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
13175- VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->device-> query_pool, 0, cgraph->n_nodes + 1 , (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
13176- for (int i = 0 ; i < cgraph->n_nodes ; i++) {
13177- if (!ggml_vk_is_empty(cgraph->nodes [i])) {
13178- ctx->device->perf_logger->log_timing(cgraph->nodes [i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod)) ;
13179- }
13225+ VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx , (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
13226+ for (int i = 1 ; i < ctx->query_idx ; i++) {
13227+ auto node = ctx->query_nodes [i];
13228+ auto name = ctx->query_fusion_names [i];
13229+ ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
1318013230 }
1318113231
13182- ctx->device-> perf_logger->print_timings();
13232+ ctx->perf_logger->print_timings();
1318313233 }
1318413234
1318513235 if (!ctx->device->support_async) {
0 commit comments