@@ -769,11 +769,6 @@ struct vk_device_struct {
769769 std::unique_ptr<vk_memory_logger> memory_logger;
770770#endif
771771
772- // for GGML_VK_PERF_LOGGER
773- std::unique_ptr<vk_perf_logger> perf_logger;
774- vk::QueryPool query_pool;
775- int32_t num_queries;
776-
777772 ~vk_device_struct() {
778773 VK_LOG_DEBUG("destroy device " << name);
779774
@@ -1563,12 +1558,21 @@ class vk_memory_logger {
15631558#define VK_LOG_MEMORY(msg) ((void) 0)
15641559#endif // GGML_VULKAN_MEMORY_DEBUG
15651560
1561+ static bool vk_perf_logger_enabled = false;
1562+ // number of calls between perf logger prints
1563+ static uint32_t vk_perf_logger_frequency = 1;
1564+
15661565class vk_perf_logger {
15671566 public:
1568- void print_timings() {
1567+	void print_timings(bool force = false) {
15691568 if (timings.empty()) {
15701569 return;
15711570 }
1571+ print_count++;
1572+ if ((print_count % vk_perf_logger_frequency) != 0 && !force) {
1573+ return;
1574+ }
1575+ print_count = 0;
15721576 uint64_t total_all_op_times = 0;
15731577 std::cerr << "----------------\nVulkan Timings:" << std::endl;
15741578 for (const auto & t : timings) {
@@ -1605,16 +1609,20 @@ class vk_perf_logger {
16051609 flops.clear();
16061610 }
16071611
1608- void log_timing(const ggml_tensor * node, uint64_t time) {
1612+ void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
1613+ std::string fusion_str;
1614+ if (fusion_name) {
1615+ fusion_str = fusion_name + std::string(" ");
1616+ }
16091617 if (node->op == GGML_OP_UNARY) {
1610- timings[ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
1618+ timings[fusion_str + ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
16111619 return;
16121620 }
16131621 if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
1614-        const uint64_t m = node->src[0]->ne[1];
1615-        const uint64_t n = (node->op == GGML_OP_MUL_MAT) ? node->ne[1] : node->ne[2];
1622+        const uint64_t m = node->ne[0];
1623+ const uint64_t n = node->ne[1];
16161624 const uint64_t k = node->src[1]->ne[0];
1617-        const uint64_t batch = node->src[1]->ne[2] * node->src[1]->ne[3];
1625+ const uint64_t batch = node->ne[2] * node->ne[3];
16181626 std::string name = ggml_op_name(node->op);
16191627 if ((node->op == GGML_OP_MUL_MAT && n <= mul_mat_vec_max_cols) ||
16201628 (node->op == GGML_OP_MUL_MAT_ID && node->src[2]->ne[1] == 1)) {
@@ -1623,9 +1631,13 @@ class vk_perf_logger {
16231631 name += " ";
16241632 name += ggml_type_name(node->src[0]->type);
16251633 name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
1634+ if (node->op == GGML_OP_MUL_MAT_ID) {
1635+ name += " n_expert=" + std::to_string(node->src[0]->ne[2]);
1636+ }
16261637 if (batch > 1) {
16271638 name += " batch=" + std::to_string(batch);
16281639 }
1640+ name = fusion_str + name;
16291641 timings[name].push_back(time);
16301642 flops[name].push_back(m * n * (k + (k - 1)) * batch);
16311643 return;
@@ -1647,13 +1659,15 @@ class vk_perf_logger {
16471659 uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
16481660 name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
16491661 ", N=N*OW*OH=" + std::to_string(size_N);
1662+ name = fusion_str + name;
16501663 flops[name].push_back(n_flops);
16511664 timings[name].push_back(time);
16521665 return;
16531666 }
16541667 if (node->op == GGML_OP_RMS_NORM) {
16551668 std::string name = ggml_op_name(node->op);
16561669 name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")";
1670+ name = fusion_str + name;
16571671 timings[name].push_back(time);
16581672 return;
16591673 }
@@ -1664,6 +1678,7 @@ class vk_perf_logger {
16641678 const ggml_tensor * v = node->src[2];
16651679 const ggml_tensor * m = node->src[3];
16661680 std::stringstream name;
1681+ name << fusion_str;
16671682 name << ggml_op_name(node->op) <<
16681683 " dst(" << dst->ne[0] << "," << dst->ne[1] << "," << dst->ne[2] << "," << dst->ne[3] << "), " <<
16691684 " q(" << q->ne[0] << "," << q->ne[1] << "," << q->ne[2] << "," << q->ne[3] << "), " <<
@@ -1673,11 +1688,12 @@ class vk_perf_logger {
16731688 timings[name.str()].push_back(time);
16741689 return;
16751690 }
1676- timings[ggml_op_name(node->op)].push_back(time);
1691+ timings[fusion_str + ggml_op_name(node->op)].push_back(time);
16771692 }
16781693 private:
16791694 std::map<std::string, std::vector<uint64_t>> timings;
16801695 std::map<std::string, std::vector<uint64_t>> flops;
1696+ uint32_t print_count {};
16811697};
16821698
16831699struct ggml_backend_vk_context {
@@ -1731,6 +1747,14 @@ struct ggml_backend_vk_context {
17311747 // Bit 'i' means nodes[start_of_fusion + i] writes to memory.
17321748 // If there's no fusion, bit 0 is still set.
17331749 int fused_ops_write_mask {};
1750+
1751+ // for GGML_VK_PERF_LOGGER
1752+ std::unique_ptr<vk_perf_logger> perf_logger;
1753+ vk::QueryPool query_pool;
1754+ std::vector<const char *> query_fusion_names;
1755+ std::vector<ggml_tensor *> query_nodes;
1756+ int32_t num_queries {};
1757+ int32_t query_idx {};
17341758};
17351759
17361760static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
@@ -1856,8 +1880,6 @@ struct vk_instance_t {
18561880static bool vk_instance_initialized = false;
18571881static vk_instance_t vk_instance;
18581882
1859- static bool vk_perf_logger_enabled = false;
1860-
18611883#ifdef GGML_VULKAN_CHECK_RESULTS
18621884static size_t vk_skip_checks;
18631885static size_t vk_output_tensor;
@@ -4249,9 +4271,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
42494271#ifdef GGML_VULKAN_MEMORY_DEBUG
42504272 device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
42514273#endif
4252- if (vk_perf_logger_enabled) {
4253- device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
4254- }
42554274
42564275 size_t dev_num = vk_instance.device_indices[idx];
42574276
@@ -5186,6 +5205,11 @@ static void ggml_vk_instance_init() {
51865205 }
51875206
51885207 vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
5208+ const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY");
5209+
5210+ if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) {
5211+ vk_perf_logger_frequency = std::stoul(GGML_VK_PERF_LOGGER_FREQUENCY);
5212+ }
51895213
51905214 // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
51915215 VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance);
@@ -5363,6 +5387,10 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
53635387 ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
53645388 ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
53655389
5390+ if (vk_perf_logger_enabled) {
5391+ ctx->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
5392+ }
5393+
53665394#ifdef GGML_VULKAN_CHECK_RESULTS
53675395 const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
53685396 vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
@@ -12334,6 +12362,9 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
1233412362
1233512363 ctx->compute_cmd_pool.destroy(ctx->device->device);
1233612364 ctx->transfer_cmd_pool.destroy(ctx->device->device);
12365+ if (vk_perf_logger_enabled) {
12366+ ctx->perf_logger->print_timings(true);
12367+ }
1233712368}
1233812369
1233912370static int ggml_vk_get_device_count() {
@@ -13132,24 +13163,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1313213163 vk_context compute_ctx;
1313313164 if (vk_perf_logger_enabled) {
1313413165 // allocate/resize the query pool
13135-        if (ctx->device->num_queries < cgraph->n_nodes + 1) {
13136-            if (ctx->device->query_pool) {
13137-                ctx->device->device.destroyQueryPool(ctx->device->query_pool);
13166+ if (ctx->num_queries < cgraph->n_nodes + 1) {
13167+ if (ctx->query_pool) {
13168+ ctx->device->device.destroyQueryPool(ctx->query_pool);
1313813169 }
1313913170 vk::QueryPoolCreateInfo query_create_info;
1314013171 query_create_info.queryType = vk::QueryType::eTimestamp;
1314113172 query_create_info.queryCount = cgraph->n_nodes + 100;
13142- ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
13143- ctx->device->num_queries = query_create_info.queryCount;
13173+ ctx->query_pool = ctx->device->device.createQueryPool(query_create_info);
13174+ ctx->num_queries = query_create_info.queryCount;
13175+ ctx->query_fusion_names.resize(cgraph->n_nodes);
13176+ ctx->query_nodes.resize(cgraph->n_nodes);
1314413177 }
1314513178
13146- ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
13179+ ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1);
13180+ std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr);
13181+ std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr);
1314713182
1314813183 GGML_ASSERT(ctx->compute_ctx.expired());
1314913184 compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
1315013185 ctx->compute_ctx = compute_ctx;
1315113186 ggml_vk_ctx_begin(ctx->device, compute_ctx);
13152- compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
13187+ ctx->query_idx = 0;
13188+ compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
1315313189 }
1315413190
1315513191 ctx->prealloc_y_last_pipeline_used = nullptr;
@@ -13190,52 +13226,66 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1319013226 total_mul_mat_bytes += bytes;
1319113227 }
1319213228
13229+ const char *fusion_string {};
1319313230 if (!ctx->device->disable_fusion) {
1319413231 uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i);
1319513232 if (num_adds) {
1319613233 ctx->num_additional_fused_ops = num_adds - 1;
13234+ fusion_string = "MULTI_ADD";
1319713235 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_ADD })) {
1319813236 ctx->num_additional_fused_ops = 2;
13237+ fusion_string = "MUL_MAT_ADD_ADD";
1319913238 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) {
1320013239 ctx->num_additional_fused_ops = 1;
13240+ fusion_string = "MUL_MAT_ADD";
1320113241 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL })) {
1320213242 ctx->num_additional_fused_ops = 2;
13243+ fusion_string = "MUL_MAT_ID_ADD_ID_MUL";
1320313244 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) {
1320413245 ctx->num_additional_fused_ops = 1;
13246+ fusion_string = "MUL_MAT_ID_ADD_ID";
1320513247 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_MUL })) {
1320613248 ctx->num_additional_fused_ops = 1;
13249+ fusion_string = "MUL_MAT_ID_MUL";
1320713250 } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 4 }) &&
1320813251 ggml_check_edges(cgraph, i, rms_norm_mul_rope_view_set_rows_edges) &&
1320913252 ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i) &&
1321013253 ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i + 2)) {
1321113254 ctx->num_additional_fused_ops = 4;
13255+ fusion_string = "RMS_NORM_MUL_ROPE_VIEW_SET_ROWS";
1321213256 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE })&&
1321313257 ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i)) {
1321413258 ctx->num_additional_fused_ops = 2;
13259+ fusion_string = "RMS_NORM_MUL_ROPE";
1321513260 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
1321613261 ctx->num_additional_fused_ops = 1;
13262+ fusion_string = "RMS_NORM_MUL";
1321713263 } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) &&
1321813264 ggml_check_edges(cgraph, i, rope_view_set_rows_edges) &&
1321913265 ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) {
1322013266 ctx->num_additional_fused_ops = 2;
13267+ fusion_string = "ROPE_VIEW_SET_ROWS";
1322113268 } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
1322213269 ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
1322313270 ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
1322413271 ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
1322513272 // view of argsort writes to memory
1322613273 ctx->fused_ops_write_mask |= 1 << 3;
13274+ fusion_string = "TOPK_MOE_EARLY_SOFTMAX_NORM";
1322713275 } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
1322813276 ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
1322913277 ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
1323013278 ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
1323113279 // view of argsort writes to memory
1323213280 ctx->fused_ops_write_mask |= 1 << 3;
13281+ fusion_string = "TOPK_MOE_EARLY_SOFTMAX";
1323313282 } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
1323413283 ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
1323513284 ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
1323613285 ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
1323713286 // view of argsort writes to memory
1323813287 ctx->fused_ops_write_mask |= 1 << 1;
13288+ fusion_string = "TOPK_MOE_LATE_SOFTMAX";
1323913289 }
1324013290 }
1324113291 ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops;
@@ -13249,18 +13299,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1324913299
1325013300 bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
1325113301
13252- if (vk_perf_logger_enabled) {
13302+        if (vk_perf_logger_enabled && enqueued) {
1325313303 if (ctx->compute_ctx.expired()) {
1325413304 compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
1325513305 ctx->compute_ctx = compute_ctx;
1325613306 ggml_vk_ctx_begin(ctx->device, compute_ctx);
1325713307 } else {
1325813308 compute_ctx = ctx->compute_ctx.lock();
1325913309 }
13260- // If there are fused ops, just write out timestamps for all nodes to keep the accounting simple
13261- for (int j = 0; j < ctx->num_additional_fused_ops + 1; ++j) {
13262- compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+j+1);
13263- }
13310+ ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
13311+ ctx->query_fusion_names[ctx->query_idx] = fusion_string;
13312+ compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
1326413313 }
1326513314
1326613315 if (enqueued) {
@@ -13301,14 +13350,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1330113350
1330213351 // Get the results and pass them to the logger
1330313352 std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
13304-        VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
13305-        for (int i = 0; i < cgraph->n_nodes; i++) {
13306-            if (!ggml_vk_is_empty(cgraph->nodes[i])) {
13307-                ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
13308- }
13353+        VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
13354+        for (int i = 1; i < ctx->query_idx; i++) {
13355+            auto node = ctx->query_nodes[i];
13356+            auto name = ctx->query_fusion_names[i];
13357+ ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
1330913358 }
1331013359
13360+        ctx->perf_logger->print_timings();
13360+ ctx->perf_logger->print_timings();
1331213361 }
1331313362
1331413363 if (!ctx->device->support_async) {