@@ -769,11 +769,6 @@ struct vk_device_struct {
769769 std::unique_ptr<vk_memory_logger> memory_logger;
770770#endif
771771
772- // for GGML_VK_PERF_LOGGER
773- std::unique_ptr<vk_perf_logger> perf_logger;
774- vk::QueryPool query_pool;
775- int32_t num_queries;
776-
777772 ~vk_device_struct() {
778773 VK_LOG_DEBUG("destroy device " << name);
779774
@@ -1564,12 +1559,21 @@ class vk_memory_logger {
15641559#define VK_LOG_MEMORY(msg) ((void) 0)
15651560#endif // GGML_VULKAN_MEMORY_DEBUG
15661561
1562+ static bool vk_perf_logger_enabled = false;
1563+ // number of calls between perf logger prints
1564+ static uint32_t vk_perf_logger_frequency = 1;
1565+
15671566class vk_perf_logger {
15681567 public:
1569- void print_timings() {
1568+ void print_timings(bool force = false ) {
15701569 if (timings.empty()) {
15711570 return;
15721571 }
1572+ print_count++;
1573+ if ((print_count % vk_perf_logger_frequency) != 0 && !force) {
1574+ return;
1575+ }
1576+ print_count = 0;
15731577 uint64_t total_all_op_times = 0;
15741578 std::cerr << "----------------\nVulkan Timings:" << std::endl;
15751579 for (const auto & t : timings) {
@@ -1606,16 +1610,20 @@ class vk_perf_logger {
16061610 flops.clear();
16071611 }
16081612
1609- void log_timing(const ggml_tensor * node, uint64_t time) {
1613+ void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
1614+ std::string fusion_str;
1615+ if (fusion_name) {
1616+ fusion_str = fusion_name + std::string(" ");
1617+ }
16101618 if (node->op == GGML_OP_UNARY) {
1611- timings[ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
1619+ timings[fusion_str + ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
16121620 return;
16131621 }
16141622 if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
1615- const uint64_t m = node->src[0]-> ne[1 ];
1616- const uint64_t n = ( node->op == GGML_OP_MUL_MAT) ? node-> ne[1] : node->ne[2 ];
1623+ const uint64_t m = node->ne[0 ];
1624+ const uint64_t n = node->ne[1];
16171625 const uint64_t k = node->src[1]->ne[0];
1618- const uint64_t batch = node->src[1]-> ne[2] * node->src[1] ->ne[3];
1626+ const uint64_t batch = node->ne[2] * node->ne[3];
16191627 std::string name = ggml_op_name(node->op);
16201628 if ((node->op == GGML_OP_MUL_MAT && n <= mul_mat_vec_max_cols) ||
16211629 (node->op == GGML_OP_MUL_MAT_ID && node->src[2]->ne[1] == 1)) {
@@ -1624,9 +1632,13 @@ class vk_perf_logger {
16241632 name += " ";
16251633 name += ggml_type_name(node->src[0]->type);
16261634 name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
1635+ if (node->op == GGML_OP_MUL_MAT_ID) {
1636+ name += " n_expert=" + std::to_string(node->src[0]->ne[2]);
1637+ }
16271638 if (batch > 1) {
16281639 name += " batch=" + std::to_string(batch);
16291640 }
1641+ name = fusion_str + name;
16301642 timings[name].push_back(time);
16311643 flops[name].push_back(m * n * (k + (k - 1)) * batch);
16321644 return;
@@ -1648,13 +1660,15 @@ class vk_perf_logger {
16481660 uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
16491661 name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
16501662 ", N=N*OW*OH=" + std::to_string(size_N);
1663+ name = fusion_str + name;
16511664 flops[name].push_back(n_flops);
16521665 timings[name].push_back(time);
16531666 return;
16541667 }
16551668 if (node->op == GGML_OP_RMS_NORM) {
16561669 std::string name = ggml_op_name(node->op);
16571670 name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")";
1671+ name = fusion_str + name;
16581672 timings[name].push_back(time);
16591673 return;
16601674 }
@@ -1665,6 +1679,7 @@ class vk_perf_logger {
16651679 const ggml_tensor * v = node->src[2];
16661680 const ggml_tensor * m = node->src[3];
16671681 std::stringstream name;
1682+ name << fusion_str;
16681683 name << ggml_op_name(node->op) <<
16691684 " dst(" << dst->ne[0] << "," << dst->ne[1] << "," << dst->ne[2] << "," << dst->ne[3] << "), " <<
16701685 " q(" << q->ne[0] << "," << q->ne[1] << "," << q->ne[2] << "," << q->ne[3] << "), " <<
@@ -1676,17 +1691,19 @@ class vk_perf_logger {
16761691 }
16771692 if (node->op == GGML_OP_TOP_K) {
16781693 std::stringstream name;
1694+ name << fusion_str;
16791695 name << ggml_op_name(node->op) <<
16801696 " K=" << node->ne[0] <<
16811697 " (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")";
16821698 timings[name.str()].push_back(time);
16831699 return;
16841700 }
1685- timings[ggml_op_name(node->op)].push_back(time);
1701+ timings[fusion_str + ggml_op_name(node->op)].push_back(time);
16861702 }
16871703 private:
16881704 std::map<std::string, std::vector<uint64_t>> timings;
16891705 std::map<std::string, std::vector<uint64_t>> flops;
1706+ uint32_t print_count {};
16901707};
16911708
16921709struct ggml_backend_vk_context {
@@ -1740,6 +1757,14 @@ struct ggml_backend_vk_context {
17401757 // Bit 'i' means nodes[start_of_fusion + i] writes to memory.
17411758 // If there's no fusion, bit 0 is still set.
17421759 int fused_ops_write_mask {};
1760+
1761+ // for GGML_VK_PERF_LOGGER
1762+ std::unique_ptr<vk_perf_logger> perf_logger;
1763+ vk::QueryPool query_pool;
1764+ std::vector<const char *> query_fusion_names;
1765+ std::vector<ggml_tensor *> query_nodes;
1766+ int32_t num_queries {};
1767+ int32_t query_idx {};
17431768};
17441769
17451770static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
@@ -1865,8 +1890,6 @@ struct vk_instance_t {
18651890static bool vk_instance_initialized = false;
18661891static vk_instance_t vk_instance;
18671892
1868- static bool vk_perf_logger_enabled = false;
1869-
18701893#ifdef GGML_VULKAN_CHECK_RESULTS
18711894static size_t vk_skip_checks;
18721895static size_t vk_output_tensor;
@@ -4258,9 +4281,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
42584281#ifdef GGML_VULKAN_MEMORY_DEBUG
42594282 device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
42604283#endif
4261- if (vk_perf_logger_enabled) {
4262- device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
4263- }
42644284
42654285 size_t dev_num = vk_instance.device_indices[idx];
42664286
@@ -5195,6 +5215,11 @@ static void ggml_vk_instance_init() {
51955215 }
51965216
51975217 vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
5218+ const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY");
5219+
5220+ if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) {
5221+ vk_perf_logger_frequency = std::stoul(GGML_VK_PERF_LOGGER_FREQUENCY);
5222+ }
51985223
51995224 // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
52005225 VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance);
@@ -5372,6 +5397,10 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
53725397 ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
53735398 ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
53745399
5400+ if (vk_perf_logger_enabled) {
5401+ ctx->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
5402+ }
5403+
53755404#ifdef GGML_VULKAN_CHECK_RESULTS
53765405 const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
53775406 vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
@@ -12351,6 +12380,9 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
1235112380
1235212381 ctx->compute_cmd_pool.destroy(ctx->device->device);
1235312382 ctx->transfer_cmd_pool.destroy(ctx->device->device);
12383+ if (vk_perf_logger_enabled) {
12384+ ctx->perf_logger->print_timings(true);
12385+ }
1235412386}
1235512387
1235612388static int ggml_vk_get_device_count() {
@@ -13149,24 +13181,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1314913181 vk_context compute_ctx;
1315013182 if (vk_perf_logger_enabled) {
1315113183 // allocate/resize the query pool
13152- if (ctx->device-> num_queries < cgraph->n_nodes + 1) {
13153- if (ctx->device-> query_pool) {
13154- ctx->device->device.destroyQueryPool(ctx->device-> query_pool);
13184+ if (ctx->num_queries < cgraph->n_nodes + 1) {
13185+ if (ctx->query_pool) {
13186+ ctx->device->device.destroyQueryPool(ctx->query_pool);
1315513187 }
1315613188 vk::QueryPoolCreateInfo query_create_info;
1315713189 query_create_info.queryType = vk::QueryType::eTimestamp;
1315813190 query_create_info.queryCount = cgraph->n_nodes + 100;
13159- ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
13160- ctx->device->num_queries = query_create_info.queryCount;
13191+ ctx->query_pool = ctx->device->device.createQueryPool(query_create_info);
13192+ ctx->num_queries = query_create_info.queryCount;
13193+ ctx->query_fusion_names.resize(cgraph->n_nodes);
13194+ ctx->query_nodes.resize(cgraph->n_nodes);
1316113195 }
1316213196
13163- ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
13197+ ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1);
13198+ std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr);
13199+ std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr);
1316413200
1316513201 GGML_ASSERT(ctx->compute_ctx.expired());
1316613202 compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
1316713203 ctx->compute_ctx = compute_ctx;
1316813204 ggml_vk_ctx_begin(ctx->device, compute_ctx);
13169- compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
13205+ ctx->query_idx = 0;
13206+ compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
1317013207 }
1317113208
1317213209 ctx->prealloc_y_last_pipeline_used = nullptr;
@@ -13207,52 +13244,66 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1320713244 total_mul_mat_bytes += bytes;
1320813245 }
1320913246
13247+ const char *fusion_string {};
1321013248 if (!ctx->device->disable_fusion) {
1321113249 uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i);
1321213250 if (num_adds) {
1321313251 ctx->num_additional_fused_ops = num_adds - 1;
13252+ fusion_string = "MULTI_ADD";
1321413253 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_ADD })) {
1321513254 ctx->num_additional_fused_ops = 2;
13255+ fusion_string = "MUL_MAT_ADD_ADD";
1321613256 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) {
1321713257 ctx->num_additional_fused_ops = 1;
13258+ fusion_string = "MUL_MAT_ADD";
1321813259 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL })) {
1321913260 ctx->num_additional_fused_ops = 2;
13261+ fusion_string = "MUL_MAT_ID_ADD_ID_MUL";
1322013262 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) {
1322113263 ctx->num_additional_fused_ops = 1;
13264+ fusion_string = "MUL_MAT_ID_ADD_ID";
1322213265 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_MUL })) {
1322313266 ctx->num_additional_fused_ops = 1;
13267+ fusion_string = "MUL_MAT_ID_MUL";
1322413268 } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 4 }) &&
1322513269 ggml_check_edges(cgraph, i, rms_norm_mul_rope_view_set_rows_edges) &&
1322613270 ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i) &&
1322713271 ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i + 2)) {
1322813272 ctx->num_additional_fused_ops = 4;
13273+ fusion_string = "RMS_NORM_MUL_ROPE_VIEW_SET_ROWS";
1322913274 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE })&&
1323013275 ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i)) {
1323113276 ctx->num_additional_fused_ops = 2;
13277+ fusion_string = "RMS_NORM_MUL_ROPE";
1323213278 } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
1323313279 ctx->num_additional_fused_ops = 1;
13280+ fusion_string = "RMS_NORM_MUL";
1323413281 } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) &&
1323513282 ggml_check_edges(cgraph, i, rope_view_set_rows_edges) &&
1323613283 ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) {
1323713284 ctx->num_additional_fused_ops = 2;
13285+ fusion_string = "ROPE_VIEW_SET_ROWS";
1323813286 } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
1323913287 ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
1324013288 ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
1324113289 ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
1324213290 // view of argsort writes to memory
1324313291 ctx->fused_ops_write_mask |= 1 << 3;
13292+ fusion_string = "TOPK_MOE_EARLY_SOFTMAX_NORM";
1324413293 } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
1324513294 ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
1324613295 ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
1324713296 ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
1324813297 // view of argsort writes to memory
1324913298 ctx->fused_ops_write_mask |= 1 << 3;
13299+ fusion_string = "TOPK_MOE_EARLY_SOFTMAX";
1325013300 } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
1325113301 ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
1325213302 ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
1325313303 ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
1325413304 // view of argsort writes to memory
1325513305 ctx->fused_ops_write_mask |= 1 << 1;
13306+ fusion_string = "TOPK_MOE_LATE_SOFTMAX";
1325613307 }
1325713308 }
1325813309 ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops;
@@ -13266,18 +13317,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1326613317
1326713318 bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
1326813319
13269- if (vk_perf_logger_enabled) {
13320+ if (vk_perf_logger_enabled && enqueued ) {
1327013321 if (ctx->compute_ctx.expired()) {
1327113322 compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
1327213323 ctx->compute_ctx = compute_ctx;
1327313324 ggml_vk_ctx_begin(ctx->device, compute_ctx);
1327413325 } else {
1327513326 compute_ctx = ctx->compute_ctx.lock();
1327613327 }
13277- // If there are fused ops, just write out timestamps for all nodes to keep the accounting simple
13278- for (int j = 0; j < ctx->num_additional_fused_ops + 1; ++j) {
13279- compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+j+1);
13280- }
13328+ ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
13329+ ctx->query_fusion_names[ctx->query_idx] = fusion_string;
13330+ compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
1328113331 }
1328213332
1328313333 if (enqueued) {
@@ -13318,14 +13368,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1331813368
1331913369 // Get the results and pass them to the logger
1332013370 std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
13321- VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->device-> query_pool, 0, cgraph->n_nodes + 1 , (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
13322- for (int i = 0 ; i < cgraph->n_nodes ; i++) {
13323- if (!ggml_vk_is_empty(cgraph->nodes [i])) {
13324- ctx->device->perf_logger->log_timing(cgraph->nodes [i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod)) ;
13325- }
13371+ VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx , (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
13372+ for (int i = 1 ; i < ctx->query_idx ; i++) {
13373+ auto node = ctx->query_nodes [i];
13374+ auto name = ctx->query_fusion_names [i];
13375+ ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
1332613376 }
1332713377
13328- ctx->device-> perf_logger->print_timings();
13378+ ctx->perf_logger->print_timings();
1332913379 }
1333013380
1333113381 if (!ctx->device->support_async) {
0 commit comments