Skip to content

Commit faf2b4b

Browse files
committed
vulkan: perf_logger improvements
- Move perf_logger from device to ctx. - Add an env var to control the frequency we dump the stats. If you set a very large value, it just dumps when the ctx is destroyed. - Add a fusion info string to the tracking, only log one item per fused op. - Fix MUL_MAT_ID flops calculation.
1 parent 61bde8e commit faf2b4b

File tree

1 file changed

+85
-35
lines changed

1 file changed

+85
-35
lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 85 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -769,11 +769,6 @@ struct vk_device_struct {
769769
std::unique_ptr<vk_memory_logger> memory_logger;
770770
#endif
771771

772-
// for GGML_VK_PERF_LOGGER
773-
std::unique_ptr<vk_perf_logger> perf_logger;
774-
vk::QueryPool query_pool;
775-
int32_t num_queries;
776-
777772
~vk_device_struct() {
778773
VK_LOG_DEBUG("destroy device " << name);
779774

@@ -1564,12 +1559,21 @@ class vk_memory_logger {
15641559
#define VK_LOG_MEMORY(msg) ((void) 0)
15651560
#endif // GGML_VULKAN_MEMORY_DEBUG
15661561

1562+
static bool vk_perf_logger_enabled = false;
1563+
// number of calls between perf logger prints
1564+
static uint32_t vk_perf_logger_frequency = 1;
1565+
15671566
class vk_perf_logger {
15681567
public:
1569-
void print_timings() {
1568+
void print_timings(bool force = false) {
15701569
if (timings.empty()) {
15711570
return;
15721571
}
1572+
print_count++;
1573+
if ((print_count % vk_perf_logger_frequency) != 0 && !force) {
1574+
return;
1575+
}
1576+
print_count = 0;
15731577
uint64_t total_all_op_times = 0;
15741578
std::cerr << "----------------\nVulkan Timings:" << std::endl;
15751579
for (const auto & t : timings) {
@@ -1606,16 +1610,20 @@ class vk_perf_logger {
16061610
flops.clear();
16071611
}
16081612

1609-
void log_timing(const ggml_tensor * node, uint64_t time) {
1613+
void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
1614+
std::string fusion_str;
1615+
if (fusion_name) {
1616+
fusion_str = fusion_name + std::string(" ");
1617+
}
16101618
if (node->op == GGML_OP_UNARY) {
1611-
timings[ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
1619+
timings[fusion_str + ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
16121620
return;
16131621
}
16141622
if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
1615-
const uint64_t m = node->src[0]->ne[1];
1616-
const uint64_t n = (node->op == GGML_OP_MUL_MAT) ? node->ne[1] : node->ne[2];
1623+
const uint64_t m = node->ne[0];
1624+
const uint64_t n = node->ne[1];
16171625
const uint64_t k = node->src[1]->ne[0];
1618-
const uint64_t batch = node->src[1]->ne[2] * node->src[1]->ne[3];
1626+
const uint64_t batch = node->ne[2] * node->ne[3];
16191627
std::string name = ggml_op_name(node->op);
16201628
if ((node->op == GGML_OP_MUL_MAT && n <= mul_mat_vec_max_cols) ||
16211629
(node->op == GGML_OP_MUL_MAT_ID && node->src[2]->ne[1] == 1)) {
@@ -1624,9 +1632,13 @@ class vk_perf_logger {
16241632
name += " ";
16251633
name += ggml_type_name(node->src[0]->type);
16261634
name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
1635+
if (node->op == GGML_OP_MUL_MAT_ID) {
1636+
name += " n_expert=" + std::to_string(node->src[0]->ne[2]);
1637+
}
16271638
if (batch > 1) {
16281639
name += " batch=" + std::to_string(batch);
16291640
}
1641+
name = fusion_str + name;
16301642
timings[name].push_back(time);
16311643
flops[name].push_back(m * n * (k + (k - 1)) * batch);
16321644
return;
@@ -1648,13 +1660,15 @@ class vk_perf_logger {
16481660
uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
16491661
name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
16501662
", N=N*OW*OH=" + std::to_string(size_N);
1663+
name = fusion_str + name;
16511664
flops[name].push_back(n_flops);
16521665
timings[name].push_back(time);
16531666
return;
16541667
}
16551668
if (node->op == GGML_OP_RMS_NORM) {
16561669
std::string name = ggml_op_name(node->op);
16571670
name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")";
1671+
name = fusion_str + name;
16581672
timings[name].push_back(time);
16591673
return;
16601674
}
@@ -1665,6 +1679,7 @@ class vk_perf_logger {
16651679
const ggml_tensor * v = node->src[2];
16661680
const ggml_tensor * m = node->src[3];
16671681
std::stringstream name;
1682+
name << fusion_str;
16681683
name << ggml_op_name(node->op) <<
16691684
" dst(" << dst->ne[0] << "," << dst->ne[1] << "," << dst->ne[2] << "," << dst->ne[3] << "), " <<
16701685
" q(" << q->ne[0] << "," << q->ne[1] << "," << q->ne[2] << "," << q->ne[3] << "), " <<
@@ -1676,17 +1691,19 @@ class vk_perf_logger {
16761691
}
16771692
if (node->op == GGML_OP_TOP_K) {
16781693
std::stringstream name;
1694+
name << fusion_str;
16791695
name << ggml_op_name(node->op) <<
16801696
" K=" << node->ne[0] <<
16811697
" (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")";
16821698
timings[name.str()].push_back(time);
16831699
return;
16841700
}
1685-
timings[ggml_op_name(node->op)].push_back(time);
1701+
timings[fusion_str + ggml_op_name(node->op)].push_back(time);
16861702
}
16871703
private:
16881704
std::map<std::string, std::vector<uint64_t>> timings;
16891705
std::map<std::string, std::vector<uint64_t>> flops;
1706+
uint32_t print_count {};
16901707
};
16911708

16921709
struct ggml_backend_vk_context {
@@ -1740,6 +1757,14 @@ struct ggml_backend_vk_context {
17401757
// Bit 'i' means nodes[start_of_fusion + i] writes to memory.
17411758
// If there's no fusion, bit 0 is still set.
17421759
int fused_ops_write_mask {};
1760+
1761+
// for GGML_VK_PERF_LOGGER
1762+
std::unique_ptr<vk_perf_logger> perf_logger;
1763+
vk::QueryPool query_pool;
1764+
std::vector<const char *> query_fusion_names;
1765+
std::vector<ggml_tensor *> query_nodes;
1766+
int32_t num_queries {};
1767+
int32_t query_idx {};
17431768
};
17441769

17451770
static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
@@ -1865,8 +1890,6 @@ struct vk_instance_t {
18651890
static bool vk_instance_initialized = false;
18661891
static vk_instance_t vk_instance;
18671892

1868-
static bool vk_perf_logger_enabled = false;
1869-
18701893
#ifdef GGML_VULKAN_CHECK_RESULTS
18711894
static size_t vk_skip_checks;
18721895
static size_t vk_output_tensor;
@@ -4258,9 +4281,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
42584281
#ifdef GGML_VULKAN_MEMORY_DEBUG
42594282
device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
42604283
#endif
4261-
if (vk_perf_logger_enabled) {
4262-
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
4263-
}
42644284

42654285
size_t dev_num = vk_instance.device_indices[idx];
42664286

@@ -5195,6 +5215,11 @@ static void ggml_vk_instance_init() {
51955215
}
51965216

51975217
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
5218+
const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY");
5219+
5220+
if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) {
5221+
vk_perf_logger_frequency = std::stoul(GGML_VK_PERF_LOGGER_FREQUENCY);
5222+
}
51985223

51995224
// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
52005225
VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance);
@@ -5372,6 +5397,10 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
53725397
ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
53735398
ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
53745399

5400+
if (vk_perf_logger_enabled) {
5401+
ctx->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
5402+
}
5403+
53755404
#ifdef GGML_VULKAN_CHECK_RESULTS
53765405
const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
53775406
vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
@@ -12351,6 +12380,9 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
1235112380

1235212381
ctx->compute_cmd_pool.destroy(ctx->device->device);
1235312382
ctx->transfer_cmd_pool.destroy(ctx->device->device);
12383+
if (vk_perf_logger_enabled) {
12384+
ctx->perf_logger->print_timings(true);
12385+
}
1235412386
}
1235512387

1235612388
static int ggml_vk_get_device_count() {
@@ -13149,24 +13181,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1314913181
vk_context compute_ctx;
1315013182
if (vk_perf_logger_enabled) {
1315113183
// allocate/resize the query pool
13152-
if (ctx->device->num_queries < cgraph->n_nodes + 1) {
13153-
if (ctx->device->query_pool) {
13154-
ctx->device->device.destroyQueryPool(ctx->device->query_pool);
13184+
if (ctx->num_queries < cgraph->n_nodes + 1) {
13185+
if (ctx->query_pool) {
13186+
ctx->device->device.destroyQueryPool(ctx->query_pool);
1315513187
}
1315613188
vk::QueryPoolCreateInfo query_create_info;
1315713189
query_create_info.queryType = vk::QueryType::eTimestamp;
1315813190
query_create_info.queryCount = cgraph->n_nodes + 100;
13159-
ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
13160-
ctx->device->num_queries = query_create_info.queryCount;
13191+
ctx->query_pool = ctx->device->device.createQueryPool(query_create_info);
13192+
ctx->num_queries = query_create_info.queryCount;
13193+
ctx->query_fusion_names.resize(cgraph->n_nodes);
13194+
ctx->query_nodes.resize(cgraph->n_nodes);
1316113195
}
1316213196

13163-
ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
13197+
ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1);
13198+
std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr);
13199+
std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr);
1316413200

1316513201
GGML_ASSERT(ctx->compute_ctx.expired());
1316613202
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
1316713203
ctx->compute_ctx = compute_ctx;
1316813204
ggml_vk_ctx_begin(ctx->device, compute_ctx);
13169-
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
13205+
ctx->query_idx = 0;
13206+
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
1317013207
}
1317113208

1317213209
ctx->prealloc_y_last_pipeline_used = nullptr;
@@ -13207,52 +13244,66 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1320713244
total_mul_mat_bytes += bytes;
1320813245
}
1320913246

13247+
const char *fusion_string {};
1321013248
if (!ctx->device->disable_fusion) {
1321113249
uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i);
1321213250
if (num_adds) {
1321313251
ctx->num_additional_fused_ops = num_adds - 1;
13252+
fusion_string = "MULTI_ADD";
1321413253
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_ADD })) {
1321513254
ctx->num_additional_fused_ops = 2;
13255+
fusion_string = "MUL_MAT_ADD_ADD";
1321613256
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) {
1321713257
ctx->num_additional_fused_ops = 1;
13258+
fusion_string = "MUL_MAT_ADD";
1321813259
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL })) {
1321913260
ctx->num_additional_fused_ops = 2;
13261+
fusion_string = "MUL_MAT_ID_ADD_ID_MUL";
1322013262
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) {
1322113263
ctx->num_additional_fused_ops = 1;
13264+
fusion_string = "MUL_MAT_ID_ADD_ID";
1322213265
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_MUL })) {
1322313266
ctx->num_additional_fused_ops = 1;
13267+
fusion_string = "MUL_MAT_ID_MUL";
1322413268
} else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 4 }) &&
1322513269
ggml_check_edges(cgraph, i, rms_norm_mul_rope_view_set_rows_edges) &&
1322613270
ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i) &&
1322713271
ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i + 2)) {
1322813272
ctx->num_additional_fused_ops = 4;
13273+
fusion_string = "RMS_NORM_MUL_ROPE_VIEW_SET_ROWS";
1322913274
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE })&&
1323013275
ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i)) {
1323113276
ctx->num_additional_fused_ops = 2;
13277+
fusion_string = "RMS_NORM_MUL_ROPE";
1323213278
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
1323313279
ctx->num_additional_fused_ops = 1;
13280+
fusion_string = "RMS_NORM_MUL";
1323413281
} else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) &&
1323513282
ggml_check_edges(cgraph, i, rope_view_set_rows_edges) &&
1323613283
ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) {
1323713284
ctx->num_additional_fused_ops = 2;
13285+
fusion_string = "ROPE_VIEW_SET_ROWS";
1323813286
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
1323913287
ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
1324013288
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
1324113289
ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
1324213290
// view of argsort writes to memory
1324313291
ctx->fused_ops_write_mask |= 1 << 3;
13292+
fusion_string = "TOPK_MOE_EARLY_SOFTMAX_NORM";
1324413293
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
1324513294
ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
1324613295
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
1324713296
ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
1324813297
// view of argsort writes to memory
1324913298
ctx->fused_ops_write_mask |= 1 << 3;
13299+
fusion_string = "TOPK_MOE_EARLY_SOFTMAX";
1325013300
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
1325113301
ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
1325213302
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
1325313303
ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
1325413304
// view of argsort writes to memory
1325513305
ctx->fused_ops_write_mask |= 1 << 1;
13306+
fusion_string = "TOPK_MOE_LATE_SOFTMAX";
1325613307
}
1325713308
}
1325813309
ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops;
@@ -13266,18 +13317,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1326613317

1326713318
bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
1326813319

13269-
if (vk_perf_logger_enabled) {
13320+
if (vk_perf_logger_enabled && enqueued) {
1327013321
if (ctx->compute_ctx.expired()) {
1327113322
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
1327213323
ctx->compute_ctx = compute_ctx;
1327313324
ggml_vk_ctx_begin(ctx->device, compute_ctx);
1327413325
} else {
1327513326
compute_ctx = ctx->compute_ctx.lock();
1327613327
}
13277-
// If there are fused ops, just write out timestamps for all nodes to keep the accounting simple
13278-
for (int j = 0; j < ctx->num_additional_fused_ops + 1; ++j) {
13279-
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+j+1);
13280-
}
13328+
ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
13329+
ctx->query_fusion_names[ctx->query_idx] = fusion_string;
13330+
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
1328113331
}
1328213332

1328313333
if (enqueued) {
@@ -13318,14 +13368,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1331813368

1331913369
// Get the results and pass them to the logger
1332013370
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
13321-
VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
13322-
for (int i = 0; i < cgraph->n_nodes; i++) {
13323-
if (!ggml_vk_is_empty(cgraph->nodes[i])) {
13324-
ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
13325-
}
13371+
VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
13372+
for (int i = 1; i < ctx->query_idx; i++) {
13373+
auto node = ctx->query_nodes[i];
13374+
auto name = ctx->query_fusion_names[i];
13375+
ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
1332613376
}
1332713377

13328-
ctx->device->perf_logger->print_timings();
13378+
ctx->perf_logger->print_timings();
1332913379
}
1333013380

1333113381
if (!ctx->device->support_async) {

0 commit comments

Comments
 (0)