Skip to content

Commit 38b1232

Browse files
authored
Merge branch 'ikawrakow:main' into main
2 parents da8b3a5 + 8e3041b commit 38b1232

File tree

16 files changed

+1697
-1085
lines changed

16 files changed

+1697
-1085
lines changed

common/common.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1276,12 +1276,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
12761276
else if (arg_next == "layer") {
12771277
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
12781278
}
1279-
else if (arg_next == "row") {
1280-
fprintf(stderr, "\n\n=====================================================================================\n");
1281-
fprintf(stderr, " Split mode row is no longer supported\n");
1282-
fprintf(stderr, "=====================================================================================\n\n\n");
1283-
GGML_ABORT("fatal error");
1284-
params.split_mode = LLAMA_SPLIT_MODE_ROW;
1279+
else if (arg_next == "attn") {
1280+
params.split_mode = LLAMA_SPLIT_MODE_ATTN;
1281+
}
1282+
else if (arg_next == "graph") {
1283+
params.split_mode = LLAMA_SPLIT_MODE_GRAPH;
12851284
}
12861285
else {
12871286
invalid_param = true;
@@ -2249,6 +2248,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
22492248
options.push_back({ "*", "-sm, --split-mode SPLIT_MODE",
22502249
"how to split the model across multiple GPUs, one of:\n"
22512250
" - none: use one GPU only\n"
2251+
" - graph: split model tensors and computation graph across GPUs\n"
22522252
" - layer (default): split layers and KV across GPUs\n" });
22532253
options.push_back({ "*", "-ts, --tensor-split SPLIT",
22542254
"fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });

examples/llama-bench/llama-bench.cpp

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ static const char * split_mode_str(llama_split_mode mode) {
217217
switch (mode) {
218218
case LLAMA_SPLIT_MODE_NONE: return "none";
219219
case LLAMA_SPLIT_MODE_LAYER: return "layer";
220-
case LLAMA_SPLIT_MODE_ROW: return "row";
220+
case LLAMA_SPLIT_MODE_GRAPH: return "graph";
221221
default: GGML_ABORT("invalid split mode");
222222
}
223223
}
@@ -334,7 +334,7 @@ static void print_usage(int /* argc */, char ** argv) {
334334
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
335335
printf(" --n-cpu-moe <n> (default: none)\n");
336336
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
337-
printf(" -sm, --split-mode <none|layer> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
337+
printf("  -sm, --split-mode <none|layer|graph>      (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
338338
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
339339
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
340340
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
@@ -630,12 +630,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
630630
mode = LLAMA_SPLIT_MODE_NONE;
631631
} else if (m == "layer") {
632632
mode = LLAMA_SPLIT_MODE_LAYER;
633-
} else if (m == "row") {
634-
fprintf(stderr, "\n\n=======================================================================\n");
635-
fprintf(stderr, "Split mode 'row' is no longer supported\n");
636-
fprintf(stderr, "=======================================================================\n\n\n");
637-
invalid_param = true;
638-
break;
633+
} else if (m == "graph") {
634+
mode = LLAMA_SPLIT_MODE_GRAPH;
639635
} else {
640636
invalid_param = true;
641637
break;

ggml/include/ggml.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3021,6 +3021,13 @@ extern "C" {
30213021

30223022
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
30233023

3024+
typedef struct {
3025+
int n_device;
3026+
int split_dim;
3027+
struct ggml_tensor * tensor;
3028+
struct ggml_tensor ** splits;
3029+
} ggml_split_tensor_t;
3030+
30243031
#ifdef __cplusplus
30253032
}
30263033
#endif

ggml/src/ggml-backend.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
4343
// get_alloc_size is optional, defaults to ggml_nbytes
4444
if (buft->iface.get_alloc_size) {
4545
size_t size = buft->iface.get_alloc_size(buft, tensor);
46-
assert(size >= ggml_nbytes(tensor));
46+
//assert(size >= ggml_nbytes(tensor));
4747
return size;
4848
}
4949
return ggml_nbytes(tensor);
@@ -1216,8 +1216,10 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
12161216
return -1;
12171217
}
12181218

1219+
//printf("%s: have %d backends, buffer is %s\n", __func__, sched->n_backends, ggml_backend_buffer_name(buffer));
12191220
// find highest prio backend that supports the buffer type and the op
12201221
for (int i = 0; i < sched->n_backends; i++) {
1222+
//printf(" Checking backend %d (%s)\n", i, ggml_backend_name(sched->backends[i]));
12211223
if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
12221224
ggml_backend_supports_op(sched->backends[i], op)) {
12231225
return i;
@@ -1393,6 +1395,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
13931395
// do not overwrite user assignments
13941396
if (*leaf_backend_id == -1) {
13951397
*leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
1398+
//printf("Pass 1: assigned backend %d to leaf %d, %s\n", *leaf_backend_id, i, graph->leafs[i]->name);
13961399
}
13971400
}
13981401

@@ -1402,6 +1405,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
14021405
// do not overwrite user assignments
14031406
if (*node_backend_id == -1) {
14041407
*node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
1408+
//printf("Pass 1: assigned backend %d to node %d, %s(%s)\n", *node_backend_id, i, ggml_op_name(node->op), node->name);
14051409

14061410
#if 0
14071411
// src
@@ -1445,6 +1449,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
14451449
cur_backend_id = *node_backend_id;
14461450
}
14471451
} else if (cur_backend_id != -1) {
1452+
//printf("(u1) invoking ggml_backend_sched_set_if_supported for node %d, %s with cur_backend_id = %d, node_backend_id = %d\n", i, node->name, cur_backend_id, *node_backend_id);
14481453
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
14491454
}
14501455
}
@@ -1466,6 +1471,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
14661471
cur_backend_id = *node_backend_id;
14671472
}
14681473
} else if (cur_backend_id != -1) {
1474+
//printf("(d1) invoking ggml_backend_sched_set_if_supported for node %d, %s with cur_backend_id = %d, node_backend_id = %d\n", i, node->name, cur_backend_id, *node_backend_id);
14691475
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
14701476
}
14711477
}
@@ -1482,6 +1488,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
14821488
if (*node_backend_id != -1) {
14831489
cur_backend_id = *node_backend_id;
14841490
} else if (cur_backend_id != -1) {
1491+
//printf("(u2) invoking ggml_backend_sched_set_if_supported for node %d, %s with cur_backend_id = %d, node_backend_id = %d\n", i, node->name, cur_backend_id, *node_backend_id);
14851492
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
14861493
}
14871494
}
@@ -1498,6 +1505,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
14981505
if (*node_backend_id != -1) {
14991506
cur_backend_id = *node_backend_id;
15001507
} else if (cur_backend_id != -1) {
1508+
//printf("(d2) invoking ggml_backend_sched_set_if_supported for node %d, %s with cur_backend_id = %d, node_backend_id = %d\n", i, node->name, cur_backend_id, *node_backend_id);
15011509
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
15021510
}
15031511
}
@@ -1535,6 +1543,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
15351543
if (n_supported > n_supported_best) {
15361544
n_supported_best = n_supported;
15371545
*node_backend_id = b;
1546+
//printf("Pass 3: assigned backend %d to unassigned node %d, %s\n", b, i, node->name);
15381547
SET_CAUSE(node, "3.best");
15391548
}
15401549
}
@@ -1555,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
15551564
}
15561565
}
15571566
if (supported) {
1567+
//printf("Pass 3: assigned backend %d to node %d, %s previously assigned to backend %d\n", b, i, node->name, *node_backend_id);
15581568
*node_backend_id = b;
15591569
SET_CAUSE(node, "3.upg");
15601570
break;
@@ -1583,9 +1593,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
15831593
// views are always on the same backend as the source
15841594
*src_backend_id = tensor_backend_id(src->view_src);
15851595
SET_CAUSE(src, "4.vsrc");
1596+
//printf("Pass 4: assigned backend %d to src %d, %s in node %d, %s from view_src\n", *src_backend_id, j, src->name, i, node->name);
15861597
} else {
15871598
*src_backend_id = *cur_backend_id;
15881599
SET_CAUSE(src, "4.cur");
1600+
//printf("Pass 4: assigned backend %d to src %d, %s in node %d, %s from current\n", *src_backend_id, j, src->name, i, node->name);
15891601
}
15901602
}
15911603
}
@@ -1620,7 +1632,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
16201632

16211633
// check if we should start a new split based on the sources of the current node
16221634
bool need_new_split = false;
1623-
if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1635+
if (node->op == GGML_OP_ADD && node->op_params[0] == 0xff) {
1636+
need_new_split = true;
1637+
}
1638+
else if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
16241639
for (int j = 0; j < GGML_MAX_SRC; j++) {
16251640
struct ggml_tensor * src = node->src[j];
16261641
if (src == NULL) {

0 commit comments

Comments
 (0)