|
11 | 11 |
|
12 | 12 | #define MAX_NARGS 2 |
13 | 13 |
|
14 | | -int main(int argc, char *argv[]) { |
15 | | - |
16 | | - int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency())); |
17 | | - int n_rounds = 100; |
18 | | - |
19 | | - if (argc > 1) { |
20 | | - n_threads = std::atoi(argv[1]); |
21 | | - } |
22 | | - |
23 | | - if (argc > 2) { |
24 | | - n_rounds = std::atoi(argv[2]); |
25 | | - } |
26 | | - |
| 14 | +static void test_barrier(int n_threads, int n_rounds) { |
27 | 15 | struct ggml_init_params params = { |
28 | 16 | /* .mem_size = */ 1024*1024*1024, |
29 | 17 | /* .mem_buffer = */ NULL, |
@@ -56,7 +44,7 @@ int main(int argc, char *argv[]) { |
56 | 44 | exit(1); |
57 | 45 | } |
58 | 46 |
|
59 | | - // Create compute plan |
| 47 | + // The test runs with a constant number of threads |
60 | 48 | struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool); |
61 | 49 |
|
62 | 50 | std::vector<uint8_t> work_data(cplan.work_size); |
@@ -89,6 +77,80 @@ int main(int argc, char *argv[]) { |
89 | 77 |
|
90 | 78 | ggml_threadpool_free(threadpool); |
91 | 79 | ggml_free(ctx); |
| 80 | +} |
| 81 | + |
| 82 | +static void test_active(int n_threads, int n_rounds) { |
| 83 | + struct ggml_init_params params = { |
| 84 | + /* .mem_size = */ 1024*1024*1024, |
| 85 | + /* .mem_buffer = */ NULL, |
| 86 | + /* .no_alloc = */ false, |
| 87 | + }; |
| 88 | + |
| 89 | + struct ggml_context * ctx = ggml_init(params); |
| 90 | + |
| 91 | + // Create the compute graph in ctx |
| 92 | + struct ggml_cgraph * gf = ggml_new_graph(ctx); |
| 93 | + |
| 94 | + // Small graph with parallel ops with barriers: 2 loop iterations x 2 chained ggml_mul_mat calls, each consuming the previous `out` |
| 95 | + struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64); |
| 96 | + for (int i = 0; i < 2; i++) { |
| 97 | + struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128); |
| 98 | + out = ggml_mul_mat(ctx, a, out); |
| 99 | + |
| 100 | + struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64); |
| 101 | + out = ggml_mul_mat(ctx, d, out); |
| 102 | + } |
| 103 | + |
| 104 | + ggml_build_forward_expand(gf, out); |
| 105 | + int n_nodes = ggml_graph_n_nodes(gf); |
| 106 | + |
| 107 | + // Create the threadpool from the default params for n_threads; exit(1) if creation fails |
| 108 | + struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads); |
| 109 | + struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp); |
| 110 | + if (!threadpool) { |
| 111 | + fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads); |
| 112 | + exit(1); |
| 113 | + } |
| 114 | + |
| 115 | + std::cerr << "graph-compute with" |
| 116 | + << "\n n_threads: " << n_threads |
| 117 | + << "\n n_nodes: " << n_nodes |
| 118 | + << "\n n_rounds: " << n_rounds |
| 119 | + << "\n"; |
| 120 | + // ggml_graph_print(gf); |
| 121 | + |
| 122 | + // In this test the plan is built with 1 thread on every 4th iteration (i % 4 == 0) and with n_threads otherwise, |
| 123 | + // to test for race conditions in the path that changes the number of threads; a fresh work buffer is allocated per round |
| 124 | + |
| 125 | + for (int i=0; i < n_rounds; i++) { |
| 126 | + struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool); |
| 127 | + |
| 128 | + std::vector<uint8_t> work_data(cplan.work_size); |
| 129 | + cplan.work_data = work_data.data(); |
| 130 | + |
| 131 | + ggml_graph_compute(gf, &cplan); |
| 132 | + } |
| 133 | + |
| 134 | + ggml_threadpool_free(threadpool); |
| 135 | + ggml_free(ctx); |
| 136 | +} |
| 137 | + |
| 138 | +int main(int argc, char *argv[]) { |
| 139 | + /* Defaults: n_threads = hardware concurrency clamped to [1, 4], n_rounds = 100; argv[1] and argv[2] override them (parsed with std::atoi, not validated here) */ |
| 140 | + int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency())); |
| 141 | + int n_rounds = 100; |
| 142 | + |
| 143 | + if (argc > 1) { |
| 144 | + n_threads = std::atoi(argv[1]); |
| 145 | + } |
| 146 | + |
| 147 | + if (argc > 2) { |
| 148 | + n_rounds = std::atoi(argv[2]); |
| 149 | + } |
| 150 | + |
| 151 | + test_barrier(n_threads, n_rounds); |
| 152 | + /* test_active is given 100x as many rounds as test_barrier */ |
| 153 | + test_active(n_threads, n_rounds * 100); |
92 | 154 |
|
93 | 155 | return 0; |
94 | 156 | }
0 commit comments