@@ -193,6 +193,11 @@ typedef pthread_t ggml_thread_t;
193193#include <TargetConditionals.h>
194194#endif
195195
196+ #include <stdatomic.h>
197+
198+ static _Atomic uint64_t ggml_op_us [GGML_OP_COUNT ];
199+ static _Atomic uint64_t ggml_op_calls [GGML_OP_COUNT ];
200+
196201static const struct ggml_type_traits_cpu type_traits_cpu [GGML_TYPE_COUNT ] = {
197202 [GGML_TYPE_F32 ] = {
198203 .from_float = (ggml_from_float_t ) ggml_cpu_fp32_to_fp32 ,
@@ -2864,6 +2869,44 @@ struct ggml_cplan ggml_graph_plan(
28642869 return cplan ;
28652870}
28662871
2872+ // static thread_ret_t ggml_graph_compute_thread(void * data) {
2873+ // struct ggml_compute_state * state = (struct ggml_compute_state *) data;
2874+ // struct ggml_threadpool * tp = state->threadpool;
2875+ //
2876+ // const struct ggml_cgraph * cgraph = tp->cgraph;
2877+ // const struct ggml_cplan * cplan = tp->cplan;
2878+ //
2879+ // set_numa_thread_affinity(state->ith);
2880+ //
2881+ // struct ggml_compute_params params = {
2882+ // /*.ith =*/ state->ith,
2883+ // /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
2884+ // /*.wsize =*/ cplan->work_size,
2885+ // /*.wdata =*/ cplan->work_data,
2886+ // /*.threadpool=*/ tp,
2887+ // };
2888+ //
2889+ // for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
2890+ // struct ggml_tensor * node = cgraph->nodes[node_n];
2891+ //
2892+ // ggml_compute_forward(&params, node);
2893+ //
2894+ // if (state->ith == 0 && cplan->abort_callback &&
2895+ // cplan->abort_callback(cplan->abort_callback_data)) {
2896+ // atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
2897+ // tp->ec = GGML_STATUS_ABORTED;
2898+ // }
2899+ //
2900+ // if (node_n + 1 < cgraph->n_nodes) {
2901+ // ggml_barrier(state->threadpool);
2902+ // }
2903+ // }
2904+ //
2905+ // ggml_barrier(state->threadpool);
2906+ //
2907+ // return 0;
2908+ // }
2909+
28672910static thread_ret_t ggml_graph_compute_thread (void * data ) {
28682911 struct ggml_compute_state * state = (struct ggml_compute_state * ) data ;
28692912 struct ggml_threadpool * tp = state -> threadpool ;
@@ -2884,21 +2927,25 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
28842927 for (int node_n = 0 ; node_n < cgraph -> n_nodes && atomic_load_explicit (& tp -> abort , memory_order_relaxed ) != node_n ; node_n ++ ) {
28852928 struct ggml_tensor * node = cgraph -> nodes [node_n ];
28862929
2930+ uint64_t t0 = ggml_time_us ();
28872931 ggml_compute_forward (& params , node );
2932+ uint64_t dt = ggml_time_us () - t0 ;
2933+
2934+ atomic_fetch_add_explicit (& ggml_op_us [node -> op ], dt , memory_order_relaxed );
2935+ atomic_fetch_add_explicit (& ggml_op_calls [node -> op ], 1 , memory_order_relaxed );
28882936
28892937 if (state -> ith == 0 && cplan -> abort_callback &&
28902938 cplan -> abort_callback (cplan -> abort_callback_data )) {
28912939 atomic_store_explicit (& tp -> abort , node_n + 1 , memory_order_relaxed );
28922940 tp -> ec = GGML_STATUS_ABORTED ;
2893- }
2941+ }
28942942
28952943 if (node_n + 1 < cgraph -> n_nodes ) {
28962944 ggml_barrier (state -> threadpool );
28972945 }
28982946 }
28992947
29002948 ggml_barrier (state -> threadpool );
2901-
29022949 return 0 ;
29032950}
29042951
@@ -3201,6 +3248,33 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
32013248 ggml_threadpool_free (threadpool );
32023249 }
32033250
3251+ // printf("\n========= GGML OP PERF =========\n");
3252+ // for (int i = 0; i < GGML_OP_COUNT; i++) {
3253+ // uint64_t us = atomic_load(&ggml_op_us[i]);
3254+ // uint64_t calls = atomic_load(&ggml_op_calls[i]);
3255+ // if (calls == 0) continue;
3256+ //
3257+ // printf("%-16s : %8llu us %6llu calls avg %6llu us\n",
3258+ // ggml_op_name(i),
3259+ // (unsigned long long)us,
3260+ // (unsigned long long)calls,
3261+ // (unsigned long long)(us / calls));
3262+ // }
3263+ // printf("================================\n\n");
3264+
3265+ // printf("\n");
3266+ // for (int i = 0; i < GGML_OP_COUNT; i++) {
3267+ // uint64_t us = atomic_load(&ggml_op_us[i]);
3268+ // uint64_t calls = atomic_load(&ggml_op_calls[i]);
3269+ // if (calls == 0) continue;
3270+ //
3271+ // printf("%-16s,%8llu us,%6llu,%6llu us,",
3272+ // ggml_op_name(i),
3273+ // (unsigned long long)us,
3274+ // (unsigned long long)calls,
3275+ // (unsigned long long)(us / calls));
3276+ // }
3277+
32043278 return ret ;
32053279}
32063280
0 commit comments