@@ -10,34 +10,37 @@ void init_tp_data(ExtTPData* tp_data)
1010 tp_data->next_stage = 0 ;
1111}
1212
13- __global__ void cross_device_barrier_cuda_kernel
14- (
15- uint32_t * sync,
16- uint32_t * sync_next,
17- uint32_t num_devices,
18- uint32_t device
19- )
20- {
21- // Arrive
22- sync[device] = 1 ;
13+ // Currently unused: the kernel below is commented out for compatibility
2314
24- // Clear flags for next barrier
25- if (device == 0 )
26- for (int i = 0 ; i < MAX_SYNC_DEVICES; ++i)
27- sync_next[i] = 0 ;
28-
29- // Wait for other devices to arrive
30- int delay = 5 ;
31- while (true )
32- {
33- int i = 0 ;
34- for (; i < num_devices; ++i) if (i != device && sync[i] == 0 ) break ;
35- if (i == num_devices) break ;
36- __nanosleep (delay);
37- delay = min (delay * 2 , 500 );
38- __threadfence_system ();
39- }
40- }
15+ // __global__ void cross_device_barrier_cuda_kernel
16+ // (
17+ // uint32_t* sync,
18+ // uint32_t* sync_next,
19+ // uint32_t num_devices,
20+ // uint32_t device
21+ // )
22+ // {
23+ //
24+ // // Arrive
25+ // sync[device] = 1;
26+ //
27+ // // Clear flags for next barrier
28+ // if (device == 0)
29+ // for (int i = 0; i < MAX_SYNC_DEVICES; ++i)
30+ // sync_next[i] = 0;
31+ //
32+ // // Wait for other devices to arrive
33+ // int delay = 5;
34+ // while (true)
35+ // {
36+ // int i = 0;
37+ // for (; i < num_devices; ++i) if (i != device && sync[i] == 0) break;
38+ // if (i == num_devices) break;
39+ // __nanosleep(delay);
40+ // delay = min(delay * 2, 500);
41+ // __threadfence_system();
42+ // }
43+ // }
4144
4245void cross_device_barrier_cuda
4346(
@@ -48,11 +51,11 @@ void cross_device_barrier_cuda
4851 uint32_t device
4952)
5053{
51- cross_device_barrier_cuda_kernel<<<1 , 1 , 0 , stream>>>
52- (
53- sync,
54- sync_next,
55- num_devices,
56- device
57- );
54+ // cross_device_barrier_cuda_kernel<<<1, 1, 0, stream>>>
55+ // (
56+ // sync,
57+ // sync_next,
58+ // num_devices,
59+ // device
60+ // );
5861}
0 commit comments