Skip to content

Commit 45bcee1

Browse files
committed
BF: CS-676 Improve qmaster shutdown performance for scenarios where the master has more than 128 threads
1 parent a73d075 commit 45bcee1

File tree

2 files changed

+41
-11
lines changed

2 files changed

+41
-11
lines changed

source/daemons/qmaster/sge_thread_reader.cc

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,12 @@
2727
#include "uti/sge_os.h"
2828
#include "uti/sge_profiling.h"
2929
#include "uti/sge_rmon_macros.h"
30+
#include "uti/sge_time.h"
3031

3132
#include "sgeobj/ocs_DataStore.h"
3233

34+
#include "sge_thread_ctrl.h"
35+
3336
#ifdef OBSERVE
3437
# include "cull/cull_observe.h"
3538
#endif
@@ -143,7 +146,7 @@ sge_reader_main(void *arg) {
143146

144147
// init monitoring
145148
cl_thread_func_startup(thread_config);
146-
sge_monitor_init(p_monitor, thread_config->thread_name, GDI_EXT, MT_WARNING, MT_ERROR);
149+
sge_monitor_init(p_monitor, thread_config->thread_name, GDI_EXT, RT_WARNING, RT_ERROR);
147150
sge_qmaster_thread_init(QMASTER, READER_THREAD, true);
148151

149152
/* register at profiling module */
@@ -164,7 +167,8 @@ sge_reader_main(void *arg) {
164167

165168
MONITOR_SET_QLEN(p_monitor, sge_tq_get_task_count(ReaderRequestQueue));
166169

167-
if (packet != nullptr) {
170+
// handle the packet only if it is not nullptr and the shutdown has not started
171+
if (packet != nullptr && !sge_thread_has_shutdown_started()) {
168172
sge_gdi_task_class_t *task;
169173
bool is_only_read_request = true;
170174

@@ -216,7 +220,7 @@ sge_reader_main(void *arg) {
216220

217221
// handle the request (GDI/Report/Ack ...
218222
if (packet->request_type == PACKET_GDI_REQUEST) {
219-
// sge_usleep(3000000);
223+
//sge_usleep(1000000);
220224

221225
task = packet->first_task;
222226
while (task != nullptr) {
@@ -298,14 +302,25 @@ sge_reader_main(void *arg) {
298302
thread_output_profiling("reader thread profiling summary:\n", &next_prof_output);
299303

300304
sge_monitor_output(p_monitor);
301-
} else {
302-
int execute = 0;
305+
}
303306

307+
// pass the cancellation point at least once or stay here if shutdown was triggered
308+
bool shutdown_started = false;
309+
do {
304310
// pthread cancellation point
311+
int execute = 0;
305312
pthread_cleanup_push(sge_reader_cleanup_monitor, static_cast<void *>(p_monitor));
306313
cl_thread_func_testcancel(thread_config);
307314
pthread_cleanup_pop(execute); // cleanup monitor
308-
}
315+
316+
// shutdown in process?
317+
shutdown_started = sge_thread_has_shutdown_started();
318+
319+
// if we are going to wait here, then do not eat up all the CPU time
320+
if (shutdown_started) {
321+
sge_usleep(25000);
322+
}
323+
} while (shutdown_started);
309324
}
310325

311326
// Don't add cleanup code here. It will never be executed. Instead, register a cleanup function with

source/daemons/qmaster/sge_thread_worker.cc

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,12 @@
4141
#include "uti/sge_os.h"
4242
#include "uti/sge_profiling.h"
4343
#include "uti/sge_rmon_macros.h"
44+
#include "uti/sge_time.h"
4445

4546
#include "sgeobj/ocs_DataStore.h"
4647

48+
#include "sge_thread_ctrl.h"
49+
4750
#ifdef OBSERVE
4851
# include "cull/cull_observe.h"
4952
#endif
@@ -193,7 +196,7 @@ sge_worker_main(void *arg) {
193196

194197
// init monitoring
195198
cl_thread_func_startup(thread_config);
196-
sge_monitor_init(p_monitor, thread_config->thread_name, GDI_EXT, MT_WARNING, MT_ERROR);
199+
sge_monitor_init(p_monitor, thread_config->thread_name, GDI_EXT, WT_WARNING, WT_ERROR);
197200
sge_qmaster_thread_init(QMASTER, WORKER_THREAD, true);
198201

199202
/* register at profiling module */
@@ -214,7 +217,8 @@ sge_worker_main(void *arg) {
214217

215218
MONITOR_SET_QLEN(p_monitor, sge_tq_get_task_count(GlobalRequestQueue));
216219

217-
if (packet != nullptr) {
220+
// handle the packet only if it is not nullptr and the shutdown has not started
221+
if (packet != nullptr && !sge_thread_has_shutdown_started()) {
218222
sge_gdi_task_class_t *task;
219223
bool is_only_read_request = true;
220224

@@ -346,14 +350,25 @@ sge_worker_main(void *arg) {
346350
thread_output_profiling("worker thread profiling summary:\n", &next_prof_output);
347351

348352
sge_monitor_output(p_monitor);
349-
} else {
350-
int execute = 0;
353+
}
351354

355+
// pass the cancellation point at least once or stay here if shutdown was triggered
356+
bool shutdown_started = false;
357+
do {
352358
// pthread cancellation point
359+
int execute = 0;
353360
pthread_cleanup_push(sge_worker_cleanup_monitor, static_cast<void *>(p_monitor));
354361
cl_thread_func_testcancel(thread_config);
355362
pthread_cleanup_pop(execute); // cleanup monitor
356-
}
363+
364+
// shutdown in process?
365+
shutdown_started = sge_thread_has_shutdown_started();
366+
367+
// if we are going to wait here, then do not eat up all the CPU time
368+
if (shutdown_started) {
369+
sge_usleep(25000);
370+
}
371+
} while (shutdown_started);
357372
}
358373

359374
// Don't add cleanup code here. It will never be executed. Instead, register a cleanup function with

0 commit comments

Comments
 (0)