|
62 | 62 | typedef struct { |
63 | 63 | const char *name; /* thread name */ |
64 | 64 | struct timeval last_wait_time; /* last wait time, last time when one thread loop finished */ |
65 | | - int warning_timeout; /* how long can the thread be blocked before a warning is shown */ |
66 | | - int error_timeout; /* how long can the thread be blocked before an error is shown */ |
| 65 | + long warning_timeout; /* how long can the thread be blocked before a warning is shown */ |
| 66 | + long error_timeout; /* how long can the thread be blocked before an error is shown */ |
67 | 67 | time_t update_time; /* last update time */ |
68 | 68 | dstring *output; /* thread specific info line */ |
69 | 69 | pthread_mutex_t Output_Mutex; /* guards one line */ |
70 | 70 | } Output_t; |
71 | 71 |
|
72 | | -#define MAX_OUTPUT_LINES 10 /* number of threads to monitor, currently 10 threads at max |
73 | | - at the same time*/ |
74 | | -static Output_t Output[MAX_OUTPUT_LINES] = { |
75 | | - {nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER}, |
76 | | - {nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER}, |
77 | | - {nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER}, |
78 | | - {nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER}, |
79 | | - {nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER}, |
80 | | - {nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER}, |
81 | | - {nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER}, |
82 | | - {nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER}, |
83 | | - {nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER}, |
84 | | - {nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER}, |
85 | | -}; |
| 72 | +#define MAX_OUTPUT_LINES 512 /* max number of threads to monitor at the same time */ |
| 73 | +static bool Output_initialized = false; |
| 74 | +static Output_t Output[MAX_OUTPUT_LINES]; |
86 | 75 |
|
87 | 76 | /* global mutex used for mallinfo initialisation and also used to access the Info_Line string */ |
88 | 77 | static pthread_mutex_t global_mutex = PTHREAD_MUTEX_INITIALIZER; |
@@ -203,9 +192,19 @@ void sge_monitor_free(monitoring_t *monitor) { |
203 | 192 | *******************************************************************************/ |
204 | 193 | void |
205 | 194 | sge_monitor_init(monitoring_t *monitor, const char *thread_name, extension_t ext, |
206 | | - int warning_timeout, int error_timeout) { |
| 195 | + long warning_timeout, long error_timeout) { |
207 | 196 | DENTER(GDI_LAYER); |
208 | 197 |
|
| 198 | + sge_mutex_lock("sge_monitor_status", __func__, __LINE__, &global_mutex); |
| 199 | + if (!Output_initialized) { |
| 200 | + Output_initialized = true; |
| 201 | + for (int i = 0; i < MAX_OUTPUT_LINES; i++) { |
| 202 | + memset(&Output[i], 0, sizeof(Output_t)); |
| 203 | + Output[i].Output_Mutex = PTHREAD_MUTEX_INITIALIZER; |
| 204 | + } |
| 205 | + } |
| 206 | + sge_mutex_unlock("sge_monitor_status", __func__, __LINE__, &global_mutex); |
| 207 | + |
209 | 208 | /* |
210 | 209 | * initialize the mallinfo function pointer if it is available |
211 | 210 | */ |
@@ -405,41 +404,38 @@ u_long32 sge_monitor_status(char **info_message, u_long32 monitor_time) { |
405 | 404 | {/* this is the qping info section, it checks if each thread is still alive */ |
406 | 405 | int i; |
407 | 406 | int error_count = 0; |
| 407 | + int warning_count = 0; |
408 | 408 | struct timeval now{}; |
409 | | - double time; |
410 | | - char state = 'R'; |
411 | 409 | gettimeofday(&now, nullptr); |
412 | 410 |
|
413 | 411 | for (i = 0; i < MAX_OUTPUT_LINES; i++) { |
| 412 | + |
414 | 413 | sge_mutex_lock("sge_monitor_status", __func__, __LINE__, &(Output[i].Output_Mutex)); |
415 | 414 | if (Output[i].name != nullptr) { |
416 | | - time = now.tv_usec - Output[i].last_wait_time.tv_usec; |
417 | | - time = now.tv_sec - Output[i].last_wait_time.tv_sec + (time / 1000000); |
418 | | - |
| 415 | + char state = 'R'; |
| 416 | + double time = now.tv_usec - Output[i].last_wait_time.tv_usec; |
419 | 417 |
|
420 | | - if (Output[i].warning_timeout != NO_WARNING) { |
421 | | - if (Output[i].warning_timeout < time) { |
422 | | - if (Output[i].error_timeout < time) { |
423 | | - state = 'E'; |
424 | | - } else { |
425 | | - state = 'W'; |
426 | | - } |
427 | | - error_count++; |
428 | | - } |
| 418 | + time = now.tv_sec - Output[i].last_wait_time.tv_sec + (time / 1000000); |
| 419 | + if (Output[i].error_timeout != NO_ERROR && Output[i].error_timeout < time) { |
| 420 | + state = 'E'; |
| 421 | + error_count++; |
| 422 | + } else if (Output[i].warning_timeout != NO_WARNING && Output[i].warning_timeout < time) { |
| 423 | + state = 'W'; |
| 424 | + warning_count++; |
429 | 425 | } |
430 | 426 | sge_dstring_sprintf_append(&Info_Line, MSG_UTI_MONITOR_INFO_SCF, Output[i].name, state, time); |
431 | 427 | } |
432 | 428 | sge_mutex_unlock("sge_monitor_status", __func__, __LINE__, &(Output[i].Output_Mutex)); |
433 | 429 | } |
434 | 430 |
|
435 | | - if (error_count == 0) { |
436 | | - sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_OK); |
437 | | - } else if (error_count == 1) { |
438 | | - ret = 1; |
| 431 | + if (error_count > 0) { |
| 432 | + sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_ERROR); |
| 433 | + ret = 2; |
| 434 | + } else if (warning_count > 0) { |
439 | 435 | sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_WARNING); |
| 436 | + ret = 1; |
440 | 437 | } else { |
441 | | - ret = 2; |
442 | | - sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_ERROR); |
| 438 | + sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_OK); |
443 | 439 | } |
444 | 440 | sge_dstring_append(&Info_Line, "\n"); |
445 | 441 | } |
|
0 commit comments