Skip to content

Commit a73d075

Browse files
committed
BF: CS-674 Monitoring shows threads in W or E state although they are working properly
BF: CS-675 Enhance monitoring so that it can observe more than 10 threads
1 parent 22898a5 commit a73d075

File tree

2 files changed

+52
-52
lines changed

2 files changed

+52
-52
lines changed

source/libs/uti/sge_monitor.cc

Lines changed: 33 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -62,27 +62,16 @@
6262
typedef struct {
6363
const char *name; /* thread name */
6464
struct timeval last_wait_time; /* last wait time, last time when one thread loop finished */
65-
int warning_timeout; /* how long can the thread be blocked before a warning is shown */
66-
int error_timeout; /* how long can the thread be blocked before an error is shown */
65+
long warning_timeout; /* how long can the thread be blocked before a warning is shown */
66+
long error_timeout; /* how long can the thread be blocked before an error is shown */
6767
time_t update_time; /* last update time */
6868
dstring *output; /* thread specific info line */
6969
pthread_mutex_t Output_Mutex; /* gards one line */
7070
} Output_t;
7171

72-
#define MAX_OUTPUT_LINES 10 /* number of threads to monitor, currently 10 threads at max
73-
at the same time*/
74-
static Output_t Output[MAX_OUTPUT_LINES] = {
75-
{nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER},
76-
{nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER},
77-
{nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER},
78-
{nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER},
79-
{nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER},
80-
{nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER},
81-
{nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER},
82-
{nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER},
83-
{nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER},
84-
{nullptr, {0, 0}, NO_WARNING, NO_ERROR, 0, nullptr, PTHREAD_MUTEX_INITIALIZER},
85-
};
72+
#define MAX_OUTPUT_LINES 512 /* max number of threads to monitor at the same time */
73+
static bool Output_initialized = false;
74+
static Output_t Output[MAX_OUTPUT_LINES];
8675

8776
/* global mutex used for mallinfo initialisation and also used to access the Info_Line string */
8877
static pthread_mutex_t global_mutex = PTHREAD_MUTEX_INITIALIZER;
@@ -203,9 +192,19 @@ void sge_monitor_free(monitoring_t *monitor) {
203192
*******************************************************************************/
204193
void
205194
sge_monitor_init(monitoring_t *monitor, const char *thread_name, extension_t ext,
206-
int warning_timeout, int error_timeout) {
195+
long warning_timeout, long error_timeout) {
207196
DENTER(GDI_LAYER);
208197

198+
sge_mutex_lock("sge_monitor_status", __func__, __LINE__, &global_mutex);
199+
if (!Output_initialized) {
200+
Output_initialized = true;
201+
for (int i = 0; i < MAX_OUTPUT_LINES; i++) {
202+
memset(&Output[i], 0, sizeof(Output_t));
203+
Output[i].Output_Mutex = PTHREAD_MUTEX_INITIALIZER;
204+
}
205+
}
206+
sge_mutex_unlock("sge_monitor_status", __func__, __LINE__, &global_mutex);
207+
209208
/*
210209
* initialize the mallinfo function pointer if it is available
211210
*/
@@ -405,41 +404,38 @@ u_long32 sge_monitor_status(char **info_message, u_long32 monitor_time) {
405404
{/* this is the qping info section, it checks if each thread is still alive */
406405
int i;
407406
int error_count = 0;
407+
int warning_count = 0;
408408
struct timeval now{};
409-
double time;
410-
char state = 'R';
411409
gettimeofday(&now, nullptr);
412410

413411
for (i = 0; i < MAX_OUTPUT_LINES; i++) {
412+
414413
sge_mutex_lock("sge_monitor_status", __func__, __LINE__, &(Output[i].Output_Mutex));
415414
if (Output[i].name != nullptr) {
416-
time = now.tv_usec - Output[i].last_wait_time.tv_usec;
417-
time = now.tv_sec - Output[i].last_wait_time.tv_sec + (time / 1000000);
418-
415+
char state = 'R';
416+
double time = now.tv_usec - Output[i].last_wait_time.tv_usec;
419417

420-
if (Output[i].warning_timeout != NO_WARNING) {
421-
if (Output[i].warning_timeout < time) {
422-
if (Output[i].error_timeout < time) {
423-
state = 'E';
424-
} else {
425-
state = 'W';
426-
}
427-
error_count++;
428-
}
418+
time = now.tv_sec - Output[i].last_wait_time.tv_sec + (time / 1000000);
419+
if (Output[i].error_timeout != NO_ERROR && Output[i].error_timeout < time) {
420+
state = 'E';
421+
error_count++;
422+
} else if (Output[i].warning_timeout != NO_WARNING && Output[i].warning_timeout < time) {
423+
state = 'W';
424+
warning_count++;
429425
}
430426
sge_dstring_sprintf_append(&Info_Line, MSG_UTI_MONITOR_INFO_SCF, Output[i].name, state, time);
431427
}
432428
sge_mutex_unlock("sge_monitor_status", __func__, __LINE__, &(Output[i].Output_Mutex));
433429
}
434430

435-
if (error_count == 0) {
436-
sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_OK);
437-
} else if (error_count == 1) {
438-
ret = 1;
431+
if (error_count > 0) {
432+
sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_ERROR);
433+
ret = 2;
434+
} else if (warning_count > 0) {
439435
sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_WARNING);
436+
ret = 1;
440437
} else {
441-
ret = 2;
442-
sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_ERROR);
438+
sge_dstring_append(&Info_Line, MSG_UTI_MONITOR_OK);
443439
}
444440
sge_dstring_append(&Info_Line, "\n");
445441
}

source/libs/uti/sge_monitor.h

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -98,24 +98,28 @@
9898
/**
9999
* qping thread warning times in seconds
100100
*/
101-
const int NO_WARNING = 0;
102-
const int EVENT_MASTER_THREAD_WARNING = 10;
103-
const int TET_WARNING = 30;
104-
const int MT_WARNING = 10;
105-
const int ST_WARNING = 0; /* no timeout for this thread */
106-
const int EXECD_WARNING = 10;
107-
const int SCT_WARNING = 20;
101+
const long NO_WARNING = 0;
102+
const long EVENT_MASTER_THREAD_WARNING = 5;
103+
const long TET_WARNING = 10;
104+
const long MT_WARNING = 0;
105+
const long WT_WARNING = 60;
106+
const long RT_WARNING = 60;
107+
const long ST_WARNING = 0; /* no timeout for this thread */
108+
const long EXECD_WARNING = 10;
109+
const long SCT_WARNING = 20;
108110

109111
/**
110112
* qping thread error times in seconds
111113
**/
112-
const int NO_ERROR = 0;
113-
const int EVENT_MASTER_THREAD_ERROR = 600;
114-
const int TET_ERROR = 600;
115-
const int MT_ERROR = 600;
116-
const int ST_ERROR = 0; /* no timeout for this thread */
117-
const int EXECD_ERROR = 600;
118-
const int SCT_ERROR = 600;
114+
const long NO_ERROR = 0;
115+
const long EVENT_MASTER_THREAD_ERROR = 60;
116+
const long TET_ERROR = 60;
117+
const long MT_ERROR = 0;
118+
const long WT_ERROR = 600;
119+
const long RT_ERROR = 60*60*24*365;
120+
const long ST_ERROR = 0; /* no timeout for this thread */
121+
const long EXECD_ERROR = 600;
122+
const long SCT_ERROR = 600;
119123

120124
/**
121125
* This function definition is the prototype for the output function of a data
@@ -167,7 +171,7 @@ typedef struct {
167171
} monitoring_t;
168172

169173
void sge_monitor_init(monitoring_t *monitor, const char *thread_name, extension_t ext,
170-
int warning_timeout, int error_timeout);
174+
long warning_timeout, long error_timeout);
171175

172176
void sge_monitor_free(monitoring_t *monitor);
173177

0 commit comments

Comments
 (0)