diff options
Diffstat (limited to 'web/attachments/327688-check_procs.c.patch')
-rw-r--r-- | web/attachments/327688-check_procs.c.patch | 796 |
1 files changed, 796 insertions, 0 deletions
diff --git a/web/attachments/327688-check_procs.c.patch b/web/attachments/327688-check_procs.c.patch new file mode 100644 index 0000000..2c262f7 --- /dev/null +++ b/web/attachments/327688-check_procs.c.patch | |||
@@ -0,0 +1,796 @@ | |||
1 | --- check_procs.c 2009-02-21 09:59:24.000000000 +0000 | ||
2 | +++ check_procs.c.new 2009-05-19 10:41:14.000000000 +0000 | ||
3 | @@ -27,7 +27,8 @@ | ||
4 | * | ||
5 | * You should have received a copy of the GNU General Public License | ||
6 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
7 | -* | ||
8 | +* | ||
9 | +* State file stuff originally by: Alain Williams <addw@phcomp.co.uk> | ||
10 | * | ||
11 | *****************************************************************************/ | ||
12 | |||
13 | @@ -42,13 +43,7 @@ | ||
14 | #include "regex.h" | ||
15 | |||
16 | #include <pwd.h> | ||
17 | - | ||
18 | -int process_arguments (int, char **); | ||
19 | -int validate_arguments (void); | ||
20 | -int check_thresholds (int); | ||
21 | -int convert_to_seconds (char *); | ||
22 | -void print_help (void); | ||
23 | -void print_usage (void); | ||
24 | +#include <time.h> | ||
25 | |||
26 | int wmax = -1; | ||
27 | int cmax = -1; | ||
28 | @@ -77,6 +72,7 @@ | ||
29 | METRIC_ELAPSED | ||
30 | }; | ||
31 | enum metric metric = METRIC_PROCS; | ||
32 | +char metric_state_name = 'P'; /* Metric name in the state file */ | ||
33 | |||
34 | int verbose = 0; | ||
35 | int uid; | ||
36 | @@ -92,9 +88,98 @@ | ||
37 | char *fmt; | ||
38 | char *fails; | ||
39 | char tmp[MAX_INPUT_BUFFER]; | ||
40 | +time_t now; | ||
41 | +time_t state_limit_start; | ||
42 | |||
43 | FILE *ps_input = NULL; | ||
44 | |||
45 | +/* Optionally trigger an alert if a process has been in a state for | ||
46 | + * some time. This time will be measured in minutes, ie much longer than | ||
47 | + * this program runs for - thus a state file is needed to store this | ||
48 | + * between runs of this program. | ||
49 | + * This happens if --state-file is specified. | ||
50 | + * | ||
51 | + * The state file records information about processes that exceed some criteria | ||
52 | + * for a warning or critical notice. | ||
53 | + * The file will contain one 'V' line. | ||
54 | + * If the metric is PROCS there will be one N line, else zero or more P lines. | ||
55 | + * Format of the state file: | ||
56 | + * Max line length of 500 | ||
57 | + * Empty lines and lines starting '#' are ignored | ||
58 | + * Lines consist of a type character, a space and optional extra information | ||
59 | + * V version_number | ||
60 | + * P pid ppid name MS secs | ||
61 | + * name is limited to a max 20 characters | ||
62 | + * M is the metric: | ||
63 | + * V virtual memory size | ||
64 | + * R resident set memory size | ||
65 | + * C percentage CPU | ||
66 | + * E time elapsed in seconds | ||
67 | + * S is the state: | ||
68 | + * W Warning | ||
69 | + * C Critical | ||
70 | + * N MS secs | ||
71 | + * M will be 'P' | ||
72 | + * S is the state as above | ||
73 | + * secs is the epoch time the metric was first exceeded - hex number | ||
74 | + * The 'MS secs' pair may appear twice on a line as it is possible for a process (or the | ||
75 | + * max # processes) to exceed both the warning and critical thresholds but for different times. | ||
76 | + * If something is C then it is implicitly W. | ||
77 | + * | ||
78 | + * There might be a trailing space on an N line. | ||
79 | + * | ||
80 | + * Eg: | ||
81 | + * P 1234 1200 cpu_hog CW 4a05a817 CC 4a05a91f | ||
82 | + * N PW 4a05a91f | ||
83 | + */ | ||
84 | +#define STATE_VERSION 1 /* Written to and checked against the 'V' line - change me if the file format changes */ | ||
85 | +#define STATE_MAX_LINE 500 /* Size of the buffer that one state file line is read into */ | ||
86 | +#define MAX_PROG_NAME 20 /* Longest program name stored - also hard-coded as %20s in read_state_file(), change both */ | ||
87 | +#define METRIC_CODES "PVRCE" /* Valid metric characters, for input validation */ | ||
88 | +#define STATE_CODES "WC" /* Valid state characters, for input validation */ | ||
89 | +#define STATE2state(x) ((x) == STATE_WARNING ? 'W' : 'C') /* Convert STATE_WARNING to 'W'; anything else (normally STATE_CRITICAL) to 'C' */ | ||
90 | + | ||
91 | +/* A process can exceed various limits. This describes one of them | ||
92 | + */ | ||
93 | +typedef struct plimit { | ||
94 | + struct plimit* pl_next; /* NULL terminated list */ | ||
95 | + time_t pl_when; /* When it first exceeded this limit */ | ||
96 | + int pl_state; /* STATE_WARNING or STATE_CRITICAL */ | ||
97 | + int pl_seen; /* Non-zero if exceeded this run; stays 0 for entries only read from the file */ | ||
98 | + char pl_metric; /* What is exceeded - the metric character as stored in the file */ | ||
99 | +} PLimit; | ||
100 | + | ||
101 | +/* Describes one process that is (or, according to the state file, was) exceeding a limit | ||
102 | + */ | ||
103 | +typedef struct exproc { | ||
104 | + struct exproc* ep_next; /* NULL terminated list */ | ||
105 | + pid_t ep_pid; /* Process ID */ | ||
106 | + pid_t ep_ppid; /* Parent PID */ | ||
107 | + char* ep_prog; /* Program name - heap copy made with strdup() */ | ||
108 | + PLimit* ep_limits; /* Limits exceeded list */ | ||
109 | + int ep_seen; /* Set by record_state() this run; unseen entries are not written back */ | ||
110 | +} ExProc; | ||
111 | + | ||
112 | +char* state_filename; /* File that we store this in (--state-file); NULL means no state is kept */ | ||
113 | +int state_time = 5; /* Trigger time in minutes (--state-time) */ | ||
114 | +ExProc* state_list; /* Used for process specific metrics - ie metric is *not* PROCS */ | ||
115 | +PLimit* state_nprocs; /* Info on # procs exceeded - used if metric is PROCS */ | ||
116 | +int state_changed; /* Non-zero when what is in memory differs from the file, ie need to write back */ | ||
117 | +int must_rewrite; /* Set this if there is a syntax error in the file, or | ||
118 | + * some other reason which means we must rewrite it */ | ||
119 | + | ||
120 | +int process_arguments (int, char **); | ||
121 | +int validate_arguments (void); | ||
122 | +int check_thresholds (int); | ||
123 | +int convert_to_seconds (char *); | ||
124 | +void print_help (void); | ||
125 | +void print_usage (void); | ||
126 | +void read_state_file(void); | ||
127 | +void write_state_file(char** argv); | ||
128 | +void record_state(pid_t procpid, pid_t procppid, char* procprog, char prog_metric, int state, time_t start_time); | ||
129 | +void record_limit(PLimit** l_ref, int state, char proc_metric, time_t start_time); | ||
130 | +void read_limit_line(const char* in_line, PLimit** ppl, char* state_filename, int line_no); | ||
131 | +int check_limit(PLimit* pl); | ||
132 | |||
133 | int | ||
134 | main (int argc, char **argv) | ||
135 | @@ -129,13 +214,16 @@ | ||
136 | int result = STATE_UNKNOWN; | ||
137 | output chld_out, chld_err; | ||
138 | |||
139 | + now = time(NULL); | ||
140 | + | ||
141 | setlocale (LC_ALL, ""); | ||
142 | bindtextdomain (PACKAGE, LOCALEDIR); | ||
143 | textdomain (PACKAGE); | ||
144 | setlocale(LC_NUMERIC, "POSIX"); | ||
145 | |||
146 | - input_buffer = malloc (MAX_INPUT_BUFFER); | ||
147 | - procprog = malloc (MAX_INPUT_BUFFER); | ||
148 | + if( ! (input_buffer = malloc (MAX_INPUT_BUFFER)) || | ||
149 | + ! (procprog = malloc (MAX_INPUT_BUFFER))) | ||
150 | + die(STATE_UNKNOWN, _("Out of memory in startup\n")); | ||
151 | |||
152 | asprintf (&metric_name, "PROCS"); | ||
153 | metric = METRIC_PROCS; | ||
154 | @@ -168,6 +256,9 @@ | ||
155 | result = cmd_file_read( input_filename, &chld_out, 0); | ||
156 | } | ||
157 | |||
158 | + /* What do we remember from last time ? */ | ||
159 | + read_state_file(); | ||
160 | + | ||
161 | /* flush first line: j starts at 1 */ | ||
162 | for (j = 1; j < chld_out.lines; j++) { | ||
163 | input_line = chld_out.line[j]; | ||
164 | @@ -237,6 +328,10 @@ | ||
165 | procetime, procprog, procargs); | ||
166 | } | ||
167 | |||
168 | + /* This is all made simpler because metric can only talk about | ||
169 | + * one metric, ie can't check more than one thing at a time. | ||
170 | + * This means that metric_state_name is the char equivalent of metric. | ||
171 | + */ | ||
172 | if (metric == METRIC_VSZ) | ||
173 | i = check_thresholds (procvsz); | ||
174 | else if (metric == METRIC_RSS) | ||
175 | @@ -248,15 +343,29 @@ | ||
176 | i = check_thresholds (procseconds); | ||
177 | |||
178 | if (metric != METRIC_PROCS) { | ||
179 | - if (i == STATE_WARNING) { | ||
180 | - warn++; | ||
181 | - asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), procprog); | ||
182 | - result = max_state (result, i); | ||
183 | - } | ||
184 | - if (i == STATE_CRITICAL) { | ||
185 | - crit++; | ||
186 | - asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), procprog); | ||
187 | - result = max_state (result, i); | ||
188 | + if(state_filename) { | ||
189 | + /* State is being stored - ie don't report immediately. | ||
190 | + * Note what we have found: | ||
191 | + */ | ||
192 | + if(i == STATE_WARNING || i == STATE_CRITICAL) | ||
193 | + record_state(procpid, procppid, procprog, metric_state_name, i, now); | ||
194 | + } else { | ||
195 | + if (i == STATE_WARNING) { | ||
196 | + char* str = fails; | ||
197 | + warn++; | ||
198 | + asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), procprog); | ||
199 | + result = max_state (result, i); | ||
200 | + if(str) | ||
201 | + free(str); | ||
202 | + } | ||
203 | + if (i == STATE_CRITICAL) { | ||
204 | + char* str = fails; | ||
205 | + crit++; | ||
206 | + asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), procprog); | ||
207 | + result = max_state (result, i); | ||
208 | + if(str) | ||
209 | + free(str); | ||
210 | + } | ||
211 | } | ||
212 | } | ||
213 | } | ||
214 | @@ -276,7 +385,59 @@ | ||
215 | |||
216 | /* Needed if procs found, but none match filter */ | ||
217 | if ( metric == METRIC_PROCS ) { | ||
218 | - result = max_state (result, check_thresholds (procs) ); | ||
219 | + int threshold = check_thresholds(procs); | ||
220 | + int putative_result = max_state(result, threshold); | ||
221 | + | ||
222 | + if(state_filename) { /* Do not report immediately - note what we found */ | ||
223 | + /* Only record something if we may need to report it */ | ||
224 | + if(putative_result == STATE_WARNING || putative_result == STATE_CRITICAL) | ||
225 | + record_limit(&state_nprocs, putative_result, 'P', now); | ||
226 | + } else | ||
227 | + result = putative_result; | ||
228 | + } | ||
229 | + | ||
230 | + /* If we have a state file, the above has just stored the results away, so have | ||
231 | + * a look and see if there is anything that we should note. | ||
232 | + * The slight subtlety is that we could have something recorded as both a warning | ||
233 | + * & a critical - in this case only report the critical. | ||
234 | + */ | ||
235 | + if(state_filename) { | ||
236 | + /* Compute the start time of any state that we must report. | ||
237 | + * Ie any state younger than this we keep quiet about. | ||
238 | + */ | ||
239 | + state_limit_start = (time_t)((unsigned long)now - state_time * 60); | ||
240 | + | ||
241 | + if(verbose >= 3) | ||
242 | + printf("Checking metric %c, limit_start %s", metric_state_name, ctime(&state_limit_start)); | ||
243 | + | ||
244 | + if(metric == METRIC_PROCS) { | ||
245 | + result = check_limit(state_nprocs); | ||
246 | + } else { | ||
247 | + ExProc* pp; | ||
248 | + | ||
249 | + for(pp = state_list; pp; pp = pp->ep_next) { | ||
250 | + char* str = fails; | ||
251 | + | ||
252 | + /* What is the state of this recorded process ? */ | ||
253 | + int res = check_limit(pp->ep_limits); | ||
254 | + | ||
255 | + switch(res) { | ||
256 | + case STATE_OK: | ||
257 | + continue; /* Don't do the stuff below */ | ||
258 | + case STATE_WARNING: | ||
259 | + warn++; | ||
260 | + break; | ||
261 | + case STATE_CRITICAL: | ||
262 | + crit++; | ||
263 | + break; | ||
264 | + } | ||
265 | + | ||
266 | + asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), pp->ep_prog); | ||
267 | + result = max_state(result, res); | ||
268 | + if(str) | ||
269 | + free(str); | ||
270 | + } | ||
271 | + } | ||
272 | } | ||
273 | |||
274 | if ( result == STATE_OK ) { | ||
275 | @@ -302,6 +463,9 @@ | ||
276 | printf (" [%s]", fails); | ||
277 | |||
278 | printf ("\n"); | ||
279 | + | ||
280 | + write_state_file(argv); | ||
281 | + | ||
282 | return result; | ||
283 | } | ||
284 | |||
285 | @@ -336,6 +500,8 @@ | ||
286 | {"verbose", no_argument, 0, 'v'}, | ||
287 | {"ereg-argument-array", required_argument, 0, CHAR_MAX+1}, | ||
288 | {"input-file", required_argument, 0, CHAR_MAX+2}, | ||
289 | + {"state-file", required_argument, 0, 'S'}, | ||
290 | + {"state-time", required_argument, 0, 'T'}, | ||
291 | {0, 0, 0, 0} | ||
292 | }; | ||
293 | |||
294 | @@ -344,7 +510,7 @@ | ||
295 | strcpy (argv[c], "-t"); | ||
296 | |||
297 | while (1) { | ||
298 | - c = getopt_long (argc, argv, "Vvht:c:w:p:s:u:C:a:z:r:m:P:", | ||
299 | + c = getopt_long (argc, argv, "Vvht:c:w:p:s:u:C:a:z:r:m:P:S:T:", | ||
300 | longopts, &option); | ||
301 | |||
302 | if (c == -1 || c == EOF) | ||
303 | @@ -479,22 +645,27 @@ | ||
304 | asprintf (&metric_name, "%s", optarg); | ||
305 | if ( strcmp(optarg, "PROCS") == 0) { | ||
306 | metric = METRIC_PROCS; | ||
307 | + metric_state_name = 'P'; | ||
308 | break; | ||
309 | } | ||
310 | else if ( strcmp(optarg, "VSZ") == 0) { | ||
311 | metric = METRIC_VSZ; | ||
312 | + metric_state_name = 'V'; | ||
313 | break; | ||
314 | } | ||
315 | else if ( strcmp(optarg, "RSS") == 0 ) { | ||
316 | metric = METRIC_RSS; | ||
317 | + metric_state_name = 'R'; | ||
318 | break; | ||
319 | } | ||
320 | else if ( strcmp(optarg, "CPU") == 0 ) { | ||
321 | metric = METRIC_CPU; | ||
322 | + metric_state_name = 'C'; | ||
323 | break; | ||
324 | } | ||
325 | else if ( strcmp(optarg, "ELAPSED") == 0) { | ||
326 | metric = METRIC_ELAPSED; | ||
327 | + metric_state_name = 'E'; | ||
328 | break; | ||
329 | } | ||
330 | |||
331 | @@ -505,6 +676,16 @@ | ||
332 | case CHAR_MAX+2: | ||
333 | input_filename = optarg; | ||
334 | break; | ||
335 | + case 'S': /* state-file */ | ||
336 | + state_filename = optarg; | ||
337 | + break; | ||
338 | + case 'T': /* state-time */ | ||
339 | + if (!is_integer (optarg)) | ||
340 | + usage2 (_("state-time must be a positive integer"), optarg); | ||
341 | + else | ||
342 | + if((state_time = atoi (optarg)) < 0) /* Treat -ve time as zero */ | ||
343 | + state_time = 0; | ||
344 | + break; | ||
345 | } | ||
346 | } | ||
347 | |||
348 | @@ -727,6 +908,12 @@ | ||
349 | printf (" %s\n", "-C, --command=COMMAND"); | ||
350 | printf (" %s\n", _("Only scan for exact matches of COMMAND (without path).")); | ||
351 | |||
352 | + printf ("\n"); | ||
353 | + printf ( "%s\n", "State memory (complain if a process exceeds a limit for a long time):"); | ||
354 | + printf ( " %s\n", "-S, --state-file=StateFile"); | ||
355 | + printf ( " %s\n", _("Store process information in this file")); | ||
356 | + printf ( " %s\n", "-T, --state-time=minutes"); | ||
357 | + | ||
358 | printf(_("\n\ | ||
359 | RANGEs are specified 'min:max' or 'min:' or ':max' (or 'max'). If\n\ | ||
360 | specified 'max:min', a warning status will be generated if the\n\ | ||
361 | @@ -755,7 +942,9 @@ | ||
362 | printf (" %s\n", "check_procs -w 50000 -c 100000 --metric=VSZ"); | ||
363 | printf (" %s\n\n", _("Alert if VSZ of any processes over 50K or 100K")); | ||
364 | printf (" %s\n", "check_procs -w 10 -c 20 --metric=CPU"); | ||
365 | - printf (" %s\n", _("Alert if CPU of any processes over 10%% or 20%%")); | ||
366 | + printf (" %s\n\n", _("Alert if CPU of any processes over 10% or 20%")); | ||
367 | + printf (" %s\n", "check_procs -w 80 -c 90 --metric=CPU --state-time=10 --state-file=/tmp/CPU-state"); | ||
368 | + printf (" %s\n", _("Alert if CPU of any processes over 80% or 90% for at least 10 minutes, record state in /tmp/CPU-state")); | ||
369 | |||
370 | printf (_(UT_SUPPORT)); | ||
371 | } | ||
372 | @@ -766,5 +955,423 @@ | ||
373 | printf (_("Usage: ")); | ||
374 | printf ("%s -w <range> -c <range> [-m metric] [-s state] [-p ppid]\n", progname); | ||
375 | printf (" [-u user] [-r rss] [-z vsz] [-P %%cpu] [-a argument-array]\n"); | ||
376 | - printf (" [-C command] [-t timeout] [-v]\n"); | ||
377 | + printf (" [-C command] [-t timeout] [-v] [-S state_file] [-T state_time_minutes]\n"); | ||
378 | +} | ||
379 | + | ||
380 | +/* Read the state file - if there is one - into state_list and state_nprocs. | ||
381 | + * | ||
382 | + * This file is not locked. On a horribly over loaded system it might happen that reads & writes | ||
383 | + * could overlap in the wrong way. Locking could make things worse, you may end up with many | ||
384 | + * instances of this program waiting on the lock. | ||
385 | + */ | ||
386 | +void | ||
387 | +read_state_file(void) | ||
388 | +{ | ||
389 | + FILE* sf; | ||
390 | + char* errstr; | ||
391 | + char* str; | ||
392 | + char in_buf[STATE_MAX_LINE]; /* Input buffer */ | ||
393 | + int line_no = 0; | ||
394 | + char prog_name[MAX_PROG_NAME + 1]; | ||
395 | + int eaten; | ||
396 | + ExProc* pp; | ||
397 | + | ||
398 | + if( ! state_filename) /* No file specified */ | ||
399 | + return; | ||
400 | + | ||
401 | + if( ! (sf = fopen(state_filename, "r"))) { | ||
402 | + /* It is OK if it doesn't exist, we just haven't created it yet */ | ||
403 | + if(errno == ENOENT) { | ||
404 | + must_rewrite = 1; /* Force it to be created */ | ||
405 | + return; | ||
406 | + } | ||
407 | + | ||
408 | + /* Anything else - should not happen */ | ||
409 | + errstr = strerror(errno); | ||
410 | + die(STATE_UNKNOWN, _("Can't open %s for reading as: %s"), state_filename, errstr); | ||
411 | + } | ||
412 | + | ||
413 | + /* Read a line at a time */ | ||
414 | + while(fgets(in_buf, STATE_MAX_LINE, sf)) { | ||
415 | + line_no++; | ||
416 | + if( ! (str = strchr(in_buf, '\n'))) { | ||
417 | + printf(_("State file %s corrupt, line too long, at line %d\n"), state_filename, line_no); | ||
418 | + must_rewrite = 1; /* Force write */ | ||
419 | + goto read_off; /* Will be fixed when we rewrite it in a moment */ | ||
420 | + } | ||
421 | + *str = '\0'; | ||
422 | + | ||
423 | + /* Empty line or comment ? */ | ||
424 | + if(in_buf[0] == '\0' || in_buf[0] == '#') | ||
425 | + continue; | ||
426 | + | ||
427 | + if(in_buf[1] != ' ') { | ||
428 | + printf(_("State file %s corrupt, no space at position 1, at line %d\n"), state_filename, line_no); | ||
429 | + must_rewrite = 1; /* Force write */ | ||
430 | + goto read_off; | ||
431 | + } | ||
432 | + | ||
433 | + /* What line type ? */ | ||
434 | + switch(in_buf[0]) { | ||
435 | + case 'V': /* In case we are running 1st time after upgrade */ | ||
436 | + if(atoi(in_buf + 2) != STATE_VERSION) { | ||
437 | + printf(_("State file %s is wrong version, expecting %d. File ignored\n"), state_filename, STATE_VERSION); | ||
438 | + must_rewrite = 1; /* Force write */ | ||
439 | + goto read_off; | ||
440 | + } | ||
441 | + break; | ||
442 | + case 'P': /* Info about a process */ | ||
443 | + /* P 1234 1200 cpu_hog CW 4a05a817 CC 4a05a92f */ | ||
444 | + if( ! (pp = calloc(sizeof(ExProc), 1))) | ||
445 | + die(STATE_UNKNOWN, _("Out of memory reading %s line %d"), state_filename, line_no); | ||
446 | + | ||
447 | + /* MAX_PROG_NAME on next line */ | ||
448 | + if(sscanf(in_buf, "P %d %d %20s%n", &pp->ep_pid, &pp->ep_ppid, prog_name, &eaten) != 3) { | ||
449 | + printf(_("State file corrupt, bad process line, file %s line %d\n"), state_filename, line_no); | ||
450 | + must_rewrite = 1; /* Force write */ | ||
451 | + free(pp); /* Never linked in, so no half-filled entry is left behind */ | ||
452 | + goto read_off; | ||
453 | + } | ||
454 | + | ||
455 | + if( ! (pp->ep_prog = strdup(prog_name))) | ||
456 | + die(STATE_UNKNOWN, _("Out of memory reading %s line %d\n"), state_filename, line_no); | ||
457 | + | ||
458 | + /* Link in only now: record_state() compares ep_prog of every entry on the list */ | ||
459 | + pp->ep_next = state_list; | ||
460 | + state_list = pp; | ||
461 | + if(verbose >= 3) | ||
462 | + printf("Read pid %d ppid %d proc %s\n", pp->ep_pid, pp->ep_ppid, pp->ep_prog); | ||
463 | + read_limit_line(in_buf + eaten, &pp->ep_limits, state_filename, line_no); | ||
464 | + break; | ||
465 | + case 'N': /* Number of procs exceeded */ | ||
466 | + if(verbose >= 3) | ||
467 | + printf("Read N:\n"); | ||
468 | + read_limit_line(in_buf + 1, &state_nprocs, state_filename, line_no); | ||
469 | + break; | ||
470 | + default: | ||
471 | + printf(_("State file %s corrupt, unknown line type, at line %d\n"), state_filename, line_no); | ||
472 | + must_rewrite = 1; /* Force write */ | ||
473 | + goto read_off; | ||
474 | + } | ||
475 | + } | ||
476 | + | ||
477 | + /* Ignore changes so far */ | ||
478 | +read_off: | ||
479 | + state_changed = 0; | ||
480 | + | ||
481 | + fclose(sf); | ||
482 | +} | ||
483 | + | ||
484 | +/* Read a line (or rest of) a process or global limit line. | ||
485 | + * Expect the first character of in_line to be NUL or a space. | ||
486 | + * Args: | ||
487 | + * in_line the text to parse | ||
488 | + * ppl pointer to the head of the linked list that each limit read is pushed onto | ||
489 | + * state_filename the name of the file being read (only used in messages) | ||
490 | + * line_no line number that was read (only used in messages) | ||
491 | + * On a syntax error: sets must_rewrite and returns, keeping the limits already parsed. | ||
492 | + * Read lines like: | ||
493 | + * PW 4a05a91f PC 4a05a817 | ||
494 | + */ | ||
495 | +void | ||
496 | +read_limit_line(const char* in_line, PLimit** ppl, char* state_filename, int line_no) | ||
497 | +{ | ||
498 | + PLimit* pl; | ||
499 | + unsigned long when; | ||
500 | + int eaten; | ||
501 | + | ||
502 | + while(*in_line) { | ||
503 | + if(*in_line == ' ') { /* Ignore spaces */ | ||
504 | + in_line++; | ||
505 | + continue; | ||
506 | + } | ||
507 | + | ||
508 | + /* We have found something, allocate somewhere to put it */ | ||
509 | + if( ! (pl = calloc(sizeof(PLimit), 1))) | ||
510 | + die(STATE_UNKNOWN, _("Out of memory reading %s line %d\n"), state_filename, line_no); | ||
511 | + | ||
512 | + /* Read a metric code */ | ||
513 | + if( ! strchr(METRIC_CODES, *in_line)) { | ||
514 | + printf(_("State file %s corrupt, unknown metric code, at line %d\n"), state_filename, line_no); | ||
515 | + must_rewrite = 1; | ||
516 | + free(pl); | ||
517 | + return; | ||
518 | + } | ||
519 | + pl->pl_metric = *in_line++; | ||
520 | + | ||
521 | + /* Read the state code; NUL is tested first because strchr() would match the terminator */ | ||
522 | + if(*in_line == '\0' || ! strchr(STATE_CODES, *in_line)) { | ||
523 | + printf(_("State file %s corrupt, unknown state code, at line %d\n"), state_filename, line_no); | ||
524 | + must_rewrite = 1; | ||
525 | + free(pl); | ||
526 | + return; | ||
527 | + } | ||
528 | + pl->pl_state = *in_line++ == 'W' ? STATE_WARNING : STATE_CRITICAL; | ||
529 | + | ||
530 | + /* Read the time - a hex number; %n gives how much of in_line was consumed */ | ||
531 | + if(sscanf(in_line, " %lx%n", &when, &eaten) != 1) { | ||
532 | + printf(_("State file %s corrupt, bad time, at line %d\n"), state_filename, line_no); | ||
533 | + must_rewrite = 1; | ||
534 | + free(pl); | ||
535 | + return; | ||
536 | + } | ||
537 | + pl->pl_when = (time_t)when; | ||
538 | + in_line += eaten; | ||
539 | + | ||
540 | + if(verbose >= 3) | ||
541 | + printf(" metric=%c state=%c since %s", pl->pl_metric, STATE2state(pl->pl_state), ctime(&pl->pl_when)); | ||
542 | + | ||
543 | + /* Link it in at the head; pl_seen is still 0 from calloc() */ | ||
544 | + pl->pl_next = *ppl; | ||
545 | + *ppl = pl; | ||
546 | + } | ||
547 | + | ||
548 | + if(verbose >= 3) | ||
549 | + printf("\n"); | ||
550 | +} | ||
551 | + | ||
552 | +/* Write the state back to the state file if it changed (or must_rewrite is set). | ||
553 | + * Only entries seen this run are written; argv is recorded in a '#' comment line. */ | ||
554 | +void | ||
555 | +write_state_file(char** argv) | ||
556 | +{ | ||
557 | + FILE* sf; | ||
558 | + char* errstr; | ||
559 | + ExProc* pp; | ||
560 | + PLimit* pl; | ||
561 | + | ||
562 | + if( ! state_filename) /* No file specified */ | ||
563 | + return; | ||
564 | + | ||
565 | + /* Work out if we are going to write back what we read in. | ||
566 | + * If there is something that has not been seen then it was read in | ||
567 | + * from the file - need to write back to lose the entry, scan for that. | ||
568 | + * Otherwise: state_changed will tell us what we want to know and was set | ||
569 | + * when a change was made. | ||
570 | + */ | ||
571 | + if(metric == METRIC_PROCS) { | ||
572 | + for(pl = state_nprocs; pl; pl = pl->pl_next) | ||
573 | + if( ! pl->pl_seen) | ||
574 | + state_changed = 1; | ||
575 | + } else { | ||
576 | + for(pp = state_list; pp; pp = pp->ep_next) { | ||
577 | + if( ! pp->ep_seen) | ||
578 | + state_changed = 1; | ||
579 | + | ||
580 | + for(pl = pp->ep_limits; pl; pl = pl->pl_next) | ||
581 | + if( ! pl->pl_seen) | ||
582 | + state_changed = 1; | ||
583 | + } | ||
584 | + } | ||
585 | + | ||
586 | + if(verbose >= 3) | ||
587 | + printf("Write state, changed=%d\n", state_changed); | ||
588 | + | ||
589 | + /* No change to the state file ? */ | ||
590 | + if( ! state_changed && ! must_rewrite) | ||
591 | + return; | ||
592 | + | ||
593 | + if( ! (sf = fopen(state_filename, "w"))) { | ||
594 | + errstr = strerror(errno); | ||
595 | + die(STATE_UNKNOWN, _("Can't open %s for writing as: %s"), state_filename, errstr); | ||
596 | + } | ||
597 | + | ||
598 | + fprintf(sf, "# Process state file written by %s - DO NOT HAND EDIT\n", progname); | ||
599 | + fprintf(sf, "# Args:"); | ||
600 | + for(; *argv; argv++) | ||
601 | + fprintf(sf, " %s", *argv); | ||
602 | + fprintf(sf, "\n"); | ||
603 | + fprintf(sf, "V %d\n", STATE_VERSION); | ||
604 | + | ||
605 | + if(metric != METRIC_PROCS) { | ||
606 | + /* Traverse the processes that we know about, dropping those not seen this run */ | ||
607 | + for(pp = state_list; pp; pp = pp->ep_next) { | ||
608 | + if( ! pp->ep_seen) | ||
609 | + continue; | ||
610 | + | ||
611 | + fprintf(sf, "P %d %d %.*s", pp->ep_pid, pp->ep_ppid, MAX_PROG_NAME, pp->ep_prog); | ||
612 | + for(pl = pp->ep_limits; pl; pl = pl->pl_next) | ||
613 | + if(pl->pl_seen) | ||
614 | + fprintf(sf, " %c%c %lx", pl->pl_metric, STATE2state(pl->pl_state), (unsigned long)pl->pl_when); | ||
615 | + | ||
616 | + fprintf(sf, "\n"); | ||
617 | + } | ||
618 | + } else { | ||
619 | + /* Print when the # processes is what is being checked */ | ||
620 | + fprintf(sf, "N"); | ||
621 | + for(pl = state_nprocs; pl; pl = pl->pl_next) | ||
622 | + if(pl->pl_seen) | ||
623 | + fprintf(sf, " %c%c %lx", pl->pl_metric, STATE2state(pl->pl_state), (unsigned long)pl->pl_when); | ||
624 | + | ||
625 | + /* Space before \n is important - else get error on read if no limits follow -- which | ||
626 | + * will happen if all is well. | ||
627 | + */ | ||
628 | + fprintf(sf, " \n"); | ||
629 | + } | ||
630 | + | ||
631 | + fclose(sf); | ||
632 | +} | ||
633 | + | ||
634 | +/* Record a state for a program. | ||
635 | + * Create a new entry if we need to, or update the entry that matches on the first 3 args. | ||
636 | + * Args: | ||
637 | + * procpid Process ID | ||
638 | + * procppid Parent process ID | ||
639 | + * procprog Program name | ||
640 | + * prog_metric What we are measuring (METRIC_something but represented as the character in the file) | ||
641 | + * state Error or warning (STATE_something) | ||
642 | + * start_time When the limit was seen to be exceeded - passed on to record_limit() | ||
643 | + */ | ||
644 | +void | ||
645 | +record_state(pid_t procpid, pid_t procppid, char* procprog, char prog_metric, int state, time_t start_time) | ||
646 | +{ | ||
647 | + ExProc* pp; | ||
648 | + | ||
649 | + /* Look for the process */ | ||
650 | + for(pp = state_list; pp; pp = pp->ep_next) { | ||
651 | + if(pp->ep_pid != procpid) | ||
652 | + continue; | ||
653 | + | ||
654 | + /* Right process, but if it has mutated - throw it away and start again. | ||
655 | + * This doesn't detect processes that exec() a lot w/out fork(), but that is rare. | ||
656 | + * Names compare on MAX_PROG_NAME chars only - that is all the state file stores. */ | ||
657 | + if(pp->ep_ppid != procppid || ! pp->ep_prog || strncmp(pp->ep_prog, procprog, MAX_PROG_NAME)) { | ||
658 | + PLimit* pl; | ||
659 | + while((pl = pp->ep_limits) != NULL) { | ||
660 | + pp->ep_limits = pl->pl_next; | ||
661 | + free(pl); | ||
662 | + } | ||
663 | + pp->ep_ppid = procppid; | ||
664 | + if( ! pp->ep_prog || strncmp(pp->ep_prog, procprog, MAX_PROG_NAME)) { | ||
665 | + free(pp->ep_prog); | ||
666 | + if( ! (pp->ep_prog = strdup(procprog))) | ||
667 | + die(STATE_UNKNOWN, _("Out of memory")); | ||
668 | + } | ||
669 | + | ||
670 | + state_changed = 1; | ||
671 | + } | ||
672 | + | ||
673 | + if(verbose >= 3) | ||
674 | + printf("Record found: pid %d %s\n", pp->ep_pid, pp->ep_prog); | ||
675 | + | ||
676 | + break; | ||
677 | + } | ||
678 | + | ||
679 | + /* Didn't find the process, allocate a new entry */ | ||
680 | + if( ! pp) { | ||
681 | + if( ! (pp = calloc(sizeof(ExProc), 1))) | ||
682 | + die(STATE_UNKNOWN, _("Out of memory")); | ||
683 | + pp->ep_pid = procpid; | ||
684 | + pp->ep_ppid = procppid; | ||
685 | + if( ! (pp->ep_prog = strdup(procprog))) | ||
686 | + die(STATE_UNKNOWN, _("Out of memory")); | ||
687 | + | ||
688 | + pp->ep_next = state_list; | ||
689 | + state_list = pp; | ||
690 | + state_changed = 1; | ||
691 | + | ||
692 | + if(verbose >= 3) | ||
693 | + printf("Record alloc: pid %d %s\n", pp->ep_pid, pp->ep_prog); | ||
694 | + } | ||
695 | + | ||
696 | + pp->ep_seen = 1; /* Ensure that this gets written out */ | ||
697 | + | ||
698 | + record_limit(&pp->ep_limits, state, prog_metric, start_time); | ||
699 | +} | ||
700 | + | ||
701 | +/* Store a limit | ||
702 | + * l_ref address of head of limits chain | ||
703 | + * proc_metric What we are measuring (METRIC_something but represented as the character in the file) | ||
704 | + * state Error or warning (STATE_something) | ||
705 | + * start_time The time to record when it started, if we already record this - don't change the time | ||
706 | + * unless this is older. | ||
707 | + * | ||
708 | + * If something is C then it is implicitly W. This is important: if something goes from | ||
709 | + * W to C, it might remain at C for less than the state time (which it might do W -> C | ||
710 | + * & back again several times) - but the time above the W level might be notifiable. | ||
711 | + */ | ||
712 | +void | ||
713 | +record_limit(PLimit** l_ref, int state, char proc_metric, time_t start_time) | ||
714 | +{ | ||
715 | + PLimit* pl; | ||
716 | + PLimit* pl_found = NULL; | ||
717 | + int seen_warning = 0; | ||
718 | + | ||
719 | + /* Find the individual process limit. | ||
720 | + * Scan the whole lot since we want to 'seen' a Warning if we have Critical. | ||
721 | + */ | ||
722 | + for(pl = *l_ref; pl; pl = pl->pl_next) | ||
723 | + if(pl->pl_metric == proc_metric) { | ||
724 | + if(state == STATE_CRITICAL && pl->pl_state == STATE_WARNING) { | ||
725 | + pl->pl_seen = 1; /* Ensure that it is output */ | ||
726 | + seen_warning = 1; | ||
727 | + } | ||
728 | + | ||
729 | + if(pl->pl_state == state) | ||
730 | + pl_found = pl; /* We found what we were looking for */ | ||
731 | + } | ||
732 | + | ||
733 | + pl = pl_found; | ||
734 | + | ||
735 | + /* Didn't find it, allocate a new one */ | ||
736 | + if( ! pl) { | ||
737 | + if( ! (pl = calloc(sizeof(PLimit), 1))) | ||
738 | + die(STATE_UNKNOWN, _("Out of memory")); | ||
739 | + pl->pl_next = *l_ref; | ||
740 | + *l_ref = pl; | ||
741 | + pl->pl_when = start_time; | ||
742 | + pl->pl_state = state; | ||
743 | + pl->pl_metric = proc_metric; | ||
744 | + | ||
745 | + state_changed = 1; | ||
746 | + } else /* It is possible that the time was set earlier when a 'C' generated | ||
747 | + * an implicit 'W'. Since the 'W' prob started earlier than the 'C' | ||
748 | + * we may have recorded the later 'C' time rather than the 'W' time. | ||
749 | + */ | ||
750 | + if(pl->pl_when > start_time) | ||
751 | + pl->pl_when = start_time; | ||
752 | + | ||
753 | + pl->pl_seen = 1; /* Ensure that it is output */ | ||
754 | + | ||
755 | + if(verbose >= 3) | ||
756 | + printf("Record limit: metric=%c state=%c since %s", pl->pl_metric, STATE2state(pl->pl_state), ctime(&pl->pl_when)); | ||
757 | + | ||
758 | + /* Critical without a recorded warning - generate it (state is STATE_*, not the 'C'/'W' file chars) */ | ||
759 | + if(state == STATE_CRITICAL && ! seen_warning) | ||
760 | + record_limit(l_ref, STATE_WARNING, proc_metric, start_time); | ||
761 | +} | ||
762 | + | ||
763 | +/* Check if limits have been exceeded for at least the state-time, ie whether an entry | ||
764 | + * seen this run started at or before the global state_limit_start. | ||
765 | + * Check only metric_state_name regardless of what is stored. | ||
766 | + * Args: | ||
767 | + * pl List of limit values (may be NULL) | ||
768 | + * | ||
769 | + * Return: STATE_OK, or the worse of STATE_WARNING / STATE_CRITICAL found old enough | ||
770 | + */ | ||
771 | +int | ||
772 | +check_limit(PLimit* pl) | ||
773 | +{ | ||
774 | + int result = STATE_OK; | ||
775 | + | ||
776 | + for(; pl; pl = pl->pl_next) { | ||
777 | + if( ! pl->pl_seen) | ||
778 | + continue; /* Of no interest, not updated this run */ | ||
779 | + | ||
780 | + if(pl->pl_metric != metric_state_name) | ||
781 | + continue; /* Not what we are looking for */ | ||
782 | + | ||
783 | + /* Is this something that has been going on for long enough that we | ||
784 | + * are to report it ? | ||
785 | + */ | ||
786 | + if(pl->pl_when <= state_limit_start) { | ||
787 | + if(result == STATE_OK) | ||
788 | + result = pl->pl_state; | ||
789 | + | ||
790 | + if(result == STATE_WARNING && pl->pl_state == STATE_CRITICAL) | ||
791 | + result = STATE_CRITICAL; /* Critical outranks an earlier warning */ | ||
792 | + } | ||
793 | + } | ||
794 | + | ||
795 | + return(result); | ||
796 | } | ||