summaryrefslogtreecommitdiffstats
path: root/web/attachments/327688-check_procs.c.patch
diff options
context:
space:
mode:
Diffstat (limited to 'web/attachments/327688-check_procs.c.patch')
-rw-r--r--web/attachments/327688-check_procs.c.patch796
1 files changed, 796 insertions, 0 deletions
diff --git a/web/attachments/327688-check_procs.c.patch b/web/attachments/327688-check_procs.c.patch
new file mode 100644
index 0000000..2c262f7
--- /dev/null
+++ b/web/attachments/327688-check_procs.c.patch
@@ -0,0 +1,796 @@
1--- check_procs.c 2009-02-21 09:59:24.000000000 +0000
2+++ check_procs.c.new 2009-05-19 10:41:14.000000000 +0000
3@@ -27,7 +27,8 @@
4 *
5 * You should have received a copy of the GNU General Public License
6 * along with this program. If not, see <http://www.gnu.org/licenses/>.
7-*
8+*
9+* State file stuff originally by: Alain Williams <addw@phcomp.co.uk>
10 *
11 *****************************************************************************/
12
13@@ -42,13 +43,7 @@
14 #include "regex.h"
15
16 #include <pwd.h>
17-
18-int process_arguments (int, char **);
19-int validate_arguments (void);
20-int check_thresholds (int);
21-int convert_to_seconds (char *);
22-void print_help (void);
23-void print_usage (void);
24+#include <time.h>
25
26 int wmax = -1;
27 int cmax = -1;
28@@ -77,6 +72,7 @@
29 METRIC_ELAPSED
30 };
31 enum metric metric = METRIC_PROCS;
32+char metric_state_name = 'P'; /* Metric name in the state file */
33
34 int verbose = 0;
35 int uid;
36@@ -92,9 +88,98 @@
37 char *fmt;
38 char *fails;
39 char tmp[MAX_INPUT_BUFFER];
40+time_t now;
41+time_t state_limit_start;
42
43 FILE *ps_input = NULL;
44
45+/* Optionally trigger an alert if a process has been in a state for
46+ * some time. This time will be measured in minutes, ie much longer than
47+ * this program runs for - thus a state file is needed to store this
48+ * between runs of this program.
49+ * This happens if --state-file is specified.
50+ *
51+ * The state file records information about processes that exceed some criteria
52+ * for a warning or critical notice.
53+ * The file will contain one 'V' line.
54+ * If the metric is PROCS there will be one N line, else zero or more P lines.
55+ * Format of the state file:
56+ * Max line length of 500
57+ * Empty lines and lines starting '#' are ignored
58+ * Lines consist of a type character, a space and optional extra information
59+ * V version_number
60+ * P pid ppid name MS secs
61+ * name is limited to a max 20 characters
62+ * M is the metric:
63+ * V virtual memory size
64+ * R resident set memory size
65+ * C percentage CPU
66+ * E time elapsed in seconds
67+ * S is the state:
68+ * W Warning
69+ * C Critical
70+ * N MS secs
71+ * M will be 'P'
72+ * S is the state as above
73+ * secs is the epoch time the metric was first exceeded - hex number
74+ * The 'MS secs' group may appear twice on a P or N line as it is possible for a process (or the
75+ * max # processes) to exceed both the warning and critical thresholds but for different times.
76+ * If something is C then it is implicitly W.
77+ *
78+ * There might be a trailing space on a N line.
79+ *
80+ * Eg:
81+ * P 1234 1200 cpu_hog CW 4a05a817 CC 4a05a91f
82+ * N PW 4a05a91f
83+ */
84+#define STATE_VERSION 1 /* Change me if the file format changes */
85+#define STATE_MAX_LINE 500 /* Longest line */
86+#define MAX_PROG_NAME 20 /* Longest name of program - search for this if you change it */
87+#define METRIC_CODES "PVRCE" /* For input validation */
88+#define STATE_CODES "WC" /* For input validation */
89+#define STATE2state(x) ((x) == STATE_WARNING ? 'W' : 'C') /* Convert STATE_WARNING or STATE_CRITICAL to 'W' or 'C' */
90+
91+/* A process can exceed various limits. This describes one of them:
92+ * a single (metric, state) pair and the time it was first exceeded. */
93+typedef struct plimit {
94+ struct plimit* pl_next; /* Next limit; the list is NULL terminated */
95+ time_t pl_when; /* When it first exceeded this limit (epoch time) */
96+ int pl_state; /* STATE_WARNING or STATE_CRITICAL */
97+ int pl_seen; /* Set by record_limit() when exceeded this run; unseen entries are not written back */
98+ char pl_metric; /* What is exceeded - one of METRIC_CODES, as in the state file */
99+} PLimit;
100+
101+/* Describes one process that is exceeding (or, per the state file, was exceeding)
102+ * one or more thresholds. Entries are chained on state_list. */
103+typedef struct exproc {
104+ struct exproc* ep_next; /* Next process; the list is NULL terminated */
105+ pid_t ep_pid; /* Process ID */
106+ pid_t ep_ppid; /* Parent PID */
107+ char* ep_prog; /* Program name - heap copy made with strdup() */
108+ PLimit* ep_limits; /* List of limits exceeded by this process */
109+ int ep_seen; /* Set by record_state() this run; unseen entries are not written back */
110+} ExProc;
111+
112+char* state_filename; /* File that we store this in */
113+int state_time = 5; /* Trigger time - minutes */
114+ExProc* state_list; /* Used for process specific metrics - ie metric is *not* PROCS */
115+PLimit* state_nprocs; /* Info on # procs exceeded - used if metric is PROCS */
116+int state_changed; /* Ie need to write back to file */
117+int must_rewrite; /* Set this if there is a syntax error in the file, or
118+ * some other reason which means we must rewrite it */
119+
120+int process_arguments (int, char **);
121+int validate_arguments (void);
122+int check_thresholds (int);
123+int convert_to_seconds (char *);
124+void print_help (void);
125+void print_usage (void);
126+void read_state_file(void);
127+void write_state_file(char** argv);
128+void record_state(pid_t procpid, pid_t procppid, char* procprog, char prog_metric, int state, time_t start_time);
129+void record_limit(PLimit** l_ref, int state, char proc_metric, time_t start_time);
130+void read_limit_line(const char* in_line, PLimit** ppl, char* state_filename, int line_no);
131+int check_limit(PLimit* pl);
132
133 int
134 main (int argc, char **argv)
135@@ -129,13 +214,16 @@
136 int result = STATE_UNKNOWN;
137 output chld_out, chld_err;
138
139+ now = time(NULL);
140+
141 setlocale (LC_ALL, "");
142 bindtextdomain (PACKAGE, LOCALEDIR);
143 textdomain (PACKAGE);
144 setlocale(LC_NUMERIC, "POSIX");
145
146- input_buffer = malloc (MAX_INPUT_BUFFER);
147- procprog = malloc (MAX_INPUT_BUFFER);
148+ if( ! (input_buffer = malloc (MAX_INPUT_BUFFER)) ||
149+ ! (procprog = malloc (MAX_INPUT_BUFFER)))
150+ die(STATE_UNKNOWN, _("Out of memory in startup\n"));
151
152 asprintf (&metric_name, "PROCS");
153 metric = METRIC_PROCS;
154@@ -168,6 +256,9 @@
155 result = cmd_file_read( input_filename, &chld_out, 0);
156 }
157
158+ /* What do we remember from last time ? */
159+ read_state_file();
160+
161 /* flush first line: j starts at 1 */
162 for (j = 1; j < chld_out.lines; j++) {
163 input_line = chld_out.line[j];
164@@ -237,6 +328,10 @@
165 procetime, procprog, procargs);
166 }
167
168+ /* This is all made simpler because metric can only talk about
169+ * one metric, ie can't check more than one thing at a time.
170+ * This means that metric_state_name is the char equivalent of metric.
171+ */
172 if (metric == METRIC_VSZ)
173 i = check_thresholds (procvsz);
174 else if (metric == METRIC_RSS)
175@@ -248,15 +343,29 @@
176 i = check_thresholds (procseconds);
177
178 if (metric != METRIC_PROCS) {
179- if (i == STATE_WARNING) {
180- warn++;
181- asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), procprog);
182- result = max_state (result, i);
183- }
184- if (i == STATE_CRITICAL) {
185- crit++;
186- asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), procprog);
187- result = max_state (result, i);
188+ if(state_filename) {
189+ /* State is being stored - ie don't report immediately.
190+ * Note what we have found:
191+ */
192+ if(i == STATE_WARNING || i == STATE_CRITICAL)
193+ record_state(procpid, procppid, procprog, metric_state_name, i, now);
194+ } else {
195+ if (i == STATE_WARNING) {
196+ char* str = fails;
197+ warn++;
198+ asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), procprog);
199+ result = max_state (result, i);
200+ if(str)
201+ free(str);
202+ }
203+ if (i == STATE_CRITICAL) {
204+ char* str = fails;
205+ crit++;
206+ asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), procprog);
207+ result = max_state (result, i);
208+ if(str)
209+ free(str);
210+ }
211 }
212 }
213 }
214@@ -276,7 +385,59 @@
215
216 /* Needed if procs found, but none match filter */
217 if ( metric == METRIC_PROCS ) {
218- result = max_state (result, check_thresholds (procs) );
219+ int threshold = check_thresholds(procs);
220+ int putative_result = max_state(result, threshold);
221+
222+ if(state_filename) { /* Do not report immediately - note what we found */
223+ /* Only record something if we may need to report it */
224+ if(putative_result == STATE_WARNING || putative_result == STATE_CRITICAL)
225+ record_limit(&state_nprocs, putative_result, 'P', now);
226+ } else
227+ result = putative_result;
228+ }
229+
230+ /* If we have a state file, the above has just stored the results away, so have
231+ * a look and see if there is anything that we should note.
232+ * The slight subtlety is that we could have something recorded as both a warning
233+ * & a critical - in this case only report the critical.
234+ */
235+ if(state_filename) {
236+ /* Compute the start time of any state that we must report.
237+ * Ie any state younger than this we keep quiet about.
238+ */
239+ state_limit_start = (time_t)((unsigned long)now - state_time * 60);
240+
241+ if(verbose >= 3)
242+ printf("Checking metric %c, limit_start %s", metric_state_name, ctime(&state_limit_start));
243+
244+ if(metric == METRIC_PROCS) {
245+ result = check_limit(state_nprocs);
246+ } else {
247+ ExProc* pp;
248+
249+ for(pp = state_list; pp; pp = pp->ep_next) {
250+ char* str = fails;
251+
252+ /* What is the state of this recorded process ? */
253+ int res = check_limit(pp->ep_limits);
254+
255+ switch(res) {
256+ case STATE_OK:
257+ continue; /* Don't do the stuff below */
258+ case STATE_WARNING:
259+ warn++;
260+ break;
261+ case STATE_CRITICAL:
262+ crit++;
263+ break;
264+ }
265+
266+ asprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), pp->ep_prog);
267+ result = max_state(result, res);
268+ if(str)
269+ free(str);
270+ }
271+ }
272 }
273
274 if ( result == STATE_OK ) {
275@@ -302,6 +463,9 @@
276 printf (" [%s]", fails);
277
278 printf ("\n");
279+
280+ write_state_file(argv);
281+
282 return result;
283 }
284
285@@ -336,6 +500,8 @@
286 {"verbose", no_argument, 0, 'v'},
287 {"ereg-argument-array", required_argument, 0, CHAR_MAX+1},
288 {"input-file", required_argument, 0, CHAR_MAX+2},
289+ {"state-file", required_argument, 0, 'S'},
290+ {"state-time", required_argument, 0, 'T'},
291 {0, 0, 0, 0}
292 };
293
294@@ -344,7 +510,7 @@
295 strcpy (argv[c], "-t");
296
297 while (1) {
298- c = getopt_long (argc, argv, "Vvht:c:w:p:s:u:C:a:z:r:m:P:",
299+ c = getopt_long (argc, argv, "Vvht:c:w:p:s:u:C:a:z:r:m:P:S:T:",
300 longopts, &option);
301
302 if (c == -1 || c == EOF)
303@@ -479,22 +645,27 @@
304 asprintf (&metric_name, "%s", optarg);
305 if ( strcmp(optarg, "PROCS") == 0) {
306 metric = METRIC_PROCS;
307+ metric_state_name = 'P';
308 break;
309 }
310 else if ( strcmp(optarg, "VSZ") == 0) {
311 metric = METRIC_VSZ;
312+ metric_state_name = 'V';
313 break;
314 }
315 else if ( strcmp(optarg, "RSS") == 0 ) {
316 metric = METRIC_RSS;
317+ metric_state_name = 'R';
318 break;
319 }
320 else if ( strcmp(optarg, "CPU") == 0 ) {
321 metric = METRIC_CPU;
322+ metric_state_name = 'C';
323 break;
324 }
325 else if ( strcmp(optarg, "ELAPSED") == 0) {
326 metric = METRIC_ELAPSED;
327+ metric_state_name = 'E';
328 break;
329 }
330
331@@ -505,6 +676,16 @@
332 case CHAR_MAX+2:
333 input_filename = optarg;
334 break;
335+ case 'S': /* state-file */
336+ state_filename = optarg;
337+ break;
338+ case 'T': /* state-time */
339+ if (!is_integer (optarg))
340+ usage2 (_("state-time must be a positive integer"), optarg);
341+ else
342+ if((state_time = atoi (optarg)) < 0) /* Treat -ve time as zero */
343+ state_time = 0;
344+ break;
345 }
346 }
347
348@@ -727,6 +908,12 @@
349 printf (" %s\n", "-C, --command=COMMAND");
350 printf (" %s\n", _("Only scan for exact matches of COMMAND (without path)."));
351
352+ printf ("\n");
353+ printf ( "%s\n", "State memory (complain if a process exceeds a limit for a long time):");
354+ printf ( " %s\n", "-S, --state-file=StateFile");
355+ printf ( " %s\n", _("Store process information in this file"));
356+ printf ( " %s\n", "-T, --state-time=minutes");
357+
358 printf(_("\n\
359 RANGEs are specified 'min:max' or 'min:' or ':max' (or 'max'). If\n\
360 specified 'max:min', a warning status will be generated if the\n\
361@@ -755,7 +942,9 @@
362 printf (" %s\n", "check_procs -w 50000 -c 100000 --metric=VSZ");
363 printf (" %s\n\n", _("Alert if VSZ of any processes over 50K or 100K"));
364 printf (" %s\n", "check_procs -w 10 -c 20 --metric=CPU");
365- printf (" %s\n", _("Alert if CPU of any processes over 10%% or 20%%"));
366+ printf (" %s\n\n", _("Alert if CPU of any processes over 10% or 20%"));
367+ printf (" %s\n", "check_procs -w 80 -c 90 --metric=CPU --state-time=10 --state-file=/tmp/CPU-state");
368+ printf (" %s\n", _("Alert if CPU of any processes over 80% or 90% for at least 10 minutes, record state in /tmp/CPU-state"));
369
370 printf (_(UT_SUPPORT));
371 }
372@@ -766,5 +955,423 @@
373 printf (_("Usage: "));
374 printf ("%s -w <range> -c <range> [-m metric] [-s state] [-p ppid]\n", progname);
375 printf (" [-u user] [-r rss] [-z vsz] [-P %%cpu] [-a argument-array]\n");
376- printf (" [-C command] [-t timeout] [-v]\n");
377+ printf (" [-C command] [-t timeout] [-v] [-S state_file] [-T state_time_minutes]\n");
378+}
379+
380+/* Read the state file - if there is one - into state_list (P lines) and state_nprocs (N lines).
381+ * A missing file is not an error: must_rewrite is set so that it gets created.
382+ * Parsing stops at the first corrupt line; must_rewrite is then set so the file is rewritten.
383+ * This file is not locked. On a horribly over loaded system it might happen that reads & writes
384+ * could overlap in the wrong way. Locking could make things worse, you may end up with many
385+ * instances of this program waiting on the lock. */
386+void
387+read_state_file(void)
388+{
389+ FILE* sf;
390+ char* errstr;
391+ char* str;
392+ char in_buf[STATE_MAX_LINE]; /* Input buffer */
393+ int line_no = 0;
394+ char prog_name[MAX_PROG_NAME + 1];
395+ int eaten;
396+ ExProc* pp;
397+
398+ if( ! state_filename) /* No file specified */
399+ return;
400+
401+ if( ! (sf = fopen(state_filename, "r"))) {
402+ /* It is OK if it doesn't exist, we just haven't created it yet */
403+ if(errno == ENOENT) {
404+ must_rewrite = 1; /* Force it to be created */
405+ return;
406+ }
407+
408+ /* Anything else - should not happen */
409+ errstr = strerror(errno);
410+ die(STATE_UNKNOWN, _("Can't open %s for reading as: %s"), state_filename, errstr);
411+ }
412+
413+ /* Read a line at a time */
414+ while(fgets(in_buf, STATE_MAX_LINE, sf)) {
415+ line_no++;
416+ if( ! (str = strchr(in_buf, '\n'))) {
417+ printf(_("State file %s corrupt, line too long, at line %d\n"), state_filename, line_no);
418+ must_rewrite = 1; /* Force write */
419+ goto read_off; /* Will be fixed when we rewrite it in a moment */
420+ }
421+ *str = '\0';
422+
423+ /* Empty line or comment ? */
424+ if(in_buf[0] == '\0' || in_buf[0] == '#')
425+ continue;
426+
427+ if(in_buf[1] != ' ') {
428+ printf(_("State file %s corrupt, no space at position 1, at line %d\n"), state_filename, line_no);
429+ must_rewrite = 1; /* Force write */
430+ goto read_off;
431+ }
432+
433+ /* What line type ? */
434+ switch(in_buf[0]) {
435+ case 'V': /* In case we are running 1st time after upgrade */
436+ if(atoi(in_buf + 2) != STATE_VERSION) {
437+ printf(_("State file %s is wrong version, expecting %d. File ignored\n"), state_filename, STATE_VERSION);
438+ must_rewrite = 1; /* Force write */
439+ goto read_off;
440+ }
441+ break;
442+ case 'P': /* Info about a process */
443+ /* P 1234 1200 cpu_hog CW 4a05a817 CC 4a05a92f */
444+ if( ! (pp = calloc(1, sizeof(ExProc))))
445+ die(STATE_UNKNOWN, _("Out of memory reading %s line %d"), state_filename, line_no);
446+
447+ /* MAX_PROG_NAME on next line. pp is not linked into state_list until fully parsed. */
448+ if(sscanf(in_buf, "P %d %d %20s%n", &pp->ep_pid, &pp->ep_ppid, prog_name, &eaten) != 3) {
449+ printf(_("State file corrupt, bad process line, file %s line %d\n"), state_filename, line_no);
450+ must_rewrite = 1; /* Force write */
451+ free(pp); /* Never leave an entry with a NULL ep_prog on state_list: record_state() would strcmp() it */
452+ goto read_off;
453+ }
454+
455+ if( ! (pp->ep_prog = strdup(prog_name)))
456+ die(STATE_UNKNOWN, _("Out of memory reading %s line %d\n"), state_filename, line_no);
457+ pp->ep_next = state_list; /* Entry is complete - now put it on the list */
458+ state_list = pp;
459+
460+ if(verbose >= 3)
461+ printf("Read pid %d ppid %d proc %s\n", pp->ep_pid, pp->ep_ppid, pp->ep_prog);
462+
463+ read_limit_line(in_buf + eaten, &pp->ep_limits, state_filename, line_no);
464+ break;
465+ case 'N': /* Number of procs exceeded */
466+ if(verbose >= 3)
467+ printf("Read N:\n");
468+ read_limit_line(in_buf + 1, &state_nprocs, state_filename, line_no);
469+ break;
470+ default:
471+ printf(_("State file %s corrupt, unknown line type, at line %d\n"), state_filename, line_no);
472+ must_rewrite = 1; /* Force write */
473+ goto read_off;
474+ }
475+ }
476+
477+ /* Reading the file is not a change in itself: reset the flag before closing */
478+read_off:
479+ state_changed = 0;
480+
481+ fclose(sf);
482+}
483+
484+/* Parse the limit part of a P or N state-file line: zero or more 'MS secs' groups,
485+ * where M is one of METRIC_CODES, S is one of STATE_CODES and secs is a hex time.
486+ * Each group is pushed on the front of *ppl. On a bad group: report it, set must_rewrite and stop.
487+ * Args:
488+ * in_line the text to parse; leading and separating spaces are skipped
489+ * ppl pointer to the head of the linked list where what is read is stored
490+ * state_filename the name of the file being read (used in messages only)
491+ * line_no line number in that file (used in messages only)
492+ *
493+ * Eg:
494+ * PW 4a05a91f PC 4a05a817 */
495+void
496+read_limit_line(const char* in_line, PLimit** ppl, char* state_filename, int line_no)
497+{
498+ PLimit* pl;
499+ unsigned long when;
500+ int eaten;
501+
502+ while(*in_line) {
503+ if(*in_line == ' ') { /* Ignore spaces */
504+ in_line++;
505+ continue;
506+ }
507+
508+ /* We have found something, allocate somewhere to put it (freed again on any parse error) */
509+ if( ! (pl = calloc(sizeof(PLimit), 1)))
510+ die(STATE_UNKNOWN, _("Out of memory reading %s line %d\n"), state_filename, line_no);
511+
512+ /* Read a metric code */
513+ if( ! strchr(METRIC_CODES, *in_line)) {
514+ printf(_("State file %s corrupt, unknown metric code, at line %d\n"), state_filename, line_no);
515+ must_rewrite = 1;
516+ free(pl);
517+ return;
518+ }
519+ pl->pl_metric = *in_line++;
520+
521+ /* Read the state code; NUL is tested first because strchr() would match the terminator */
522+ if(*in_line == '\0' || ! strchr(STATE_CODES, *in_line)) {
523+ printf(_("State file %s corrupt, unknown state code, at line %d\n"), state_filename, line_no);
524+ must_rewrite = 1;
525+ free(pl);
526+ return;
527+ }
528+ pl->pl_state = *in_line++ == 'W' ? STATE_WARNING : STATE_CRITICAL;
529+
530+ /* Read the time (hex); %n gives the number of characters consumed */
531+ if(sscanf(in_line, " %lx%n", &when, &eaten) != 1) {
532+ printf(_("State file %s corrupt, bad time, at line %d\n"), state_filename, line_no);
533+ must_rewrite = 1;
534+ free(pl);
535+ return;
536+ }
537+ pl->pl_when = (time_t)when;
538+ in_line += eaten;
539+
540+ if(verbose >= 3)
541+ printf(" metric=%c state=%c since %s", pl->pl_metric, STATE2state(pl->pl_state), ctime(&pl->pl_when));
542+
543+ /* Link it in at the head; pl_seen stays 0 (from calloc) until record_limit() sets it */
544+ pl->pl_next = *ppl;
545+ *ppl = pl;
546+ }
547+
548+ if(verbose >= 3)
549+ printf("\n");
550+}
551+
552+/* Write the entries seen this run back to the state file, but only if something changed
553+ * or must_rewrite is set. argv (NULL terminated) is recorded in a comment line of the file. */
554+void
555+write_state_file(char** argv)
556+{
557+ FILE* sf;
558+ char* errstr;
559+ ExProc* pp;
560+ PLimit* pl;
561+
562+ if( ! state_filename) /* No file specified */
563+ return;
564+
565+ /* Work out if we are going to write back what we read in.
566+ * If there is something that has not been seen then it was read in
567+ * from the file - need to write back to lose the entry, scan for that.
568+ * Otherwise: state_changed will tell us what we want to know and was set
569+ * when a change was made.
570+ */
571+ if(metric == METRIC_PROCS) {
572+ for(pl = state_nprocs; pl; pl = pl->pl_next)
573+ if( ! pl->pl_seen)
574+ state_changed = 1;
575+ } else {
576+ for(pp = state_list; pp; pp = pp->ep_next) {
577+ if( ! pp->ep_seen)
578+ state_changed = 1;
579+
580+ for(pl = pp->ep_limits; pl; pl = pl->pl_next)
581+ if( ! pl->pl_seen)
582+ state_changed = 1;
583+ }
584+ }
585+
586+ if(verbose >= 3)
587+ printf("Write state, changed=%d\n", state_changed);
588+
589+ /* No change to the state file ? */
590+ if( ! state_changed && ! must_rewrite)
591+ return;
592+
593+ if( ! (sf = fopen(state_filename, "w"))) {
594+ errstr = strerror(errno);
595+ die(STATE_UNKNOWN, _("Can't open %s for writing as: %s"), state_filename, errstr);
596+ }
597+
598+ fprintf(sf, "# Process state file written by %s - DO NOT HAND EDIT\n", progname);
599+ fprintf(sf, "# Args:");
600+ for(; *argv; argv++)
601+ fprintf(sf, " %s", *argv);
602+ fprintf(sf, "\n");
603+ fprintf(sf, "V %d\n", STATE_VERSION);
604+
605+ if(metric != METRIC_PROCS) {
606+ /* Traverse the processes that we know about; those not seen this run are dropped */
607+ for(pp = state_list; pp; pp = pp->ep_next) {
608+ if( ! pp->ep_seen)
609+ continue;
610+
611+ fprintf(sf, "P %d %d %.*s", pp->ep_pid, pp->ep_ppid, MAX_PROG_NAME, pp->ep_prog); /* Name truncated to MAX_PROG_NAME */
612+ for(pl = pp->ep_limits; pl; pl = pl->pl_next)
613+ if(pl->pl_seen)
614+ fprintf(sf, " %c%c %lx", pl->pl_metric, STATE2state(pl->pl_state), (unsigned long)pl->pl_when);
615+
616+ fprintf(sf, "\n");
617+ }
618+ } else {
619+ /* Print when the # processes is what is being checked */
620+ fprintf(sf, "N");
621+ for(pl = state_nprocs; pl; pl = pl->pl_next)
622+ if(pl->pl_seen)
623+ fprintf(sf, " %c%c %lx", pl->pl_metric, STATE2state(pl->pl_state), (unsigned long)pl->pl_when);
624+
625+ /* Space before \n is important - else get error on read if no limits follow -- which
626+ * will happen if all is well.
627+ */
628+ fprintf(sf, " \n");
629+ }
630+
631+ fclose(sf); /* NOTE(review): result is not checked - a failed flush would go unnoticed */
632+}
633+
634+/* Record that a process exceeded a threshold this run: create its entry on state_list, or
635+ * update the existing one (matched on pid, ppid and the first MAX_PROG_NAME chars of the name).
636+ * A pid whose ppid or name has changed is treated as a new process: its old limits are dropped.
637+ * Args:
638+ * procpid, procppid Process ID and parent process ID
639+ * procprog Program name
640+ * prog_metric What we are measuring (METRIC_something but represented as the character in the file)
641+ * state STATE_WARNING or STATE_CRITICAL
642+ * start_time Time to record as the start of the excess (passed on to record_limit)
643+ */
644+void
645+record_state(pid_t procpid, pid_t procppid, char* procprog, char prog_metric, int state, time_t start_time)
646+{
647+ ExProc* pp;
648+
649+ /* Look for the process */
650+ for(pp = state_list; pp; pp = pp->ep_next) {
651+ if(pp->ep_pid != procpid)
652+ continue;
653+
654+ /* Right process, but if it has mutated - throw it away and start again.
655+ * Names in the state file are truncated to MAX_PROG_NAME, so only compare that many characters.
656+ */
657+ if(pp->ep_ppid != procppid || strncmp(pp->ep_prog, procprog, MAX_PROG_NAME)) {
658+ PLimit* pl;
659+ while((pl = pp->ep_limits)) {
660+ pp->ep_limits = pl->pl_next;
661+ free(pl);
662+ }
663+ pp->ep_ppid = procppid;
664+ if(strcmp(pp->ep_prog, procprog)) {
665+ free(pp->ep_prog);
666+ if( ! (pp->ep_prog = strdup(procprog)))
667+ die(STATE_UNKNOWN, _("Out of memory"));
668+ }
669+
670+ state_changed = 1;
671+ }
672+
673+ if(verbose >= 3)
674+ printf("Record found: pid %d %s\n", pp->ep_pid, pp->ep_prog);
675+
676+ break;
677+ }
678+
679+ /* Didn't find the process, allocate a new entry */
680+ if( ! pp) {
681+ if( ! (pp = calloc(1, sizeof(ExProc))))
682+ die(STATE_UNKNOWN, _("Out of memory"));
683+ pp->ep_pid = procpid;
684+ pp->ep_ppid = procppid;
685+ if( ! (pp->ep_prog = strdup(procprog)))
686+ die(STATE_UNKNOWN, _("Out of memory"));
687+
688+ pp->ep_next = state_list;
689+ state_list = pp;
690+ state_changed = 1;
691+
692+ if(verbose >= 3)
693+ printf("Record alloc: pid %d %s\n", pp->ep_pid, pp->ep_prog);
694+ }
695+
696+ pp->ep_seen = 1; /* Ensure that this gets written out */
697+
698+ record_limit(&pp->ep_limits, state, prog_metric, start_time);
699+}
700+
701+/* Store a limit: mark the (proc_metric, state) entry on the chain as seen this run, creating it if needed.
702+ * l_ref address of head of limits chain
703+ * state Error or warning (STATE_WARNING or STATE_CRITICAL)
704+ * proc_metric What we are measuring (METRIC_something but represented as the character in the file)
705+ * start_time The time to record when it started, if we already record this - don't change the time
706+ * unless this is older.
707+ *
708+ * If something is C then it is implicitly W. This is important: if something goes from
709+ * W to C, it might remain at C for less than the state time (which it might do W -> C
710+ * & back again several times) - but the time above the W level might be notifiable.
711+ */
712+void
713+record_limit(PLimit** l_ref, int state, char proc_metric, time_t start_time)
714+{
715+ PLimit* pl;
716+ PLimit* pl_found = NULL;
717+ int seen_warning = 0;
718+
719+ /* Find the individual process limit.
720+ * Scan the whole lot since we want to 'seen' a Warning if we have Critical.
721+ */
722+ for(pl = *l_ref; pl; pl = pl->pl_next)
723+ if(pl->pl_metric == proc_metric) {
724+ if(state == STATE_CRITICAL && pl->pl_state == STATE_WARNING) {
725+ pl->pl_seen = 1; /* Ensure that it is output */
726+ seen_warning = 1;
727+ }
728+
729+ if(pl->pl_state == state)
730+ pl_found = pl; /* We found what we were looking for */
731+ }
732+
733+ pl = pl_found;
734+
735+ /* Didn't find it, allocate a new one */
736+ if( ! pl) {
737+ if( ! (pl = calloc(1, sizeof(PLimit))))
738+ die(STATE_UNKNOWN, _("Out of memory"));
739+ pl->pl_next = *l_ref;
740+ *l_ref = pl;
741+ pl->pl_when = start_time;
742+ pl->pl_state = state;
743+ pl->pl_metric = proc_metric;
744+
745+ state_changed = 1;
746+ } else /* It is possible that the time was set earlier when a 'C' generated
747+ * an implicit 'W'. Since the 'W' prob started earlier than the 'C'
748+ * we may have recorded the later 'C' time rather than the 'W' time.
749+ */
750+ if(pl->pl_when > start_time)
751+ pl->pl_when = start_time;
752+
753+ pl->pl_seen = 1; /* Ensure that it is output */
754+
755+ if(verbose >= 3)
756+ printf("Record limit: metric=%c state=%c since %s", pl->pl_metric, STATE2state(pl->pl_state), ctime(&pl->pl_when));
757+
758+ /* If this is a critical, but we didn't see the warning - generate the warning (state is a STATE_* int, not a file char) */
759+ if(state == STATE_CRITICAL && ! seen_warning)
760+ record_limit(l_ref, STATE_WARNING, proc_metric, start_time);
761+}
762+
763+/* Check if limits have been exceeded for at least the state-time
764+ * Check only metric_state_name regardless of what is stored.
765+ * Only entries seen this run that started at or before state_limit_start count.
766+ * Args:
767+ * pl List of limit values (may be NULL)
768+ *
769+ * Return: STATE_OK, STATE_WARNING or STATE_CRITICAL - critical wins over warning
770+ */
771+int
772+check_limit(PLimit* pl)
773+{
774+ int result = STATE_OK;
775+
776+ for(; pl; pl = pl->pl_next) {
777+ if( ! pl->pl_seen)
778+ continue; /* Of no interest, not updated this run */
779+
780+ if(pl->pl_metric != metric_state_name)
781+ continue; /* Not what we are looking for */
782+
783+ /* Is this something that has been going on for long enough that we
784+ * are to report it ?
785+ */
786+ if(pl->pl_when <= state_limit_start) {
787+ if(result == STATE_OK)
788+ result = pl->pl_state; /* First reportable entry found */
789+
790+ if(result == STATE_WARNING && pl->pl_state == STATE_CRITICAL)
791+ result = STATE_CRITICAL; /* Upgrade: only report the critical */
792+ }
793+ }
794+
795+ return(result);
796 }