summaryrefslogtreecommitdiffstats
path: root/contrib/check_procl.sh
blob: 0db6240f68a821b073a634cc0d456c3047afea1d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
#!/bin/bash

#
# Check_procl.sh 
# 
# Program: Process load check plugin for Nagios
# License : GPL
# Copyright (c) 2002 Jerome Tytgat (j.tytgat@sioban.net)
#
# check_procl.sh,v 1.1 2002/07/04 09:35 
#
# Description :
#   
#  This plugin checks the %cpu, %mem or cputime of one or more processes
#
# Usage :
#
#  check_procl.sh -p process1,process2,... -w a.b -c c.d --cpu 
#  check_procl.sh -p process1,process2,... -w a.b -c c.d --mem
#  check_procl.sh -p process1,process2,... -w a:b:c -c d:e:f --cputime
#
#  check_procl.sh -p %all% -e process1,process2,... -w <a.b | a:b:c> -c <c.d | d:e:f> <--cpu | --mem | --cputime>
#  check_procl.sh -p %max% -e process1,process2,... -w <a.b | a:b:c> -c <c.d | d:e:f> <--cpu | --mem | --cputime>
#
# Example :
#   
#  To know the memory used by HTTPD processes, with a warning when it reaches 50% and a critical state when it reaches 75% :
#	check_procl.sh -p httpd -w 50.0 -c 75.0 --mem
#	> OK - total %MEM for process httpd : 46.1
#
#  To know the process which uses the most cpu time; as we are under linux and using kapm, the kapmd processes are excluded :
# 	check_procl.sh -p %max% -e kapmd-idle,kapmd -w 0:1:0 -c 0:2:0 --cputime
# 	> CRITICAL - total CPUTIME for process named : 02:32:10
#
# Tested on solaris 7/8, Linux Redhat 7.3 and Linux Suse 7.1
#
# BUGS : problems with handling time on solaris...


# Print the command-line synopsis on stdout.
# $0 is used so the synopsis shows the name the script was started with.
help_usage() {
    printf '%s\n' \
        "Usage:" \
        " $0 -p <process_name1,process_name2,... | %all% | %max%>"
    # The continuation of the first synopsis line starts with one tab.
    printf '\t%s\n' \
        "  [-e <process_name1,process_name2,...>] -w warning -c critical < --cpu | --mem | --cputime>"
    printf '%s\n' \
        " $0 (-v | --version)" \
        " $0 (-h | --help)"
}

# Print the version banner, licence notice and credits on stdout.
help_version() {
    printf '%s\n' \
        "check_procl.sh (nagios-plugins) 1.1" \
        "The nagios plugins come with ABSOLUTELY NO WARRANTY. You may redistribute" \
        "copies of the plugins under the terms of the GNU General Public License." \
        "For more information about these matters, see the file named COPYING." \
        "Copyright (c) 2002 Jerome Tytgat - j.tytgat@sioban.net" \
        "Greetings goes to Websurg which kindly let me took time to develop this" \
        "                  Manu Feig and Jacques Kern who were my beta testers, thanks to them !"
}

# Check that every command the plugin relies on can be found by the shell.
#
# Arguments: optional list of command names to check; without arguments the
#            plugin's standard list is checked.
# Outputs:   two explanatory lines on stdout for the first missing command.
# Exits:     3 as soon as a command cannot be found; returns 0 otherwise.
verify_dep() {
    local dep

    if [ $# -eq 0 ]
    then
        set -- bash cut egrep expr grep let ps sed sort tail test tr wc
    fi

    for dep in "$@"
    do
        # Only the command itself is handed to "type". The previous form
        # "type $i > /dev/null 2>&1 /dev/null" also passed "/dev/null" as a
        # name to look up; bash's type fails when any name is not found, so
        # the check failed for every command.
        if ! type "$dep" > /dev/null 2>&1
        then
            echo "I am missing an important component : $dep"
            echo "Cannot continue, sorry, try to find the missing one..."
            exit 3
        fi
    done
}

# Name used further down as an "egrep -v" pattern to drop this plugin's own
# entry from the "ps -eo ...,comm" listings. Only the basename of $0 is kept:
# the pattern is matched against the comm column, which normally holds the
# command name without its directory (see ps(1)), so a $0 such as
# "/path/to/check_procl.sh" or "./check_procl.sh" would never match.
myself=${0##*/}

verify_dep

# -h / --help: print the version banner, a description of the plugin, the
# synopsis and every option, then leave with exit code 3 (the code this
# script has always used after printing help).
#
# The -w / -c descriptions state what the evaluation code does: a state is
# raised when the cumulative value is greater than or equal to the threshold.
if [ "$1" = "-h" ] || [ "$1" = "--help" ]
then
    help_version
    echo ""
    echo "This plugin will check either the cumulative %cpu, %mem or cputime"
    echo "of a process."
    echo ""
    help_usage
    echo ""
    echo "Required Arguments:"
    echo " -p, --process STRING1,STRING2,..."
    echo "    names of the processes we want to monitor,"
    echo "    you can add as much as process as you want, separated by comma,"
    echo "    they will be cumulated"
    echo " -p, --process %all%"
    echo "    The special keyword %all% will check the cumulative cpu/mem/time of all process"
    echo "    WARNING : Can be very slow on heavy loaded servers, watch your timeout !"
    echo " -p, --process %max%"
    echo "    The special keyword %max% will check the process which eat the most"
    echo "    WARNING : only select the process which eat the more, not the cumulative,"
    # This continuation line is indented with two tabs and four spaces.
    printf '\t\t%s\n' "    but return the cumulative"
    echo " -w, --warning INTEGER.INTEGER or INTEGER:INTEGER:INTEGER"
    echo "    generate warning state if the cumulative value reaches or exceeds this threshold"
    echo " -c, --critical INTEGER.INTEGER or INTEGER:INTEGER:INTEGER"
    echo "    generate critical state if the cumulative value reaches or exceeds this threshold"
    echo " --cpu"
    echo "    return the current cpu usage for the given process"
    echo " --mem"
    echo "    return the current memory usage for the given process"
    echo " --cputime"
    echo "    return the total cputime usage for the given process"
    echo ""
    echo "Optional Argument:"
    echo " -e, --exclude-process STRING1,STRING2,..."
    echo "    names of the processes we don't want to monitor"
    echo "    only useful when associated with %all% or %max% keywords, else ignored"
    echo "    ex : kapm-idled on linux is a process which eat memory / cputime but not really... ;-)"
    echo ""
    exit 3
fi

# -v / --version as first argument: show the version banner and stop.
# The exit code stays 3, as it always was for this option.
case "$1" in
    -v|--version)
        help_version
        exit 3
        ;;
esac

# Refuse to continue with fewer than 7 words on the command line.
# "=" is turned into a blank before counting, so "name=value" counts as two.
arg_words=$(echo $@ | tr "=" " " | wc -w)
if [ $arg_words -lt 7 ]
then
    echo "Bad arguments number (need at least 7)!"
    help_usage
    exit 3
fi

# Results of the command-line parsing:
#   tt                    0 = no metric yet, 1 = --cpu, 2 = --mem, 3 = --cputime
#   process_name          -p list, commas replaced by "|"
#   exclude_process_name  -e list, commas replaced by "|"
#   wt / ct               raw warning / critical thresholds
#   type_arg_aff, type_arg, delim   set by the metric option (see select_metric)
tt=0
process_name=""
exclude_process_name=""
wt=""
ct=""

# Print a message followed by the synopsis, then stop with exit code 3.
#   $1 - message to print first
usage_error() {
    echo "$1"
    help_usage
    exit 3
}

# Record the requested metric; asking for a second one is a usage error.
#   $1 - value stored in tt
#   $2 - label shown in the result line
#   $3 - ps output format keyword
#   $4 - character separating the parts of a value or threshold
select_metric() {
    if [ $tt -ne 0 ]
    then
        usage_error "Only one of the arguments --cpu/--mem/--cputime can be used at a time !"
    fi
    tt=$1
    type_arg_aff=$2
    type_arg=$3
    delim=$4
}

# Test of the command lines arguments
while test $# -gt 0
do
    case "$1" in
        -p|--process)
            if [ -n "$process_name" ]
            then
                usage_error "Only one --process argument is useful..."
            fi
            shift
            # Commas become "|" so the list can serve as an egrep alternation.
            # Done with parameter expansion: the former unquoted "echo $1|tr"
            # let the shell glob-expand and re-split the value.
            process_name=${1//,/|}
            ;;
        -e|--exclude-process)
            if [ -n "$exclude_process_name" ]
            then
                usage_error "Only one --exclude-process argument is useful..."
            fi
            shift
            exclude_process_name=${1//,/|}
            ;;
        -w|--warning)
            if [ -n "$wt" ]
            then
                usage_error "Only one --warning argument needed... Trying to test bad things ? :-)"
            fi
            shift
            wt=$1
            ;;
        -c|--critical)
            if [ -n "$ct" ]
            then
                usage_error "Only one --critical argument needed... Trying to test bad things ? :-)"
            fi
            shift
            ct=$1
            ;;
        --cpu)
            select_metric 1 "%CPU" "pcpu" "."
            ;;
        --mem)
            select_metric 2 "%MEM" "pmem" "."
            ;;
        --cputime)
            select_metric 3 "TIME" "time" ":"
            ;;
        *)
            usage_error "Unknown argument $1"
            ;;
    esac
    shift
done

# Is the process running ?
# A named process has to show up in the "ps -e" listing; the %all% and %max%
# keywords are exempt from this check.
ps_matches=$(ps -e | egrep "$process_name?")
if [ -z "$ps_matches" ] && [ "$process_name" != "%all%" ] && [ "$process_name" != "%max%" ]
then
    echo "WARNING: process $process_name not running !"
    exit 3
fi

# Split the warning and critical thresholds into their parts, using the
# delimiter chosen by the metric option. Only cputime (tt = 3) has a third
# part; for the other metrics it is fixed to 0.
wt_value1=$(echo $wt | cut -d"$delim" -f1)
wt_value2=$(echo $wt | cut -d"$delim" -f2)
ct_value1=$(echo $ct | cut -d"$delim" -f1)
ct_value2=$(echo $ct | cut -d"$delim" -f2)

wt_value3=0
ct_value3=0
if [ $tt -eq 3 ]
then
    wt_value3=$(echo $wt | cut -d"$delim" -f3)
    ct_value3=$(echo $ct | cut -d"$delim" -f3)
fi

# Report a malformed threshold and stop with exit code 3.
#   $1 - field label (WARNING or CRITICAL)
#   $2 - the threshold as typed by the user
bad_threshold() {
    echo "Bad expression in the $1 field : $2"
    help_usage
    exit 3
}

# Print what remains of a value once all its digits are removed.
#   $1 - value to inspect
non_digits() {
    echo $1 | tr -d "[:digit:]"
}

# Integrity check: every part must be present and made of digits only.
if [ -z "$wt_value1" ] || [ -z "$wt_value2" ] || [ -z "$wt_value3" ]
then
    bad_threshold "WARNING" "$wt"
fi

if [ -n "$(non_digits "$wt_value1")$(non_digits "$wt_value2")$(non_digits "$wt_value3")" ]
then
    bad_threshold "WARNING" "$wt"
fi

if [ -z "$ct_value1" ] || [ -z "$ct_value2" ] || [ -z "$ct_value3" ]
then
    bad_threshold "CRITICAL" "$ct"
fi

if [ -n "$(non_digits "$ct_value1")$(non_digits "$ct_value2")$(non_digits "$ct_value3")" ]
then
    bad_threshold "CRITICAL" "$ct"
fi

# Build psline, the list of values to add up, from "ps -eo <metric>,comm".

# Remove the blanks in front of the value, then keep one blank-separated
# column of each line read on stdin.
#   $1 - column number (1 = value, 2 = command name)
ps_column() {
    sed "s/^ *\([0-9]\)/\1/" | cut -d" " -f$1
}

# Lines matching this pattern are dropped for %all% and %max%: this script's
# own name, the metric title (presumably to drop the ps header line) and, when
# given, the excluded processes. A "?" is appended where the pattern is used.
skip_pattern="$myself|$type_arg_aff"
if [ -n "$exclude_process_name" ]
then
    skip_pattern="$skip_pattern|$exclude_process_name"
fi

case "$process_name" in
    %all%)
        psline=$(ps -eo $type_arg,comm | egrep -v "$skip_pattern?" | ps_column 1)
        ;;
    %max%)
        # The last line after sorting names the process to report on; every
        # ps line matching that name is then collected.
        pstmp=$(ps -eo $type_arg,comm | egrep -v "$skip_pattern?" | sort | tail -1 | ps_column 2)
        psline=$(ps -eo $type_arg,comm | grep $pstmp | ps_column 1)
        process_name=$pstmp
        ;;
    *)
        psline=$(ps -eo $type_arg,comm | egrep "$process_name?" | ps_column 1)
        ;;
esac

# Running totals: total1.total2 for %cpu / %mem (whole part and tenths),
# total1:total2:total3 for cputime (hours, minutes, seconds).
total1=0
total2=0
total3=0

# fetching the values
# Each word of $psline is one value printed by ps.
for i in $psline
do
    raw=$i
    days=0
    if [ "$tt" = "3" ]
    then
        # ps may print long CPU times as DD-HH:MM:SS (see ps(1)); the days
        # are split off here and added to the hours below.
        case "$i" in
            *-*)
                days=${i%%-*}
                i=${i#*-}
                ;;
        esac
        # Special case for solaris - several format exist for the time
        # function: a short value is taken as MM:SS and given an hour part.
        if [ ${#i} -le 6 ]
        then
            i="00:$i"
        fi
    fi

    # Split on the metric delimiter. Missing trailing parts become 0, so a
    # value printed without a fractional part (e.g. "250") counts as 250.0.
    IFS="$delim" read -r value1 value2 value3 _ <<<"$i"
    value2=${value2:-0}
    value3=${value3:-0}

    # Anything that is not a plain number cannot be added up; report it
    # rather than continuing with broken totals.
    for part in "$days" "$value1" "$value2" "$value3"
    do
        case "$part" in
            ""|*[!0-9]*)
                echo "UNKNOWN - cannot parse value '$raw' returned by ps"
                exit 3
                ;;
        esac
    done

    # 10# forces base 10, so parts such as "08" are not read as octal.
    total1=$(( total1 + 10#$days * 24 + 10#$value1 ))
    total2=$(( total2 + 10#$value2 ))
    total3=$(( total3 + 10#$value3 ))

    if [ "$tt" = "3" ]
    then
        # Carry seconds into minutes and minutes into hours.
        total2=$(( total2 + total3 / 60 ))
        total3=$(( total3 % 60 ))
        total1=$(( total1 + total2 / 60 ))
        total2=$(( total2 % 60 ))
    else
        # Carry tenths into the whole part (assumes ps prints a single
        # fractional digit, as the carry at 10 always did).
        total1=$(( total1 + total2 / 10 ))
        total2=$(( total2 % 10 ))
    fi
done

# warn / crit become 1 when the cumulative total has reached the warning /
# critical threshold; return_total is the total formatted for the result line.
warn=0
crit=0

# Print a value, prefixed with "0" when it is exactly one character long.
#   $1 - value to print
two_chars() {
    if [ ${#1} -eq 1 ]
    then
        printf '%s' "0$1"
    else
        printf '%s' "$1"
    fi
}

# Succeed when the cumulative total is at or above a threshold.
#   $1 $2 $3 - the parts of the threshold
# The first part decides when it differs. For cputime the second part decides
# next and the third one settles a tie (>=); for the other metrics the second
# part settles the tie (>=) and the third is not looked at.
reached() {
    [ "$total1" -gt "$1" ] && return 0
    [ "$total1" -eq "$1" ] || return 1
    if [ "$tt" = "3" ]
    then
        [ "$total2" -gt "$2" ] && return 0
        [ "$total2" -eq "$2" ] && [ "$total3" -ge "$3" ]
    else
        [ "$total2" -ge "$2" ]
    fi
}

# evaluation of the cumulative values vs warning and critical values
case "$tt" in
    1|2)
        return_total="$total1.$total2"
        ;;
    3)
        return_total="$(two_chars "$total1"):$(two_chars "$total2"):$(two_chars "$total3")"
        ;;
esac

case "$tt" in
    1|2|3)
        if reached "$ct_value1" "$ct_value2" "$ct_value3"
        then
            crit=1
        fi
        if reached "$wt_value1" "$wt_value2" "$wt_value3"
        then
            warn=1
        fi
        ;;
esac

# last check ...
# Reaching the critical threshold without reaching the warning one is
# reported as a mistake in the thresholds given on the command line.
if [ $crit -eq 1 ] && [ $warn -eq 0 ]
then
    echo "Critical value must be greater than warning value !"
    help_usage
    exit 3
fi

# Finally Inform Nagios of what we found...
# The "|" separators stored in process_name are printed as commas.
shown_name=$(echo $process_name | tr "|" ",")
if [ $crit -eq 1 ]
then
    state="CRITICAL"
    code=2
elif [ $warn -eq 1 ]
then
    state="WARNING"
    code=1
else
    state="OK"
    code=0
fi
echo "$state - total $type_arg_aff for process $shown_name : $return_total"
exit $code

# Hey what are we doing here ???
# (never reached: the exit above always ends the script)
exit 3