diff options
Diffstat (limited to 'contrib/check_snmp_process_monitor.pl')
-rw-r--r-- | contrib/check_snmp_process_monitor.pl | 331 |
1 files changed, 0 insertions, 331 deletions
diff --git a/contrib/check_snmp_process_monitor.pl b/contrib/check_snmp_process_monitor.pl deleted file mode 100644 index 0f44597..0000000 --- a/contrib/check_snmp_process_monitor.pl +++ /dev/null | |||
@@ -1,331 +0,0 @@ | |||
1 | #!/usr/local/bin/perl | ||
2 | # author: Al Tobey <albert.tobey@priority-health.com> | ||
3 | # what: monitor a process using the host-resources mib | ||
4 | # license: GPL - http://www.fsf.org/licenses/gpl.txt | ||
5 | # | ||
6 | # Todo: | ||
7 | # * implement memory and cpu utilization checks | ||
8 | # * maybe cache pids in DBM files if snmp agents get overworked | ||
9 | ############################################################################### | ||
10 | # to get a list of processes over snmp try this command: | ||
11 | # snmptable -v2c -c public hostname hrSWRunTable | ||
12 | # for just a list of valid arguments for the '-e' option: | ||
13 | # snmpwalk -v2c -c public hostname hrSWRunName |perl -pe 's:.*/::' | ||
14 | ############################################################################### | ||
15 | |||
16 | use strict; | ||
17 | require 5.6.0; | ||
18 | use lib qw( /opt/nagios/libexec /usr/local/libexec ); | ||
19 | use utils qw(%ERRORS $TIMEOUT &print_revision &support &usage); | ||
20 | use SNMP 5.0; | ||
21 | use Getopt::Long; | ||
22 | use Storable; | ||
23 | use vars qw( $exit $opt_version $opt_timeout $opt_help $opt_command $opt_host $opt_community $opt_verbose $opt_warning $opt_critical $opt_memory $opt_cpu $opt_port $opt_regex $opt_stats $opt_cache $opt_nocache $cache_exp $interpreters $snmp_session $PROGNAME $TIMEOUT ); | ||
24 | |||
# Name this plugin reports in its usage and revision output.
$PROGNAME = "snmp_process_monitor.pl";

# Options without a default: they stay undefined until given on the command line.
$opt_verbose = $opt_host = $opt_command = undef;
$opt_memory  = $opt_cpu  = $opt_nocache = undef;

# SNMP connection defaults.
$opt_community = 'public';
$opt_port      = 161;

# Process-count thresholds, stored as [ minimum, maximum ] pairs.
$opt_critical = [ 1, -1 ];
$opt_warning  = [ 1, -1 ];

# Caching is on by default; a cached process table is reused for $cache_exp seconds.
$opt_cache = 1;
$cache_exp = 600;

# Exit status reported to Nagios unless a threshold check changes it.
$exit = $ERRORS{OK};

# Process names matching this pattern are looked up by their run parameters
# (see update_cache) instead of by the interpreter's own name.
$interpreters = '(perl|/bin/sh|/usr/bin/sh|/bin/bash|/bin/ksh|python)';

# Directory part of the cache file; process_options() appends "<host>.proc".
our $cachefile = '/var/opt/nagios/tmp/';

# Process table keyed by SNMP row index, plus a '__last_update' timestamp entry.
our %processes = ();
42 | |||
# Parse the command line into the $opt_* globals, validate the mandatory
# arguments and complete the cache file name.
#
# Side effects:
#   * prints the help text and exits with $ERRORS{UNKNOWN} when -H is missing,
#     when -h/--help was given, or when neither -e nor -r was supplied
#   * turns on $SNMP::debugging when -v/--verbose is given
#   * resets $opt_memory / $opt_cpu to 0 when requested, because they are
#     later used as accumulators by check_for_matches()
#   * appends "<host>.proc" to $cachefile
sub process_options {
    my ( $opt_crit, $opt_warn );

    # Turn a threshold argument ("MIN" or "MIN,MAX") into a [ MIN, MAX ] pair.
    # A missing or empty MIN becomes 0 and a missing or empty MAX becomes -1;
    # both values mean "do not check this bound" to the threshold tests.
    my $parse_range = sub {
        my ($spec) = @_;
        my ( $min, $max ) = split( /,/, $spec );
        $min = 0  if ( !defined($min) || $min eq '' );
        $max = -1 if ( !defined($max) || $max eq '' );
        return [ $min, $max ];
    };

    Getopt::Long::Configure( 'bundling' );
    GetOptions(
        'V'     => \$opt_version,   'version'     => \$opt_version,
        'v'     => \$opt_verbose,   'verbose'     => \$opt_verbose,
        'h'     => \$opt_help,      'help'        => \$opt_help,
        's'     => \$opt_stats,     'statistics'  => \$opt_stats,
        'nocache' => \$opt_nocache,
        'H:s'   => \$opt_host,      'hostname:s'  => \$opt_host,
        'p:i'   => \$opt_port,      'port:i'      => \$opt_port,
        'C:s'   => \$opt_community, 'community:s' => \$opt_community,
        'c:s'   => \$opt_crit,      'critical:s'  => \$opt_crit,
        'w:s'   => \$opt_warn,      'warning:s'   => \$opt_warn,
        't:i'   => \$TIMEOUT,       'timeout:i'   => \$TIMEOUT,
        'e:s'   => \$opt_command,   'command:s'   => \$opt_command,
        'r:s'   => \$opt_regex,     'regex:s'     => \$opt_regex,
        'cpu:i' => \$opt_cpu,       'memory:i'    => \$opt_memory,
    );
    if ( defined($opt_version) ) { local_print_revision(); }
    if ( defined($opt_verbose) ) { $SNMP::debugging = 1; }
    if ( !defined($opt_host) || defined($opt_help) || (!defined($opt_command) && !defined($opt_regex)) ) {
        print_help();
        exit $ERRORS{UNKNOWN};
    }

    # The warning threshold is built from $opt_warn; it used to be built from
    # $opt_crit, which left the minimum undefined for a plain "-w N".
    $opt_critical = $parse_range->($opt_crit) if ( defined($opt_crit) );
    $opt_warning  = $parse_range->($opt_warn) if ( defined($opt_warn) );

    # --memory / --cpu only switch the statistic on; start the totals at zero
    if ( defined($opt_memory) ) { $opt_memory = 0 }
    if ( defined($opt_cpu) )    { $opt_cpu = 0 }
    if ( defined($opt_nocache) ) { $opt_cache = 0 }

    # complete the cachefile's name
    $cachefile .= $opt_host . '.proc';
}
92 | |||
# Print this plugin's name and revision through the shared print_revision()
# helper imported from utils.
sub local_print_revision {
    my $revision = '$Revision: 84 $ ';
    return print_revision( $PROGNAME, $revision );
}
96 | |||
# Print the one-line command synopsis to STDOUT.
sub print_usage {
    my $synopsis = "Usage: $PROGNAME -H <host> -C <snmp_community> -e <command> [-w <low>,<high>] [-c <low>,<high>] [-t <timeout>] [-s|--statistics] [--memory] [--cpu] [--nocache]\n";
    print $synopsis;
}
100 | |||
# Print the full help screen to STDOUT: revision line, copyright banner,
# usage synopsis and a description of the options.
sub print_help {
    local_print_revision();

    print "Copyright (c) 2002 Al Tobey <albert.tobey\@priority-health.com>\n\n";
    print "SNMP Process Monitor plugin for Nagios\n\n";

    print_usage();

    # per-option descriptions
    my $option_text = <<"EOT";
-v, --verbose
print extra debugging information
-h, --help
print this help message
-H, --hostname=HOST
name or IP address of host to check
-C, --community=COMMUNITY NAME
community name for the host's SNMP agent
-e, --command=COMMAND NAME (ps -e style)
what command should be monitored?
-r, --regex=Perl RE
use a perl regular expression to find your process
-w, --warning=INTEGER[,INTEGER]
minimum and maximum number of processes before a warning is issued (Default 1,-1)
-c, --critical=INTEGER[,INTEGER]
minimum and maximum number of processes before a critical is issued (Default 1,-1)
--memory
combined with '-s', will print the number of bytes of real memory used by process
--cpu
combined with '-s', will print the number of seconds of cpu time consumed by process
EOT
    print $option_text;
}
129 | |||
# Print the given list to STDOUT, but only when -v/--verbose was requested.
sub verbose (@) {
    my @chunks = @_;
    return unless ( defined $opt_verbose );
    print @chunks;
}
134 | |||
# Abort the plugin if the last operation on $snmp_session reported an error:
# empty the process table, print an UNKNOWN status line and exit with
# $ERRORS{UNKNOWN}.  Does nothing when the session's ErrorNum is false.
sub check_for_errors {
    return unless ( $snmp_session->{ErrorNum} );

    %processes = ();
    my $reason = $snmp_session->{ErrorStr};
    print "UNKNOWN - error retrieving SNMP data: $reason\n";
    exit $ERRORS{UNKNOWN};
}
142 | |||
# Initialise the global %processes table, loading it from $cachefile when
# caching is enabled and the file is readable.
#
# %processes is left empty when caching is disabled, when the file is missing
# or unreadable, or when the file does not contain a Storable hash.
sub init_cache {
    %processes = ();

    # $opt_cache is 1 by default and 0 after --nocache (never undef), so test
    # its truth; a defined() test would load the cache even with --nocache.
    return if ( !$opt_cache );
    return if ( !-r $cachefile );

    verbose( "loading cache from $cachefile\n" );

    # NOTE(review): Storable's retrieve() is not safe on files written by
    # untrusted users - make sure the cache directory is writable only by the
    # account that runs this plugin.
    my $cached = eval { retrieve( $cachefile ) };
    if ( $@ || ref($cached) ne 'HASH' ) {
        verbose( "cache loading failed - using blank cache: $@\n" );
        return;
    }
    %processes = %{ $cached };
}
162 | |||
# Fetch one SNMP object through the global $snmp_session.
#   argument: a varbind specification, e.g. [ 'hrSWRunName', $index ]
#   returns:  the value stored in the varbind after the get
# check_for_errors() runs after the request and ends the plugin on failure.
sub snmpget {
    my ($oid_spec) = @_;

    my $varbind = SNMP::Varbind->new( $oid_spec );
    $snmp_session->get( $varbind );
    check_for_errors();

    return $varbind->val;
}
169 | |||
# Make sure %processes holds a usable process table for $opt_host.
#
# Returns 1 when the table already in %processes (loaded from the cache) is
# younger than $cache_exp seconds and was kept as is.  Otherwise the table is
# rebuilt from the agent's hrSWRunName column and 0 is returned.
#
# Each entry is keyed by the row's instance id and holds:
#   abs_name - the hrSWRunName value exactly as the agent reported it
#   name     - the string check_for_matches() compares against -e / -r
#
# Exits with $ERRORS{UNKNOWN} (directly, or via check_for_errors) when the
# agent cannot be queried or returns far fewer rows than hrSystemProcesses.
sub update_cache {
    # expire the cache after $cache_exp seconds
    if ( $opt_cache != 0 && exists($processes{__last_update})
        && $processes{__last_update} >= time - $cache_exp ) {
        verbose( "cache file is recent enough - using it\n" );
        return 1;
    }

    verbose( "retrieving full listing of processes from $opt_host\n" );
    my $process_count = snmpget( ['hrSystemProcesses', 0] );

    # retrieve the data from the remote host
    my ($names) = $snmp_session->bulkwalk( 0, $process_count + 1, [['hrSWRunName']] );
    check_for_errors();

    # a walk that produced nothing is handled like an empty table
    $names = [] if ( ref($names) ne 'ARRAY' );

    # make sure the number of processes from the bulkwalk is close to hrSystemProcesses
    if ( scalar(@$names) + 10 < $process_count ) {
        print "UNKNOWN - only ", scalar(@$names), " of ", $process_count, " processes returned\n";
        exit $ERRORS{UNKNOWN};
    }

    # drop whatever an expired cache left behind, so that entries for
    # processes that no longer exist are not carried along and re-saved
    %processes = ();

    # sort through the process names and create a nice hash of processes
    foreach my $row ( @$names ) {
        my $abs_name = $row->val;
        my %hash = ( name => $abs_name, abs_name => $abs_name );
        $hash{name} =~ s#.*/##;                     # strip path

        # capture the interpreter here rather than relying on $1, which is
        # stale whenever -r short-circuits the interpreter test below
        my ($interpreter) = $abs_name =~ m#$interpreters$#;

        if ( defined($opt_regex)
            || (defined($interpreter) && $opt_command !~ m#$interpreters$#) ) {

            # fetch the runtime parameters of the process
            my $parameters = snmpget( ['hrSWRunParameters', $row->iid] );

            # only strip if we're looking for a specific command
            if ( defined($opt_command) ) {
                my $shown = defined($interpreter) ? $interpreter : '';
                verbose( "process ", $row->iid, " uses $shown as an interpreter - getting parameters\n" );
                $hash{name} = $parameters;
                $hash{name} =~ s#.*/##;             # strip path name off the front
                $hash{name} =~ s/\s+.*$//;          # strip everything from the first space to the end
            }
            else {
                # use the full 'ps -efl' style listing for regular expression matching
                my $path = snmpget( ['hrSWRunPath', $row->iid] );
                $hash{name} = "$path $parameters";
            }
        }
        # store in the global hash
        $processes{$row->iid} = \%hash;
    }

    # update the timestamp so the cache can expire
    $processes{__last_update} = time;
    return 0;
}
226 | |||
# Walk the %processes table and count the entries that match the requested
# command (-e, exact string comparison) or regular expression (-r).
#
# When caching is enabled each candidate is re-checked with a live
# hrSWRunName lookup and dropped if the agent no longer reports the same
# name for that index.  When --memory / --cpu were requested, the
# hrSWRunPerfMem / hrSWRunPerfCPU values of every match are added to
# $opt_memory / $opt_cpu.
#
# Returns the number of matching processes.
sub check_for_matches {
    my $ret_match = 0;

    # compile the user-supplied regex (-r) once, outside the loop
    my $regex = defined($opt_regex) ? qr/$opt_regex/ : undef;

    foreach my $key ( keys(%processes) ) {
        next if ( $key eq '__last_update' );

        my $proc = $processes{$key};
        next if ( ref($proc) ne 'HASH' );           # ignore damaged cache entries
        my $match = 0;

        # static matches are letter-for-letter (-e)
        if ( defined($opt_command) && $proc->{name} eq $opt_command ) { $match = 1; }
        elsif ( defined($regex) && $proc->{name} =~ $regex ) { $match = 1; }

        # verify the cache's entry by doing an snmpget
        if ( $match > 0 && $opt_cache != 0 ) {
            my $current = snmpget( ['hrSWRunName', $key] );
            $match = 0 if ( !$current || $current ne $proc->{abs_name} );
        }
        # get the process memory usage if requested
        if ( $match > 0 && defined($opt_memory) ) {
            $opt_memory += snmpget( ['hrSWRunPerfMem', $key] );
        }
        # get the process cpu usage if requested
        if ( $match > 0 && defined($opt_cpu) ) {
            $opt_cpu += snmpget( ['hrSWRunPerfCPU', $key] );
        }

        if ( $match > 0 ) {
            # nothing in this file stores a 'pid' key, so fall back to a
            # placeholder instead of interpolating an undefined value
            my $pid = defined($proc->{pid}) ? $proc->{pid} : 'unknown';
            verbose( "process '$proc->{name}' has pid $pid and index $key\n" );
        }

        $ret_match += $match;
    }
    return $ret_match;
}
# =========================================================================== #
# =====> MAIN
# =========================================================================== #
# Flow: parse options, arm the timeout, load the cache, open the SNMP
# session, count matching processes (retrying once with a fresh table if a
# cached table produced no match), compare the count against the thresholds,
# print the status line (plus statistics when -s is given), save the cache on
# OK and exit with the Nagios status code.
process_options();

# make sure we don't hang Nagios: without a handler SIGALRM would kill the
# plugin silently, so report the timeout with a proper status line and code
$SIG{ALRM} = sub {
    print "UNKNOWN - plugin timed out after $TIMEOUT seconds\n";
    exit $ERRORS{UNKNOWN};
};
alarm( $TIMEOUT );

# initialize the cache, if it's enabled
init_cache();

# create a session for conversing with the remote SNMP agent
$snmp_session = SNMP::Session->new(
    DestHost   => $opt_host,
    Community  => $opt_community,
    RemotePort => $opt_port,
    Version    => '2c'
);
if ( !defined($snmp_session) ) {
    print "UNKNOWN - could not create an SNMP session for $opt_host\n";
    exit $ERRORS{UNKNOWN};
}

# update_cache() returns 1 when the cached table was reused, 0 when refreshed
my $usage = update_cache();
my $count = check_for_matches();

# always try twice if caching is enabled - once with cache and once without
if ( $usage != 0 && $opt_cache != 0 && $count <= 0 ) {
    verbose( "did not find process in cache - trying a refresh\n" );
    %processes = ();
    update_cache();
    $count = check_for_matches();
}

# what we were looking for, and how to describe how many we found
my $target = ( $opt_command || $opt_regex );
my $found  = ( $count > 0 ) ? "$count process(es)" : "no processes";

# the default, OK message
my $message = "OK - $count process(es) found resembling '" . $target;

# warning, critical: a threshold trips when the count is below a positive
# minimum or has reached a positive maximum; the message reports the real
# count so that a "too many" alert does not claim nothing was found
if ( ($opt_warning->[0] > 0 && $opt_warning->[0] > $count)
    || ($opt_warning->[1] > 0 && $opt_warning->[1] <= $count) ) {
    $message = "WARNING - $found found resembling '" . $target;
    $exit = $ERRORS{WARNING};
}
if ( ($opt_critical->[0] > 0 && $opt_critical->[0] > $count)
    || ($opt_critical->[1] > 0 && $opt_critical->[1] <= $count) ) {
    $message = "CRITICAL - $found found resembling '" . $target;
    $exit = $ERRORS{CRITICAL};
}

# output the status message
print $message, "'";

# print the number of processes if statistics are requested
if ( defined($opt_stats) ) {
    print "|count=$count";
    if ( defined($opt_memory) ) {
        print ":memory=", $opt_memory;
    }
    if ( defined($opt_cpu) ) {
        # the summed hrSWRunPerfCPU value is divided by 100 for display
        $opt_cpu = $opt_cpu / 100;
        printf ":cpu=%.2f", $opt_cpu;
    }
}

# store a copy of the %processes hash if we're using caching; this is
# deliberately best-effort - a failed write must not change the check result
if ( $exit == $ERRORS{OK} && $opt_cache != 0 ) {
    eval {
        unlink( $cachefile ) if ( -e $cachefile );
        store( \%processes, $cachefile );
    };
}

print "\n";
exit $exit;
330 | |||
331 | |||