diff options
Diffstat (limited to 'contrib')
| -rw-r--r-- | contrib/check_snmp_process_monitor.pl | 250 |
1 files changed, 177 insertions, 73 deletions
diff --git a/contrib/check_snmp_process_monitor.pl b/contrib/check_snmp_process_monitor.pl index 263255b5..c98ee7dc 100644 --- a/contrib/check_snmp_process_monitor.pl +++ b/contrib/check_snmp_process_monitor.pl | |||
| @@ -19,7 +19,8 @@ use lib qw( /opt/nagios/libexec /usr/local/libexec ); | |||
| 19 | use utils qw(%ERRORS $TIMEOUT &print_revision &support &usage); | 19 | use utils qw(%ERRORS $TIMEOUT &print_revision &support &usage); |
| 20 | use SNMP 5.0; | 20 | use SNMP 5.0; |
| 21 | use Getopt::Long; | 21 | use Getopt::Long; |
| 22 | use vars qw( $exit $opt_version $opt_timeout $opt_help $opt_command $opt_host $opt_community $opt_verbose $opt_warning $opt_critical $opt_memory $opt_cpu $opt_port $opt_regex $opt_stats %processes $snmp_session $PROGNAME $TIMEOUT ); | 22 | use Storable; |
| 23 | use vars qw( $exit $opt_version $opt_timeout $opt_help $opt_command $opt_host $opt_community $opt_verbose $opt_warning $opt_critical $opt_memory $opt_cpu $opt_port $opt_regex $opt_stats $opt_cache $opt_nocache $cache_exp $interpreters $snmp_session $PROGNAME $TIMEOUT ); | ||
| 23 | 24 | ||
| 24 | $PROGNAME = "snmp_process_monitor.pl"; | 25 | $PROGNAME = "snmp_process_monitor.pl"; |
| 25 | $opt_verbose = undef; | 26 | $opt_verbose = undef; |
| @@ -31,8 +32,13 @@ $opt_critical = [ 1, -1 ]; | |||
| 31 | $opt_memory = undef; | 32 | $opt_memory = undef; |
| 32 | $opt_cpu = undef; | 33 | $opt_cpu = undef; |
| 33 | $opt_port = 161; | 34 | $opt_port = 161; |
| 34 | %processes = (); | 35 | $opt_cache = 1; |
| 35 | $exit = 'OK'; | 36 | $opt_nocache = undef; |
| 37 | $cache_exp = 600; | ||
| 38 | $exit = $ERRORS{OK}; | ||
| 39 | $interpreters = '(perl|/bin/sh|/usr/bin/sh|/bin/bash|/bin/ksh|python)'; | ||
| 40 | our $cachefile = '/var/opt/nagios/tmp/'; # completed later | ||
| 41 | our %processes = (); | ||
| 36 | 42 | ||
| 37 | sub process_options { | 43 | sub process_options { |
| 38 | my( $opt_crit, $opt_warn ) = (); | 44 | my( $opt_crit, $opt_warn ) = (); |
| @@ -42,6 +48,7 @@ sub process_options { | |||
| 42 | 'v' => \$opt_verbose, 'verbose' => \$opt_verbose, | 48 | 'v' => \$opt_verbose, 'verbose' => \$opt_verbose, |
| 43 | 'h' => \$opt_help, 'help' => \$opt_help, | 49 | 'h' => \$opt_help, 'help' => \$opt_help, |
| 44 | 's' => \$opt_stats, 'statistics' => \$opt_stats, | 50 | 's' => \$opt_stats, 'statistics' => \$opt_stats, |
| 51 | 'nocache' => \$opt_nocache, | ||
| 45 | 'H:s' => \$opt_host, 'hostname:s' => \$opt_host, | 52 | 'H:s' => \$opt_host, 'hostname:s' => \$opt_host, |
| 46 | 'p:i' => \$opt_port, 'port:i' => \$opt_port, | 53 | 'p:i' => \$opt_port, 'port:i' => \$opt_port, |
| 47 | 'C:s' => \$opt_community, 'community:s' => \$opt_community, | 54 | 'C:s' => \$opt_community, 'community:s' => \$opt_community, |
| @@ -75,6 +82,12 @@ sub process_options { | |||
| 75 | $opt_warning = [ $opt_crit, -1 ]; | 82 | $opt_warning = [ $opt_crit, -1 ]; |
| 76 | } | 83 | } |
| 77 | } | 84 | } |
| 85 | if ( defined($opt_memory) ) { $opt_memory = 0 } | ||
| 86 | if ( defined($opt_cpu) ) { $opt_cpu = 0 } | ||
| 87 | if ( defined($opt_nocache)) { $opt_cache = 0 } | ||
| 88 | |||
| 89 | # complete the cachefile's name | ||
| 90 | $cachefile .= $opt_host . '.proc'; | ||
| 78 | } | 91 | } |
| 79 | 92 | ||
| 80 | sub local_print_revision { | 93 | sub local_print_revision { |
| @@ -82,7 +95,7 @@ sub local_print_revision { | |||
| 82 | } | 95 | } |
| 83 | 96 | ||
# Print the one-line command synopsis for this plugin on standard output.
sub print_usage {
    my $synopsis = "Usage: $PROGNAME -H <host> -C <snmp_community> -e <command> [-w <low>,<high>] [-c <low>,<high>] [-t <timeout>] [-s|--statistics] [--memory] [--cpu] [--nocache]\n";
    print $synopsis;
}
| 87 | 100 | ||
| 88 | sub print_help { | 101 | sub print_help { |
| @@ -107,6 +120,10 @@ sub print_help { | |||
| 107 | minimum and maximum number of processes before a warning is issued (Default 1,-1) | 120 | minimum and maximum number of processes before a warning is issued (Default 1,-1) |
| 108 | -c, --critical=INTEGER[,INTEGER] | 121 | -c, --critical=INTEGER[,INTEGER] |
| 109 | minimum and maximum number of processes before a critical is issued (Default 1,-1) | 122 | minimum and maximum number of processes before a critical is issued (Default 1,-1) |
| 123 | --memory | ||
| 124 | combined with '-s', will print the number of bytes of real memory used by process | ||
| 125 | --cpu | ||
| 126 | combined with '-s', will print the number of seconds of cpu time consumed by process | ||
| 110 | EOT | 127 | EOT |
| 111 | } | 128 | } |
| 112 | 129 | ||
| @@ -117,11 +134,129 @@ sub verbose (@) { | |||
| 117 | 134 | ||
# Abort the check if the last request on $snmp_session reported an error.
#
# Does nothing when $snmp_session->{ErrorNum} is false.  Otherwise the
# in-memory process table is emptied, an UNKNOWN status line carrying the
# session's ErrorStr is printed, and the plugin exits with $ERRORS{UNKNOWN}.
sub check_for_errors {
    return unless ( $snmp_session->{ErrorNum} );

    %processes = ();
    print "UNKNOWN - error retrieving SNMP data: $snmp_session->{ErrorStr}\n";
    exit $ERRORS{UNKNOWN};
}
| 124 | 142 | ||
# Populate %processes from the per-host cache file, if caching is enabled.
#
# %processes is always reset first.  It is then filled from $cachefile only
# when caching is on, the file is readable, and Storable hands back a hash
# reference; in every other case it stays empty so that update_cache()
# fetches a fresh listing.  Takes no arguments and returns nothing useful.
sub init_cache {
    %processes = ();

    # $opt_cache starts out as 1 and --nocache sets it to 0, so it is always
    # defined; test its truth, not its definedness, or --nocache would still
    # load (and later trust) the stale cache file.
    return if ( !$opt_cache );
    return if ( !-r $cachefile );

    verbose "loading cache from $cachefile\n";
    my $cached = eval { retrieve( $cachefile ) };

    # a die inside retrieve() or anything other than a hash ref means the
    # file is unusable - fall back to the blank table
    if ( $@ || ref($cached) ne 'HASH' ) {
        verbose "cache loading failed - using blank cache: $@\n";
        return;
    }

    %processes = %$cached;
    return;
}
| 162 | |||
# Fetch a single value over the global $snmp_session.
#
# The one argument is handed unchanged to SNMP::Varbind->new (callers in
# this file pass a [ name, instance ] array ref).  check_for_errors() runs
# after the GET, so a session error terminates the plugin before returning.
# Returns the varbind's val.
sub snmpget {
    my ($oid) = @_;

    my $varbind = SNMP::Varbind->new( $oid );
    $snmp_session->get( $varbind );
    check_for_errors();

    return $varbind->val;
}
| 169 | |||
# Refresh the global %processes table from the remote host when needed.
#
# The existing table is reused when caching is enabled and its
# __last_update stamp is no older than $cache_exp seconds.  Otherwise the
# hrSWRunName column is walked and %processes is rebuilt from scratch,
# keyed by SNMP instance id.  Each entry is a hash ref holding:
#   name     - the string that -e / -r is compared against
#   abs_name - the unmodified hrSWRunName value
#
# Returns 1 if the cached table was reused, 0 if a fresh listing was
# fetched.  Prints an UNKNOWN status and exits when far fewer rows come
# back than hrSystemProcesses announced (or, via check_for_errors(), when
# the SNMP session reports an error).
sub update_cache {
    # expire the cache after $cache_exp seconds
    if ( $opt_cache != 0 && exists($processes{__last_update})
        && $processes{__last_update} >= time - $cache_exp ) {
        verbose "cache file is recent enough - using it\n";
        return 1;
    }

    verbose "retrieving full listing of processes from $opt_host\n";
    my $process_count = snmpget( ['hrSystemProcesses', 0] );

    # retrieve the data from the remote host
    my ($names) = $snmp_session->bulkwalk( 0, $process_count + 1, [['hrSWRunName']] );
    check_for_errors();

    # in case the walk handed back nothing at all, treat it as an empty
    # listing so the sanity check below can report it instead of dying
    $names ||= [];

    # make sure the number of processes from the bulkwalk is close to hrSystemProcesses
    if ( scalar(@$names) + 10 < $process_count ) {
        print "UNKNOWN - only ", scalar(@$names), " of ",$process_count, " processes returned\n";
        exit $ERRORS{UNKNOWN};
    }

    # throw away whatever an expired cache contributed; without this,
    # processes that have exited since the last walk would stay in the
    # table (and in the cache file) forever
    %processes = ();

    # the interpreter pattern does not change while looping - compile it once
    my $interp_re = qr#$interpreters$#;

    # sort through the process names and create a nice hash of processes
    foreach my $row ( @$names ) {
        my $run_name = $row->val;

        # must be an empty list: "= {}" would store a stringified hash
        # reference as a bogus key in every entry
        my %hash = ();
        $hash{name}     = $run_name;
        $hash{abs_name} = $run_name;
        $hash{name} =~ s#.*/##; # strip path

        # capture the interpreter explicitly instead of relying on $1,
        # which may be left over from an earlier, unrelated match
        my ($interpreter) = $run_name =~ $interp_re;

        if ( defined($opt_regex)
            || (defined($interpreter) && $opt_command !~ $interp_re) ) {

            # fetch the runtime parameters of the process
            my $parameters = snmpget( ['hrSWRunParameters', $row->iid] );

            # only strip if we're looking for a specific command
            if ( defined($opt_command) ) {
                verbose "process ",$row->iid," uses $interpreter as an interpreter - getting parameters\n"
                    if ( defined($interpreter) );
                $hash{name} = $parameters;
                $hash{name} =~ s#.*/##;     # strip path name off the front
                $hash{name} =~ s/\s+.*$//;  # strip everything from the first space to the end
            }
            else {
                # use the full 'ps -efl' style listing for regular expression matching
                my $path = snmpget( ['hrSWRunPath', $row->iid] );
                $hash{name} = "$path $parameters";
            }
        }

        # store in the global hash
        $processes{$row->iid} = \%hash;
    }

    # update the timestamp so the cache can expire
    $processes{__last_update} = time;
    return 0;
}
| 226 | |||
# Walk the %processes table and count the entries that match the requested
# command (-e, exact string comparison) or regular expression (-r).
#
# When caching is enabled, every candidate is confirmed with a live
# hrSWRunName lookup and dropped if the agent no longer reports the same
# name for that index.  For each confirmed match the optional memory and
# cpu counters ($opt_memory / $opt_cpu) are increased by the process's
# hrSWRunPerfMem / hrSWRunPerfCPU values.
#
# Returns the number of matching processes.
sub check_for_matches {
    my $ret_match = 0;

    foreach my $key ( keys(%processes) ) {
        # the timestamp lives in the same hash but is not a process
        next if ( $key eq '__last_update' );

        my $entry = $processes{$key};
        my $match = 0;

        # static matches are letter-for-letter (-e)
        if ( defined($opt_command) && $entry->{name} eq $opt_command ) {
            $match = 1;
        }
        # use /o to make sure the user-supplied regex (-r) is only compiled once
        elsif ( defined($opt_regex) && $entry->{name} =~ /$opt_regex/o ) {
            $match = 1;
        }

        # verify the cache's entry by doing an snmpget
        if ( $match && $opt_cache != 0 ) {
            my $proc = snmpget( ['hrSWRunName', $key] );
            $match = 0 if ( !$proc || $proc ne $entry->{abs_name} );
        }
        next if ( !$match );

        # get the process memory usage if requested
        if ( defined($opt_memory) ) {
            $opt_memory += snmpget( ['hrSWRunPerfMem', $key] );
        }
        # get the process cpu usage if requested
        if ( defined($opt_cpu) ) {
            $opt_cpu += snmpget( ['hrSWRunPerfCPU', $key] );
        }

        # entries carry no 'pid' field, so only the table index is reported
        verbose "process '$entry->{name}' has index $key\n";

        $ret_match++;
    }

    return $ret_match;
}
| 125 | # =========================================================================== # | 260 | # =========================================================================== # |
| 126 | # =====> MAIN | 261 | # =====> MAIN |
| 127 | # =========================================================================== # | 262 | # =========================================================================== # |
| @@ -129,6 +264,10 @@ process_options(); | |||
| 129 | 264 | ||
| 130 | alarm( $TIMEOUT ); # make sure we don't hang Nagios | 265 | alarm( $TIMEOUT ); # make sure we don't hang Nagios |
| 131 | 266 | ||
| 267 | # initialize the cache, if it's enabled | ||
| 268 | init_cache(); | ||
| 269 | |||
| 270 | # create a session for conversing with the remote SNMP agent | ||
| 132 | $snmp_session = new SNMP::Session( | 271 | $snmp_session = new SNMP::Session( |
| 133 | DestHost => $opt_host, | 272 | DestHost => $opt_host, |
| 134 | Community => $opt_community, | 273 | Community => $opt_community, |
| @@ -136,92 +275,57 @@ $snmp_session = new SNMP::Session( | |||
| 136 | Version => '2c' | 275 | Version => '2c' |
| 137 | ); | 276 | ); |
| 138 | 277 | ||
| 139 | my $process_count = SNMP::Varbind->new( ['hrSystemProcesses', 0] ); | 278 | my $usage = update_cache(); |
| 140 | $snmp_session->get( $process_count ); | 279 | my $count = check_for_matches(); |
| 141 | check_for_errors(); | ||
| 142 | |||
| 143 | # retrieve the data from the remote host | ||
| 144 | my( $names, $index ) = $snmp_session->bulkwalk( 0, $process_count->val, [['hrSWRunName'], ['hrSWRunIndex']] ); | ||
| 145 | check_for_errors(); | ||
| 146 | |||
| 147 | alarm( 0 ); # all done with the network connection | ||
| 148 | |||
| 149 | my %namecount = (); | ||
| 150 | foreach my $row ( @$names ) { | ||
| 151 | $processes{$row->iid}->{name} = $row->val; | ||
| 152 | $processes{$row->iid}->{name} =~ s#.*/##; # strip path | ||
| 153 | |||
| 154 | if ( defined($opt_regex) || | ||
| 155 | ($row->val =~ /(perl|\/usr\/bin\/sh|\/bin\/bash|\/bin\/sh)$/ | ||
| 156 | && $opt_command !~ /(perl|\/usr\/bin\/sh|\/bin\/bash|\/bin\/sh)$/) ) { | ||
| 157 | |||
| 158 | # fetch the runtime parameters of the process | ||
| 159 | my $parm_var = SNMP::Varbind->new( ['hrSWRunParameters', $row->iid] ); | ||
| 160 | $snmp_session->get( $parm_var ); | ||
| 161 | check_for_errors(); | ||
| 162 | |||
| 163 | # only strip if we're looking for a specific command | ||
| 164 | if ( defined($opt_command) ) { | ||
| 165 | verbose "process ",$row->iid," uses $1 as an interpreter - getting parameters\n"; | ||
| 166 | $processes{$row->iid}->{name} = $parm_var->val; | ||
| 167 | # strip path name off the front | ||
| 168 | $processes{$row->iid}->{name} =~ s#.*/##; | ||
| 169 | # strip everything from the first space to the end | ||
| 170 | $processes{$row->iid}->{name} =~ s/\s+.*$//; | ||
| 171 | } | ||
| 172 | else { | ||
| 173 | # get the longer full-path style listing | ||
| 174 | my $path_var = SNMP::Varbind->new( ['hrSWRunPath', $row->iid] ); | ||
| 175 | $snmp_session->get( $path_var ); | ||
| 176 | check_for_errors(); | ||
| 177 | 280 | ||
| 178 | # use the full 'ps -efl' style listing for regular expression matching | 281 | # always try twice if caching is enabled - once with cache and once without |
| 179 | $processes{$row->iid}->{name} = $path_var->val.' '.$parm_var->val; | 282 | if ( $usage != 0 && $opt_cache != 0 && $count <= 0 ) { |
| 180 | } | 283 | verbose "did not find process in cache - trying a refresh\n"; |
| 181 | } | 284 | %processes = (); |
| 182 | } | 285 | update_cache(); |
| 183 | foreach my $row ( @$index ) { | 286 | $count = check_for_matches(); |
| 184 | $processes{$row->iid}->{pid} = $row->val; | ||
| 185 | } | 287 | } |
| 186 | 288 | ||
| 187 | my @pids = (); | 289 | |
| 188 | my @matches = (); | 290 | # the default, OK message |
| 189 | foreach my $key ( keys(%processes) ) { | 291 | my $message = "OK - $count process(es) found resembling '". ($opt_command || $opt_regex); |
| 190 | if ( defined($opt_command) && $processes{$key}->{name} eq $opt_command ) { | ||
| 191 | push( @matches, $processes{$key} ); | ||
| 192 | push( @pids, $processes{$key}->{pid} ); | ||
| 193 | verbose "process '$processes{$key}->{name}' has pid ", | ||
| 194 | "$processes{$key}->{pid} and index $key\n"; | ||
| 195 | } | ||
| 196 | elsif ( defined($opt_regex) && $processes{$key}->{name} =~ /$opt_regex/o ) { | ||
| 197 | push( @matches, $processes{$key} ); | ||
| 198 | push( @pids, $processes{$key}->{pid} ); | ||
| 199 | verbose "process '$processes{$key}->{name}' has pid ", | ||
| 200 | "$processes{$key}->{pid} and index $key\n"; | ||
| 201 | } | ||
| 202 | } | ||
| 203 | my $count = @matches; | ||
| 204 | 292 | ||
| 205 | # warning, critical | 293 | # warning, critical |
| 206 | if ( ($opt_warning->[0] > 0 && $opt_warning->[0] > $count) | 294 | if ( ($opt_warning->[0] > 0 && $opt_warning->[0] > $count) |
| 207 | || ($opt_warning->[1] > 0 && $opt_warning->[1] <= $count) ) { | 295 | || ($opt_warning->[1] > 0 && $opt_warning->[1] <= $count) ) { |
| 208 | $exit = 'WARNING'; | 296 | $message = "WARNING - no processes found resembling '". ($opt_command || $opt_regex); |
| 297 | $exit = $ERRORS{WARNING}; | ||
| 209 | } | 298 | } |
| 210 | if ( ($opt_critical->[0] > 0 && $opt_critical->[0] > $count) | 299 | if ( ($opt_critical->[0] > 0 && $opt_critical->[0] > $count) |
| 211 | || ($opt_critical->[1] > 0 && $opt_critical->[1] <= $count) ) { | 300 | || ($opt_critical->[1] > 0 && $opt_critical->[1] <= $count) ) { |
| 212 | $exit = 'CRITICAL'; | 301 | $message = "CRITICAL - no processes found resembling '". ($opt_command || $opt_regex); |
| 302 | $exit = $ERRORS{CRITICAL}; | ||
| 213 | } | 303 | } |
| 214 | 304 | ||
| 215 | print "$exit - $count processes with pid(s) ",join(',',@pids); | 305 | # output the status message |
| 306 | print $message, "'"; | ||
| 216 | 307 | ||
| 217 | # print the number of processes if statistics are requested | 308 | # print the number of processes if statistics are requested |
| 218 | if ( defined($opt_stats) ) { | 309 | if ( defined($opt_stats) ) { |
| 219 | print "|count:$count\n"; | 310 | print "|count=$count"; |
| 311 | if ( defined($opt_memory) ) { | ||
| 312 | print ":memory=", $opt_memory; | ||
| 313 | } | ||
| 314 | if ( defined($opt_cpu) ) { | ||
| 315 | $opt_cpu = $opt_cpu / 100; | ||
| 316 | printf ":cpu=%.2f", $opt_cpu; | ||
| 317 | } | ||
| 220 | } | 318 | } |
| 221 | else { | 319 | |
| 222 | print "\n"; | 320 | # store a copy of the %processes hash if we're using caching |
| 321 | if ( $exit == $ERRORS{OK} && $opt_cache != 0 ) { | ||
| 322 | eval { | ||
| 323 | unlink( $cachefile ) if ( -e $cachefile ); | ||
| 324 | store( \%processes, $cachefile ); | ||
| 325 | }; | ||
| 223 | } | 326 | } |
| 224 | 327 | ||
| 225 | exit $ERRORS{$exit}; | 328 | print "\n"; |
| 329 | exit $exit; | ||
| 226 | 330 | ||
| 227 | 331 | ||
