#!/usr/bin/perl -w
# check_status.pl Nagios Plugin - Version 1.3
# Last Updated: 1/9/2003
#
# Report any bugs/questions to Russell Scibetti at russell@quadrix.com
#
# check_status Change Log:
#
# To do for 1.4
# - Better help and documentation (separate doc?)
# - Take argument (patterns to match) from a separate spec file
#
# New Addition to 1.3
# - Added ChangeLog information and updated --help output
# - hostdown (hd) argument for how a service check should respond
#   when its host is Down/Unreachable
#   (--hostdown="ok|warning|critical|unknown")
# - Changed name from check_state to check_status
# - Set hostdown to default to OK when the argument isn't specified
# - Number of Hosts checked is now output in OK result
#
# Version 1.2 additions:
#
# - Added ability to handle ack'd and downtimed services differently
#   depending on argument provided
#   (--ack="ok|warning|critical|unknown|down|unreachable"
#    --dt="ok|warning|critical|unknown|down|unreachable")
#
# Version 1.1 additions:
#
# - Added --host=, --servhost= to allow for specific field
#   matching (host for matching hostname in host checks, servhost for
#   matching the hostname in service checks, service for matching the
#   service name in service checks)
# - Output the number of OK services for an OK output
#
# Version 1.0 features:
#
# - Freshness check of status.log (timestamp)
# - Match service or host checks
# - Can ignore acknowledged or downtimes services/hosts (--ack, --dt)
# - Can output different levels of detail dependent on # of problems
# - Can check for number of critical, warning, or unknowns
#
#############################################################

use strict;
use warnings;

use Getopt::Long;
use File::stat;

# Command line option values (filled in by GetOptions below).
# $verbose is accepted for compatibility but not used anywhere.
my ($version, $help, $verbose, $warning, $critical, $unknown, $pattern,
    $service, $status, $dir, $details, $host, $freshness, $servhost,
    $ack, $dt, $hdown, $ok);

Getopt::Long::Configure('bundling');
my $options_ok = GetOptions(
    "V"          => \$version,   "version"     => \$version,
    "h"          => \$help,      "help"        => \$help,
    "v"          => \$verbose,   "verbose"     => \$verbose,
    "w=s"        => \$warning,   "warning=s"   => \$warning,
    "c=s"        => \$critical,  "critical=s"  => \$critical,
    "u=s"        => \$unknown,   "unknown=s"   => \$unknown,
    "p=s"        => \$pattern,   "pattern=s"   => \$pattern,
    "S:s"        => \$service,   "service:s"   => \$service,
    "s=s"        => \$status,    "status=s"    => \$status,
    "d=s"        => \$dir,       "dir=s"       => \$dir,
    "D=s"        => \$details,   "details=s"   => \$details,
    "H:s"        => \$host,      "host:s"      => \$host,
    "f=s"        => \$freshness, "freshness=s" => \$freshness,
    "servhost=s" => \$servhost,
    "a:s"        => \$ack,       "ack:s"       => \$ack,
    "dt:s"       => \$dt,        "downtime:s"  => \$dt,
    "hd:s"       => \$hdown,     "hostdown:s"  => \$hdown,
    "ok"         => \$ok,
);

# Constants: Nagios plugin exit codes and state names
my $OK       = 0;
my $WARNING  = 1;
my $CRITICAL = 2;
my $UNKNOWN  = 3;

my $crit    = "CRITICAL";
my $warn    = "WARNING";
my $unk     = "UNKNOWN";
my $down    = "DOWN";
my $unreach = "UNREACHABLE";

# An unparsable command line is reported instead of silently ignored
if (!$options_ok) {
    printUsage();
    exitcheck($UNKNOWN);
}

# Print out Help information
if ($help) {
    printVersion();
    printHelp();
    exitcheck($UNKNOWN);
}

# Print out version information
if ($version) {
    printVersion();
    exitcheck($UNKNOWN);
}

# Check for status log or directory argument or print usage
if (!$status) {
    if (!$dir) {
        printUsage();
        exitcheck($UNKNOWN);
    }
    $status = ($dir =~ m{/$}) ? $dir . "status.log" : $dir . "/status.log";
}

# --host given without a value means "every host"
my $match_all = '[^\s]*';
if (defined $host && !$host) {
    $host = $match_all;
}

# Field matching (--service/--servhost) is the default way to select
# services, but when ONLY --pattern is supplied the pattern alone decides;
# otherwise the match-all defaults below would make --pattern a no-op.
my $match_fields = (!$pattern || defined $service || defined $servhost) ? 1 : 0;

if (!$host && !$servhost) {
    $servhost = $match_all;
}
if (!$host && !$service) {
    $service = $match_all;
}

# Validate --ack / --dt / --hostdown. A bare flag (no value) means "ok".
# The pattern is anchored so that only the exact state names are accepted.
my $valid_state = qr/\A(?:ok|critical|warning|unknown|down|unreachable)\z/;
for my $opt ([ 'ack', \$ack ], [ 'dt', \$dt ], [ 'hostdown', \$hdown ]) {
    my ($name, $value) = @$opt;
    next unless defined $$value;
    if (!$$value) {
        $$value = "ok";
    }
    elsif ($$value !~ $valid_state) {
        print "Invalid value for $name\n";
        exitcheck($UNKNOWN);
    }
}

# Documented behaviour: services on a Down/Unreachable host count as OK
# unless --hostdown says otherwise.
$hdown = "ok" unless defined $hdown;

my $much_details = 0;
my $ServiceNotOK = "CRITICAL|WARNING|UNKNOWN";
my $HostNotOK    = "DOWN|UNREACHABLE";

# Number of counted problems per state
my %numprob = ("WARNING", 0, "CRITICAL", 0, "UNKNOWN", 0, "DOWN", 0, "UNREACHABLE", 0);

# Hosts currently in scheduled downtime (exact host name => 1)
my %host_in_downtime;

# Hosts currently in a Down/Unreachable state (exact host name => 1)
my %host_is_down;

# Hash for storing state-change to OK times for hosts:
my %hostoktimes;

# Number of matches in parsing
my $nummatch = 0;

# Each threshold is either a single count ("only" form, applied to its own
# state) or a comma list of per-state counts (WARNING,CRITICAL[,UNKNOWN]).
my ($WarnOnly, $warnlevel) = parseThreshold($warning);
my ($CritOnly, $critlevel) = parseThreshold($critical);
my ($UnkOnly,  $unklevel)  = parseThreshold($unknown);

# Freshness is given in minutes (default 30) and compared in seconds
$freshness = $freshness ? $freshness * 60 : 30 * 60;

# Per-state counters and text for the two levels of detail output
my %ct          = ("CRITICAL", 0,  "WARNING", 0,  "UNKNOWN", 0,  "DOWN", 0,  "UNREACHABLE", 0);
my %much_ct     = ("CRITICAL", 0,  "WARNING", 0,  "UNKNOWN", 0,  "DOWN", 0,  "UNREACHABLE", 0);
my %output      = ("CRITICAL", "", "WARNING", "", "UNKNOWN", "", "DOWN", "", "UNREACHABLE", "");
my %much_output = ("CRITICAL", "", "WARNING", "", "UNKNOWN", "", "DOWN", "", "UNREACHABLE", "");

# -D full,some : first number is the "full details" limit, second the
# "some details" limit. A single number is the "some details" limit only.
if ($details && $details =~ /,/) {
    my @tempv = split /,/, $details;
    $much_details = $tempv[0];
    $details      = $tempv[1];
}

# A missing or unreadable status file is reported as a plugin result
# rather than dying with an exit code Nagios does not understand.
my $sta;
if (!open($sta, '<', $status)) {
    print "State UNKNOWN - Cannot open status file $status: $!\n";
    exitcheck($UNKNOWN);
}

my $file_info = stat($status);
if (!$file_info) {
    print "State UNKNOWN - Cannot stat status file $status: $!\n";
    exitcheck($UNKNOWN);
}

my $curr_time = time;
my $file_time = $file_info->mtime;
if ($curr_time - $file_time > $freshness) {
    print "State CRITICAL - Status file is stale!!!\n";
    exitcheck($CRITICAL);
}

# NOTE(review): service handling in addproblem() relies on the host
# entries having been read already, i.e. on HOST lines preceding the
# SERVICE lines in the status log -- confirm for your Nagios version.
while (my $line = <$sta>) {
    chomp $line;

    if ($line =~ /^[^\s]+[\s]+HOST;/) {
        my @hdata = split /;/, $line;

        # If you care about matching hosts (not services):
        if ($host && $hdata[1] =~ /$host/) {
            $nummatch++;
            if ($hdata[2] =~ /$HostNotOK/) {
                addproblem($line, $hdata[2]);
            }
        }
        # If you are matching services, gather host information:
        else {
            if ($hdata[2] =~ /$HostNotOK/) {
                $host_is_down{ $hdata[1] } = 1;
            }
            else {
                $hostoktimes{ $hdata[1] } = $hdata[4];
            }
            if ($hdata[17] ne "0") {
                $host_in_downtime{ $hdata[1] } = 1;
            }
        }
    }
    elsif (!$host && $line =~ /^[^\s]+[\s]+SERVICE;/) {
        my @servdata = split /;/, $line;

        my $by_pattern = ($pattern && $line =~ /$pattern/);
        my $by_fields  = ($match_fields
                          && $servdata[1] =~ /$servhost/
                          && $servdata[2] =~ /$service/);

        if ($by_pattern || $by_fields) {
            $nummatch++;
            # Only HARD state problems are counted
            if (($servdata[5] eq "HARD") && ($servdata[3] =~ /$ServiceNotOK/)) {
                addproblem($line, $servdata[3]);
            }
        }
    }
}
close($sta);

if ($nummatch == 0) {
    print "Nothing Matches your criteria!\n";
    exitcheck($UNKNOWN);
}

# Count the number of problems (for reference):
my $total;
if ($host) {
    $total = $numprob{"DOWN"} + $numprob{"UNREACHABLE"};
}
else {
    $total = $numprob{"WARNING"} + $numprob{"CRITICAL"} + $numprob{"UNKNOWN"};
}

my $numok = $nummatch - $total;

# If this is a host state check:
if ($host) {
    if ($numprob{"DOWN"} > 0 || $numprob{"UNREACHABLE"} > 0) {
        if ($details && ($total <= $details)) {
            print "State CRITICAL - $total Host Problems: $output{$down} $output{$unreach}\n";
            exitcheck($CRITICAL);
        }
        else {
            print "State CRITICAL - $numprob{$down} Hosts Down, $numprob{$unreach} Hosts Unreachable\n";
            exitcheck($CRITICAL);
        }
    }
    else {
        print "State OK - $numok Hosts Up, $total Problems\n";
        exitcheck($OK);
    }
}
# If you only defined a Critical level in terms of # of criticals...
elsif ($CritOnly && ($numprob{"CRITICAL"} >= $CritOnly)) {
    countAndPrint($crit, $numprob{$crit}, 0);
    exitcheck($CRITICAL);
}
# Critical in terms of # criticals and # warnings...
elsif (!$CritOnly && levelReached($critlevel, \%numprob)) {
    countAndPrint($crit, $total, 1);
    exitcheck($CRITICAL);
}
# Warning in terms of # warnings only...
elsif ($WarnOnly && ($numprob{"WARNING"} >= $WarnOnly)) {
    countAndPrint($warn, $numprob{$warn}, 0);
    exitcheck($WARNING);
}
# Warning in terms of # warnings and # criticals...
elsif (!$WarnOnly && levelReached($warnlevel, \%numprob)) {
    countAndPrint($warn, $total, 1);
    exitcheck($WARNING);
}
# Unknown in terms of # unknown only...
elsif ($UnkOnly && ($numprob{"UNKNOWN"} >= $UnkOnly)) {
    countAndPrint($unk, $numprob{$unk}, 0);
    exitcheck($UNKNOWN);
}
# Unknown in terms of # warning, critical, and unknown...
elsif (!$UnkOnly && levelReached($unklevel, \%numprob)) {
    countAndPrint($unk, $total, 1);
    exitcheck($UNKNOWN);
}
# Everything is OK!
else {
    print "State OK - $numok OK, $total problems\n";
    exitcheck($OK);
}

############################
# Subroutines
############################

# Return the proper exit code for Critical, Warning, Unknown, or OK.
# With --ok the plugin always exits 0, whatever code was requested.
sub exitcheck {
    my ($code) = @_;
    exit($ok ? 0 : $code);
}

# Parse a -w/-c/-u argument.
# Returns ($only, \%level):
#   no argument  -> $only = 1 (default: one problem of that state)
#   "N"          -> $only = N
#   "W,C[,U]"    -> $only = 0 and %level holds the per-state counts;
#                   a missing or zero entry stays 0 ("not considered").
sub parseThreshold {
    my ($spec) = @_;
    my %level = ("WARNING", 0, "CRITICAL", 0, "UNKNOWN", 0);

    return (1, \%level) unless $spec;
    return ($spec, \%level) unless $spec =~ /,/;

    my @parts  = split /,/, $spec;
    my @states = ("WARNING", "CRITICAL", "UNKNOWN");
    for my $i (0 .. $#states) {
        $level{ $states[$i] } = $parts[$i] if $parts[$i];
    }
    return (0, \%level);
}

# True when any per-state problem count has reached its configured level.
# Levels of 0 are skipped: comparing "count >= 0" would always succeed and
# make every two-value threshold (e.g. -c 5,2) fire unconditionally.
sub levelReached {
    my ($level, $count) = @_;
    for my $state ("WARNING", "CRITICAL", "UNKNOWN") {
        my $limit = $level->{$state};
        return 1 if $limit && $count->{$state} >= $limit;
    }
    return 0;
}

# Decide what to print for services:
#   $state    - state name being reported (CRITICAL/WARNING/UNKNOWN)
#   $count    - number of problems that triggered the state
#   $alltypes - true when the report covers all three problem states
sub countAndPrint {
    my ($state, $count, $alltypes) = @_;
    my $output = "State $state - ";

    if ($details) {
        if ($count <= $much_details) {
            # Few enough problems for full details
            if ($alltypes) {
                $output .= "$count problems: $much_output{$crit} $much_output{$warn} $much_output{$unk}";
            }
            else {
                $output .= "$count \L$state\E: $much_output{$state}";
            }
        }
        elsif ($count <= $details) {
            # Few enough problems for the service/host pair list
            if ($alltypes) {
                $output .= "$count problems: $output{$crit} $output{$warn} $output{$unk}";
            }
            else {
                $output .= "$count \L$state\E: $output{$state}";
            }
        }
        else {
            # Too many problems: counts only
            if ($alltypes) {
                $output .= "$numprob{$crit} critical, $numprob{$warn} warning, $numprob{$unk} unknown";
            }
            else {
                $output .= "$count \L$state\E";
            }
        }
    }
    else {
        $output .= "$count problems";
    }
    print "$output\n";
}

# Add-in the problem found in the status log.
#   $line  - the raw status log line
#   $state - the state found on that line
# The --ack/--dt/--hostdown settings may drop the problem ("ok") or make
# it count as a different state.
sub addproblem {
    my ($line, $state) = @_;
    my $counted = 1;
    my $type    = $state;
    my $diffout = "";
    my @values  = split /;/, $line;

    if (!$host) {
        # Service problem
        my $namehold = $values[1];
        if ($ack && ($values[13] eq "1")) {
            if ($ack eq "ok") { $counted = 0; }
            else              { $type = uc $ack; }
        }
        # Exact host name lookup: a regex match would let "web1" match a
        # down "web10" and treat "." in names as a wildcard.
        elsif ($hdown && $host_is_down{$namehold}) {
            if ($hdown eq "ok") {
                $counted = 0;
            }
            else {
                $type    = uc $hdown;
                $diffout = "$values[1] is down";
            }
        }
        elsif ($dt && (($values[27] ne "0") || $host_in_downtime{$namehold})) {
            if ($dt eq "ok") { $counted = 0; }
            else             { $type = uc $dt; }
        }
        elsif (exists $hostoktimes{$namehold}) {
            # If the state change time of the host is more recent than the
            # last service check, must wait until the next service check
            # runs!
            if ($hostoktimes{$namehold} > $values[6]) {
                $counted = 0;
            }
        }
    }
    else {
        # Host problem
        if ($ack && $values[5]) {
            if ($ack eq "ok") { $counted = 0; }
            else              { $type = uc $ack; }
        }
        elsif ($dt && ($values[17] ne "0")) {
            if ($dt eq "ok") { $counted = 0; }
            else             { $type = uc $dt; }
        }
    }

    if ($details && $counted) {
        if (!$host) {
            if ($diffout) {
                $much_output{$type} .= " $diffout;";
                $output{$type}      .= "$diffout;";
                $much_ct{$type}++;
                $ct{$type}++;
            }
            else {
                if ($much_details && $much_ct{$type} < $much_details) {
                    $much_output{$type} .= " $values[2] on $values[1] $values[31];";
                    $much_ct{$type}++;
                }
                if ($ct{$type} < $details) {
                    $output{$type} .= " $values[2] on $values[1];";
                    $ct{$type}++;
                }
            }
        }
        else {
            $much_output{$type} .= " $values[1] $state $values[20],";
            $much_ct{$type}++;
            $output{$type} .= " $values[1] HOST $state,";
            $ct{$type}++;
        }
    }

    if ($counted) {
        $numprob{$type}++;
    }
}

################################
#
# Version and Help Information
#
################################

# Short usage reminder printed for an unusable command line.
# NOTE(review): the <...> placeholder names were reconstructed -- confirm
# the wording against the upstream plugin.
sub printUsage {
    print "Usage: $0 -s <status_file> | -d <nagios_log_dir>\n";
    print "Use the --help option for full list of arguments\n";
}

# NOTE(review): banner wording reconstructed from the file header --
# confirm against the upstream plugin.
sub printVersion {
    print <<EOF;
$0 Nagios Plugin - Version 1.3
Last Updated: 1/9/2003
EOF
}

sub printHelp {
    print <<EOF;

Usage: $0 -s <status_file> | -d <nagios_log_dir>
       [-w #[,#][,#]] [-c #[,#][,#]] [-u #[,#][,#]]
       [--service=<RegEx> | --servhost=<RegEx> | --pattern=<RegEx> |
        --host | --host=<RegEx>]
       [--ack[=string]] [--dt[=string]] [--hostdown[=string]]
       [-D #[,#]] [--ok] [-f <minutes>]
       $0 --help
       $0 --version

NOTE: One of -s and -d must be specified

Options:
 -s, --status=FILE_NAME
    Location and name of status log (e.g. /usr/local/nagios/var/status.log)
 -d, --dir=DIRECTORY_NAME
    Directory that contains the nagios logs (e.g. /usr/local/nagios/var/)
 -w, --warning=INTEGER[,INTEGER][,INTEGER]
    #: Number of warnings to result in a WARNING state
    OR
    #,#: Warning,Criticals to result in a WARNING state
    OR
    #,#,#: Warning,Critical,Unknown to result in a WARNING state
    Default: -w=1
 -c, --critical=INTEGER[,INTEGER][,INTEGER]
    #: Number of criticals to result in a CRITICAL state
    OR
    #,#: Warning,Criticals to result in a CRITICAL state
    OR
    #,#,#: Warning,Critical,Unknown to result in a CRITICAL state
    Default: -c=1
 -u, --unknown=INTEGER[,INTEGER][,INTEGER]
    #: Number of unknowns to result in a UNKNOWN state
    OR
    #,#: Warning,Criticals to result in a UNKNOWN state
    OR
    #,#,#: Warning,Critical,Unknown to result in a UNKNOWN state
    Default: -u=1
 -S, --service[=REGEX]
    Only match services [that match the RegEx]
    (--service is default setting if no other matching arguments provided)
 --servhost=REGEX
    Only match services whose host match the RegEx
 -p, --pattern=REGEX
    Only parse for this regular expression (services only, not hosts)
 -H, --host[=REGEX]
    Report on the state of hosts (whose name matches the RegEx if provided)
 -a, --ack[=ok|warning|critical|unknown|down|unreachable]
    Handle Acknowledged problems [--ack defaults to ok]
 --dt, --downtime[=ok|warning|critical|unknown|down|unreachable]
    Handle problems in scheduled downtime [--dt defaults to ok]
 --hd, --hostdown[=ok|warning|critical|unknown|down|unreachable]
    Handle services whose Host is down [--hd defaults to ok]
 -D, --details=INTEGER[,INTEGER]
    Amount of verbosity to output
    If # problems:
       <= 1st integer, return full details (each plugin's output)
       <= 2nd integer, return some details (list each service host pair)
       > 2nd integer, return the # of problems
 -f, --freshness=INTEGER
    Number of minutes old the log can be to make sure Nagios is running
    (Default = 30 minutes)
 --ok
    Return an OK exit code, regardless of number of problems found
 -h, --help
    Print detailed help screen
 -V, --version
    Print version information

For service checking (use --service and/or --servhost):
1. The values of warning, critical, and unknown default to 1, i.e.
$0 will return CRITICAL if there is at least 1 critical service,
WARNING if there is at least 1 warning service, and UNKNOWN if there is
at least one unknown service.

2. If a service's host is DOWN or UNREACHABLE, $0 will use the
value of --hostdown to determine how to treat the service. Without that
argument, $0 will count the service as OK.

3. If a service's host is OK, but the last host-state change occurred more
recently than the last service check, $0 will ignore that service
(want to wait until the service has been checked after a host has recovered
or you may get service alert for services that still need to be checked)

4. If the --dt, --ack, or --hd tags are used, $0 will use the value
of the arguments to determine how to handle services in downtime,
acknowledged, or with down hosts (default=OK). For service checks, --dt
will also check if the service's host is in a downtime.

For host checking (use --host):
1. Using the --host argument, $0 will look for DOWN and UNREACHABLE
hosts. If any are found, $0 will return a CRITICAL. You can provide
an REGEX for --host to only check hosts with matching host names.

2. If the --dt or --ack tags are used, $0 will use the value of the
--dt/--ack arguments to determine the state of the host (default is OK)

EOF
}