#!/usr/bin/perl -w

# check_status.pl Nagios Plugin - Version 1.3
# Last Updated: 1/9/2003
#
# Report any bugs/questions to Russell Scibetti at russell@quadrix.com
#
# Provenance: added to nagiosplug trunk as contrib/check_remote_nagios_status.pl
# (commit 07fe1d77c03173f0291da02360a806260542b559, svn r300, "more contribs",
# Subhendu Ghosh, Sun, 9 Feb 2003 14:16:29 +0000).
#
# check_status Change Log:
#
# To do for 1.4
# - Better help and documentation (separate doc?)
# - Take argument (patterns to match) from a separate spec file
#
# New Addition to 1.3
# - Added ChangeLog information and updated --help output
# - hostdown (hd) argument for how a service check should respond
#   when its host is Down/Unreachable
#   (--hostdown="ok|warning|critical|unknown")
# - Changed name from check_state to check_status
# - Set hostdown to default to OK when the argument isn't specified
# - Number of Hosts checked is now output in OK result
#
# Version 1.2 additions:
#
# - Added ability to handle ack'd and downtimed services differently
#   depending on argument provided
#   (--ack="ok|warning|critical|unknown|down|unreachable"
#    --dt="ok|warning|critical|unknown|down|unreachable")
#
# Version 1.1 additions:
#
# - Added --host=<regex>, --servhost=<regex>, --service=<regex> to allow
#   for specific field matching (host for matching hostname in host checks,
#   servhost for matching the hostname in service checks, service for
#   matching the service name in service checks)
# - Output the number of OK services for an OK output
#
# Version 1.0 features:
#
# - Freshness check of status.log (timestamp)
# - Match service or host checks
# - Can ignore acknowledged or downtimes services/hosts (--ack, --dt)
# - Can output different levels of detail dependent on # of problems
# - Can check for number of critical, warning, or unknowns
#
#############################################################

use strict;
use warnings;

use Getopt::Long;
use File::stat;

Getopt::Long::Configure('bundling');

# Command line options (see printHelp() for their meaning).
# $verbose is accepted for compatibility but not used anywhere.
my ($version, $help, $verbose, $warning, $critical, $unknown, $pattern,
    $service, $status, $dir, $details, $host, $freshness, $servhost,
    $ack, $dt, $hdown, $ok);

GetOptions
    ("V"     => \$version,   "version"     => \$version,
     "h"     => \$help,      "help"        => \$help,
     "v"     => \$verbose,   "verbose"     => \$verbose,
     "w=s"   => \$warning,   "warning=s"   => \$warning,
     "c=s"   => \$critical,  "critical=s"  => \$critical,
     "u=s"   => \$unknown,   "unknown=s"   => \$unknown,
     "p=s"   => \$pattern,   "pattern=s"   => \$pattern,
     "S:s"   => \$service,   "service:s"   => \$service,
     "s=s"   => \$status,    "status=s"    => \$status,
     "d=s"   => \$dir,       "dir=s"       => \$dir,
     "D=s"   => \$details,   "details=s"   => \$details,
     "H:s"   => \$host,      "host:s"      => \$host,
     "f=s"   => \$freshness, "freshness=s" => \$freshness,
     "servhost=s" => \$servhost,
     "a:s"   => \$ack,       "ack:s"       => \$ack,
     "dt:s"  => \$dt,        "downtime:s"  => \$dt,
     "hd:s"  => \$hdown,     "hostdown:s"  => \$hdown,
     "ok"    => \$ok);

# Constants: plugin exit codes
my $OK       = 0;
my $WARNING  = 1;
my $CRITICAL = 2;
my $UNKNOWN  = 3;

# State names as they appear in status.log
my $crit    = "CRITICAL";
my $warn    = "WARNING";
my $unk     = "UNKNOWN";
my $down    = "DOWN";
my $unreach = "UNREACHABLE";

# Print out Help information
if ($help) {
    printVersion();
    printHelp();
    exitcheck($UNKNOWN);
}

# Print out version information
if ($version) {
    printVersion();
    exitcheck($UNKNOWN);
}

# Check for status log or directory argument or print usage
if (!$status) {
    if (!$dir) {
        print "Usage: $0 -s <status file> | -d <Nagios log directory>\n";
        print "Use the --help option for full list of arguments\n";
        exitcheck($UNKNOWN);
    }
    elsif ($dir =~ m#[^/]/$#) {
        $status = $dir . "status.log";
    }
    else {
        $status = $dir . "/status.log";
    }
}

# Regex that matches any host/service name.
my $MATCH_ALL = "[^\\s]*";

# --host given without a value: report on every host.
if (defined $host && !$host) {
    $host = $MATCH_ALL;
}

# Service mode: default the field filters to "match everything".
# When ONLY --pattern is given the defaults are skipped, otherwise the
# always-true field match would make the pattern a no-op.
if (!$host && (!$pattern || defined $servhost || defined $service)) {
    $servhost = $MATCH_ALL if !$servhost;
    $service  = $MATCH_ALL if !$service;
}

# Normalize the "how to treat ..." options. --hostdown defaults to "ok"
# even when not specified, as documented in the change log and help text.
$ack   = normalize_state_option("ack", $ack) if defined $ack;
$dt    = normalize_state_option("dt",  $dt)  if defined $dt;
$hdown = defined $hdown ? normalize_state_option("hostdown", $hdown) : "ok";

my $much_details = 0;

my $ServiceNotOK = "CRITICAL|WARNING|UNKNOWN";
my $HostNotOK    = "DOWN|UNREACHABLE";

# Number of problems found, per (possibly overridden) state
my %numprob = ("WARNING",0,"CRITICAL",0,"UNKNOWN",0,"DOWN",0,"UNREACHABLE",0);

# Thresholds. For each result level, either a single count of that same
# state ("Only" form) or per-state counts (level hash; 0 = not used).
my ($WarnOnly, $warnlevel) = parse_threshold($warning);
my ($CritOnly, $critlevel) = parse_threshold($critical);
my ($UnkOnly,  $unklevel)  = parse_threshold($unknown);

# Hosts in scheduled downtime / in a Down or Unreachable state.
# Hashes give exact-name lookups ("web1" must not match "web10").
my %host_in_downtime;
my %host_is_down;

# Hash for storing state-change to OK times for hosts:
my %hostoktimes;

# Number of matches in parsing
my $nummatch = 0;

# Freshness is given in minutes (default 30); compare in seconds.
$freshness = ($freshness ? $freshness : 30) * 60;

# Per-state counters and text for the two levels of detail output
my %ct      = ("CRITICAL",0,"WARNING",0,"UNKNOWN",0,"DOWN",0,"UNREACHABLE",0);
my %much_ct = ("CRITICAL",0,"WARNING",0,"UNKNOWN",0,"DOWN",0,"UNREACHABLE",0);

my %output      = ("CRITICAL","","WARNING","","UNKNOWN","","DOWN","","UNREACHABLE","");
my %much_output = ("CRITICAL","","WARNING","","UNKNOWN","","DOWN","","UNREACHABLE","");

# -D full,some : first number is the "full details" limit, second the
# "some details" limit. A single number is the "some details" limit.
if ($details) {
    if ($details =~ /,/) {
        my @tempv = split /,/,$details;
        $much_details = $tempv[0];
        $details      = $tempv[1];
    }
}

# A missing/unreadable status file is reported like a stale one: CRITICAL
# on stdout, so Nagios shows the reason instead of a bare die on stderr.
my $sta;
if (!open($sta, '<', $status)) {
    print "State CRITICAL - Cannot open status file $status: $!\n";
    exitcheck($CRITICAL);
}

my $curr_time = time;
my $file_stat = stat($status);
if (!$file_stat) {
    print "State CRITICAL - Cannot stat status file $status: $!\n";
    exitcheck($CRITICAL);
}
my $file_time = $file_stat->mtime;

if ($curr_time - $file_time > $freshness) {
    print "State CRITICAL - Status file is stale!!!\n";
    exitcheck($CRITICAL);
}

while (my $line = <$sta>) {
    chomp $line;
    if ($line =~ /^[^\s]+[\s]+HOST;/) {
        my @hdata = split /;/,$line;

        # If you care about matching hosts (not services):
        if ($host && $hdata[1] =~ /$host/) {
            $nummatch++;
            if ( $hdata[2] =~ /$HostNotOK/ ) {
                addproblem($line,$hdata[2]);
            }
        }

        # If you are matching services, gather host information:
        else {
            if ( $hdata[2] =~ /$HostNotOK/ ) {
                $host_is_down{$hdata[1]} = 1;
            }
            else {
                $hostoktimes{$hdata[1]} = $hdata[4];
            }
            if ( $hdata[17] ne "0" ) {
                $host_in_downtime{$hdata[1]} = 1;
            }
        }
    }
    elsif (!$host && $line =~ /^[^\s]+[\s]+SERVICE;/) {
        my @servdata = split /;/,$line;
        my $by_pattern = $pattern && ($line =~ /$pattern/);
        my $by_fields  = defined $servhost && defined $service
                      && ($servdata[1] =~ /$servhost/)
                      && ($servdata[2] =~ /$service/);
        if ($by_pattern || $by_fields) {
            $nummatch++;
            # Only HARD non-OK states count as problems.
            if (($servdata[5] eq "HARD") && ($servdata[3] =~ /$ServiceNotOK/)) {
                addproblem($line,$servdata[3]);
            }
        }
    }
}

close($sta);

if ($nummatch==0) {
    print "Nothing Matches your criteria!\n";
    exitcheck($UNKNOWN);
}

# Count the number of problems (for reference):
my $total;
if ($host) {
    $total = $numprob{"DOWN"} + $numprob{"UNREACHABLE"};
}
else {
    $total = $numprob{"WARNING"} + $numprob{"CRITICAL"} + $numprob{"UNKNOWN"};
}

my $numok = $nummatch - $total;

# If this is a host state check:
if ($host) {
    if ($numprob{"DOWN"}>0 || $numprob{"UNREACHABLE"}>0 ) {
        if ($details && ($total <= $details)) {
            print "State CRITICAL - $total Host Problems: $output{$down} $output{$unreach}\n";
            exitcheck($CRITICAL);
        }
        else {
            print "State CRITICAL - $numprob{$down} Hosts Down, $numprob{$unreach} Hosts Unreachable\n";
            exitcheck($CRITICAL);
        }
    }
    else {
        print "State OK - $numok Hosts Up, $total Problems\n";
        exitcheck($OK);
    }
}

#If you only defined a Critical level in terms of # of criticals...
elsif ($CritOnly && ($numprob{"CRITICAL"} >= $CritOnly)) {
    countAndPrint($crit,$numprob{$crit},0);
    exitcheck($CRITICAL);
}

#Critical in terms of # criticals and # warnings...
elsif (!$CritOnly && level_reached($critlevel)) {
    countAndPrint($crit,$total,1);
    exitcheck($CRITICAL);
}

#Warning in terms of # warnings only...
elsif ($WarnOnly && ($numprob{"WARNING"} >= $WarnOnly)) {
    countAndPrint($warn,$numprob{$warn},0);
    exitcheck($WARNING);
}

#Warning in terms of # warnings and # criticals...
elsif (!$WarnOnly && level_reached($warnlevel)) {
    countAndPrint($warn,$total,1);
    exitcheck($WARNING);
}

#Unknown in terms of # unknown only...
elsif ( $UnkOnly && ($numprob{"UNKNOWN"}>=$UnkOnly) ) {
    countAndPrint($unk,$numprob{$unk},0);
    exitcheck($UNKNOWN);
}

#Unknown in terms of # warning, critical, and unknown...
elsif (!$UnkOnly && level_reached($unklevel)) {
    countAndPrint($unk,$total,1);
    exitcheck($UNKNOWN);
}

# Everything is OK!
else {
    print "State OK - $numok OK, $total problems\n";
    exitcheck($OK);
}



############################
# Subroutines
############################

# Exit with the given plugin return code, or with 0 when --ok was given.
sub exitcheck {
    if ($ok) {
        exit 0;
    }
    else {
        exit $_[0];
    }
}

# Validate and normalize the value of --ack / --dt / --hostdown.
#   $name  - option name used in the error message
#   $value - value from the command line (empty when given without value)
# Returns the lower-cased state name; an empty value means "ok".
# Prints an error and exits UNKNOWN for anything that is not exactly one of
# ok|critical|warning|unknown|down|unreachable.
sub normalize_state_option {
    my ($name, $value) = @_;

    return "ok" if !$value;

    my $state = lc $value;
    if ($state !~ /\A(?:ok|critical|warning|unknown|down|unreachable)\z/) {
        print "Invalid value for $name\n";
        exitcheck($UNKNOWN);
    }
    return $state;
}

# Parse a -w / -c / -u argument.
#   $spec - undef/empty, "N", "W,C" or "W,C,U"
# Returns ($only, \%level):
#   $only  - count of the option's own state that triggers it (1 when the
#            option is absent, 0 when the multi-value form is used)
#   %level - per-state counts for the multi-value form; 0 = not used
sub parse_threshold {
    my ($spec) = @_;
    my %level = ("WARNING",0,"CRITICAL",0,"UNKNOWN",0);

    return (1, \%level)     if !$spec;
    return ($spec, \%level) if $spec !~ /,/;

    my @parts  = split /,/,$spec;
    my @states = ("WARNING","CRITICAL","UNKNOWN");
    for my $i (0 .. $#states) {
        # Omitted or zero entries stay 0 (disabled) instead of becoming a
        # threshold that every count satisfies.
        $level{$states[$i]} = $parts[$i] if $parts[$i];
    }
    return (0, \%level);
}

# True when any enabled (non-zero) per-state threshold in the given level
# hash is reached by the current problem counts.
sub level_reached {
    my ($level) = @_;

    for my $state ("WARNING","CRITICAL","UNKNOWN") {
        my $limit = $level->{$state};
        return 1 if $limit && $numprob{$state} >= $limit;
    }
    return 0;
}

# Decide what to print for services:
#   $state    - resulting state name (CRITICAL, WARNING or UNKNOWN)
#   $count    - number of problems that led to this state
#   $alltypes - true when the result is based on all problem types
sub countAndPrint {
    my ($state, $count, $alltypes) = @_;
    my $output = "State $state - ";

    if ($details) {
        # Few enough problems for full details (plugin output included)
        if ($count<=$much_details) {
            if ($alltypes) {
                $output .= "$count problems: $much_output{$crit} $much_output{$warn} $much_output{$unk}";
            }
            else {
                $output .= "$count \L$state\E: $much_output{$state}";
            }
        }
        # Few enough problems to list each service/host pair
        elsif ($count<=$details) {
            if ($alltypes) {
                $output .= "$count problems: $output{$crit} $output{$warn} $output{$unk}";
            }
            else {
                $output .= "$count \L$state\E: $output{$state}";
            }
        }
        # Too many problems: counts only
        else {
            if ($alltypes) {
                $output .= "$numprob{$crit} critical, $numprob{$warn} warning, $numprob{$unk} unknown";
            }
            else {
                $output .= "$count \L$state\E";
            }
        }
    }
    else {
        $output .= "$count problems";
    }

    print "$output\n";
}


# Add-in the problem found in the status log
#   $_[0] - the raw status.log line
#   $_[1] - the state found on that line
# The problem may be dropped or re-typed according to --ack, --hostdown
# and --dt before it is counted in %numprob and the detail output.
sub addproblem {
    my ($line, $state) = @_;

    my $test    = 1;         # cleared when the problem must not be counted
    my $type    = $state;    # state the problem is finally counted as
    my $diffout = "";

    my @values = split /;/,$line;

    if (!$host) {
        my $namehold = $values[1];
        if ($ack && ($values[13] eq "1")) {
            if ($ack eq "ok") {
                $test = 0;
            }
            else {
                $type = "\U$ack";
            }
        }
        elsif ($hdown && $host_is_down{$namehold}) {
            if ($hdown eq "ok") {
                $test = 0;
            }
            else {
                $type = "\U$hdown";
                $diffout = "$values[1] is down";
            }
        }
        elsif ($dt && (($values[27] ne "0") || $host_in_downtime{$namehold})) {
            if ($dt eq "ok") {
                $test = 0;
            }
            else {
                $type = "\U$dt";
            }
        }
        elsif (exists $hostoktimes{$namehold}) {
            # If the state change time of the host is more recent than the last
            # service check, must wait until the next service check runs!
            if ($hostoktimes{$namehold} > $values[6]) {
                $test = 0;
            }
        }
    }
    else {
        if ($ack && $values[5]) {
            if ($ack eq "ok") {
                $test = 0;
            }
            else {
                $type = "\U$ack";
            }
        }
        elsif ($dt && ($values[17] ne "0")) {
            if ($dt eq "ok") {
                $test = 0;
            }
            else {
                $type = "\U$dt";
            }
        }
    }

    if ($details && $test) {
        if (!$host) {
            if ($diffout) {
                $much_output{$type} .= " $diffout;";
                $output{$type} .= " $diffout;";
                $much_ct{$type}++;
                $ct{$type}++;
            }
            else {
                if ($much_details && $much_ct{$type}<$much_details) {
                    $much_output{$type} .= " $values[2] on $values[1] $values[31];";
                    $much_ct{$type}++;
                }
                if ($ct{$type} < $details) {
                    $output{$type} .= " $values[2] on $values[1];";
                    $ct{$type}++;
                }
            }
        }
        else {
            $much_output{$type} .= " $values[1] $state $values[20],";
            $much_ct{$type}++;
            $output{$type} .= " $values[1] HOST $state,";
            $ct{$type}++;
        }
    }
    if ($test) {
        $numprob{$type}++;
    }
}

################################
#
# Version and Help Information
#
################################

# Print the version banner.
sub printVersion {
    print <<"EndVersion";
$0 (check_status.pl Nagios plugin) version 1.3
Report any bugs/questions to Russell Scibetti at russell\@quadrix.com
EndVersion
}

# Print the full usage/help text.
sub printHelp {
    print <<"EOF";

This plugin parses the Nagios status log and returns a CRITICAL, WARNING
or UNKNOWN state depending on the number of problem services found
(or DOWN/UNREACHABLE hosts when checking hosts).

Usage: $0 -s <status file> | -d <Nagios log directory>
       [-w #[,#][,#]] [-c #[,#][,#]] [-u #[,#][,#]]
       [--service=<RegEx> | --servhost=<RegEx> | --pattern=<RegEx> |
        --host | --host=<RegEx>]
       [--ack[=string]] [--dt[=string]] [--hostdown[=string]]
       [-D #[,#]] [--ok] [-f <minutes>]
       $0 --help
       $0 --version
NOTE: One of -s and -d must be specified

Options:
 -s, --status=FILE_NAME
    Location and name of status log (e.g. /usr/local/nagios/var/status.log)
 -d, --dir=DIRECTORY_NAME
    Directory that contains the nagios logs (e.g. /usr/local/nagios/var/)
 -w, --warning=INTEGER[,INTEGER][,INTEGER]
    #: Number of warnings to result in a WARNING state
    OR
    #,#: Warning,Criticals to result in a WARNING state
    OR
    #,#,#: Warning,Critical,Unknown to result in a WARNING state
    Default: -w=1
 -c, --critical=INTEGER[,INTEGER][,INTEGER]
    #: Number of criticals to result in a CRITICAL state
    OR
    #,#: Warning,Criticals to result in a CRITICAL state
    OR
    #,#,#: Warning,Critical,Unknown to result in a CRITICAL state
    Default: -c=1
 -u, --unknown=INTEGER[,INTEGER][,INTEGER]
    #: Number of unknowns to result in a UNKNOWN state
    OR
    #,#: Warning,Criticals to result in a UNKNOWN state
    OR
    #,#,#: Warning,Critical,Unknown to result in a UNKNOWN state
    Default: -u=1
    (In the multi-value forms, a count of 0 or an omitted count disables
    that part of the test)
 -S, --service[=REGEX]
    Only match services [that match the RegEx]
    (--service is default setting if no other matching arguments provided)
 --servhost=REGEX
    Only match services whose host match the RegEx
 -p, --pattern=REGEX
    Only parse for this regular expression (services only, not hosts)
 -H, --host[=REGEX]
    Report on the state of hosts (whose name matches the RegEx if provided)
 -a, --ack[=ok|warning|critical|unknown|down|unreachable]
    Handle Acknowledged problems [--ack defaults to ok]
 --dt, --downtime[=ok|warning|critical|unknown|down|unreachable]
    Handle problems in scheduled downtime [--dt defaults to ok]
 --hd, --hostdown[=ok|warning|critical|unknown|down|unreachable]
    Handle services whose Host is down [--hd defaults to ok]
 -D, --details=INTEGER[,INTEGER]
    Amount of verbosity to output
    If # problems:
      <= 1st integer, return full details (each plugin's output)
      <= 2nd integer, return some details (list each service host pair)
      > 2nd integer, return the # of problems
 -f, --freshness=INTEGER
    Number of minutes old the log can be to make sure Nagios is running
    (Default = 30 minutes)
 --ok
    Return an OK exit code, regardless of number of problems found
 -h, --help
    Print detailed help screen
 -V, --version
    Print version information

For service checking (use --service and/or --servhost):
1. The values of warning, critical, and unknown default to 1, i.e.
$0 will return CRITICAL if there is at least 1 critical service,
WARNING if there is at least 1 warning service, and UNKNOWN if there is
at least one unknown service.

2. If a service's host is DOWN or UNREACHABLE, $0 will use the
value of --hostdown to determine how to treat the service.  Without that
argument, $0 will count the service as OK.

3. If a service's host is OK, but the last host-state change occurred more
recently than the last service check, $0 will ignore that service
(want to wait until the service has been checked after a host has recovered
or you may get service alert for services that still need to be checked)

4. If the --dt, --ack, or --hd tags are used, $0 will use the value
of the arguments to determine how to handle services in downtime, acknowledged,
or with down hosts (default=OK).  For service checks, --dt will also check
if the service's host is in a downtime.

For host checking (use --host):
1. Using the --host argument, $0 will look for DOWN and UNREACHABLE
hosts.  If any are found, $0 will return a CRITICAL.  You can provide
an REGEX for --host to only check hosts with matching host names.

2. If the --dt or --ack tags are used, $0 will use the value of the
--dt/--ack arguments to determine the state of the host (default is OK)

EOF
}