[Nagiosplug-checkins] nagiosplug/plugins Makefile.am,1.65,1.66 check_ntp.c,1.6,1.7 common.h,1.18,1.19 runcmd.c,1.2,1.3

M. Sean Finney seanius at users.sourceforge.net
Mon May 1 14:53:06 CEST 2006


Update of /cvsroot/nagiosplug/nagiosplug/plugins
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv6344/plugins

Modified Files:
	Makefile.am check_ntp.c common.h runcmd.c 
Log Message:
- check_ntp:
  - now roughly feature-complete.
  - various bugfixes, esp. offset calculation.
  - enhanced the asynchronous offset polling to set requests that
    haven't received a response in >= 1 second to stale and retransmit them,
    which results in much better performance on unreliable networks.
  - we only spend timeout/2 seconds polling offsets, and if we don't get
    everything by that point we work with what we have and set status to
    warning/critical depending on how much data we have.
  - set the same defaults as the perl script.
- commit changes to configure.in to support automatic building of check_apt
  (if apt-get is installed and regex libraries available) and check_ntp
  (unconditionally), now defaulting to check_ntp.c instead of the perl script.
  if this is an issue we can back out the commit of course.  an eye
  should be kept on check_ntp building and running correctly in different
  environments, esp. 64-bit and big-endian platforms, and those with more
  "esoteric" APIs (do any of the platforms not have poll()?).
- similar changes to Makefile.am's.
- common.h: add statement to include sys/poll.h
- runcmd.c: exit STATE_UNKNOWN if execve() fails.


Index: check_ntp.c
===================================================================
RCS file: /cvsroot/nagiosplug/nagiosplug/plugins/check_ntp.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -d -r1.6 -r1.7
--- check_ntp.c	12 Apr 2006 10:00:23 -0000	1.6
+++ check_ntp.c	1 May 2006 21:52:42 -0000	1.7
@@ -29,16 +29,15 @@
 #include "common.h"
 #include "netutils.h"
 #include "utils.h"
-#include <sys/poll.h>
 
 static char *server_address=NULL;
 static int verbose=0;
 static int zero_offset_bad=0;
-static double owarn=0;
-static double ocrit=0;
+static double owarn=60;
+static double ocrit=120;
 static short do_jitter=0;
-static double jwarn=0;
-static double jcrit=0;
+static double jwarn=5000;
+static double jcrit=10000;
 
 int process_arguments (int, char **);
 void print_help (void);
@@ -67,8 +66,11 @@
 
 /* this structure holds data about results from querying offset from a peer */
 typedef struct {
-	int waiting;            /* we set to 1 to signal waiting for a response */  
+	time_t waiting;         /* ts set when we started waiting for a response */ 
 	int num_responses;      /* number of successfully recieved responses */
+	uint8_t stratum;        /* copied verbatim from the ntp_message */
+	double rtdelay;         /* converted from the ntp_message */
+	double rtdisp;          /* converted from the ntp_message */
 	double offset[AVG_NUM]; /* offsets from each response */
 } ntp_server_results;
 
@@ -192,13 +194,12 @@
 
 /* calculate the offset of the local clock */
 static inline double calc_offset(const ntp_message *m, const struct timeval *t){
-	double client_tx, peer_rx, peer_tx, client_rx, rtdelay;
+	double client_tx, peer_rx, peer_tx, client_rx;
 	client_tx = NTP64asDOUBLE(m->origts);
 	peer_rx = NTP64asDOUBLE(m->rxts);
 	peer_tx = NTP64asDOUBLE(m->txts);
 	client_rx=TVasDOUBLE((*t));
-	rtdelay=NTP32asDOUBLE(m->rtdelay);
-	return (.5*((peer_tx-client_rx)+(peer_rx-client_tx)))-rtdelay;
+	return (.5*((peer_tx-client_rx)+(peer_rx-client_tx)));
 }
 
 /* print out a ntp packet in human readable/debuggable format */
@@ -279,14 +280,63 @@
 	TVtoNTP64(t,p->txts);
 }
 
+/* select the "best" server from a list of servers, and return its index.
+ * this is done by filtering servers based on stratum, dispersion, and
+ * finally round-trip delay. */
+int best_offset_server(const ntp_server_results *slist, int nservers){
+	int i=0, j=0, cserver=0, candidates[5], csize=0;
+
+	/* for each server */
+	for(cserver=0; cserver<nservers; cserver++){
+		/* compare it to each of the servers already in the candidate list */
+		for(i=0; i<csize; i++){
+			/* does it have an equal or better stratum? */
+			if(slist[cserver].stratum <= slist[i].stratum){
+				/* does it have an equal or better dispersion? */
+				if(slist[cserver].rtdisp <= slist[i].rtdisp){
+					/* does it have a better rtdelay? */
+					if(slist[cserver].rtdelay < slist[i].rtdelay){
+						break;
+					}
+				}
+			}
+		}
+
+		/* if we haven't reached the current list's end, move everyone
+		 * over one to the right, and insert the new candidate */
+		if(i<csize){
+			for(j=5; j>i; j--){
+				candidates[j]=candidates[j-1];
+			}
+		}
+		/* regardless, if they should be on the list... */
+		if(i<5) {
+			candidates[i]=cserver;
+			if(csize<5) csize++;
+		/* otherwise discard the server */
+		} else {
+			DBG(printf("discarding peer id %d\n", cserver));
+		}
+	}
+
+	if(csize>0) {
+		DBG(printf("best server selected: peer %d\n", candidates[0]));
+		return candidates[0];
+	} else {
+		DBG(printf("no peers meeting synchronization criteria :(\n"));
+		return -1;
+	}
+}
+
 /* do everything we need to get the total average offset
  * - we use a certain amount of parallelization with poll() to ensure
  *   we don't waste time sitting around waiting for single packets. 
  * - we also "manually" handle resolving host names and connecting, because
  *   we have to do it in a way that our lazy macros don't handle currently :( */
-double offset_request(const char *host){
+double offset_request(const char *host, int *status){
 	int i=0, j=0, ga_result=0, num_hosts=0, *socklist=NULL, respnum=0;
-	int servers_completed=0, one_written=0, servers_readable=0, offsets_recvd=0;
+	int servers_completed=0, one_written=0, servers_readable=0, best_index=-1;
+	time_t now_time=0, start_ts=0;
 	ntp_message *req=NULL;
 	double avg_offset=0.;
 	struct timeval recv_time;
@@ -337,28 +387,24 @@
 		ai_tmp = ai_tmp->ai_next;
 	}
 
-	/* now do AVG_NUM checks to each host. */
-	while(servers_completed<num_hosts){
-
-		/* write to any servers that are free and have done < AVG_NUM reqs */
-		/* XXX we need some kind of ability to retransmit lost packets.
-		 * XXX one way would be replace "waiting" with a timestamp and
-		 * XXX if the timestamp is old enough the request is re-transmitted.
-		 * XXX then a certain number of failures could mark a server as
-		 * XXX bad, which is what i imagine that ntpdate does though
-		 * XXX i can't confirm it (i think it still only sends a max
-		 * XXX of AVG_NUM requests, but what does it do if one fails
-		 * XXX but the others succeed? */
-		/* XXX also we need the ability to cut out failed/unresponsive
-		 * XXX servers.  currently after doing all other servers we
-		 * XXX still wait for them until the bitter end/timeout. */
+	/* now do AVG_NUM checks to each host.  we stop before timeout/2 seconds
+	 * have passed in order to ensure post-processing and jitter time. */
+	now_time=start_ts=time(NULL);
+	while(servers_completed<num_hosts && now_time-start_ts <= socket_timeout/2){
+		/* loop through each server and find each one which hasn't
+		 * been touched in the past second or so and is still lacking
+		 * some responses.  for each of these servers, send a new request,
+		 * and update the "waiting" timestamp with the current time. */
 		one_written=0;
+		now_time=time(NULL);
+
 		for(i=0; i<num_hosts; i++){
-			if(!servers[i].waiting && servers[i].num_responses<AVG_NUM){
+			if(servers[i].waiting<now_time && servers[i].num_responses<AVG_NUM){
+				if(verbose && servers[i].waiting != 0) printf("re-");
 				if(verbose) printf("sending request to peer %d\n", i);
 				setup_request(&req[i]);
 				write(socklist[i], &req[i], sizeof(ntp_message));
-				servers[i].waiting=1;
+				servers[i].waiting=now_time;
 				one_written=1;
 				break;
 			}
@@ -373,17 +419,22 @@
 
 		/* read from any sockets with pending data */
 		for(i=0; servers_readable && i<num_hosts; i++){
-			if(ufds[i].revents&POLLIN){
+			if(ufds[i].revents&POLLIN && servers[i].num_responses < AVG_NUM){
 				if(verbose) {
 					printf("response from peer %d: ", i);
 				}
+
 				read(ufds[i].fd, &req[i], sizeof(ntp_message));
 				gettimeofday(&recv_time, NULL);
+				DBG(print_ntp_message(&req[i]));
 				respnum=servers[i].num_responses++;
 				servers[i].offset[respnum]=calc_offset(&req[i], &recv_time);
 				if(verbose) {
-					printf("offset %g\n", servers[i].offset[respnum]);
+					printf("offset %.10g\n", servers[i].offset[respnum]);
 				}
+				servers[i].stratum=req[i].stratum;
+				servers[i].rtdisp=NTP32asDOUBLE(req[i].rtdisp);
+				servers[i].rtdelay=NTP32asDOUBLE(req[i].rtdelay);
 				servers[i].waiting=0;
 				servers_readable--;
 				if(servers[i].num_responses==AVG_NUM) servers_completed++;
@@ -392,15 +443,17 @@
 		/* lather, rinse, repeat. */
 	}
 
-	/* finally, calculate the average offset */
-	/* XXX still something about the "top 5" */
-	for(i=0;i<num_hosts;i++){
-		for(j=0;j<servers[i].num_responses;j++){
-			offsets_recvd++;
-			avg_offset+=servers[i].offset[j];
+	/* now, pick the best server from the list */
+	best_index=best_offset_server(servers, num_hosts);
+	if(best_index < 0){
+		*status=STATE_CRITICAL;
+	} else {
+		/* finally, calculate the average offset */
+		for(i=0; i<servers[best_index].num_responses;i++){
+			avg_offset+=servers[best_index].offset[j];
 		}
+		avg_offset/=servers[best_index].num_responses;
 	}
-	avg_offset/=offsets_recvd;
 
 	/* cleanup */
 	for(j=0; j<num_hosts; j++){ close(socklist[j]); }
@@ -410,7 +463,7 @@
 	free(req);
 	freeaddrinfo(ai);
 
-	if(verbose) printf("overall average offset: %g\n", avg_offset);
+	if(verbose) printf("overall average offset: %.10g\n", avg_offset);
 	return avg_offset;
 }
 
@@ -426,10 +479,11 @@
 }
 
 /* XXX handle responses with the error bit set */
-double jitter_request(const char *host){
+double jitter_request(const char *host, int *status){
 	int conn=-1, i, npeers=0, num_candidates=0, syncsource_found=0;
 	int run=0, min_peer_sel=PEER_INCLUDED, num_selected=0, num_valid=0;
-	ntp_assoc_status_pair *peers;
+	int peer_offset=0;
+	ntp_assoc_status_pair *peers=NULL;
 	ntp_control_message req;
 	double rval = 0.0, jitter = -1.0;
 	char *startofvalue=NULL, *nptr=NULL;
@@ -449,27 +503,28 @@
 	 * 4) Extract the jitter value from the data[] (it's ASCII)
 	 */
 	my_udp_connect(server_address, 123, &conn);
-	setup_control_request(&req, OP_READSTAT, 1);
 
-	DBG(printf("sending READSTAT request"));
-	write(conn, &req, SIZEOF_NTPCM(req));
-	DBG(print_ntp_control_message(&req));
-	/* Attempt to read the largest size packet possible
-	 * Is it possible for an NTP server to have more than 117 synchronization
-	 * sources?  If so, we will receive a second datagram with additional
-	 * peers listed, since 117 is the maximum number that can fit in a
-	 * single NTP control datagram.  This code doesn't handle that case */
-	/* XXX check the REM_MORE bit */
-	req.count=htons(MAX_CM_SIZE);
-	DBG(printf("recieving READSTAT response"))
-	read(conn, &req, SIZEOF_NTPCM(req));
-	DBG(print_ntp_control_message(&req));
-	/* Each peer identifier is 4 bytes in the data section, which
-	 * we represent as a ntp_assoc_status_pair datatype.
-	 */
-	npeers=ntohs(req.count)/sizeof(ntp_assoc_status_pair);
-	peers=(ntp_assoc_status_pair*)malloc(sizeof(ntp_assoc_status_pair)*npeers);
-	memcpy((void*)peers, (void*)req.data, sizeof(ntp_assoc_status_pair)*npeers);
+	/* keep sending requests until the server stops setting the
+	 * REM_MORE bit, though usually this is only 1 packet. */
+	do{
+		setup_control_request(&req, OP_READSTAT, 1);
+		DBG(printf("sending READSTAT request"));
+		write(conn, &req, SIZEOF_NTPCM(req));
+		DBG(print_ntp_control_message(&req));
+		/* Attempt to read the largest size packet possible */
+		req.count=htons(MAX_CM_SIZE);
+		DBG(printf("recieving READSTAT response"))
+		read(conn, &req, SIZEOF_NTPCM(req));
+		DBG(print_ntp_control_message(&req));
+		/* Each peer identifier is 4 bytes in the data section, which
+	 	 * we represent as a ntp_assoc_status_pair datatype.
+	 	 */
+		npeers+=(ntohs(req.count)/sizeof(ntp_assoc_status_pair));
+		peers=(ntp_assoc_status_pair*)realloc(peers, sizeof(ntp_assoc_status_pair)*npeers);
+		memcpy((void*)peers+peer_offset, (void*)req.data, sizeof(ntp_assoc_status_pair)*npeers);
+		peer_offset+=ntohs(req.count);
+	} while(req.op&REM_MORE);
+
 	/* first, let's find out if we have a sync source, or if there are
 	 * at least some candidates.  in the case of the latter we'll issue
 	 * a warning but go ahead with the check on them. */
@@ -484,13 +539,15 @@
 	}
 	if(verbose) printf("%d candiate peers available\n", num_candidates);
 	if(verbose && syncsource_found) printf("synchronization source found\n");
-	/* XXX if ! syncsource_found set status to warning */
+	if(! syncsource_found) *status = STATE_WARNING;
+
 
 	for (run=0; run<AVG_NUM; run++){
 		if(verbose) printf("jitter run %d of %d\n", run+1, AVG_NUM);
 		for (i = 0; i < npeers; i++){
 			/* Only query this server if it is the current sync source */
 			if (PEER_SEL(peers[i].status) >= min_peer_sel){
+				num_selected++;
 				setup_control_request(&req, OP_READVAR, 2);
 				req.assoc = peers[i].assoc;
 				/* By spec, putting the variable name "jitter"  in the request
@@ -514,11 +571,12 @@
 					printf("parsing jitter from peer %.2x: ", peers[i].assoc);
 				}
 				startofvalue = strchr(req.data, '=') + 1;
-				jitter = strtod(startofvalue, &nptr);
-				num_selected++;
-				if(jitter == 0 && startofvalue==nptr){
-					printf("warning: unable to parse server response.\n");
-					/* XXX errors value ... */
+				if(startofvalue != NULL) {
+					jitter = strtod(startofvalue, &nptr);
+				}
+				if(startofvalue == NULL || startofvalue==nptr){
+					printf("warning: unable to read server jitter response.\n");
+					*status = STATE_WARNING;
 				} else {
 					if(verbose) printf("%g\n", jitter);
 					num_valid++;
@@ -527,7 +585,7 @@
 			}
 		}
 		if(verbose){
-			printf("jitter parsed from %d/%d peers\n", num_selected, num_valid);
+			printf("jitter parsed from %d/%d peers\n", num_valid, num_selected);
 		}
 	}
 
@@ -637,9 +695,11 @@
 }
 
 int main(int argc, char *argv[]){
-	int result = STATE_UNKNOWN;
+	int result, offset_result, jitter_result;
 	double offset=0, jitter=0;
 
+	result=offset_result=jitter_result=STATE_UNKNOWN;
+
 	if (process_arguments (argc, argv) == ERROR)
 		usage4 (_("Could not parse arguments"));
 
@@ -649,14 +709,15 @@
 	/* set socket timeout */
 	alarm (socket_timeout);
 
-	offset = offset_request(server_address);
-	if(offset > ocrit){
+	offset = offset_request(server_address, &offset_result);
+	if(fabs(offset) > ocrit){
 		result = STATE_CRITICAL;
-	} else if(offset > owarn) {
+	} else if(fabs(offset) > owarn) {
 		result = STATE_WARNING;
 	} else {
 		result = STATE_OK;
 	}
+	result=max_state(result, offset_result);
 
 	/* If not told to check the jitter, we don't even send packets.
 	 * jitter is checked using NTP control packets, which not all
@@ -664,7 +725,7 @@
 	 * (for example) will result in an error
 	 */
 	if(do_jitter){
-		jitter=jitter_request(server_address);
+		jitter=jitter_request(server_address, &jitter_result);
 		if(jitter > jcrit){
 			result = max_state(result, STATE_CRITICAL);
 		} else if(jitter > jwarn) {
@@ -675,6 +736,7 @@
 			result = STATE_UNKNOWN;
 		}
 	}
+	result=max_state(result, jitter_result);
 
 	switch (result) {
 		case STATE_CRITICAL :
@@ -690,9 +752,15 @@
 			printf("NTP UNKNOWN: ");
 			break;
 	}
-
-	printf("Offset %g secs|offset=%g", offset, offset);
-	if (do_jitter) printf("|jitter=%f", jitter);
+	if(offset_result==STATE_CRITICAL){
+		printf("Offset unknown|offset=unknown");
+	} else {
+		if(offset_result==STATE_WARNING){
+			printf("Unable to fully sample sync server. ");
+		}
+		printf("Offset %.10g secs|offset=%.10g", offset, offset);
+	}
+	if (do_jitter) printf(", jitter=%f", jitter);
 	printf("\n");
 
 	if(server_address!=NULL) free(server_address);

Index: Makefile.am
===================================================================
RCS file: /cvsroot/nagiosplug/nagiosplug/plugins/Makefile.am,v
retrieving revision 1.65
retrieving revision 1.66
diff -u -d -r1.65 -r1.66
--- Makefile.am	21 Mar 2006 11:42:59 -0000	1.65
+++ Makefile.am	1 May 2006 21:52:42 -0000	1.66
@@ -13,8 +13,8 @@
 MATHLIBS = @MATHLIBS@
 AM_CFLAGS = -Wall
 
-libexec_PROGRAMS = check_disk check_dummy check_http check_load \
-	check_mrtg check_mrtgtraf check_nwstat check_overcr check_ping \
+libexec_PROGRAMS = check_apt check_disk check_dummy check_http check_load \
+	check_mrtg check_mrtgtraf check_ntp check_nwstat check_overcr check_ping \
 	check_real check_smtp check_ssh check_tcp check_time \
 	check_udp check_ups check_users negate \
 	urlize @EXTRAS@
@@ -25,7 +25,7 @@
 EXTRA_PROGRAMS = check_mysql check_radius check_pgsql check_snmp check_hpjd \
 	check_swap check_fping check_ldap check_game check_dig \
 	check_nagios check_by_ssh check_dns check_nt check_ide_smart	\
-	check_procs check_mysql_query
+	check_procs check_mysql_query check_apt
 
 EXTRA_DIST = t utils.c netutils.c sslutils.c popen.c utils.h netutils.h \
 	popen.h common.h getaddrinfo.c getaddrinfo.h \
@@ -51,6 +51,7 @@
 ##############################################################################
 # the actual targets
 
+check_apt_LDADD = $(BASEOBJS) runcmd.o
 check_dig_LDADD = $(NETLIBS) runcmd.o 
 check_disk_LDADD = $(BASEOBJS) popen.o
 check_dns_LDADD = $(NETLIBS) runcmd.o
@@ -71,6 +72,7 @@
 check_mysql_query_LDADD = $(NETLIBS) $(MYSQLLIBS)
 check_nagios_LDADD = $(BASEOBJS) runcmd.o
 check_nt_LDADD = $(NETLIBS) 
+check_ntp_LDADD = $(NETLIBS) $(MATHLIBS)
 check_nwstat_LDADD = $(NETLIBS)
 check_overcr_LDADD = $(NETLIBS)
 check_pgsql_LDADD = $(NETLIBS) $(PGLIBS)
@@ -92,6 +94,7 @@
 negate_LDADD = $(BASEOBJS) popen.o
 urlize_LDADD = $(BASEOBJS) popen.o
 
+check_apt_DEPENDENCIES = check_apt.c $(BASEOBJS) runcmd.o $(DEPLIBS)
 check_dig_DEPENDENCIES = check_dig.c $(NETOBJS) runcmd.o $(DEPLIBS)
 check_disk_DEPENDENCIES = check_disk.c $(BASEOBJS) popen.o $(DEPLIBS)
 check_dns_DEPENDENCIES = check_dns.c $(NETOBJS) runcmd.o $(DEPLIBS)
@@ -109,6 +112,7 @@
 check_mysql_query_DEPENDENCIES = check_mysql_query.c $(NETOBJS) $(DEPLIBS)
 check_nagios_DEPENDENCIES = check_nagios.c $(BASEOBJS) runcmd.o $(DEPLIBS)
 check_nt_DEPENDENCIES = check_nt.c $(NETOBJS) $(DEPLIBS)
+check_ntp_DEPENDENCIES = check_ntp.c $(NETOBJS) $(DEPLIBS)
 check_nwstat_DEPENDENCIES = check_nwstat.c $(NETOBJS) $(DEPLIBS)
 check_overcr_DEPENDENCIES = check_overcr.c $(NETOBJS) $(DEPLIBS)
 check_pgsql_DEPENDENCIES = check_pgsql.c $(NETOBJS)  $(DEPLIBS)

Index: runcmd.c
===================================================================
RCS file: /cvsroot/nagiosplug/nagiosplug/plugins/runcmd.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- runcmd.c	24 Oct 2005 11:10:29 -0000	1.2
+++ runcmd.c	1 May 2006 21:52:42 -0000	1.3
@@ -198,7 +198,7 @@
 				close (i);
 
 		execve (argv[0], argv, env);
-		_exit (0);
+		_exit (STATE_UNKNOWN);
 	}
 
 	/* parent picks up execution here */

Index: common.h
===================================================================
RCS file: /cvsroot/nagiosplug/nagiosplug/plugins/common.h,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -d -r1.18 -r1.19
--- common.h	7 Dec 2005 15:10:27 -0000	1.18
+++ common.h	1 May 2006 21:52:42 -0000	1.19
@@ -119,6 +119,10 @@
 # define SWAP_CONVERSION 1
 #endif
 
+#ifdef HAVE_SYS_POLL_H
+# include "sys/poll.h"
+#endif
+
 /*
  *
  * Missing Functions





More information about the Commits mailing list