summaryrefslogtreecommitdiffstats
path: root/plugins/check_ntp.c
diff options
context:
space:
mode:
authorM. Sean Finney <seanius@users.sourceforge.net>2006-05-01 21:52:42 (GMT)
committerM. Sean Finney <seanius@users.sourceforge.net>2006-05-01 21:52:42 (GMT)
commite667553b97c358f5d80608f62c291deffc0328d5 (patch)
tree4e74625613f8d13f51752d84f03b436b45164ff7 /plugins/check_ntp.c
parent3b91dfbcb05dbfa9a1a20c6ba233c5d163e06821 (diff)
downloadmonitoring-plugins-e667553b97c358f5d80608f62c291deffc0328d5.tar.gz
- check_ntp:
- now roughly feature-complete. - various bugfixes, esp. offset calculation. - enhanced the asynchronous offset polling to mark requests that haven't received a response in >= 1 second as stale and retransmit them, which results in much better performance on unreliable networks. - we only spend timeout/2 seconds polling offsets, and if we don't get everything by that point we work with what we have and set status to warning/critical depending on how much data we have. - set the same defaults as the perl script. - commit changes to configure.in to support automatic building of check_apt (if apt-get is installed and regex libraries available) and check_ntp (unconditionally), now defaulting to check_ntp.c instead of the perl script. if this is an issue we can back out the commit of course. an eye should be kept on check_ntp building and running correctly in different environments, esp. 64-bit and big-endian platforms, and those with more "esoteric" APIs (do any of the platforms not have poll()?). - similar changes to the Makefile.am files. - common.h: add statement to include sys/poll.h - runcmd.c: exit STATE_UNKNOWN if execve() fails. git-svn-id: https://nagiosplug.svn.sourceforge.net/svnroot/nagiosplug/nagiosplug/trunk@1386 f882894a-f735-0410-b71e-b25c423dba1c
Diffstat (limited to 'plugins/check_ntp.c')
-rw-r--r--plugins/check_ntp.c220
1 files changed, 144 insertions, 76 deletions
diff --git a/plugins/check_ntp.c b/plugins/check_ntp.c
index 149ca98..655dd4f 100644
--- a/plugins/check_ntp.c
+++ b/plugins/check_ntp.c
@@ -29,16 +29,15 @@ const char *email = "nagiosplug-devel@lists.sourceforge.net";
29#include "common.h" 29#include "common.h"
30#include "netutils.h" 30#include "netutils.h"
31#include "utils.h" 31#include "utils.h"
32#include <sys/poll.h>
33 32
34static char *server_address=NULL; 33static char *server_address=NULL;
35static int verbose=0; 34static int verbose=0;
36static int zero_offset_bad=0; 35static int zero_offset_bad=0;
37static double owarn=0; 36static double owarn=60;
38static double ocrit=0; 37static double ocrit=120;
39static short do_jitter=0; 38static short do_jitter=0;
40static double jwarn=0; 39static double jwarn=5000;
41static double jcrit=0; 40static double jcrit=10000;
42 41
43int process_arguments (int, char **); 42int process_arguments (int, char **);
44void print_help (void); 43void print_help (void);
@@ -67,8 +66,11 @@ typedef struct {
67 66
68/* this structure holds data about results from querying offset from a peer */ 67/* this structure holds data about results from querying offset from a peer */
69typedef struct { 68typedef struct {
70 int waiting; /* we set to 1 to signal waiting for a response */ 69 time_t waiting; /* ts set when we started waiting for a response */
71 int num_responses; /* number of successfully recieved responses */ 70 int num_responses; /* number of successfully recieved responses */
71 uint8_t stratum; /* copied verbatim from the ntp_message */
72 double rtdelay; /* converted from the ntp_message */
73 double rtdisp; /* converted from the ntp_message */
72 double offset[AVG_NUM]; /* offsets from each response */ 74 double offset[AVG_NUM]; /* offsets from each response */
73} ntp_server_results; 75} ntp_server_results;
74 76
@@ -192,13 +194,12 @@ typedef struct {
192 194
193/* calculate the offset of the local clock */ 195/* calculate the offset of the local clock */
194static inline double calc_offset(const ntp_message *m, const struct timeval *t){ 196static inline double calc_offset(const ntp_message *m, const struct timeval *t){
195 double client_tx, peer_rx, peer_tx, client_rx, rtdelay; 197 double client_tx, peer_rx, peer_tx, client_rx;
196 client_tx = NTP64asDOUBLE(m->origts); 198 client_tx = NTP64asDOUBLE(m->origts);
197 peer_rx = NTP64asDOUBLE(m->rxts); 199 peer_rx = NTP64asDOUBLE(m->rxts);
198 peer_tx = NTP64asDOUBLE(m->txts); 200 peer_tx = NTP64asDOUBLE(m->txts);
199 client_rx=TVasDOUBLE((*t)); 201 client_rx=TVasDOUBLE((*t));
200 rtdelay=NTP32asDOUBLE(m->rtdelay); 202 return (.5*((peer_tx-client_rx)+(peer_rx-client_tx)));
201 return (.5*((peer_tx-client_rx)+(peer_rx-client_tx)))-rtdelay;
202} 203}
203 204
204/* print out a ntp packet in human readable/debuggable format */ 205/* print out a ntp packet in human readable/debuggable format */
@@ -279,14 +280,63 @@ void setup_request(ntp_message *p){
279 TVtoNTP64(t,p->txts); 280 TVtoNTP64(t,p->txts);
280} 281}
281 282
283/* select the "best" server from a list of servers, and return its index.
284 * this is done by filtering servers based on stratum, dispersion, and
285 * finally round-trip delay. */
286int best_offset_server(const ntp_server_results *slist, int nservers){
287 int i=0, j=0, cserver=0, candidates[5], csize=0;
288
289 /* for each server */
290 for(cserver=0; cserver<nservers; cserver++){
291 /* compare it to each of the servers already in the candidate list */
292 for(i=0; i<csize; i++){
293 /* does it have an equal or better stratum? */
294 if(slist[cserver].stratum <= slist[i].stratum){
295 /* does it have an equal or better dispersion? */
296 if(slist[cserver].rtdisp <= slist[i].rtdisp){
297 /* does it have a better rtdelay? */
298 if(slist[cserver].rtdelay < slist[i].rtdelay){
299 break;
300 }
301 }
302 }
303 }
304
305 /* if we haven't reached the current list's end, move everyone
306 * over one to the right, and insert the new candidate */
307 if(i<csize){
308 for(j=5; j>i; j--){
309 candidates[j]=candidates[j-1];
310 }
311 }
312 /* regardless, if they should be on the list... */
313 if(i<5) {
314 candidates[i]=cserver;
315 if(csize<5) csize++;
316 /* otherwise discard the server */
317 } else {
318 DBG(printf("discarding peer id %d\n", cserver));
319 }
320 }
321
322 if(csize>0) {
323 DBG(printf("best server selected: peer %d\n", candidates[0]));
324 return candidates[0];
325 } else {
326 DBG(printf("no peers meeting synchronization criteria :(\n"));
327 return -1;
328 }
329}
330
282/* do everything we need to get the total average offset 331/* do everything we need to get the total average offset
283 * - we use a certain amount of parallelization with poll() to ensure 332 * - we use a certain amount of parallelization with poll() to ensure
284 * we don't waste time sitting around waiting for single packets. 333 * we don't waste time sitting around waiting for single packets.
285 * - we also "manually" handle resolving host names and connecting, because 334 * - we also "manually" handle resolving host names and connecting, because
286 * we have to do it in a way that our lazy macros don't handle currently :( */ 335 * we have to do it in a way that our lazy macros don't handle currently :( */
287double offset_request(const char *host){ 336double offset_request(const char *host, int *status){
288 int i=0, j=0, ga_result=0, num_hosts=0, *socklist=NULL, respnum=0; 337 int i=0, j=0, ga_result=0, num_hosts=0, *socklist=NULL, respnum=0;
289 int servers_completed=0, one_written=0, servers_readable=0, offsets_recvd=0; 338 int servers_completed=0, one_written=0, servers_readable=0, best_index=-1;
339 time_t now_time=0, start_ts=0;
290 ntp_message *req=NULL; 340 ntp_message *req=NULL;
291 double avg_offset=0.; 341 double avg_offset=0.;
292 struct timeval recv_time; 342 struct timeval recv_time;
@@ -337,28 +387,24 @@ double offset_request(const char *host){
337 ai_tmp = ai_tmp->ai_next; 387 ai_tmp = ai_tmp->ai_next;
338 } 388 }
339 389
340 /* now do AVG_NUM checks to each host. */ 390 /* now do AVG_NUM checks to each host. we stop before timeout/2 seconds
341 while(servers_completed<num_hosts){ 391 * have passed in order to ensure post-processing and jitter time. */
342 392 now_time=start_ts=time(NULL);
343 /* write to any servers that are free and have done < AVG_NUM reqs */ 393 while(servers_completed<num_hosts && now_time-start_ts <= socket_timeout/2){
344 /* XXX we need some kind of ability to retransmit lost packets. 394 /* loop through each server and find each one which hasn't
345 * XXX one way would be replace "waiting" with a timestamp and 395 * been touched in the past second or so and is still lacking
346 * XXX if the timestamp is old enough the request is re-transmitted. 396 * some responses. for each of these servers, send a new request,
347 * XXX then a certain number of failures could mark a server as 397 * and update the "waiting" timestamp with the current time. */
348 * XXX bad, which is what i imagine that ntpdate does though
349 * XXX i can't confirm it (i think it still only sends a max
350 * XXX of AVG_NUM requests, but what does it do if one fails
351 * XXX but the others succeed? */
352 /* XXX also we need the ability to cut out failed/unresponsive
353 * XXX servers. currently after doing all other servers we
354 * XXX still wait for them until the bitter end/timeout. */
355 one_written=0; 398 one_written=0;
399 now_time=time(NULL);
400
356 for(i=0; i<num_hosts; i++){ 401 for(i=0; i<num_hosts; i++){
357 if(!servers[i].waiting && servers[i].num_responses<AVG_NUM){ 402 if(servers[i].waiting<now_time && servers[i].num_responses<AVG_NUM){
403 if(verbose && servers[i].waiting != 0) printf("re-");
358 if(verbose) printf("sending request to peer %d\n", i); 404 if(verbose) printf("sending request to peer %d\n", i);
359 setup_request(&req[i]); 405 setup_request(&req[i]);
360 write(socklist[i], &req[i], sizeof(ntp_message)); 406 write(socklist[i], &req[i], sizeof(ntp_message));
361 servers[i].waiting=1; 407 servers[i].waiting=now_time;
362 one_written=1; 408 one_written=1;
363 break; 409 break;
364 } 410 }
@@ -373,17 +419,22 @@ double offset_request(const char *host){
373 419
374 /* read from any sockets with pending data */ 420 /* read from any sockets with pending data */
375 for(i=0; servers_readable && i<num_hosts; i++){ 421 for(i=0; servers_readable && i<num_hosts; i++){
376 if(ufds[i].revents&POLLIN){ 422 if(ufds[i].revents&POLLIN && servers[i].num_responses < AVG_NUM){
377 if(verbose) { 423 if(verbose) {
378 printf("response from peer %d: ", i); 424 printf("response from peer %d: ", i);
379 } 425 }
426
380 read(ufds[i].fd, &req[i], sizeof(ntp_message)); 427 read(ufds[i].fd, &req[i], sizeof(ntp_message));
381 gettimeofday(&recv_time, NULL); 428 gettimeofday(&recv_time, NULL);
429 DBG(print_ntp_message(&req[i]));
382 respnum=servers[i].num_responses++; 430 respnum=servers[i].num_responses++;
383 servers[i].offset[respnum]=calc_offset(&req[i], &recv_time); 431 servers[i].offset[respnum]=calc_offset(&req[i], &recv_time);
384 if(verbose) { 432 if(verbose) {
385 printf("offset %g\n", servers[i].offset[respnum]); 433 printf("offset %.10g\n", servers[i].offset[respnum]);
386 } 434 }
435 servers[i].stratum=req[i].stratum;
436 servers[i].rtdisp=NTP32asDOUBLE(req[i].rtdisp);
437 servers[i].rtdelay=NTP32asDOUBLE(req[i].rtdelay);
387 servers[i].waiting=0; 438 servers[i].waiting=0;
388 servers_readable--; 439 servers_readable--;
389 if(servers[i].num_responses==AVG_NUM) servers_completed++; 440 if(servers[i].num_responses==AVG_NUM) servers_completed++;
@@ -392,15 +443,17 @@ double offset_request(const char *host){
392 /* lather, rinse, repeat. */ 443 /* lather, rinse, repeat. */
393 } 444 }
394 445
395 /* finally, calculate the average offset */ 446 /* now, pick the best server from the list */
396 /* XXX still something about the "top 5" */ 447 best_index=best_offset_server(servers, num_hosts);
397 for(i=0;i<num_hosts;i++){ 448 if(best_index < 0){
398 for(j=0;j<servers[i].num_responses;j++){ 449 *status=STATE_CRITICAL;
399 offsets_recvd++; 450 } else {
400 avg_offset+=servers[i].offset[j]; 451 /* finally, calculate the average offset */
452 for(i=0; i<servers[best_index].num_responses;i++){
453 avg_offset+=servers[best_index].offset[j];
401 } 454 }
455 avg_offset/=servers[best_index].num_responses;
402 } 456 }
403 avg_offset/=offsets_recvd;
404 457
405 /* cleanup */ 458 /* cleanup */
406 for(j=0; j<num_hosts; j++){ close(socklist[j]); } 459 for(j=0; j<num_hosts; j++){ close(socklist[j]); }
@@ -410,7 +463,7 @@ double offset_request(const char *host){
410 free(req); 463 free(req);
411 freeaddrinfo(ai); 464 freeaddrinfo(ai);
412 465
413 if(verbose) printf("overall average offset: %g\n", avg_offset); 466 if(verbose) printf("overall average offset: %.10g\n", avg_offset);
414 return avg_offset; 467 return avg_offset;
415} 468}
416 469
@@ -426,10 +479,11 @@ setup_control_request(ntp_control_message *p, uint8_t opcode, uint16_t seq){
426} 479}
427 480
428/* XXX handle responses with the error bit set */ 481/* XXX handle responses with the error bit set */
429double jitter_request(const char *host){ 482double jitter_request(const char *host, int *status){
430 int conn=-1, i, npeers=0, num_candidates=0, syncsource_found=0; 483 int conn=-1, i, npeers=0, num_candidates=0, syncsource_found=0;
431 int run=0, min_peer_sel=PEER_INCLUDED, num_selected=0, num_valid=0; 484 int run=0, min_peer_sel=PEER_INCLUDED, num_selected=0, num_valid=0;
432 ntp_assoc_status_pair *peers; 485 int peer_offset=0;
486 ntp_assoc_status_pair *peers=NULL;
433 ntp_control_message req; 487 ntp_control_message req;
434 double rval = 0.0, jitter = -1.0; 488 double rval = 0.0, jitter = -1.0;
435 char *startofvalue=NULL, *nptr=NULL; 489 char *startofvalue=NULL, *nptr=NULL;
@@ -449,27 +503,28 @@ double jitter_request(const char *host){
449 * 4) Extract the jitter value from the data[] (it's ASCII) 503 * 4) Extract the jitter value from the data[] (it's ASCII)
450 */ 504 */
451 my_udp_connect(server_address, 123, &conn); 505 my_udp_connect(server_address, 123, &conn);
452 setup_control_request(&req, OP_READSTAT, 1); 506
453 507 /* keep sending requests until the server stops setting the
454 DBG(printf("sending READSTAT request")); 508 * REM_MORE bit, though usually this is only 1 packet. */
455 write(conn, &req, SIZEOF_NTPCM(req)); 509 do{
456 DBG(print_ntp_control_message(&req)); 510 setup_control_request(&req, OP_READSTAT, 1);
457 /* Attempt to read the largest size packet possible 511 DBG(printf("sending READSTAT request"));
458 * Is it possible for an NTP server to have more than 117 synchronization 512 write(conn, &req, SIZEOF_NTPCM(req));
459 * sources? If so, we will receive a second datagram with additional 513 DBG(print_ntp_control_message(&req));
460 * peers listed, since 117 is the maximum number that can fit in a 514 /* Attempt to read the largest size packet possible */
461 * single NTP control datagram. This code doesn't handle that case */ 515 req.count=htons(MAX_CM_SIZE);
462 /* XXX check the REM_MORE bit */ 516 DBG(printf("recieving READSTAT response"))
463 req.count=htons(MAX_CM_SIZE); 517 read(conn, &req, SIZEOF_NTPCM(req));
464 DBG(printf("recieving READSTAT response")) 518 DBG(print_ntp_control_message(&req));
465 read(conn, &req, SIZEOF_NTPCM(req)); 519 /* Each peer identifier is 4 bytes in the data section, which
466 DBG(print_ntp_control_message(&req)); 520 * we represent as a ntp_assoc_status_pair datatype.
467 /* Each peer identifier is 4 bytes in the data section, which 521 */
468 * we represent as a ntp_assoc_status_pair datatype. 522 npeers+=(ntohs(req.count)/sizeof(ntp_assoc_status_pair));
469 */ 523 peers=(ntp_assoc_status_pair*)realloc(peers, sizeof(ntp_assoc_status_pair)*npeers);
470 npeers=ntohs(req.count)/sizeof(ntp_assoc_status_pair); 524 memcpy((void*)peers+peer_offset, (void*)req.data, sizeof(ntp_assoc_status_pair)*npeers);
471 peers=(ntp_assoc_status_pair*)malloc(sizeof(ntp_assoc_status_pair)*npeers); 525 peer_offset+=ntohs(req.count);
472 memcpy((void*)peers, (void*)req.data, sizeof(ntp_assoc_status_pair)*npeers); 526 } while(req.op&REM_MORE);
527
473 /* first, let's find out if we have a sync source, or if there are 528 /* first, let's find out if we have a sync source, or if there are
474 * at least some candidates. in the case of the latter we'll issue 529 * at least some candidates. in the case of the latter we'll issue
475 * a warning but go ahead with the check on them. */ 530 * a warning but go ahead with the check on them. */
@@ -484,13 +539,15 @@ double jitter_request(const char *host){
484 } 539 }
485 if(verbose) printf("%d candiate peers available\n", num_candidates); 540 if(verbose) printf("%d candiate peers available\n", num_candidates);
486 if(verbose && syncsource_found) printf("synchronization source found\n"); 541 if(verbose && syncsource_found) printf("synchronization source found\n");
487 /* XXX if ! syncsource_found set status to warning */ 542 if(! syncsource_found) *status = STATE_WARNING;
543
488 544
489 for (run=0; run<AVG_NUM; run++){ 545 for (run=0; run<AVG_NUM; run++){
490 if(verbose) printf("jitter run %d of %d\n", run+1, AVG_NUM); 546 if(verbose) printf("jitter run %d of %d\n", run+1, AVG_NUM);
491 for (i = 0; i < npeers; i++){ 547 for (i = 0; i < npeers; i++){
492 /* Only query this server if it is the current sync source */ 548 /* Only query this server if it is the current sync source */
493 if (PEER_SEL(peers[i].status) >= min_peer_sel){ 549 if (PEER_SEL(peers[i].status) >= min_peer_sel){
550 num_selected++;
494 setup_control_request(&req, OP_READVAR, 2); 551 setup_control_request(&req, OP_READVAR, 2);
495 req.assoc = peers[i].assoc; 552 req.assoc = peers[i].assoc;
496 /* By spec, putting the variable name "jitter" in the request 553 /* By spec, putting the variable name "jitter" in the request
@@ -514,11 +571,12 @@ double jitter_request(const char *host){
514 printf("parsing jitter from peer %.2x: ", peers[i].assoc); 571 printf("parsing jitter from peer %.2x: ", peers[i].assoc);
515 } 572 }
516 startofvalue = strchr(req.data, '=') + 1; 573 startofvalue = strchr(req.data, '=') + 1;
517 jitter = strtod(startofvalue, &nptr); 574 if(startofvalue != NULL) {
518 num_selected++; 575 jitter = strtod(startofvalue, &nptr);
519 if(jitter == 0 && startofvalue==nptr){ 576 }
520 printf("warning: unable to parse server response.\n"); 577 if(startofvalue == NULL || startofvalue==nptr){
521 /* XXX errors value ... */ 578 printf("warning: unable to read server jitter response.\n");
579 *status = STATE_WARNING;
522 } else { 580 } else {
523 if(verbose) printf("%g\n", jitter); 581 if(verbose) printf("%g\n", jitter);
524 num_valid++; 582 num_valid++;
@@ -527,7 +585,7 @@ double jitter_request(const char *host){
527 } 585 }
528 } 586 }
529 if(verbose){ 587 if(verbose){
530 printf("jitter parsed from %d/%d peers\n", num_selected, num_valid); 588 printf("jitter parsed from %d/%d peers\n", num_valid, num_selected);
531 } 589 }
532 } 590 }
533 591
@@ -637,9 +695,11 @@ int process_arguments(int argc, char **argv){
637} 695}
638 696
639int main(int argc, char *argv[]){ 697int main(int argc, char *argv[]){
640 int result = STATE_UNKNOWN; 698 int result, offset_result, jitter_result;
641 double offset=0, jitter=0; 699 double offset=0, jitter=0;
642 700
701 result=offset_result=jitter_result=STATE_UNKNOWN;
702
643 if (process_arguments (argc, argv) == ERROR) 703 if (process_arguments (argc, argv) == ERROR)
644 usage4 (_("Could not parse arguments")); 704 usage4 (_("Could not parse arguments"));
645 705
@@ -649,14 +709,15 @@ int main(int argc, char *argv[]){
649 /* set socket timeout */ 709 /* set socket timeout */
650 alarm (socket_timeout); 710 alarm (socket_timeout);
651 711
652 offset = offset_request(server_address); 712 offset = offset_request(server_address, &offset_result);
653 if(offset > ocrit){ 713 if(fabs(offset) > ocrit){
654 result = STATE_CRITICAL; 714 result = STATE_CRITICAL;
655 } else if(offset > owarn) { 715 } else if(fabs(offset) > owarn) {
656 result = STATE_WARNING; 716 result = STATE_WARNING;
657 } else { 717 } else {
658 result = STATE_OK; 718 result = STATE_OK;
659 } 719 }
720 result=max_state(result, offset_result);
660 721
661 /* If not told to check the jitter, we don't even send packets. 722 /* If not told to check the jitter, we don't even send packets.
662 * jitter is checked using NTP control packets, which not all 723 * jitter is checked using NTP control packets, which not all
@@ -664,7 +725,7 @@ int main(int argc, char *argv[]){
664 * (for example) will result in an error 725 * (for example) will result in an error
665 */ 726 */
666 if(do_jitter){ 727 if(do_jitter){
667 jitter=jitter_request(server_address); 728 jitter=jitter_request(server_address, &jitter_result);
668 if(jitter > jcrit){ 729 if(jitter > jcrit){
669 result = max_state(result, STATE_CRITICAL); 730 result = max_state(result, STATE_CRITICAL);
670 } else if(jitter > jwarn) { 731 } else if(jitter > jwarn) {
@@ -675,6 +736,7 @@ int main(int argc, char *argv[]){
675 result = STATE_UNKNOWN; 736 result = STATE_UNKNOWN;
676 } 737 }
677 } 738 }
739 result=max_state(result, jitter_result);
678 740
679 switch (result) { 741 switch (result) {
680 case STATE_CRITICAL : 742 case STATE_CRITICAL :
@@ -690,9 +752,15 @@ int main(int argc, char *argv[]){
690 printf("NTP UNKNOWN: "); 752 printf("NTP UNKNOWN: ");
691 break; 753 break;
692 } 754 }
693 755 if(offset_result==STATE_CRITICAL){
694 printf("Offset %g secs|offset=%g", offset, offset); 756 printf("Offset unknown|offset=unknown");
695 if (do_jitter) printf("|jitter=%f", jitter); 757 } else {
758 if(offset_result==STATE_WARNING){
759 printf("Unable to fully sample sync server. ");
760 }
761 printf("Offset %.10g secs|offset=%.10g", offset, offset);
762 }
763 if (do_jitter) printf(", jitter=%f", jitter);
696 printf("\n"); 764 printf("\n");
697 765
698 if(server_address!=NULL) free(server_address); 766 if(server_address!=NULL) free(server_address);