@@ -0,0 +1,1163 @@
+#!/usr/bin/perl -w
+# vim:ts=4
+# check_esx Version 3.0
+#
+# Check the status of a virtual machine on a VMware ESX server, via SNMP
+# Return status in format for either Nagios or MRTG
+#
+# Steve Shipway (www.steveshipway.org) Nov 2004, Dec 2006, Aug 2007
+# Released under GNU GPL
+#
+# Version 2.0: Added SNMP agent extension to get memory split and ready time
+# 2.1: Corrected some bugs. Use >0.01 instead of >0.
+# 2.2: corrected opt_r bug, fa bug
+# 2.3:
+# 2.4: simpler guest names for list report
+# 2.5: Thresholds for LIST given more sensible defaults
+# Added -a alternate for MRTG/Nagios in MEM and CPU
+# 2.6: Final tests under ESX3
+# 3.0: Merge in GW additions, change -v to -V to standardise
+
+use Net::SNMP;
+use Getopt::Std;
+
+# File-level configuration and shared state. The subs below communicate
+# through these variables rather than through arguments/return values.
+my($VERSION) = "3.0"; # script version (matches the header comment)
+######## CONFIGURABLE
+my($STATEFILE) = "/var/tmp/esx_state"; # For rate counter (if not agent)
+my($SWAPINCRIT) = 1024; # this many bps swap in is critical (else warn)
+my($SWAPINWARN) = 128; # NOTE(review): presumably the warning counterpart of $SWAPINCRIT -- confirm
+my($SWAPPCCRIT) = 4; # this % of guest memory in swap is critical (else warn)
+my($SWAPPCWARN) = 1; # under ESX3, this is always >0 by a tiny bit
+my($warn,$crit) = (70,90); # usage warn/crit: 70/90 is virtualcentre default
+my($rwarn,$rcrit) = (10,15); # cpu readytime warn/crit: VMWare say to crit at 5%
+my($community) = 'public'; # Default community string
+my($TRUNC) = 16; # truncate guest names in report to this length (use 99 to stop)
+######## END
+my($VMOID) = "1.3.6.1.4.1.6876"; # VMware MIB
+my($UCDOID) = "1.3.6.1.4.1.2021.1000.10"; # where to find the agent plugin
+my($SYSOID) = "1.3.6.1.2.1.1.1.0"; # system object to test SNMP working
+my($OK,$WARNING,$CRITICAL,$UNKNOWN) = (0,1,2,3); # status codes; dooutput() exits with one of these in Nagios mode
+my(%VisibleStatus) = ($OK => "OK", $WARNING => "WARNING", $CRITICAL => "CRITICAL", $UNKNOWN => "UNKNOWN"); # label printed by dooutput()
+my($TIMEOUT) = 5; # passed as -timeout to Net::SNMP->session in makesnmp()
+my($RETRIES) = 1; # passed as -retries to Net::SNMP->session in makesnmp()
+my($from,$to) = (0,99999);
+my(%snmp,$snmp,$resp,$snmperr); # %snmp: OID => value table loaded from $SNMPFILE in debug mode; $snmp/$snmperr: session and error from makesnmp()
+my($hostname) = ''; # passed as -hostname to Net::SNMP->session in makesnmp()
+my($vhost) = '';
+my($A, $B, $MSG) = ('U','U',''); # the two MRTG values and the message text printed by dooutput()
+my($STATUS) = $UNKNOWN; # exit status used by dooutput() in Nagios mode
+my($MODE) = 0; # false = Nagios output, true = MRTG output (see dooutput)
+my($VMID) = -1; # set to -1 if not running
+my($VMNO) = -1; # set to -1 if not defined
+my(%lookup) = ();
+my(%states) = (); # key => integer pairs loaded from / saved to $STATEFILE by readstate()/writestate()
+my(%tmpnet) = ();
+my($fa,$sa,$fb,$sb);
+my(@perf) = (); # for performance stats
+# For debugging
+my($DEBUG) = 0;
+my($SNMPFILE) = "testdata/snmp.txt"; # for test/debug mode only
+my($VMWARESTATS) = "./vmware-stats -d"; # for test/debug mode only
+# End
+
+# Package globals named in the $opt_X form that Getopt::Std's getopts() populates.
+use vars qw($opt_C $opt_H $opt_N $opt_M $opt_h $opt_c $opt_t $opt_i $opt_d $opt_w $opt_l $opt_v $opt_r $opt_R $opt_a $opt_V);
+
+# base(NAME): shorten a name to its first whitespace-delimited word.
+# Returns '?' when NAME is false (undef, '', 0). When NAME does not begin
+# with a non-whitespace character, NAME is returned unchanged.
+sub base($) {
+    my($name) = @_;
+    return '?' unless $name;
+    my($first_word) = $name =~ /^(\S+)/;
+    return defined $first_word ? $first_word : $name;
+}
+
+# dohelp: print the usage summary to standard output, then exit with status 0.
+# The text is held as a list of lines and emitted with a single print.
+sub dohelp {
+    my(@usage) = (
+        "Usage: $0 [-h] [-v] [-d] -H host [-C community] [-N | -M [-r]]\n",
+        " [-l check [-V vhost] [-i interface] [-w warn -c crit]]\n",
+        " [-t timeout] [-R retries]\n",
+        " -h: just prints this help message\n",
+        " -v: just prints the script version number\n",
+        " -d: puts the script into debug mode\n",
+        " -H host: ESX server machine\n",
+        " -C community: the SNMP community string (default is \"public\")\n",
+        " -N: Nagios mode (the default); need -w and -c for CPU, MEM\n",
+        " -M: MRTG mode (-r specifies rate rather than counter)\n",
+        " -l check: can be CPU MEM STATE LIST NET LISTNET (default is LIST)\n",
+        " -V virtualhost: restrict probing to that one guest host; required for STATE;\n",
+        " if not specified, probes total ESX system statistics\n",
+        " -i interface: Only valid for NET\n",
+        " -w warn -c crit: Nagios thresholds\n",
+        " -t timeout: ([1..60] seconds) for individual SNMP queries\n",
+        " -R retries: # of retries ([0..20]) for individual SNMP queries\n",
+        "\nFor MRTG,\n",
+        " CPU is total seconds (counter) for vhost or total over all if no vhost given.\n",
+        " MEM is memory remaining in K.\n",
+        " STATE is 1 for up, 0 for down.\n",
+        " LIST is number of vhosts.\n",
+        " NET is network throughput in bytes for specified vhost and/or interface\n",
+        " (total of all if not specified).\n",
+        "\nFor Nagios, specify thresholds as follows.\n",
+        " CPU is percentage of allocated CPU (for vhosts) and of total CPU (if no vhost).\n",
+        " MEM is active memory (for vhosts) or free phys memory (if no vhost) in K or %.\n",
+        " STATE is CRITICAL if vhost is down.\n",
+        " LIST is WARN if some are down, CRIT is all vhosts are down.\n",
+        " NET is bytes/sec since last check, if possible (otherwise UNKNOWN).\n",
+        "\nThresholds for MEM or LIST under Nagios, can be in K or %\n",
+        " e.g.: -l MEM -w 2048K -c 1024K\n",
+        " e.g.: -l MEM -V vhost -w 80% -c 90%\n",
+        " e.g.: -l LIST -w 90% -c 0\n",
+        " e.g.: -l LIST -w 10 -c 1\n",
+        "Thresholds for CPU are in % (the trailing % symbol is optional)\n",
+        " e.g.: -l CPU -w 80 -c 90\n",
+        "Thresholds for NET are in BYTES/SEC (cannot use %)\n",
+    );
+    print join('', @usage);
+    exit 0;
+}
+
+# readstate: load saved counters from $STATEFILE into the file-level %states.
+# Each line of the form "key=digits" sets $states{key}; other lines are ignored.
+# Returns quietly, leaving %states untouched, when the file is not readable or
+# cannot be opened (it may vanish between the -r test and the open); without
+# that check the flock/readline calls would run on an unopened handle and emit
+# warnings under -w.
+sub readstate {
+    return if(! -r $STATEFILE);
+    open(my $fh, '<', $STATEFILE) or return;
+    flock $fh,1; # read (shared) lock
+    while( my $line = <$fh> ) {
+        $states{$1} = $2 if( $line =~ /^(\S+)=(\d+)/ );
+    }
+    flock $fh,8; # unlock
+    close $fh;
+}
+
+# Big fixes for the race condition from GroundWork
+# writestate(KEY => VALUE, ...): merge the given pairs into the file-level
+# %states and rewrite $STATEFILE with the result, one "key=value" line each.
+# The file is held under a write lock from just after it is opened until it
+# has been rewritten. If the file cannot be opened, the error is reported
+# through dooutput(), which exits.
+sub writestate {
+    my(%updates) = @_;
+
+    # A readable file is opened read/write so its current contents can be
+    # merged under the same lock; otherwise it is opened in append mode.
+    my($readable) = -r $STATEFILE;
+    open STATE, ($readable ? "+<" : ">>") . $STATEFILE
+        or do { $A=$B="U"; $MSG="$STATEFILE: $!"; $STATUS=3; dooutput(); };
+    flock STATE,2; # write lock
+    if( $readable ) {
+        # Pick up whatever is in the file now.
+        while( <STATE> ) {
+            next unless /^(\S+)=(\d+)/;
+            $states{$1} = $2;
+        }
+    }
+
+    # Start again from an empty file and write the merged table.
+    seek STATE,0,0; # rewind
+    truncate STATE,0;
+    @states{ keys %updates } = values %updates;
+    foreach my $key ( keys %states ) {
+        print STATE "$key=$states{$key}\n";
+    }
+    flock STATE,8; # unlock
+    close STATE;
+}
+
+# dooutput: print the final result and exit; this sub never returns.
+#   Nagios mode ($MODE false): prints "<STATUS LABEL>: $MSG", followed by
+#     "|" and the space-joined @perf entries when @perf is non-empty, then
+#     exits with $STATUS.
+#   MRTG mode ($MODE true): prints $A, $B, a blank line and $MSG, then exits 0.
+sub dooutput {
+    unless( $MODE ) {
+        # Nagios: now supporting performance stats
+        my($label) = $VisibleStatus{$STATUS} || "UNKNOWN";
+        my($perfdata) = scalar @perf ? "|" . join(" ",@perf) : "";
+        print $label . ": $MSG" . $perfdata, "\n";
+        exit $STATUS;
+    }
+
+    # MRTG: fill in defaults for anything the check did not set
+    $A = 'U' unless defined $A;
+    $B = $A unless defined $B;
+    $MSG = "Returned values: $A, $B\n" unless $MSG;
+    print "$A\n$B\n\n$MSG\n";
+    exit 0;
+}
+
+# snmpfile(OID): return a reference to a new hash holding every entry of the
+# file-level %snmp table whose key lies strictly below OID, i.e. begins with
+# OID followed by a dot. OID itself is not included.
+# The prefix is matched literally (\Q...\E): left unescaped, each "." in the
+# OID would match any character and unrelated keys could be returned.
+sub snmpfile($) {
+    my($prefix) = @_;
+    my(%subtree);
+    foreach my $oid ( keys %snmp ) {
+        $subtree{$oid} = $snmp{$oid} if( $oid =~ /^\Q$prefix\E\./ );
+    }
+    return \%subtree;
+}
+
+# makesnmp: prepare the SNMP data source.
+#   Debug mode ($DEBUG and $SNMPFILE both true): fill the file-level %snmp
+#     table from the lines of $SNMPFILE that match the pattern below, and
+#     return without opening a session. Returns silently if the file cannot
+#     be opened.
+#   Otherwise: open a Net::SNMP session to $hostname and store it in $snmp.
+#     If session creation reports an error, the error is passed to
+#     dooutput(), which exits.
+sub makesnmp() {
+    if ($DEBUG and $SNMPFILE) {
+        open SNMP,"<$SNMPFILE" or return;
+        while(<SNMP>) {
+            chomp;
+            next unless /^(\S+)\s+=\s+\S+:\s+"?([^"]+)/;
+            $snmp{$1} = $2;
+        }
+        close SNMP;
+        return;
+    }
+
+    my(@session_args) = (
+        -hostname  => $hostname,
+        -community => $community,
+        -timeout   => $TIMEOUT,
+        -retries   => $RETRIES,
+    );
+    ($snmp,$snmperr) = Net::SNMP->session( @session_args );
+    if($snmperr) {
+        print "($snmperr)\n" if($DEBUG);
+        ($A, $B) = ('U', 'U');
+        $MSG = "Error: $snmperr";
+        $STATUS = $UNKNOWN;
+        dooutput(); # exit
+        exit(0);
+    }
+}
+
+###########################################################################
+# Read detailed memory and CPU data from extended snmp daemon, if possible
+my(%stats) = (); # NOTE(review): presumably filled in by readagent() -- its body is not fully visible here, confirm
+my($donereadagent) = 0; # readagent() returns "" immediately when this is true
+sub readagent {
+ return "" if($donereadagent);
+ $MSG = "";
+ makesnmp() if(!$snmp);
+ if($DEBUG and $VMWARESTATS) {
+ open STATS,"$VMWARESTATS|" or return 0;
+ while( <STATS> ) {
+# print;
|
@@ -0,0 +1,1063 @@
+#!/usr/local/groundwork/bin/perl -w
+# vim:ts=4
+# check_esx Version 2.4
+my($Version) = 0.6; # GroundWork revision of this fork (the "gw 0.6" changelog entry below), separate from the 2.4 script version above
+# Modified by GroundWork to work with ESX 3; renamed to ..._gw to indicate
+# a special interim version. This is considered a temporary working copy
+# until the changes are folded back into the main line of development.
+#
+# Check the status of a virtual machine on a VMware ESX server, via SNMP.
+# Return status in standard format for either Nagios or MRTG.
+#
+# Steve Shipway (www.steveshipway.org) Nov 2004
+# Released under GNU GPL
+#
+# See dohelp{} below for usage, so we only need to maintain one copy
+# and we don't get accidental divergence in what is documented.
+#
+# Version 2.0: Added SNMP agent extension to get memory split and ready time
+# 2.1: Corrected some bugs. Use >0.01 instead of >0.
+# 2.2: corrected opt_r bug, fa bug
+# 2.3:
+# 2.4: simpler guest names for list report
+# gw 0.6: added some support for ESX 3
+
+use strict;
+use Net::SNMP;
+use Getopt::Std;
+
+# File-level configuration and shared state. The subs below communicate
+# through these variables rather than through arguments/return values.
+my($STATEFILE) = "/var/tmp/esx_state"; # For rate counter (if not agent)
+my($VMOID) = "1.3.6.1.4.1.6876"; # VMware MIB
+my($UCDOID) = "1.3.6.1.4.1.2021.1000.10"; # where to find the agent plugin
+my($SYSOID) = "1.3.6.1.2.1.1.1.0"; # system object to test SNMP working
+my($OK,$WARNING,$CRITICAL,$UNKNOWN) = (0,1,2,3); # status codes; dooutput() exits with one of these in Nagios mode
+my(%VisibleStatus) = ($OK => "OK", $WARNING => "WARNING", $CRITICAL => "CRITICAL", $UNKNOWN => "UNKNOWN"); # label printed by dooutput()
+my($DEBUG) = 0;
+my($TIMEOUT) = 5; # passed as -timeout to Net::SNMP->session in makesnmp()
+my($RETRIES) = 1; # passed as -retries to Net::SNMP->session in makesnmp()
+my($SWAPINCRIT) = 2; # this many bps swap in is critical (else warn)
+my($SWAPPCCRIT) = 4; # this % usage of swap is critical (else warn)
+my($from,$to) = (0,99999);
+my($snmp,$resp,$snmperr); # $snmp/$snmperr: session and error returned by Net::SNMP->session in makesnmp()
+my($hostname) = ''; # passed as -hostname to Net::SNMP->session in makesnmp()
+my($community) = 'public'; # Default community string
+my($vhost) = '';
+my($A, $B, $MSG) = ('U','U',''); # the two MRTG values and the message text printed by dooutput()
+my(@perf) = (); # entries are space-joined and appended after "|" in the Nagios output line
+my($STATUS) = $UNKNOWN; # exit status used by dooutput() in Nagios mode
+my($MODE) = 0; # 0 = Nagios, 1 = MRTG
+my($VMID) = -1; # set to -1 if not running
+my($VMNO) = -1; # set to -1 if not defined
+my($vmGuestState) = "notRunning";
+my($warn,$crit) = (70,90); # usage warn/crit: 70/90 is virtualcentre default
+my($rwarn,$rcrit) = (5,10); # cpu readytime warn/crit: VMWare say to crit at 5%
+my(%lookup) = ();
+my(%states) = (); # key => integer pairs loaded from / saved to $STATEFILE by readstate()/writestate()
+my(%tmpnet) = ();
+my($fa,$sa,$fb,$sb);
+my($esx_version) = 3;
+
+# Package globals named in the $opt_X form that Getopt::Std's getopts() populates.
+use vars qw($opt_C $opt_H $opt_N $opt_M $opt_V $opt_h $opt_c $opt_t $opt_i $opt_d $opt_w $opt_l $opt_v $opt_r $opt_R);
+
+# base(NAME): shorten a name to its first whitespace-delimited word.
+# Returns '?' when NAME is false (undef, '', 0). When NAME does not begin
+# with a non-whitespace character, NAME is returned unchanged.
+sub base($) {
+    my($name) = @_;
+    return '?' unless $name;
+    my($first_word) = $name =~ /^(\S+)/;
+    return defined $first_word ? $first_word : $name;
+}
+
+# dohelp: print the usage summary to standard output, then exit with status 0.
+# The text is held as a list of lines and emitted with a single print.
+sub dohelp {
+    my(@usage) = (
+        "Usage: $0 [-h] [-V] [-d] -H host [-C community] [-N | -M [-r]]\n",
+        " [-l check [-v vhost] [-i interface] [-w warn -c crit]]\n",
+        " [-t timeout] [-R retries]\n",
+        " -h: just prints this help message\n",
+        " -V: just prints the script version number\n",
+        " -d: puts the script into debug mode\n",
+        " -H host: ESX server machine\n",
+        " -C community: the SNMP community string (default is \"public\")\n",
+        " -N: Nagios mode (the default); need -w and -c for CPU, MEM\n",
+        " -M: MRTG mode (-r specifies rate rather than counter)\n",
+        " -l check: can be CPU MEM STATE LIST NET LISTNET (default is LIST)\n",
+        " -v virtualhost: restrict probing to that one guest host; required for STATE;\n",
+        " if not specified, probes total ESX system statistics\n",
+        " -i interface: Only valid for NET\n",
+        " -w warn -c crit: Nagios thresholds\n",
+        " -t timeout: ([1..60] seconds) for individual SNMP queries\n",
+        " -R retries: # of retries ([0..20]) for individual SNMP queries\n",
+        "\nFor MRTG,\n",
+        " CPU is total seconds (counter) for vhost or total over all if no vhost given.\n",
+        " MEM is memory remaining in K.\n",
+        " STATE is 1 for up, 0 for down.\n",
+        " LIST is number of vhosts.\n",
+        " NET is network throughput in bytes for specified vhost and/or interface\n",
+        " (total of all if not specified).\n",
+        "\nFor Nagios, specify thresholds as follows.\n",
+        " CPU is percentage of allocated CPU (for vhosts) and of total CPU (if no vhost).\n",
+        " MEM is active memory (for vhosts) or free phys memory (if no vhost) in K or %.\n",
+        " STATE is CRITICAL if vhost is down.\n",
+        " LIST is WARN if some are down, CRIT is all vhosts are down.\n",
+        " NET is bytes/sec since last check, if possible (otherwise UNKNOWN).\n",
+        "\nThresholds for MEM or LIST under Nagios, can be in K or %\n",
+        " e.g.: -l MEM -w 2048K -c 1024K\n",
+        " e.g.: -l MEM -v vhost -w 80% -c 90%\n",
+        " e.g.: -l LIST -w 90% -c 0\n",
+        " e.g.: -l LIST -w 10 -c 1\n",
+        "Thresholds for CPU are in % (the trailing % symbol is optional)\n",
+        " e.g.: -l CPU -w 80 -c 90\n",
+        "Thresholds for NET are in BYTES/SEC (cannot use %)\n",
+    );
+    print join('', @usage);
+    exit 0;
+}
+
+# readstate: load saved counters from $STATEFILE into the file-level %states.
+# Each line of the form "key=digits" sets $states{key}; other lines are ignored.
+# Returns quietly when the file is not readable or cannot be opened.
+sub readstate {
+    return unless -r $STATEFILE;
+    open STATE, "<$STATEFILE" or return;
+    flock STATE,1; # read lock
+    while( <STATE> ) {
+        next unless /^(\S+)=(\d+)/;
+        $states{$1} = $2;
+    }
+    flock STATE,8; # unlock
+    close STATE;
+}
+# writestate(KEY => VALUE, ...): merge the given pairs into the file-level
+# %states and rewrite $STATEFILE with the result, one "key=value" line each.
+# Returns quietly, writing nothing, if the file cannot be opened.
+#
+# NOTE: For further discussion of the possible race conditions and security
+# holes here, see Programming Perl, 3/e, pp. 571-576.
+#
+# The state file stays write-locked for the whole read-merge-rewrite cycle.
+# One race remains: several processes may each find that the file does not
+# exist and each open it for writing, because the lock can only be taken
+# after the open. Whichever of them gets the lock last overwrites what the
+# others wrote. That can only happen while the file is first being created;
+# the file is still consistent every time it is unlocked, and later runs
+# recover as if the overwritten processes had never run.
+sub writestate {
+    my(%updates) = @_;
+
+    # A readable file is opened read/write, so it can be rewritten without
+    # closing it and giving up the lock between reading and writing (which
+    # would let another process write in between and leave our copy of the
+    # other states stale). Otherwise append mode is used, because it creates
+    # a missing file without clobbering one that another process has already
+    # locked and is in the middle of writing.
+    my($readable) = -r $STATEFILE;
+    open STATE, ($readable ? "+<" : ">>") . $STATEFILE or return;
+    flock STATE,2; # write lock
+    if( $readable ) {
+        # Pick up whatever is in the file now.
+        while( <STATE> ) {
+            next unless /^(\S+)=(\d+)/;
+            $states{$1} = $2;
+        }
+    }
+
+    # Start again from an empty file and write the merged table.
+    seek STATE,0,0; # rewind
+    truncate STATE,0;
+    @states{ keys %updates } = values %updates;
+    foreach my $key ( keys %states ) {
+        print STATE "$key=$states{$key}\n";
+    }
+    # No explicit flush is needed: flock flushes the handle before it
+    # releases the lock.
+    flock STATE,8; # unlock
+    close STATE;
+}
+
+# dooutput: print the final result and exit; this sub never returns.
+#   Nagios mode ($MODE false): prints "SERVICE STATUS: <LABEL>: $MSG",
+#     followed by "|" and the space-joined @perf entries when @perf is
+#     non-empty, then exits with $STATUS.
+#   MRTG mode ($MODE true): prints $A, $B, a blank line and $MSG, then exits 0.
+sub dooutput {
+    unless( $MODE ) {
+        # Nagios
+        # The line follows the format recommended in the Nagios plug-in
+        # development guidelines, with the status spelled out at the front so
+        # it is readable where the message is not color-highlighted. Nothing
+        # here limits the length, so a long message or its appended
+        # performance data can still be cut off by the length limit of the
+        # Nagios command pipe.
+        my($label) = $VisibleStatus{$STATUS} || "UNKNOWN";
+        my($perfdata) = scalar @perf ? "|" . join(" ",@perf) : "";
+        print "SERVICE STATUS: ", $label, ": ", $MSG, $perfdata, "\n";
+        exit $STATUS;
+    }
+
+    # MRTG: fill in defaults for anything the check did not set
+    $A = 'U' unless defined $A;
+    $B = $A unless defined $B;
+    $MSG = "Returned values: $A, $B\n" unless $MSG;
+    print "$A\n$B\n\n$MSG\n";
+    exit 0;
+}
+
+sub makesnmp() {
+ ($snmp,$snmperr) = Net::SNMP->session( -hostname=>$hostname,
+ -community=>$community, -timeout=>$TIMEOUT, -retries=>$RETRIES );
+ if($snmperr) {
+ $A = $B = 'U';
+ print "($snmperr)\n" if($DEBUG);
+ $MSG = "Error: $snmperr";
+ $STATUS = $UNKNOWN;
+ dooutput; # exit
+ exit(0);
+ }
|