Search
j0ke.net Open Build Service
>
Projects
>
server:monitoring
>
nagios-plugins-snmp
> check_esx_gw
Sign Up
|
Log In
Username
Password
Cancel
Overview
Repositories
Revisions
Requests
Users
Advanced
Attributes
Meta
File check_esx_gw of Package nagios-plugins-snmp
#!/usr/bin/perl -w
# vim:ts=4
# check_esx Version 2.4
#
# Modified by GroundWork to work with ESX 3; renamed to ..._gw to indicate
# a special interim version.  This is considered a temporary working copy
# until the changes are folded back into the main line of development.
#
# Check the status of a virtual machine on a VMware ESX server, via SNMP.
# Return status in standard format for either Nagios or MRTG.
#
# Steve Shipway (www.steveshipway.org) Nov 2004
# Released under GNU GPL
#
# See dohelp{} below for usage, so we only need to maintain one copy
# and we don't get accidental divergence in what is documented.
#
# Version 2.0: Added SNMP agent extension to get memory split and ready time
#         2.1: Corrected some bugs.  Use >0.01 instead of >0.
#         2.2: corrected opt_r bug, fa bug
#         2.3:
#         2.4: simpler guest names for list report
#      gw 0.6: added some support for ESX 3

use strict;
use Net::SNMP;
use Getopt::Std;
use Fcntl qw(:flock);

my($Version) = 0.6;

my($STATEFILE) = "/var/tmp/esx_state";        # For rate counter (if not agent)
my($VMOID)     = "1.3.6.1.4.1.6876";          # VMware MIB
my($UCDOID)    = "1.3.6.1.4.1.2021.1000.10";  # where to find the agent plugin
my($SYSOID)    = "1.3.6.1.2.1.1.1.0";         # system object to test SNMP working
my($OK,$WARNING,$CRITICAL,$UNKNOWN) = (0,1,2,3);
my(%VisibleStatus) = ($OK => "OK", $WARNING => "WARNING", $CRITICAL => "CRITICAL", $UNKNOWN => "UNKNOWN");
my($DEBUG)      = 0;
my($TIMEOUT)    = 5;
my($RETRIES)    = 1;
my($SWAPINCRIT) = 2;    # this many bps swap in is critical (else warn)
my($SWAPPCCRIT) = 4;    # this % usage of swap is critical (else warn)
my($from,$to)   = (0,99999);
my($snmp,$resp,$snmperr);
my($hostname)   = '';
my($community)  = 'public';    # Default community string
my($vhost)      = '';
my($A, $B, $MSG) = ('U','U','');
my(@perf)       = ();
my($STATUS)     = $UNKNOWN;
my($MODE)       = 0;     # 0 = Nagios, 1 = MRTG
my($VMID)       = -1;    # set to -1 if not running
my($VMNO)       = -1;    # set to -1 if not defined
my($vmGuestState) = "notRunning";
my($warn,$crit)   = (70,90);   # usage warn/crit: 70/90 is virtualcentre default
my($rwarn,$rcrit) = (5,10);    # cpu readytime warn/crit: VMWare say to crit at 5%
my(%lookup) = ();
my(%states) = ();
my(%tmpnet) = ();
my($fa,$sa,$fb,$sb);
my($esx_version) = 3;

# Package variables filled in by Getopt::Std::getopts().
our($opt_C, $opt_H, $opt_N, $opt_M, $opt_V, $opt_h, $opt_c, $opt_t,
    $opt_i, $opt_d, $opt_w, $opt_l, $opt_v, $opt_r, $opt_R);

# Return the first whitespace-delimited word of the argument, '?' if the
# argument is false, or the argument itself if it has no such word.
sub base {
    my ($text) = @_;
    return '?' if !$text;
    return $1 if $text =~ /^(\S+)/;
    return $text;
}

# Print the usage message and exit with status 0.
sub dohelp {
    print "Usage: $0 [-h] [-V] [-d] -H host [-C community] [-N | -M [-r]]\n";
    print " [-l check [-v vhost] [-i interface] [-w warn -c crit]]\n";
    print " [-t timeout] [-R retries]\n";
    print " -h: just prints this help message\n";
    print " -V: just prints the script version number\n";
    print " -d: puts the script into debug mode\n";
    print " -H host: ESX server machine\n";
    print " -C community: the SNMP community string (default is \"public\")\n";
    print " -N: Nagios mode (the default); need -w and -c for CPU, MEM\n";
    print " -M: MRTG mode (-r specifies rate rather than counter)\n";
    print " -l check: can be CPU MEM STATE LIST NET LISTNET (default is LIST)\n";
    print " -v virtualhost: restrict probing to that one guest host; required for STATE;\n";
    print " if not specified, probes total ESX system statistics\n";
    print " -i interface: Only valid for NET\n";
    print " -w warn -c crit: Nagios thresholds\n";
    print " -t timeout: ([1..60] seconds) for individual SNMP queries\n";
    print " -R retries: # of retries ([0..20]) for individual SNMP queries\n";
    print "\nFor MRTG,\n";
    print " CPU is total seconds (counter) for vhost or total over all if no vhost given.\n";
    print " MEM is memory remaining in K.\n";
    print " STATE is 1 for up, 0 for down.\n";
    print " LIST is number of vhosts.\n";
    print " NET is network throughput in bytes for specified vhost and/or interface\n";
    print " (total of all if not specified).\n";
    print "\nFor Nagios, specify thresholds as follows.\n";
    print " CPU is percentage of allocated CPU (for vhosts) and of total CPU (if no vhost).\n";
    print " MEM is active memory (for vhosts) or free phys memory (if no vhost) in K or %.\n";
    print " STATE is CRITICAL if vhost is down.\n";
    print " LIST is WARN if some are down, CRIT is all vhosts are down.\n";
    print " NET is bytes/sec since last check, if possible (otherwise UNKNOWN).\n";
    print "\nThresholds for MEM or LIST under Nagios, can be in K or %\n";
    print " e.g.: -l MEM -w 2048K -c 1024K\n";
    print " e.g.: -l MEM -v vhost -w 80% -c 90%\n";
    print " e.g.: -l LIST -w 90% -c 0\n";
    print " e.g.: -l LIST -w 10 -c 1\n";
    print "Thresholds for CPU are in % (the trailing % symbol is optional)\n";
    print " e.g.: -l CPU -w 80 -c 90\n";
    print "Thresholds for NET are in BYTES/SEC (cannot use %)\n";
    exit 0;
}

# Load "key=number" pairs from $STATEFILE into %states under a shared lock.
# Silently does nothing if the file is unreadable or cannot be opened.
sub readstate {
    return if !-r $STATEFILE;
    open(my $fh, '<', $STATEFILE) or return;
    flock($fh, LOCK_SH);    # best effort, as before: result not checked
    while (my $line = <$fh>) {
        $states{$1} = $2 if $line =~ /^(\S+)=(\d+)/;
    }
    flock($fh, LOCK_UN);
    close $fh;
}

# Merge the given key=>value pairs into %states and rewrite $STATEFILE.
# Silently does nothing if the file cannot be opened.
sub writestate {
    my (%new) = @_;
    my $fh;

    # NOTE: For further discussion of the possible race conditions and security
    # holes here, see Programming Perl, 3/e, pp. 571-576.
    # We write-lock the state file for the entire interaction, to prevent any
    # avoidable race conditions from showing up.  The one race this doesn't
    # prevent is multiple independent processes finding the file doesn't exist,
    # and all of them opening it in some kind of write mode (required to create
    # a non-existent file).  Locking can only happen after the open, so there
    # is no adjudication of who observes that the file does not yet exist.
    # Whichever of those processes acquires the write lock last will overwrite
    # what the previous ones wrote.  That only occurs in a limited, one-time
    # situation; the file is still left consistent each time it is unlocked,
    # and the system recovers properly on subsequent runs.
    if (-r $STATEFILE) {
        # Read/write mode lets us re-read and then rewrite under one lock, so
        # no other process can slip in a write between our read and our write.
        open($fh, '+<', $STATEFILE) or return;
        flock($fh, LOCK_EX);
        while (my $line = <$fh>) {
            $states{$1} = $2 if $line =~ /^(\S+)=(\d+)/;
        }
    }
    else {
        # Append mode creates the file without clobbering one that another
        # process has just created and is in the middle of writing.
        open($fh, '>>', $STATEFILE) or return;
        flock($fh, LOCK_EX);
    }

    seek($fh, 0, 0);    # rewind
    truncate($fh, 0);
    $states{$_} = $new{$_} for keys %new;
    for my $key (keys %states) {
        print {$fh} "$key=" . $states{$key} . "\n";
    }
    # flock implicitly flushes the handle before releasing the lock, so no
    # explicit flush is needed here.
    flock($fh, LOCK_UN);
    close $fh;
}

# Print the result in MRTG or Nagios format and exit.  NEVER RETURNS:
# MRTG mode exits 0, Nagios mode exits with $STATUS.
sub dooutput {
    if ($MODE) {    # MRTG
        $A = 'U' if !defined $A;
        $B = $A  if !defined $B;
        $MSG = "Returned values: $A, $B\n" if !$MSG;
        print "$A\n$B\n\n$MSG\n";
        exit 0;
    }

    # Nagios: follow the format recommended in the Nagios plug-in development
    # guidelines, plus a convention that puts the status up front in readable
    # form for contexts that don't colour-highlight the message.  This does
    # not guard against the message or the appended performance data being
    # chopped off by length limits in the Nagios command pipe.
    print "SERVICE STATUS: ", ($VisibleStatus{$STATUS} || "UNKNOWN"), ": ", $MSG,
        (scalar @perf ? "|" . join(" ", @perf) : ""), "\n";
    exit $STATUS;
}

# Open the Net::SNMP session into $snmp; on failure report UNKNOWN and exit.
sub makesnmp {
    ($snmp, $snmperr) = Net::SNMP->session(
        -hostname  => $hostname,
        -community => $community,
        -timeout   => $TIMEOUT,
        -retries   => $RETRIES,
    );
    return if !$snmperr;

    $A = $B = 'U';
    print "($snmperr)\n" if $DEBUG;
    $MSG    = "Error: $snmperr";
    $STATUS = $UNKNOWN;
    dooutput();    # exits
}

###########################################################################
# Read detailed memory and CPU data from extended snmp daemon, if possible
my(%stats) = ();
my($donereadagent) = 0;

# Populate %stats from the agent extension under $UCDOID (done once only).
# Returns "" on success and 1 on failure.  On failure $MSG is left empty when
# the extension simply did not answer (callers then fall back to the plain
# VMware MIB) and is set when the extension answered with the wrong name.
sub readagent {
    return "" if $donereadagent;
    $MSG = "";
    makesnmp() if !$snmp;

    $resp = $snmp->get_request( -varbindlist => ["$UCDOID.2.1"] );
    return 1 if !$resp;    # fall back to the old way
    if ( $resp->{"$UCDOID.2.1"} ne 'vmware' ) {
        $MSG = "Incorrect SNMPD configuration: found '" . $resp->{"$UCDOID.2.1"} . "' when expected 'vmware'";
        $STATUS = $UNKNOWN;
        return 1;
    }

    $resp = $snmp->get_table( -baseoid => "$UCDOID.101" );
    return 1 if !$resp;    # fall back to the old way

    # Convert the retrieved "name=value" lines to the lookup hash
    for my $oid (keys %$resp) {
        next unless $oid =~ /\.101\.\d+$/;
        $stats{$1} = $2 if $resp->{$oid} =~ /^(\S+)=(.*)$/;
    }
    $donereadagent = 1;
    return "";
}

# Used when the VMware MIB did not answer: make sure the agent extension is
# readable and advertises guest names, otherwise report UNKNOWN and exit.
sub _require_agent_names {
    if ( readagent() ) {
        $MSG = "Error: No VMWare SNMP sub-agent running (vmware-snmpd)" if !$MSG;
        $STATUS = $UNKNOWN;
        dooutput();    # exits
    }
    if ( !$stats{'has-names'} ) {
        $MSG    = "Error: No VMWare SNMP sub-agent running (vmware-snmpd)";
        $STATUS = $UNKNOWN;
        dooutput();    # exits
    }
}

# Fill %lookup from the agent's "vhost-<id>-name" entries and return the
# guest names found.
sub _load_names_from_agent {
    my @names;
    for my $key (keys %stats) {
        next unless $key =~ /vhost-(\d+)-name/;
        my $id = $1;
        $lookup{$id}          = $stats{$key};    # id->name
        $lookup{$stats{$key}} = "vmno-$id";      # name->dummyOID
        $lookup{"vmno-$id"}   = $id;             # dummyOID->id
        push @names, $stats{$key};
    }
    return @names;
}

# Set $esx_version to the major version reported at $VMOID.1.2.0, or assume
# 2 if that OID is unavailable but the agent extension works.
sub getesxversion {
    print "(snmp lookup)\n" if $DEBUG;
    makesnmp() if !$snmp;
    $resp = $snmp->get_request( -varbindlist => ["$VMOID.1.2.0"] );
    if ( !$resp ) {
        _require_agent_names();
        $esx_version = 2;    # just a blind assumption
        return;
    }
    $esx_version = $resp->{"$VMOID.1.2.0"};
    $esx_version =~ s/\..*//;
}

# Read all the VM IDs from the vmware-snmpd MIB into %lookup and, when
# $vhost is set, resolve it to $VMNO / $VMID / $vmGuestState.
sub getvmid {
    print "(snmp lookup)\n" if $DEBUG;
    makesnmp() if !$snmp;

    $resp = $snmp->get_table( -baseoid => "$VMOID.2.1.1" );
    if ( !$resp ) {
        $resp = $snmp->get_request( -varbindlist => ["$VMOID.1.1.0"] );
        if ($resp) {
            print "No guests are defined on this server\n" if $DEBUG;
            $MSG = "No guests defined on this server";
            return;
        }
        _require_agent_names();
        _load_names_from_agent();
    }
    else {
        for my $oid (keys %$resp) {
            # Trailing "<column>.<index>"; skip anything that doesn't parse
            # instead of acting on stale capture variables.
            my ($column, $index) = $oid =~ /(\d+)\.(\d+)$/ or next;
            next unless $column == 2;
            my $vmid = $resp->{"$VMOID.2.1.1.7.$index"};
            $lookup{ $resp->{$oid} } = $index;
            $lookup{$index}          = $vmid;
            $lookup{$vmid}           = $resp->{$oid};
            if ( $esx_version == 3 ) {
                $lookup{"vmGuestState-$index"} = $resp->{"$VMOID.2.1.1.8.$index"};
            }
        }
    }

    return if !$vhost;    # we're just getting the table

    if ( !defined $lookup{$vhost} ) {
        # lets see if they just gave part of the vhost name?
        # \Q..\E: the name comes from the command line, so match it
        # literally rather than as a regular expression.
        for my $key (keys %lookup) {
            next unless $key =~ /^\Q$vhost\E/i;
            $VMNO = $lookup{$key};
            if ( defined $lookup{$VMNO} ) {
                $VMID = $lookup{$VMNO};
                if ( defined $lookup{"vmGuestState-$VMNO"} ) {
                    $vmGuestState = $lookup{"vmGuestState-$VMNO"};
                }
                $vhost = $key;
            }
            else {
                $STATUS = $CRITICAL;
                $MSG    = "Virtual host $vhost($VMNO) is not running!";
            }
            last;
        }
        if ( $VMNO < 0 ) {
            $STATUS = $UNKNOWN;
            $MSG    = "Virtual host $vhost is not defined!";
            dooutput();    # exits
        }
    }
    else {
        $VMNO = $lookup{$vhost};
        if ( defined $lookup{$VMNO} ) {
            $VMID = $lookup{$VMNO};
            if ( defined $lookup{"vmGuestState-$VMNO"} ) {
                $vmGuestState = $lookup{"vmGuestState-$VMNO"};
            }
        }
        else {
            $STATUS = $CRITICAL;
            $MSG    = "Virtual host $vhost($VMNO) is not running!";
        }
    }
    print "(hostno=$VMNO, ID=$VMID)\n" if $DEBUG;
}

# Build the guest list report: $A = guests up, $B = guests defined, $MSG the
# per-guest summary, plus perf data.  Rebuilds %lookup from scratch.
sub listvm {
    my @vh = ();
    %lookup = ();
    print "(snmp lookup)\n" if $DEBUG;
    makesnmp() if !$snmp;

    $resp = $snmp->get_table( -baseoid => "$VMOID.2.1.1" );
    if ( !$resp ) {
        $resp = $snmp->get_request( -varbindlist => ["$VMOID.1.1.0"] );
        if ($resp) {
            $A = $B = 0;
            $MSG    = "No VHosts are defined on this server";
            $STATUS = $OK;
            dooutput();    # exits
        }
        _require_agent_names();
        @vh = _load_names_from_agent();
    }
    else {
        for my $oid (sort keys %$resp) {
            my ($column, $index) = $oid =~ /(\d+)\.(\d+)$/ or next;
            if ( $column == 2 ) {
                $lookup{ $resp->{$oid} } = $index;
                push @vh, $resp->{$oid};
            }
            elsif ( $esx_version == 2 && $column == 7 ) {
                $lookup{$index} = $resp->{$oid};    # VM id
            }
            elsif ( $esx_version == 3 && $column == 8 ) {
                $lookup{$index} = $resp->{$oid};    # guest state
            }
        }
    }

    $A = $B = 0;
    for my $name (@vh) {    # $name aliases the array element; edits stick
        next if !$name;
        $B++;
        my $ref   = $lookup{$name};
        my $state = defined $ref ? $lookup{$ref} : undef;
        if ( $esx_version == 2 ) {
            if ( defined $state and $state > 0 ) {
                $name = ( substr $name, 0, 16 ) . "(" . $state . ")";
                $A++;
            }
            else {
                $name = ( substr $name, 0, 16 ) . "(DOWN)";
            }
        }
        else {
            # This logic is for ESX 3.  We don't truncate the vhost names
            # because the substrings might not be unique.
            if ( defined $state and $state eq "running" ) {
                $name .= "(UP)";
                $A++;
            }
            else {
                $name .= "(DOWN)";
            }
        }
        $name =~ s/ *\([^\)]+\)(\(.*\))/$1/;
    }

    $MSG = "VHosts: $A/$B up: " . ( join ", ", @vh );
    push @perf, "allvms_up_ct=$A;;;0;$B";
    # Guard the division: with no named guests $B is 0, which used to die
    # with "Illegal division by zero" before the caller could report OK.
    my $up_pc = $B ? int( $A / $B * 10000 ) / 100.0 : 0;
    push @perf, "allvms_up_pc=" . $up_pc . "%;;;0;100";
    $STATUS = $OK;
}

# Read per-interface network counters into %tmpnet and, unless a LIST-style
# check was requested, total the read/write bytes matching $VMID / -i into
# $A / $B.
sub readnet {
    my $found = 0;
    makesnmp() if !$snmp;

    $resp = $snmp->get_table( -baseoid => "$VMOID.3.4.1" );
    if ( !$resp ) {
        # Probe the same scalar instance getvmid/listvm use; a GET needs the
        # ".0" instance suffix.
        $resp = $snmp->get_request( -varbindlist => ["$VMOID.1.1.0"] );
        if ($resp) {
            $A = $B = 0;
            $MSG    = "No VHosts defined";
            $STATUS = $OK;
            return;
        }
        $MSG    = "Error: Unable to retrieve SNMP data";
        $STATUS = $UNKNOWN;
        return;
    }

    for my $oid (keys %$resp) {
        my ($column, $index) = $oid =~ /(\d+)\.(\d+)$/ or next;    # Type, index.
        next unless $column == 3;
        $tmpnet{$index} = [
            $resp->{$oid},
            $resp->{"$VMOID.3.4.1.2.$index"},
            ( $resp->{"$VMOID.3.4.1.7.$index"} * 1024 ),
            ( $resp->{"$VMOID.3.4.1.9.$index"} * 1024 ),
        ];
    }
    return if $opt_l =~ /LIST/i;    # /i to agree with the caller's /LISTNET/i

    # We now have all the network statistics indexed by card or VMID
    $A = $B = 0;
    for my $index (keys %tmpnet) {
        my $nic = $tmpnet{$index};
        next unless $VMID < 0 or $VMID == $nic->[0];    # vm matches
        next unless !$opt_i or $opt_i eq $nic->[1];     # net matches
        $A += $nic->[2];
        $B += $nic->[3];
        $found = 1;
    }
    if ( !$found ) {
        $MSG = "No network interfaces exist for ";
        $MSG .= "vhost $vhost"      if $VMID > -1;
        $MSG .= " and "             if $VMID > -1 and $opt_i;
        $MSG .= " interface $opt_i" if $opt_i;
        $STATUS = $UNKNOWN;
    }
}

###########################################################################
# Read general memory and CPU data from vmware-snmpd
# This is what we do if we can't get the detailed information.
# Fallback CPU reader using the plain VMware MIB.  Sets $A to the CPU
# seconds counter (one guest, or the sum over all guests), or - in Nagios
# mode / with -r - to a percentage derived from the previously saved state.
sub readcpu {
    my ($k, @k);
    my ($t1, $t2, $a1);
    $MSG = "";
    $A   = 0;
    $B   = 0;

    if ( !$MODE or $opt_r ) {
        readstate();
        $t1 = $states{"$hostname-CPU-$vhost-time"};
        $a1 = $states{"$hostname-CPU-$vhost"};
        $t2 = time;
    }

    if ( $VMID < 0 ) {
        for my $id (keys %lookup) {
            push @k, "$VMOID.3.1.2.1.3." . $id if $id =~ /^\d+$/ and $id > 99;
        }
    }
    else {
        $k = "$VMOID.3.1.2.1.3.$VMID";
        @k = ($k);
    }
    if ($DEBUG) {
        print "(retrieving $_)\n" for @k;
    }

    $resp = $snmp->get_request( -varbindlist => \@k );
    if ($resp) {
        if ( $VMID < 0 ) {
            $A = 0;
            for my $oid (keys %$resp) {
                $A += $resp->{$oid};
                print "$oid: " . $resp->{$oid} . "\n" if $DEBUG;
            }
        }
        else {
            $A = $resp->{$k};
        }
        $B = 0;
    }
    else {
        $A = $B = 'U';
        if ( $VMID < 0 ) {
            $MSG = "Unable to retrieve CPU statistics for ESX server: " . $snmp->error;
        }
        else {
            $MSG = "Unable to retrieve CPU statistics for $vhost: " . $snmp->error;
        }
        $STATUS = $UNKNOWN;
    }
    return if $MSG;    # an error was recorded above

    $MSG = "CPU has used $A seconds";
    $MSG .= " on $vhost" if $vhost;
    return unless !$MODE or $opt_r;

    # Save the new sample at most every 30s so that frequent polls still
    # have a usable baseline.
    writestate( "$hostname-CPU-$vhost" => $A, "$hostname-CPU-$vhost-time" => $t2 )
        if !$t1 or ( $t2 - $t1 ) > 30;

    if ( !$t1 or !$a1 or ( $t1 >= $t2 ) or ( ( $t2 - $t1 ) > 1000 ) ) {
        if ($vhost) {
            $MSG = "No saved state for $vhost CPU time yet - wait for next poll.";
        }
        else {
            $MSG = "No saved state for ESX system CPU time yet - wait for next poll.";
        }
        $A = $B = "U";
        $STATUS = $UNKNOWN;
        return;
    }

    print "Usage: $A-$a1 in $t2-$t1 = " . ( $A - $a1 ) . " in " . ( $t2 - $t1 ) if $DEBUG;
    $A = int( ( ( $A - $a1 ) / ( $t2 - $t1 ) ) * 10000 ) / 100;
    print " = $A\n" if $DEBUG;
    $B   = 0;
    $MSG = "CPU usage is $A% ";
    $MSG .= "on $vhost" if $vhost;
    $MSG .= " (" . ( $t2 - $t1 ) . "s average)";
    if ( $A > 110 or $A < 0 ) {    # counter wrap or garbage sample
        $B = $A = 0;
        $MSG = "Error reading CPU usage information.";
    }
}

# Fallback memory reader using the plain VMware MIB.
# Whole server: $A = memory free, $B = total physical.
# One guest:    $B = VM memory max * 1024, $A = $B - VM memory used.
sub readmem {
    my ($k1, $k2);
    if ( $VMID < 0 ) {
        $k1 = "$VMOID.3.2.1.0";    # Total physical present
        $k2 = "$VMOID.3.2.3.0";    # Memory free
    }
    else {
        $k1 = "$VMOID.3.2.4.1.3.$VMID";    # VM memory max
        $k2 = "$VMOID.3.2.4.1.4.$VMID";    # VM memory used
    }
    print "(retrieving $k1,$k2)\n" if $DEBUG;

    $resp = $snmp->get_request( -varbindlist => [ $k1, $k2 ] );
    if ( !$resp ) {
        $A = $B = 'U';
        if ( $VMID < 0 ) {
            $MSG = "Unable to retrieve memory statistics for ESX server: " . $snmp->error;
        }
        else {
            $MSG = "Unable to retrieve memory statistics for $vhost: " . $snmp->error;
        }
        $STATUS = $UNKNOWN;
        return;
    }

    $A = $resp->{$k2};
    if ( $VMID < 0 ) {
        $B = $resp->{$k1};
    }
    else {
        $B = $resp->{$k1} * 1024;
        $A = $B - $A;    # memory remaining
    }
}

# Return the console OS share of physical memory as a percentage with two
# decimals, or 'U' if it cannot be determined.
sub readconsolemempc {
    my $k1 = "$VMOID.3.2.1.0";    # Total physical present (enterprises.vmware.vmwResources.vmwMemory.memSize.0)
    my $k2 = "$VMOID.3.2.2.0";    # Memory used by console (enterprises.vmware.vmwResources.vmwMemory.memCOS.0)
    print "(retrieving $k1,$k2)\n" if $DEBUG;
    $resp = $snmp->get_request( -varbindlist => [ $k1, $k2 ] );
    # A zero/missing total would otherwise die with a division by zero.
    return 'U' if !$resp or !$resp->{$k1};
    return int( $resp->{$k2} / $resp->{$k1} * 10000 ) / 100.0;
}

# For ESX 3: find the "vhost-<n>-name" agent entry whose value is the name of
# the selected guest and return <n>, or undef if there is none.
sub _agent_vhost_number {
    my $vmid = $lookup{$VMNO};
    my $name = defined $vmid ? $lookup{$vmid} : undef;
    return if !defined $name;
    for my $key (keys %stats) {
        # Only name entries qualify; matching on value alone could pick an
        # unrelated key and then reuse a stale $1.
        next unless $key =~ /vhost-(\d+)-name/;
        my $number = $1;
        return $number if defined $stats{$key} and $stats{$key} eq $name;
    }
    return;
}

# CPU check via the agent extension.  When the extension is unavailable it
# falls back to readcpu() and returns; otherwise it reports and EXITS.
sub readxcpu {
    my ($k, $C);
    $MSG    = "";
    $A      = 0;
    $B      = 0;
    $STATUS = 0;

    if ( readagent() ) {
        print "(readagent failed: $MSG)\n" if $DEBUG;
        readcpu() if !$MSG;    # no vmware agent, no error
        return;
    }

    if ($vhost) {
        if ( $esx_version <= 2 ) {
            $k = "vhost-$VMID";
        }
        else {
            my $number = _agent_vhost_number();
            $k = "vhost-$number" if defined $number;
        }
        if ( defined $k ) {
            $A = $stats{"$k-cpu-used-pc"};
            $B = $stats{"$k-cpu-ready-pc"};
        }
        else {
            $A = undef;
            $B = undef;
        }
        $C = $A;
    }
    else {
        $k = "sys";
        $A = $stats{"sys-cpu-used-pc"};
        $B = $stats{"allvms-cpu-used-pc"};
        $C = $A + $B if defined $A and defined $B;
    }

    if ( !defined $A or !defined $B ) {
        $A = $B = 'U';
        $MSG    = "Gathering statistics, please wait.";
        $STATUS = 3;
        # Fill in some dummy performance data anyway, to keep downstream
        # processes somewhat happy.
        if ($vhost) {
            push @perf, "vhost_cpu_used_pc=U%;;;0;100";
            push @perf, "vhost_cpu_ready_pc=U%;;;0;100";
        }
        else {
            push @perf, "sys_cpu_used_pc=U%;;;0;100";
            push @perf, "allvms_cpu_used_pc=U%;;;0;100";
            push @perf, "sys_cpu_ready_pc=U%;;;0;100";
        }
        dooutput();    # exits
    }

    if ($vhost) {
        $MSG = "vhost CPU used=$A% ready=$B%";
        push @perf, "vhost_cpu_used_pc=$A%;;;0;100";
        push @perf, "vhost_cpu_ready_pc=$B%;;;0;100";
    }
    else {
        $MSG = "CPU used sys=$A% vhosts=$B% readytime=" . $stats{'sys-cpu-ready-pc'} . "%";
        push @perf, "sys_cpu_used_pc=$A%;;;0;100";
        push @perf, "allvms_cpu_used_pc=$B%;;;0;100";
        push @perf, "sys_cpu_ready_pc=" . $stats{'sys-cpu-ready-pc'} . "%;;;0;100";
    }

    # MRTG only
    dooutput() if $MODE;    # exits

    # Nagios only
    $crit =~ s/[^\d\.]//g;
    $warn =~ s/[^\d\.]//g;
    $crit = 100 if !$crit;
    $warn = 100 if !$warn;

    if ( $C >= $crit ) {
        $MSG .= "<BR>" if $MSG;
        $MSG .= "CPU usage is CRITICAL ($C%)";
        $STATUS = 2;
    }
    elsif ( $C >= $warn ) {
        $MSG .= "<BR>" if $MSG;
        $MSG .= "CPU usage is WARNING ($C%)";
        $STATUS = 1 if $STATUS < 2;
    }

    # Ready time
    my $ready = $stats{"$k-cpu-ready-pc"};
    if ( $ready >= $rcrit ) {
        $MSG .= "<BR>" if $MSG;
        $MSG .= "Ready time is CRITICAL (" . $ready . "%)";
        $STATUS = 2;
    }
    elsif ( $ready >= $rwarn ) {
        $MSG .= "<BR>" if $MSG;
        $MSG .= "Ready time is WARNING (" . $ready . "%)";
        $STATUS = 1 if $STATUS < 2;
    }

    if ( !$vhost ) {    # check all vhosts
        for my $id (keys %lookup) {
            next if !defined $stats{"vhost-$id-cpu-used-pc"};
            my $label = "'" . base( $lookup{$id} ) . "'";
            $C = $stats{"vhost-$id-cpu-used-pc"};
            if ( $C >= $crit ) {
                $MSG .= "<BR>" if $MSG;
                $MSG .= "$label CPU CRITICAL ($C%)";
                $STATUS = 2;
            }
            elsif ( $C >= $warn ) {
                $MSG .= "<BR>" if $MSG;
                $MSG .= "$label CPU WARNING ($C%)";
                $STATUS = 1 if $STATUS < 2;
            }
            my $vready = $stats{"vhost-$id-cpu-ready-pc"};
            if ( $vready >= $rcrit ) {
                $MSG .= "<BR>" if $MSG;
                $MSG .= "$label Ready time CRITICAL (" . $vready . "%)";
                $STATUS = 2;
            }
            elsif ( $vready >= $rwarn ) {
                $MSG .= "<BR>" if $MSG;
                $MSG .= "$label Ready time WARNING (" . $vready . "%)";
                $STATUS = 1 if $STATUS < 2;
            }
        }
    }
    dooutput();    # exits
}

# Memory check via the agent extension.  When the extension is unavailable it
# falls back to readmem() and returns; otherwise it reports and EXITS.
sub readxmem {
    my ($pc, $max, $k);
    $MSG = "";
    $A   = 0;
    $B   = 0;

    if ( readagent() ) {
        print "(readagent failed: $MSG)\n" if $DEBUG;
        readmem() if !$MSG;    # no vmware agent, no error
        return;
    }

    if ($vhost) {
        if ( $esx_version <= 2 ) {
            $k = "vhost-$VMID";
        }
        else {    # for ESX Version 3
            my $number = _agent_vhost_number();
            if ( defined $number ) {
                # why this is off by one, we don't know, but it is
                $k = "unknown-" . ( $number - 1 );
            }
        }
        if ( defined $k ) {
            $A   = $stats{"$k-mem-active"};
            $B   = $stats{"$k-mem-max"};
            $max = $stats{"$k-mem-max"};
        }
        else {
            $A = $B = $max = undef;
        }
        if ( !defined $A or !defined $B ) {
            $A = $B = 'U';
            $MSG    = "Please wait, data being gathered.";
            $STATUS = 3;
            push @perf, "vhost_mem_act_pc=U%;;;0;100";
            push @perf, "vhost_mem_pvt_pc=U%;;;0;100";
            push @perf, "vhost_mem_shr_pc=U%;;;0;100";
            push @perf, "vhost_mem_bal_pc=U%;;;0;100";
            push @perf, "vhost_mem_swp_pc=U%;;;0;100";
            dooutput();    # exits
        }
        $pc  = $B ? int( $A / $B * 10000 ) / 100.0 : 0;    # avoid dying on a zero total
        $MSG = "Memory active: " . int( $A / 1024000 ) . "Mb ($pc%) [Total available " . int( $B / 1024000 ) . "Mb]";
        push @perf, "vhost_mem_act_pc=$pc%;;;0;100";
        if    ( $pc >= $crit ) { $STATUS = 2; $MSG = "CRIT: $MSG"; }
        elsif ( $pc >= $warn ) { $STATUS = 1; $MSG = "WARN: $MSG"; }
    }
    else {
        $k   = "allvms";
        $A   = $stats{'mem-free'};
        $B   = $stats{'mem-total'};
        $max = $stats{"$k-mem-max"};
        if ( !defined $A or !defined $B ) {
            $A = $B = 'U';
            $MSG    = "Please wait, data being gathered.";
            $STATUS = 3;
            push @perf, "mem_free_pc=U%;;;0;100";
            push @perf, "console_mem_pc=U%;;;0;100" if $esx_version == 3;
            push @perf, "allvms_mem_pvt_pc=U%;;;0;100";
            push @perf, "allvms_mem_shr_pc=U%;;;0;100";
            push @perf, "allvms_mem_bal_pc=U%;;;0;100";
            push @perf, "allvms_mem_swp_pc=U%;;;0;100";
            dooutput();    # exits
        }
        $pc  = $B ? int( $A / $B * 10000 ) / 100.0 : 0;    # avoid dying on a zero total
        $MSG = "Memory free: " . int( $A / 1024000 ) . "Mb ($pc%) [Total available " . int( $B / 1024000 ) . "Mb]";
        push @perf, "mem_free_pc=$pc%;;;0;100";
        if ( $esx_version == 3 ) {
            my $consolemempc = readconsolemempc();
            $MSG .= " [Console=$consolemempc%]";
            push @perf, "console_mem_pc=" . $consolemempc . "%;;;0;100";
        }
        if    ( $pc <= $crit ) { $STATUS = 2; $MSG = "CRIT: $MSG"; }
        elsif ( $pc <= $warn ) { $STATUS = 1; $MSG = "WARN: $MSG"; }
    }

    # MRTG
    dooutput() if $MODE;    # exits

    # Nagios
    if ($max) {
        # Share of $max taken by one memory class, as a 2-decimal percentage.
        my $share = sub { int( 10000 * $stats{"$k-mem-$_[0]"} / $max ) / 100.0 };
        my ($pvt, $shr, $bal, $swp) = map { $share->($_) } qw(private shared balloon swap);
        my $prefix = $vhost ? "vhost" : "allvms";

        $MSG .= "<BR>Memory split: pvt/shr/bal/swp = " . $pvt . "%/" . $shr . "%/" . $bal . "%/" . $swp . "%";
        push @perf, $prefix . "_mem_pvt_pc=" . $pvt . "%;;;0;100";
        push @perf, $prefix . "_mem_shr_pc=" . $shr . "%;;;0;100";
        push @perf, $prefix . "_mem_bal_pc=" . $bal . "%;;;0;100";
        push @perf, $prefix . "_mem_swp_pc=" . $swp . "%;;;0;100";

        if ( $stats{"$k-mem-balloon"} ) {
            $pc = int( 100000 * $stats{"$k-mem-balloon"} / $max ) / 1000.0;
            if ( $pc >= 25 ) {
                $MSG .= "<BR>CRIT: Balloon drivers in action! ($pc%)";
                $STATUS = 2;
            }
            elsif ( $pc >= 0.01 ) {
                $MSG .= "<BR>WARN: Balloon drivers in action! ($pc%)";
                $STATUS = 1 if $STATUS < 2;
            }
        }
    }

    my $swapin = $stats{"$k-swap-in-bps"};
    if ( $swapin and $swapin > 10 ) {
        # NOTE(review): with $SWAPINCRIT at its default of 2 the WARN branch
        # below cannot be reached (anything > 10 is also > 2) - confirm the
        # intended thresholds before changing them.
        if ( $swapin > $SWAPINCRIT ) {
            $MSG .= "<BR>CRIT: VMware swapping in action! (" . $swapin . "Bps)";
            $STATUS = 2;
        }
        else {
            $MSG .= "<BR>WARN: VMware swapping is starting!";
            $STATUS = 1 if $STATUS < 2;
        }
    }
    elsif ( $max and $stats{"$k-mem-swap"} ) {
        $pc = int( 100000 * $stats{"$k-mem-swap"} / $max ) / 1000.0;
        if ( $pc >= $SWAPPCCRIT ) {
            $MSG .= "<BR>CRIT: VMWare swap space in use! ($pc%)";
            $STATUS = 2;
        }
        elsif ( $pc >= 0.01 ) {
            $MSG .= "<BR>WARN: VMWare swap space in use! ($pc%)";
            $STATUS = 1 if $STATUS < 2;
        }
    }
    dooutput();    # exits
}

# True when the selected guest must be treated as down: no VM id on ESX 2,
# or a guest state other than "running" on ESX 3.
sub _vhost_is_down {
    return ( $esx_version == 2 && $VMID < 0 )
        || ( $esx_version == 3 && $vmGuestState ne "running" );
}

# Scale a bytes/sec rate for display; returns (value, unit prefix).
sub _scale_rate {
    my ($rate) = @_;
    my $unit = "";
    # K is 1024, so M is 1024*1024 for consistency (it used to be 1024*1000).
    if    ( $rate >= 1024 * 1024 ) { $rate /= 1024 * 1024; $unit = 'M'; }
    elsif ( $rate >= 1024 )        { $rate /= 1024;        $unit = 'K'; }
    return ( int( $rate * 100 ) / 100, $unit );
}

# Turn a -w/-c threshold into a plain number: "n%" of $B (MEM checks only),
# "nM" as n*1024, otherwise the first number found; unchanged if none.
sub _resolve_threshold {
    my ($value) = @_;
    return $B * $1 / 100.0 if $opt_l =~ /MEM/i and $value =~ /([\d\.]+)%/;
    return $1 * 1024       if $value =~ /([\d\.]+)M/i;
    return $1              if $value =~ /([\d\.]+)/;
    return $value;
}

###########################################################################
# Main: parse options, then dispatch on the -l check.  Remember that
# dooutput() never returns.

getopts('hrdNMVH:c:t:v:w:C:l:i:R:');
$hostname = $opt_H if $opt_H;
$vhost    = $opt_v if $opt_v;
# Use defined() so that an explicit 0 is honoured: help documents "-c 0"
# and a retries range of [0..20].
$warn      = $opt_w if defined $opt_w;
$crit      = $opt_c if defined $opt_c;
$TIMEOUT   = $opt_t if $opt_t;
$RETRIES   = $opt_R if defined $opt_R;
$MODE      = 1      if $opt_M;
$community = $opt_C if $opt_C;
$DEBUG     = 1      if $opt_d;

dohelp() if $opt_h;

if ($opt_V) {
    $MSG = "$0 $Version";
    dooutput();
}
if ( !$hostname ) {
    $MSG = "No ESX server hostname specified with -H";
    dooutput();
}
$opt_l = "LIST" if !$opt_l;    # default check

getesxversion();

if ( $opt_l =~ /LISTNET/i ) {
    getvmid();
    $MSG = "";
    readnet();
    if ( !$MSG ) {
        # List each interface name once.  An exact-match set replaces the
        # old substring regex test, which dropped e.g. "vmnic1" once
        # "vmnic10" had been listed and broke on regex metacharacters.
        my %seen;
        for my $index (keys %tmpnet) {
            next unless !$vhost or $VMID eq $tmpnet{$index}[0];
            my $nic = $tmpnet{$index}[1];
            next if !defined $nic or $seen{$nic}++;
            $MSG .= ', ' if $MSG;
            $MSG .= $nic;
        }
        $STATUS = $OK;
    }
    dooutput();
}

if ( $opt_l =~ /LIST/i ) {
    listvm();
    if    ( $warn =~ /(\d+)%/ ) { $warn = $B * $1 / 100; }
    elsif ( $warn < 0 )         { $warn = $B - 1; }
    if    ( $crit =~ /(\d+)%/ ) { $crit = $B * $1 / 100; }
    elsif ( $crit < 0 )         { $crit = 0; }
    $STATUS = $WARNING  if $A <= $warn;    # If SOME are down
    $STATUS = $CRITICAL if $A <= $crit;    # If NONE are up
    $STATUS = $OK       if !$B;            # No guests at all
    dooutput();
}

if ( $opt_l !~ /NET|CPU|MEM|STAT/i ) {
    $MSG = "Bad command $opt_l!";
    dooutput();
}
# /i so that lower-case check names get the same sanity check.
if ( $opt_l =~ /MEM|CPU|NET/i and !$MODE and ( $crit < 0 or $warn < 0 ) ) {
    $MSG = "Invalid warn/critical thresholds for '$opt_l' (need -w and -c)";
    dooutput();
}

# Now, we have host, vhost, community, and command
getvmid();    # also opens SNMP object

if ( $opt_l =~ /STAT/i ) {
    if ( !$vhost ) {
        $MSG = "No virtual hostname specified with -v";
        dooutput();
    }
    if ( _vhost_is_down() ) {
        $STATUS = $CRITICAL;
        ( $A, $B ) = ( 0, 0 );
        $MSG = "VHost $vhost is down or undefined.";
    }
    else {
        $STATUS = $OK;
        ( $A, $B ) = ( 1, 1 );
        $MSG = "VHost $vhost is up (ID: $VMID)";
    }
    push @perf, "vhost_up=$A;;;0;1";
    dooutput();
}

if ( $vhost and _vhost_is_down() ) {
    $STATUS = $CRITICAL;
    $MSG = "$vhost is not running." if !$MSG;
    # Fill in some dummy performance data anyway, to keep downstream
    # processes happy.
    if ( $opt_l =~ /CPU/i ) {
        push @perf, "vhost_cpu_used_pc=U%;;;0;100";
        push @perf, "vhost_cpu_ready_pc=U%;;;0;100";
    }
    if ( $opt_l =~ /MEM/i ) {
        push @perf, "vhost_mem_act_pc=U%;;;0;100";
        push @perf, "vhost_mem_pvt_pc=U%;;;0;100";
        push @perf, "vhost_mem_shr_pc=U%;;;0;100";
        push @perf, "vhost_mem_bal_pc=U%;;;0;100";
        push @perf, "vhost_mem_swp_pc=U%;;;0;100";
    }
    if ( $opt_l =~ /NET/i ) {
        push @perf, "vhost_net_read=U;;;0";
        push @perf, "vhost_net_write=U;;;0";
    }
    dooutput();
}

$STATUS = $OK;
if ( $opt_l =~ /CPU/i ) {
    $MSG = "";
    readxcpu();    # attempt to use extended MIB, else use VMWare MIB
}
elsif ( $opt_l =~ /NET/i ) {
    my ($t1, $t2, $a1, $b1);
    $opt_i = "" if !defined $opt_i;
    $vhost = "" if !defined $vhost;
    my $want_rate  = ( !$MODE or $opt_r );
    my $perf_scope = $vhost ? "vhost" : "allvms";

    if ($want_rate) {
        readstate();
        $t1 = $states{"$hostname-NET-$vhost-$opt_i-time"};
        $a1 = $states{"$hostname-NET-$vhost-$opt_i-r"};
        $b1 = $states{"$hostname-NET-$vhost-$opt_i-w"};
        $t2 = time;
    }
    $MSG = "";
    readnet();

    if ( !$MSG ) {    # IE, no errors
        $MSG = "Network counters Read=$A Write=$B";
        $MSG .= " on $vhost" if $vhost;
        if ($opt_i) {
            $MSG .= $vhost ? '/' : ' on ';
            $MSG .= $opt_i;
        }

        if ($want_rate) {
            writestate(
                "$hostname-NET-$vhost-$opt_i-r"    => $A,
                "$hostname-NET-$vhost-$opt_i-w"    => $B,
                "$hostname-NET-$vhost-$opt_i-time" => $t2,
            ) if !$t1 or ( $t2 - $t1 ) > 30;

            if ( !$t1 or ( !$a1 and !$b1 ) or ( $t1 >= $t2 ) or ( ( $t2 - $t1 ) > 3600 ) ) {
                $MSG = "No saved state available yet - wait for next poll.";
                $A = $B = "U";
                $STATUS = $UNKNOWN;
                push @perf, $perf_scope . "_net_read=U;;;0";
                push @perf, $perf_scope . "_net_write=U;;;0";
            }
            else {
                $A = ( $A - $a1 ) / ( $t2 - $t1 );
                $B = ( $B - $b1 ) / ( $t2 - $t1 );
                ( $fa, $sa ) = _scale_rate($A);
                ( $fb, $sb ) = _scale_rate($B);
                $MSG = "Network traffic $fa " . $sa . "B/s read, $fb " . $sb . "B/s write ";
                $MSG .= "on $vhost" if $vhost;
                if ($opt_i) {
                    $MSG .= $vhost ? '/' : 'on ';
                    $MSG .= $opt_i;
                }
                $MSG .= " (" . ( $t2 - $t1 ) . "s average)";
                push @perf, $perf_scope . "_net_read=" .  ( int( 100 * $A ) / 100.0 ) . ";;;0";
                push @perf, $perf_scope . "_net_write=" . ( int( 100 * $B ) / 100.0 ) . ";;;0";
            }
        }
    }
}
elsif ( $opt_l =~ /MEM/i ) {
    $MSG = "";
    readxmem();    # only returns when it fell back to readmem()
    if ( !$MSG ) {
        my $pc = $B ? int( $A / $B * 10000.0 ) / 100.0 : 0;    # avoid dying on a zero total
        my ( $av,  $sfx )  = ( $A, "Kb" );
        my ( $tot, $tsfx ) = ( $B, "Kb" );
        if ( $av > 2047 )  { $av  = int( $av / 10.24 ) / 100.0;  $sfx  = "Mb"; }
        if ( $tot > 2047 ) { $tot = int( $tot / 10.24 ) / 100.0; $tsfx = "Mb"; }
        $av  .= $sfx;
        $tot .= $tsfx;
        $MSG = "Memory free: $av ($pc%) [Total available $tot]";
        $MSG .= " on vhost $vhost" if $vhost;
    }
}
else {
    $MSG    = "Invalid command $opt_l";
    $STATUS = $UNKNOWN;
}

if ( !$MODE and $STATUS == $OK ) {
    # Set Nagios thresholds
    $warn = _resolve_threshold($warn);
    $crit = _resolve_threshold($crit);

    if ( $opt_l =~ /MEM/i ) {
        $STATUS = $WARNING  if $A <= $warn;
        $STATUS = $CRITICAL if $A <= $crit;
    }
    elsif ( $opt_l =~ /CPU/i ) {
        $STATUS = $WARNING  if ( $A + $B ) >= $warn;
        $STATUS = $CRITICAL if ( $A + $B ) >= $crit;
    }
    elsif ( $opt_l =~ /NET/i ) {
        $STATUS = $WARNING  if $A >= $warn;
        $STATUS = $WARNING  if $B >= $warn;
        $STATUS = $CRITICAL if $A >= $crit;
        $STATUS = $CRITICAL if $B >= $crit;
    }
    else {
        $STATUS = $WARNING  if $A <= $warn;
        $STATUS = $CRITICAL if $A <= $crit;
    }
}

$snmp->close if $snmp;
dooutput();