#!/usr/bin/ksh
#
# Nagios check script for Sun Cluster.
# Written by Thomas Sluyter (nagios@kilala.nl)
# By request of KPN-IS, i-Provide SYS, the Netherlands
# Last Modified: 25-09-2006
#
# Usage: ./check_suncluster [-t, -q, -g, -G resource-group, -r, -R resource, -i]
#
# Description:
#   This script is capable of performing a number of basic checks on a
#   system running Sun Cluster. Depending on the parameter you pass to
#   it, it will check:
#   * Transport paths (-t).
#   * Quorum (-q).
#   * Resource groups (-g).
#   * One selected resource group (-G).
#   * Resources (-r).
#   * One selected resource (-R).
#   * IPMP groups (-i).
#
# Limitations:
#   This script will only work with Korn shell, due to some funky while
#   looping with pipe forking. Bash doesn't handle this very gracefully,
#   due to its sub-shell variable scoping. Maybe I really should learn
#   to program in Perl.
#
# Output:
#   * Transport paths return a WARN when one of the paths is down and a
#     CRIT when all paths are offline.
#   * Quorum returns a WARN when not all, but enough quorum devices are
#     available. It returns a CRIT when quorum cannot be reached.
#   * Resource groups returns a CRIT when a group is offline on all nodes
#     and a WARN if a group is in an unstable state.
#   * Resources returns a CRIT when a resource is offline on all nodes
#     and a WARN if a resource is in an unstable state.
#   * IPMP groups returns a CRIT when a group is offline.
#
# Other notes:
#   Aside from the debugging output that I've built into most of my recent
#   scripts, this check script will also have a testing mode hacked on, as
#   a bag on the side. This testing mode is only engaged when the
#   test_check_suncluster script is being run and will intentionally "break"
#   a few things, to verify the failure options of this check script.
#
# Enabling the following dumps information into DEBUGFILE at various
# stages during the execution of this script.
DEBUG=0
# NOTE(review): DEBUGFILE is a fixed, predictable path in /tmp; only switch
# DEBUG on where that is acceptable.
DEBUGFILE="/tmp/foobar"

# Testing mode is switched on only when the marker file holds exactly "Nyo!".
# The comparison is quoted so an empty or multi-word file cannot break the test.
TESTING="0"
if [ -f /tmp/neko-wa-baka ] && [ "$(cat /tmp/neko-wa-baka)" = "Nyo!" ]
then
    TESTING="1"
fi

### REQUISITE NAGIOS USER INTERFACE STUFF ###

# You may have to change this, depending on where you installed your
# Nagios plugins
PATH="/usr/bin:/usr/sbin:/bin:/sbin:/usr/cluster/bin"
LIBEXEC="/usr/local/nagios/libexec"
PROGNAME="check_suncluster"

# The STATE_* exit codes used below are expected to come from utils.sh.
# Fall back to the standard Nagios plugin values if that file is missing or
# did not set them, so every "exit" still carries a meaningful status.
if [ -r "$LIBEXEC/utils.sh" ]
then
    . "$LIBEXEC/utils.sh"
fi
: "${STATE_OK:=0}" "${STATE_WARNING:=1}" "${STATE_CRITICAL:=2}" "${STATE_UNKNOWN:=3}"

# Start every debug run with a fresh file; -f keeps this quiet when the file
# does not exist yet.
[ "$DEBUG" -gt 0 ] && rm -f "$DEBUGFILE"

# debug MESSAGE...
#   Append MESSAGE to DEBUGFILE when DEBUG is greater than zero; otherwise
#   do nothing.
debug() {
    if [ "$DEBUG" -gt 0 ]
    then
        echo "$*" >> "$DEBUGFILE"
    fi
}

# Print the one-line usage summary.
print_usage() {
    echo "Usage: $PROGNAME [-t, -q, -g, -G resource-group, -r, -R resource, -i]"
    echo "Usage: $PROGNAME --help"
}

# Print the full help text, including the usage summary.
print_help() {
    echo ""
    print_usage
    echo ""
    echo "Sun Cluster check plugin for Nagios"
    echo ""
    echo "-t: check transport paths"
    echo "-q: check quorum"
    echo "-g: check resource groups"
    echo "-G: check one individual resource group"
    echo "-r: check all resources"
    echo "-R: check one individual resources"
    echo "-i: check IPMP groups"
    echo ""
    echo "This plugin not developped by the Nagios Plugin group."
    echo "Please do not e-mail them for support on this plugin, since"
    echo "they won't know what you're talking about :P"
    echo ""
    echo "For contact info, read the plugin itself..."
}

### SUB-ROUTINE DEFINITIONS ###

# All loops below read from here-documents instead of pipelines. A loop fed
# by a here-document runs in the current shell, so counters survive the loop
# and "exit" really ends the script, independent of how the shell forks
# pipeline stages.

# list_names LABEL
#   Print the second field of every row in the SCSTAT_G snapshot whose first
#   field equals LABEL (for example "Group:" or "Resource:").
function list_names {
    typeset want_label="$1"
    typeset label name rest

    while read -r label name rest
    do
        if [ "$label" = "$want_label" ] && [ -n "$name" ]
        then
            echo "$name"
        fi
    done <<EOF
$SCSTAT_G
EOF
}

# check_unit LABEL KIND NAME
#   Shared verdict for one resource group or one resource. Looks at the rows
#   of the SCSTAT_G snapshot whose first field is LABEL and whose second
#   field is exactly NAME, and inspects the fourth field of each such row.
#   Exits CRITICAL when no row is "Online", exits WARNING when more than one
#   row is neither "Online" nor "Offline", and returns otherwise.
#   KIND is the wording used in the messages ("Resource group", "Resource").
function check_unit {
    typeset want_label="$1" kind="$2" want_name="$3"
    typeset label name node state rest
    typeset online=0 weird=0

    while read -r label name node state rest
    do
        # Exact comparison of label and name: a name that is merely a prefix
        # or substring of another one is not counted.
        [ "$label" = "$want_label" ] || continue
        [ "$name" = "$want_name" ] || continue
        case "$state" in
        *Online*)  online=$((online + 1)) ;;
        *Offline*) ;;
        *)         weird=$((weird + 1)) ;;
        esac
    done <<EOF
$SCSTAT_G
EOF

    debug "$kind $want_name has $online instances online."
    debug "$kind $want_name has $weird instances in a weird state."

    # Testing mode pretends nothing is online to exercise the CRITICAL path.
    if [ "$TESTING" -gt 0 ]
    then
        online=0
    fi

    if [ "$online" -lt 1 ]
    then
        echo "NOK - $kind $want_name not online."
        exit "$STATE_CRITICAL"
    fi

    # NOTE(review): a single instance in an unusual state does not warn;
    # the threshold of "more than one" is kept as-is. Confirm whether
    # "-gt 0" was intended.
    if [ "$weird" -gt 1 ]
    then
        echo "NOK - $kind $want_name is in an unstable state."
        exit "$STATE_WARNING"
    fi
}

# Check the cluster transport paths reported by "scstat -W".
# Exits CRITICAL when no path is online, WARNING when some but not all are.
function check_transport_paths {
    typeset paths endpoint state
    typeset total=0 online=0

    debug "Starting check_transport_path subroutine."

    # Third and sixth field of every "Transport path:" row.
    paths=$(scstat -W | grep "Transport path:" | awk '{print $3" "$6}')

    # The loop variable is deliberately not called PATH: reading into PATH
    # would overwrite the command search path of this shell.
    while read -r endpoint state
    do
        [ -n "$endpoint" ] || continue      # blank line when there is no output
        total=$((total + 1))
        debug "Before math, Count has the value of $online."
        if [ "$state" = "online" ]
        then
            online=$((online + 1))
        fi
        debug "Path: $endpoint has status $state"
        debug "Count: $online online transport paths."
    done <<EOF
$paths
EOF

    debug "Count: Outside the loop it has a value of $online."

    # Testing mode pretends no path is online.
    if [ "$TESTING" -gt 0 ]
    then
        online=0
    fi

    if [ "$online" -lt 1 ]
    then
        echo "NOK - No transport paths online."
        exit "$STATE_CRITICAL"
    elif [ "$online" -lt "$total" ]
    then
        echo "NOK - One or more transport paths offline."
        exit "$STATE_WARNING"
    fi
}

# Check quorum as reported by "scstat -q".
# Exits CRITICAL when fewer votes are present than needed (or the numbers
# cannot be read), WARNING when quorum is met but a vote is not "Online".
function check_quorum {
    typeset need pres votes vote state

    debug "Starting check_quorum subroutine."

    need=$(scstat -q | grep "votes needed:" | awk '{print $4}')
    pres=$(scstat -q | grep "votes present:" | awk '{print $4}')

    debug "Quorum needed: $need"
    debug "Quorum present: $pres"

    # Testing mode pretends no votes are present.
    if [ "$TESTING" -gt 0 ]
    then
        pres="0"
    fi

    # Empty values are rejected explicitly so the outcome does not depend on
    # how the shell's numeric test treats an empty string.
    if [ -n "$need" ] && [ -n "$pres" ] && [ "$pres" -ge "$need" ]
    then
        debug "Enough quorum votes."
        # Third and sixth field of every "votes:" row.
        votes=$(scstat -q | grep "votes:" | awk '{print $3" "$6}')
        while read -r vote state
        do
            [ -n "$vote" ] || continue
            debug "Vote: $vote has status $state."
            if [ "$state" != "Online" ]
            then
                echo "NOK - Quorum vote $vote not available."
                exit "$STATE_WARNING"
            fi
        done <<EOF
$votes
EOF
    else
        debug "Not enough quorum."
        echo "NOK - Not enough quorum votes present."
        exit "$STATE_CRITICAL"
    fi
}

# Check every resource group listed by "scstat -g".
function check_resource_groups {
    typeset group

    debug "Starting check_resource_groups subroutine."

    # One snapshot serves the whole check instead of re-running scstat for
    # every group.
    SCSTAT_G=$(scstat -g)

    for group in $(list_names "Group:" | sort -u)
    do
        check_unit "Group:" "Resource group" "$group"
    done
}

# Check the single resource group named in RGROUP (set from the -G argument).
function check_resource_grp {
    debug "Starting check_resource_grp subroutine."
    debug "Selected group: $RGROUP"

    # Without a name there is nothing to look up; that is a usage error,
    # not a cluster failure.
    if [ -z "$RGROUP" ]
    then
        print_usage
        exit "$STATE_UNKNOWN"
    fi

    SCSTAT_G=$(scstat -g)
    check_unit "Group:" "Resource group" "$RGROUP"
}

# Check every resource listed by "scstat -g".
function check_resources {
    typeset resources resource

    debug "Starting check_resources subroutine."

    SCSTAT_G=$(scstat -g)
    resources=$(list_names "Resource:" | sort -u)
    debug "List of resources to check: $resources"

    for resource in $resources
    do
        check_unit "Resource:" "Resource" "$resource"
    done
}

# Check the single resource named in RSRCE (set from the -R argument).
function check_rsrce {
    debug "Starting check_rsrce subroutine."
    debug "Selected resource: $RSRCE"

    # Without a name there is nothing to look up; that is a usage error,
    # not a cluster failure.
    if [ -z "$RSRCE" ]
    then
        print_usage
        exit "$STATE_UNKNOWN"
    fi

    SCSTAT_G=$(scstat -g)
    check_unit "Resource:" "Resource" "$RSRCE"
}

# Check the IPMP groups reported by "scstat -i".
# Exits CRITICAL on the first row whose status is not "Online".
function check_ipmp {
    typeset rows group state

    debug "Starting check_ipmp subroutine."

    # Third and fifth field of every "IPMP Group:" row.
    # NOTE(review): confirm that field 3 is the group name and not the node
    # name in the scstat -i layout used on the target systems.
    rows=$(scstat -i | grep "IPMP Group:" | awk '{print $3" "$5}')

    while read -r group state
    do
        [ -n "$group" ] || continue
        debug "IPMP Group: $group has status $state"
        if [ "$state" != "Online" ]
        then
            echo "NOK - IPMP group $group not online."
            exit "$STATE_CRITICAL"
        fi
        # Testing mode reports the first group as failed.
        if [ "$TESTING" -gt 0 ]
        then
            echo "NOK - IPMP group $group not online."
            exit "$STATE_CRITICAL"
        fi
    done <<EOF
$rows
EOF
}

### THE MAIN ROUTINE FINALLY STARTS ###

debug "Starting main routine."

if [ $# -lt 1 ]
then
    print_usage
    exit "$STATE_UNKNOWN"
fi

debug "More than one argument."
debug ""

case "$1" in
    --help | -h)
        print_help
        exit "$STATE_OK"
        ;;
    -t) check_transport_paths ;;
    -q) check_quorum ;;
    -g) check_resource_groups ;;
    -G) RGROUP="$2"; check_resource_grp ;;
    -r) check_resources ;;
    -R) RSRCE="$2"; check_rsrce ;;
    -i) check_ipmp ;;
    *)
        print_usage
        exit "$STATE_UNKNOWN"
        ;;
esac

debug "No problems. Exiting normally."

# None of the subroutines above exited with a problem state, so report OK.
echo "OK - Everything running like it should"
exit "$STATE_OK"