Nagios probe for testing the status of L&B
Tests called:
- job registration
- notification registration
- logging events
- receiving notifications
+ 1. Register job
+ 2. Register to receive notifications
+ 3. Log events
+ 4. Check job state
+ 5. Receive notifications
Return values:
0: Passed
3: Unknown
Console output:
- OK|<time to transit from Registered to Cleared>
+ OK|<time the test job took to transit from Registered to Cleared>
WARNING: <reason>
DOWN: <reason>
UNKNOWN: <reason>
EndHelpHeader
- echo "Usage: $progname [-h] [-v[v[v]]] [-H server] [-p port] [server[:port]]"
+ echo "Usage: $progname [-h] [-v[v[v]]] {[-H server] [-p port] | [server[:port]]} [-t <timeout>] [-T <tmpdir>]"
echo "Options:"
- echo " -h | --help Show this help message."
- echo " -v[vv] Verbosity level."
- echo " -H <server> server"
- echo " -p <server> port"
- echo " <server> Environmental variables are used if unspecified"
- echo " <port> Environmental variables or defaults are used if unspecified)"
+ echo " -h | --help Show this help message"
+ echo " -v[vv] Verbosity level"
+ echo " -H <server> server (Environment variables are used if unspecified)"
+ echo " -p <port> port (Environment variables or defaults are used if unspecified)"
+ echo " -t <timeout> Probe timeout in seconds"
+ echo " -T <tmpdir> Temporary directory (default /var/lib/grid-monitoring/emi.lb)"
echo ""
}
VERBLEVEL=0
TIMEOUT=0
+TMPDIR="/var/lib/grid-monitoring/emi.lb"
while test -n "$1"
do
case "$1" in
"-h" | "--help") showHelp && exit 2 ;;
- "-v" ) VERBLEVEL=$(( $VERBLEVEL + 1 )) ;;
+ "-v" | "--verbose") VERBLEVEL=$(( $VERBLEVEL + 1 )) ;;
"-vv" ) VERBLEVEL=$(( $VERBLEVEL + 2 )) ;;
"-vvv" ) VERBLEVEL=$(( $VERBLEVEL + 3 )) ;;
- "-H" ) shift && SRVPORT="${1}$SRVPORT" ;;
- "-p" ) shift && SRVPORT="$SRVPORT:${1}" ;;
- "-t" ) shift && TIMEOUT=$1 ;;
+ "-H" | "--hostname") shift && SRVPORT="${1}$SRVPORT" ;;
+ "-p" | "--port") shift && SRVPORT="$SRVPORT:${1}" ;;
+ "-t" | "--timeout") shift && TIMEOUT=$1 ;;
+ "-T" | "--tmpdir") shift && TMPDIR=$1 ;;
*) SRVPORT="$1" ;;
-# "-t" | "--text") setOutputASCII ;;
esac
shift
done
+export VERBLEVEL
+
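+# Arrange timeout: when a positive timeout was requested, a background
+# subshell sleeps for $TIMEOUT seconds, then prints the UNKNOWN timeout
+# message and signals the main probe process.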
if [ $TIMEOUT -gt 0 ]; then
mypid=$$
(trap 'exit' TERM; sleep $TIMEOUT; vprintf 0 "UNKNOWN: Probe timed out\n"; kill -s SIGINT $mypid; exit 3)&
fi
-export VERBLEVEL
-
#Set path to L&B example commands used by the probe
for exdir in /usr/lib64/glite-lb/examples /usr/lib/glite-lb/examples /opt/glite/examples
do
export GLITE_LB_SERVER_PORT=$portnumber
fi
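+#Check that the temporary directory is writable by creating and removing a
+#scratch file; fall back to /tmp, and exit 3 (UNKNOWN) if /tmp is not
+#writable either.
+#NOTE(review): the NOTIFFILE/STATEFILE assignments visible in this diff still
+#point at /tmp -- confirm they are meant to use $TMPDIR.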
+touch $TMPDIR/$$_probecheck > /dev/null 2> /dev/null
+if [ -f $TMPDIR/$$_probecheck ]; then
+ rm $TMPDIR/$$_probecheck
+else
+ vprintf 1 "[LB Probe] Could not write to $TMPDIR. Falling back to /tmp.\n"
+ TMPDIR="/tmp"
+ touch $TMPDIR/$$_probecheck > /dev/null 2> /dev/null
+ if [ -f $TMPDIR/$$_probecheck ]; then
+ rm $TMPDIR/$$_probecheck
+ else
+ vprintf 1 "[LB Probe] Could not write to $TMPDIR.\n"
+ vprintf 0 "UNKNOWN: Probe could not write temporary files\n"
+ exit 3
+ fi
+fi
+
+
if [ $VERBLEVEL -ge 3 ]; then
env | grep -E "GLITE_|PATH"
printf "*** $servername:$portnumber\n"
if [ "$timeleft" = "" ]; then
vprintf 1 "\n[LB Probe] Test failed -- No credentials\n\n"
- vprintf 0 "UNKNOWN: NO CREDENTIALS\n"
+ vprintf 0 "UNKNOWN: No Credentials\n"
exit 3
else
if [ "$timeleft" = "0:00:00" ]; then
vprintf 1 "\n[LB Probe] Test failed -- Credentials expired\n\n"
- vprintf 0 "UNKNOWN: CREDENTIALS EXPIRED\n"
+ vprintf 0 "UNKNOWN: Credentials Expired\n"
exit 3
else
serverversion=`glite-lb-ws_getversion -m $servername:$wsportnumber`
if [ -z "$serverversion" ]; then
vprintf 1 "\n[LB Probe] Test failed -- server did not respond\n\n"
- vprintf 0 "DOWN: UNABLE TO GET SERVER VERSION\n"
+ vprintf 0 "DOWN: Unable to Get Server Version\n"
exit 2
else
echo $serverversion | grep -E "version.*[0-9]+\.[0-9]+\.[0-9]+" > /dev/null
if [ -z $jobid ]; then
vprintf 1 " Failed to register job\n[LB Probe] Test failed \n\n"
- vprintf 0 "DOWN: JOB REGISTRATION FAILED LOCALLY\n"
+ vprintf 0 "DOWN: Job Registration Failed Locally\n"
exit 2
else
vprintf 2 "${jobid}"
vprintf 2 ", server side OK"
else
vprintf 1 "\n[LB Probe] Test failed -- Job has not been submitted to server\n\n"
- vprintf 0 "DOWN: L&B SERVER NOT RUNNING\n"
+ vprintf 0 "DOWN: L&B Server Not Running\n"
exit 2
fi
fi
if [ -z $notifid ]; then
vprintf 1 "\n[LB Probe] Test failed -- Failed to register notification\n\n"
- vprintf 0 "DOWN: L&B SERVER NOT RUNNING\n"
+ vprintf 0 "DOWN: L&B Server Not Running\n"
exit 2
else
vprintf 2 "${notifid}"
NOTIFFILE="/tmp/$$_notifications.txt"
STATEFILE="/tmp/$$_jobstat.txt"
echo '' > $NOTIFFILE
-
- TOREPS=4; #Repetitions before timeout
- CLRNOTIFIED=0;
+
vprintf 1 "\n[LB Probe] Waiting for delivery/processing"
+ if [ $TIMEOUT -gt 0 ]; then
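+ #Assume about 3/4 of the timeout may be used to wait for messages, and
+ #divide that span into wait cycles (presumably 5 s each, matching the
+ #-i 5 passed to glite-lb-notify receive; integer division).
+ #NOTE(review): a timeout below 7 s yields TOREPS=0, so the wait loop
+ #below is skipped entirely -- confirm this is intended.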
+ TOREDUCED=`expr $TIMEOUT \* 3 / 4`
+ TOREPS=`expr $TOREDUCED / 5`
+ vprintf 2 " (split $TOREDUCED-s span into $TOREPS wait cycles)"
+ else
+ TOREPS=4; #Repetitions before timeout
+ fi
+ CLRNOTIFIED=0;
while [ $CLRNOTIFIED -eq 0 -a $TOREPS -gt 0 ]
do
glite-lb-notify receive -i 5 ${notifid} >> $NOTIFFILE 2> /dev/null
vprintf 1 "\n[LB Probe] Checking job state"
if [ "${jobstate}" = "Submitted" ]; then
vprintf 1 "\n[LB Probe] Test failed -- Job state has not changed (${jobstate})\n\n"
- vprintf 0 "DOWN: EVENT DELIVERY CHAIN (LOGGER/INTERLOGGER) NOT RUNNING\n"
+ vprintf 0 "DOWN: Event Delivery Chain (Logger/Interlogger) Not Running\n"
rm $NOTIFFILE $STATEFILE
exit 2
else
vprintf 2 ", OK ($NOTIFS messages)"
else
vprintf 1 "\n[LB Probe] Test failed -- Notifications were not delivered\n\n"
- vprintf 0 "DOWN: NOTIFICATION INTERLOGGER NOT RUNNING\n"
+ vprintf 0 "DOWN: Notification Interlogger Not Running\n"
rm $NOTIFFILE $STATEFILE
exit 2
fi