- use another default tmpdir
author Zdeněk Šustr <sustr4@cesnet.cz>
Thu, 25 Aug 2011 08:17:32 +0000 (08:17 +0000)
committer Zdeněk Šustr <sustr4@cesnet.cz>
Thu, 25 Aug 2011 08:17:32 +0000 (08:17 +0000)
- tmpdir configurable
- timeout also governs the notification waiting cycle
- output modifications

org.glite.lb.nagios/Makefile
org.glite.lb.nagios/src/LB-probe

index 1ce5411..bcb127a 100644 (file)
@@ -9,6 +9,7 @@ INSTALL=install
 -include ${top_srcdir}/project/version.properties
 
 install:
+       mkdir -p ${DESTDIR}${PREFIX}/var/lib/grid-monitoring/emi.lb
        mkdir -p ${DESTDIR}${PREFIX}${prefix}/libexec/grid-monitoring/probes/emi.lb
        ${INSTALL} -m 0755 src/LB-probe ${DESTDIR}${PREFIX}${prefix}/libexec/grid-monitoring/probes/emi.lb
 
index 56d15d6..5bd11a6 100755 (executable)
@@ -24,10 +24,11 @@ cat << EndHelpHeader
 Nagios probe for testing the status of L&B
 
 Tests called:
-    job registration
-    notification registration
-    logging events
-    receiving notifications
+    1. Register job
+    2. Register to receive notifications 
+    3. Log events
+    4. Check job state
+    5. Receive notifications
 
 Return values:
     0: Passed
@@ -36,21 +37,21 @@ Return values:
     3: Unknown
 
 Console output:
-    OK|<time to transit from Registered to Cleared>
+    OK|<time the test job took to transit from Registered to Cleared>
     WARNING: <reason>
     DOWN: <reason>
     UNKNOWN: <reason>
 
 EndHelpHeader
 
-       echo "Usage: $progname [-h] [-v[v[v]]] [-H server] [-p port] [server[:port]]"
+       echo "Usage: $progname [-h] [-v[v[v]]] {[-H server] [-p port] | [server[:port]]} [-t <timeout>] [-T <tmpdir>]"
        echo "Options:"
-       echo "    -h | --help       Show this help message."
-       echo "    -v[vv]            Verbosity level."
-       echo "    -H <server>       server"
-       echo "    -p <server>       port"
-       echo "    <server>          Environmental variables are used if unspecified"
-       echo "    <port>            Environmental variables or defaults are used if unspecified)"
+       echo "    -h | --help       Show this help message"
+       echo "    -v[vv]            Verbosity level"
+       echo "    -H <server>       server (Environmental variables are used if unspecified)"
+       echo "    -p <server>       port (Environmental variables or defaults are used if unspecified)"
+       echo "    -t <timeout>      Probe timeout in seconds"
+       echo "    -T <tmpdir>       Temporary directory (default /var/lib/grid-monitoring/emi.lb)"
        echo ""
 }
 
@@ -123,23 +124,27 @@ function log_cleared()
 
 VERBLEVEL=0
 TIMEOUT=0
+TMPDIR="/var/lib/grid-monitoring/emi.lb"
 
 while test -n "$1"
 do
        case "$1" in
                "-h" | "--help") showHelp && exit 2 ;;
-               "-v" )  VERBLEVEL=$(( $VERBLEVEL + 1 )) ;;
+               "-v" | "--verbose")  VERBLEVEL=$(( $VERBLEVEL + 1 )) ;;
                "-vv" )  VERBLEVEL=$(( $VERBLEVEL + 2 )) ;;
                "-vvv" )  VERBLEVEL=$(( $VERBLEVEL + 3 )) ;;
-               "-H" ) shift && SRVPORT="${1}$SRVPORT" ;;
-               "-p" ) shift && SRVPORT="$SRVPORT:${1}" ;;
-               "-t" ) shift && TIMEOUT=$1 ;;
+               "-H" | "--hostname") shift && SRVPORT="${1}$SRVPORT" ;;
+               "-p" | "--port") shift && SRVPORT="$SRVPORT:${1}" ;;
+               "-t" | "--timeout") shift && TIMEOUT=$1 ;;
+               "-T" | "--tmpdir") shift && TMPDIR=$1 ;;
                *) SRVPORT="$1" ;;
-#              "-t" | "--text")  setOutputASCII ;;
        esac
        shift
 done
 
+export VERBLEVEL
+
+# Arrange timeout
 if [ $TIMEOUT -gt 0 ]; then
 mypid=$$
 (trap 'exit' TERM; sleep $TIMEOUT; vprintf 0 "UNKNOWN: Probe timed out\n"; kill -s SIGINT $mypid; exit 3)&
@@ -149,8 +154,6 @@ trap 'exit 3' INT
 
 fi
 
-export VERBLEVEL
-
 #Set path to L&B example commands used by the probe
 for exdir in /usr/lib64/glite-lb/examples /usr/lib/glite-lb/examples /opt/glite/examples
 do
@@ -202,6 +205,24 @@ if [ ! -z $SRVPORT ]; then
        export GLITE_LB_SERVER_PORT=$portnumber
 fi
 
+#Check if tmpdir writable
+touch $TMPDIR/$$_probecheck > /dev/null 2> /dev/null
+if [ -f $TMPDIR/$$_probecheck ]; then
+       rm $TMPDIR/$$_probecheck
+else
+       vprintf 1 "[LB Probe] Could not write to $TMPDIR. Falling back to /tmp.\n"
+       TMPDIR="/tmp"
+       touch $TMPDIR/$$_probecheck > /dev/null 2> /dev/null
+       if [ -f $TMPDIR/$$_probecheck ]; then
+               rm $TMPDIR/$$_probecheck
+       else
+               vprintf 1 "[LB Probe] Could not write to $TMPDIR.\n"
+               vprintf 0 "UNKNOWN: Probe could not write temporary files\n"
+               exit 3
+       fi
+fi
+
+
 if [ $VERBLEVEL -ge 3 ]; then
        env | grep -E "GLITE_|PATH"
        printf "*** $servername:$portnumber\n"
@@ -232,12 +253,12 @@ timeleft=`grid-proxy-info | grep -E "^timeleft" | sed "s/timeleft\s*:\s//"`
 
 if [ "$timeleft" = "" ]; then
         vprintf 1 "\n[LB Probe] Test failed -- No credentials\n\n"
-       vprintf 0 "UNKNOWN: NO CREDENTIALS\n"
+       vprintf 0 "UNKNOWN: No Credentials\n"
        exit 3
 else
         if [ "$timeleft" = "0:00:00" ]; then
                 vprintf 1 "\n[LB Probe] Test failed -- Credentials expired\n\n"
-               vprintf 0 "UNKNOWN: CREDENTIALS EXPIRED\n"
+               vprintf 0 "UNKNOWN: Credentials Expired\n"
                exit 3
         else
                
@@ -245,7 +266,7 @@ else
                serverversion=`glite-lb-ws_getversion -m $servername:$wsportnumber`
                if [ -z "$serverversion" ]; then
                        vprintf 1 "\n[LB Probe] Test failed -- server did not respond\n\n"
-                       vprintf 0 "DOWN: UNABLE TO GET SERVER VERSION\n"
+                       vprintf 0 "DOWN: Unable to Get Server Version\n"
                        exit 2
                else
                        echo $serverversion | grep -E "version.*[0-9]+\.[0-9]+\.[0-9]+" > /dev/null
@@ -264,7 +285,7 @@ else
 
                if [ -z $jobid ]; then
                        vprintf 1 " Failed to register job\n[LB Probe] Test failed \n\n"
-                       vprintf 0 "DOWN: JOB REGISTRATION FAILED LOCALLY\n"
+                       vprintf 0 "DOWN: Job Registration Failed Locally\n"
                        exit 2
                else
                        vprintf 2 "${jobid}"
@@ -274,7 +295,7 @@ else
                                vprintf 2 ", server side OK"
                         else
                                vprintf 1 "\n[LB Probe] Test failed -- Job has not been submitted to server\n\n"
-                               vprintf 0 "DOWN: L&B SERVER NOT RUNNING\n"
+                               vprintf 0 "DOWN: L&B Server Not Running\n"
                                exit 2
                         fi
                fi
@@ -287,7 +308,7 @@ else
 
                if [ -z $notifid ]; then
                        vprintf 1 "\n[LB Probe] Test failed -- Failed to register notification\n\n"
-                       vprintf 0 "DOWN: L&B SERVER NOT RUNNING\n"
+                       vprintf 0 "DOWN: L&B Server Not Running\n"
                        exit 2
                else
                        vprintf 2 "${notifid}"
@@ -298,10 +319,17 @@ else
                        NOTIFFILE="/tmp/$$_notifications.txt"           
                        STATEFILE="/tmp/$$_jobstat.txt"         
                        echo '' > $NOTIFFILE
-       
-                       TOREPS=4; #Repetitions before timeout
-                       CLRNOTIFIED=0;
+
                        vprintf 1 "\n[LB Probe] Waiting for delivery/processing"
+                       if [ $TIMEOUT -gt 0 ]; then
+                               #Assume about 3/4 of the timeout may be used to wait for messages
+                               TOREDUCED=`expr $TIMEOUT \* 3 / 4`
+                               TOREPS=`expr $TOREDUCED  / 5`
+                               vprintf 2 " (split $TOREDUCED-s span into $TOREPS wait cycles)"
+                       else    
+                               TOREPS=4; #Repetitions before timeout
+                       fi
+                       CLRNOTIFIED=0;
                        while [ $CLRNOTIFIED -eq 0 -a $TOREPS -gt 0 ]
                        do
                                glite-lb-notify receive -i 5 ${notifid} >> $NOTIFFILE 2> /dev/null 
@@ -316,7 +344,7 @@ else
                        vprintf 1 "\n[LB Probe] Checking job state"
                        if [ "${jobstate}" = "Submitted" ]; then
                                vprintf 1 "\n[LB Probe] Test failed -- Job state has not changed (${jobstate})\n\n"
-                               vprintf 0 "DOWN: EVENT DELIVERY CHAIN (LOGGER/INTERLOGGER) NOT RUNNING\n"
+                               vprintf 0 "DOWN: Event Delivery Chain (Logger/Interlogger) Not Running\n"
                                rm $NOTIFFILE $STATEFILE
                                exit 2
                         else
@@ -337,7 +365,7 @@ else
                                vprintf 2 ", OK ($NOTIFS messages)"
                        else
                                vprintf 1 "\n[LB Probe] Test failed -- Notifications were not delivered\n\n"
-                               vprintf 0 "DOWN: NOTIFICATION INTERLOGGER NOT RUNNING\n"
+                               vprintf 0 "DOWN: Notification Interlogger Not Running\n"
                                rm $NOTIFFILE $STATEFILE
                                exit 2
                        fi