#!/bin/bash
#
# Copyright (c) Members of the EGEE Collaboration. 2004-2010.
# See http://www.eu-egee.org/partners for details on the copyright holders.
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# show help and usage
# Script name without its directory part, used in the usage line.
# Parameter expansion avoids forking basename and is safe for paths
# containing whitespace.
progname=${0##*/}

# Print the probe description, the Nagios return codes and the command-line
# usage to stdout. Reads the global $progname for the usage line.
showHelp()
{
cat << EndHelp
Nagios probe for testing the status of L&B

Tests called:
    1. Register job
    2. Register to receive notifications
    3. Log events
    4. Check job state
    5. Receive notifications

Return values:
    0: Passed
    1: Warning
    2: Critical
    3: Unknown

Console output:
    OK|<time the test job took to transit from Registered to Cleared>
    WARNING: <reason>
    DOWN: <reason>
    UNKNOWN: <reason>

Usage: $progname [-h] [-v[v[v]]] {[-H server] [-p port] | [server[:port]]} [-a clientname:port] [-x proxy] [-t <timeout>] [-T <tmpdir>]
Options:
    -h | --help       Show this help message
    -v[vv]            Verbosity level
    -H <server>       server (Environmental variables are used if unspecified)
    -p <port>         port (Environmental variables or defaults are used if unspecified)
    -t <timeout>      Probe timeout in seconds
    -T <tmpdir>       Temporary directory (default /var/lib/grid-monitoring/emi.lb)
    -a <host:port>    Endpoint for listening to notifications
    -x <proxy>        User proxy file (default /tmp/x509up_u<UID>)

EndHelp
}

# Print a message when the requested verbosity level is enabled.
#   $1 - minimum verbosity level at which the message is shown
#   $2 - message; backslash escapes such as \n are expanded
# Reads the global VERBLEVEL (treated as 0 when unset or empty).
function vprintf()
{
	if [ "$1" -le "${VERBLEVEL:-0}" ]; then
		# The message often embeds external data (server version, job
		# state, ...). Passing it as the printf format would make any '%'
		# in that data a conversion specifier; %b still expands the \n
		# escapes the callers rely on but prints '%' literally.
		printf '%b' "$2"
	fi
}

# Check that a command can be found in PATH as an executable file.
#   $1 - command name
# Returns 0 when the command is found, 1 otherwise (also when no name is
# given, in which case a diagnostic is written to stderr).
function check_exec()
{
	if [ -z "$1" ]; then
		# The previous code called set_error here, a function that is not
		# defined in this script; report on stderr instead.
		printf '%s\n' "No binary to check" >&2
		return 1
	fi
	# type -P forces a PATH search (builtins and functions of the same
	# name are ignored), so no external `which` program is required.
	local path
	path=$(type -P "$1" 2> /dev/null)
	if [ -n "$path" ] && [ -x "$path" ]; then
		return 0
	fi
	return 1
}

# Verify that every command named in the arguments is available.
#   $@ - command names
# Every missing command is reported at verbosity level 2.
# Returns 0 when all commands were found, 1 when at least one is missing.
function check_binaries()
{
	local ret=0
	local file
	# "$@" keeps each argument intact; unquoted it would be re-split and
	# glob-expanded before being checked.
	for file in "$@"
	do
		if ! check_exec "$file"; then
			vprintf 2 "\nfile $file not found\n"
			ret=1
		fi
	done
	return $ret
}

# Ask the L&B server to purge the jobs listed in a file, then delete the file.
#   $1 - path of a file holding the job IDs to purge
# Output of glite-lb-purge is discarded. Returns 1 when no file name is
# given, otherwise the exit status of removing the list file.
function try_purge()
{
	local joblist=$1

	if [ -z "$joblist" ]; then
		return 1
	fi

	vprintf 1 "\n[LB Probe] Trying to purge test job"

	# Quoted so that a list file below a directory containing whitespace
	# is still passed as a single argument.
	glite-lb-purge -j "${joblist}" > /dev/null 2> /dev/null
	rm -- "${joblist}"
}

# Log a single event for a job and advance the sequence code.
#   $1    - job ID
#   $2... - further glite-lb-logevent arguments (source, event, attributes)
# The current EDG_WL_SEQUENCE is passed with -c and replaced by whatever
# glite-lb-logevent prints. Returns non-zero when nothing was printed.
function _lb_log_event()
{
	local jobid=$1
	shift
	EDG_WL_SEQUENCE=$(glite-lb-logevent -j "$jobid" -c "$EDG_WL_SEQUENCE" "$@")
	[ -n "$EDG_WL_SEQUENCE" ]
}

# Log the chain of events that takes a registered job to the Cleared state.
#   $1 - job ID
# Uses the globals DESTINATION and CE_NODE when set (placeholders otherwise)
# and leaves the last sequence code in the global EDG_WL_SEQUENCE.
# Returns 0 when all events were logged. Each event needs the sequence code
# returned by the previous one, so the chain stops with status 1 as soon as
# an event yields no sequence code instead of issuing malformed calls.
function log_cleared()
{
	local jobid=$1

	EDG_WL_SEQUENCE="UI=000003:NS=0000000000:WM=000000:BH=0000000000:JSS=000000:LM=000000:LRMS=000000:APP=000000:LBS=000000"

	_lb_log_event "$jobid" -s NetworkServer -e Accepted \
		--from="UserInterface" --from_host="sending component hostname" \
		--from_instance="sending component instance" \
		--local_jobid="new jobId (Condor Globus ...)" || return 1
	_lb_log_event "$jobid" -s NetworkServer -e EnQueued \
		--queue="destination queue" --job="job description in receiver language" \
		--result=OK --reason="detailed description of transfer" || return 1
	_lb_log_event "$jobid" -s WorkloadManager -e DeQueued \
		--queue="queue name" \
		--local_jobid="new jobId assigned by the receiving component" || return 1
	_lb_log_event "$jobid" -s WorkloadManager -e HelperCall \
		--helper_name="name of the called component" \
		--helper_params="parameters of the call" --src_role=CALLING || return 1
	_lb_log_event "$jobid" -s WorkloadManager -e Match \
		--dest_id="${DESTINATION:-destination CE/queue}" || return 1
	_lb_log_event "$jobid" -s WorkloadManager -e HelperReturn \
		--helper_name="name of the called component" \
		--retval="returned data" --src_role=CALLING || return 1
	_lb_log_event "$jobid" -s WorkloadManager -e EnQueued \
		--queue="destination queue" --job="job description in receiver language" \
		--result=OK --reason="detailed description of transfer" || return 1
	_lb_log_event "$jobid" -s JobController -e DeQueued \
		--queue="queue name" \
		--local_jobid="new jobId assigned by the receiving component" || return 1
	_lb_log_event "$jobid" -s JobController -e Transfer \
		--destination="LRMS" --dest_host="destination hostname" \
		--dest_instance="destination instance" \
		--job="job description in receiver language" --result=OK \
		--reason="detailed description of transfer" \
		--dest_jobid="destination internal jobid" || return 1
	_lb_log_event "$jobid" -s LogMonitor -e Accepted \
		--from="JobController" --from_host="sending component hostname" \
		--from_instance="sending component instance" \
		--local_jobid="new jobId (Condor Globus ...)" || return 1
	_lb_log_event "$jobid" -s LogMonitor -e Transfer \
		--destination="LRMS" --dest_host="destination hostname" \
		--dest_instance="destination instance" \
		--job="job description in receiver language" --result=OK \
		--reason="detailed description of transfer" \
		--dest_jobid="destination internal jobid" || return 1
	_lb_log_event "$jobid" -s LogMonitor -e Running \
		--node="${CE_NODE:-worker node}" || return 1
	_lb_log_event "$jobid" -s LogMonitor -e Done \
		--status_code=OK --reason="reason for the change" --exit_code=0 || return 1
	_lb_log_event "$jobid" -s LogMonitor -e Clear --reason=USER || return 1
}

# Defaults, overridden by the command line below.
VERBLEVEL=0
TIMEOUT=0
TMPDIR="/var/lib/grid-monitoring/emi.lb"
NOTIFendPOINT=""

# Parse the probe's command line into the globals VERBLEVEL, TIMEOUT, TMPDIR,
# NOTIFendPOINT, SRVPORT and the exported X509_USER_PROXY.
#   $@ - command-line arguments
# -H and -p may come in either order: the host is prepended to and the port
# appended to SRVPORT. A bare argument is taken as server[:port].
# Empty arguments are skipped. Exits with status 3 (UNKNOWN) when an option
# lacks its value or the timeout is not a non-negative integer, and with
# status 2 after printing the help text.
parse_cmdline()
{
	local opt
	while [ $# -gt 0 ]
	do
		opt=$1
		# Options taking a value: make sure there is one, then step onto it.
		case "$opt" in
			"-H" | "--hostname" | "-p" | "--port" | "-t" | "--timeout" | "-T" | "--tmpdir" | "-a" | "--notif-endpoint" | "-x" | "--proxy")
				if [ $# -lt 2 ]; then
					printf 'UNKNOWN: Option %s requires an argument\n' "$opt"
					exit 3
				fi
				shift
				;;
		esac
		case "$opt" in
			"") ;;
			"-h" | "--help") showHelp && exit 2 ;;
			"-v" | "--verbose")  VERBLEVEL=$(( VERBLEVEL + 1 )) ;;
			"-vv" )  VERBLEVEL=$(( VERBLEVEL + 2 )) ;;
			"-vvv" )  VERBLEVEL=$(( VERBLEVEL + 3 )) ;;
			"-H" | "--hostname") SRVPORT="${1}$SRVPORT" ;;
			"-p" | "--port") SRVPORT="$SRVPORT:${1}" ;;
			"-t" | "--timeout") TIMEOUT=$1 ;;
			"-T" | "--tmpdir") TMPDIR=$1 ;;
			"-a" | "--notif-endpoint")
				# Kept as "-a <endpoint> " because the value is later
				# expanded unquoted in front of further options; an
				# empty endpoint must not leave a dangling -a behind.
				if [ -n "$1" ]; then
					NOTIFendPOINT="-a $1 "
				fi
				;;
			"-x" | "--proxy") export X509_USER_PROXY=$1 ;;
			*) SRVPORT="$1" ;;
		esac
		shift
	done

	# TIMEOUT is used in integer comparisons and arithmetic further on.
	case "$TIMEOUT" in
		"" | *[!0-9]*)
			printf "UNKNOWN: Invalid timeout '%s'\n" "$TIMEOUT"
			exit 3
			;;
	esac
}

parse_cmdline "$@"

export VERBLEVEL

# Arrange timeout
#
# Start a background watchdog that, once TIMEOUT seconds have passed, prints
# the UNKNOWN status line and sends SIGINT to the probe, which then exits
# with status 3. Does nothing when TIMEOUT is 0, empty or unset.
# Sets the globals mypid and watchpid and installs INT and EXIT traps.
#
# The watchdog sleeps in the background and waits for it: bash runs a trap
# only after the current foreground command has finished, but the wait
# builtin is interrupted at once, so a TERM sent to the watchdog stops it
# (and its sleep) immediately instead of after the whole timeout.
#
# NOTE(review): the same deferral applies to the probe itself - the INT trap
# only runs once the command the probe is currently waiting for returns.
start_watchdog()
{
	if [ "${TIMEOUT:-0}" -gt 0 ]; then
		mypid=$$
		(
			trap 'kill "$sleeppid" 2> /dev/null; exit' TERM
			sleep "$TIMEOUT" &
			sleeppid=$!
			wait "$sleeppid"
			vprintf 0 "UNKNOWN: Probe timed out\n"
			kill -s SIGINT $mypid
			exit 3
		)&
		watchpid=$!

		trap 'exit 3' INT
		# Stop the watchdog on every exit path, so that it can neither
		# print a late "timed out" line nor signal an unrelated process
		# that has meanwhile been given the probe's PID.
		trap 'kill -s SIGTERM "$watchpid" 2> /dev/null' EXIT
	fi
}

start_watchdog

# At the highest verbosity level, show the settings taken from the command line.
vprintf 3 "[LB Probe] Global variable values as received from cmdline:\n"\
"[LB Probe] VERBLEVEL=$VERBLEVEL\n"\
"[LB Probe] TIMEOUT=$TIMEOUT\n"\
"[LB Probe] TMPDIR=$TMPDIR\n"\
"[LB Probe] NOTIFendPOINT=$NOTIFendPOINT\n"\
"[LB Probe] SRVPORT=$SRVPORT\n"

# Append every existing L&B examples directory to PATH; the probe runs
# commands that are installed there.
for exdir in \
	/usr/lib64/glite-lb/examples \
	/usr/lib/glite-lb/examples \
	/opt/glite/examples
do
	[ -d "$exdir" ] || continue
	PATH=$PATH:$exdir
	export PATH
	vprintf 2 "[LB Probe] adding $exdir to PATH\n"
done

#Extract srv:port from the command line or, failing that, from environmental variables
if [ -z "$SRVPORT" ]; then
	endpoint=$GLITE_WMS_QUERY_SERVER
else
	endpoint=$SRVPORT
fi

#server is everything before the first colon, port the digits after it
servername=${endpoint%%:*}
case "$endpoint" in
	*:*)
		portnumber=${endpoint#*:}
		portnumber=${portnumber//[!0-9]/}
		;;
	*)
		portnumber=""
		;;
esac

#only a port was given (-p without -H): take the server from the
#environment, as the help text promises
if [ -z "$servername" ]; then
	servername=${GLITE_WMS_QUERY_SERVER%%:*}
fi

#exit if server not specified
if [ -z "$servername" ]; then
	vprintf 0 "UNKNOWN: No server specified\n"
	exit 3
fi

#use default if server port is not specified
if [ -z "$portnumber" ]; then
	portnumber=${GLITE_LB_SERVER_PORT:-9000}
fi

#use default if logger port is not specified
if [ -z "$GLITE_LB_LOGGER_PORT" ]; then
	GLITE_LB_LOGGER_PORT=$(expr "$portnumber" + 2)
fi

wsportnumber=$(expr "$portnumber" + 3)

#override environmental variables if server/port was specified
if [ -n "$SRVPORT" ]; then
	export GLITE_WMS_QUERY_SERVER="$servername:$portnumber"
	export GLITE_WMS_NOTIF_SERVER="$servername:$portnumber"
	export GLITE_WMS_LOG_DESTINATION="$servername:$(expr "$portnumber" + 2)"
	export GLITE_LB_LOGGER_PORT=$(expr "$portnumber" + 2)
	export GLITE_LB_SERVER_PORT=$portnumber
fi

# Return 0 when a scratch file can be created in (and removed from)
# directory $1, non-zero otherwise.
tmpdir_writable()
{
	local probefile="$1/$$_probecheck"
	touch "$probefile" > /dev/null 2> /dev/null
	if [ -f "$probefile" ]; then
		rm -- "$probefile"
		return 0
	fi
	return 1
}

#Check if tmpdir writable, fall back to /tmp, give up if that fails too
if ! tmpdir_writable "$TMPDIR"; then
	vprintf 1 "[LB Probe] Could not write to $TMPDIR. Falling back to /tmp.\n"
	TMPDIR="/tmp"
	if ! tmpdir_writable "$TMPDIR"; then
		vprintf 1 "[LB Probe] Could not write to $TMPDIR.\n"
		vprintf 0 "UNKNOWN: Probe could not write temporary files\n"
		exit 3
	fi
fi


#Show the relevant environment and the resolved endpoint at the highest verbosity
if [ "${VERBLEVEL:-0}" -ge 3 ]; then
	env | grep -E "GLITE_|PATH"
	printf '*** %s:%s\n' "$servername" "$portnumber"
fi

##
#  Starting the test
#####################

{
# Main test sequence. Every failure prints its status line through vprintf
# and exits with the matching Nagios code (2 = DOWN/critical, 3 = UNKNOWN).
# On success the globals EXITCODE (0, or 1 for a warning), EXITMESSAGE,
# REGTIME and CLRTIME are left for the final report.

vprintf 1 "[LB Probe] Starting test"

EXITCODE=0
EXITMESSAGE=""
HOSTCREDS=0

# Print the "timeleft" value reported by grid-proxy-info; prints nothing when
# grid-proxy-info reports no such line.
proxy_timeleft()
{
	grid-proxy-info 2> /dev/null | grep -E "^timeleft" | sed "s/timeleft\s*:\s//"
}

# Create a proxy from the host certificate and key. All output is discarded
# so that it cannot end up in front of the probe's status line.
proxy_from_hostcreds()
{
	grid-proxy-init -cert "$GLITE_HOST_CERT" -key "$GLITE_HOST_KEY" > /dev/null 2>&1
}

# check_binaries
vprintf 2 "\n[LB Probe] Testing if all binaries are available"
if ! check_binaries grid-proxy-info grep sed echo wc cat awk kill mktemp glite-lb-notify glite-lb-job_reg glite-lb-job_status glite-lb-purge glite-lb-ws_getversion glite-lb-logevent; then
	vprintf 1 "\n[LB Probe] Some Commands are unavailable\n\n"
	vprintf 0 "UNKNOWN: Some commands are not available\n"
	exit 3
fi

vprintf 1 "\n[LB Probe] Testing credentials"

# timeleft ends up empty (no credentials), "0:00:00" (expired) or anything
# else (usable).
if [ "$GLITE_GSS_MECH" = "krb5" ]; then
	if ! klist | grep krbtgt > /dev/null; then
		timeleft=""
	elif klist | grep krbtgt | grep Expired > /dev/null; then
		timeleft="0:00:00"
	else
		timeleft="1"
	fi
else
	timeleft=$(proxy_timeleft)
	if [ -z "$timeleft" ]; then
		vprintf 1 "\n[LB Probe] No proxy, trying to regenerate from host creds"
		proxy_from_hostcreds
		timeleft=$(proxy_timeleft)
		HOSTCREDS=1
	fi
fi

if [ -z "$timeleft" ]; then
	vprintf 1 "\n[LB Probe] Test failed -- No credentials\n\n"
	vprintf 0 "UNKNOWN: No Credentials\n"
	exit 3
fi

if [ "$timeleft" = "0:00:00" ]; then
	vprintf 1 "\n[LB Probe] Proxy expired, trying to regenerate from host creds"
	proxy_from_hostcreds
	timeleft=$(proxy_timeleft)
	if [ -z "$timeleft" ] || [ "$timeleft" = "0:00:00" ]; then
		vprintf 1 "\n[LB Probe] Test failed -- Credentials expired\n\n"
		vprintf 0 "UNKNOWN: Credentials Expired\n"
		exit 3
	fi
	# Carry on with the test using the regenerated proxy. Previously this
	# path skipped every check below and still reported OK.
	HOSTCREDS=1
fi

vprintf 1 "\n[LB Probe] Getting server version"
serverversion=$(glite-lb-ws_getversion -m "$servername:$wsportnumber")
if [ -z "$serverversion" ]; then
	vprintf 1 "\n[LB Probe] Test failed -- server did not respond\n\n"
	vprintf 0 "DOWN: Unable to Get Server Version\n"
	exit 2
fi
# $serverversion is deliberately unquoted here: word splitting joins a
# multi-line answer into the single line the pattern is matched against.
if echo $serverversion | grep -E "version.*[0-9]+\.[0-9]+\.[0-9]+" > /dev/null; then
	vprintf 2 ": $serverversion"
else
	vprintf 1 " - unexpected output ($serverversion). A WARNING will be returned."
	EXITMESSAGE="${EXITMESSAGE}[Unexpected version output ($serverversion)]"
	EXITCODE=1
fi

# Register job:
vprintf 1 "\n[LB Probe] Registering testing job "
jobid=$(glite-lb-job_reg -m "${GLITE_WMS_QUERY_SERVER}" -s application 2>&1 | grep "new jobid" | awk '{ print $3 }')

if [ -z "$jobid" ]; then
	vprintf 1 " Failed to register job\n[LB Probe] Test failed \n\n"
	vprintf 0 "DOWN: Job Registration Failed Locally\n"
	exit 2
fi
vprintf 2 "${jobid}"

jobstate=$(glite-lb-job_status "${jobid}" | grep "state :" | awk '{print $3}')
if [ "${jobstate}" != "Submitted" ]; then
	vprintf 1 "\n[LB Probe] Test failed -- Job has not been submitted to server\n\n"
	vprintf 0 "DOWN: L&B Server Not Running\n"
	exit 2
fi
vprintf 2 ", server side OK"

# Register notification:
vprintf 1 "\n[LB Probe] Registering notification "

notifid=$(glite-lb-notify new -j "${jobid}" | grep "notification ID" | awk '{ print $3 }')

if [ -z "$notifid" ]; then
	vprintf 1 "\n[LB Probe] Test failed -- Failed to register notification\n\n"
	vprintf 0 "DOWN: L&B Server Not Running\n"
	exit 2
fi
vprintf 2 "${notifid}"

vprintf 1 "\n[LB Probe] Logging events resulting in state Cleared"
log_cleared "${jobid}"

# Scratch files go to the temporary directory selected with -T (already
# checked for writability), under names that cannot be predicted.
NOTIFFILE=""
STATEFILE=""
if ! NOTIFFILE=$(mktemp "${TMPDIR:-/tmp}/lbprobe_notifications.XXXXXX") ||
   ! STATEFILE=$(mktemp "${TMPDIR:-/tmp}/lbprobe_jobstat.XXXXXX"); then
	if [ -n "$NOTIFFILE" ]; then
		rm -f -- "$NOTIFFILE"
	fi
	vprintf 0 "UNKNOWN: Probe could not write temporary files\n"
	exit 3
fi
echo '' > "$NOTIFFILE"

vprintf 1 "\n[LB Probe] Waiting for delivery/processing"
if [ "${TIMEOUT:-0}" -gt 0 ]; then
	#Assume about 3/4 of the timeout may be used to wait for messages
	TOREDUCED=$(expr "$TIMEOUT" \* 3 / 4)
	TOREPS=$(expr "$TOREDUCED" / 5)
	vprintf 2 " (split $TOREDUCED-s span into $TOREPS wait cycles)"
else
	TOREPS=4 #Repetitions before timeout
fi
CLRNOTIFIED=0
while [ "$CLRNOTIFIED" -eq 0 ] && [ "$TOREPS" -gt 0 ]
do
	# NOTIFendPOINT is either empty or "-a <endpoint> " and must stay
	# unquoted so that it splits into separate arguments ahead of -i.
	glite-lb-notify receive ${NOTIFendPOINT}-i 5 "${notifid}" >> "$NOTIFFILE" 2> /dev/null

	CLRNOTIFIED=$(grep -E "$jobid.*Cleared" "$NOTIFFILE" | wc -l)
	TOREPS=$(( TOREPS - 1 ))
done

glite-lb-job_status "${jobid}" > "$STATEFILE"
jobstate=$(grep "state :" "$STATEFILE" | awk '{print $3}')

vprintf 1 "\n[LB Probe] Checking job state"
if [ "${jobstate}" = "Submitted" ]; then
	vprintf 1 "\n[LB Probe] Test failed -- Job state has not changed (${jobstate})\n\n"
	vprintf 0 "DOWN: Event Delivery Chain (Logger/Interlogger) Not Running\n"
	rm -- "$NOTIFFILE" "$STATEFILE"
	exit 2
fi
vprintf 2 " -- ${jobstate}"
if [ "${jobstate}" != "Cleared" ]; then
	vprintf 2 ", not Cleared. A WARNING will be returned."
	EXITMESSAGE="${EXITMESSAGE}[Unexpected state of test job (${jobstate})]"
	EXITCODE=1
fi

vprintf 1 "\n[LB Probe] Checking if notifications were delivered"
NOTIFS=$(wc -l < "$NOTIFFILE")

# The job ID is matched as a fixed string, not as a regular expression.
if grep -F -- "${jobid}" "$NOTIFFILE" > /dev/null; then
	vprintf 2 ", OK ($NOTIFS messages)"
else
	vprintf 1 "\n[LB Probe] Test failed -- Notifications were not delivered\n\n"
	vprintf 0 "DOWN: Notification Interlogger Not Running\n"
	rm -- "$NOTIFFILE" "$STATEFILE"
	dropresult=$(glite-lb-notify drop "${notifid}" 2>&1)
	exit 2
fi

if [ "${VERBLEVEL:-0}" -ge 3 ]; then
	cat "$NOTIFFILE"
fi

# Timestamps of the Submitted and Cleared states, converted to epoch
# seconds for the duration shown in the final OK line.
regtime=$(grep -E "^[\t ]+Submitted[\t ]+" "$STATEFILE" | sed 's/[\t ]*Submitted[\t ]*//')
clrtime=$(grep -E "^[\t ]+Cleared[\t ]+" "$STATEFILE" | sed 's/[\t ]*Cleared[\t ]*//')
REGTIME=$(date -d "$regtime" +%s)
CLRTIME=$(date -d "$clrtime" +%s)

rm -- "$NOTIFFILE" "$STATEFILE"

#Drop notification
vprintf 1 "\n[LB Probe] Dropping the test notification"
vprintf 2 " (${notifid})"
dropresult=$(glite-lb-notify drop "${notifid}" 2>&1)
# Any output from the drop command is treated as a failure.
if [ -n "$dropresult" ]; then
	vprintf 1 "\n[LB Probe] Test failed"
	vprintf 1 " Failed to drop notification ${dropresult}, A WARNING will be returned."
	EXITMESSAGE="${EXITMESSAGE}[Could not drop notification]"
	EXITCODE=1
fi

#Purge test job
if joblist=$(mktemp "${TMPDIR:-/tmp}/lbprobe_jobs_to_purge.XXXXXX"); then
	echo "$jobid" > "${joblist}"
	try_purge "${joblist}"
fi

if [ "$HOSTCREDS" -eq 1 ]; then
	vprintf 1 "\n[LB Probe] Removing host creds-based proxy"
	grid-proxy-destroy
fi

# Stop the timeout watchdog, if one was started.
if [ -n "$watchpid" ]; then
	kill -s SIGTERM "$watchpid"
fi

vprintf 1 "\n[LB Probe] Test finished\n\n"
}

# Final Nagios status line: OK with the Registered-to-Cleared time as
# performance data, or WARNING with the reasons collected during the test.
if [ "$EXITCODE" -eq 0 ]; then
	# Missing timestamps count as 0 so that the perfdata value is always
	# a number.
	EXITMESSAGE="OK|time=$(( ${CLRTIME:-0} - ${REGTIME:-0} ))s"
else
	EXITMESSAGE="WARNING: $EXITMESSAGE"
fi
# The message can contain server output, so it is passed as data and never
# as the printf format string.
printf '%s\n' "$EXITMESSAGE"
exit "$EXITCODE"

