#!/bin/bash
#
# Copyright (c) Members of the EGEE Collaboration. 2004-2010.
# See http://www.eu-egee.org/partners for details on the copyright holders.
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# show help and usage
# Name of this script without its directory part, used in the usage line.
progname=${0##*/}

# showHelp
#   Print the probe description, the meaning of the exit codes, the console
#   output formats and the list of command line options to stdout.
#   Globals: progname (read)
showHelp()
{
	cat << EndHelp
Nagios probe for testing the status of log file processing by the L&B Interlogger service

This probe is intended for running on the target machine.

Tests called:
    1. Check running processes
    2. Check number and size of log files
    3. Check last contact timestamps

Return values:
    0: Passed
    1: Warning
    2: Critical
    3: Unknown

Console output:
    OK|size=<size of log files> kB;elapsed=<time since last contact> s
    WARNING: <reason>
    CRITICAL: <reason>
    UNKNOWN: <reason>

Usage: ${progname} [-h] [-v[v[v]]] {[-H server] [-p port] | [server[:port]]} [-t <timeout>] [-T <tmpdir>]
Options:
    -h | --help       Show this help message
    -v[v[v]]          Verbosity level
    -t <timeout>      Probe timeout in seconds
    -f <prefix>       Path and prefix for event files
    -s <path>         Path and prefix for IL socket
    -S <seconds>      Timeout for the IL socket (default 60 s)
    -w <size>         Log file size limit (kB) to trigger warning (default 10 MB)
    -c <size>         Log file size limit (kB) to trigger state critical (default 128 MB)
    -P | --proxy      Use default prefix for Proxy Interlogger files rather than regular IL
    -N | --notif      Use default prefix for Notification Interlogger files rather than regular IL

EndHelp
}

# vprintf LEVEL MESSAGE
#   Print MESSAGE to stdout when LEVEL does not exceed the current verbosity.
#   Backslash escapes in MESSAGE (such as \n) are expanded, but MESSAGE is
#   never used as a printf format string, so '%' characters coming from
#   interpolated values (paths, sizes) are printed literally.
#   Globals: VERBLEVEL (read; treated as 0 when unset)
#   Arguments:
#     $1 - minimum verbosity level at which the message is shown
#     $2 - message text
function vprintf()
{
	if [ "$1" -le "${VERBLEVEL:-0}" ]; then
		printf '%b' "$2"
	fi
}

# check_exec NAME
#   Test whether NAME resolves to an executable file.
#   The lookup uses bash's own PATH search (type -P), so it does not depend
#   on an external 'which' being installed, and it returns the on-disk file
#   even when NAME is also a shell builtin (echo, kill, ...).
#   Arguments:
#     $1 - command name to look up
#   Returns:
#     0 if an executable file was found, 1 otherwise (including empty NAME)
function check_exec()
{
	local resolved

	if [ -z "$1" ]; then
		# set_error is not defined in this file; use it only when the
		# environment provides it, otherwise report on stderr.
		if declare -F set_error > /dev/null; then
			set_error "No binary to check"
		else
			printf '%s\n' "No binary to check" >&2
		fi
		return 1
	fi

	# Declared separately from the assignment so 'local' does not mask
	# the lookup status.
	resolved=$(type -P -- "$1" 2> /dev/null)
	if [ -n "$resolved" ] && [ -x "$resolved" ]; then
		return 0
	fi
	return 1
}

# check_binaries NAME...
#   Run check_exec on every NAME and report each missing one at verbosity
#   level 2. All names are checked even after a failure.
#   Arguments:
#     $@ - command names to verify (each argument is taken as one name)
#   Returns:
#     0 if every name passed check_exec, 1 if at least one did not
function check_binaries()
{
	local ret=0
	local file

	for file in "$@"; do
		if ! check_exec "$file"; then
			vprintf 2 "\nfile $file not found\n"
			ret=1
		fi
	done
	return $ret
}


# ---- Defaults, overridable from the command line ----
VERBLEVEL=0          # verbosity level (raised by -v / -vv / -vvv)
TIMEOUT=0            # probe timeout in seconds; 0 disables the watchdog
PREFIX="/var/spool/glite/lb-locallogger/dglogd.log"       # event file prefix
NOTIFPREFIX="/var/spool/glite/lb-notif/dglogd.log"        # prefix chosen by -N
PROXYPREFIX="/var/spool/glite/lb-proxyi/dglogd.log"       # prefix chosen by -P
ILSOCK="/var/run/glite/glite-lb-interlogger.sock"         # IL socket path
CRITICAL=131072      # log file size (kB) above which the probe is critical
WARNING=10240        # log file size (kB) above which the probe warns
SOCKTIMEOUT=60       # maximum accepted age of the IL socket in seconds

# parse_args ARG...
#   Process the command line and store the results in the global settings
#   above (plus TWARNING, TCRITICAL and SRVPORT).
#   - Options that take a value abort with "UNKNOWN" / exit code 3 when the
#     value is missing.
#   - -P and -N are plain flags: they do not consume the following argument.
#   - Every argument is processed, including empty ones.
#   - Anything that is not a known option is stored in SRVPORT.
parse_args()
{
	local opt

	while [ $# -gt 0 ]; do
		opt=$1

		# Options with a value: make sure the value is there, then step to it.
		case "$opt" in
			-t | --timeout | -f | --file-prefix | -s | --sock | \
			-S | --sock-timeout | -w | --warning | -c | --critical | \
			-W | --t-warning | -C | --t-critical)
				if [ $# -lt 2 ]; then
					printf 'UNKNOWN: option %s requires an argument\n' "$opt"
					exit 3
				fi
				shift
				;;
		esac

		# For value options $1 is now the value; otherwise it is still $opt.
		case "$opt" in
			-h | --help) showHelp && exit 2 ;;
			-v | --verbose) VERBLEVEL=$(( VERBLEVEL + 1 )) ;;
			-vv) VERBLEVEL=$(( VERBLEVEL + 2 )) ;;
			-vvv) VERBLEVEL=$(( VERBLEVEL + 3 )) ;;
			-t | --timeout) TIMEOUT=$1 ;;
			-f | --file-prefix) PREFIX=$1 ;;
			-s | --sock) ILSOCK=$1 ;;
			-S | --sock-timeout) SOCKTIMEOUT=$1 ;;
			-w | --warning) WARNING=$1 ;;
			-c | --critical) CRITICAL=$1 ;;
			-W | --t-warning) TWARNING=$1 ;;
			-C | --t-critical) TCRITICAL=$1 ;;
			-P | --proxy) PREFIX="$PROXYPREFIX" ;;
			-N | --notif) PREFIX="$NOTIFPREFIX" ;;
			*) SRVPORT="$opt" ;;
		esac
		shift
	done
}

parse_args "$@"

export VERBLEVEL

# start_watchdog
#   Arm the probe timeout: a background subshell waits TIMEOUT seconds, then
#   prints "UNKNOWN: Probe timed out" and sends SIGINT to the main script,
#   which exits with code 3 from its INT trap.
#   The subshell waits on a background sleep instead of running sleep in the
#   foreground, so a SIGTERM sent to the watchdog is handled immediately and
#   the sleep is killed with it; nothing keeps running (or keeps stdout open)
#   after the probe has finished.
#   Globals: TIMEOUT (read), mypid and watchpid (written)
start_watchdog()
{
	mypid=$$
	(
		nap=
		trap '[ -n "$nap" ] && kill "$nap" 2> /dev/null; exit' TERM
		sleep "$TIMEOUT" &
		nap=$!
		wait "$nap"
		vprintf 0 "UNKNOWN: Probe timed out\n"
		kill -s SIGINT "$mypid"
		exit 3
	) &
	watchpid=$!

	trap 'exit 3' INT
	# Stop the watchdog on every exit path, not only after a complete run.
	trap '[ -n "$watchpid" ] && kill -s SIGTERM "$watchpid" 2> /dev/null' EXIT
}

# Arrange timeout (only for a positive, numeric TIMEOUT)
if [ "${TIMEOUT:-0}" -gt 0 ] 2> /dev/null; then
	start_watchdog
fi



##
#  Starting the test
#####################

{
vprintf 1 "[IL Probe] Starting test"

EXITCODE=0
EXITMESSAGE=""

# Stop the timeout watchdog, if one was started (watchpid is set by the
# timeout setup earlier in this script). Called before every exit so the
# background process does not outlive the probe.
stop_watchdog()
{
	if [ -n "${watchpid:-}" ]; then
		kill -s SIGTERM "$watchpid" 2> /dev/null
	fi
}

# Print how many of the given paths exist. Fed with a glob: an unmatched
# pattern is passed through literally and is simply not counted.
count_paths()
{
	local path total=0
	for path in "$@"; do
		if [ -e "$path" ] || [ -L "$path" ]; then
			total=$(( total + 1 ))
		fi
	done
	printf '%s\n' "$total"
}

# check_binaries
vprintf 2 "\n[IL Probe] Testing if all binaries are available"
if ! check_binaries grep sed echo wc cat awk kill du sort tail stat date; then
	vprintf 1 "\n[IL Probe] Some commands are unavailable\n\n"
	vprintf 0 "UNKNOWN: Some commands are not available\n"
	stop_watchdog
	exit 3
fi



#Total size of IL files
vprintf 1 "\n[IL Probe] Check IL file sizes"
NOFiles=$(count_paths "${PREFIX}"*)
NOStatFiles=$(count_paths "${PREFIX}"*.stat)
NOEvtFiles=0
vprintf 3 "\n[IL Probe] There are $NOFiles files matching the prefix"
if [ "$NOFiles" -gt 0 ]; then
	if [ "$NOFiles" -ne "$NOStatFiles" ]; then
		# Sum the sizes (kB) of everything except the .stat files;
		# "total+0" makes awk print 0 rather than nothing on empty input.
		USAGE=$(du --bytes --summarize --exclude '*.stat' -k -- "${PREFIX}"* \
			| awk '{total=total+$1} END {print total+0}')
		NOEvtFiles=$(( NOFiles - NOStatFiles ))
		vprintf 2 "\n[IL Probe] $USAGE kB worth of files found for prefix ${PREFIX}"
	else
		vprintf 2 "\n[IL Probe] Only stat files found for prefix ${PREFIX}"
		USAGE=0
	fi
else
	vprintf 2 "\n[IL Probe] No files found for prefix ${PREFIX}"
	USAGE=0
fi

#Compare values
vprintf 2 "\n[IL Probe] Comparing measured files to critical threshold."
vprintf 3 "\n[IL Probe] $USAGE -gt $CRITICAL?"
if [ "$USAGE" -gt "$CRITICAL" ]; then
	printf 'CRITICAL: Total of InterLogger files (%s kb) exceeds bounds (%s).\n' "$USAGE" "$CRITICAL"
	stop_watchdog
	exit 2
fi

vprintf 2 "\n[IL Probe] Comparing measured files to warning threshold."
vprintf 3 "\n[IL Probe] $USAGE -gt $WARNING?"
if [ "$USAGE" -gt "$WARNING" ]; then
	EXITCODE=1
	EXITMESSAGE="Total of InterLogger files ($USAGE kb) exceeds bounds ($WARNING)."
fi


vprintf 1 "\n[IL Probe] Check stat timestamps"
vprintf 2 "\n[IL Probe] Getting most recent connected timestamp."
STAMP=""
# Only read the stat files when at least one exists, so grep is never run
# on an unmatched glob.
if [ "$(count_paths "${PREFIX}"*stat)" -gt 0 ]; then
	STAMP=$(grep -h "last_connected=" "${PREFIX}"*stat | sort | tail -n 1 \
		| grep -o -E '[0-9]+' | tail -n 1)
fi
NOW=$(date +%s)
# Without any recorded timestamp STAMP counts as 0, so ELAPSED equals NOW.
ELAPSED=$(( NOW - ${STAMP:-0} ))
vprintf 3 "\n[IL Probe] most recent timestamp = $STAMP\n[IL Probe] current time = $NOW\n[IL Probe] elapsed time = $ELAPSED"



#vprintf 3 "\n[IL Probe] Are there event files waiting? ($NOEvtFiles)"
#if [ "$NOEvtFiles" -gt "0" ]; then
#	vprintf 2 "\n[IL Probe] Event files waiting, compare timestamp."
#
#fi

vprintf 1 "\n[IL Probe] Check socket timestamp"
vprintf 3 "\n[IL Probe] Check if socket file exists"
if [ ! -e "$ILSOCK" ]; then
	printf 'UNKNOWN: %s does not exist\n' "$ILSOCK"
	# exit, not return: this code is not inside a function
	stop_watchdog
	exit 3
fi
vprintf 3 "\n[IL Probe] Check if it is a socket"
if [ ! -S "$ILSOCK" ]; then
	printf 'UNKNOWN: %s is not a socket\n' "$ILSOCK"
	stop_watchdog
	exit 3
fi
SOCKSTAMP=$(stat -c %Y -- "$ILSOCK")
vprintf 2 "\n[IL Probe] IL socket timesamp: $SOCKSTAMP"


vprintf 2 "\n[IL Probe] Comparing timestamps"
NOW=$(date +%s)
ILAGE=$(( NOW - SOCKSTAMP ))
vprintf 2 "\n[IL Probe] socket age is $ILAGE s."
# Compare against the configured limit (the value of SOCKTIMEOUT).
if [ "$ILAGE" -gt "$SOCKTIMEOUT" ]; then
	printf 'CRITICAL: Interlogger socket not being refreshed anymore (age %s s)\n' "$ILAGE"
	stop_watchdog
	exit 2
fi

stop_watchdog

vprintf 1 "\n[IL Probe] Test finished\n\n"
}

# Final report: print the one-line status and exit with the matching code.
# EXITCODE is 0 (OK) or 1 (warning) here; critical and unknown states have
# already exited above.
if [ "$EXITCODE" -eq 0 ]; then
	EXITMESSAGE="OK|size=$USAGE kB;elapsed=$ELAPSED s"
else
	EXITMESSAGE="WARNING: $EXITMESSAGE"
fi
# The message is data, not a format string: print it verbatim.
printf '%s\n' "$EXITMESSAGE"
exit "$EXITCODE"

