#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


#This script polls a URL (typically that of a service we want to see running) and a process.
#The URL is read from WATCHDOG_MONITOR_URL in ${CATALINA_BASE}/conf/watchdog.properties.
#If the web request fails, it kills the process being monitored and exits.
#If it finds that the process is no longer alive, it exits.
#Typically used in startup scripts for services such as Solr that should be terminated if the
#server is not running.
#Example usage in a shell script (polling interval in seconds, then PID): bigtop-monitor-service 30 $$

# Print an INFO-prefixed, timestamped log line on stdout.
# Arguments: the message words, echoed after the timestamp.
# Output:    "INFO: <locale time> <time zone> <locale date> <message...>"
function info() {
  local stamp
  stamp=$(date +"%X %Z %x")
  # $stamp is deliberately left unquoted: its fields are re-joined by echo
  # with single spaces, exactly as the log format has always been.
  echo "INFO:" $stamp "$@"
}

#Although this was written specifically for Ubuntu, I suspect it
#might work for any distro.
#If you face problems with this function, you may want to try
#using 'ALL:COMPLEMENTOFALL' instead of 'DEFAULT' below.

# On Ubuntu 12.x, build an explicit "--ciphers" option for curl from the
# cipher names reported by `openssl ciphers -v 'DEFAULT'`.
# Globals:
#   cipher (written) - set to "--ciphers <comma-separated names>" only when
#                      running on Ubuntu 12.x AND openssl reported at least
#                      one cipher; left untouched in every other case.
# Arguments: none
# Returns:   0
function add_cipher_if_needed() {
  local distro release cipher_list

  # lsb_release is not installed on every distribution. Without it we cannot
  # be on the Ubuntu release this applies to, so skip quietly instead of
  # printing "command not found" errors.
  command -v lsb_release >/dev/null 2>&1 || return 0

  distro=$(lsb_release -i -s 2>/dev/null)
  release=$(lsb_release -r -s 2>/dev/null)

  if [[ "$distro" == "Ubuntu" && "$release" == "12."* ]]; then
    # First column of each output line is the cipher name; join with commas.
    cipher_list=$(openssl ciphers -v 'DEFAULT' | awk 'BEGIN{ORS=","}{print $1}')
    cipher_list=${cipher_list%,}

    # Never produce a bare "--ciphers" with no value: curl would reject it
    # and every heartbeat would then look like a server failure.
    if [[ -n "$cipher_list" ]]; then
      cipher="--ciphers ${cipher_list}"
    fi
  fi
  return 0
}

# Watchdog loop for a single process.
# After validating its inputs it loops forever: sleep <interval>, request
# WATCHDOG_MONITOR_URL with curl, react to the HTTP status code, then check
# that the monitored process still exists.
# Globals:
#   CATALINA_BASE        (read) - directory whose conf/watchdog.properties is sourced
#   WATCHDOG_MONITOR_URL (read) - must be set once watchdog.properties is sourced
#   cipher               (read) - optional extra curl arguments (may be set by
#                                 add_cipher_if_needed); word-split on purpose
# Arguments:
#   $1 - polling interval in seconds, integer >= 1 (also curl's -m timeout)
#   $2 - PID of the process to monitor and, if needed, kill (integer >= 1)
# Exits (this function never returns):
#   1 - bad usage, missing/incomplete configuration, invalid PID or interval
#       (an invalid interval also kills the PID), or the process is dead
#   0 - the process was killed because of an HTTP code >= 500 / 000 / no
#       curl output, or the watchdog gave up on a 4xx code other than 401
function monitor() {
  local usage="$0 polling_interval_seconds process_id_to_kill"
  local interval pid url http_code properties

  if [ $# -ne 2 ]; then
    echo "$usage" >&2
    exit 1
  fi
  interval="$1"
  pid="$2"

  # Quote the path so a CATALINA_BASE containing whitespace still works.
  properties="${CATALINA_BASE}/conf/watchdog.properties"
  if [ ! -f "$properties" ]; then
    echo "watchdog.properties not found under ${CATALINA_BASE}/conf. Terminating watchdog." >&2
    exit 1
  fi

  # shellcheck source=/dev/null
  . "$properties"
  if [ -z "$WATCHDOG_MONITOR_URL" ]; then
    echo "WATCHDOG_MONITOR_URL not set.. Terminating watchdog." >&2
    exit 1
  fi

  url=${WATCHDOG_MONITOR_URL}
  add_cipher_if_needed

  # The PID is later handed to `kill -9`. Refuse anything that is not a plain
  # positive integer: 0 or a negative value would address whole process
  # groups (or every process we may signal) instead of one process.
  if ! [[ "$pid" =~ ^[0-9]+$ ]] || [ "$pid" -le 0 ]; then
    echo "Invalid value for process_id_to_kill $pid - must be a positive integer" >&2
    exit 1
  fi

  if ! [[ "$interval" =~ ^[0-9]+$ ]]; then
    echo "Invalid value for polling_interval_seconds $interval - must be a positive integer" >&2
    kill -9 "$pid"
    exit 1
  fi

  # `[` compares in base 10, so a value such as "08" is not mistaken for octal.
  if [ "$interval" -le 0 ]; then
    echo "Invalid value for polling_interval_seconds $interval - must be >= 1" >&2
    kill -9 "$pid"
    exit 1
  fi

  # Close file descriptors 3..255 and move to / before entering the loop.
  eval exec {3..255}\>\&-
  cd /

  info "Starting a watchdog process monitoring process '$pid' and url '$url'"

  while :; do
    sleep "$interval"
    info "Sending a heartbeat request to $url"

    # $cipher is intentionally unquoted: when set it holds two words
    # ("--ciphers" and the list) that must reach curl as separate arguments.
    # shellcheck disable=SC2086
    http_code=$(curl -m"$interval" --retry 5 -L -k -s --negotiate -u : -o /dev/null -w "%{http_code}" "$url" $cipher)
    # No output at all from curl is treated like a server-side failure.
    http_code=${http_code:-600}

    # 5xx and above means a server-side error: kill the service and exit.
    # curl reports 000 when it got no HTTP response at all, so a code of 0
    # is treated as a failure too.
    if [ "$http_code" -ge 500 ] || [ "$http_code" -eq 0 ]; then
      info "Got $http_code HTTP code from the server. Watchdog is now killing process: $pid"
      kill -9 "$pid"
      exit 0
    fi

    # 4xx (client-side error): stop watching without touching the process.
    # 401 (Unauthorized) is the exception - keep running in that case.
    if [ "$http_code" -ge 400 ] && [ "$http_code" -lt 500 ] && [ "$http_code" -ne 401 ]; then
      info "Got $http_code HTTP code. This is confusing. Watchdog is now exiting..."
      exit 0
    fi

    # Signal 0 only tests whether the process can be signalled.
    if kill -0 "$pid" >/dev/null 2>&1; then
      echo "Process $pid is alive"
    else
      echo "Process $pid is dead"
      exit 1
    fi
  done
}

# Start the watchdog loop as a background job, passing the script's
# command-line arguments (polling interval and PID) through unchanged.
monitor "$@" &
