#!/bin/bash

# Option values.  They start out empty/false and are filled in by the
# command-line parsing further down.
HOSTNAME=''
NAGIOS_IPS=''
HELP='false'
VERBOSE='false'

# Text printed when --help is requested.
USAGE='Send passive alerts to a set of Nagios servers
--
monitor-replication-nagios --help
monitor-replication-nagios --nagios-hosts=ip1,ip2,... [--hostname=<hostname>] [--verbose]
Examples:
  monitor-replication-nagios --nagios-hosts=192.168.23.1,10.0.1.2
  monitor-replication-nagios --nagios-hosts=192.168.23.1,10.0.1.2 --hostname=kdc-dev1.stanford.edu'

# Print a diagnostic message on stdout, but only in verbose mode.
#   $1 - the message text
# Reads the global VERBOSE; nothing is printed unless it is exactly "true".
progress () {
    [[ "$VERBOSE" == true ]] || return 0
    echo "$1"
}

# Report a fatal error and terminate the script.
#   $1 - description of what went wrong
# The message is written to stderr (prefixed with "error: ") so that it is
# not mixed into regular output, and the script exits with status 1.
exit_with_error () {
    printf 'error: %s\n' "$1" >&2
    exit 1
}

# Process options
# Taken from https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash

# Make sure a usable getopt is installed: the option parsing below relies
# on `getopt --test` exiting with status 4 (the marker getopt(1) documents
# for the enhanced implementation).  Any other status is treated as fatal.
getopt --test > /dev/null
[[ $? -eq 4 ]] || {
    echo 'I am sorry, `getopt --test` failed in this environment.'
    exit 1
}

# Define the options
# n: hostname
# i: Nagios IP addresses
# v: verbose
# h: help
OPTIONS='n:i:vh'
LONGOPTS='hostname:,nagios-hosts:,verbose,help'

# parse_options ARG...
#
# Parse the given command-line arguments with getopt and record the result
# in the globals HOSTNAME, NAGIOS_IPS, VERBOSE and HELP.  VERBOSE and HELP
# are reset to "false" on every call; HOSTNAME and NAGIOS_IPS are assigned
# only when the matching option is present.
#
# Exits the script with status 2 when getopt rejects the arguments (getopt
# prints its own complaint on stderr) and with status 3 if getopt hands
# back an option that the loop below does not handle.
#
# Only this function's own argument list is rewritten; the script's
# positional parameters are left as they were (nothing after the option
# parsing reads them).
parse_options () {
    if ! PARSED=$(getopt --options="$OPTIONS" --longoptions="$LONGOPTS" --name "$0" -- "$@"); then
        exit 2
    fi

    # Replace the argument list with getopt's normalized, quoted version:
    # options first, then "--", then any remaining operands.
    eval set -- "$PARSED"

    # Set v (verbose) and h (help) to false
    VERBOSE=false HELP=false

    # Consume the options in order until the "--" end-of-options marker.
    while true; do
        case "$1" in
            -h|--help)
                HELP=true
                shift
                ;;
            -v|--verbose)
                VERBOSE=true
                shift
                ;;
            -n|--hostname)
                HOSTNAME="$2"
                shift 2
                ;;
            -i|--nagios-hosts)
                NAGIOS_IPS="$2"
                shift 2
                ;;
            --)
                shift
                break
                ;;
            *)
                # getopt accepted something this loop has no case for.
                echo "Programming error: $1" >&2
                exit 3
                ;;
        esac
    done
}

parse_options "$@"

# In verbose mode, report the value every option ended up with.
progress "option 'i|nagios-hosts' is '$NAGIOS_IPS'"
progress "option 'n|hostname'     is '$HOSTNAME'"
progress "option 'h|help'         is '$HELP'"
progress "option 'v|verbose'      is '$VERBOSE'"

# A help request short-circuits the rest of the script: print the usage
# text and stop here.
case "$HELP" in
    true)
        echo "$USAGE"
        exit
        ;;
esac

# split_nagios_ips LIST
#
# Split the comma-separated LIST into the global array IP_ADDRESSES.
# Empty entries (as produced by "a,,b" or a lone ",") are skipped so that
# no alert is ever addressed to an empty host.  Only the first line of
# LIST is considered.
split_nagios_ips () {
    local -a fields=()
    local field

    IP_ADDRESSES=()
    IFS=',' read -ra fields <<< "$1"
    for field in "${fields[@]}"; do
        if [[ -n "$field" ]]; then
            IP_ADDRESSES+=("$field")
        fi
    done
}

# Parse the Nagios IP addresses.
split_nagios_ips "$NAGIOS_IPS"
progress "parsed --nagios-hosts: $NAGIOS_IPS"

NUM_ADDRESSES=${#IP_ADDRESSES[@]}
progress "number of Nagios IP addresses: ${NUM_ADDRESSES}"

if (( NUM_ADDRESSES == 0 )); then
    exit_with_error "missing Nagios IP addresses"
fi

if [[ -z "$HOSTNAME" ]]; then
    # Derive the hostname from the system
    HOSTNAME=$(hostname --short)
    # Without a hostname the passive check result would have an empty
    # host field, so stop rather than send it.
    if [[ -z "$HOSTNAME" ]]; then
        exit_with_error "unable to determine the local hostname"
    fi
else
    progress "using hostname from --hostname options"
fi
progress "HOSTNAME is '$HOSTNAME'"

# Replication is considered behind when too many seconds have passed since
# the last password change.  The password update job only runs every 5
# minutes and there is a built-in delay of up to two additional minutes,
# so a password change can be as much as 7 minutes old and still be fine.
# Hence WARNING is set to 9 minutes and CRITICAL to 15 minutes.  Both
# values are in seconds and are computed with shell arithmetic rather
# than an external `expr` process.
WARN_SECS=$(( 9  * 60 ))
CRIT_SECS=$(( 15 * 60 ))

progress "WARN_SECS is $WARN_SECS seconds"
progress "CRIT_SECS is $CRIT_SECS seconds"

# replication_status_message STDERR EXIT_STATUS SECONDS WARN_SECS CRIT_SECS
#
# Print the "<return code>;<text>" part of the passive check result for one
# run of kdc-get-last-pwchange:
#   STDERR       what the command wrote to stderr
#   EXIT_STATUS  its exit status
#   SECONDS      what it wrote to stdout (seconds since the last change)
#   WARN_SECS    WARNING threshold in seconds
#   CRIT_SECS    CRITICAL threshold in seconds
#
# 3 (UNKNOWN) is reported when the command wrote to stderr, produced no
# output, exited non-zero, or printed something that is not a plain decimal
# integer.  The last check matters: inside [[ ... -gt ... ]] a non-numeric
# string is evaluated as an (unset) variable name, i.e. 0, which would
# otherwise be reported as OK.
replication_status_message () {
    local err_text=$1 exit_status=$2 seconds=$3 warn_secs=$4 crit_secs=$5
    # Optional surrounding whitespace and sign; no leading zeros, because
    # shell arithmetic would read them as octal.
    local integer_re='^[[:space:]]*-?(0|[1-9][0-9]*)[[:space:]]*$'

    if [[ -n "$err_text" ]]; then
        printf '%s\n' "3;UNKNOWN - monitor failed: $err_text"
    elif [[ -z "$seconds" ]]; then
        printf '%s\n' '3;UNKNOWN - kdc-get-last-pwchange returned an empty string'
    elif [[ "$exit_status" -ne 0 ]]; then
        printf '%s\n' "3;UNKNOWN - monitor failed: kdc-get-last-pwchange exited with status $exit_status"
    elif [[ ! "$seconds" =~ $integer_re ]]; then
        printf '%s\n' "3;UNKNOWN - kdc-get-last-pwchange returned an unexpected value: $seconds"
    elif [[ "$seconds" -gt "$crit_secs" ]]; then
        printf '%s\n' "2;CRITICAL - replication is behind $seconds seconds"
    elif [[ "$seconds" -gt "$warn_secs" ]]; then
        printf '%s\n' "1;WARNING - replication is behind $seconds seconds"
    else
        printf '%s\n' '0;OK - replication is within system limits'
    fi
}

# Get the number of seconds since the password was last changed.  stdout
# and stderr are needed separately, so stderr goes through a temporary file.
TMPFILE=$(mktemp /tmp/monitor-replication-nagios.XXXXXX) \
    || exit_with_error "cannot create a temporary file"
SECONDS_BEHIND=$(/usr/sbin/kdc-get-last-pwchange testing/replication 2> "$TMPFILE")
LAST_ERROR=$?
STDERR=$(<"$TMPFILE")
rm -f -- "$TMPFILE"

progress "LAST_ERROR is $LAST_ERROR"
progress "SECONDS_BEHIND is '$SECONDS_BEHIND'"
progress "STDERR is '$STDERR'"

MSG=$(replication_status_message "$STDERR" "$LAST_ERROR" "$SECONDS_BEHIND" "$WARN_SECS" "$CRIT_SECS")


# Send the NSCA (passive check) result to each Nagios monitor.
#
# The result line is the same for every server, so it is built once.  A
# failure to deliver to one server is reported on stderr and does not stop
# delivery to the remaining servers; the script exits with status 1 if any
# delivery failed and 0 otherwise.
PSV_MSG="$HOSTNAME;replication_passive;$MSG"
SEND_FAILURES=0

for IP_ADDRESS in "${IP_ADDRESSES[@]}"
do
    progress "processing passive alert for Nagios address $IP_ADDRESS"
    progress "passive message: $PSV_MSG"

    # send_nsca's stderr is kept apart from its stdout via a temporary file.
    if ! TMPFILE_ERR=$(mktemp /tmp/monitor-replication-nagios-err.XXXXXX); then
        echo "error: cannot create a temporary file; skipping $IP_ADDRESS" >&2
        SEND_FAILURES=$(( SEND_FAILURES + 1 ))
        continue
    fi

    # Fields are separated by ";" (-d).  The status of the substitution is
    # that of send_nsca, the last command in the pipeline.
    STDOUT=$(printf '%s\n' "$PSV_MSG" | /usr/sbin/send_nsca -d ';' -H "$IP_ADDRESS" 2> "$TMPFILE_ERR")
    EXIT_CODE=$?
    STDERR=$(<"$TMPFILE_ERR")
    rm -f -- "$TMPFILE_ERR"

    progress "send_nsca command output: $STDOUT"

    if [[ "$EXIT_CODE" -ne 0 ]]; then
        echo "error: send_nsca to $IP_ADDRESS failed with status $EXIT_CODE: $STDERR" >&2
        SEND_FAILURES=$(( SEND_FAILURES + 1 ))
    fi
done

if (( SEND_FAILURES > 0 )); then
    exit 1
fi

exit 0

# Documentation.  Use a hack to hide this from the shell.  Because of the
# above exit line, this should never be executed.
DOCS=<<__END_OF_DOCS__

=head1 NAME

monitor-replication-nagios - Passive monitor for replication status

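=head1 SYNOPSIS

B<monitor-replication-nagios> B<--help>

B<monitor-replication-nagios> B<--nagios-hosts>=I<ip1,ip2,...>
[B<--hostname>=I<hostname>] [B<--verbose>]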

=head1 DESCRIPTION

This command gets the time that the password for the principal
B<testing/replication> was last changed (using
C</usr/sbin/kdc-get-last-pwchange>). If the last time it was changed is
more than 15 minutes in the past, a CRITICAL passive Nagios alert is sent
to each Nagios server listed in C<--nagios-hosts>.  If the last time it
was changed is less than 15 minutes but more than 9 minutes in the past,
a WARNING passive Nagios alert is sent.

If the C<--hostname> option is supplied, that value is used as the hostname
for the passive alert. If no C<--hostname> option is provided, then the
C<hostname> command is run and that value is used.

=head1 AUTHOR

Adam Lewenberg <adamhl@stanford.edu>

=cut
