#!/usr/bin/env bash

# Monitors a local Mattermost web service and tries to recover it.
#
# This first part resolves the script location, defines the output/helper
# functions used throughout the script and loads the configuration file
# "conf/<script name without extension>.conf" located next to the script.

# "$0" is quoted so that a script path containing whitespace stays intact.
script_name="$(basename "$(realpath "$0")")"
working_dir="$(dirname "$(realpath "$0")")"

conf_file="${working_dir}/conf/${script_name%%.*}.conf"

LOCK_DIR="/tmp/${script_name%%.*}.LOCK"
RESTART_CHECK_FILE="/tmp/${script_name%%.*}.NEED-RESTART"

declare -a alert_email_arr

# -------------
# --- Some functions
# -------------

# Perform program exit housekeeping: remove the lock directory, print a
# trailing blank line when logging is enabled and exit.
# Arguments: $1 - exit status (defaults to 0 when omitted)
clean_up() {
  rm -rf -- "$LOCK_DIR"

  if $LOGGING ; then
    echo ""
  fi

  exit "${1:-0}"
}

# Print a message to stderr WITHOUT a trailing newline (only when running in
# a terminal with logging enabled). Escape sequences are interpreted.
# The script runs under bash, whose "echo -e -n" already suppresses the
# newline, so no probe file in /tmp is needed to detect "\c" support.
echononl() {
  if $terminal && $LOGGING ; then
    echo -e -n " $*" 1>&2
  fi
}

# Print a fatal error message and terminate the script with status 1
# (the lock directory is removed by clean_up).
fatal() {
  echo ""
  if $terminal ; then
    echo -e "[ \033[31m\033[1mError\033[m ]: $*"
    echo ""
    echo -e "\t\033[31m\033[1mScript was interupted\033[m!"
  else
    echo " [ Fatal ]: $*"
    echo ""
    echo " Script was terminated...."
  fi
  echo ""
  clean_up 1
}

# Print an error message (coloured when running in a terminal).
error() {
  echo ""
  if $terminal ; then
    echo -e "\t[ \033[31m\033[1mError\033[m ]: $*"
  else
    echo "[ Error ]: $*"
  fi
  echo ""
}

# Print a warning message (coloured when running in a terminal).
warn() {
  echo ""
  if $terminal ; then
    echo -e "\t[ \033[33m\033[1mWarning\033[m ]: $*"
  else
    echo "[ Warning ]: $*"
  fi
  echo ""
}

# Print a warning message, but only when running in a terminal.
warn_only_terminal() {
  if $terminal ; then
    echo ""
    echo -e " [ \033[33m\033[1mWarning\033[m ]: $*"
    echo ""
  fi
}

# Print an informational message (coloured when running in a terminal).
info() {
  echo ""
  if $terminal ; then
    echo -e " [ \033[32m\033[1mInfo\033[m ] $*"
  else
    echo "[ Info ] $*"
  fi
  echo ""
}

# Print a success message (coloured when running in a terminal).
ok() {
  echo ""
  if $terminal ; then
    echo -e "\t[ \033[32m\033[1mOk\033[m ]: $*"
  else
    echo "[ Ok ]: $*"
  fi
  echo ""
}

# The echo_* helpers below terminate a line started with echononl by
# printing a status tag; they print nothing unless logging is enabled.
echo_done() {
  if $terminal && $LOGGING ; then
    echo -e "\033[75G[ \033[32mdone\033[m ]"
  elif $LOGGING ; then
    echo " [ done ]"
  fi
}

echo_ok() {
  if $terminal && $LOGGING ; then
    echo -e "\033[75G[ \033[32mok\033[m ]"
  elif $LOGGING ; then
    echo " [ ok ]"
  fi
}

echo_failed() {
  if $terminal && $LOGGING ; then
    echo -e "\033[75G[ \033[1;31mfailed\033[m ]"
  elif $LOGGING ; then
    echo " [ failed ]"
  fi
}

echo_skipped() {
  if $terminal && $LOGGING ; then
    echo -e "\033[75G[ \033[33m\033[1mskipped\033[m ]"
  elif $LOGGING ; then
    echo " [ skipped ]"
  fi
}

# Succeeds (status 0) if $1 is a non-empty string consisting of digits only.
# The expansion removes the whole value as soon as it contains a non-digit,
# so a non-empty result means "digits only".
is_number() {
  [[ -n "${1##*[!0-9]*}" ]]
}

# Print the arguments (joined into one string) with leading and trailing
# whitespace removed; no trailing newline is written.
trim() {
  local var="$*"
  var="${var#"${var%%[![:space:]]*}"}"   # remove leading whitespace characters
  var="${var%"${var##*[![:space:]]}"}"   # remove trailing whitespace characters
  echo -n "$var"
}

# Print an empty line, but only when running in a terminal.
blank_line() {
  if $terminal ; then
    echo ""
  fi
}

# Wait 10 seconds, then force an immediate reboot of the machine.
# No notification is sent from here.
reboot_system() {
  sleep 10
  /sbin/reboot -f > /dev/null 2>&1
}

# -------------
# --- Read Configurations from $conf_file
# -------------

# Some default values
#
DEFAULT_NUMBER_LINES=20
DEFAULT_TIME_OUT=20
#DEFAULT_CONFLICTING_SCRIPTS="/root/bin/monitoring/check_webservice_load.sh"

if [[ ! -f "$conf_file" ]]; then
  echo ""
  echo -e " [ Fatal ] Configuration file '$(basename "${conf_file}")' not found!"
  echo ""
  echo -e "\tScript terminated.."
  echo ""
  exit 1
else
  source "$conf_file"
fi

# Word splitting is intended here: the addresses are whitespace separated.
for _email in $alert_email_addresses ; do
  alert_email_arr+=("$_email")
done

[[ -n "$sender_address" ]] || sender_address="check_mm_service@$(hostname -f)"
[[ -n "$content_type" ]] || content_type='Content-Type: text/plain;\n charset="utf-8"'
[[ -n "$TIME_OUT" ]] || TIME_OUT=$DEFAULT_TIME_OUT

# A non-numeric timeout would leave TIME_OUT_MAX empty and break every curl
# call further down, so fall back to the default in that case.
is_number "$TIME_OUT" || TIME_OUT=$DEFAULT_TIME_OUT

# "10#" forces base 10, so a value such as "08" is not parsed as octal.
TIME_OUT_MAX="$(( 10#${TIME_OUT} + 5 ))"

#[[ -n "$CONFLICTING_SCRIPTS" ]] || CONFLICTING_SCRIPTS="$DEFAULT_CONFLICTING_SCRIPTS"
[[ -n "$CONFLICTING_SCRIPTS" ]] || CONFLICTING_SCRIPTS=""

# -------------
# --- Check some prerequisites
# -------------

# - Running in a terminal?
# -
# Coloured/verbose output is only produced when stdout is a terminal.
if [[ -t 1 ]] ; then
  terminal=true
  LOGGING=true
else
  terminal=false
  LOGGING=false
fi

# - Stop here if one of the given conflicting scripts is running
# -
# Each whitespace separated entry of CONFLICTING_SCRIPTS has the form
#   <script>                     -> look for /tmp/<script name w/o ext>.LOCK
#   <script>:<lock directory>    -> look for that directory
#   <script>:CHECK_PROCESS_LIST  -> look for <script> in the process list
if [[ ${#CONFLICTING_SCRIPTS} -gt 0 ]] ; then

  # - Use a random start delay to reduce the chance that conflicting scripts
  # - which start at the same time all abort because of each other.
  # -
  # - !! Notice !!
  # - This only makes sense if the fixed (default) LOCK directory is used;
  # - otherwise the process list (and NOT the LOCK directory) is used to look
  # - for scripts running in parallel.
  # -
  # - The delay is skipped when running in a terminal (from console).
  # -
  if ! $terminal ; then
    if [[ "$LOCK_DIR" = "/tmp/${script_name%%.*}.LOCK" ]]; then
      _shift="$(( RANDOM % 10 + 1 ))"
      sleep "$(( RANDOM % 25 + _shift ))"
    fi
  fi

  _stop_running=false
  for _val in $CONFLICTING_SCRIPTS ; do
    IFS=':' read -r -a _val_arr <<< "${_val}"
    _script_name="$(basename "${_val_arr[0]}")"

    if [[ -n "${_val_arr[1]}" ]] ; then
      if [[ "${_val_arr[1]}" = "CHECK_PROCESS_LIST" ]] ; then
        check_string_ps="${_val_arr[0]}"
        # Ignore the grep itself and editors (vim) that have the script open.
        if ps -e f | grep -E "\s+${check_string_ps}" | grep -v grep \
          | grep -v -E "\s+vim\s+" > /dev/null ; then
          _stop_running=true
        fi
      elif [[ -d "${_val_arr[1]}" ]] ; then
        _stop_running=true
      fi
    elif [[ -d "/tmp/${_script_name%%.*}.LOCK" ]]; then
      _stop_running=true
    fi

    if $_stop_running ; then
      warn_only_terminal "\033[1m${_script_name}\033[m is currently running, but it conflicts with this script. Exiting now.."
      # The lock has not been acquired by this run yet. Calling clean_up
      # here would remove the lock directory of another, still running
      # instance of this script, so exit without touching it.
      if $LOGGING ; then
        echo ""
      fi
      exit 1
    fi
  done
fi

# -------------
# - Job is already running?
# -------------

# - If job already runs, stop execution..
# -
# mkdir is atomic, so the directory doubles as the lock.
if mkdir "$LOCK_DIR" 2> /dev/null ; then

  ## - Remove lockdir when the script receives a signal
  ## - (regular exits go through clean_up directly)
  trap "clean_up 1" SIGHUP SIGINT SIGTERM

else

  datum="$(date +"%d.%m.%Y %H:%M")"

  msg="[ Error ]: A previos instance of \"$(basename "$0")\" seems already be running.\n\n Exiting now.."

  echo ""
  echo "[ Error ]: A previos instance of that script \"$(basename "$0")\" seems already be running."
  echo ""
  echo -e " Exiting now.."
  echo ""

  for _email in "${alert_email_arr[@]}" ; do
    echo -e "To:${_email}\n${content_type}\nSubject:Error cronjob $(basename "$0") -- $datum\n${msg}\n" \
      | sendmail -F "Error $(hostname -f)" -f "$sender_address" "$_email"
  done

  exit 1

fi

# -------------
# --- Check some further prerequisites
# -------------

# - Systemd supported ?
# -
systemd=$(command -v systemd)
systemctl=$(command -v systemctl)

if $LOGGING ; then
  echo ""
fi

# Integer attribute: whatever curl prints is stored as a number.
declare -i response=-1

NGINX_SERVICE_DOWN=false
MM_SERVICE_DOWN=false

if [[ -n "$LOCAL_MM_WEBSITES_TO_CHECK" ]] ; then

  echononl "Check local mattermost service \033[1m$LOCAL_MM_WEBSITES_TO_CHECK\033[m .."

  # Up to three attempts, two seconds apart; the first HTTP 200 ends the
  # script successfully.
  for (( _attempt = 0; _attempt < 3; _attempt++ )) ; do

    # NOTE(review): $LOCAL_MM_WEBSITES_TO_CHECK is left unquoted as in the
    # original; the plural name suggests it might hold several URLs - confirm.
    response="$(curl --max-time "$TIME_OUT_MAX" --connect-timeout "$TIME_OUT" \
      -I -k -L --write-out '%{http_code}' --silent --output /dev/null $LOCAL_MM_WEBSITES_TO_CHECK \
      2> "${LOCK_DIR}/error.log")"

    # 200 - OK
    if [[ $response -eq 200 ]] ; then
      echo_ok
      # Service is healthy again: forget a pending restart/reboot escalation.
      rm -f "$RESTART_CHECK_FILE"
      clean_up 0
    fi

    sleep 2
  done

  echo_failed

  if [[ $response -gt 499 ]] ; then
    # 5xx: the web server answered, so only Mattermost is considered down.
    MM_SERVICE_DOWN=true
  elif [[ $response -eq -1 ]]; then
    # NOTE(review): once curl has run, "response" holds curl's output (curl
    # is expected to print 000, stored as 0, when no response was received),
    # so this branch looks unreachable and such failures end up in the
    # "else" branch below - confirm which handling is intended.
    NGINX_SERVICE_DOWN=true
    PID="$(ps -e f | grep -E "[[:digit:]]\s+/opt/mattermost/bin/mattermost" | grep -v grep 2> /dev/null)"
    [[ -z "$PID" ]] && MM_SERVICE_DOWN=true
  else
    NGINX_SERVICE_DOWN=true
    MM_SERVICE_DOWN=true
  fi

else
  warn "No local Mattermost Service to check is given (empty var 'LOCAL_MM_WEBSITES_TO_CHECK')"
  clean_up 10
fi

# Start collecting diagnostic information for the notification e-mail.
# The "\n" sequences stay literal here; they are expanded later by "echo -e".
msg_process_list="process list:\n=============\n$(ps -e f)"

msg_head="\n==========\nSystem logfiles\n==========\n"

msg00="Last entries (20 lines) of \"/var/log/syslog\":"
msg01="==============================================="
msg02="$(tail -n 20 /var/log/syslog)"
msg03="Last entries (20 lines) of \"/var/log/messages\":"
msg04="================================================="
msg05="$(tail -n 20 /var/log/messages)"
msg06="Last entries (20 lines) of \"/var/log/auth.log\":"
msg07="================================================="
msg08="$(tail -n 20 /var/log/auth.log)"
msg09="Last entries (20 lines) of \"/var/log/daemon.log\":"
msg10="================================================="
msg11="$(tail -n 20 /var/log/daemon.log)"
msg12="Last entries (20 lines) of \"/var/log/kern.log\":"
msg13="================================================="
msg14="$(tail -n 20 /var/log/kern.log)"
msg15="Lastlog:"
msg16="========"
msg17="$(lastlog)"
msg18="dmesg:"
msg19="======"
msg20="$(dmesg -T)"

# The "\n" sequences stay literal here; they are expanded by "echo -e" when
# the mail is written.
msg="${msg_head}\n${msg00}\n${msg01}\n${msg02}\n\n${msg03}\n${msg04}\n${msg05}\n\n${msg06}\n${msg07}\n${msg08}\n\n${msg09}\n${msg10}\n${msg11}\n\n${msg12}\n${msg13}\n${msg14}\n\n${msg15}\n${msg16}\n${msg17}\n\n${msg18}\n${msg19}\n${msg20}"

# Optional user defined logfiles: whitespace separated entries of the form
# "<file>[:<number of lines>]"; DEFAULT_NUMBER_LINES is used when no valid
# number is given.
if [[ ${#LOG_FILES_TO_MONITOR} -gt 0 ]] ; then

  msg_user_defined="\n==========\nUser defined logfiles\n==========\n"

  for _val in $LOG_FILES_TO_MONITOR ; do
    IFS=':' read -r -a _val_arr <<< "${_val}"
    _log_file="${_val_arr[0]}"

    if [[ -n "${_val_arr[0]}" ]] && is_number "${_val_arr[1]}" ; then
      _number_lines=${_val_arr[1]}
    else
      _number_lines=$DEFAULT_NUMBER_LINES
    fi

    # "-s" is false for empty AND for missing files; both are reported as empty.
    if [[ -s "${_log_file}" ]] ; then
      msg_user_defined="${msg_user_defined}\n---\nLast entries (${_number_lines} lines) of \"${_log_file}\":\n---\n$(tail -n "${_number_lines}" "${_log_file}")\n"
    else
      msg_user_defined="${msg_user_defined}\n---\nLast entries (${_number_lines} lines) of \"${_log_file}\":\n---\n-- FILE IS EMPTY --\n"
    fi
  done

  msg_user_defined="${msg_user_defined}\n\n"
else
  msg_user_defined=""
fi

# Mail the collected diagnostics to every configured recipient.
# Arguments: $1 - mail subject, $2 - introductory text placed before the report
_send_alert() {
  local subject="$1"
  local intro="$2"
  local recipient
  local disk_usage

  disk_usage="$(df -h)"

  for recipient in "${alert_email_arr[@]}" ; do
    echo -e "To:${recipient}\n${content_type}\nSubject:${subject}\n${intro}\n\nFilesystem usage:\n=================\n${disk_usage}\n\n${msg_process_list}\n\n${msg_user_defined}\n${msg}" \
      | sendmail -F "Error $(hostname -f)" -f "$sender_address" "$recipient"
  done
}

# Print the PID (second "ps aux" column) of the last process whose line
# matches the extended regular expression $1; prints nothing if none matches.
_last_matching_pid() {
  ps aux | grep -E "$1" | grep -v grep | tail -n 1 | awk '{print$2}' 2> /dev/null
}

# Close a line started with echononl according to exit status $1; on failure
# the content of ${LOCK_DIR}/error.log is reported as well.
_report_status() {
  if [[ $1 -ne 0 ]]; then
    echo_failed
    error "$(cat "${LOCK_DIR}/error.log")"
  else
    echo_done
  fi
}

# Poll (once per second, giving up after about a dozen tries) until a process
# matching $1 shows up. The result is left in the global PID (empty = none).
_wait_for_process() {
  local -i tries=0

  PID="$(_last_matching_pid "$1")"
  sleep 1
  while [[ -z "$PID" ]]; do
    sleep 1
    PID="$(_last_matching_pid "$1")"
    if [[ $tries -gt 10 ]]; then
      break
    fi
    ((tries++))
  done
}

# Escalation: the first failed run only marks the failure (RESTART_CHECK_FILE)
# and tries to restart the services below. If the marker still exists on a
# later failed run, the machine is rebooted.
if [[ ! -f "$RESTART_CHECK_FILE" ]]; then

  touch "$RESTART_CHECK_FILE"

  error "The local Mattermost Service seems to be down."
  if $LOGGING ; then
    echo -e "\n \033[1mFirst we try to restore the system. If this is not successful,\n the system will be restarted in about 5 minutes.\033[m"
  else
    echo ""
    echo "First we try to restore the system. If this is not successful,"
    echo "the system will be restarted in about 5 minutes."
    echo ""
  fi

  err_msg="\n[ Warning ]: The local Mattermost Service seems to be down.\n"
  err_msg="${err_msg}\nFirst we try to restore the system. If this is not successful,\nthe system will be restarted in about 5 minutes.\n"

  _send_alert "[Warning] Local Mattermost Service is not available." "$err_msg"

else

  error "The local Mattermost Service seems to be down."
  if $LOGGING ; then
    echo -e "\n\033[1mGoing to restart the system NOW..\033[m"
  else
    echo ""
    echo "Going to restart the system NOW.."
    echo ""
  fi

  err_msg="\n[ Error ]: The local Mattermost Service seems to be down.\n"
  err_msg="${err_msg}\nGoing to restart the system..\n"

  _send_alert "[Error] Local Mattermost Service is not available." "$err_msg"

  rm -f "$RESTART_CHECK_FILE"

  reboot_system

fi

# NOTE(review): from here on err_msg collects the restart results, but it is
# never mailed or printed afterwards - confirm whether a follow-up
# notification was intended.

if $NGINX_SERVICE_DOWN ; then

  _nginx_pattern="[[:digit:]]\s+nginx:"

  error "NGINX Service seems to be down. Going to restart Service.."
  err_msg="\n[ Error ]: NGINX Service seems to be down. Going to restart NGINX Service\n"

  echononl "Stop nginx Service.."
  "$systemctl" stop nginx > /dev/null 2> "${LOCK_DIR}/error.log"
  _report_status $?

  # Kill whatever survived the regular stop (a few attempts at most).
  declare -i counter=0
  PID="$(_last_matching_pid "$_nginx_pattern")"
  while [[ -n "$PID" ]] ; do

    if [[ $counter -gt 3 ]] ; then
      # Give up killing, but still try to start the service below. Report
      # the failure instead of leaving the loop silently.
      error "Killing remaining nginx Process(es) failed!"
      break
    fi

    warn "There are still nginx processes running"

    ((counter++))

    echononl "${counter}: Kill remaining nginx Process(es).."
    if [[ -s "/run/nginx.pid" ]]; then
      kill "$(cat /run/nginx.pid)" > "${LOCK_DIR}/error.log" 2>&1
      _report_status $?
      rm -f "/run/nginx.pid" > /dev/null 2>&1
    else
      killall nginx > "${LOCK_DIR}/error.log" 2>&1
      _report_status $?
    fi

    sleep 1
    PID="$(_last_matching_pid "$_nginx_pattern")"
  done

  echononl "Start nginx Service.."
  "$systemctl" start nginx > /dev/null 2> "${LOCK_DIR}/error.log"
  _report_status $?

  _wait_for_process "$_nginx_pattern"

  if [[ -z "$PID" ]] ; then
    error "Restarting NGINX Service failed!"
    err_msg="${err_msg}\n[ Error ]: Restarting NGINX Service failed!"
  else
    ok "NGINX Service is up and running"
    err_msg="${err_msg}\n[ OK ]: NGINX Service is up and running"
  fi

fi

if $MM_SERVICE_DOWN; then

  _mm_pattern="[[:digit:]]\s+/opt/mattermost/bin/mattermost"

  error "Mattermost Service seems to be down. Going to restart Service.."
  if [[ -n "$err_msg" ]]; then
    err_msg="${err_msg}\n\n\n[ Error ]: Mattermost Service seems to be down. Going to restart Mattermost Service Service\n"
  else
    err_msg="\n[ Error ]: Mattermost Service seems to be down. Going to restart Mattermost Service Service\n"
  fi

  echononl "Stop mattwermost Service.."
  "$systemctl" stop mattermost > /dev/null 2> "${LOCK_DIR}/error.log"
  _report_status $?

  # Kill whatever survived the regular stop (a few attempts at most).
  declare -i counter=0
  PID="$(_last_matching_pid "$_mm_pattern")"
  while [[ -n "$PID" ]] ; do

    if [[ $counter -gt 3 ]] ; then
      # Give up killing, but still try to start the service below. Report
      # the failure instead of leaving the loop silently.
      error "Killing remaining mattermost Process(es) failed!"
      break
    fi

    warn "There are still mattermost processes running"

    ((counter++))

    echononl "${counter}: Kill remaining mattermost Process(es).."
    killall mattermost > "${LOCK_DIR}/error.log" 2>&1
    _report_status $?

    sleep 1
    PID="$(_last_matching_pid "$_mm_pattern")"
  done

  echononl "Start mattermost Service.."
  "$systemctl" start mattermost > /dev/null 2> "${LOCK_DIR}/error.log"
  _report_status $?

  _wait_for_process "$_mm_pattern"

  if [[ -z "$PID" ]] ; then
    error "Restarting Mattermost Service failed!"
    err_msg="${err_msg}\n[ Error ]: Restarting Mattermost Service failed!"
  else
    ok "Mattermost Service is up and running"
    err_msg="${err_msg}\n[ OK ]: Mattermost Service is up and running"
  fi

fi

if $LOGGING ; then
  echo ""
fi

clean_up 0