From 97625cbe0b92b2404c92b8b2a56577da44097582 Mon Sep 17 00:00:00 2001 From: Christoph Date: Thu, 23 Jun 2022 11:22:24 +0200 Subject: [PATCH] check_local_mattermost_service.sh: support restart of the whole system if service continues be down. --- check_local_mattermost_service.sh | 172 +++++++++++++++++++----------- 1 file changed, 108 insertions(+), 64 deletions(-) diff --git a/check_local_mattermost_service.sh b/check_local_mattermost_service.sh index cb91cd6..f73b484 100755 --- a/check_local_mattermost_service.sh +++ b/check_local_mattermost_service.sh @@ -7,6 +7,8 @@ conf_file="${working_dir}/conf/${script_name%%.*}.conf" LOCK_DIR="/tmp/${script_name%%.*}.LOCK" +RESTART_CHECK_FILE="/tmp/${script_name%%.*}.NEED-RESTART" + declare -a alert_email_arr @@ -335,6 +337,9 @@ if [[ -n "$LOCAL_MM_WEBSITES_TO_CHECK" ]] ; then # 200 - OK if [[ $response -eq 200 ]] ; then echo_ok + if [[ -f "$RESTART_CHECK_FILE" ]] ; then + rm -f "$RESTART_CHECK_FILE" + fi clean_up 0 fi sleep 2 @@ -343,12 +348,15 @@ if [[ -n "$LOCAL_MM_WEBSITES_TO_CHECK" ]] ; then echo_failed if [[ $response -gt 499 ]] ; then MM_SERVICE_DOWN=true - elif [[ $response -eq 0 ]] || [[ $response -eq -1 ]]; then + elif [[ $response -eq -1 ]]; then NGINX_SERVICE_DOWN=true PID="$(ps -e f | grep -E "[[:digit:]]\s+/opt/mattermost/bin/mattermost" | grep -v grep 2> /dev/null)" - [[ -z "$PID" ]] && MM_SERVICE_DOWN=true + [[ -z "$PID" ]] && MM_SERVICE_DOWN=true + #elif [[ $response -eq 0 ]] ; then + # NGINX_SERVICE_DOWN=true + # MM_SERVICE_DOWN=true else NGINX_SERVICE_DOWN=true MM_SERVICE_DOWN=true @@ -412,6 +420,8 @@ if [[ ${#LOG_FILES_TO_MONITOR} -gt 0 ]] ; then else msg_user_defined="${msg_user_defined}\n---\nLast entries (${_number_lines} lines) of \"${_log_file}\":\n---\n-- FILE IS EMPTY --\n" fi + done + fi done msg_user_defined="${msg_user_defined}\n\n" else @@ -419,6 +429,62 @@ else fi + +if [[ ! -f "$RESTART_CHECK_FILE" ]]; then + + touch "$RESTART_CHECK_FILE" + + error "The local Mattermost Service seems to be down." + + if $LOGGING ; then + echo -e "\n \033[1mFirst we try to restore the system. If this is not successful,\n the system will be restarted in about 5 minutes.\033[m" + else + echo "" + echo "First we try to restore the system. If this is not successful," + echo ""the system will be restarted in about 5 minutes. + echo "" + fi + + err_msg="\n[ Warning ]: The local Mattermost Service seems to be down.\n" + err_msg="${err_msg}\nFirst we try to restore the system. If this is not successful,\nthe system will be restarted in about 5 minutes.\n" + + datum="$(date +"%d.%m.%Y %H:%M")" + + for _email in ${alert_email_arr[@]} ; do + + echo -e "To:${_email}\n${content_type}\nSubject:[Warning] Local Mattermost Service is not available.\n${err_msg}\n\nFilesystem usage:\n=================\n$(df -h)\n\n${msg_process_list}\n\n${msg_user_defined}\n${msg}" \ + | sendmail -F "Error `hostname -f`" -f $sender_address $_email + done + +else + + error "The local Mattermost Service seems to be down." + + if $LOGGING ; then + echo -e "\n\033[1mGoing to restart the system NOW..\033[m" + else + echo "" + echo "Going to restart the system NOW.." + echo "" + fi + + err_msg="\n[ Error ]: The local Mattermost Service seems to be down.\n" + err_msg="${err_msg}\nGoing to restart the system..\n" + + datum="$(date +"%d.%m.%Y %H:%M")" + + for _email in ${alert_email_arr[@]} ; do + + echo -e "To:${_email}\n${content_type}\nSubject:[Error] Local Mattermost Service is not available.\n${err_msg}\n\nFilesystem usage:\n=================\n$(df -h)\n\n${msg_process_list}\n\n${msg_user_defined}\n${msg}" \ + | sendmail -F "Error `hostname -f`" -f $sender_address $_email + done + + rm -f "$RESTART_CHECK_FILE" + reboot_system + +fi + + if $NGINX_SERVICE_DOWN ; then error "NGINX Service seems to be down. Going to restart Service.." @@ -483,64 +549,6 @@ if $NGINX_SERVICE_DOWN ; then declare -i counter=0 PID="$(ps aux | grep -E "[[:digit:]]\s+nginx:" | grep -v grep | tail -n 1 | awk '{print$2}' 2> /dev/null)" - sleep 1 - while [[ "X${PID}" = "X" ]]; do - sleep 1 - PID="$(ps aux | grep -E "[[:digit:]]\s+nginx:" | grep -v grep | tail -n 1 | awk '{print$2}' 2> /dev/null)" - if [[ $counter -gt 10 ]]; then - break - else - ((counter++)) - fi - done - - if [[ "X${PID}" = "X" ]] ; then - error "Restarting NGINX Service failed!" - - err_msg="${err_msg}\n[ Error ]: Restarting NGINX Service failed!" - - else - ok "NGINX Service is up and running" - - err_msg="${err_msg}\n[ OK ]: NGINX Service is up and running" - fi - -fi - -if $MM_SERVICE_DOWN; then - error "Mattermost Service seems to be down. Going to restart Service.." - - if [[ -n "$err_msg" ]]; then - err_msg="${err_msg}\n\n\n[ Error ]: Mattermost Service seems to be down. - - Going to restart Mattermost Service Service\n" - else - err_msg="\n[ Error ]: Mattermost Service seems to be down. - - Going to restart Mattermost Service Service\n" - fi - - echononl "Stop mattwermost Service.." - $systemctl stop mattermost > /dev/null 2> ${LOCK_DIR}/error.log - if [[ $? -ne 0 ]]; then - echo_failed - error "$(cat ${LOCK_DIR}/error.log)" - else - echo_done - fi - - declare -i counter=0 - PID="$(ps aux | grep -E "[[:digit:]]\s+/opt/mattermost/bin/mattermost" | grep -v grep | tail -n 1 | awk '{print$2}' 2> /dev/null)" - - while [[ -n "$PID" ]] ; do - - if [[ $counter -gt 3 ]] ; then - break - fatal "Killing remaining mattermost Process(es) failed!" - fi - - warn "There are still mattermost processes running" - ((counter++)) echononl "${counter}: Kill remaining mattermost Process(es).." @@ -590,14 +598,50 @@ if $MM_SERVICE_DOWN; then fi -datum="$(date +"%d.%m.%Y %H:%M")" -for _email in ${alert_email_arr[@]} ; do - echo -e "To:${_email}\n${content_type}\nSubject:[Error] Local Mattermost Serviceinot available.\n$err_msg\n\nFilesystem usage:\n=================\n$(df -h)\n\n${msg_process_list}\n\n${msg_user_defined}\n${msg}" \ +if [[ ! -f "$RESTART_CHECK_FILE" ]]; then + + touch "$RESTART_CHECK_FILE" + + error "The local Mattermost Service seems to be down." + err_msg="\n[ Warning ]: The local Mattermost Service seems to be down.\n" + err_msg="${err_msg}\nFor now nothing is to. because its the first time..\n" + + datum="$(date +"%d.%m.%Y %H:%M")" + + for _email in ${alert_email_arr[@]} ; do + + echo -e "To:${_email}\n${content_type}\nSubject:[Warning] Local Mattermost Service is not available.\n${err_msg}\n\nFilesystem usage:\n=================\n$(df -h)\n\n${msg_process_list}\n\n${msg_user_defined}\n${msg}" \ + | sendmail -F "Error `hostname -f`" -f $sender_address $_email + done + +else + + error "The local Mattermost Service seems to be down." + err_msg="\n[ Warning ]: The local Mattermost Service seems to be down.\n" + + if $LOGGING ; then + echo -e "\n\033[1mGoing to restart the system NOW..\033[m" + else + echo "" + echo "Going to restart the system NOW.." + fi + + err_msg="${err_msg}\nGoing to restart the system..\n" + + datum="$(date +"%d.%m.%Y %H:%M")" + + for _email in ${alert_email_arr[@]} ; do + + echo -e "To:${_email}\n${content_type}\nSubject:[Error] Local Mattermost Service is not available.\n$err_msg\n\nFilesystem usage:\n=================\n$(df -h)\n\n${msg_process_list}\n\n${msg_user_defined}\n${msg}" \ | sendmail -F "Error `hostname -f`" -f $sender_address $_email -done + done + rm -f "$RESTART_CHECK_FILE" + reboot_system + +fi if $LOGGING ; then