#!/usr/bin/env bash
###############################################################################
# apache-ip-requests-analyze.sh
#
# Zweck
# -----
# Dieses Skript analysiert eine zentrale Apache-Sammellogdatei, in die alle
# VirtualHosts zusätzlich protokollieren, z.B.:
#
#   CustomLog /var/log/apache2/ip_requests.log base_requests
#   LogFormat "%a %v %p %t %r %>s \"%{User-Agent}i\" %T" base_requests
#
# Beispielzeile:
#   62.138.6.15 www.example.tld 443 [21/Feb/2026:00:00:59 +0100] \
#     GET /foo HTTP/1.1 404 "Mozilla/5.0 ..." 0
#
# Hintergrund / Nutzen
# --------------------
# - Erkennen, ob einzelne Sites auffällig viel Traffic bekommen (Site-Angriff).
# - Erkennen, ob einzelne IPs auffällig viel senden (Scanner/Bruteforce).
# - Erkennen von "Spikes" (BURST) pro Minute – typisch bei DDoS/Scans.
# - Erkennen von WP-typischen Angriffspfaden (wp-login, xmlrpc, wp-json, …).
#
# ---------------------------------------------------------------------------
# WICHTIG: Automatische Logdatei-Auswahl bei --from/--to
# ---------------------------------------------------------------------------
# Default ohne Parameter:
#   -> Es wird NUR /var/log/apache2/ip_requests.log ausgewertet
#
# Sobald du aber --from und/oder --to angibst UND du NICHT explizit --log
# nutzt:
#   -> Das Skript schaltet automatisch auf /var/log/apache2/ip_requests.log*
#      (also inkl. ip_requests.log.1, .2.gz, .3.gz, ...)
#
# Grund: Bei Zeitraum-Filtern ist das "aktuelle" Log oft nicht mehr
# ausreichend.
# ---------------------------------------------------------------------------
#
# Features
# --------
# Input:
#   - Standard: nur /var/log/apache2/ip_requests.log
#   - Rotationen: per --log auch *.gz und *.1, *.2.gz, ... (logrotate daily)
#   - Auto-Range: bei --from/--to automatisch ip_requests.log* (ohne --log)
#
# Filter:
#   - Zeitraum:  --from / --to (lokale Zeit, wie Apache loggt)
#   - Request:   --status (Regex), --method (Regex), --path-prefix (Prefix)
#   - WordPress: --wp-suspects [preset] (typische WP-Endpunkte)
#   - Site:      --site (exakt), --site-regex (Regex), --exclude-site (Regex)
#
# Auswertungen (TYPES):
#   SITE, IP, SITE_UNIQIP, PAIR, BURST_IP, UA_G, PATH_404, PATH_5XX
#
# Output:
#   - --out text (Default, via --text-types steuerbar)
#   - --out csv  (Default Separator ';' für LibreOffice/Excel-DE)
#   - --out tsv
#   - --out json (JSON Lines)
#
# CSV/TSV Eigenschaften:
#   - Blockweise pro TYPE: Headerzeile, Daten, Leerzeile zwischen Blöcken
#   - Pro TYPE nach COUNT absteigend sortiert
#   - --csv-top N begrenzt die Zeilen pro TYPE (Top-N pro Block)
#   - --csv-gap / --csv-no-gap: optionale Leerspalten zwischen Spalten
#
# ---------------------------------------------------------------------------
# TYPE-Matrix (Bedeutung der TMP/CSV Records)
# ---------------------------------------------------------------------------
# Intern (TMP, TAB-getrennt): TYPE<TAB>COUNT<TAB>KEY1<TAB>KEY2<TAB>KEY3
#
# TYPE        | Bedeutung                                   | key1 | key2   | key3
# ----------- | ------------------------------------------- | ---- | ------ | ----
# TOTAL       | Gesamtrequests nach allen Filtern           | -    | -      | -
+# SITE | Requests pro Site/VHost | site | - | - +# IP | Requests pro IP | ip | - | - +# SITE_UNIQIP | Anzahl unterschiedlicher IPs pro Site | site | - | - +# PAIR | Requests pro (Site,IP) Kombination | site | ip | - +# BURST_IP | Requests pro IP innerhalb einer Minute | ip | minute | - +# UA_G | User-Agent global (alle Sites zusammen) | ua | - | - +# PATH_404 | 404-Pfade je Site (site + path) | site | path | - +# PATH_5XX | 5xx-Pfade je Site (site + path) | site | path | - +# +# Minute-Format: +# - Intern: minute_key = YYYYmmddHHMM +# - In CSV/TSV/Text: YYYY-mm-dd HH:MM +# +# Burst erklärt (IP vs BURST_IP): +# - IP: Gesamtrequests pro IP (über gesamten Zeitraum) +# - BURST_IP: Aktivität pro IP in EINER Minute (Spikes -> Scanner/DDoS) +# +############################################################################### + +set -euo pipefail + +PROG="$(basename "$0")" + +# ----------------------------------------------------------------------------- +# Defaults / Konfiguration +# ----------------------------------------------------------------------------- +DEFAULT_LOG="/var/log/apache2/ip_requests.log" + +# LOG_SPEC ist der "User-Level Input" (Datei/Glob/Liste). Wird später expandiert. +LOG_SPEC="$DEFAULT_LOG" +USER_LOG_SPECIFIED=0 # (NEU) Merker: hat User explizit --log gesetzt? + +FROM_EPOCH="" +TO_EPOCH="" + +OUT_FORMAT="text" # text|csv|tsv|json + +# Text defaults +TOP_N=25 +STATUS_TOP_SITES=25 +BURST_TOP=25 +UA_TOP=25 +PATH_ERR_TOP=25 + +# Stdout selection: Welche Blöcke (TYPES) sollen in Text-Ausgabe erscheinen? +TEXT_TYPES="" # comma list +DEFAULT_TEXT_TYPES="SITE,IP,SITE_UNIQIP,BURST_IP,UA_G,PATH_404,PATH_5XX" + +# CSV/TSV controls +CSV_TYPES="" # empty => all +CSV_TOP=0 # 0 => unlimited +CSV_GAP=1 # default: ON +CSV_SEP=";" # default: ';' (LibreOffice) + +# Globale Request-Filter (wirken auf alle Auswertungen) +FILTER_STATUS="" # regex, e.g. 404 or '^5..$' +FILTER_METHOD="" # regex, e.g. POST or 'GET|POST' +FILTER_PATH_PREFIX="" # prefix, e.g. 
/wp-login.php +FILTER_PATH_REGEX="" # internal for WP preset all + +# Site/VHost Filter +FILTER_SITE_EXACT="" # exact match +FILTER_SITE_REGEX="" # regex include +FILTER_EXCLUDE_SITE_REGEX="" # regex exclude + +# WordPress Shortcuts +WP_SUSPECTS=0 +WP_PRESET="all" + +# Debug +DEBUG=0 + +# ----------------------------------------------------------------------------- +# Helper-Funktionen +# ----------------------------------------------------------------------------- +die() { echo "ERROR: $*" >&2; exit 2; } + +# Parse "YYYY-mm-dd [HH:MM:SS]" -> epoch seconds +to_epoch() { + local s="$1" + date -d "$s" +%s 2>/dev/null || die "Kann Datum nicht parsen: '$s'" +} + +# Datei lesen, optional gzip transparent +emit_file() { + local f="$1" + if [[ "$f" =~ \.gz$ ]]; then + zcat -- "$f" + else + cat -- "$f" + fi +} + +# Expand --log SPEC: +# - kann glob sein (z.B. /var/log/apache2/ip_requests.log*) +# - kann mehrere Pfade in einem String enthalten (in Quotes) +expand_logs() { + local spec="$1" + local -a parts out + local -A seen=() + + # shellcheck disable=SC2206 + parts=( $spec ) + for p in "${parts[@]}"; do + # glob expansion + # shellcheck disable=SC2206 + local -a g=( $p ) + + # wenn kein glob-match und Datei existiert nicht -> skip + if [[ ${#g[@]} -eq 1 && "${g[0]}" == "$p" && ! 
-e "$p" ]]; then + continue + fi + + for f in "${g[@]}"; do + [[ -e "$f" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + out+=("$f") + fi + done + done + + printf "%s\n" "${out[@]}" +} + +# JSON escape for JSON Lines output +json_escape() { + local s="$1" + s="${s//\\/\\\\}" + s="${s//\"/\\\"}" + s="${s//$'\n'/\\n}" + s="${s//$'\r'/\\r}" + s="${s//$'\t'/\\t}" + echo -n "$s" +} + +# Parse comma-separated list into associative array (trim spaces) +# Usage: parse_list_to_map "A,B,C" MAPNAME +parse_list_to_map() { + local list="$1" + local -n _map="$2" + local IFS=, + local item + for item in $list; do + item="${item#"${item%%[![:space:]]*}"}" + item="${item%"${item##*[![:space:]]}"}" + [[ -n "$item" ]] && _map["$item"]=1 + done +} + +usage() { + cat < Nur ${DEFAULT_LOG} + + Auto-Range (NEU): + Wenn --from oder --to angegeben wird UND du NICHT --log setzt, + nutzt das Skript automatisch: + ${DEFAULT_LOG}* + also inkl. Rotationen (.1, .2.gz, ...) + + -h, --help + -l, --log SPEC Datei/Glob/Liste (in Quotes), z.B.: + -l ${DEFAULT_LOG} + -l "${DEFAULT_LOG}*" + -l "${DEFAULT_LOG} ${DEFAULT_LOG}.1" + +Zeit: + --from "YYYY-mm-dd[ HH:MM:SS]" Start (inkl.) + --to "YYYY-mm-dd[ HH:MM:SS]" Ende (inkl.) + +Request-Filter (wirken auf ALLES): + --status X Statuscode oder Regex, z.B. 404 oder '^5..$' + --method M Methode oder Regex, z.B. POST oder 'GET|POST' + --path-prefix P Pfadprefix, z.B. 
/wp-login.php + +Site-Filter: + --site SITE Nur exakt diese Site (vhost) + --site-regex REGEX Nur Sites die REGEX matchen + --exclude-site REGEX Sites ausblenden, die REGEX matchen + +WordPress Shortcuts: + --wp-suspects [PRESET] PRESET = login|xmlrpc|admin|api|cron|all (Default: all) + Matcht typische WP-Endpunkte: + /wp-login.php, /xmlrpc.php, /wp-admin, /wp-json, /wp-cron.php + +Output: + --out text|csv|tsv|json Default: text + CSV default Delimiter: ';' (LibreOffice/Excel-DE) + +Text-Ausgabe: + --top N Top N (Default: ${TOP_N}) + --text-types LIST Welche TYPE-Blöcke in stdout (text) erscheinen sollen. + Beispiel: + --text-types "SITE,IP,BURST_IP,UA_G,PATH_404,PATH_5XX" + Default: + ${DEFAULT_TEXT_TYPES} + +CSV/TSV: + --csv-types LIST Nur bestimmte TYPE-Blöcke (Komma-Liste) + Beispiel: --csv-types "SITE,IP,BURST_IP" + --csv-top N Pro TYPE nur Top N Zeilen (0=unbegrenzt) + --csv-gap | --csv-no-gap Leerspalte zwischen Spalten (Default: gap ON) + +Debug: + --debug TYPE-Counts aus TMP ausgeben + +TYPE-Matrix (für CSV/TSV/JSON und --text-types): + TYPE | Bedeutung | key1 | key2 | key3 + ----------- | --------------------------------------- | ---- | ------ | ---- + TOTAL | Gesamtrequests nach Filtern | - | - | - + SITE | Requests pro Site/VHost | site | - | - + IP | Requests pro IP | ip | - | - + SITE_UNIQIP | Unique IPs pro Site | site | - | - + PAIR | Requests pro (site,ip) | site | ip | - + BURST_IP | Requests pro IP in einer Minute | ip | minute | - + UA_G | User-Agent global | ua | - | - + PATH_404 | 404 Pfade je Site (site+path) | site | path | - + PATH_5XX | 5xx Pfade je Site (site+path) | site | path | - + +Minute-Format: + minute wird als "YYYY-mm-dd HH:MM" ausgegeben. 
+ +Burst erklärt (IP vs BURST_IP): + - IP: Gesamtrequests pro IP im Zeitraum + - BURST_IP: Requests pro IP pro Minute (Spikes -> Scanner/Attacken) + +Beispiele (Grundlagen): + # 1) Standard (Text, nur aktuelle Datei): + $PROG + + # 2) Zeitraum: Auto-Range greift -> automatisch alle Rotationen: + $PROG --from "2026-02-21 00:00:00" --to "2026-02-21 23:59:59" + + # 3) Zeitraum aber bewusst NUR aktuelle Datei (override): + $PROG -l ${DEFAULT_LOG} --from "2026-02-21 00:00:00" --to "2026-02-21 23:59:59" + + # 4) Alle Rotationen explizit: + $PROG -l "${DEFAULT_LOG}*" + + # 5) Sites die "aktions" enthalten, aber staging ausblenden: + $PROG --site-regex 'aktions' --exclude-site 'staging' + + # 6) Nur 404s ansehen (z.B. für Scans): + $PROG --status 404 + + # 7) Nur POSTs ansehen (z.B. Login-Bruteforce): + $PROG --method POST + +Beispiele (Angriff / DDoS / WordPress): + # A) Verdacht auf DDoS / Traffic-Spikes: + # Zeigt stärkste Minuten (gesamt). Hohe Werte = genereller Spike. + $PROG --out csv --csv-types "BURST_TOTAL" --csv-top 50 > ddos-burst-total.csv + + # B) DDoS auf einzelne Site (Spikes pro Site/Minute): + $PROG --out csv --csv-types "BURST_SITE" --csv-top 100 > ddos-burst-site.csv + + # C) DDoS/Spikes pro IP (Top 200): + $PROG --out csv --csv-types "BURST_IP" --csv-top 200 > burst-ip.csv + + # D) "Wer greift welche Site an?" 
-> Top IPs pro Site (PAIR): + $PROG --out csv --csv-types "PAIR" --csv-top 300 > ips-pro-site.csv + + # E) WordPress Suspects (Pfad/404/5xx + Burst + UA + IP): + $PROG --wp-suspects all --out csv --csv-types "PATH_404,PATH_5XX,BURST_IP,UA_G,IP" --csv-top 200 > wp-suspects.csv + + # F) Bruteforce: POST auf wp-login.php (eine Site): + $PROG --site www.example.tld --method POST --path-prefix /wp-login.php --out csv --csv-types "IP,BURST_IP,UA_G" --csv-top 300 > wp-login-post.csv + + # G) xmlrpc (häufig für Bruteforce/Amplification): + $PROG --path-prefix /xmlrpc.php --out csv --csv-types "SITE,IP,BURST_IP,UA_G,PATH_G" --csv-top 200 > xmlrpc.csv + + # H) Viele 404-Scans: + $PROG --status 404 --out csv --csv-types "IP,BURST_IP,PATH_404,UA_G" --csv-top 300 > scan-404.csv + + # I) Viele 5xx (Server unter Stress/Fehler): + $PROG --status '^5..$' --out csv --csv-types "SITE,PATH_5XX,IP,UA_G" --csv-top 200 > server-5xx.csv + + # J) stdout nur Attack-Indikatoren: + $PROG --text-types "BURST_IP,UA_G,PATH_404,PATH_5XX" + +EOF +} + +# ----------------------------------------------------------------------------- +# Argumente parsen +# ----------------------------------------------------------------------------- +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) usage; exit 0 ;; + + -l|--log) + [[ $# -ge 2 ]] || die "Fehlender Wert nach $1" + LOG_SPEC="$2" + USER_LOG_SPECIFIED=1 + shift 2 + ;; + + --from) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; FROM_EPOCH="$(to_epoch "$2")"; shift 2 ;; + --to) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; TO_EPOCH="$(to_epoch "$2")"; shift 2 ;; + + --status) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; FILTER_STATUS="$2"; shift 2 ;; + --method) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; FILTER_METHOD="$2"; shift 2 ;; + --path-prefix) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; FILTER_PATH_PREFIX="$2"; shift 2 ;; + + --site) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; FILTER_SITE_EXACT="$2"; shift 2 ;; + --site-regex) 
[[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; FILTER_SITE_REGEX="$2"; shift 2 ;; + --exclude-site) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; FILTER_EXCLUDE_SITE_REGEX="$2"; shift 2 ;; + + --wp-suspects) + WP_SUSPECTS=1 + if [[ $# -ge 2 && ! "$2" =~ ^- ]]; then WP_PRESET="$2"; shift 2; else WP_PRESET="all"; shift; fi + ;; + + --out) + [[ $# -ge 2 ]] || die "Fehlender Wert nach $1" + case "$2" in text|csv|tsv|json) OUT_FORMAT="$2" ;; *) die "Ungültig: $2" ;; esac + shift 2 + ;; + + --top) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; [[ "$2" =~ ^[0-9]+$ ]] || die "--top erwartet Zahl"; TOP_N="$2"; shift 2 ;; + + --text-types) + [[ $# -ge 2 ]] || die "Fehlender Wert nach $1" + TEXT_TYPES="$2" + shift 2 + ;; + + --csv-types) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; CSV_TYPES="$2"; shift 2 ;; + --csv-top) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; [[ "$2" =~ ^[0-9]+$ ]] || die "--csv-top erwartet Zahl"; CSV_TOP="$2"; shift 2 ;; + --csv-gap) CSV_GAP=1; shift ;; + --csv-no-gap) CSV_GAP=0; shift ;; + + --debug) DEBUG=1; shift ;; + + *) die "Unbekannte Option: $1 (nutze --help)" ;; + esac +done + +# ----------------------------------------------------------------------------- +# Auto-Range Log Auswahl (NEU) +# - Wenn Zeitraum angegeben wurde (from/to) und der User NICHT explizit --log +# gesetzt hat, wechseln wir automatisch auf ip_requests.log* +# ----------------------------------------------------------------------------- +if [[ "$USER_LOG_SPECIFIED" -eq 0 ]] && [[ -n "${FROM_EPOCH}${TO_EPOCH}" ]]; then + LOG_SPEC="${DEFAULT_LOG}*" +fi + +# ----------------------------------------------------------------------------- +# WordPress preset: setzt intern einen Regex-Filter auf typische WP-Endpunkte. +# Regex so formuliert, dass gawk keine Warnungen zu Escape-Sequenzen ausgibt. 
+# ----------------------------------------------------------------------------- +if [[ "$WP_SUSPECTS" -eq 1 ]]; then + case "${WP_PRESET:-all}" in + login) FILTER_PATH_PREFIX="${FILTER_PATH_PREFIX:-/wp-login.php}" ;; + xmlrpc) FILTER_PATH_PREFIX="${FILTER_PATH_PREFIX:-/xmlrpc.php}" ;; + admin) FILTER_PATH_PREFIX="${FILTER_PATH_PREFIX:-/wp-admin}" ;; + api) FILTER_PATH_PREFIX="${FILTER_PATH_PREFIX:-/wp-json}" ;; + cron) FILTER_PATH_PREFIX="${FILTER_PATH_PREFIX:-/wp-cron.php}" ;; + all|"") FILTER_PATH_REGEX="^/(wp-login[.]php|xmlrpc[.]php|wp-admin|wp-json|wp-cron[.]php)([?]|/|$)" ;; + *) die "Unbekanntes WP preset: ${WP_PRESET}. Erlaubt: login|xmlrpc|admin|api|cron|all" ;; + esac +fi + +# ----------------------------------------------------------------------------- +# Logdateien expandieren (Glob/Liste) und prüfen +# ----------------------------------------------------------------------------- +mapfile -t REAL_FILES < <(expand_logs "$LOG_SPEC") +[[ ${#REAL_FILES[@]} -gt 0 ]] || die "Keine existierenden Logdateien aus --log: '$LOG_SPEC'" + +# ----------------------------------------------------------------------------- +# AWK Core: Parse + Aggregation +# ----------------------------------------------------------------------------- +AWK_PROG=' +function mon2num(m) { + if (m=="Jan") return 1; if (m=="Feb") return 2; if (m=="Mar") return 3; if (m=="Apr") return 4; + if (m=="May") return 5; if (m=="Jun") return 6; if (m=="Jul") return 7; if (m=="Aug") return 8; + if (m=="Sep") return 9; if (m=="Oct") return 10; if (m=="Nov") return 11; if (m=="Dec") return 12; + return 0 +} +function parse_time(ts, a, d, mon, y, hh, mm, ss, mnum, epoch) { + split(ts, a, " "); ts = a[1] + split(ts, a, ":"); hh=a[2]; mm=a[3]; ss=a[4] + split(a[1], a, "/"); d=a[1]; mon=a[2]; y=a[3] + mnum=mon2num(mon); if (mnum==0) return -1 + epoch=mktime(sprintf("%d %d %d %d %d %d", y, mnum, d, hh, mm, ss)) + minute_key = sprintf("%04d%02d%02d%02d%02d", y, mnum, d, hh, mm) + return epoch +} + +BEGIN { + 
from = (FROM_EPOCH=="" ? -1 : FROM_EPOCH+0) + to = (TO_EPOCH=="" ? -1 : TO_EPOCH+0) + + f_status = FILTER_STATUS + f_method = FILTER_METHOD + f_prefix = FILTER_PATH_PREFIX + f_regex = FILTER_PATH_REGEX + + site_exact = FILTER_SITE_EXACT + site_re = FILTER_SITE_REGEX + ex_site_re = FILTER_EXCLUDE_SITE_REGEX +} + +{ + ip = $1 + site = $2 + + if (site_exact != "" && site != site_exact) next + if (site_re != "" && site !~ site_re) next + if (ex_site_re != "" && site ~ ex_site_re) next + + ts = $4 " " $5 + gsub(/^\[/, "", ts); gsub(/\]$/, "", ts) + + epoch = parse_time(ts) + if (epoch < 0) next + if (from != -1 && epoch < from) next + if (to != -1 && epoch > to) next + + method = $6 + path = $7 + proto = $8 + status = $9 + if (proto !~ /^HTTP\//) next + + ua="" + q1=index($0, "\"") + if (q1>0) { + rest=substr($0, q1+1) + q2=index(rest, "\"") + if (q2>0) ua=substr(rest, 1, q2-1) + } + + if (f_status != "" && status !~ f_status) next + if (f_method != "" && method !~ f_method) next + if (f_prefix != "" && index(path, f_prefix) != 1) next + if (f_regex != "" && path !~ f_regex) next + + total++ + sites[site]++ + ips[ip]++ + + k = site SUBSEP ip + pair[k]++ + if (!(k in seen_pair)) { seen_pair[k]=1; uniq_ip_count[site]++ } + + burst_ip[ip SUBSEP minute_key]++ + + if (ua != "") ua_global[ua]++ + + if (status=="404") path404_site[site SUBSEP path]++ + if (status ~ /^5../) path5xx_site[site SUBSEP path]++ +} + +END { + for (s in sites) print "SITE\t" sites[s] "\t" s + for (i in ips) print "IP\t" ips[i] "\t" i + + for (k in pair) { split(k,a,SUBSEP); print "PAIR\t" pair[k] "\t" a[1] "\t" a[2] } + for (s in uniq_ip_count) print "SITE_UNIQIP\t" uniq_ip_count[s] "\t" s + + for (k in burst_ip) { split(k,a,SUBSEP); print "BURST_IP\t" burst_ip[k] "\t" a[1] "\t" a[2] } + + for (ua in ua_global) print "UA_G\t" ua_global[ua] "\t" ua + + for (k in path404_site) { split(k,a,SUBSEP); print "PATH_404\t" path404_site[k] "\t" a[1] "\t" a[2] } + for (k in path5xx_site) { split(k,a,SUBSEP); 
print "PATH_5XX\t" path5xx_site[k] "\t" a[1] "\t" a[2] } + + print "TOTAL\t" total "\t-" +} +' + +TMP="$(mktemp)" +trap 'rm -f "$TMP"' EXIT + +{ + for f in "${REAL_FILES[@]}"; do emit_file "$f"; done +} | gawk \ + -v FROM_EPOCH="${FROM_EPOCH}" \ + -v TO_EPOCH="${TO_EPOCH}" \ + -v FILTER_STATUS="${FILTER_STATUS}" \ + -v FILTER_METHOD="${FILTER_METHOD}" \ + -v FILTER_PATH_PREFIX="${FILTER_PATH_PREFIX}" \ + -v FILTER_PATH_REGEX="${FILTER_PATH_REGEX}" \ + -v FILTER_SITE_EXACT="${FILTER_SITE_EXACT}" \ + -v FILTER_SITE_REGEX="${FILTER_SITE_REGEX}" \ + -v FILTER_EXCLUDE_SITE_REGEX="${FILTER_EXCLUDE_SITE_REGEX}" \ + "$AWK_PROG" > "$TMP" + +TOTAL="$(gawk -F'\t' '$1=="TOTAL"{print $2}' "$TMP")" + +# ----------------------------------------------------------------------------- +# CSV/TSV Export +# ----------------------------------------------------------------------------- +emit_records_delim() { + local delim="$1" + LC_ALL=C sort -t $'\t' -k1,1 -k2,2nr "$TMP" \ + | gawk -F'\t' -v TYPES="$CSV_TYPES" -v TOPN="$CSV_TOP" -v GAP="$CSV_GAP" -v SEP="$delim" ' + function q(s){ gsub(/\t/," ",s); gsub(/\r/," ",s); gsub(/\n/," ",s); gsub(/"/,"\"\"",s); return "\"" s "\"" } + function fmt_min(k){ + if (k ~ /^[0-9]{12}$/) return substr(k,1,4) "-" substr(k,5,2) "-" substr(k,7,2) " " substr(k,9,2) ":" substr(k,11,2) + return k + } + function init_types(n,i,a){ + if (TYPES==""){ use_all=1; return } + use_all=0 + n=split(TYPES,a,",") + for(i=1;i<=n;i++){ gsub(/^[ \t]+|[ \t]+$/,"",a[i]); if(a[i]!="") want[a[i]]=1 } + } + function allowed(t){ return (use_all || (t in want)) } + + function emit3(a,b,c){ if(GAP==1) print a SEP "" SEP b SEP "" SEP c; else print a SEP b SEP c } + function emit4(a,b,c,d){ if(GAP==1) print a SEP "" SEP b SEP "" SEP c SEP "" SEP d; else print a SEP b SEP c SEP d } + + function header(t){ + if(did_any) print "" + if (t=="SITE") emit3("rank","count","site") + else if (t=="IP") emit3("rank","count","ip") + else if (t=="PAIR") emit4("rank","count","site","ip") + else if 
(t=="SITE_UNIQIP") emit3("rank","unique_ips","site") + else if (t=="BURST_IP") emit4("rank","count","ip","minute") + else if (t=="UA_G") emit3("rank","count","user_agent") + else if (t=="PATH_404") emit4("rank","count","site","path_404") + else if (t=="PATH_5XX") emit4("rank","count","site","path_5xx") + else if (t=="TOTAL") print "total_requests" + else emit4("rank","count","key1","key2") + did_any=1 + } + + BEGIN{ prev=""; rank=0; did_any=0; topn=(TOPN+0); init_types() } + + { + t=$1; c=$2; k1=$3; k2=$4 + if(!allowed(t)) next + if(t!=prev){ prev=t; rank=0; header(t) } + if(t=="TOTAL"){ print c; next } + + rank++ + if(topn>0 && rank>topn) next + + if (t=="BURST_IP") k2=fmt_min(k2) + + if (t=="SITE") emit3(rank,c,q(k1)) + else if (t=="IP") emit3(rank,c,q(k1)) + else if (t=="PAIR") emit4(rank,c,q(k1),q(k2)) + else if (t=="SITE_UNIQIP") emit3(rank,c,q(k1)) + else if (t=="BURST_IP") emit4(rank,c,q(k1),q(k2)) + else if (t=="UA_G") emit3(rank,c,q(k1)) + else if (t=="PATH_404") emit4(rank,c,q(k1),q(k2)) + else if (t=="PATH_5XX") emit4(rank,c,q(k1),q(k2)) + else emit4(rank,c,q(k1),q(k2)) + } + ' +} + +emit_records_json() { + LC_ALL=C sort -t $'\t' -k1,1 -k2,2nr "$TMP" \ + | while IFS=$'\t' read -r t c k1 k2 k3; do + if [[ -n "$CSV_TYPES" ]]; then + case ",$CSV_TYPES," in *",$t,"*) ;; *) continue ;; esac + fi + printf '{"type":"%s","count":%s,"key1":"%s","key2":"%s","key3":"%s"}\n' \ + "$(json_escape "${t:-}")" "${c:-0}" \ + "$(json_escape "${k1:-}")" "$(json_escape "${k2:-}")" "$(json_escape "${k3:-}")" + done +} + +case "$OUT_FORMAT" in + csv) emit_records_delim "$CSV_SEP"; exit 0 ;; + tsv) emit_records_delim $'\t'; exit 0 ;; + json) emit_records_json; exit 0 ;; + text) : ;; + *) die "Unbekanntes --out: $OUT_FORMAT" ;; +esac + +# ----------------------------------------------------------------------------- +# Text-Ausgabe: Auswahl via --text-types +# ----------------------------------------------------------------------------- +declare -A WANT_TEXT=() +if [[ -n 
"$TEXT_TYPES" ]]; then + parse_list_to_map "$TEXT_TYPES" WANT_TEXT +else + parse_list_to_map "$DEFAULT_TEXT_TYPES" WANT_TEXT +fi + +text_wants() { + local t="$1" + [[ -n "${WANT_TEXT[$t]+x}" ]] +} + +# ----------------------------------------------------------------------------- +# Text-Ausgabe Header (Kontext) +# ----------------------------------------------------------------------------- +echo "" +echo "== Apache ip_requests.log Analyse ==" +echo "Dateien:" +for f in "${REAL_FILES[@]}"; do echo " - $f"; done + +if [[ -n "${FROM_EPOCH}" || -n "${TO_EPOCH}" ]]; then + [[ -n "${FROM_EPOCH}" ]] && echo "Von: $(date -d "@${FROM_EPOCH}" "+%F %T %z")" + [[ -n "${TO_EPOCH}" ]] && echo "Bis: $(date -d "@${TO_EPOCH}" "+%F %T %z")" +fi + +if [[ -n "${FILTER_STATUS}${FILTER_METHOD}${FILTER_PATH_PREFIX}${FILTER_PATH_REGEX}${FILTER_SITE_EXACT}${FILTER_SITE_REGEX}${FILTER_EXCLUDE_SITE_REGEX}" ]]; then + echo "Filter:" + [[ -n "${FILTER_STATUS}" ]] && echo " Status: ${FILTER_STATUS}" + [[ -n "${FILTER_METHOD}" ]] && echo " Methode: ${FILTER_METHOD}" + [[ -n "${FILTER_PATH_PREFIX}" ]] && echo " Path-Prefix: ${FILTER_PATH_PREFIX}" + [[ -n "${FILTER_PATH_REGEX}" ]] && echo " Path-Regex: ${FILTER_PATH_REGEX}" + [[ -n "${FILTER_SITE_EXACT}" ]] && echo " Site exact: ${FILTER_SITE_EXACT}" + [[ -n "${FILTER_SITE_REGEX}" ]] && echo " Site regex: ${FILTER_SITE_REGEX}" + [[ -n "${FILTER_EXCLUDE_SITE_REGEX}" ]] && echo " Exclude site: ${FILTER_EXCLUDE_SITE_REGEX}" +fi + +echo "Gesamt (nach Filter): ${TOTAL}" +echo + +if [[ "$DEBUG" -eq 1 ]]; then + echo "== DEBUG: Record-Counts pro TYPE (TMP) ==" + gawk -F'\t' '{c[$1]++} END{for(t in c) printf "%-12s %d\n", t, c[t]}' "$TMP" | sort || true + echo +fi + +# ----------------------------------------------------------------------------- +# Text-Blöcke (je Type) +# ----------------------------------------------------------------------------- +if text_wants "SITE"; then + echo "== Top ${TOP_N} Sites (Requests) ==" + gawk -F'\t' '$1=="SITE"{print 
$2 "\t" $3}' "$TMP" | sort -nr -k1,1 | head -n "$TOP_N" \ + | gawk -F'\t' 'BEGIN{printf "%-10s %s\n","COUNT","SITE"} {printf "%-10s %s\n",$1,$2}' || true + echo +fi + +if text_wants "IP"; then + echo "== Top ${TOP_N} IPs (Requests gesamt) ==" + gawk -F'\t' '$1=="IP"{print $2 "\t" $3}' "$TMP" | sort -nr -k1,1 | head -n "$TOP_N" \ + | gawk -F'\t' 'BEGIN{printf "%-10s %s\n","COUNT","IP"} {printf "%-10s %s\n",$1,$2}' || true + echo +fi + +if text_wants "SITE_UNIQIP"; then + echo "== Unique IPs pro Site (Top ${TOP_N}) ==" + gawk -F'\t' '$1=="SITE_UNIQIP"{print $2 "\t" $3}' "$TMP" | sort -nr -k1,1 | head -n "$TOP_N" \ + | gawk -F'\t' 'BEGIN{printf "%-10s %s\n","UNIQ_IPS","SITE"} {printf "%-10s %s\n",$1,$2}' || true + echo +fi + +if text_wants "BURST_IP"; then + echo "== Top ${BURST_TOP} BURST_IP (Requests pro IP pro Minute) ==" + echo "Hinweis: IP=gesamt; BURST_IP=Spikes pro Minute (oft Scanner/Attacke)." + gawk -F'\t' ' + $1=="BURST_IP"{ + c=$2; ip=$3; m=$4; + if (m ~ /^[0-9]{12}$/) mm=substr(m,1,4)"-"substr(m,5,2)"-"substr(m,7,2)" "substr(m,9,2)":"substr(m,11,2); else mm=m; + print c "\t" ip "\t" mm + }' "$TMP" | sort -nr -k1,1 | head -n "$BURST_TOP" \ + | gawk -F'\t' 'BEGIN{printf "%-10s %-40s %s\n","COUNT","IP","MINUTE"} {printf "%-10s %-40s %s\n",$1,$2,$3}' || true + echo +fi + +if text_wants "UA_G"; then + echo "== Top ${UA_TOP} User-Agents (global) ==" + gawk -F'\t' '$1=="UA_G"{print $2 "\t" $3}' "$TMP" | sort -nr -k1,1 | head -n "$UA_TOP" \ + | gawk -F'\t' 'BEGIN{printf "%-10s %s\n","COUNT","USER-AGENT"} {printf "%-10s %s\n",$1,$2}' || true + echo +fi + +if text_wants "PATH_404"; then + echo "== Top ${PATH_ERR_TOP} PATH_404 (site + path) ==" + gawk -F'\t' '$1=="PATH_404"{print $2 "\t" $3 "\t" $4}' "$TMP" | sort -nr -k1,1 | head -n "$PATH_ERR_TOP" \ + | gawk -F'\t' 'BEGIN{printf "%-10s %-30s %s\n","COUNT","SITE","PATH_404"} {printf "%-10s %-30s %s\n",$1,$2,$3}' || true + echo +fi + +if text_wants "PATH_5XX"; then + echo "== Top ${PATH_ERR_TOP} PATH_5XX (site + 
path) ==" + gawk -F'\t' '$1=="PATH_5XX"{print $2 "\t" $3 "\t" $4}' "$TMP" | sort -nr -k1,1 | head -n "$PATH_ERR_TOP" \ + | gawk -F'\t' 'BEGIN{printf "%-10s %-30s %s\n","COUNT","SITE","PATH_5XX"} {printf "%-10s %-30s %s\n",$1,$2,$3}' || true + echo +fi + +echo "Tipp (LibreOffice):" +echo " $PROG --out csv --csv-top 100 > report.csv" +echo " (CSV Delimiter ';' default; --csv-no-gap ohne Leer-Spalten)" +echo "" +exit 0