#!/usr/bin/env bash
###############################################################################
# apache-ip-requests-analyze.sh
#
# Zweck
# -----
# Dieses Skript analysiert eine zentrale Apache-Sammellogdatei, in die alle
# VirtualHosts zusätzlich protokollieren, z.B.:
#
# CustomLog /var/log/apache2/ip_requests.log base_requests
# LogFormat "%a %v %p %t %r %>s \"%{User-Agent}i\" %T" base_requests
#
# Beispielzeile:
# 62.138.6.15 www.example.tld 443 [21/Feb/2026:00:00:59 +0100] \
# GET /foo HTTP/1.1 404 "Mozilla/5.0 ..." 0
#
# Hintergrund / Nutzen
# --------------------
# - Erkennen, ob einzelne Sites auffällig viel Traffic bekommen (Site-Angriff).
# - Erkennen, ob einzelne IPs auffällig viel senden (Scanner/Bruteforce).
# - Erkennen von "Spikes" (BURST) pro Minute typisch bei DDoS/Scans.
# - Erkennen von WP-typischen Angriffspfaden (wp-login, xmlrpc, wp-json, …).
#
# ---------------------------------------------------------------------------
# WICHTIG: Automatische Logdatei-Auswahl bei --from/--to (NEU)
# ---------------------------------------------------------------------------
# Default ohne Parameter:
# -> Es wird NUR /var/log/apache2/ip_requests.log ausgewertet
#
# Sobald du aber --from und/oder --to angibst UND du NICHT explizit --log nutzt:
# -> Das Skript schaltet automatisch auf /var/log/apache2/ip_requests.log*
# (also inkl. ip_requests.log.1, .2.gz, .3.gz, ...)
#
# Grund: Bei Zeitraum-Filtern ist das "aktuelle" Log oft nicht mehr ausreichend.
# ---------------------------------------------------------------------------
#
# ---------------------------------------------------------------------------
# Features
# ---------------------------------------------------------------------------
# Input:
# - Standard: nur /var/log/apache2/ip_requests.log
# - Rotationen: per --log auch *.gz und *.1, *.2.gz, ... (z.B. logrotate daily)
# - Auto-Range: bei --from/--to automatisch ip_requests.log* (wenn --log fehlt)
#
# Filter:
# - Zeitraum: --from / --to (lokale Zeit, wie Apache loggt)
# - Request: --status (Regex), --method (Regex), --path-prefix (Prefix)
# - WordPress: --wp-suspects [preset] (typische WP-Endpunkte)
# - Site/VHost: --site (exakt), --site-regex (Regex), --exclude-site (Regex)
#
# Auswertungen (TYPES):
# - Requests pro Site (SITE)
# - Requests pro IP (IP)
# - Unique IPs pro Site (SITE_UNIQIP)
# - Requests pro Site+IP (PAIR) -> "IPs pro Site"
# - Burst pro IP/Minute (BURST_IP) -> "Spikes pro IP"
# - Top User-Agents (UA_G)
# - 404 Pfade pro Site (PATH_404)
# - 5xx Pfade pro Site (PATH_5XX)
#
# Output:
# - --out text (Default): stdout (mit --text-types steuerbar)
# - --out csv (Default Separator ';' für LibreOffice/Excel-DE)
# - --out tsv
# - --out json (JSON Lines)
#
# CSV/TSV Eigenschaften:
# - Blockweise pro TYPE: Headerzeile, dann Daten, Leerzeile zwischen Blöcken
# - Pro TYPE nach COUNT absteigend sortiert
# - --csv-top N begrenzt die Zeilen pro TYPE (Top-N pro Block)
# - --csv-gap / --csv-no-gap: optionale Leerspalten zwischen Spalten
#
# ---------------------------------------------------------------------------
# TYPE-Matrix (Bedeutung der TMP/CSV Records)
# ---------------------------------------------------------------------------
# Intern (TMP):
# TYPE<TAB>COUNT<TAB>KEY1<TAB>KEY2<TAB>KEY3
#
# TYPE | Bedeutung | key1 | key2 | key3
# ----------- | ------------------------------------------- | ---- | ------ | ----
# TOTAL | Gesamtrequests nach allen Filtern | - | - | -
# SITE | Requests pro Site/VHost | site | - | -
# IP | Requests pro IP | ip | - | -
# SITE_UNIQIP | Anzahl unterschiedlicher IPs pro Site | site | - | -
# PAIR | Requests pro (Site,IP) Kombination | site | ip | -
# BURST_IP | Requests pro IP innerhalb einer Minute | ip | minute | -
# UA_G | User-Agent global (alle Sites zusammen) | ua | - | -
# PATH_404 | 404-Pfade je Site (site + path) | site | path | -
# PATH_5XX | 5xx-Pfade je Site (site + path) | site | path | -
#
# Minute-Format:
# - Intern: minute_key = YYYYmmddHHMM
# - In CSV/TSV/Text: YYYY-mm-dd HH:MM
#
# Burst erklärt (IP vs BURST_IP):
# - IP: Gesamtrequests pro IP (über gesamten Zeitraum)
# - BURST_IP: Aktivität pro IP in EINER Minute (Spikes -> Scanner/DDoS)
#
###############################################################################
set -euo pipefail
PROG="$(basename "$0")"
# -----------------------------------------------------------------------------
# Defaults / configuration
# -----------------------------------------------------------------------------
DEFAULT_LOG="/var/log/apache2/ip_requests.log"
# LOG_SPEC is the user-level input (file / glob / space-separated list).
# It is expanded into real files later by expand_logs().
LOG_SPEC="$DEFAULT_LOG"
USER_LOG_SPECIFIED=0 # flag: did the user pass --log explicitly? (disables auto-range)
FROM_EPOCH="" # epoch seconds; empty = no lower time bound
TO_EPOCH="" # epoch seconds; empty = no upper time bound
OUT_FORMAT="text" # text|csv|tsv|json
# Text-output defaults (Top-N rows per block)
TOP_N=25
STATUS_TOP_SITES=25 # NOTE(review): currently unused in this file — confirm before removing
BURST_TOP=25
UA_TOP=25
PATH_ERR_TOP=25
# Which TYPE blocks appear in the text output (comma list; empty = defaults below)
TEXT_TYPES="" # comma list
DEFAULT_TEXT_TYPES="SITE,IP,SITE_UNIQIP,BURST_IP,UA_G,PATH_404,PATH_5XX"
# CSV/TSV controls
CSV_TYPES="" # empty => all TYPE blocks
CSV_TOP=0 # 0 => unlimited rows per TYPE block
CSV_GAP=1 # 1 = spacer columns between fields (default: ON)
CSV_SEP=";" # default: ';' (LibreOffice/Excel-DE)
# Global request filters (applied to every aggregation)
FILTER_STATUS="" # regex, e.g. 404 or '^5..$'
FILTER_METHOD="" # regex, e.g. POST or 'GET|POST'
FILTER_PATH_PREFIX="" # literal prefix, e.g. /wp-login.php
FILTER_PATH_REGEX="" # internal: set by --wp-suspects all
# Site/VHost filters
FILTER_SITE_EXACT="" # exact match
FILTER_SITE_REGEX="" # regex include
FILTER_EXCLUDE_SITE_REGEX="" # regex exclude
# WordPress shortcuts
WP_SUSPECTS=0
WP_PRESET="all"
# Debug
DEBUG=0
# -----------------------------------------------------------------------------
# Helper functions
# -----------------------------------------------------------------------------
# Abort with a message on stderr; exit code 2 = usage/configuration error.
die() { printf 'ERROR: %s\n' "$*" >&2; exit 2; }
# Parse "YYYY-mm-dd [HH:MM:SS]" (anything GNU `date -d` accepts) -> epoch seconds.
# Dies with a clear message when the string is unparsable.
to_epoch() {
  local when="$1" secs
  if ! secs="$(date -d "$when" +%s 2>/dev/null)"; then
    die "Kann Datum nicht parsen: '$when'"
  fi
  printf '%s\n' "$secs"
}
# Stream one log file to stdout; *.gz files are decompressed transparently.
emit_file() {
  local path="$1"
  case "$path" in
    *.gz) zcat -- "$path" ;;
    *)    cat  -- "$path" ;;
  esac
}
# Expand a --log SPEC into a de-duplicated list of existing files,
# one path per line.
# SPEC may be a single path, a glob (e.g. /var/log/apache2/ip_requests.log*),
# or several whitespace-separated paths/globs inside one quoted string.
# Non-matching globs and missing files are skipped silently.
# Prints NOTHING (not even a blank line) when no file matches; returns 0.
expand_logs() {
  local spec="$1"
  local -a parts
  local -a out=()            # init: empty expansion must not trip `set -u`
  local -A seen=()
  # Deliberate word-splitting: SPEC may hold several paths in one string.
  # shellcheck disable=SC2206
  parts=( $spec )
  for p in "${parts[@]}"; do
    # Deliberate glob expansion of each part.
    # shellcheck disable=SC2206
    local -a g=( $p )
    # No glob match (pattern came back literally) and no such file -> skip.
    if [[ ${#g[@]} -eq 1 && "${g[0]}" == "$p" && ! -e "$p" ]]; then
      continue
    fi
    for f in "${g[@]}"; do
      [[ -e "$f" ]] || continue
      if [[ -z "${seen[$f]+x}" ]]; then
        seen["$f"]=1
        out+=("$f")
      fi
    done
  done
  # Guard: with an empty array, printf would still emit one blank line and
  # the caller's mapfile would count it as a (bogus) file entry, defeating
  # the "no existing log files" check.
  (( ${#out[@]} > 0 )) && printf "%s\n" "${out[@]}"
  return 0
}
# Escape a string for embedding in a JSON double-quoted value.
# Handles backslash, double quote, newline, CR and tab — the characters that
# realistically occur in log fields. Other control characters pass through
# unescaped (log lines normally contain none — TODO confirm if that matters).
# Output carries no trailing newline.
json_escape() {
  local s="$1"
  s="${s//\\/\\\\}"
  s="${s//\"/\\\"}"
  s="${s//$'\n'/\\n}"
  s="${s//$'\r'/\\r}"
  s="${s//$'\t'/\\t}"
  # printf instead of `echo -n`: echo would misparse values such as "-n" or
  # "-e" as its own options and corrupt the output.
  printf '%s' "$s"
}
# Parse a comma-separated list into an associative array (items are trimmed,
# blanks dropped). Usage: parse_list_to_map "A, B ,C" MAPNAME
# Always returns 0. The previous version ended in `[[ -n ... ]] && ...`,
# which leaked exit status 1 when the final item was blank (e.g. "A, ") and
# silently killed the whole script under `set -e`.
parse_list_to_map() {
  local list="$1"
  local -n _map="$2"
  local IFS=,
  local item
  for item in $list; do
    item="${item#"${item%%[![:space:]]*}"}"   # ltrim
    item="${item%"${item##*[![:space:]]}"}"   # rtrim
    if [[ -n "$item" ]]; then
      _map["$item"]=1
    fi
  done
  return 0
}
# Print CLI help to stdout.
# NOTE: every TYPE name used in the examples must exist in the awk core
# (SITE, IP, SITE_UNIQIP, PAIR, BURST_IP, UA_G, PATH_404, PATH_5XX, TOTAL).
# Earlier revisions advertised nonexistent types (BURST_TOTAL, BURST_SITE,
# PATH_G); those examples now use the implemented types.
usage() {
cat <<EOF
Usage:
$PROG [OPTIONS]
Input / Logfiles:
Default (ohne Parameter):
-> Nur ${DEFAULT_LOG}
Auto-Range (NEU):
Wenn --from oder --to angegeben wird UND du NICHT --log setzt,
nutzt das Skript automatisch:
${DEFAULT_LOG}*
also inkl. Rotationen (.1, .2.gz, ...)
-h, --help
-l, --log SPEC Datei/Glob/Liste (in Quotes), z.B.:
-l ${DEFAULT_LOG}
-l "${DEFAULT_LOG}*"
-l "${DEFAULT_LOG} ${DEFAULT_LOG}.1"
Zeit:
--from "YYYY-mm-dd[ HH:MM:SS]" Start (inkl.)
--to "YYYY-mm-dd[ HH:MM:SS]" Ende (inkl.)
Request-Filter (wirken auf ALLES):
--status X Statuscode oder Regex, z.B. 404 oder '^5..$'
--method M Methode oder Regex, z.B. POST oder 'GET|POST'
--path-prefix P Pfadprefix, z.B. /wp-login.php
Site-Filter:
--site SITE Nur exakt diese Site (vhost)
--site-regex REGEX Nur Sites die REGEX matchen
--exclude-site REGEX Sites ausblenden, die REGEX matchen
WordPress Shortcuts:
--wp-suspects [PRESET] PRESET = login|xmlrpc|admin|api|cron|all (Default: all)
Matcht typische WP-Endpunkte:
/wp-login.php, /xmlrpc.php, /wp-admin, /wp-json, /wp-cron.php
Output:
--out text|csv|tsv|json Default: text
CSV default Delimiter: ';' (LibreOffice/Excel-DE)
Text-Ausgabe:
--top N Top N (Default: ${TOP_N})
--text-types LIST Welche TYPE-Blöcke in stdout (text) erscheinen sollen.
Beispiel:
--text-types "SITE,IP,BURST_IP,UA_G,PATH_404,PATH_5XX"
Default:
${DEFAULT_TEXT_TYPES}
CSV/TSV:
--csv-types LIST Nur bestimmte TYPE-Blöcke (Komma-Liste)
Beispiel: --csv-types "SITE,IP,BURST_IP"
--csv-top N Pro TYPE nur Top N Zeilen (0=unbegrenzt)
--csv-gap | --csv-no-gap Leerspalte zwischen Spalten (Default: gap ON)
Debug:
--debug TYPE-Counts aus TMP ausgeben
TYPE-Matrix (für CSV/TSV/JSON und --text-types):
TYPE | Bedeutung | key1 | key2 | key3
----------- | --------------------------------------- | ---- | ------ | ----
TOTAL | Gesamtrequests nach Filtern | - | - | -
SITE | Requests pro Site/VHost | site | - | -
IP | Requests pro IP | ip | - | -
SITE_UNIQIP | Unique IPs pro Site | site | - | -
PAIR | Requests pro (site,ip) | site | ip | -
BURST_IP | Requests pro IP in einer Minute | ip | minute | -
UA_G | User-Agent global | ua | - | -
PATH_404 | 404 Pfade je Site (site+path) | site | path | -
PATH_5XX | 5xx Pfade je Site (site+path) | site | path | -
Minute-Format:
minute wird als "YYYY-mm-dd HH:MM" ausgegeben.
Burst erklärt (IP vs BURST_IP):
- IP: Gesamtrequests pro IP im Zeitraum
- BURST_IP: Requests pro IP pro Minute (Spikes -> Scanner/Attacken)
Beispiele (Grundlagen):
# 1) Standard (Text, nur aktuelle Datei):
$PROG
# 2) Zeitraum: Auto-Range greift -> automatisch alle Rotationen:
$PROG --from "2026-02-21 00:00:00" --to "2026-02-21 23:59:59"
# 3) Zeitraum aber bewusst NUR aktuelle Datei (override):
$PROG -l ${DEFAULT_LOG} --from "2026-02-21 00:00:00" --to "2026-02-21 23:59:59"
# 4) Alle Rotationen explizit:
$PROG -l "${DEFAULT_LOG}*"
# 5) Sites die "aktions" enthalten, aber staging ausblenden:
$PROG --site-regex 'aktions' --exclude-site 'staging'
# 6) Nur 404s ansehen (z.B. für Scans):
$PROG --status 404
# 7) Nur POSTs ansehen (z.B. Login-Bruteforce):
$PROG --method POST
Beispiele (Angriff / DDoS / WordPress):
# A) Verdacht auf DDoS / Traffic-Spikes (stärkste IP-Minuten):
$PROG --out csv --csv-types "BURST_IP" --csv-top 50 > ddos-burst-ip.csv
# B) DDoS auf einzelne Site (wer trifft die Site? -> PAIR + Spikes):
$PROG --site www.example.tld --out csv --csv-types "PAIR,BURST_IP" --csv-top 100 > ddos-burst-site.csv
# C) DDoS/Spikes pro IP (Top 200):
$PROG --out csv --csv-types "BURST_IP" --csv-top 200 > burst-ip.csv
# D) "Wer greift welche Site an?" -> Top IPs pro Site (PAIR):
$PROG --out csv --csv-types "PAIR" --csv-top 300 > ips-pro-site.csv
# E) WordPress Suspects (Pfad/404/5xx + Burst + UA + IP):
$PROG --wp-suspects all --out csv --csv-types "PATH_404,PATH_5XX,BURST_IP,UA_G,IP" --csv-top 200 > wp-suspects.csv
# F) Bruteforce: POST auf wp-login.php (eine Site):
$PROG --site www.example.tld --method POST --path-prefix /wp-login.php --out csv --csv-types "IP,BURST_IP,UA_G" --csv-top 300 > wp-login-post.csv
# G) xmlrpc (häufig für Bruteforce/Amplification):
$PROG --path-prefix /xmlrpc.php --out csv --csv-types "SITE,IP,BURST_IP,UA_G" --csv-top 200 > xmlrpc.csv
# H) Viele 404-Scans:
$PROG --status 404 --out csv --csv-types "IP,BURST_IP,PATH_404,UA_G" --csv-top 300 > scan-404.csv
# I) Viele 5xx (Server unter Stress/Fehler):
$PROG --status '^5..$' --out csv --csv-types "SITE,PATH_5XX,IP,UA_G" --csv-top 200 > server-5xx.csv
# J) stdout nur Attack-Indikatoren:
$PROG --text-types "BURST_IP,UA_G,PATH_404,PATH_5XX"
EOF
}
# -----------------------------------------------------------------------------
# Argument parsing
# Every value-taking option first checks "$# -ge 2" so a missing value dies
# with a clear message instead of tripping `set -u` on the unset "$2".
# -----------------------------------------------------------------------------
while [[ $# -gt 0 ]]; do
case "$1" in
-h|--help) usage; exit 0 ;;
-l|--log)
[[ $# -ge 2 ]] || die "Fehlender Wert nach $1"
LOG_SPEC="$2"
USER_LOG_SPECIFIED=1 # explicit --log disables the auto-range switch below
shift 2
;;
--from) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; FROM_EPOCH="$(to_epoch "$2")"; shift 2 ;;
--to) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; TO_EPOCH="$(to_epoch "$2")"; shift 2 ;;
--status) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; FILTER_STATUS="$2"; shift 2 ;;
--method) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; FILTER_METHOD="$2"; shift 2 ;;
--path-prefix) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; FILTER_PATH_PREFIX="$2"; shift 2 ;;
--site) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; FILTER_SITE_EXACT="$2"; shift 2 ;;
--site-regex) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; FILTER_SITE_REGEX="$2"; shift 2 ;;
--exclude-site) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; FILTER_EXCLUDE_SITE_REGEX="$2"; shift 2 ;;
# --wp-suspects takes an OPTIONAL preset: consume "$2" only when it does not
# look like the next option (no leading '-').
--wp-suspects)
WP_SUSPECTS=1
if [[ $# -ge 2 && ! "$2" =~ ^- ]]; then WP_PRESET="$2"; shift 2; else WP_PRESET="all"; shift; fi
;;
--out)
[[ $# -ge 2 ]] || die "Fehlender Wert nach $1"
case "$2" in text|csv|tsv|json) OUT_FORMAT="$2" ;; *) die "Ungültig: $2" ;; esac
shift 2
;;
--top) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; [[ "$2" =~ ^[0-9]+$ ]] || die "--top erwartet Zahl"; TOP_N="$2"; shift 2 ;;
--text-types)
[[ $# -ge 2 ]] || die "Fehlender Wert nach $1"
TEXT_TYPES="$2"
shift 2
;;
--csv-types) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; CSV_TYPES="$2"; shift 2 ;;
--csv-top) [[ $# -ge 2 ]] || die "Fehlender Wert nach $1"; [[ "$2" =~ ^[0-9]+$ ]] || die "--csv-top erwartet Zahl"; CSV_TOP="$2"; shift 2 ;;
--csv-gap) CSV_GAP=1; shift ;;
--csv-no-gap) CSV_GAP=0; shift ;;
--debug) DEBUG=1; shift ;;
*) die "Unbekannte Option: $1 (nutze --help)" ;;
esac
done
# -----------------------------------------------------------------------------
# Auto-range log selection:
# If a time window was requested (--from and/or --to) and the user did NOT
# pass --log explicitly, switch to ip_requests.log* so rotated files
# (.1, .2.gz, ...) are included — the current file alone rarely covers
# an arbitrary time range.
# -----------------------------------------------------------------------------
if [[ "$USER_LOG_SPECIFIED" -eq 0 ]] && [[ -n "${FROM_EPOCH}${TO_EPOCH}" ]]; then
LOG_SPEC="${DEFAULT_LOG}*"
fi
# -----------------------------------------------------------------------------
# WordPress preset: installs a path filter for typical WP attack endpoints.
# Single presets reuse FILTER_PATH_PREFIX — via ${VAR:-default}, so an
# explicit --path-prefix from the user wins. "all" sets the internal regex
# instead. The regex uses [.] / [?] classes so gawk emits no warnings about
# unknown escape sequences.
# -----------------------------------------------------------------------------
if [[ "$WP_SUSPECTS" -eq 1 ]]; then
case "${WP_PRESET:-all}" in
login) FILTER_PATH_PREFIX="${FILTER_PATH_PREFIX:-/wp-login.php}" ;;
xmlrpc) FILTER_PATH_PREFIX="${FILTER_PATH_PREFIX:-/xmlrpc.php}" ;;
admin) FILTER_PATH_PREFIX="${FILTER_PATH_PREFIX:-/wp-admin}" ;;
api) FILTER_PATH_PREFIX="${FILTER_PATH_PREFIX:-/wp-json}" ;;
cron) FILTER_PATH_PREFIX="${FILTER_PATH_PREFIX:-/wp-cron.php}" ;;
all|"") FILTER_PATH_REGEX="^/(wp-login[.]php|xmlrpc[.]php|wp-admin|wp-json|wp-cron[.]php)([?]|/|$)" ;;
*) die "Unbekanntes WP preset: ${WP_PRESET}. Erlaubt: login|xmlrpc|admin|api|cron|all" ;;
esac
fi
# -----------------------------------------------------------------------------
# Expand the log spec (glob/list) into real files; abort when none exist.
# -----------------------------------------------------------------------------
mapfile -t REAL_FILES < <(expand_logs "$LOG_SPEC")
[[ ${#REAL_FILES[@]} -gt 0 ]] || die "Keine existierenden Logdateien aus --log: '$LOG_SPEC'"
# -----------------------------------------------------------------------------
# AWK core: parse + aggregation (gawk required for mktime()).
# Input fields (LogFormat "%a %v %p %t %r %>s \"%{User-Agent}i\" %T"):
#   $1 ip, $2 vhost, $3 port, $4+$5 [timestamp tz], $6 method, $7 path,
#   $8 protocol, $9 status; the first quoted string on the line is the UA.
# Output: one TYPE<TAB>COUNT<TAB>KEY1[<TAB>KEY2] record per aggregation key.
# -----------------------------------------------------------------------------
AWK_PROG='
function mon2num(m) {
  if (m=="Jan") return 1; if (m=="Feb") return 2; if (m=="Mar") return 3; if (m=="Apr") return 4;
  if (m=="May") return 5; if (m=="Jun") return 6; if (m=="Jul") return 7; if (m=="Aug") return 8;
  if (m=="Sep") return 9; if (m=="Oct") return 10; if (m=="Nov") return 11; if (m=="Dec") return 12;
  return 0
}
# Parse "dd/Mon/YYYY:HH:MM:SS[ +zzzz]" -> epoch seconds (local time; the tz
# offset is ignored, matching how Apache logs local time). Side effect: sets
# the global minute_key = YYYYmmddHHMM used for BURST_IP. Returns -1 when
# the string is unparsable. Distinct arrays per split(): reusing the source
# array as the target is undefined behavior in POSIX awk.
function parse_time(ts,    w, t, dmy, d, mon, y, hh, mm, ss, mnum) {
  split(ts, w, " ")            # w[1] = date:time, w[2] = tz offset (unused)
  split(w[1], t, ":")          # t[1] = dd/Mon/YYYY, t[2..4] = HH MM SS
  hh = t[2]; mm = t[3]; ss = t[4]
  split(t[1], dmy, "/")
  d = dmy[1]; mon = dmy[2]; y = dmy[3]
  mnum = mon2num(mon)
  if (mnum == 0) return -1
  minute_key = sprintf("%04d%02d%02d%02d%02d", y, mnum, d, hh, mm)
  return mktime(sprintf("%d %d %d %d %d %d", y, mnum, d, hh, mm, ss))
}
BEGIN {
  from = (FROM_EPOCH=="" ? -1 : FROM_EPOCH+0)
  to   = (TO_EPOCH==""   ? -1 : TO_EPOCH+0)
  f_status   = FILTER_STATUS
  f_method   = FILTER_METHOD
  f_prefix   = FILTER_PATH_PREFIX
  f_regex    = FILTER_PATH_REGEX
  site_exact = FILTER_SITE_EXACT
  site_re    = FILTER_SITE_REGEX
  ex_site_re = FILTER_EXCLUDE_SITE_REGEX
}
{
  ip   = $1
  site = $2
  # Site filters first: cheapest way to drop irrelevant lines early.
  if (site_exact != "" && site != site_exact) next
  if (site_re != "" && site !~ site_re) next
  if (ex_site_re != "" && site ~ ex_site_re) next
  ts = $4 " " $5
  gsub(/^\[/, "", ts); gsub(/\]$/, "", ts)
  epoch = parse_time(ts)
  if (epoch < 0) next
  if (from != -1 && epoch < from) next
  if (to != -1 && epoch > to) next
  method = $6
  path   = $7
  proto  = $8
  status = $9
  # Guard against malformed request lines that shift the field layout.
  if (proto !~ /^HTTP\//) next
  # User-Agent = content of the first quoted string on the line.
  # NOTE(review): assumes the first double quote starts the UA field, as in
  # the documented LogFormat; a quote inside the request path would break it.
  ua = ""
  q1 = index($0, "\"")
  if (q1 > 0) {
    rest = substr($0, q1+1)
    q2 = index(rest, "\"")
    if (q2 > 0) ua = substr(rest, 1, q2-1)
  }
  if (f_status != "" && status !~ f_status) next
  if (f_method != "" && method !~ f_method) next
  if (f_prefix != "" && index(path, f_prefix) != 1) next
  if (f_regex != "" && path !~ f_regex) next
  # Aggregation counters (see TYPE matrix in the file header).
  total++
  sites[site]++
  ips[ip]++
  k = site SUBSEP ip
  pair[k]++
  if (!(k in seen_pair)) { seen_pair[k]=1; uniq_ip_count[site]++ }
  burst_ip[ip SUBSEP minute_key]++
  if (ua != "") ua_global[ua]++
  if (status == "404") path404_site[site SUBSEP path]++
  if (status ~ /^5../) path5xx_site[site SUBSEP path]++
}
END {
  for (s in sites) print "SITE\t" sites[s] "\t" s
  for (i in ips) print "IP\t" ips[i] "\t" i
  for (k in pair) { split(k,a,SUBSEP); print "PAIR\t" pair[k] "\t" a[1] "\t" a[2] }
  for (s in uniq_ip_count) print "SITE_UNIQIP\t" uniq_ip_count[s] "\t" s
  for (k in burst_ip) { split(k,a,SUBSEP); print "BURST_IP\t" burst_ip[k] "\t" a[1] "\t" a[2] }
  for (ua in ua_global) print "UA_G\t" ua_global[ua] "\t" ua
  for (k in path404_site) { split(k,a,SUBSEP); print "PATH_404\t" path404_site[k] "\t" a[1] "\t" a[2] }
  for (k in path5xx_site) { split(k,a,SUBSEP); print "PATH_5XX\t" path5xx_site[k] "\t" a[1] "\t" a[2] }
  # total+0 forces a numeric 0 when nothing matched; otherwise awk prints
  # the empty string and the shell-side TOTAL would be blank.
  print "TOTAL\t" total+0 "\t-"
}
'
# Aggregation pass: stream every selected file (gzip-transparent) through ONE
# gawk process; all filter/aggregate logic lives in AWK_PROG above. Results
# are collected as TYPE<TAB>COUNT<TAB>KEY... records in $TMP.
TMP="$(mktemp)"
trap 'rm -f "$TMP"' EXIT # always remove the temp file, on any exit path
{
for f in "${REAL_FILES[@]}"; do emit_file "$f"; done
} | gawk \
-v FROM_EPOCH="${FROM_EPOCH}" \
-v TO_EPOCH="${TO_EPOCH}" \
-v FILTER_STATUS="${FILTER_STATUS}" \
-v FILTER_METHOD="${FILTER_METHOD}" \
-v FILTER_PATH_PREFIX="${FILTER_PATH_PREFIX}" \
-v FILTER_PATH_REGEX="${FILTER_PATH_REGEX}" \
-v FILTER_SITE_EXACT="${FILTER_SITE_EXACT}" \
-v FILTER_SITE_REGEX="${FILTER_SITE_REGEX}" \
-v FILTER_EXCLUDE_SITE_REGEX="${FILTER_EXCLUDE_SITE_REGEX}" \
"$AWK_PROG" > "$TMP"
# Overall request count after filtering (shown in the text header below).
TOTAL="$(gawk -F'\t' '$1=="TOTAL"{print $2}' "$TMP")"
# -----------------------------------------------------------------------------
# CSV/TSV export
# One block per TYPE: header row, then data rows (rank + count + keys), with
# a blank line between blocks. Input comes pre-sorted by TYPE, then COUNT
# descending. --csv-types filters blocks, --csv-top caps rows per block,
# --csv-gap inserts empty spacer columns (LibreOffice-friendly).
# Values are quoted via q(): embedded quotes doubled, tab/CR/NL flattened to
# spaces. BURST_IP minute keys are reformatted to "YYYY-mm-dd HH:MM".
# $1 = field delimiter (';' for CSV, a literal tab for TSV).
# -----------------------------------------------------------------------------
emit_records_delim() {
local delim="$1"
LC_ALL=C sort -t $'\t' -k1,1 -k2,2nr "$TMP" \
| gawk -F'\t' -v TYPES="$CSV_TYPES" -v TOPN="$CSV_TOP" -v GAP="$CSV_GAP" -v SEP="$delim" '
function q(s){ gsub(/\t/," ",s); gsub(/\r/," ",s); gsub(/\n/," ",s); gsub(/"/,"\"\"",s); return "\"" s "\"" }
function fmt_min(k){
if (k ~ /^[0-9]{12}$/) return substr(k,1,4) "-" substr(k,5,2) "-" substr(k,7,2) " " substr(k,9,2) ":" substr(k,11,2)
return k
}
function init_types(n,i,a){
if (TYPES==""){ use_all=1; return }
use_all=0
n=split(TYPES,a,",")
for(i=1;i<=n;i++){ gsub(/^[ \t]+|[ \t]+$/,"",a[i]); if(a[i]!="") want[a[i]]=1 }
}
function allowed(t){ return (use_all || (t in want)) }
function emit3(a,b,c){ if(GAP==1) print a SEP "" SEP b SEP "" SEP c; else print a SEP b SEP c }
function emit4(a,b,c,d){ if(GAP==1) print a SEP "" SEP b SEP "" SEP c SEP "" SEP d; else print a SEP b SEP c SEP d }
function header(t){
if(did_any) print ""
if (t=="SITE") emit3("rank","count","site")
else if (t=="IP") emit3("rank","count","ip")
else if (t=="PAIR") emit4("rank","count","site","ip")
else if (t=="SITE_UNIQIP") emit3("rank","unique_ips","site")
else if (t=="BURST_IP") emit4("rank","count","ip","minute")
else if (t=="UA_G") emit3("rank","count","user_agent")
else if (t=="PATH_404") emit4("rank","count","site","path_404")
else if (t=="PATH_5XX") emit4("rank","count","site","path_5xx")
else if (t=="TOTAL") print "total_requests"
else emit4("rank","count","key1","key2")
did_any=1
}
BEGIN{ prev=""; rank=0; did_any=0; topn=(TOPN+0); init_types() }
{
t=$1; c=$2; k1=$3; k2=$4
if(!allowed(t)) next
if(t!=prev){ prev=t; rank=0; header(t) }
if(t=="TOTAL"){ print c; next }
rank++
if(topn>0 && rank>topn) next
if (t=="BURST_IP") k2=fmt_min(k2)
if (t=="SITE") emit3(rank,c,q(k1))
else if (t=="IP") emit3(rank,c,q(k1))
else if (t=="PAIR") emit4(rank,c,q(k1),q(k2))
else if (t=="SITE_UNIQIP") emit3(rank,c,q(k1))
else if (t=="BURST_IP") emit4(rank,c,q(k1),q(k2))
else if (t=="UA_G") emit3(rank,c,q(k1))
else if (t=="PATH_404") emit4(rank,c,q(k1),q(k2))
else if (t=="PATH_5XX") emit4(rank,c,q(k1),q(k2))
else emit4(rank,c,q(k1),q(k2))
}
'
}
# JSON Lines export: one {"type",...} object per TMP record, sorted by TYPE
# then COUNT descending. Honors --csv-types as a TYPE filter; whitespace in
# the list is ignored, matching the trimming done by the CSV/TSV exporter
# (previously "A, B" silently matched nothing here).
emit_records_json() {
  local type_filter=",${CSV_TYPES//[[:space:]]/},"
  LC_ALL=C sort -t $'\t' -k1,1 -k2,2nr "$TMP" \
  | while IFS=$'\t' read -r t c k1 k2 k3; do
    if [[ -n "$CSV_TYPES" ]]; then
      case "$type_filter" in *",$t,"*) ;; *) continue ;; esac
    fi
    printf '{"type":"%s","count":%s,"key1":"%s","key2":"%s","key3":"%s"}\n' \
      "$(json_escape "${t:-}")" "${c:-0}" \
      "$(json_escape "${k1:-}")" "$(json_escape "${k2:-}")" "$(json_escape "${k3:-}")"
  done
}
# Dispatch on output format: csv/tsv/json emit and exit here; "text" falls
# through to the stdout report below.
case "$OUT_FORMAT" in
csv) emit_records_delim "$CSV_SEP"; exit 0 ;;
tsv) emit_records_delim $'\t'; exit 0 ;;
json) emit_records_json; exit 0 ;;
text) : ;;
*) die "Unbekanntes --out: $OUT_FORMAT" ;;
esac
# -----------------------------------------------------------------------------
# Text output: which TYPE blocks to print, chosen via --text-types
# (falls back to DEFAULT_TEXT_TYPES when unset).
# -----------------------------------------------------------------------------
declare -A WANT_TEXT=()
if [[ -n "$TEXT_TYPES" ]]; then
parse_list_to_map "$TEXT_TYPES" WANT_TEXT
else
parse_list_to_map "$DEFAULT_TEXT_TYPES" WANT_TEXT
fi
# True (exit 0) when TYPE $1 was selected for the text output.
text_wants() {
  [[ "${WANT_TEXT[$1]+found}" == "found" ]]
}
# -----------------------------------------------------------------------------
# Text output header: analyzed files, time window and active filters, so the
# report is self-describing when saved or pasted elsewhere.
# -----------------------------------------------------------------------------
echo ""
echo "== Apache ip_requests.log Analyse =="
echo "Dateien:"
for f in "${REAL_FILES[@]}"; do echo " - $f"; done
if [[ -n "${FROM_EPOCH}" || -n "${TO_EPOCH}" ]]; then
[[ -n "${FROM_EPOCH}" ]] && echo "Von: $(date -d "@${FROM_EPOCH}" "+%F %T %z")"
[[ -n "${TO_EPOCH}" ]] && echo "Bis: $(date -d "@${TO_EPOCH}" "+%F %T %z")"
fi
# Print the filter section only when at least one filter is active.
if [[ -n "${FILTER_STATUS}${FILTER_METHOD}${FILTER_PATH_PREFIX}${FILTER_PATH_REGEX}${FILTER_SITE_EXACT}${FILTER_SITE_REGEX}${FILTER_EXCLUDE_SITE_REGEX}" ]]; then
echo "Filter:"
[[ -n "${FILTER_STATUS}" ]] && echo " Status: ${FILTER_STATUS}"
[[ -n "${FILTER_METHOD}" ]] && echo " Methode: ${FILTER_METHOD}"
[[ -n "${FILTER_PATH_PREFIX}" ]] && echo " Path-Prefix: ${FILTER_PATH_PREFIX}"
[[ -n "${FILTER_PATH_REGEX}" ]] && echo " Path-Regex: ${FILTER_PATH_REGEX}"
[[ -n "${FILTER_SITE_EXACT}" ]] && echo " Site exact: ${FILTER_SITE_EXACT}"
[[ -n "${FILTER_SITE_REGEX}" ]] && echo " Site regex: ${FILTER_SITE_REGEX}"
[[ -n "${FILTER_EXCLUDE_SITE_REGEX}" ]] && echo " Exclude site: ${FILTER_EXCLUDE_SITE_REGEX}"
fi
echo "Gesamt (nach Filter): ${TOTAL}"
echo
# --debug: raw record counts per TYPE from the aggregation temp file.
if [[ "$DEBUG" -eq 1 ]]; then
echo "== DEBUG: Record-Counts pro TYPE (TMP) =="
gawk -F'\t' '{c[$1]++} END{for(t in c) printf "%-12s %d\n", t, c[t]}' "$TMP" | sort || true
echo
fi
# -----------------------------------------------------------------------------
# Text blocks (one per TYPE). Common pattern per block:
#   select TMP records by TYPE -> sort by COUNT desc -> head Top-N -> format.
# The trailing "|| true" keeps `set -e -o pipefail` from aborting when head
# closes the pipe early and the upstream sort/gawk die of SIGPIPE.
# -----------------------------------------------------------------------------
if text_wants "SITE"; then
echo "== Top ${TOP_N} Sites (Requests) =="
gawk -F'\t' '$1=="SITE"{print $2 "\t" $3}' "$TMP" | sort -nr -k1,1 | head -n "$TOP_N" \
| gawk -F'\t' 'BEGIN{printf "%-10s %s\n","COUNT","SITE"} {printf "%-10s %s\n",$1,$2}' || true
echo
fi
if text_wants "IP"; then
echo "== Top ${TOP_N} IPs (Requests gesamt) =="
gawk -F'\t' '$1=="IP"{print $2 "\t" $3}' "$TMP" | sort -nr -k1,1 | head -n "$TOP_N" \
| gawk -F'\t' 'BEGIN{printf "%-10s %s\n","COUNT","IP"} {printf "%-10s %s\n",$1,$2}' || true
echo
fi
if text_wants "SITE_UNIQIP"; then
echo "== Unique IPs pro Site (Top ${TOP_N}) =="
gawk -F'\t' '$1=="SITE_UNIQIP"{print $2 "\t" $3}' "$TMP" | sort -nr -k1,1 | head -n "$TOP_N" \
| gawk -F'\t' 'BEGIN{printf "%-10s %s\n","UNIQ_IPS","SITE"} {printf "%-10s %s\n",$1,$2}' || true
echo
fi
# BURST_IP: internal minute keys (YYYYmmddHHMM) are reformatted to
# "YYYY-mm-dd HH:MM" for display.
if text_wants "BURST_IP"; then
echo "== Top ${BURST_TOP} BURST_IP (Requests pro IP pro Minute) =="
echo "Hinweis: IP=gesamt; BURST_IP=Spikes pro Minute (oft Scanner/Attacke)."
gawk -F'\t' '
$1=="BURST_IP"{
c=$2; ip=$3; m=$4;
if (m ~ /^[0-9]{12}$/) mm=substr(m,1,4)"-"substr(m,5,2)"-"substr(m,7,2)" "substr(m,9,2)":"substr(m,11,2); else mm=m;
print c "\t" ip "\t" mm
}' "$TMP" | sort -nr -k1,1 | head -n "$BURST_TOP" \
| gawk -F'\t' 'BEGIN{printf "%-10s %-40s %s\n","COUNT","IP","MINUTE"} {printf "%-10s %-40s %s\n",$1,$2,$3}' || true
echo
fi
if text_wants "UA_G"; then
echo "== Top ${UA_TOP} User-Agents (global) =="
gawk -F'\t' '$1=="UA_G"{print $2 "\t" $3}' "$TMP" | sort -nr -k1,1 | head -n "$UA_TOP" \
| gawk -F'\t' 'BEGIN{printf "%-10s %s\n","COUNT","USER-AGENT"} {printf "%-10s %s\n",$1,$2}' || true
echo
fi
if text_wants "PATH_404"; then
echo "== Top ${PATH_ERR_TOP} PATH_404 (site + path) =="
gawk -F'\t' '$1=="PATH_404"{print $2 "\t" $3 "\t" $4}' "$TMP" | sort -nr -k1,1 | head -n "$PATH_ERR_TOP" \
| gawk -F'\t' 'BEGIN{printf "%-10s %-30s %s\n","COUNT","SITE","PATH_404"} {printf "%-10s %-30s %s\n",$1,$2,$3}' || true
echo
fi
if text_wants "PATH_5XX"; then
echo "== Top ${PATH_ERR_TOP} PATH_5XX (site + path) =="
gawk -F'\t' '$1=="PATH_5XX"{print $2 "\t" $3 "\t" $4}' "$TMP" | sort -nr -k1,1 | head -n "$PATH_ERR_TOP" \
| gawk -F'\t' 'BEGIN{printf "%-10s %-30s %s\n","COUNT","SITE","PATH_5XX"} {printf "%-10s %-30s %s\n",$1,$2,$3}' || true
echo
fi
# Closing hint for spreadsheet users.
echo "Tipp (LibreOffice):"
echo " $PROG --out csv --csv-top 100 > report.csv"
echo " (CSV Delimiter ';' default; --csv-no-gap ohne Leer-Spalten)"
echo ""
exit 0