Checkmk/local checks/gpfs_node_state.sh

112 lines
3.9 KiB
Bash
Raw Normal View History

2026-01-14 07:28:32 +01:00
#!/bin/bash
#
# Checkmk local check for IBM Spectrum Scale, based on `mmhealth node show`.
# Install as: /usr/lib/check_mk_agent/local/mmhealth_node
# Make it executable: chmod +x /usr/lib/check_mk_agent/local/mmhealth_node
#
# Emits a separate service line for every GPFS health component.
#
# NOTE(review): all error paths in this section report under the service name
# "GPFS HEALTH GLOBAL <host>", while the node result printed further down uses
# "GPFS HEALTH NODE <host>" - confirm that this split is intended.

# Location of the mmhealth binary
MMHEALTH_CMD="/usr/lpp/mmfs/bin/mmhealth"
HOSTNAME=$(hostname)

# Report CRITICAL and stop when mmhealth is missing or not executable.
# The script still exits 0 after printing its result line.
if [[ ! -x "$MMHEALTH_CMD" ]]; then
    echo "2 \"GPFS HEALTH GLOBAL $HOSTNAME\" - CRITICAL: mmhealth command not found at $MMHEALTH_CMD"
    exit 0
fi

# Run `mmhealth node show`; stderr is folded into the captured text
OUTPUT=$("$MMHEALTH_CMD" node show 2>&1)
EXIT_CODE=$?
if (( EXIT_CODE != 0 )); then
    echo "2 \"GPFS HEALTH GLOBAL $HOSTNAME\" - CRITICAL: mmhealth command failed with exit code $EXIT_CODE"
    exit 0
fi

# Third whitespace-separated word of the "Node status:" line
NODE_STATUS=$(awk '/Node status:/ {print $3}' <<< "$OUTPUT")
# Everything after the first colon of the "Status Change:" line, left-trimmed
STATUS_CHANGE=$(sed -n '/Status Change:/{s/^[^:]*:[[:space:]]*//;p;}' <<< "$OUTPUT")

# Without a node status the rest of the output cannot be interpreted
if [[ -z "$NODE_STATUS" ]]; then
    echo "2 \"GPFS HEALTH GLOBAL $HOSTNAME\" - CRITICAL: Could not parse node status from mmhealth output"
    exit 0
fi
# Map an mmhealth state name to a Checkmk state code.
# Arguments: $1 - state name as printed by mmhealth (matched case-sensitively)
# Outputs:   0 (OK), 1 (WARNING), 2 (CRITICAL) or 3 (UNKNOWN) on stdout
get_status_code() {
    local state=$1
    local code
    case "$state" in
        HEALTHY)
            code=0
            ;;
        # TIPS (improvement hints) is treated like a warning
        TIPS | DEGRADED | WARNING)
            code=1
            ;;
        UNHEALTHY | FAILED | ERROR)
            code=2
            ;;
        # Anything unrecognised, including an empty argument
        *)
            code=3
            ;;
    esac
    printf '%s\n' "$code"
}
# Map an mmhealth state name to the label used in the status text.
# Arguments: $1 - state name as printed by mmhealth (matched case-sensitively)
# Outputs:   OK, WARNING, CRITICAL or UNKNOWN on stdout
get_status_text() {
    local state=$1
    if [[ "$state" == "HEALTHY" ]]; then
        echo "OK"
    elif [[ "$state" == "TIPS" || "$state" == "DEGRADED" || "$state" == "WARNING" ]]; then
        echo "WARNING"
    elif [[ "$state" == "UNHEALTHY" || "$state" == "FAILED" || "$state" == "ERROR" ]]; then
        echo "CRITICAL"
    else
        # Unrecognised or empty state
        echo "UNKNOWN"
    fi
}
# Map an mmhealth state name to a number that can be graphed.
# Arguments: $1 - state name as printed by mmhealth (matched case-sensitively)
# Outputs:   1 (healthy), 0.5 (warning-level), -1 (critical-level) or
#            -2 (unrecognised) on stdout
get_perf_value() {
    local state=$1
    # Start from the "unrecognised" value and override it for known states
    local value="-2"
    case "$state" in
        HEALTHY) value="1" ;;
        TIPS | DEGRADED | WARNING) value="0.5" ;;
        UNHEALTHY | FAILED | ERROR) value="-1" ;;
    esac
    printf '%s\n' "$value"
}
# Overall node status.
# A Checkmk local check line has four columns:
#   <state> "<service name>" <metrics> <status detail>
# The metric therefore has to sit in the third column. Appending it
# Nagios-style as "| name=value" after the text (with "-" in the metrics
# column) only adds it to the status text and records no metric.
STATUS_CODE=$(get_status_code "$NODE_STATUS")
STATUS_TEXT=$(get_status_text "$NODE_STATUS")
PERF_VALUE=$(get_perf_value "$NODE_STATUS")
echo "$STATUS_CODE \"GPFS HEALTH NODE $HOSTNAME\" node_status=$PERF_VALUE $STATUS_TEXT: Node status is $NODE_STATUS ($STATUS_CHANGE)"
# Split one component row of the `mmhealth node show` table into its columns.
# Arguments: $1 - one table row, e.g. "GPFS   HEALTHY   2 days ago   -"
# Globals written: COMPONENT, COMP_STATUS, COMP_CHANGE, REASONS
#
# The columns are padded with a varying number of blanks, so the row is split
# on runs of whitespace; splitting on single spaces (cut -d' ') would land in
# the padding and return the status/change columns as "reasons".
# The status change is taken as three words ("2 days ago").
# NOTE(review): a one-word "Now" value is handled as well - this assumes
# mmhealth prints "Now" for very recent changes; confirm against real output.
parse_component_line() {
    local IFS=$' \t\n'
    local -a tok=()
    local reasons_at=5

    read -r -a tok <<< "$1"

    COMPONENT=${tok[0]:-}
    COMP_STATUS=${tok[1]:-}
    if [[ "${tok[2]:-}" == "Now" ]]; then
        # One-word status change: the reasons start right after it
        reasons_at=3
    fi
    # Words 3..(reasons_at) form the status change, the rest are the reasons;
    # "${tok[*]}" joins with a single space, so no trimming is needed.
    COMP_CHANGE="${tok[*]:2:reasons_at-2}"
    REASONS="${tok[*]:reasons_at}"
}

# Component rows: everything from the "Component" header line up to the next
# blank line, minus the header itself and the ruler line below it.
COMPONENT_DATA=$(sed -n '/^Component/,/^$/p' <<< "$OUTPUT" | tail -n +3)

# Emit one Checkmk service line per component row
while IFS= read -r line; do
    # Skip blank lines and lines consisting of dashes only
    [[ $line =~ ^-*$ ]] && continue

    parse_component_line "$line"

    # Skip rows without a usable component name
    if [[ -z "$COMPONENT" || "$COMPONENT" == "-" ]]; then
        continue
    fi

    # Checkmk state, label and graphable value for this component
    STATUS_CODE=$(get_status_code "$COMP_STATUS")
    STATUS_TEXT=$(get_status_text "$COMP_STATUS")
    PERF_VALUE=$(get_perf_value "$COMP_STATUS")

    # Append the reasons only when mmhealth reported any ("-" means none)
    if [[ -n "$REASONS" && "$REASONS" != "-" ]]; then
        REASON_TEXT=" - $REASONS"
    else
        REASON_TEXT=""
    fi

    # Local check layout: <state> "<service name>" <metrics> <status detail>.
    # The metric belongs in the third column; a trailing "| name=value" would
    # only become part of the status text.
    echo "$STATUS_CODE \"GPFS HEALTH $COMPONENT $HOSTNAME\" ${COMPONENT,,}_status=$PERF_VALUE $STATUS_TEXT: $COMPONENT is $COMP_STATUS ($COMP_CHANGE)$REASON_TEXT"
done <<< "$COMPONENT_DATA"