Checkmk/local checks/gpfs_ces_state.sh

135 lines
4.8 KiB
Bash
Raw Permalink Normal View History

2026-01-14 07:28:32 +01:00
#!/bin/bash
#
# Checkmk local check for IBM Spectrum Scale CES (Cluster Export Services)
# File: /usr/lib/check_mk_agent/local/mmces_state
# Make executable: chmod +x /usr/lib/check_mk_agent/local/mmces_state
#
# Creates a separate service for every CES component.
#
# Path to the mmces command
MMCES_CMD="/usr/lpp/mmfs/bin/mmces"
HOSTNAME=$(hostname)
# Column order assumed when the output carries no header row.
DEFAULT_COLUMNS="NODE AUTH BLOCK NETWORK HDFS_NAMENODE AUTH_OBJ NFS OBJ S3 SMB CES"
# Check that mmces is available
if [ ! -x "$MMCES_CMD" ]; then
    echo "2 MMCES_Global - CRITICAL: mmces command not found at $MMCES_CMD"
    exit 0
fi
# Run "mmces state show" for this node. Only stdout is captured: stderr is
# deliberately discarded so that a warning line can never be mistaken for the
# data row; failure is still detected through the exit code.
OUTPUT=$("$MMCES_CMD" state show -N "$HOSTNAME" 2>/dev/null)
EXIT_CODE=$?
if [ "$EXIT_CODE" -ne 0 ]; then
    echo "2 MMCES_Global - CRITICAL: mmces command failed with exit code $EXIT_CODE"
    exit 0
fi
# Locate the header row (first field is exactly "NODE") and the first data row.
# Blank lines and separator lines starting with "-" are skipped.
HEADER_RE='^NODE([[:space:]]|$)'
BLANK_RE='^[[:space:]]*$'
HEADER_LINE=""
DATA_LINE=""
while IFS= read -r line; do
    if [[ "$line" =~ $BLANK_RE || "$line" == -* ]]; then
        continue
    elif [[ "$line" =~ $HEADER_RE ]]; then
        HEADER_LINE=$line
    else
        DATA_LINE=$line
        break
    fi
done <<< "$OUTPUT"
if [ -z "$DATA_LINE" ]; then
    echo "2 MMCES_Global - CRITICAL: No data found in mmces output"
    exit 0
fi
# Map values to variables by column NAME rather than by position, so that a
# different column set or order in the mmces output cannot shift a state onto
# the wrong service. Columns that are not present stay empty.
NODE="" AUTH="" BLOCK="" NETWORK="" HDFS_NAMENODE="" AUTH_OBJ=""
NFS="" OBJ="" S3="" SMB="" CES=""
read -r -a col_names <<< "${HEADER_LINE:-$DEFAULT_COLUMNS}"
read -r -a col_values <<< "$DATA_LINE"
for idx in "${!col_names[@]}"; do
    case "${col_names[idx]}" in
        NODE | AUTH | BLOCK | NETWORK | HDFS_NAMENODE | AUTH_OBJ | NFS | OBJ | S3 | SMB | CES)
            # Only whitelisted names are assigned, so the header text can
            # never write to an arbitrary shell variable.
            printf -v "${col_names[idx]}" '%s' "${col_values[idx]:-}"
            ;;
    esac
done
# Translate a CES state name ($1) into a Checkmk status code, printed on stdout:
#   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
get_status_code() {
    local state=$1
    local code=3   # anything unrecognised (including empty) is UNKNOWN
    if [[ $state == HEALTHY || $state == DISABLED ]]; then
        code=0     # DISABLED is a normal condition and counts as OK
    elif [[ $state == DEGRADED || $state == WARNING ]]; then
        code=1
    elif [[ $state == UNHEALTHY || $state == FAILED || $state == ERROR ]]; then
        code=2
    fi
    printf '%s\n' "$code"
}
# Translate a CES state name ($1) into the human-readable status label that
# is printed on stdout (OK, OK (Disabled), WARNING, CRITICAL or UNKNOWN).
get_status_text() {
    local label
    case "$1" in
        HEALTHY)
            label="OK"
            ;;
        DISABLED)
            label="OK (Disabled)"
            ;;
        DEGRADED | WARNING)
            label="WARNING"
            ;;
        UNHEALTHY | FAILED | ERROR)
            label="CRITICAL"
            ;;
        *)
            label="UNKNOWN"
            ;;
    esac
    printf '%s\n' "$label"
}
# Translate a CES state name ($1) into a numeric value for graphing, printed
# on stdout: 1 healthy, 0 disabled, 0.5 degraded/warning, -1 failed, -2 unknown.
get_perf_value() {
    local state=$1
    local value="-2"   # fallback for any unrecognised state
    if [[ $state == HEALTHY ]]; then
        value="1"
    elif [[ $state == DISABLED ]]; then
        value="0"
    elif [[ $state == DEGRADED || $state == WARNING ]]; then
        value="0.5"
    elif [[ $state == UNHEALTHY || $state == FAILED || $state == ERROR ]]; then
        value="-1"
    fi
    printf '%s\n' "$value"
}
# Emit one Checkmk service per CES component.
#
# Checkmk local-check line format (space separated):
#   STATUS_CODE "SERVICE NAME" METRICS STATUS_DETAIL
# The metric must be placed in the third field. A "-" there means "no
# metrics", and a Nagios-style "| name=value" suffix would only be shown as
# part of the status detail text instead of being recorded as a metric.
#
# Table rows: <variable holding the state>|<description>|<metric name>
# The variable name doubles as the component name inside the service name.
CES_SERVICES=(
    "AUTH|Authentication service|auth"
    "BLOCK|Block service|block"
    "NETWORK|Network service|network"
    "HDFS_NAMENODE|HDFS NameNode service|hdfs_namenode"
    "AUTH_OBJ|Authentication Object service|auth_obj"
    "NFS|NFS service|nfs"
    "OBJ|Object service|obj"
    "S3|S3 service|s3"
    "SMB|SMB service|smb"
    "CES|CES overall status|ces"
)
for entry in "${CES_SERVICES[@]}"; do
    IFS='|' read -r SERVICE DESCRIPTION METRIC <<< "$entry"
    # Indirect expansion: fetch the value of the variable named by $SERVICE
    # (e.g. $AUTH, $NFS) as filled in by the parsing step earlier in the file.
    STATE=${!SERVICE}
    STATUS_CODE=$(get_status_code "$STATE")
    STATUS_TEXT=$(get_status_text "$STATE")
    PERF_VALUE=$(get_perf_value "$STATE")
    printf '%s "GPFS CES STATE %s %s" %s=%s %s: %s is %s\n' \
        "$STATUS_CODE" "$SERVICE" "$HOSTNAME" \
        "$METRIC" "$PERF_VALUE" \
        "$STATUS_TEXT" "$DESCRIPTION" "$STATE"
done