From e5962af62dc02fd6899e272e8fcbf301379eda4f Mon Sep 17 00:00:00 2001 From: magadimn Date: Tue, 13 Jan 2026 23:25:08 +0100 Subject: [PATCH] add files --- TSM/Dokumentation.md | 1648 +++++++++++++++++++++++++++++++++++++++ TSM/Licence | 21 + TSM/QUICKSTART.md | 338 ++++++++ TSM/README.md | 533 +++++++++++++ TSM/isntall.txt | 45 ++ TSM/tsm_backup_check.py | 0 TSM/tsm_backups.py | 128 +++ 7 files changed, 2713 insertions(+) create mode 100644 TSM/Dokumentation.md create mode 100644 TSM/Licence create mode 100644 TSM/QUICKSTART.md create mode 100644 TSM/README.md create mode 100644 TSM/isntall.txt create mode 100644 TSM/tsm_backup_check.py create mode 100644 TSM/tsm_backups.py diff --git a/TSM/Dokumentation.md b/TSM/Dokumentation.md new file mode 100644 index 0000000..5bb0af0 --- /dev/null +++ b/TSM/Dokumentation.md @@ -0,0 +1,1648 @@ +# TSM Backup Monitoring - Technische Dokumentation + + +## Inhaltsverzeichnis + + +1. [Architektur-Details](#architektur-details) +2. [API-Referenz](#api-referenz) +3. [Erweiterte Konfiguration](#erweiterte-konfiguration) +4. [Entwicklungsleitfaden](#entwicklungsleitfaden) +5. [Performance-Optimierung](#performance-optimierung) +6. [Sicherheit](#sicherheit) +7. [Integration](#integration) +8. [Best Practices](#best-practices) + + +--- + + +## 1. 
Architektur-Details + + +### 1.1 Komponenten-Übersicht + + +#### Agent Plugin (`tsm_backups`) + + +**Speicherort:** `/usr/lib/check_mk_agent/plugins/tsm_backups` + + +**Aufgaben:** +- CSV-Dateien aus `/mnt/CMK_TSM` einlesen +- Node-Namen normalisieren (RRZ*/NFRZ*-Präfixe entfernen) +- Backup-Daten pro Node aggregieren +- JSON-Output für CheckMK Agent generieren + + +**Ausführung:** +- Wird bei jedem Agent-Aufruf ausgeführt +- Standard-Intervall: 60 Sekunden (CheckMK Standard) +- Kann asynchron konfiguriert werden (siehe [Async Plugins](#async-plugins)) + + +**Output-Format:** +```json +{ +  "SERVER_MSSQL": { +    "statuses": ["Completed", "Completed", "Failed"], +    "schedules": ["DAILY_FULL", "DAILY_DIFF", "HOURLY_LOG"], +    "last": 1736693420, +    "count": 3 +  }, +  "DATABASE_HANA": { +    "statuses": ["Completed"], +    "schedules": ["00-00-00_FULL"], +    "last": 1736690000, +    "count": 1 +  } +} +``` + + +#### Check Plugin (`tsm_backups.py`) + + +**Speicherort:** `/omd/sites//local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py` + + +**Aufgaben:** +- JSON vom Agent parsen +- Services mit Labels discovern +- Backup-Status bewerten +- Metriken generieren +- Schwellwerte prüfen + + +**CheckMK API Version:** v2 (cmk.agent_based.v2) + + +### 1.2 Datenfluss-Diagramm + + +``` +┌──────────────────────────────────────────────────────────────────┐ +│ TSM Server                                                        │ +│                                                                    │ +│  SELECT                                                            │ +│    DATE_TIME, ENTITY, NODE_NAME, SCHEDULE_NAME, RESULT           │ +│  FROM ACTLOG                                                      │ +│  WHERE TIMESTAMP > CURRENT_TIMESTAMP - 24 HOURS                   │ +│                                                                    │ +│  ↓ Export als CSV                                                 │ +│  
/exports/backup-stats/TSM_BACKUP_SCHED_24H.CSV                  │ +└──────────────────────────────────────────────────────────────────┘ +                            │ +                            │ NFS/SCP/Rsync +                            ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ Host: /mnt/CMK_TSM/                                               │ +│  ├── TSM_BACKUP_SCHED_24H.CSV                                    │ +│  ├── TSM_DB_SCHED_24H.CSV                                        │ +│  └── TSM_FILE_SCHED_24H.CSV                                      │ +└──────────────────────────────────────────────────────────────────┘ +                            │ +                            ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ Agent Plugin: /usr/lib/check_mk_agent/plugins/tsm_backups        │ +│                                                                    │ +│  1. Liste CSV-Dateien in /mnt/CMK_TSM                            │ +│  2. Parse jede Zeile:                                             │ +│     - Extrahiere: timestamp, node, schedule, status              │ +│     - Validiere Node (Länge, MAINTENANCE)                        │ +│     - Normalisiere Node-Name:                                    │ +│       RRZ01_SERVER_MSSQL → _SERVER_MSSQL → SERVER_MSSQL         │ +│  3. Aggregiere pro Node:                                          │ +│     - Sammle alle Statuses                                       │ +│     - Sammle alle Schedules                                      │ +│     - Finde letzten Timestamp                                    │ +│     - Zähle Jobs                                                 │ +│  4. 
Generiere JSON-Output                                        │ +│                                                                    │ +│  Output: <<>>                                │ +│          {"SERVER_MSSQL": {...}, ...}                             │ +└──────────────────────────────────────────────────────────────────┘ +                            │ +                            │ CheckMK Agent Protocol +                            ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ CheckMK Server: Agent Section Parser                             │ +│                                                                    │ +│  parse_tsm_backups(string_table):                                │ +│    - Extrahiere JSON-String aus string_table[0][0]               │ +│    - Parse JSON → Python Dict                                    │ +│    - Return: {node: data, ...}                                   │ +└──────────────────────────────────────────────────────────────────┘ +                            │ +                            ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ Service Discovery                                                 │ +│                                                                    │ +│  discover_tsm_backups(section):                                  │ +│    FOR EACH node IN section:                                     │ +│      1. Extrahiere Metadata:                                     │ +│         - backup_type = extract_backup_type(node)                │ +│         - backup_level = extract_backup_level(schedules)         │ +│         - frequency = extract_frequency(schedules)               │ +│         - error_handling = get_error_handling(backup_type)       │ +│         - category = get_backup_category(backup_type)            │ +│      2. 
Erstelle Service mit Labels:                             │ +│         Service(                                                  │ +│           item=node,                                              │ +│           labels=[                                                │ +│             ServiceLabel("backup_type", backup_type),            │ +│             ServiceLabel("backup_category", category),           │ +│             ...                                                   │ +│           ]                                                       │ +│         )                                                         │ +└──────────────────────────────────────────────────────────────────┘ +                            │ +                            ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ Service Check Execution                                           │ +│                                                                    │ +│  check_tsm_backups(item, section):                               │ +│    1. Lade Node-Daten aus section[item]                          │ +│    2. Extrahiere Metadata (wie bei Discovery)                    │ +│    3. Berechne Status:                                            │ +│       - calculate_state() → (State, status_text)                 │ +│    4. Berechne Backup-Alter:                                     │ +│       - age = now - last_timestamp                               │ +│    5. Hole Schwellwerte:                                          │ +│       - thresholds = get_thresholds(type, level)                 │ +│    6. Prüfe Alter gegen Schwellwerte                             │ +│    7. 
Generiere Output:                                           │ +│       - Result(state, summary)                                    │ +│       - Metric("backup_age", age, levels)                        │ +│       - Metric("backup_jobs", count)                             │ +└──────────────────────────────────────────────────────────────────┘ +                            │ +                            ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ CheckMK Service                                                   │ +│                                                                    │ +│  Name: TSM Backup SERVER_MSSQL                                   │ +│  State: OK                                                        │ +│  Summary: Type=MSSQL (database), Level=FULL, Freq=daily,        │ +│           Status=Completed, Last=3h 15m, Jobs=3                  │ +│  Metrics:                                                         │ +│    - backup_age: 11700s (warn: 93600s, crit: 172800s)           │ +│    - backup_jobs: 3                                               │ +│  Labels:                                                          │ +│    - backup_type: mssql                                           │ +│    - backup_category: database                                    │ +│    - frequency: daily                                             │ +│    - backup_level: full                                           │ +│    - error_handling: strict                                       │ +└──────────────────────────────────────────────────────────────────┘ +``` + + +### 1.3 Node-Normalisierung im Detail + + +**Zweck:** TSM-Umgebungen mit redundanten Servern (z.B. RRZ01, RRZ02, NFRZ01) sollen als ein logischer Node überwacht werden. 
+ + +**Algorithmus:** + + +```python +def normalize_node_name(node): +    """ +    Input: "RRZ01_MYSERVER_MSSQL" +    +    Schritt 1: Entferne RRZ*/NFRZ*/RZ* Präfix mit Unterstrich +               Pattern: r'(RRZ|NFRZ|RZ)\d+(_)' +               Ergebnis: "_MYSERVER_MSSQL" +    +    Schritt 2: Entferne führenden Unterstrich +               Ergebnis: "MYSERVER_MSSQL" +    +    Schritt 3: Entferne RRZ*/NFRZ*/RZ* Suffix ohne Unterstrich +               Pattern: r'(RRZ|NFRZ|RZ)\d+$' +               Ergebnis: "MYSERVER_MSSQL" +    +    Output: "MYSERVER_MSSQL" +    """ +``` + + +**Beispiele:** + + +| Original Node | Normalisiert | Ergebnis | +|---------------|--------------|----------| +| `RRZ01_SERVER_MSSQL` | `SERVER_MSSQL` | ✅ | +| `RRZ02_SERVER_MSSQL` | `SERVER_MSSQL` | ✅ Zusammengeführt | +| `NFRZ01_DATABASE_HANA` | `DATABASE_HANA` | ✅ | +| `SERVER_FILE_RRZ01` | `SERVER_FILE` | ✅ | +| `MYSERVER_ORACLE` | `MYSERVER_ORACLE` | ✅ Unverändert | + + +--- + + +## 2. API-Referenz + + +### 2.1 Agent Plugin Funktionen + + +#### `TSMParser.normalize_node_name(node: str) -> str` + + +Normalisiert TSM-Node-Namen für Redundanz-Logik. + + +**Parameter:** +- `node` (str): Original TSM-Node-Name + + +**Returns:** +- `str`: Normalisierter Node-Name + + +**Beispiel:** +```python +parser = TSMParser() +normalized = parser.normalize_node_name("RRZ01_SERVER_MSSQL") +# normalized == "SERVER_MSSQL" +``` + + +--- + + +#### `TSMParser.is_valid_node(node: str, status: str) -> bool` + + +Prüft, ob ein Node für Monitoring valide ist. 
+ + +**Parameter:** +- `node` (str): Node-Name +- `status` (str): Backup-Status + + +**Returns:** +- `bool`: True wenn valide, False sonst + + +**Validierungs-Regeln:** +- Node muss existieren (not empty) +- Node muss mindestens 3 Zeichen lang sein +- Status muss existieren +- Node darf nicht "MAINTENANCE" enthalten + + +**Beispiel:** +```python +parser.is_valid_node("AB", "Completed")  # False (zu kurz) +parser.is_valid_node("SERVER_MSSQL", "Completed")  # True +parser.is_valid_node("SERVER_MAINTENANCE", "Completed")  # False +``` + + +--- + + +#### `TSMParser.parse_csv(csv_file: Path) -> None` + + +Parst eine TSM-CSV-Datei und sammelt Backup-Informationen. + + +**Parameter:** +- `csv_file` (Path): Pfad zur CSV-Datei + + +**CSV-Format:** +``` +TIMESTAMP,FIELD,NODE_NAME,SCHEDULE_NAME,STATUS +2026-01-12 08:00:00,SOMETHING,SERVER_MSSQL,DAILY_FULL,Completed +``` + + +**Side Effects:** +- Fügt geparste Backups zu `self.backups` hinzu + + +--- + + +#### `TSMParser.aggregate() -> dict` + + +Aggregiert Backup-Daten pro normalisiertem Node. + + +**Returns:** +```python +{ +    "SERVER_MSSQL": { +        "statuses": ["Completed", "Completed", "Failed"], +        "schedules": ["DAILY_FULL", "DAILY_DIFF", "HOURLY_LOG"], +        "last": 1736693420,  # Unix timestamp +        "count": 3 +    } +} +``` + + +--- + + +### 2.2 Check Plugin Funktionen + + +#### `extract_backup_type(node: str) -> str` + + +Extrahiert Backup-Typ aus Node-Namen anhand bekannter Typen. + + +**Parameter:** +- `node` (str): Normalisierter Node-Name + + +**Returns:** +- `str`: Backup-Typ in Kleinbuchstaben, oder "unknown" + + +**Bekannte Typen:** +- Datenbanken: MSSQL, HANA, Oracle, DB2, MySQL +- Virtualisierung: Virtual +- Dateisysteme: FILE, SCALE, DM, Datacenter +- Applikationen: Mail + + +**Algorithmus:** +1. Splitte Node-Namen bei Unterstrich +2. Nehme letztes Segment +3. Falls letztes Segment numerisch → nehme vorletztes Segment +4. Prüfe gegen Liste bekannter Typen +5. 
Return lowercase oder "unknown" + + +**Beispiele:** +```python +extract_backup_type("SERVER_MSSQL")           # → "mssql" +extract_backup_type("DATABASE_HANA_01")       # → "hana" +extract_backup_type("FILESERVER_FILE")        # → "file" +extract_backup_type("VM_VIRTUAL_123")         # → "virtual" +extract_backup_type("APP_UNKNOWN")            # → "unknown" +``` + + +--- + + +#### `extract_backup_level(schedules: list[str]) -> str` + + +Extrahiert Backup-Level aus Schedule-Namen. + + +**Parameter:** +- `schedules` (list[str]): Liste von Schedule-Namen + + +**Returns:** +- `str`: `"log"`, `"full"`, `"incremental"`, `"differential"` + + +**Priorität:** log > full > differential > incremental + + +**Erkennungs-Pattern:** +- `_LOG` oder `LOG` → log +- `_FULL` oder `FULL` → full +- `_INCR` oder `INCREMENTAL` → incremental +- `_DIFF` oder `DIFFERENTIAL` → differential + + +**Beispiele:** +```python +extract_backup_level(["DAILY_FULL"])                    # → "full" +extract_backup_level(["HOURLY_LOG", "DAILY_FULL"])     # → "log" +extract_backup_level(["00-00-00_FULL"])                 # → "full" +``` + + +--- + + +#### `extract_frequency(schedules: list[str]) -> str` + + +Extrahiert Backup-Frequenz aus Schedule-Namen. 
+ + +**Parameter:** +- `schedules` (list[str]): Liste von Schedule-Namen + + +**Returns:** +- `str`: `"hourly"`, `"daily"`, `"weekly"`, `"monthly"`, `"unknown"` + + +**Priorität:** hourly > daily > weekly > monthly + + +**Erkennungs-Pattern:** +- `HOURLY` → hourly +- `DAILY` → daily +- `WEEKLY` → weekly +- `MONTHLY` → monthly +- `HH-MM-SS_*LOG` → hourly (Zeit-basiert mit LOG) +- `00-00-00_*` → daily (Mitternacht) + + +**Beispiele:** +```python +extract_frequency(["DAILY_FULL"])               # → "daily" +extract_frequency(["00-00-00_FULL"])            # → "daily" +extract_frequency(["08-00-00_LOG"])             # → "hourly" +extract_frequency(["WEEKLY_FULL", "DAILY_DIFF"]) # → "daily" +``` + + +--- + + +#### `get_error_handling(backup_type: str) -> str` + + +Bestimmt Error-Handling-Strategie basierend auf Backup-Typ. + + +**Parameter:** +- `backup_type` (str): Backup-Typ + + +**Returns:** +- `str`: `"tolerant"` oder `"strict"` + + +**Logik:** +```python +if backup_type in TOLERANT_TYPES: +    return "tolerant"  # Failed → WARN +else: +    return "strict"    # Failed → CRIT +``` + + +**Tolerante Typen:** +- file, virtual, scale, dm, datacenter +- vmware, hyperv, mail, exchange + + +**Strikt:** +- Alle Datenbank-Typen (mssql, hana, oracle, db2, ...) +- Alle anderen Typen + + +--- + + +#### `get_backup_category(backup_type: str) -> str` + + +Kategorisiert Backup-Typ in Oberkategorien. 
+ + +**Parameter:** +- `backup_type` (str): Backup-Typ + + +**Returns:** +- `str`: `"database"`, `"virtualization"`, `"filesystem"`, `"application"`, `"other"` + + +**Kategorien:** + + +| Kategorie | Typen | +|-----------|-------| +| `database` | mssql, hana, oracle, db2, mysql, postgres, mariadb, sybase, mongodb | +| `virtualization` | virtual, vmware, hyperv, kvm, xen | +| `filesystem` | file, scale, dm, datacenter | +| `application` | mail, exchange | +| `other` | Alle anderen | + + +--- + + +#### `get_thresholds(backup_type: str, backup_level: str) -> dict` + + +Liefert typ- und level-spezifische Schwellwerte. + + +**Parameter:** +- `backup_type` (str): Backup-Typ +- `backup_level` (str): Backup-Level + + +**Returns:** +```python +{ +    "warn": 93600,   # Sekunden +    "crit": 172800   # Sekunden +} +``` + + +**Priorität:** +1. Falls `backup_level == "log"` → LOG-Schwellwerte (4h/8h) +2. Falls `backup_type` in THRESHOLDS → Typ-spezifische Schwellwerte +3. Sonst → Default-Schwellwerte (26h/48h) + + +**Beispiele:** +```python +get_thresholds("mssql", "log")   # → {"warn": 14400, "crit": 28800} +get_thresholds("mssql", "full")  # → {"warn": 93600, "crit": 172800} +get_thresholds("newtype", "full") # → {"warn": 93600, "crit": 172800} +``` + + +--- + + +#### `calculate_state(statuses: list[str], last_time: int, backup_type: str, error_handling: str) -> tuple[State, str]` + + +Berechnet CheckMK-Status aus Backup-Zuständen. 
+ + +**Parameter:** +- `statuses` (list[str]): Liste aller Backup-Statuses +- `last_time` (int): Unix-Timestamp des letzten Backups +- `backup_type` (str): Backup-Typ +- `error_handling` (str): "tolerant" oder "strict" + + +**Returns:** +- `tuple`: `(State, status_text)` + + +**Status-Logik-Tabelle:** + + +| Bedingung | Alter | Error Handling | State | Text | +|-----------|-------|----------------|-------|------| +| ≥1x "completed" | - | - | OK | "Completed" | +| Nur "pending"/"started" | <2h | - | OK | "Pending/Started" | +| Nur "pending"/"started" | >2h | - | WARN | "Pending (>2h)" | +| Nur "pending"/"started" | unknown | - | WARN | "Pending" | +| "failed"/"missed" | - | tolerant | WARN | "Failed (partial)" | +| "failed"/"missed" | - | strict | CRIT | "Failed/Missed" | +| Andere | - | - | CRIT | "Unknown State" | + + +**Beispiele:** +```python +calculate_state(["Completed", "Completed"], 1736690000, "mssql", "strict") +# → (State.OK, "Completed") + + +calculate_state(["Failed"], 1736690000, "file", "tolerant") +# → (State.WARN, "Failed (partial)") + + +calculate_state(["Failed"], 1736690000, "mssql", "strict") +# → (State.CRIT, "Failed/Missed") +``` + + +--- + + +### 2.3 CheckMK API v2 Objekte + + +#### `Service` + + +Definiert einen CheckMK-Service während der Discovery. + + +```python +from cmk.agent_based.v2 import Service, ServiceLabel + + +Service( +    item="SERVER_MSSQL", +    labels=[ +        ServiceLabel("backup_type", "mssql"), +        ServiceLabel("frequency", "daily"), +    ] +) +``` + + +--- + + +#### `Result` + + +Repräsentiert ein Check-Ergebnis. + + +```python +from cmk.agent_based.v2 import Result, State + + +Result( +    state=State.OK, +    summary="Type=MSSQL, Status=Completed, Last=3h" +) + + +Result( +    state=State.OK, +    notice="Detailed information for details page" +) +``` + + +--- + + +#### `Metric` + + +Definiert eine Performance-Metrik. 
+ + +```python +from cmk.agent_based.v2 import Metric + + +Metric( +    name="backup_age", +    value=11700,                    # Aktueller Wert +    levels=(93600, 172800),         # (warn, crit) +    boundaries=(0, None),            # (min, max) +) +``` + + +--- + + +## 3. Erweiterte Konfiguration + + +### 3.1 Benutzerdefinierte Backup-Typen + + +**Szenario:** Neuer Backup-Typ "SAPASE" (SAP ASE Datenbank) soll überwacht werden. + + +**Schritt 1: Typ zur known_types Liste hinzufügen** + + +```python +# In tsm_backups.py, extract_backup_type() Funktion +known_types = [ +    'MSSQL', 'HANA', 'FILE', 'ORACLE', 'DB2', 'SCALE', 'DM', +    'DATACENTER', 'VIRTUAL', 'MAIL', 'MYSQL', +    'SAPASE',  # NEU +] +``` + + +**Schritt 2: Schwellwerte definieren (optional)** + + +```python +THRESHOLDS = { +    # ... bestehende Einträge ... +    "sapase": {"warn": 26 * 3600, "crit": 48 * 3600}, +} +``` + + +**Schritt 3: Typ zur passenden Kategorie hinzufügen** + + +```python +DATABASE_TYPES = { +    'mssql', 'hana', 'db2', 'oracle', 'mysql', +    'sapase',  # NEU +} +``` + + +**Schritt 4: Error-Handling festlegen (optional)** + + +Falls tolerant gewünscht: +```python +TOLERANT_TYPES = { +    'file', 'virtual', 'scale', 'dm', 'datacenter', +    'vmware', 'hyperv', 'mail', 'exchange', +    'sapase',  # NEU (falls tolerant erwünscht) +} +``` + + +**Schritt 5: Plugin neu laden** + + +```bash +cmk -R +cmk -II --all +``` + + +**Ergebnis:** +- Nodes wie `SERVER_SAPASE` werden automatisch erkannt +- Typ-Label: `backup_type=sapase` +- Kategorie-Label: `backup_category=database` +- Schwellwerte: 26h/48h + + +--- + + +### 3.2 Async Agent Plugin + + +Bei großen TSM-Umgebungen kann das CSV-Parsing Zeit in Anspruch nehmen. Async-Plugins laufen unabhängig vom Agent-Intervall. 
+ + +**Konfiguration:** + + +```bash +# Als root auf dem Host +# TSM Backups async (alle 5 Minuten): Plugin in ein Unterverzeichnis +# verschieben, dessen Name das Ausführungsintervall in Sekunden ist +mkdir -p /usr/lib/check_mk_agent/plugins/300 +mv /usr/lib/check_mk_agent/plugins/tsm_backups /usr/lib/check_mk_agent/plugins/300/ +``` + + +**Oder mit CheckMK Bakery (Regel):** + + +``` +Setup > Agents > Agent Rules > Asynchronous execution of plugins (Windows, Linux) +``` + + +**Einstellungen:** +- Plugin: `tsm_backups` +- Execution interval: `300` Sekunden (5 Minuten) +- Cache age: `600` Sekunden (10 Minuten) + + +--- + + +### 3.3 CSV-Export Automation + + +#### Option A: NFS-Mount (empfohlen) + + +```bash +# /etc/fstab +tsm-server.example.com:/exports/backup-stats  /mnt/CMK_TSM  nfs  defaults,ro  0  0 + + +# Mount testen +mount -a +ls /mnt/CMK_TSM/ +``` + + +#### Option B: Rsync via Cron + + +```bash +# Crontab für root +*/15 * * * * rsync -az --delete tsm-server:/path/to/csv/ /mnt/CMK_TSM/ +``` + + +#### Option C: SCP mit SSH-Key + + +```bash +# SSH-Key einrichten +ssh-keygen -t ed25519 -f ~/.ssh/tsm_backup_key -N "" +ssh-copy-id -i ~/.ssh/tsm_backup_key.pub tsm-server + + +# Crontab +*/15 * * * * scp -i ~/.ssh/tsm_backup_key tsm-server:/path/*.CSV /mnt/CMK_TSM/ +``` + + +--- + + +### 3.4 Regel-basierte Service-Erstellung + + +**CheckMK-Regeln für automatische Service-Labels:** + + +``` +Setup > Services > Discovery rules > Host labels +``` + + +**Beispiel-Regel:** +```yaml +conditions: +  service_labels: +    backup_category: database +  +actions: +  add_labels: +    criticality: high +    team: dba +``` + + +--- + + +### 3.5 Custom Views + + +#### View: Alle kritischen Datenbank-Backups + + +``` +Setup > General > Custom views > Create new view + + +Name: Critical Database Backups +Datasource: All services + + +Filters: +- Service state: CRIT +- Service labels: backup_category = database + + +Columns: +- Host +- Service description +- Service state +- Service output +- Service labels: backup_type +- Service labels: frequency +- Perf-O-Meter +``` + + +--- + + +### 3.6 Custom 
Notifications + + +**Notification Rule: Nur strikte Failed-Backups eskalieren** + + +``` +Setup > Notifications > Add rule + + +Conditions: +- Service labels: error_handling = strict +- Service state: CRIT +- Service state type: HARD + + +Contact selection: +- Specify users: dba-team + + +Notification method: +- Email +- PagerDuty +``` + + +--- + + +## 4. Entwicklungsleitfaden + + +### 4.1 Entwicklungsumgebung einrichten + + +```bash +# CheckMK-Site für Entwicklung +omd create dev +omd start dev +su - dev + + +# Git-Repository +cd ~/local/lib/python3/cmk_addons/plugins/ +git init +git add . +git commit -m "Initial commit" + + +# Entwicklungs-Workflow +vim tsm/agent_based/tsm_backups.py +cmk -R +cmk -vv --debug test-host | grep "TSM Backup" +``` + + +--- + + +### 4.2 Unit Tests schreiben + + +**Test-Datei:** `test_tsm_backups.py` + + +```python +#!/usr/bin/env python3 +import pytest +from tsm_backups import ( +    extract_backup_type, +    extract_backup_level, +    calculate_state, +) +from cmk.agent_based.v2 import State + + +def test_extract_backup_type(): +    assert extract_backup_type("SERVER_MSSQL") == "mssql" +    assert extract_backup_type("DATABASE_HANA_01") == "hana" +    assert extract_backup_type("NEWTYPE_CUSTOM") == "unknown" + + +def test_extract_backup_level(): +    assert extract_backup_level(["DAILY_FULL"]) == "full" +    assert extract_backup_level(["HOURLY_LOG", "DAILY_FULL"]) == "log" + + +def test_calculate_state_completed(): +    state, text = calculate_state( +        ["Completed", "Completed"], +        1736690000, +        "mssql", +        "strict" +    ) +    assert state == State.OK +    assert text == "Completed" + + +def test_calculate_state_failed_strict(): +    state, text = calculate_state( +        ["Failed"], +        1736690000, +        "mssql", +        "strict" +    ) +    assert state == State.CRIT +    assert text == "Failed/Missed" + + +def test_calculate_state_failed_tolerant(): +    state, text = calculate_state( +        
["Failed"], +        1736690000, +        "file", +        "tolerant" +    ) +    assert state == State.WARN +    assert text == "Failed (partial)" +``` + + +**Tests ausführen:** +```bash +pytest test_tsm_backups.py -v +``` + + +--- + + +### 4.3 Code-Style + + +**PEP 8 Compliance:** +```bash +pip install black flake8 mypy + + +# Auto-Formatierung +black tsm_backups.py + + +# Linting +flake8 tsm_backups.py + + +# Type Checking +mypy tsm_backups.py +``` + + +--- + + +### 4.4 Debugging + + +#### Agent-Plugin debuggen + + +```bash +# Direkter Aufruf mit Traceback +python3 /usr/lib/check_mk_agent/plugins/tsm_backups + + +# Mit Debugger +python3 -m pdb /usr/lib/check_mk_agent/plugins/tsm_backups +``` + + +#### Check-Plugin debuggen + + +```bash +# Verbose Check mit Debug-Output +cmk -vv --debug hostname | less + + +# Nur TSM-Services +cmk -vv --debug hostname | grep -A 20 "TSM Backup" + + +# Python-Debugger im Plugin +import pdb; pdb.set_trace() +``` + + +--- + + +### 4.5 Performance-Profiling + + +```python +# In tsm_backups.py +import cProfile +import pstats + + +def main(): +    profiler = cProfile.Profile() +    profiler.enable() +    +    # ... bestehender Code ... +    +    profiler.disable() +    stats = pstats.Stats(profiler) +    stats.sort_stats('cumulative') +    stats.print_stats(20) +``` + + +--- + + +## 5. 
Performance-Optimierung + + +### 5.1 CSV-Parsing beschleunigen + + +**Problem:** Große CSV-Dateien (>100 MB) verlangsamen Agent + + +**Lösung 1: Nur relevante Zeilen parsen** + + +```python +def parse_csv_optimized(self, csv_file): +    # Nur die letzten 24h sind relevant +    cutoff_time = datetime.now() - timedelta(hours=24) +    +    with open(csv_file, 'r') as f: +        reader = csv.reader(f) +        for row in reader: +            try: +                time_str = row[0].strip() +                timestamp = datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S") +                +                # Skip alte Einträge +                if timestamp < cutoff_time: +                    continue +                +                # ... restliche Verarbeitung ... +            except: +                continue +``` + + +**Lösung 2: Pandas für große Dateien** + + +```python +import pandas as pd + + +def parse_csv_pandas(csv_file): +    df = pd.read_csv( +        csv_file, +        names=['timestamp', 'field', 'node', 'schedule', 'status'], +        parse_dates=['timestamp'], +    ) +    +    # Filter letzten 24h +    cutoff = pd.Timestamp.now() - pd.Timedelta(hours=24) +    df = df[df['timestamp'] > cutoff] +    +    # Aggregation +    grouped = df.groupby('node').agg({ +        'status': list, +        'schedule': list, +        'timestamp': 'max', +        'node': 'count' +    }) +    +    return grouped.to_dict() +``` + + +--- + + +### 5.2 Caching + + +**Problem:** CSV-Dateien ändern sich nur alle 15-30 Minuten + + +**Lösung: Cache mit Timestamp-Check** + + +```python +import json +from pathlib import Path +import time + + +CACHE_FILE = Path("/tmp/tsm_backups_cache.json") +CACHE_TTL = 300  # 5 Minuten + + +def get_cached_or_parse(): +    if CACHE_FILE.exists(): +        cache_age = time.time() - CACHE_FILE.stat().st_mtime +        if cache_age < CACHE_TTL: +            with open(CACHE_FILE, 'r') as f: +                return json.load(f) +    +    # Parse fresh +    
parser = TSMParser() +    # ... parse logic ... +    result = parser.aggregate() +    +    # Cache schreiben +    with open(CACHE_FILE, 'w') as f: +        json.dump(result, f) +    +    return result +``` + + +--- + + +### 5.3 Speicher-Optimierung + + +**Problem:** Große Listen von Status/Schedule strings + + +**Lösung: Nur unique values speichern** + + +```python +def aggregate_optimized(self): +    nodes = defaultdict(lambda: { +        "statuses": set(),       # Set statt Liste +        "schedules": set(), +        "last": None, +        "count": 0, +    }) +    +    for b in self.backups: +        node = b["node"] +        nodes[node]["count"] += 1 +        nodes[node]["statuses"].add(b["status"])  # Automatisch unique +        nodes[node]["schedules"].add(b["schedule"]) +        # ... rest ... +    +    # Konvertiere Sets zu Listen für JSON +    for node in nodes: +        nodes[node]["statuses"] = list(nodes[node]["statuses"]) +        nodes[node]["schedules"] = list(nodes[node]["schedules"]) +    +    return nodes +``` + + +--- + + +## 6. 
Sicherheit + + +### 6.1 Dateiberechtigungen + + +```bash +# Agent-Plugin +chown root:root /usr/lib/check_mk_agent/plugins/tsm_backups +chmod 755 /usr/lib/check_mk_agent/plugins/tsm_backups + + +# CSV-Verzeichnis +chown root:root /mnt/CMK_TSM +chmod 755 /mnt/CMK_TSM +chmod 644 /mnt/CMK_TSM/*.CSV + + +# Check-Plugin +chown <site>:<site> $OMD_ROOT/local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py +chmod 644 $OMD_ROOT/local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py +``` + + +--- + + +### 6.2 Input-Validierung + + +**Agent-Plugin:** +```python +def is_valid_node(self, node, status): +    # Länge prüfen +    if not node or len(node) < 3 or len(node) > 200: +        return False +    +    # Unerlaubte Zeichen +    if not re.match(r'^[A-Za-z0-9_-]+$', node): +        return False +    +    # Status whitelist +    valid_statuses = ['Completed', 'Failed', 'Missed', 'Pending', 'Started'] +    if status not in valid_statuses: +        return False +    +    return True +``` + + +--- + + +### 6.3 Sichere CSV-Verarbeitung + + +```python +def parse_csv_safe(self, csv_file): +    try: +        # Dateigröße prüfen (max 500 MB) +        if csv_file.stat().st_size > 500 * 1024 * 1024: +            return +        +        with open(csv_file, 'r', encoding='utf-8') as f: +            reader = csv.reader(f) +            +            line_count = 0 +            for row in reader: +                line_count += 1 +                +                # Max. 1 Million Zeilen +                if line_count > 1000000: +                    break +                +                # ... Verarbeitung ... +    except Exception as e: +        # Logging statt Crash +        pass +``` + + +--- + + +## 7. 
Integration + + +### 7.1 Grafana-Dashboard + + +**InfluxDB Query:** +```sql +SELECT +  mean("backup_age") AS "avg_age", +  max("backup_age") AS "max_age" +FROM "tsm_backups" +WHERE +  "backup_category" = 'database' +  AND time > now() - 7d +GROUP BY +  time(1h), +  "node_name" +``` + + +**Panels:** +- Backup Age Heatmap (pro Node) +- Status Distribution (Pie Chart) +- Backup Jobs Timeline +- Alert History + + +--- + + +### 7.2 Prometheus Exporter + + +**CheckMK Prometheus Exporter konfigurieren:** + + +``` +Setup > Exporter > Prometheus + + +Metrics: +- cmk_tsm_backups_backup_age_seconds +- cmk_tsm_backups_backup_jobs_total + + +Labels: +- backup_type +- backup_category +- frequency +``` + + +--- + + +### 7.3 REST API Zugriff + + +```python +import requests + + +# CheckMK REST API +url = "https://checkmk.example.com/site/check_mk/api/1.0" +headers = { +    "Authorization": "Bearer YOUR_API_KEY", +    "Accept": "application/json" +} + + +# Alle TSM-Services abfragen +response = requests.get( +    f"{url}/domain-types/service/collections/all", +    headers=headers, +    params={ +        "query": '{"op": "and", "expr": [{"op": "~", "left": "description", "right": "TSM Backup"}]}' +    } +) + + +services = response.json() +``` + + +--- + + +## 8. Best Practices + + +### 8.1 Naming Conventions + + +**Node-Namen:** +``` +✅ EMPFOHLEN: +- SERVER_MSSQL +- APP_ORACLE_01 +- FILESERVER_BACKUP + + +❌ VERMEIDEN: +- MSSQL (zu generisch) +- SERVER-PROD (Bindestrich kann Probleme machen) +- very_long_name_that_is_too_descriptive_mssql_backup_node (>50 Zeichen) +``` + + +**Schedule-Namen:** +``` +✅ EMPFOHLEN: +- DAILY_FULL +- HOURLY_LOG +- WEEKLY_FULL + + +❌ VERMEIDEN: +- PROD_BACKUP (keine Frequency erkennbar) +- BACKUP01 (keine Informationen) +``` + + +--- + + +### 8.2 Monitoring-Strategie + + +**Alarm-Eskalation:** + + +1. **Stufe 1 (INFO):** Backup Started/Pending +2. **Stufe 2 (WARN):** +   - Backup-Alter > WARN-Schwellwert +   - Failed (tolerante Typen) +   - Pending > 2h +3. 
**Stufe 3 (CRIT):** +   - Backup-Alter > CRIT-Schwellwert +   - Failed/Missed (strikte Typen) + + +**Notification Delays:** +``` +Setup > Notifications > Rules + + +WARN: Notify after 15 minutes (allow recovery) +CRIT: Notify immediately +``` + + +--- + + +### 8.3 Maintenance Windows + + +**Backup-Services während Maintenance pausieren:** + + +``` +Setup > Services > Service monitoring rules > Disabled checks + + +Conditions: +- Service labels: backup_system = tsm +- Timeperiod: maintenance_window + + +Action: Disable active checks +``` + + +--- + + +### 8.4 Dokumentation + + +**Pro Installation dokumentieren:** + + +1. **CSV-Export-Quelle:** Welcher TSM-Server, welche Queries +2. **CSV-Transfer-Methode:** NFS/SCP/Rsync + Schedule +3. **Benutzerdefinierte Typen:** Liste aller hinzugefügten Backup-Typen +4. **Angepasste Schwellwerte:** Begründung für Abweichungen +5. **Kontakte:** Wer ist für TSM-Backups verantwortlich + + +--- + + +### 8.5 Regelmäßige Wartung + + +**Monatlich:** +- CSV-Verzeichnis aufräumen (alte Dateien löschen) +- Überprüfen: Werden alle erwarteten Nodes gefunden? +- Alert-History analysieren: False Positives? + + +**Quartalsweise:** +- Schwellwerte überprüfen und ggf. anpassen +- Neue Backup-Typen dokumentieren +- Check-Plugin auf Updates prüfen + + +**Jährlich:** +- CheckMK-Upgrade-Kompatibilität testen +- Performance-Review (Agent-Laufzeit, Check-Dauer) +- Architektur-Review (Ist die Lösung noch passend?) + + +--- + + +## Anhang + + +### A. 
Glossar + + +| Begriff | Beschreibung | +|---------|--------------| +| **Agent Plugin** | Script auf dem überwachten Host, liefert Daten an CheckMK | +| **Check Plugin** | Code auf CheckMK-Server, erstellt Services und bewertet Status | +| **Service Label** | Key-Value-Paar, das einem Service zugeordnet ist (Filterung/Reporting) | +| **Discovery** | Prozess, bei dem CheckMK automatisch Services erstellt | +| **Threshold** | Schwellwert (WARN/CRIT) für eine Metrik | +| **Node** | TSM-Begriff für einen Backup-Client | +| **Schedule** | TSM-Begriff für einen geplanten Backup-Job | + + +--- + + +### B. Fehlercode-Referenz + + +| Fehler | Ursache | Lösung | +|--------|---------|--------| +| `Backup not found in data` | Node existiert in Discovery, aber nicht im aktuellen Agent-Output | CSV-Dateien prüfen, ggf. Re-Discovery | +| `Empty agent section` | Agent liefert keine Daten | Agent-Plugin-Ausführung prüfen, CSV-Verzeichnis prüfen | +| `JSON decode error` | Agent-Output ist kein valides JSON | Agent-Plugin manuell testen, Fehler im Output suchen | +| `Unknown State` | Unerwarteter Status vom TSM | Agent-Output prüfen, ggf. `calculate_state()` erweitern | + + +--- + + +### C. TSM-Query für CSV-Export + + +**Beispiel-Query für TSM-Server (dsmadmc):** + + +```sql +SELECT +  DATE(END_TIME) || ' ' || TIME(END_TIME) AS DATETIME, +  ENTITY, +  NODE_NAME, +  SCHEDULE_NAME, +  RESULT +FROM ACTLOG +WHERE +  SCHEDULE_NAME IS NOT NULL +  AND SCHEDULE_NAME != '' +  AND TIMESTAMPDIFF(4, CHAR(CURRENT_TIMESTAMP - END_TIME)) <= 24 +ORDER BY END_TIME DESC +``` + + +**Export als CSV:** +```bash +dsmadmc -id=admin -pa=password -comma \ +  "SELECT ... FROM ACTLOG ..." 
\ +  > /exports/backup-stats/TSM_BACKUP_SCHED_24H.CSV +``` + + +--- + + +**Ende der technischen Dokumentation** + + +**Letzte Aktualisierung:** 2026-01-12   +**Version:** 4.1   +**Autor:** Marius Gielnik \ No newline at end of file diff --git a/TSM/Licence b/TSM/Licence new file mode 100644 index 0000000..fca3b90 --- /dev/null +++ b/TSM/Licence @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Marius Gielnik + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/TSM/QUICKSTART.md b/TSM/QUICKSTART.md new file mode 100644 index 0000000..272ab85 --- /dev/null +++ b/TSM/QUICKSTART.md @@ -0,0 +1,338 @@ +# TSM Backup Monitoring - Schnellstart-Anleitung + +## 🚀 Quick Start (5 Minuten) + +Diese Anleitung führt Sie durch die grundlegende Installation des TSM Backup Monitoring Plugins für CheckMK. 
+ +--- + +## Voraussetzungen + +- ✅ CheckMK 2.3.0p40 oder höher +- ✅ Python 3.6+ auf überwachten Hosts +- ✅ TSM CSV-Export verfügbar +- ✅ Root-Zugriff auf überwachte Hosts +- ✅ Site-User-Zugriff auf CheckMK Server + +--- + +## Schritt 1: CSV-Export einrichten (5 Min) + +### Option A: NFS-Mount (empfohlen) + +```bash +# Auf dem überwachten Host als root +mkdir -p /mnt/CMK_TSM +echo "tsm-server:/exports/backup-stats /mnt/CMK_TSM nfs defaults,ro 0 0" >> /etc/fstab +mount -a + +# Test +ls -lh /mnt/CMK_TSM/*.CSV +``` + +### Option B: Rsync via Cron + +```bash +# Auf dem überwachten Host als root +mkdir -p /mnt/CMK_TSM + +# Crontab eintragen +crontab -e +# Füge hinzu: +*/15 * * * * rsync -az tsm-server:/path/to/*.CSV /mnt/CMK_TSM/ + +# Manueller Test +rsync -az tsm-server:/path/to/*.CSV /mnt/CMK_TSM/ +ls -lh /mnt/CMK_TSM/ +``` + +**CSV-Format verifizieren:** +```bash +head -n 2 /mnt/CMK_TSM/*.CSV +# Erwartete Ausgabe: +# 2026-01-12 08:00:00,FIELD,SERVER_MSSQL,DAILY_FULL,Completed +# 2026-01-12 09:15:00,FIELD,DATABASE_HANA,HOURLY_LOG,Completed +``` + +--- + +## Schritt 2: Agent-Plugin installieren (2 Min) + +```bash +# Auf dem überwachten Host als root +cd /usr/lib/check_mk_agent/plugins + +# Plugin kopieren (anpassen an deinen Pfad) +scp user@your-server:tsm_backups_agent.py ./tsm_backups + +# Oder wget (wenn auf Webserver verfügbar) +wget https://your-repo/tsm_backups_agent.py -O tsm_backups + +# Ausführbar machen +chmod +x tsm_backups + +# Test +./tsm_backups +``` + +**Erwartete Ausgabe:** +``` +<<<tsm_backups>>> +{"SERVER_MSSQL": {"statuses": ["Completed"], "schedules": ["DAILY_FULL"], "last": 1736693420, "count": 1}} +``` + +**❌ Fehler "Empty output"?** +- Prüfe: Existiert `/mnt/CMK_TSM/`? +- Prüfe: Sind CSV-Dateien vorhanden? (`ls /mnt/CMK_TSM/*.CSV`) +- Prüfe: Python3 installiert?
(`python3 --version`) + +--- + +## Schritt 3: Check-Plugin installieren (3 Min) + +```bash +# Auf dem CheckMK Server als Site-User +OM=/omd/sites/$(omd sites --bare | head -1) +cd $OM + +# Plugin-Verzeichnis erstellen +mkdir -p local/lib/python3/cmk_addons/plugins/tsm/agent_based + +# Plugin kopieren +cp /path/to/tsm_backups.py local/lib/python3/cmk_addons/plugins/tsm/agent_based/ + +# Rechte setzen +chmod 644 local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py + +# CheckMK neuladen +cmk -R + +# Test +cmk -vv hostname | grep "TSM Backup" +``` + +**Erwartete Ausgabe:** +``` +[agent] Received agent data: <<>> ... +TSM Backup SERVER_MSSQL: OK - Type=MSSQL (database), Level=FULL, Freq=daily, Status=Completed, Last=3h 15m, Jobs=1 +``` + +--- + +## Schritt 4: Service Discovery (2 Min) + +### Option A: WebUI + +1. Gehe zu: `Setup > Hosts` +2. Wähle deinen Host aus +3. Klicke: `Service Discovery` +4. Klicke: `Full Scan` +5. Warte auf Ergebnisse +6. Klicke: `Accept all` bei neuen Services +7. Klicke: `Activate on selected sites` + +### Option B: Command Line + +```bash +# Einzelner Host +cmk -II hostname + +# Alle Hosts +cmk -II --all + +# Änderungen aktivieren +cmk -O +``` + +**Erwartete Services:** +``` +TSM Backup SERVER_MSSQL +TSM Backup DATABASE_HANA +TSM Backup FILESERVER_FILE +... +``` + +--- + +## Schritt 5: Verifizierung (2 Min) + +### 1. Services prüfen + +```bash +# Status aller TSM-Services +cmk -n hostname | grep "TSM Backup" +``` + +**Erwartetes Ergebnis:** +``` +TSM Backup SERVER_MSSQL OK - Type=MSSQL (database), Level=FULL, Freq=daily, Status=Completed, Last=3h 15m, Jobs=1 +TSM Backup DATABASE_HANA OK - Type=HANA (database), Level=FULL, Freq=daily, Status=Completed, Last=5h 20m, Jobs=1 +``` + +### 2. 
Labels prüfen + +WebUI: `Monitor > Services > > Service labels` + +**Erwartete Labels:** +- `backup_type: mssql` +- `backup_category: database` +- `backup_system: tsm` +- `frequency: daily` +- `backup_level: full` +- `error_handling: strict` + +### 3. Metriken prüfen + +WebUI: `Monitor > Services > > Service Metrics` + +**Erwartete Metriken:** +- `backup_age`: [Sekunden seit letztem Backup] +- `backup_jobs`: [Anzahl Jobs] + +--- + +## Troubleshooting Quick-Fixes + +### Problem: Keine Services gefunden + +```bash +# 1. Agent-Output prüfen +check_mk_agent | grep -A 5 "<< General > Custom views > Create new view + +Filter: Service labels: backup_category = database +``` + +### 2. Benachrichtigungen konfigurieren + +Erstelle Notification Rule für kritische Backup-Fehler: +``` +Setup > Notifications > Add rule + +Conditions: +- Service state: CRIT +- Service labels: error_handling = strict + +Contact: dba-team +``` + +### 3. Schwellwerte anpassen (optional) + +Wenn Standard-Schwellwerte nicht passen: +```bash +# Plugin bearbeiten +vim $OM/local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py + +# Suche nach THRESHOLDS = { +# Passe Werte an, z.B.: +# "mssql": {"warn": 30 * 3600, "crit": 50 * 3600}, + +# CheckMK neuladen +cmk -R +cmk -II --all +``` + +--- + +## Hilfe & Support + +- 📖 **README.md:** Ausführliche Feature-Übersicht und Verwendung +- 📚 **DOCUMENTATION.md:** Technische Details, API-Referenz, erweiterte Konfiguration +- 🐛 **Issues:** GitHub Issues oder CheckMK Forum +- 📧 **Kontakt:** [deine-email]@example.com + +--- + +## Cheat Sheet + +```bash +# === Agent-Seite (überwachter Host) === +# Plugin testen +/usr/lib/check_mk_agent/plugins/tsm_backups + +# CSV-Dateien prüfen +ls -lh /mnt/CMK_TSM/*.CSV + +# === CheckMK-Seite (Server) === +# Plugin neu laden +cmk -R + +# Service Discovery +cmk -II hostname + +# Service-Status prüfen +cmk -n hostname | grep "TSM Backup" + +# Debug-Modus +cmk -vv --debug hostname | grep -A 20 "TSM Backup" + +# Alle 
TSM-Services aktivieren +cmk -O +``` + +--- + +**Installations-Dauer:** ~15 Minuten +**Schwierigkeit:** Mittel +**Letzte Aktualisierung:** 2026-01-12 +**Version:** 4.1 \ No newline at end of file diff --git a/TSM/README.md b/TSM/README.md new file mode 100644 index 0000000..ba2a415 --- /dev/null +++ b/TSM/README.md @@ -0,0 +1,533 @@ +# TSM Backup Monitoring für CheckMK + +Ein vollständiges CheckMK-Plugin zur Überwachung von IBM Spectrum Protect (TSM) Backups mit intelligenter Backup-Typ-Erkennung, umfassenden Labels und flexiblen Schwellwerten. + +## 📋 Inhaltsverzeichnis + +- [Features](#features) +- [Anforderungen](#anforderungen) +- [Installation](#installation) +- [Konfiguration](#konfiguration) +- [Verwendung](#verwendung) +- [Architektur](#architektur) +- [Troubleshooting](#troubleshooting) +- [Changelog](#changelog) + +--- + +## ✨ Features + +### Kernfunktionen +- **Umfassende Backup-Typ-Erkennung**: Unterstützung für 18+ Backup-Typen +- **Flexible Labels**: Umfassende Service-Labels für erweiterte Filterung und Reporting +- **Typ-spezifische Schwellwerte**: Individuelle Warn- und Kritisch-Schwellen für verschiedene Backup-Typen +- **Redundanz-Support**: Normalisierung von RRZ*/NFRZ* Node-Namen +- **Aggregierte Überwachung**: Pro-Node-Aggregierung mehrerer Backup-Jobs +- **Intelligentes Error-Handling**: Tolerantes vs. 
striktes Verhalten je nach Backup-Typ + +### Unterstützte Backup-Typen + +#### Datenbanken (strikte Überwachung) +- MSSQL (26h/48h) +- SAP HANA (26h/48h) +- Oracle (26h/48h) +- DB2 (26h/48h) +- MySQL/MariaDB (26h/48h) +- PostgreSQL (26h/48h) +- Sybase (26h/48h) +- MongoDB (26h/48h) + +#### Dateisysteme (tolerante Überwachung) +- FILE (36h/72h) +- SCALE (36h/72h) +- DM (36h/72h) +- Datacenter (36h/72h) + +#### Virtualisierung (tolerante Überwachung) +- Virtual/VMware (36h/72h) +- Hyper-V (36h/72h) + +#### Applikationen +- Mail/Exchange (26h/48h) +- Transaction Logs (4h/8h) + +--- + +## 🔧 Anforderungen + +### CheckMK Server +- CheckMK Version: **2.3.0p40** oder höher +- Python 3.8+ +- Zugriff auf `/omd/sites//local/lib/python3/cmk_addons/` + +### Überwachte Hosts +- Python 3.6+ +- Lesezugriff auf TSM CSV-Export-Verzeichnis +- CheckMK Agent installiert + +### TSM Server +- CSV-Export der Backup-Statistiken +- Export-Format: `YYYY-MM-DD HH:MM:SS,,NODE_NAME,SCHEDULE,STATUS` + +--- + +## 📦 Installation + +### Schritt 1: Agent-Plugin installieren + +Das Agent-Plugin muss auf **jedem Host** installiert werden, der TSM-Backups überwachen soll. 
+ +```bash +# Als root auf dem überwachten Host +cd /usr/lib/check_mk_agent/plugins + +# Plugin herunterladen oder kopieren +wget https://your-repo/tsm_backups_agent.py -O tsm_backups +# ODER +scp user@server:tsm_backups_agent.py /usr/lib/check_mk_agent/plugins/tsm_backups + +# Ausführbar machen +chmod +x /usr/lib/check_mk_agent/plugins/tsm_backups + +# Test +./tsm_backups +``` + +**Erwartete Ausgabe:** +``` +<<<tsm_backups>>> +{"SERVER_MSSQL": {"statuses": ["Completed"], "schedules": ["DAILY_FULL"], "last": 1736693420, "count": 1}, ...} +``` + +### Schritt 2: CSV-Verzeichnis vorbereiten + +```bash +# CSV-Verzeichnis erstellen +mkdir -p /mnt/CMK_TSM +chmod 755 /mnt/CMK_TSM + +# TSM-CSV-Dateien bereitstellen +# Option A: NFS-Mount vom TSM Server +mount -t nfs tsm-server:/exports/backup-stats /mnt/CMK_TSM + +# Option B: Regelmäßiger SCP/Rsync +# Crontab-Eintrag: +*/15 * * * * rsync -az tsm-server:/path/to/*.CSV /mnt/CMK_TSM/ +``` + +**Erwartete CSV-Struktur:** +``` +/mnt/CMK_TSM/ +├── TSM_BACKUP_SCHED_24H.CSV +├── TSM_DB_SCHED_24H.CSV +└── TSM_FILE_SCHED_24H.CSV +``` + +### Schritt 3: Check-Plugin installieren + +Das Check-Plugin wird auf dem **CheckMK Server** installiert.
+ +```bash +# Als Site-User +OM=/omd/sites/monitoring +cd $OM + +# Plugin-Verzeichnis erstellen +mkdir -p local/lib/python3/cmk_addons/plugins/tsm/agent_based + +# Plugin kopieren +cp tsm_backups.py local/lib/python3/cmk_addons/plugins/tsm/agent_based/ + +# Rechte setzen +chmod 644 local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py + +# CheckMK Cache leeren +cmk -R +``` + +### Schritt 4: Service Discovery + +```bash +# Service Discovery für einen Host +cmk -II hostname + +# Bulk Discovery für alle Hosts +cmk -II --all + +# WebUI: Setup > Hosts > > Service Discovery > Full Scan +``` + +--- + +## ⚙️ Konfiguration + +### Schwellwerte anpassen + +Die Schwellwerte können direkt im Check-Plugin angepasst werden: + +**Datei:** `local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py` + +```python +THRESHOLDS = { + "log": {"warn": 4 * 3600, "crit": 8 * 3600}, # 4h/8h + "mssql": {"warn": 26 * 3600, "crit": 48 * 3600}, # 26h/48h + # ... weitere Typen ... + "default": {"warn": 26 * 3600, "crit": 48 * 3600}, +} +``` + +### Neue Backup-Typen hinzufügen + +**Szenario:** Ein neuer Typ "SAPASE" soll unterstützt werden. + +**Schritt 1: Typ zur known_types Liste hinzufügen** + +```python +# In extract_backup_type() Funktion +known_types = [ + 'MSSQL', 'HANA', 'FILE', 'ORACLE', 'DB2', 'SCALE', 'DM', + 'DATACENTER', 'VIRTUAL', 'MAIL', 'MYSQL', 'POSTGRES', + 'MARIADB', 'EXCHANGE', 'VMWARE', 'HYPERV', 'SYBASE', 'MONGODB', + 'SAPASE', # NEU +] +``` + +**Schritt 2: Schwellwerte definieren (optional)** + +```python +THRESHOLDS = { + # ... bestehende ... 
+ "sapase": {"warn": 26 * 3600, "crit": 48 * 3600}, +} +``` + +**Schritt 3: Kategorie zuweisen (optional)** + +```python +DATABASE_TYPES = { + 'mssql', 'hana', 'db2', 'oracle', 'mysql', + 'postgres', 'mariadb', 'sybase', 'mongodb', + 'sapase', # NEU +} +``` + +**Nach Änderungen:** +```bash +cmk -R +cmk -II --all +``` + +### Tolerantes Error-Handling konfigurieren + +Backup-Typen mit tolerantem Verhalten (Failed → WARNING statt CRITICAL): + +```python +TOLERANT_TYPES = { + 'file', 'virtual', 'scale', 'dm', 'datacenter', + 'vmware', 'hyperv', 'mail', 'exchange', + 'custom_tolerant_type' # NEU +} +``` + +### CSV-Verzeichnis ändern + +Im Agent-Plugin (`/usr/lib/check_mk_agent/plugins/tsm_backups`): + +```python +CSV_DIR = Path("/mnt/CMK_TSM") # Hier anpassen +``` + +--- + +## 🚀 Verwendung + +### Service-Labels nutzen + +Jeder TSM-Backup-Service erhält automatisch folgende Labels: + +| Label | Werte | Beschreibung | +|-------|-------|--------------| +| `backup_type` | `mssql`, `hana`, `file`, ... 
| Erkannter Backup-Typ | +| `backup_category` | `database`, `virtualization`, `filesystem`, `application`, `other` | Kategorie | +| `backup_system` | `tsm` | Backup-System | +| `frequency` | `hourly`, `daily`, `weekly`, `monthly` | Backup-Frequenz | +| `backup_level` | `log`, `full`, `incremental`, `differential` | Backup-Level | +| `error_handling` | `tolerant`, `strict` | Fehlerbehandlung | +| `node_name` | Original Node-Name | TSM-Node | + +### Beispiele: Label-basierte Filterung + +#### Views erstellen + +**CheckMK GUI:** +`Setup > General > Custom views > Create new view` + +**Filter-Beispiele:** +- **Alle Datenbank-Backups:** + `Service labels: backup_category = database` + +- **Alle fehlgeschlagenen strikten Backups:** + `State: CRIT` + `Service labels: error_handling = strict` + +- **Alle MSSQL-Backups mit täglicher Frequenz:** + `Service labels: backup_type = mssql AND frequency = daily` + +#### Business Intelligence (BI) + +```python +# BI-Aggregation: Alle DB-Backups OK? +{ + "type": "bi_aggregation", + "title": "Database Backups", + "filter": { + "service_labels": { + "backup_category": "database" + } + } +} +``` + +### Status-Bewertungslogik + +| Bedingung | Error Handling | Ergebnis | +|-----------|---------------|----------| +| ≥ 1x Completed | - | OK ✅ | +| Only Pending/Started (<2h) | - | OK ✅ | +| Only Pending/Started (>2h) | - | WARN ⚠️ | +| Failed/Missed | Tolerant | WARN ⚠️ | +| Failed/Missed | Strict | CRIT 🔴 | +| Age > Threshold | - | WARN/CRIT ⚠️🔴 | + +--- + +## 🏗️ Architektur + +### Komponenten-Übersicht + +``` +┌─────────────────────────────────────────────────────────┐ +│ TSM Server │ +│ ┌────────────────────────────────────────────────┐ │ +│ │ Backup Jobs → CSV Export │ │ +│ │ (via TSM Queries oder Export-Scripts) │ │ +│ └────────────────┬───────────────────────────────┘ │ +└───────────────────┼──────────────────────────────────────┘ + │ CSV-Dateien + │ (z.B. 
via NFS, SCP, Rsync) + ▼ +┌─────────────────────────────────────────────────────────┐ +│ Überwachter Host │ +│ ┌────────────────────────────────────────────────┐ │ +│ │ /mnt/CMK_TSM/*.CSV │ │ +│ └────────────────┬───────────────────────────────┘ │ +│ │ │ +│ ┌────────────────▼───────────────────────────────┐ │ +│ │ Agent Plugin: tsm_backups │ │ +│ │ - Liest CSV-Dateien │ │ +│ │ - Normalisiert Node-Namen │ │ +│ │ - Aggregiert pro Node │ │ +│ │ - Gibt JSON aus │ │ +│ └────────────────┬───────────────────────────────┘ │ +└───────────────────┼──────────────────────────────────────┘ + │ JSON via CheckMK Agent + ▼ +┌─────────────────────────────────────────────────────────┐ +│ CheckMK Server │ +│ ┌────────────────────────────────────────────────┐ │ +│ │ Check Plugin: tsm_backups │ │ +│ │ - Parsed JSON │ │ +│ │ - Erstellt Services mit Labels │ │ +│ │ - Bewertet Status │ │ +│ │ - Prüft Schwellwerte │ │ +│ │ - Erzeugt Metriken │ │ +│ └────────────────┬───────────────────────────────┘ │ +│ │ │ +│ ┌────────────────▼───────────────────────────────┐ │ +│ │ CheckMK Services │ │ +│ │ - TSM Backup SERVER_MSSQL │ │ +│ │ - TSM Backup VM_HYPERV_01 │ │ +│ │ - ... │ │ +│ └────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────┘ +``` + +### Datenfluss + +1. **TSM Server**: Exportiert Backup-Statistiken als CSV +2. **Host**: Agent-Plugin liest CSV, aggregiert und normalisiert Daten +3. **CheckMK**: Check-Plugin empfängt JSON, erstellt Services, bewertet Status +4. 
**Output**: Services mit Labels, Metriken, Status + +### Node-Normalisierung + +**Problem:** Redundante TSM-Server-Nodes +**Beispiel:** +- `RRZ01_MYSERVER_MSSQL` +- `RRZ02_MYSERVER_MSSQL` +- `NFRZ01_MYSERVER_MSSQL` + +**Lösung:** Normalisierung entfernt `RRZ*/NFRZ*`-Präfixe +**Ergebnis:** Ein Service `MYSERVER_MSSQL` aggregiert alle Nodes + +--- + +## 🔍 Troubleshooting + +### Problem: Keine Services gefunden + +**Diagnose:** +```bash +# Agent-Ausgabe prüfen +check_mk_agent | grep -A 20 "<<>> +{"NODE1": {...}, "NODE2": {...}} +``` + +**Lösungen:** +- CSV-Verzeichnis `/mnt/CMK_TSM` existiert? +- CSV-Dateien vorhanden? (`ls -lh /mnt/CMK_TSM/*.CSV`) +- Agent-Plugin ausführbar? (`ls -l /usr/lib/check_mk_agent/plugins/tsm_backups`) +- Manuell testen: `/usr/lib/check_mk_agent/plugins/tsm_backups` + +### Problem: Services bleiben UNKNOWN + +**Diagnose:** +```bash +# Check-Plugin testen +cmk -nv --debug hostname | grep "TSM Backup" +``` + +**Lösungen:** +- Check-Plugin korrekt installiert? +- Plugin-Cache löschen: `cmk -R` +- Discovery erneut: `cmk -II hostname` + +### Problem: Falsche Backup-Typen erkannt + +**Node-Namen-Konvention überprüfen:** +``` +✅ KORREKT: +- SERVER_MSSQL +- DATABASE_HANA_01 +- FILESERVER_FILE +- VM_HYPERV_123 + +❌ FALSCH: +- MSSQL (zu kurz) +- SERVER (kein Typ) +- SERVER_12345 (Typ unklar) +``` + +**Lösung:** Node-Namen-Schema anpassen oder `extract_backup_type()` erweitern + +### Problem: CSV-Dateien werden nicht gelesen + +**CSV-Format prüfen:** +```bash +head -n 5 /mnt/CMK_TSM/TSM_BACKUP_SCHED_24H.CSV +``` + +**Erwartetes Format:** +``` +2026-01-12 08:00:00,FIELD,SERVER_MSSQL,DAILY_FULL,Completed +2026-01-12 09:15:00,FIELD,DATABASE_HANA,HOURLY_LOG,Completed +``` + +**Spalten:** +1. Timestamp (`YYYY-MM-DD HH:MM:SS`) +2. Beliebiges Feld +3. **Node-Name** +4. **Schedule-Name** +5. 
**Status** + +### Problem: Logs analysieren + +```bash +# CheckMK-Log +tail -f /omd/sites/monitoring/var/log/cmc.log | grep tsm + +# Agent-Plugin debuggen +/usr/lib/check_mk_agent/plugins/tsm_backups 2>&1 | tee /tmp/tsm_debug.log +``` + +--- + +## 📊 Metriken + +### Erzeugte Metriken + +| Metrik | Beschreibung | Einheit | Schwellwerte | +|--------|--------------|---------|--------------| +| `backup_age` | Zeit seit letztem Backup | Sekunden | Typ-spezifisch | +| `backup_jobs` | Anzahl Backup-Jobs | Count | - | + +### Grafana-Integration + +**Beispiel-Query (InfluxDB):** +```sql +SELECT mean("backup_age") +FROM "tsm_backups" +WHERE "backup_type" = 'mssql' + AND time > now() - 7d +GROUP BY time(1h), "node_name" +``` + +--- + +## 🔄 Changelog + +### Version 5.0 (2026-01-12) +- ✨ **Dynamische Backup-Typ-Erkennung**: Keine feste Liste mehr nötig +- ✨ **Backup-Kategorien**: Zusätzliches Label `backup_category` +- ✨ **Erweiterte Kommentierung**: Vollständige Docstrings +- ✨ **Neue Typen**: PostgreSQL, MariaDB, MongoDB +- 🐛 **Bugfix**: ServiceLabel API für CheckMK 2.3.0p40 + +### Version 4.0 (2026-01-10) +- ✨ ServiceLabel API-Kompatibilität mit CheckMK 2.3.0p40 +- 📝 Erweiterte Dokumentation + +### Version 3.0 (2025-12-15) +- ✨ Node-Normalisierung für Redundanz +- ✨ Aggregation pro logischem Node +- 🔧 Typ-spezifische Schwellwerte + +### Version 2.0 (2025-11-20) +- ✨ Tolerantes Error-Handling +- ✨ Service-Labels + +### Version 1.0 (2025-11-01) +- 🎉 Initiales Release + +--- + +## 📝 Lizenz + +MIT License - Siehe LICENSE Datei + +## 👤 Autor + +**Marius Gielnik** +IT Product Owner - CheckMK Monitoring +GC-Gruppe (Cordes und Gräfe KG) + +## 🤝 Support + +- **Issues:** GitHub Issues +- **Fragen:** CheckMK Community Forum +- **Email:** [deine-email]@example.com + +--- + +## 📚 Weiterführende Links + +- [CheckMK Plugin Development](https://docs.checkmk.com/latest/de/devel_check_plugins.html) +- [IBM Spectrum Protect Documentation](https://www.ibm.com/docs/en/spectrum-protect) +- 
[CheckMK Labels](https://docs.checkmk.com/latest/de/labels.html) + +--- + +**Letzte Aktualisierung:** 2026-01-12 +**Version:** 4.1 \ No newline at end of file diff --git a/TSM/isntall.txt b/TSM/isntall.txt new file mode 100644 index 0000000..fd7018e --- /dev/null +++ b/TSM/isntall.txt @@ -0,0 +1,45 @@ +TSM BACKUP MONITORING FÜR CHECKMK +================================== + +Installation in 3 Schritten +---------------------------- + +1. AGENT-PLUGIN (auf jedem überwachten Host) + Kopiere: tsm_backups_agent.py + Nach: /usr/lib/check_mk_agent/plugins/tsm_backups + Rechte: chmod +x /usr/lib/check_mk_agent/plugins/tsm_backups + +2. CHECK-PLUGIN (auf CheckMK Server) + Kopiere: tsm_backups_check.py + Nach: /omd/sites/<site>/local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py + Dann: cmk -R + +3. SERVICE DISCOVERY + cmk -II <hostname> + +Detaillierte Anleitungen +------------------------ + +→ QUICKSTART.md - 5-Minuten Schnellstart-Anleitung +→ README.md - Feature-Übersicht und Verwendung +→ DOCUMENTATION.md - Technische Details und API-Referenz + +Dateien +------- + +tsm_backups_agent.py - Agent-Plugin (auf überwachten Hosts) +tsm_backups_check.py - Check-Plugin (auf CheckMK Server) +README.md - Hauptdokumentation +QUICKSTART.md - Schnellstart-Anleitung +DOCUMENTATION.md - Technische Dokumentation +CHANGELOG.md - Versionshistorie +LICENSE - MIT-Lizenz + +Support +------- + +Autor: Marius Gielnik +Version: 5.0.0 +Datum: 2026-01-12 + +Bei Fragen: Siehe README.md oder DOCUMENTATION.md \ No newline at end of file diff --git a/TSM/tsm_backup_check.py b/TSM/tsm_backup_check.py new file mode 100644 index 0000000..e69de29 diff --git a/TSM/tsm_backups.py b/TSM/tsm_backups.py new file mode 100644 index 0000000..4dec3a0 --- /dev/null +++ b/TSM/tsm_backups.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +""" +TSM Backup Status Agent Plugin for CheckMK +- Reads CSV files from /mnt/CMK_TSM +- Normalizes RRZ*/NFRZ* node names for redundancy +- Aggregates backups per logical node +- Outputs
# Remainder of the module description that begins above this block:
#   ... data as CheckMK agent section
#
#   Installation: /usr/lib/check_mk_agent/plugins/tsm_backups
#   Permissions:  chmod +x tsm_backups
#
#   Author:  Marius Gielnik
#   Version: 1.0 - Agent Plugin for CheckMK 2.3+
import csv
import json
import re
import sys
from collections import defaultdict
from datetime import datetime
from pathlib import Path

# Directory the TSM CSV exports are expected in.
CSV_DIR = Path("/mnt/CMK_TSM")

# Name announced in the agent section header.
# NOTE(review): assumed to match the section name the check plugin registers
# (the plugin files are called "tsm_backups") - confirm against the check plugin.
SECTION_NAME = "tsm_backups"

# Timestamp layout of the first CSV column.
TIME_FORMAT = "%Y-%m-%d %H:%M:%S"


class TSMParser:
    """Collect backup rows from TSM CSV exports and aggregate them per node.

    Rows are accumulated in ``self.backups`` by :meth:`parse_csv` (which may be
    called for several files) and condensed by :meth:`aggregate`.
    """

    # A redundancy tag (RRZ<n>, NFRZ<n> or RZ<n>) directly followed by "_".
    _TAG_BEFORE_UNDERSCORE = re.compile(r'(RRZ|NFRZ|RZ)\d+(_)')
    # The same tag when it terminates the node name.
    _TAG_AT_END = re.compile(r'(RRZ|NFRZ|RZ)\d+$')
    # Runs of underscores left behind after a tag was removed.
    _UNDERSCORE_RUN = re.compile(r'_{2,}')

    def __init__(self):
        # One dict per accepted CSV row: time, node (normalized), status, schedule.
        self.backups = []

    def normalize_node_name(self, node):
        """Return *node* with its RRZ<n>/NFRZ<n>/RZ<n> redundancy tags removed.

        Redundant nodes such as ``RRZ01_SRV_MSSQL`` and ``NFRZ01_SRV_MSSQL``
        must collapse to the same key (``SRV_MSSQL``) so that they are
        aggregated into a single entry.

        Args:
            node: Raw node name from the CSV export.

        Returns:
            The normalized name. If stripping the tags would leave nothing
            (e.g. the node is just ``RRZ01``), the original name is returned so
            the entry never ends up under an empty key.
        """
        normalized = self._TAG_BEFORE_UNDERSCORE.sub(r'\2', node)
        normalized = self._TAG_AT_END.sub('', normalized)
        # Removing a tag leaves its separator behind ("_SRV", "SRV__MSSQL",
        # "SRV_"); tidy those up so every redundant variant yields one name.
        normalized = self._UNDERSCORE_RUN.sub('_', normalized).strip('_')
        return normalized or node

    def is_valid_node(self, node, status):
        """Tell whether a CSV row should be taken into account.

        A row is rejected when the node name is empty or shorter than three
        characters, when the status is empty, or when the node name contains
        ``MAINTENANCE``.

        Args:
            node: Raw node name from the CSV export.
            status: Status column of the same row.

        Returns:
            ``True`` if the row is usable, ``False`` otherwise.
        """
        if not node or len(node) < 3 or not status:
            return False
        return "MAINTENANCE" not in node

    def parse_csv(self, csv_file):
        """Read one CSV export and append its usable rows to ``self.backups``.

        Expected columns: timestamp, (ignored), node name, schedule, status.
        Rows with fewer than five columns or failing :meth:`is_valid_node`
        are skipped.

        Reading is best-effort: a file that cannot be opened or parsed is
        reported on stderr and skipped, so one broken export does not prevent
        the remaining files from being reported. Rows collected before the
        error are kept.

        Args:
            csv_file: Path of the CSV file to read.
        """
        try:
            # utf-8-sig also accepts files without a BOM; errors='replace'
            # keeps a single undecodable byte from discarding the rest of the
            # file; newline='' is what the csv module asks for.
            with open(csv_file, 'r', encoding='utf-8-sig', errors='replace',
                      newline='') as handle:
                for row in csv.reader(handle):
                    if len(row) < 5:
                        continue

                    node = row[2].strip()
                    status = row[4].strip()
                    if not self.is_valid_node(node, status):
                        continue

                    self.backups.append({
                        "time": row[0].strip(),
                        "node": self.normalize_node_name(node),
                        "status": status,
                        "schedule": row[3].strip(),
                    })
        except (OSError, csv.Error) as exc:
            # stderr keeps the diagnostic out of the agent section on stdout.
            print(f"tsm_backups: skipping {csv_file}: {exc}", file=sys.stderr)

    def aggregate(self):
        """Condense the collected rows into one record per normalized node.

        Returns:
            A dict mapping node name to a dict with:
            ``statuses`` and ``schedules`` (lists, one item per row, in input
            order), ``count`` (number of rows) and ``last`` (Unix timestamp of
            the newest parsable row time as ``int``, or ``None`` if no row time
            could be parsed).
        """
        nodes = defaultdict(lambda: {
            "statuses": [],
            "schedules": [],
            "last": None,
            "count": 0,
        })

        for backup in self.backups:
            entry = nodes[backup["node"]]
            entry["count"] += 1
            entry["statuses"].append(backup["status"])
            entry["schedules"].append(backup["schedule"])

            try:
                finished = datetime.strptime(backup["time"], TIME_FORMAT)
            except (ValueError, TypeError):
                # Unparsable time: the row still counts, it just cannot
                # contribute to "last".
                continue
            if entry["last"] is None or finished > entry["last"]:
                entry["last"] = finished

        # datetime objects are not JSON serializable, so emit Unix timestamps.
        # The parsed values are naive, hence timestamp() interprets them in the
        # local time zone of the host running this plugin.
        return {
            node: {
                "statuses": data["statuses"],
                "schedules": data["schedules"],
                "last": (int(data["last"].timestamp())
                         if data["last"] is not None else None),
                "count": data["count"],
            }
            for node, data in nodes.items()
        }


def main():
    """Print the agent section: a header line followed by one JSON document.

    Files matching ``*_SCHED_24H.CSV`` in ``CSV_DIR`` are preferred; if there
    are none, every ``*.CSV``/``*.csv`` file is read instead. A missing
    directory yields an empty JSON object rather than no section at all.
    """
    parser = TSMParser()

    if CSV_DIR.is_dir():
        # Sorted (and de-duplicated, in case both glob patterns hit the same
        # file on a case-insensitive file system) for a reproducible order.
        csv_files = sorted(CSV_DIR.glob("*_SCHED_24H.CSV"))
        if not csv_files:
            csv_files = sorted({*CSV_DIR.glob("*.CSV"), *CSV_DIR.glob("*.csv")})
        for csv_file in csv_files:
            parser.parse_csv(csv_file)

    # A section header needs a name between the angle brackets. sep(0) asks
    # Checkmk not to split the line on whitespace, so the JSON document
    # arrives at the check plugin as a single field.
    print(f"<<<{SECTION_NAME}:sep(0)>>>")
    print(json.dumps(parser.aggregate()))


if __name__ == "__main__":
    main()