#!/usr/bin/env python3
"""
TSM Backup Status Check Plugin for CheckMK 2.3+

- Parses tsm_backups agent section
- Creates services with labels (backup_type, frequency, level, error_handling)
- Dynamic backup type detection
- Configurable type-specific thresholds
- Tolerant error handling for FILE/VIRTUAL backups

Installation: /omd/sites/monitoring/local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py
Check name: tsm_backups
Author: Marius Gielnik
Version: 4.1 - Fixed Label Detection & Comprehensive Documentation
"""

from typing import Any, Mapping
from datetime import datetime
import json
import re

from cmk.agent_based.v2 import (
    AgentSection,
    CheckPlugin,
    CheckResult,
    DiscoveryResult,
    Result,
    Service,
    ServiceLabel,
    State,
    Metric,
    render,
    StringTable,
)

# ============================================================================
# Configuration Section
# ============================================================================

# Type-specific thresholds (in seconds).
# Every key except "log" and "default" is also treated as a recognizable
# backup type (see KNOWN_BACKUP_TYPES), so adding an entry here is enough to
# have the new type detected in node names.
THRESHOLDS = {
    "log": {"warn": 4 * 3600, "crit": 8 * 3600},  # Hourly LOG backups
    "mssql": {"warn": 26 * 3600, "crit": 48 * 3600},  # Daily DB backups
    "hana": {"warn": 26 * 3600, "crit": 48 * 3600},
    "db2": {"warn": 26 * 3600, "crit": 48 * 3600},
    "oracle": {"warn": 26 * 3600, "crit": 48 * 3600},
    "mysql": {"warn": 26 * 3600, "crit": 48 * 3600},
    "file": {"warn": 36 * 3600, "crit": 72 * 3600},  # More tolerant
    "virtual": {"warn": 36 * 3600, "crit": 72 * 3600},
    "mail": {"warn": 26 * 3600, "crit": 48 * 3600},
    "scale": {"warn": 36 * 3600, "crit": 72 * 3600},
    "dm": {"warn": 36 * 3600, "crit": 72 * 3600},
    "datacenter": {"warn": 36 * 3600, "crit": 72 * 3600},
    "default": {"warn": 26 * 3600, "crit": 48 * 3600},  # Fallback
}

# Backup types recognized in node names (lowercase). "log" is a backup level
# and "default" is the fallback entry, so neither is a node type.
KNOWN_BACKUP_TYPES = frozenset(THRESHOLDS) - {"log", "default"}

# Types that should use tolerant error handling
# Failed/Missed backups result in WARNING instead of CRITICAL
TOLERANT_TYPES = {
    'file', 'virtual', 'scale', 'dm', 'datacenter', 'mail'
}

# Known database types (for better classification)
DATABASE_TYPES = {
    'mssql', 'hana', 'db2', 'oracle', 'mysql'
}

# Known virtualization types
VIRTUAL_TYPES = {
    'virtual'
}

# Result priority when several schedules of one node disagree.
_LEVEL_PRIORITY = ('log', 'full', 'differential', 'incremental')
_FREQUENCY_PRIORITY = ('hourly', 'daily', 'weekly', 'monthly')

# Schedule names that start with a time of day, e.g. "00-00-00_...".
_TIME_PREFIX_RE = re.compile(r'^\d{2}-\d{2}-\d{2}_')

# Pending/Started backups younger than this are still reported as OK.
_PENDING_GRACE_SECONDS = 2 * 3600

# Age reported when the agent supplies no timestamp for the last backup.
# It is larger than every configured CRIT threshold on purpose.
_UNKNOWN_AGE_SECONDS = 999999


# ============================================================================
# Helper Functions
# ============================================================================

def extract_backup_type(node: str) -> str:
    """
    Extracts the backup type from a TSM node name.

    The type is taken from the last underscore-separated segment of the
    name. If that segment is purely numeric, the second-to-last segment is
    tried instead. Segments are compared case-insensitively against
    KNOWN_BACKUP_TYPES; everything else is reported as "unknown".

    Examples:
        MYSERVER_MSSQL   -> mssql
        DATABASE_HANA_01 -> hana
        FILESERVER_FILE  -> file
        VM_HYPERV_123    -> unknown (HYPERV is not a known type)

    Args:
        node: Node name from TSM

    Returns:
        str: Detected backup type in lowercase, or "unknown"
    """
    if '_' not in node:
        return "unknown"

    parts = node.split('_')
    last = parts[-1].lower()
    if last in KNOWN_BACKUP_TYPES:
        return last

    # Numeric suffix (e.g. "_01"): fall back to the segment before it.
    # parts has at least two elements here because the name contains '_'.
    if last.isdigit():
        second_last = parts[-2].lower()
        if second_last in KNOWN_BACKUP_TYPES:
            return second_last

    return "unknown"


def _classify_level(schedule_upper: str) -> str | None:
    """Returns the backup level named in one upper-cased schedule name, if any."""
    if 'LOG' in schedule_upper:
        return 'log'
    if 'FULL' in schedule_upper:
        return 'full'
    if '_INCR' in schedule_upper or 'INCREMENTAL' in schedule_upper:
        return 'incremental'
    if '_DIFF' in schedule_upper or 'DIFFERENTIAL' in schedule_upper:
        return 'differential'
    return None


def extract_backup_level(schedules: list[str]) -> str:
    """
    Extracts backup level from schedule names.

    Each schedule contributes at most one level (first match in the order
    log, full, incremental, differential). Across all schedules the result
    is chosen by priority: log > full > differential > incremental.

    Args:
        schedules: List of schedule names

    Returns:
        str: Detected backup level; 'full' if no schedule names a level
    """
    levels_found = {_classify_level(schedule.upper()) for schedule in schedules}

    for level in _LEVEL_PRIORITY:
        if level in levels_found:
            return level

    return 'full'  # Default


def _classify_frequency(schedule: str) -> str | None:
    """Returns the frequency implied by one schedule name, if any."""
    schedule_upper = schedule.upper()
    for keyword in ('HOURLY', 'DAILY', 'WEEKLY', 'MONTHLY'):
        if keyword in schedule_upper:
            return keyword.lower()

    # Time-based pattern (HH-MM-SS_...): LOG schedules count as hourly,
    # every other time-prefixed schedule as daily.
    if _TIME_PREFIX_RE.match(schedule):
        return 'hourly' if '_LOG' in schedule_upper else 'daily'

    return None


def extract_frequency(schedules: list[str]) -> str:
    """
    Extracts backup frequency from schedule names.

    Each schedule contributes at most one frequency. Across all schedules
    the result is chosen by priority: hourly > daily > weekly > monthly.

    Args:
        schedules: List of schedule names

    Returns:
        str: Detected frequency, or 'unknown' if none could be derived
    """
    frequencies_found = {_classify_frequency(schedule) for schedule in schedules}

    for frequency in _FREQUENCY_PRIORITY:
        if frequency in frequencies_found:
            return frequency

    return 'unknown'


def get_error_handling(backup_type: str) -> str:
    """
    Determines error handling strategy based on backup type.

    Tolerant types: Failed backups trigger WARNING instead of CRITICAL
    Strict types: Failed backups trigger CRITICAL

    Args:
        backup_type: Detected backup type

    Returns:
        str: 'tolerant' or 'strict'
    """
    return 'tolerant' if backup_type in TOLERANT_TYPES else 'strict'


def get_backup_category(backup_type: str) -> str:
    """
    Categorizes backup type for additional labeling.

    Categories: database, virtualization, filesystem, application, other

    Args:
        backup_type: Detected backup type

    Returns:
        str: Category name
    """
    if backup_type in DATABASE_TYPES:
        return 'database'
    if backup_type in VIRTUAL_TYPES:
        return 'virtualization'
    if backup_type in {'file', 'scale', 'dm', 'datacenter'}:
        return 'filesystem'
    if backup_type in {'mail', 'exchange'}:
        return 'application'
    return 'other'


def get_thresholds(backup_type: str, backup_level: str) -> dict[str, int]:
    """
    Returns type and level-specific thresholds.

    Priority:
    1. If level is 'log', use log thresholds
    2. If type has specific thresholds, use those
    3. Use default thresholds

    Args:
        backup_type: Detected backup type
        backup_level: Detected backup level

    Returns:
        dict: {"warn": seconds, "crit": seconds}
    """
    if backup_level == 'log':
        return THRESHOLDS['log']
    return THRESHOLDS.get(backup_type, THRESHOLDS['default'])


def calculate_state(
    statuses: list[str],
    last_time: int,
    backup_type: str,
    error_handling: str,
) -> tuple[State, str]:
    """
    Calculates CheckMK state from backup statuses.

    Logic:
    - At least 1x Completed -> OK
    - Only Pending/Started (recent) -> OK
    - Only Pending/Started (old) -> WARN
    - Failed/Missed + tolerant -> WARN
    - Failed/Missed + strict -> CRIT

    Args:
        statuses: List of backup statuses (compared case-insensitively)
        last_time: Timestamp of last backup (falsy if unknown)
        backup_type: Detected backup type (currently not used by the logic;
            kept for interface compatibility)
        error_handling: 'tolerant' or 'strict'

    Returns:
        tuple: (State, status_text)
    """
    statuses_lower = [s.lower() for s in statuses]

    # At least one completed backup -> OK
    if "completed" in statuses_lower:
        return (State.OK, "Completed")

    # Only Pending/Started
    # NOTE(review): all() is true for an empty list, so a node without any
    # statuses is also handled by this branch - confirm this is intended.
    if all(s in ("pending", "started") for s in statuses_lower):
        if not last_time:
            return (State.WARN, "Pending")
        age = int(datetime.now().timestamp()) - last_time
        if age < _PENDING_GRACE_SECONDS:
            return (State.OK, "Pending/Started")
        return (State.WARN, "Pending (>2h)")

    # Failed or Missed backups ("failed" is matched as a substring,
    # "missed" must match exactly)
    has_failed = any("failed" in s for s in statuses_lower)
    has_missed = "missed" in statuses_lower
    if has_failed or has_missed:
        if error_handling == 'tolerant':
            return (State.WARN, "Failed (partial)")
        return (State.CRIT, "Failed/Missed")

    return (State.CRIT, "Unknown State")


# ============================================================================
# Agent Section Registration
# ============================================================================

def parse_tsm_backups(string_table: StringTable) -> Mapping[str, Any]:
    """
    Parses the tsm_backups agent section.

    Expected format: one JSON object from the agent plugin, keyed by node
    name.

    Checkmk hands the section over split into lines and whitespace-separated
    tokens, so a JSON document that contains spaces or line breaks arrives
    in several pieces. All pieces are re-joined with single spaces before
    decoding; a payload that arrives as one token is decoded unchanged.

    Args:
        string_table: Raw agent output

    Returns:
        dict: Parsed backup data per node; empty if the section is empty,
            is not valid JSON, or does not contain a JSON object
    """
    if not string_table:
        return {}

    json_str = " ".join(" ".join(line) for line in string_table)
    try:
        parsed = json.loads(json_str)
    except json.JSONDecodeError:
        return {}

    # Discovery and check iterate the section as a node -> data mapping.
    return parsed if isinstance(parsed, dict) else {}


agent_section_tsm_backups = AgentSection(
    name="tsm_backups",
    parse_function=parse_tsm_backups,
)


# ============================================================================
# Check Plugin Registration
# ============================================================================

def discover_tsm_backups(section: Mapping[str, Any]) -> DiscoveryResult:
    """
    Discovery function - creates one service per logical node.

    Services are created with dynamic labels:
    - backup_type: Dynamically detected from node name
    - backup_category: database/virtualization/filesystem/application/other
    - backup_system: Always "tsm"
    - frequency: hourly/daily/weekly/monthly/unknown
    - backup_level: log/full/differential/incremental
    - error_handling: tolerant/strict
    - node_name: Original node name

    Args:
        section: Parsed agent data

    Yields:
        Service: CheckMK service objects with labels
    """
    for node, data in section.items():
        # A node entry without schedules is still discovered; level and
        # frequency then fall back to their defaults.
        schedules = data.get("schedules", [])

        # Extract metadata dynamically
        backup_type = extract_backup_type(node)
        backup_level = extract_backup_level(schedules)
        frequency = extract_frequency(schedules)
        error_handling = get_error_handling(backup_type)
        category = get_backup_category(backup_type)

        # Create service with dynamic labels
        yield Service(
            item=node,
            labels=[
                ServiceLabel("backup_type", backup_type),
                ServiceLabel("backup_category", category),
                ServiceLabel("backup_system", "tsm"),
                ServiceLabel("frequency", frequency),
                ServiceLabel("backup_level", backup_level),
                ServiceLabel("error_handling", error_handling),
                ServiceLabel("node_name", node),
            ],
        )


def check_tsm_backups(item: str, section: Mapping[str, Any]) -> CheckResult:
    """
    Check function - evaluates backup status and generates metrics.

    Checks:
    - Backup completion status
    - Backup age against type-specific thresholds
    - Number of backup jobs

    Args:
        item: Node name (service identifier)
        section: Parsed agent data

    Yields:
        Result: Check results with state
        Metric: Performance metrics
    """
    if item not in section:
        yield Result(
            state=State.UNKNOWN,
            summary=f"Backup {item} not found in data",
        )
        return

    data = section[item]
    # Missing keys are treated like empty values instead of crashing the check.
    schedules = data.get("schedules", [])
    statuses = data.get("statuses", [])
    last_time = data.get("last")
    job_count = data.get("count", 0)

    # Extract metadata dynamically
    backup_type = extract_backup_type(item)
    backup_level = extract_backup_level(schedules)
    frequency = extract_frequency(schedules)
    error_handling = get_error_handling(backup_type)
    category = get_backup_category(backup_type)

    # Calculate state
    state, status_text = calculate_state(
        statuses,
        last_time,
        backup_type,
        error_handling,
    )

    # Calculate backup age
    if last_time:
        age_seconds = int(datetime.now().timestamp()) - last_time
        age_txt = render.timespan(age_seconds)
    else:
        age_seconds = _UNKNOWN_AGE_SECONDS
        age_txt = "unknown"

    # Get type-specific thresholds
    thresholds = get_thresholds(backup_type, backup_level)
    warn_seconds = thresholds["warn"]
    crit_seconds = thresholds["crit"]

    # The age only escalates a state that is OK so far (a completed or a
    # recently pending/started backup); WARN/CRIT from the status are kept.
    if state == State.OK:
        if age_seconds > crit_seconds:
            state = State.CRIT
        elif age_seconds > warn_seconds:
            state = State.WARN

    # Build summary
    summary = (
        f"Type={backup_type.upper()} ({category}), "
        f"Level={backup_level.upper()}, "
        f"Freq={frequency}, "
        f"Status={status_text}, "
        f"Last={age_txt}, "
        f"Jobs={job_count}"
    )

    # Main result
    yield Result(state=state, summary=summary)

    # Detailed information
    if state in (State.WARN, State.CRIT):
        details = (
            f"Thresholds: WARN={render.timespan(warn_seconds)}, "
            f"CRIT={render.timespan(crit_seconds)}"
        )
        yield Result(state=State.OK, notice=details)

    # Metrics
    yield Metric(
        name="backup_age",
        value=age_seconds,
        levels=(warn_seconds, crit_seconds),
        boundaries=(0, None),
    )
    yield Metric(
        name="backup_jobs",
        value=job_count,
    )


check_plugin_tsm_backups = CheckPlugin(
    name="tsm_backups",
    service_name="TSM Backup %s",
    discovery_function=discover_tsm_backups,
    check_function=check_tsm_backups,
    sections=["tsm_backups"],
)