From 91522703ad03efbba92c8a885ddd2c030d58a881 Mon Sep 17 00:00:00 2001
From: magadimn
Date: Tue, 13 Jan 2026 23:33:28 +0100
Subject: [PATCH] add content to tsm_backup_check.py

---
Review notes (this area is ignored when the patch is applied):
* TSM/tms_backup_check.py looks like a misspelled duplicate of
  TSM/tsm_backup_check.py. It is kept because the original patch creates
  it, but it should probably be dropped.
* The blob ids in the "index" lines are unchanged from the original patch
  and no longer match the edited file content.

 TSM/tms_backup_check.py | 516 ++++++++++++++++++++++++++++++++++++++++
 TSM/tsm_backup_check.py | 516 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 1032 insertions(+)
 create mode 100644 TSM/tms_backup_check.py

diff --git a/TSM/tms_backup_check.py b/TSM/tms_backup_check.py
new file mode 100644
index 0000000..16aa4b7
--- /dev/null
+++ b/TSM/tms_backup_check.py
@@ -0,0 +1,516 @@
+#!/usr/bin/env python3
+"""
+TSM Backup Status Check Plugin for CheckMK 2.3+
+- Parses tsm_backups agent section
+- Creates services with labels (backup_type, frequency, level, error_handling)
+- Dynamic backup type detection
+- Configurable type-specific thresholds
+- Tolerant error handling for FILE/VIRTUAL backups
+
+Installation: /omd/sites/monitoring/local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py
+Check name: tsm_backups
+
+Author: Marius Gielnik
+Version: 4.1 - Fixed Label Detection & Comprehensive Documentation
+"""
+from typing import Any, Mapping
+from datetime import datetime
+import json
+import re
+
+from cmk.agent_based.v2 import (
+    AgentSection,
+    CheckPlugin,
+    CheckResult,
+    DiscoveryResult,
+    Result,
+    Service,
+    ServiceLabel,
+    State,
+    Metric,
+    render,
+    StringTable,
+)
+
+# ============================================================================
+# Configuration Section
+# ============================================================================
+
+# Type-specific thresholds (in seconds). Besides "log" and "default", a key
+# is only used if extract_backup_type() can return it.
+THRESHOLDS = {
+    "log": {"warn": 4 * 3600, "crit": 8 * 3600},  # Hourly LOG backups
+    "mssql": {"warn": 26 * 3600, "crit": 48 * 3600},  # Daily DB backups
+    "hana": {"warn": 26 * 3600, "crit": 48 * 3600},
+    "db2": {"warn": 26 * 3600, "crit": 48 * 3600},
+    "oracle": {"warn": 26 * 3600, "crit": 48 * 3600},
+    "mysql": {"warn": 26 * 3600, "crit": 48 * 3600},
+    "file": {"warn": 36 * 3600, "crit": 72 * 3600},  # More tolerant
+    "virtual": {"warn": 36 * 3600, "crit": 72 * 3600},
+    "mail": {"warn": 26 * 3600, "crit": 48 * 3600},
+    "scale": {"warn": 36 * 3600, "crit": 72 * 3600},
+    "dm": {"warn": 36 * 3600, "crit": 72 * 3600},
+    "datacenter": {"warn": 36 * 3600, "crit": 72 * 3600},
+    "default": {"warn": 26 * 3600, "crit": 48 * 3600},  # Fallback
+}
+
+# Types that should use tolerant error handling
+# Failed/Missed backups result in WARNING instead of CRITICAL
+TOLERANT_TYPES = {
+    'file', 'virtual', 'scale', 'dm', 'datacenter', 'mail'
+}
+
+# Known database types (for better classification)
+DATABASE_TYPES = {
+    'mssql', 'hana', 'db2', 'oracle', 'mysql'
+}
+
+# Known virtualization types
+VIRTUAL_TYPES = {
+    'virtual'
+}
+
+
+# ============================================================================
+# Helper Functions
+# ============================================================================
+
+def extract_backup_type(node: str) -> str:
+    """
+    Extract the backup type from a TSM node name.
+
+    Only the known types are recognised; anything else yields "unknown".
+
+    Examples:
+        MYSERVER_MSSQL -> mssql
+        DATABASE_HANA_01 -> hana
+        FILESERVER_FILE -> file
+        VM_HYPERV_123 -> unknown (HYPERV is not a known type)
+
+    Args:
+        node: Node name from TSM
+
+    Returns:
+        str: Detected backup type in lowercase, or "unknown"
+    """
+    if '_' not in node:
+        return "unknown"
+
+    parts = node.split('_')
+    last = parts[-1].upper()
+
+    # Known backup types (fixed list)
+    known_types = [
+        'MSSQL', 'HANA', 'FILE', 'ORACLE', 'DB2', 'SCALE', 'DM',
+        'DATACENTER', 'VIRTUAL', 'MAIL', 'MYSQL'
+    ]
+
+    if last in known_types:
+        return last.lower()
+
+    # If the last segment is numeric, try the second-to-last one
+    if last.isdigit() and len(parts) > 1:
+        second_last = parts[-2].upper()
+        if second_last in known_types:
+            return second_last.lower()
+
+    return "unknown"
+
+
+def extract_backup_level(schedules: list) -> str:
+    """
+    Extracts backup level from schedule names.
+
+    Priority: log > full > differential > incremental
+
+    Args:
+        schedules: List of schedule names
+
+    Returns:
+        str: Detected backup level ('full' if nothing matches)
+    """
+    levels_found = set()
+
+    for schedule in schedules:
+        schedule_upper = schedule.upper()
+
+        # Plain substring tests: 'LOG' / 'FULL' anywhere in the name match.
+        if 'LOG' in schedule_upper:
+            levels_found.add('log')
+        elif 'FULL' in schedule_upper:
+            levels_found.add('full')
+        elif '_INCR' in schedule_upper or 'INCREMENTAL' in schedule_upper:
+            levels_found.add('incremental')
+        elif '_DIFF' in schedule_upper or 'DIFFERENTIAL' in schedule_upper:
+            levels_found.add('differential')
+
+    # Return in priority order
+    if 'log' in levels_found:
+        return 'log'
+    if 'full' in levels_found:
+        return 'full'
+    if 'differential' in levels_found:
+        return 'differential'
+    if 'incremental' in levels_found:
+        return 'incremental'
+
+    return 'full'  # Default
+
+
+def extract_frequency(schedules: list) -> str:
+    """
+    Extracts backup frequency from schedule names.
+
+    Priority: hourly > daily > weekly > monthly
+
+    Args:
+        schedules: List of schedule names
+
+    Returns:
+        str: Detected frequency
+    """
+    frequencies_found = set()
+
+    for schedule in schedules:
+        schedule_upper = schedule.upper()
+
+        if 'HOURLY' in schedule_upper:
+            frequencies_found.add('hourly')
+        elif 'DAILY' in schedule_upper:
+            frequencies_found.add('daily')
+        elif 'WEEKLY' in schedule_upper:
+            frequencies_found.add('weekly')
+        elif 'MONTHLY' in schedule_upper:
+            frequencies_found.add('monthly')
+        # Time-based pattern detection (HH-MM-SS_)
+        elif re.match(r'^\d{2}-\d{2}-\d{2}_', schedule):
+            # Timed LOG schedules count as hourly, every other timed
+            # schedule (including 00-00-00) as daily.
+            if '_LOG' in schedule_upper:
+                frequencies_found.add('hourly')
+            else:
+                frequencies_found.add('daily')
+
+    # Return in priority order
+    if 'hourly' in frequencies_found:
+        return 'hourly'
+    if 'daily' in frequencies_found:
+        return 'daily'
+    if 'weekly' in frequencies_found:
+        return 'weekly'
+    if 'monthly' in frequencies_found:
+        return 'monthly'
+
+    return 'unknown'
+
+
+def get_error_handling(backup_type: str) -> str:
+    """
+    Determines error handling strategy based on backup type.
+
+    Tolerant types: Failed backups trigger WARNING instead of CRITICAL
+    Strict types: Failed backups trigger CRITICAL
+
+    Args:
+        backup_type: Detected backup type
+
+    Returns:
+        str: 'tolerant' or 'strict'
+    """
+    return 'tolerant' if backup_type in TOLERANT_TYPES else 'strict'
+
+
+def get_backup_category(backup_type: str) -> str:
+    """
+    Categorizes backup type for additional labeling.
+
+    Categories: database, virtualization, filesystem, application, other
+
+    Args:
+        backup_type: Detected backup type
+
+    Returns:
+        str: Category name
+    """
+    if backup_type in DATABASE_TYPES:
+        return 'database'
+    elif backup_type in VIRTUAL_TYPES:
+        return 'virtualization'
+    elif backup_type in {'file', 'scale', 'dm', 'datacenter'}:
+        return 'filesystem'
+    elif backup_type in {'mail', 'exchange'}:
+        return 'application'
+    else:
+        return 'other'
+
+
+def get_thresholds(backup_type: str, backup_level: str) -> dict:
+    """
+    Returns type and level-specific thresholds.
+
+    Priority:
+        1. If level is 'log', use log thresholds
+        2. If type has specific thresholds, use those
+        3. Use default thresholds
+
+    Args:
+        backup_type: Detected backup type
+        backup_level: Detected backup level
+
+    Returns:
+        dict: {"warn": seconds, "crit": seconds}
+    """
+    if backup_level == 'log':
+        return THRESHOLDS['log']
+    elif backup_type in THRESHOLDS:
+        return THRESHOLDS[backup_type]
+    else:
+        return THRESHOLDS['default']
+
+
+def calculate_state(statuses: list, last_time: int, backup_type: str,
+                    error_handling: str) -> tuple:
+    """
+    Calculates CheckMK state from backup statuses.
+
+    Logic:
+        - At least 1x Completed -> OK
+        - Only Pending/Started (recent) -> OK
+        - Only Pending/Started (old) -> WARN
+        - Failed/Missed + tolerant -> WARN
+        - Failed/Missed + strict -> CRIT
+
+    Args:
+        statuses: List of backup statuses
+        last_time: Timestamp of last backup
+        backup_type: Detected backup type (currently unused)
+        error_handling: 'tolerant' or 'strict'
+
+    Returns:
+        tuple: (State, status_text)
+    """
+    statuses_lower = [s.lower() for s in statuses]
+
+    # At least one completed backup -> OK
+    if "completed" in statuses_lower:
+        return (State.OK, "Completed")
+
+    # Only Pending/Started
+    # NOTE: all() is also True for an empty status list.
+    only_pending_started = all(s in ["pending", "started"] for s in statuses_lower)
+    if only_pending_started:
+        if last_time:
+            age = int(datetime.now().timestamp()) - last_time
+            if age < 2 * 3600:  # 2 hours
+                return (State.OK, "Pending/Started")
+            else:
+                return (State.WARN, "Pending (>2h)")
+        else:
+            return (State.WARN, "Pending")
+
+    # Failed or Missed backups
+    has_failed = any("failed" in s for s in statuses_lower)
+    has_missed = "missed" in statuses_lower
+
+    if has_failed or has_missed:
+        if error_handling == 'tolerant':
+            return (State.WARN, "Failed (partial)")
+        else:
+            return (State.CRIT, "Failed/Missed")
+
+    return (State.CRIT, "Unknown State")
+
+
+# ============================================================================
+# Agent Section Registration
+# ============================================================================
+
+def parse_tsm_backups(string_table: StringTable) -> Mapping[str, Any]:
+    """
+    Parses the tsm_backups agent section.
+
+    Expected format: JSON object on the first line of the section
+
+    Args:
+        string_table: Raw agent output
+
+    Returns:
+        dict: Parsed backup data per node ({} if missing or invalid)
+    """
+    if not string_table or not string_table[0]:
+        return {}
+
+    # Re-join the columns of the first line: unless the section uses
+    # sep(0), the JSON text is split at every whitespace.
+    json_str = " ".join(string_table[0])
+    try:
+        parsed = json.loads(json_str)
+    except json.JSONDecodeError:
+        return {}
+
+    # Discovery and check index the section by node name.
+    return parsed if isinstance(parsed, dict) else {}
+
+
+agent_section_tsm_backups = AgentSection(
+    name="tsm_backups",
+    parse_function=parse_tsm_backups,
+)
+
+
+# ============================================================================
+# Check Plugin Registration
+# ============================================================================
+
+def discover_tsm_backups(section: Mapping[str, Any]) -> DiscoveryResult:
+    """
+    Discovery function - creates one service per logical node.
+
+    Services are created with dynamic labels:
+        - backup_type: Dynamically detected from node name
+        - backup_category: database/virtualization/filesystem/application/other
+        - backup_system: Always "tsm"
+        - frequency: hourly/daily/weekly/monthly/unknown
+        - backup_level: log/full/differential/incremental
+        - error_handling: tolerant/strict
+        - node_name: Original node name
+
+    Args:
+        section: Parsed agent data
+
+    Yields:
+        Service: CheckMK service objects with labels
+    """
+    for node in section:
+        data = section[node]
+        # A node without schedule data must not abort the whole discovery.
+        schedules = data.get("schedules") or []
+
+        # Extract metadata dynamically
+        backup_type = extract_backup_type(node)
+        backup_level = extract_backup_level(schedules)
+        frequency = extract_frequency(schedules)
+        error_handling = get_error_handling(backup_type)
+        category = get_backup_category(backup_type)
+
+        # Create service with dynamic labels
+        yield Service(
+            item=node,
+            labels=[
+                ServiceLabel("backup_type", backup_type),
+                ServiceLabel("backup_category", category),
+                ServiceLabel("backup_system", "tsm"),
+                ServiceLabel("frequency", frequency),
+                ServiceLabel("backup_level", backup_level),
+                ServiceLabel("error_handling", error_handling),
+                ServiceLabel("node_name", node),
+            ]
+        )
+
+
+def check_tsm_backups(item: str, section: Mapping[str, Any]) -> CheckResult:
+    """
+    Check function - evaluates backup status and generates metrics.
+
+    Checks:
+        - Backup completion status
+        - Backup age against type-specific thresholds
+        - Number of backup jobs
+
+    Args:
+        item: Node name (service identifier)
+        section: Parsed agent data
+
+    Yields:
+        Result: Check results with state
+        Metric: Performance metrics (backup_age only if the age is known)
+    """
+    if item not in section:
+        yield Result(
+            state=State.UNKNOWN,
+            summary=f"Backup {item} not found in data"
+        )
+        return
+
+    data = section[item]
+    schedules = data.get("schedules") or []
+
+    # Extract metadata dynamically
+    backup_type = extract_backup_type(item)
+    backup_level = extract_backup_level(schedules)
+    frequency = extract_frequency(schedules)
+    error_handling = get_error_handling(backup_type)
+    category = get_backup_category(backup_type)
+
+    # Calculate state
+    state, status_text = calculate_state(
+        data["statuses"],
+        data["last"],
+        backup_type,
+        error_handling
+    )
+
+    # Calculate backup age (None = no timestamp; no fake metric value)
+    if data["last"]:
+        age_seconds = int(datetime.now().timestamp()) - data["last"]
+        age_txt = render.timespan(age_seconds)
+    else:
+        age_seconds = None
+        age_txt = "unknown"
+
+    # Get type-specific thresholds
+    thresholds = get_thresholds(backup_type, backup_level)
+    warn_seconds = thresholds["warn"]
+    crit_seconds = thresholds["crit"]
+
+    # Check age against thresholds (only while the status-based state is
+    # OK); an unknown age stays CRIT, as with the former 999999 sentinel.
+    if state == State.OK:
+        if age_seconds is None or age_seconds > crit_seconds:
+            state = State.CRIT
+        elif age_seconds > warn_seconds:
+            state = State.WARN
+
+    # Build summary
+    summary = (
+        f"Type={backup_type.upper()} ({category}), "
+        f"Level={backup_level.upper()}, "
+        f"Freq={frequency}, "
+        f"Status={status_text}, "
+        f"Last={age_txt}, "
+        f"Jobs={data['count']}"
+    )
+
+    # Main result
+    yield Result(state=state, summary=summary)
+
+    # Detailed information
+    if state in [State.WARN, State.CRIT]:
+        details = (
+            f"Thresholds: WARN={render.timespan(warn_seconds)}, "
+            f"CRIT={render.timespan(crit_seconds)}"
+        )
+        yield Result(state=State.OK, notice=details)
+
+    # Metrics
+    if age_seconds is not None:
+        yield Metric(
+            name="backup_age",
+            value=age_seconds,
+            levels=(warn_seconds, crit_seconds),
+            boundaries=(0, None),
+        )
+
+    yield Metric(
+        name="backup_jobs",
+        value=data["count"],
+    )
+
+
+check_plugin_tsm_backups = CheckPlugin(
+    name="tsm_backups",
+    service_name="TSM Backup %s",
+    discovery_function=discover_tsm_backups,
+    check_function=check_tsm_backups,
+    sections=["tsm_backups"],
+)
diff --git a/TSM/tsm_backup_check.py b/TSM/tsm_backup_check.py
index e69de29..16aa4b7 100644
--- a/TSM/tsm_backup_check.py
+++ b/TSM/tsm_backup_check.py
@@ -0,0 +1,516 @@
+#!/usr/bin/env python3
+"""
+TSM Backup Status Check Plugin for CheckMK 2.3+
+- Parses tsm_backups agent section
+- Creates services with labels (backup_type, frequency, level, error_handling)
+- Dynamic backup type detection
+- Configurable type-specific thresholds
+- Tolerant error handling for FILE/VIRTUAL backups
+
+Installation: /omd/sites/monitoring/local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py
+Check name: tsm_backups
+
+Author: Marius Gielnik
+Version: 4.1 - Fixed Label Detection & Comprehensive Documentation
+"""
+from typing import Any, Mapping
+from datetime import datetime
+import json
+import re
+
+from cmk.agent_based.v2 import (
+    AgentSection,
+    CheckPlugin,
+    CheckResult,
+    DiscoveryResult,
+    Result,
+    Service,
+    ServiceLabel,
+    State,
+    Metric,
+    render,
+    StringTable,
+)
+
+# ============================================================================
+# Configuration Section
+# ============================================================================
+
+# Type-specific thresholds (in seconds). Besides "log" and "default", a key
+# is only used if extract_backup_type() can return it.
+THRESHOLDS = {
+    "log": {"warn": 4 * 3600, "crit": 8 * 3600},  # Hourly LOG backups
+    "mssql": {"warn": 26 * 3600, "crit": 48 * 3600},  # Daily DB backups
+    "hana": {"warn": 26 * 3600, "crit": 48 * 3600},
+    "db2": {"warn": 26 * 3600, "crit": 48 * 3600},
+    "oracle": {"warn": 26 * 3600, "crit": 48 * 3600},
+    "mysql": {"warn": 26 * 3600, "crit": 48 * 3600},
+    "file": {"warn": 36 * 3600, "crit": 72 * 3600},  # More tolerant
+    "virtual": {"warn": 36 * 3600, "crit": 72 * 3600},
+    "mail": {"warn": 26 * 3600, "crit": 48 * 3600},
+    "scale": {"warn": 36 * 3600, "crit": 72 * 3600},
+    "dm": {"warn": 36 * 3600, "crit": 72 * 3600},
+    "datacenter": {"warn": 36 * 3600, "crit": 72 * 3600},
+    "default": {"warn": 26 * 3600, "crit": 48 * 3600},  # Fallback
+}
+
+# Types that should use tolerant error handling
+# Failed/Missed backups result in WARNING instead of CRITICAL
+TOLERANT_TYPES = {
+    'file', 'virtual', 'scale', 'dm', 'datacenter', 'mail'
+}
+
+# Known database types (for better classification)
+DATABASE_TYPES = {
+    'mssql', 'hana', 'db2', 'oracle', 'mysql'
+}
+
+# Known virtualization types
+VIRTUAL_TYPES = {
+    'virtual'
+}
+
+
+# ============================================================================
+# Helper Functions
+# ============================================================================
+
+def extract_backup_type(node: str) -> str:
+    """
+    Extract the backup type from a TSM node name.
+
+    Only the known types are recognised; anything else yields "unknown".
+
+    Examples:
+        MYSERVER_MSSQL -> mssql
+        DATABASE_HANA_01 -> hana
+        FILESERVER_FILE -> file
+        VM_HYPERV_123 -> unknown (HYPERV is not a known type)
+
+    Args:
+        node: Node name from TSM
+
+    Returns:
+        str: Detected backup type in lowercase, or "unknown"
+    """
+    if '_' not in node:
+        return "unknown"
+
+    parts = node.split('_')
+    last = parts[-1].upper()
+
+    # Known backup types (fixed list)
+    known_types = [
+        'MSSQL', 'HANA', 'FILE', 'ORACLE', 'DB2', 'SCALE', 'DM',
+        'DATACENTER', 'VIRTUAL', 'MAIL', 'MYSQL'
+    ]
+
+    if last in known_types:
+        return last.lower()
+
+    # If the last segment is numeric, try the second-to-last one
+    if last.isdigit() and len(parts) > 1:
+        second_last = parts[-2].upper()
+        if second_last in known_types:
+            return second_last.lower()
+
+    return "unknown"
+
+
+def extract_backup_level(schedules: list) -> str:
+    """
+    Extracts backup level from schedule names.
+
+    Priority: log > full > differential > incremental
+
+    Args:
+        schedules: List of schedule names
+
+    Returns:
+        str: Detected backup level ('full' if nothing matches)
+    """
+    levels_found = set()
+
+    for schedule in schedules:
+        schedule_upper = schedule.upper()
+
+        # Plain substring tests: 'LOG' / 'FULL' anywhere in the name match.
+        if 'LOG' in schedule_upper:
+            levels_found.add('log')
+        elif 'FULL' in schedule_upper:
+            levels_found.add('full')
+        elif '_INCR' in schedule_upper or 'INCREMENTAL' in schedule_upper:
+            levels_found.add('incremental')
+        elif '_DIFF' in schedule_upper or 'DIFFERENTIAL' in schedule_upper:
+            levels_found.add('differential')
+
+    # Return in priority order
+    if 'log' in levels_found:
+        return 'log'
+    if 'full' in levels_found:
+        return 'full'
+    if 'differential' in levels_found:
+        return 'differential'
+    if 'incremental' in levels_found:
+        return 'incremental'
+
+    return 'full'  # Default
+
+
+def extract_frequency(schedules: list) -> str:
+    """
+    Extracts backup frequency from schedule names.
+
+    Priority: hourly > daily > weekly > monthly
+
+    Args:
+        schedules: List of schedule names
+
+    Returns:
+        str: Detected frequency
+    """
+    frequencies_found = set()
+
+    for schedule in schedules:
+        schedule_upper = schedule.upper()
+
+        if 'HOURLY' in schedule_upper:
+            frequencies_found.add('hourly')
+        elif 'DAILY' in schedule_upper:
+            frequencies_found.add('daily')
+        elif 'WEEKLY' in schedule_upper:
+            frequencies_found.add('weekly')
+        elif 'MONTHLY' in schedule_upper:
+            frequencies_found.add('monthly')
+        # Time-based pattern detection (HH-MM-SS_)
+        elif re.match(r'^\d{2}-\d{2}-\d{2}_', schedule):
+            # Timed LOG schedules count as hourly, every other timed
+            # schedule (including 00-00-00) as daily.
+            if '_LOG' in schedule_upper:
+                frequencies_found.add('hourly')
+            else:
+                frequencies_found.add('daily')
+
+    # Return in priority order
+    if 'hourly' in frequencies_found:
+        return 'hourly'
+    if 'daily' in frequencies_found:
+        return 'daily'
+    if 'weekly' in frequencies_found:
+        return 'weekly'
+    if 'monthly' in frequencies_found:
+        return 'monthly'
+
+    return 'unknown'
+
+
+def get_error_handling(backup_type: str) -> str:
+    """
+    Determines error handling strategy based on backup type.
+
+    Tolerant types: Failed backups trigger WARNING instead of CRITICAL
+    Strict types: Failed backups trigger CRITICAL
+
+    Args:
+        backup_type: Detected backup type
+
+    Returns:
+        str: 'tolerant' or 'strict'
+    """
+    return 'tolerant' if backup_type in TOLERANT_TYPES else 'strict'
+
+
+def get_backup_category(backup_type: str) -> str:
+    """
+    Categorizes backup type for additional labeling.
+
+    Categories: database, virtualization, filesystem, application, other
+
+    Args:
+        backup_type: Detected backup type
+
+    Returns:
+        str: Category name
+    """
+    if backup_type in DATABASE_TYPES:
+        return 'database'
+    elif backup_type in VIRTUAL_TYPES:
+        return 'virtualization'
+    elif backup_type in {'file', 'scale', 'dm', 'datacenter'}:
+        return 'filesystem'
+    elif backup_type in {'mail', 'exchange'}:
+        return 'application'
+    else:
+        return 'other'
+
+
+def get_thresholds(backup_type: str, backup_level: str) -> dict:
+    """
+    Returns type and level-specific thresholds.
+
+    Priority:
+        1. If level is 'log', use log thresholds
+        2. If type has specific thresholds, use those
+        3. Use default thresholds
+
+    Args:
+        backup_type: Detected backup type
+        backup_level: Detected backup level
+
+    Returns:
+        dict: {"warn": seconds, "crit": seconds}
+    """
+    if backup_level == 'log':
+        return THRESHOLDS['log']
+    elif backup_type in THRESHOLDS:
+        return THRESHOLDS[backup_type]
+    else:
+        return THRESHOLDS['default']
+
+
+def calculate_state(statuses: list, last_time: int, backup_type: str,
+                    error_handling: str) -> tuple:
+    """
+    Calculates CheckMK state from backup statuses.
+
+    Logic:
+        - At least 1x Completed -> OK
+        - Only Pending/Started (recent) -> OK
+        - Only Pending/Started (old) -> WARN
+        - Failed/Missed + tolerant -> WARN
+        - Failed/Missed + strict -> CRIT
+
+    Args:
+        statuses: List of backup statuses
+        last_time: Timestamp of last backup
+        backup_type: Detected backup type (currently unused)
+        error_handling: 'tolerant' or 'strict'
+
+    Returns:
+        tuple: (State, status_text)
+    """
+    statuses_lower = [s.lower() for s in statuses]
+
+    # At least one completed backup -> OK
+    if "completed" in statuses_lower:
+        return (State.OK, "Completed")
+
+    # Only Pending/Started
+    # NOTE: all() is also True for an empty status list.
+    only_pending_started = all(s in ["pending", "started"] for s in statuses_lower)
+    if only_pending_started:
+        if last_time:
+            age = int(datetime.now().timestamp()) - last_time
+            if age < 2 * 3600:  # 2 hours
+                return (State.OK, "Pending/Started")
+            else:
+                return (State.WARN, "Pending (>2h)")
+        else:
+            return (State.WARN, "Pending")
+
+    # Failed or Missed backups
+    has_failed = any("failed" in s for s in statuses_lower)
+    has_missed = "missed" in statuses_lower
+
+    if has_failed or has_missed:
+        if error_handling == 'tolerant':
+            return (State.WARN, "Failed (partial)")
+        else:
+            return (State.CRIT, "Failed/Missed")
+
+    return (State.CRIT, "Unknown State")
+
+
+# ============================================================================
+# Agent Section Registration
+# ============================================================================
+
+def parse_tsm_backups(string_table: StringTable) -> Mapping[str, Any]:
+    """
+    Parses the tsm_backups agent section.
+
+    Expected format: JSON object on the first line of the section
+
+    Args:
+        string_table: Raw agent output
+
+    Returns:
+        dict: Parsed backup data per node ({} if missing or invalid)
+    """
+    if not string_table or not string_table[0]:
+        return {}
+
+    # Re-join the columns of the first line: unless the section uses
+    # sep(0), the JSON text is split at every whitespace.
+    json_str = " ".join(string_table[0])
+    try:
+        parsed = json.loads(json_str)
+    except json.JSONDecodeError:
+        return {}
+
+    # Discovery and check index the section by node name.
+    return parsed if isinstance(parsed, dict) else {}
+
+
+agent_section_tsm_backups = AgentSection(
+    name="tsm_backups",
+    parse_function=parse_tsm_backups,
+)
+
+
+# ============================================================================
+# Check Plugin Registration
+# ============================================================================
+
+def discover_tsm_backups(section: Mapping[str, Any]) -> DiscoveryResult:
+    """
+    Discovery function - creates one service per logical node.
+
+    Services are created with dynamic labels:
+        - backup_type: Dynamically detected from node name
+        - backup_category: database/virtualization/filesystem/application/other
+        - backup_system: Always "tsm"
+        - frequency: hourly/daily/weekly/monthly/unknown
+        - backup_level: log/full/differential/incremental
+        - error_handling: tolerant/strict
+        - node_name: Original node name
+
+    Args:
+        section: Parsed agent data
+
+    Yields:
+        Service: CheckMK service objects with labels
+    """
+    for node in section:
+        data = section[node]
+        # A node without schedule data must not abort the whole discovery.
+        schedules = data.get("schedules") or []
+
+        # Extract metadata dynamically
+        backup_type = extract_backup_type(node)
+        backup_level = extract_backup_level(schedules)
+        frequency = extract_frequency(schedules)
+        error_handling = get_error_handling(backup_type)
+        category = get_backup_category(backup_type)
+
+        # Create service with dynamic labels
+        yield Service(
+            item=node,
+            labels=[
+                ServiceLabel("backup_type", backup_type),
+                ServiceLabel("backup_category", category),
+                ServiceLabel("backup_system", "tsm"),
+                ServiceLabel("frequency", frequency),
+                ServiceLabel("backup_level", backup_level),
+                ServiceLabel("error_handling", error_handling),
+                ServiceLabel("node_name", node),
+            ]
+        )
+
+
+def check_tsm_backups(item: str, section: Mapping[str, Any]) -> CheckResult:
+    """
+    Check function - evaluates backup status and generates metrics.
+
+    Checks:
+        - Backup completion status
+        - Backup age against type-specific thresholds
+        - Number of backup jobs
+
+    Args:
+        item: Node name (service identifier)
+        section: Parsed agent data
+
+    Yields:
+        Result: Check results with state
+        Metric: Performance metrics (backup_age only if the age is known)
+    """
+    if item not in section:
+        yield Result(
+            state=State.UNKNOWN,
+            summary=f"Backup {item} not found in data"
+        )
+        return
+
+    data = section[item]
+    schedules = data.get("schedules") or []
+
+    # Extract metadata dynamically
+    backup_type = extract_backup_type(item)
+    backup_level = extract_backup_level(schedules)
+    frequency = extract_frequency(schedules)
+    error_handling = get_error_handling(backup_type)
+    category = get_backup_category(backup_type)
+
+    # Calculate state
+    state, status_text = calculate_state(
+        data["statuses"],
+        data["last"],
+        backup_type,
+        error_handling
+    )
+
+    # Calculate backup age (None = no timestamp; no fake metric value)
+    if data["last"]:
+        age_seconds = int(datetime.now().timestamp()) - data["last"]
+        age_txt = render.timespan(age_seconds)
+    else:
+        age_seconds = None
+        age_txt = "unknown"
+
+    # Get type-specific thresholds
+    thresholds = get_thresholds(backup_type, backup_level)
+    warn_seconds = thresholds["warn"]
+    crit_seconds = thresholds["crit"]
+
+    # Check age against thresholds (only while the status-based state is
+    # OK); an unknown age stays CRIT, as with the former 999999 sentinel.
+    if state == State.OK:
+        if age_seconds is None or age_seconds > crit_seconds:
+            state = State.CRIT
+        elif age_seconds > warn_seconds:
+            state = State.WARN
+
+    # Build summary
+    summary = (
+        f"Type={backup_type.upper()} ({category}), "
+        f"Level={backup_level.upper()}, "
+        f"Freq={frequency}, "
+        f"Status={status_text}, "
+        f"Last={age_txt}, "
+        f"Jobs={data['count']}"
+    )
+
+    # Main result
+    yield Result(state=state, summary=summary)
+
+    # Detailed information
+    if state in [State.WARN, State.CRIT]:
+        details = (
+            f"Thresholds: WARN={render.timespan(warn_seconds)}, "
+            f"CRIT={render.timespan(crit_seconds)}"
+        )
+        yield Result(state=State.OK, notice=details)
+
+    # Metrics
+    if age_seconds is not None:
+        yield Metric(
+            name="backup_age",
+            value=age_seconds,
+            levels=(warn_seconds, crit_seconds),
+            boundaries=(0, None),
+        )
+
+    yield Metric(
+        name="backup_jobs",
+        value=data["count"],
+    )
+
+
+check_plugin_tsm_backups = CheckPlugin(
+    name="tsm_backups",
+    service_name="TSM Backup %s",
+    discovery_function=discover_tsm_backups,
+    check_function=check_tsm_backups,
+    sections=["tsm_backups"],
+)