add content to tsm_backup_check.py

This commit is contained in:
magadimn 2026-01-13 23:33:28 +01:00
parent e5962af62d
commit 91522703ad
2 changed files with 1006 additions and 0 deletions

503
TSM/tms_backup_check.py Normal file
View file

@ -0,0 +1,503 @@
#!/usr/bin/env python3
"""
TSM Backup Status Check Plugin for CheckMK 2.3+
- Parses tsm_backups agent section
- Creates services with labels (backup_type, frequency, level, error_handling)
- Dynamic backup type detection
- Configurable type-specific thresholds
- Tolerant error handling for FILE/VIRTUAL backups
Installation: /omd/sites/monitoring/local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py
Check name: tsm_backups
Author: Marius Gielnik
Version: 4.1 - Fixed Label Detection & Comprehensive Documentation
"""
from typing import Any, Mapping
from datetime import datetime
import json
import re
from cmk.agent_based.v2 import (
AgentSection,
CheckPlugin,
CheckResult,
DiscoveryResult,
Result,
Service,
ServiceLabel,
State,
Metric,
render,
StringTable,
)
# ============================================================================
# Configuration Section
# ============================================================================
# Age thresholds per backup type. The table below is written in hours; the
# resulting THRESHOLDS values are in seconds.
# NOTE: adding a key here only defines thresholds. A type is only ever
# detected if extract_backup_type also recognises it. The "log" entry is
# selected by backup level rather than by type (see get_thresholds).
THRESHOLDS = {
    type_name: {"warn": warn_hours * 3600, "crit": crit_hours * 3600}
    for type_name, warn_hours, crit_hours in (
        ("log", 4, 8),            # hourly LOG backups
        ("mssql", 26, 48),        # daily DB backups
        ("hana", 26, 48),
        ("db2", 26, 48),
        ("oracle", 26, 48),
        ("mysql", 26, 48),
        ("file", 36, 72),         # more tolerant
        ("virtual", 36, 72),
        ("mail", 26, 48),
        ("scale", 36, 72),
        ("dm", 36, 72),
        ("datacenter", 36, 72),
        ("default", 26, 48),      # fallback for every other type
    )
}

# Backup types with tolerant error handling:
# failed/missed backups are reported as WARNING instead of CRITICAL.
TOLERANT_TYPES = {
    "file",
    "virtual",
    "scale",
    "dm",
    "datacenter",
    "mail",
}

# Backup types classified as databases.
DATABASE_TYPES = {
    "mssql",
    "hana",
    "db2",
    "oracle",
    "mysql",
}

# Backup types classified as virtualization.
VIRTUAL_TYPES = {
    "virtual",
}
# ============================================================================
# Helper Functions
# ============================================================================
# Node-name segments (upper case) that are recognised as backup types.
# Built once at import time instead of on every call.
_KNOWN_BACKUP_TYPES = frozenset({
    "MSSQL", "HANA", "FILE", "ORACLE", "DB2", "SCALE", "DM",
    "DATACENTER", "VIRTUAL", "MAIL", "MYSQL",
})


def extract_backup_type(node: str) -> str:
    """
    Extract the backup type from a TSM node name.

    The name is split on '_' and the last segment is compared
    (case-insensitively) with the known backup types. If the last segment is
    purely numeric, the second-to-last segment is tried instead. Every other
    name - including names without an underscore - yields "unknown".

    Examples:
        MYSERVER_MSSQL    -> mssql
        DATABASE_HANA_01  -> hana
        FILESERVER_FILE   -> file
        VM_HYPERV_123     -> unknown (HYPERV is not a known type)

    Args:
        node: Node name from TSM
    Returns:
        str: Detected backup type in lowercase, or "unknown"
    """
    if "_" not in node:
        return "unknown"

    segments = node.split("_")
    last = segments[-1].upper()
    if last in _KNOWN_BACKUP_TYPES:
        return last.lower()

    # Numeric suffix: look one segment further left. The underscore check
    # above guarantees at least two segments, so segments[-2] always exists.
    if last.isdigit():
        second_last = segments[-2].upper()
        if second_last in _KNOWN_BACKUP_TYPES:
            return second_last.lower()

    return "unknown"
# (level, substrings that indicate it), ordered by priority:
# log > full > differential > incremental.
_BACKUP_LEVEL_MARKERS = (
    ("log", ("LOG",)),
    ("full", ("FULL",)),
    ("differential", ("_DIFF", "DIFFERENTIAL")),
    ("incremental", ("_INCR", "INCREMENTAL")),
)


def extract_backup_level(schedules: list) -> str:
    """
    Extract the backup level from schedule names.

    Schedule names are matched case-insensitively by substring. The highest
    priority level found in any schedule wins:
    log > full > differential > incremental.

    NOTE: "LOG" and "FULL" are plain substring matches, so a schedule such as
    "CATALOG_BACKUP" is classified as a log backup.

    Args:
        schedules: List of schedule names
    Returns:
        str: Detected backup level; "full" if no schedule gives a hint
    """
    upper_names = [name.upper() for name in schedules]

    # Walk the levels in priority order so that the documented priority also
    # holds when a single schedule name matches several levels.
    for level, markers in _BACKUP_LEVEL_MARKERS:
        if any(marker in name for name in upper_names for marker in markers):
            return level

    return "full"  # Default
# Schedule names that start with a time of day, e.g. "21-00-00_...".
_TIME_PREFIX_RE = re.compile(r"^\d{2}-\d{2}-\d{2}_")

# Frequency keywords in priority order (highest first).
_FREQUENCY_PRIORITY = ("hourly", "daily", "weekly", "monthly")


def extract_frequency(schedules: list) -> str:
    """
    Extract the backup frequency from schedule names.

    Each schedule contributes at most one frequency:
      - the first of HOURLY/DAILY/WEEKLY/MONTHLY contained in its name
        (case-insensitive), or otherwise
      - for names starting with a time prefix (HH-MM-SS_): "hourly" if the
        name contains "_LOG", else "daily".

    The result is the highest-priority frequency over all schedules:
    hourly > daily > weekly > monthly.

    Args:
        schedules: List of schedule names
    Returns:
        str: Detected frequency, or "unknown" if nothing matched
    """
    found = set()
    for schedule in schedules:
        upper_name = schedule.upper()
        keyword = next(
            (freq for freq in _FREQUENCY_PRIORITY if freq.upper() in upper_name),
            None,
        )
        if keyword is not None:
            found.add(keyword)
        elif _TIME_PREFIX_RE.match(schedule):
            # Time-based schedules: log backups are assumed to run hourly,
            # everything else (including 00-00-00) counts as daily.
            found.add("hourly" if "_LOG" in upper_name else "daily")

    for freq in _FREQUENCY_PRIORITY:
        if freq in found:
            return freq
    return "unknown"
def get_error_handling(backup_type: str) -> str:
    """
    Return the error handling strategy for a backup type.

    Types listed in TOLERANT_TYPES are handled tolerantly (failed backups
    lead to WARNING); all other types are handled strictly (failed backups
    lead to CRITICAL).

    Args:
        backup_type: Detected backup type
    Returns:
        str: 'tolerant' or 'strict'
    """
    if backup_type in TOLERANT_TYPES:
        return "tolerant"
    return "strict"
def get_backup_category(backup_type: str) -> str:
    """
    Map a backup type to a coarse category used for labeling.

    Possible categories: database, virtualization, filesystem, application,
    other.

    Args:
        backup_type: Detected backup type
    Returns:
        str: Category name; 'other' for every type not listed below
    """
    # Checked top to bottom; the first group containing the type wins.
    category_members = (
        ("database", DATABASE_TYPES),
        ("virtualization", VIRTUAL_TYPES),
        ("filesystem", {"file", "scale", "dm", "datacenter"}),
        ("application", {"mail", "exchange"}),
    )
    for category, members in category_members:
        if backup_type in members:
            return category
    return "other"
def get_thresholds(backup_type: str, backup_level: str) -> dict:
    """
    Look up the age thresholds for a backup.

    Lookup order:
      1. Level 'log' always uses the 'log' thresholds, whatever the type
      2. Otherwise the thresholds registered for the type
      3. Otherwise the 'default' thresholds

    Args:
        backup_type: Detected backup type
        backup_level: Detected backup level
    Returns:
        dict: {"warn": seconds, "crit": seconds}
    """
    if backup_level == "log":
        return THRESHOLDS["log"]
    return THRESHOLDS.get(backup_type, THRESHOLDS["default"])
def calculate_state(statuses: list, last_time: int, backup_type: str,
                    error_handling: str) -> tuple:
    """
    Derive the CheckMK state from the list of backup statuses.

    Rules, evaluated in this order (statuses compared case-insensitively):
      - any status is "completed"                    -> OK
      - all statuses are "pending"/"started":
          no last_time                               -> WARN
          last_time less than 2 hours ago            -> OK
          otherwise                                  -> WARN
      - any status contains "failed" or is "missed":
          tolerant error handling                    -> WARN
          otherwise                                  -> CRIT
      - anything else                                -> CRIT ("Unknown State")

    NOTE: an empty status list satisfies the "all pending/started" rule.

    Args:
        statuses: List of backup statuses
        last_time: Timestamp of last backup (epoch seconds); falsy if unknown
        backup_type: Detected backup type (currently not used by the logic)
        error_handling: 'tolerant' or 'strict'
    Returns:
        tuple: (State, status_text)
    """
    normalized = [status.lower() for status in statuses]

    # One completed job is enough for OK.
    if "completed" in normalized:
        return (State.OK, "Completed")

    # Nothing but jobs that are still waiting or running.
    if all(status in ("pending", "started") for status in normalized):
        if not last_time:
            return (State.WARN, "Pending")
        elapsed = int(datetime.now().timestamp()) - last_time
        if elapsed < 2 * 3600:  # 2 hours
            return (State.OK, "Pending/Started")
        return (State.WARN, "Pending (>2h)")

    # "failed" is a substring match, "missed" must match exactly.
    failed_or_missed = (
        any("failed" in status for status in normalized)
        or "missed" in normalized
    )
    if not failed_or_missed:
        return (State.CRIT, "Unknown State")
    if error_handling == "tolerant":
        return (State.WARN, "Failed (partial)")
    return (State.CRIT, "Failed/Missed")
# ============================================================================
# Agent Section Registration
# ============================================================================
def parse_tsm_backups(string_table: StringTable) -> Mapping[str, Any]:
    """
    Parse the tsm_backups agent section.

    The agent plugin is expected to emit one JSON object that maps node names
    to their backup data. The first token of the first line is tried first.
    If that is not valid JSON - e.g. because the JSON text contains spaces and
    was therefore split into several tokens or lines - all tokens are
    re-joined with single spaces and parsed again.

    Args:
        string_table: Raw agent output (list of lines, each a list of tokens)
    Returns:
        dict: Parsed backup data per node; empty if the section is empty,
        not valid JSON, or not a JSON object
    """
    if not string_table or not string_table[0]:
        return {}

    try:
        parsed = json.loads(string_table[0][0])
    except (json.JSONDecodeError, TypeError):
        # Fallback: the payload was split on whitespace, so stitch it back
        # together before giving up.
        try:
            parsed = json.loads(
                " ".join(" ".join(row) for row in string_table)
            )
        except (json.JSONDecodeError, TypeError):
            return {}

    # Discovery and check index the section by node name, so anything but a
    # JSON object (list, number, string, null) is unusable.
    return parsed if isinstance(parsed, dict) else {}
# Registers the "tsm_backups" agent section with parse_tsm_backups as its
# parse function. The check plugin below consumes this section by name.
# NOTE(review): the Checkmk v2 plug-in loader is documented to pick up
# module-level variables by their "agent_section_" prefix - keep the name.
agent_section_tsm_backups = AgentSection(
name="tsm_backups",
parse_function=parse_tsm_backups,
)
# ============================================================================
# Check Plugin Registration
# ============================================================================
def discover_tsm_backups(section: Mapping[str, Any]) -> DiscoveryResult:
    """
    Discovery function - yields one service per node in the section.

    Every service carries these labels, derived from the node name and the
    node's schedule names:
      - backup_type: Detected from the node name
      - backup_category: database/virtualization/filesystem/application/other
      - backup_system: Always "tsm"
      - frequency: hourly/daily/weekly/monthly/unknown
      - backup_level: log/full/differential/incremental
      - error_handling: tolerant/strict
      - node_name: Original node name

    Args:
        section: Parsed agent data (node name -> backup data)
    Yields:
        Service: CheckMK service objects with labels
    """
    for node, data in section.items():
        backup_type = extract_backup_type(node)
        backup_level = extract_backup_level(data["schedules"])
        frequency = extract_frequency(data["schedules"])
        error_handling = get_error_handling(backup_type)
        category = get_backup_category(backup_type)

        # Insertion order of this dict is the order of the emitted labels.
        label_values = {
            "backup_type": backup_type,
            "backup_category": category,
            "backup_system": "tsm",
            "frequency": frequency,
            "backup_level": backup_level,
            "error_handling": error_handling,
            "node_name": node,
        }
        yield Service(
            item=node,
            labels=[
                ServiceLabel(label_name, label_value)
                for label_name, label_value in label_values.items()
            ],
        )
def check_tsm_backups(item: str, section: Mapping[str, Any]) -> CheckResult:
    """
    Check function - evaluates one node's backups and emits metrics.

    Steps:
      - Derive type, level, frequency, error handling and category
      - Compute the state from the job statuses (calculate_state)
      - If that state is OK, escalate to WARN/CRIT when the age of the last
        backup exceeds the type/level-specific thresholds
      - Emit the summary, the thresholds as a notice when the final state is
        WARN/CRIT, and the metrics backup_age and backup_jobs

    When no last-backup timestamp is available, a placeholder age of 999999
    seconds is used and the age is shown as "unknown".

    Args:
        item: Node name (service identifier)
        section: Parsed agent data
    Yields:
        Result: Check results with state
        Metric: Performance metrics
    """
    if item not in section:
        yield Result(
            state=State.UNKNOWN,
            summary=f"Backup {item} not found in data",
        )
        return

    data = section[item]

    # Metadata derived from the node name and its schedules.
    backup_type = extract_backup_type(item)
    backup_level = extract_backup_level(data["schedules"])
    frequency = extract_frequency(data["schedules"])
    error_handling = get_error_handling(backup_type)
    category = get_backup_category(backup_type)

    # State according to the job statuses alone.
    state, status_text = calculate_state(
        data["statuses"], data["last"], backup_type, error_handling
    )

    # Age of the last backup.
    last_backup = data["last"]
    if last_backup:
        age_seconds = int(datetime.now().timestamp()) - last_backup
        age_txt = render.timespan(age_seconds)
    else:
        age_seconds, age_txt = 999999, "unknown"

    levels = get_thresholds(backup_type, backup_level)
    warn_seconds, crit_seconds = levels["warn"], levels["crit"]

    # The age can only worsen a state that is still OK.
    if state == State.OK:
        if age_seconds > crit_seconds:
            state = State.CRIT
        elif age_seconds > warn_seconds:
            state = State.WARN

    yield Result(
        state=state,
        summary=(
            f"Type={backup_type.upper()} ({category}), "
            f"Level={backup_level.upper()}, "
            f"Freq={frequency}, "
            f"Status={status_text}, "
            f"Last={age_txt}, "
            f"Jobs={data['count']}"
        ),
    )

    # Show the applied thresholds whenever something is wrong.
    if state in (State.WARN, State.CRIT):
        yield Result(
            state=State.OK,
            notice=(
                f"Thresholds: WARN={render.timespan(warn_seconds)}, "
                f"CRIT={render.timespan(crit_seconds)}"
            ),
        )

    yield Metric(
        name="backup_age",
        value=age_seconds,
        levels=(warn_seconds, crit_seconds),
        boundaries=(0, None),
    )
    yield Metric(
        name="backup_jobs",
        value=data["count"],
    )
# Registers the "tsm_backups" check plugin: services are discovered by
# discover_tsm_backups, evaluated by check_tsm_backups and fed from the
# "tsm_backups" section. "%s" in service_name is the item placeholder
# (here: the node name passed as item by the discovery function).
# NOTE(review): the Checkmk v2 plug-in loader is documented to pick up
# module-level variables by their "check_plugin_" prefix - keep the name.
check_plugin_tsm_backups = CheckPlugin(
name="tsm_backups",
service_name="TSM Backup %s",
discovery_function=discover_tsm_backups,
check_function=check_tsm_backups,
sections=["tsm_backups"],
)

View file

@ -0,0 +1,503 @@
#!/usr/bin/env python3
"""
TSM Backup Status Check Plugin for CheckMK 2.3+
- Parses tsm_backups agent section
- Creates services with labels (backup_type, frequency, level, error_handling)
- Dynamic backup type detection
- Configurable type-specific thresholds
- Tolerant error handling for FILE/VIRTUAL backups
Installation: /omd/sites/monitoring/local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py
Check name: tsm_backups
Author: Marius Gielnik
Version: 4.1 - Fixed Label Detection & Comprehensive Documentation
"""
from typing import Any, Mapping
from datetime import datetime
import json
import re
from cmk.agent_based.v2 import (
AgentSection,
CheckPlugin,
CheckResult,
DiscoveryResult,
Result,
Service,
ServiceLabel,
State,
Metric,
render,
StringTable,
)
# ============================================================================
# Configuration Section
# ============================================================================
# Age thresholds per backup type. The table below is written in hours; the
# resulting THRESHOLDS values are in seconds.
# NOTE: adding a key here only defines thresholds. A type is only ever
# detected if extract_backup_type also recognises it. The "log" entry is
# selected by backup level rather than by type (see get_thresholds).
THRESHOLDS = {
    type_name: {"warn": warn_hours * 3600, "crit": crit_hours * 3600}
    for type_name, warn_hours, crit_hours in (
        ("log", 4, 8),            # hourly LOG backups
        ("mssql", 26, 48),        # daily DB backups
        ("hana", 26, 48),
        ("db2", 26, 48),
        ("oracle", 26, 48),
        ("mysql", 26, 48),
        ("file", 36, 72),         # more tolerant
        ("virtual", 36, 72),
        ("mail", 26, 48),
        ("scale", 36, 72),
        ("dm", 36, 72),
        ("datacenter", 36, 72),
        ("default", 26, 48),      # fallback for every other type
    )
}

# Backup types with tolerant error handling:
# failed/missed backups are reported as WARNING instead of CRITICAL.
TOLERANT_TYPES = {
    "file",
    "virtual",
    "scale",
    "dm",
    "datacenter",
    "mail",
}

# Backup types classified as databases.
DATABASE_TYPES = {
    "mssql",
    "hana",
    "db2",
    "oracle",
    "mysql",
}

# Backup types classified as virtualization.
VIRTUAL_TYPES = {
    "virtual",
}
# ============================================================================
# Helper Functions
# ============================================================================
# Node-name segments (upper case) that are recognised as backup types.
# Built once at import time instead of on every call.
_KNOWN_BACKUP_TYPES = frozenset({
    "MSSQL", "HANA", "FILE", "ORACLE", "DB2", "SCALE", "DM",
    "DATACENTER", "VIRTUAL", "MAIL", "MYSQL",
})


def extract_backup_type(node: str) -> str:
    """
    Extract the backup type from a TSM node name.

    The name is split on '_' and the last segment is compared
    (case-insensitively) with the known backup types. If the last segment is
    purely numeric, the second-to-last segment is tried instead. Every other
    name - including names without an underscore - yields "unknown".

    Examples:
        MYSERVER_MSSQL    -> mssql
        DATABASE_HANA_01  -> hana
        FILESERVER_FILE   -> file
        VM_HYPERV_123     -> unknown (HYPERV is not a known type)

    Args:
        node: Node name from TSM
    Returns:
        str: Detected backup type in lowercase, or "unknown"
    """
    if "_" not in node:
        return "unknown"

    segments = node.split("_")
    last = segments[-1].upper()
    if last in _KNOWN_BACKUP_TYPES:
        return last.lower()

    # Numeric suffix: look one segment further left. The underscore check
    # above guarantees at least two segments, so segments[-2] always exists.
    if last.isdigit():
        second_last = segments[-2].upper()
        if second_last in _KNOWN_BACKUP_TYPES:
            return second_last.lower()

    return "unknown"
# (level, substrings that indicate it), ordered by priority:
# log > full > differential > incremental.
_BACKUP_LEVEL_MARKERS = (
    ("log", ("LOG",)),
    ("full", ("FULL",)),
    ("differential", ("_DIFF", "DIFFERENTIAL")),
    ("incremental", ("_INCR", "INCREMENTAL")),
)


def extract_backup_level(schedules: list) -> str:
    """
    Extract the backup level from schedule names.

    Schedule names are matched case-insensitively by substring. The highest
    priority level found in any schedule wins:
    log > full > differential > incremental.

    NOTE: "LOG" and "FULL" are plain substring matches, so a schedule such as
    "CATALOG_BACKUP" is classified as a log backup.

    Args:
        schedules: List of schedule names
    Returns:
        str: Detected backup level; "full" if no schedule gives a hint
    """
    upper_names = [name.upper() for name in schedules]

    # Walk the levels in priority order so that the documented priority also
    # holds when a single schedule name matches several levels.
    for level, markers in _BACKUP_LEVEL_MARKERS:
        if any(marker in name for name in upper_names for marker in markers):
            return level

    return "full"  # Default
# Schedule names that start with a time of day, e.g. "21-00-00_...".
_TIME_PREFIX_RE = re.compile(r"^\d{2}-\d{2}-\d{2}_")

# Frequency keywords in priority order (highest first).
_FREQUENCY_PRIORITY = ("hourly", "daily", "weekly", "monthly")


def extract_frequency(schedules: list) -> str:
    """
    Extract the backup frequency from schedule names.

    Each schedule contributes at most one frequency:
      - the first of HOURLY/DAILY/WEEKLY/MONTHLY contained in its name
        (case-insensitive), or otherwise
      - for names starting with a time prefix (HH-MM-SS_): "hourly" if the
        name contains "_LOG", else "daily".

    The result is the highest-priority frequency over all schedules:
    hourly > daily > weekly > monthly.

    Args:
        schedules: List of schedule names
    Returns:
        str: Detected frequency, or "unknown" if nothing matched
    """
    found = set()
    for schedule in schedules:
        upper_name = schedule.upper()
        keyword = next(
            (freq for freq in _FREQUENCY_PRIORITY if freq.upper() in upper_name),
            None,
        )
        if keyword is not None:
            found.add(keyword)
        elif _TIME_PREFIX_RE.match(schedule):
            # Time-based schedules: log backups are assumed to run hourly,
            # everything else (including 00-00-00) counts as daily.
            found.add("hourly" if "_LOG" in upper_name else "daily")

    for freq in _FREQUENCY_PRIORITY:
        if freq in found:
            return freq
    return "unknown"
def get_error_handling(backup_type: str) -> str:
    """
    Return the error handling strategy for a backup type.

    Types listed in TOLERANT_TYPES are handled tolerantly (failed backups
    lead to WARNING); all other types are handled strictly (failed backups
    lead to CRITICAL).

    Args:
        backup_type: Detected backup type
    Returns:
        str: 'tolerant' or 'strict'
    """
    if backup_type in TOLERANT_TYPES:
        return "tolerant"
    return "strict"
def get_backup_category(backup_type: str) -> str:
    """
    Map a backup type to a coarse category used for labeling.

    Possible categories: database, virtualization, filesystem, application,
    other.

    Args:
        backup_type: Detected backup type
    Returns:
        str: Category name; 'other' for every type not listed below
    """
    # Checked top to bottom; the first group containing the type wins.
    category_members = (
        ("database", DATABASE_TYPES),
        ("virtualization", VIRTUAL_TYPES),
        ("filesystem", {"file", "scale", "dm", "datacenter"}),
        ("application", {"mail", "exchange"}),
    )
    for category, members in category_members:
        if backup_type in members:
            return category
    return "other"
def get_thresholds(backup_type: str, backup_level: str) -> dict:
    """
    Look up the age thresholds for a backup.

    Lookup order:
      1. Level 'log' always uses the 'log' thresholds, whatever the type
      2. Otherwise the thresholds registered for the type
      3. Otherwise the 'default' thresholds

    Args:
        backup_type: Detected backup type
        backup_level: Detected backup level
    Returns:
        dict: {"warn": seconds, "crit": seconds}
    """
    if backup_level == "log":
        return THRESHOLDS["log"]
    return THRESHOLDS.get(backup_type, THRESHOLDS["default"])
def calculate_state(statuses: list, last_time: int, backup_type: str,
                    error_handling: str) -> tuple:
    """
    Derive the CheckMK state from the list of backup statuses.

    Rules, evaluated in this order (statuses compared case-insensitively):
      - any status is "completed"                    -> OK
      - all statuses are "pending"/"started":
          no last_time                               -> WARN
          last_time less than 2 hours ago            -> OK
          otherwise                                  -> WARN
      - any status contains "failed" or is "missed":
          tolerant error handling                    -> WARN
          otherwise                                  -> CRIT
      - anything else                                -> CRIT ("Unknown State")

    NOTE: an empty status list satisfies the "all pending/started" rule.

    Args:
        statuses: List of backup statuses
        last_time: Timestamp of last backup (epoch seconds); falsy if unknown
        backup_type: Detected backup type (currently not used by the logic)
        error_handling: 'tolerant' or 'strict'
    Returns:
        tuple: (State, status_text)
    """
    normalized = [status.lower() for status in statuses]

    # One completed job is enough for OK.
    if "completed" in normalized:
        return (State.OK, "Completed")

    # Nothing but jobs that are still waiting or running.
    if all(status in ("pending", "started") for status in normalized):
        if not last_time:
            return (State.WARN, "Pending")
        elapsed = int(datetime.now().timestamp()) - last_time
        if elapsed < 2 * 3600:  # 2 hours
            return (State.OK, "Pending/Started")
        return (State.WARN, "Pending (>2h)")

    # "failed" is a substring match, "missed" must match exactly.
    failed_or_missed = (
        any("failed" in status for status in normalized)
        or "missed" in normalized
    )
    if not failed_or_missed:
        return (State.CRIT, "Unknown State")
    if error_handling == "tolerant":
        return (State.WARN, "Failed (partial)")
    return (State.CRIT, "Failed/Missed")
# ============================================================================
# Agent Section Registration
# ============================================================================
def parse_tsm_backups(string_table: StringTable) -> Mapping[str, Any]:
    """
    Parse the tsm_backups agent section.

    The agent plugin is expected to emit one JSON object that maps node names
    to their backup data. The first token of the first line is tried first.
    If that is not valid JSON - e.g. because the JSON text contains spaces and
    was therefore split into several tokens or lines - all tokens are
    re-joined with single spaces and parsed again.

    Args:
        string_table: Raw agent output (list of lines, each a list of tokens)
    Returns:
        dict: Parsed backup data per node; empty if the section is empty,
        not valid JSON, or not a JSON object
    """
    if not string_table or not string_table[0]:
        return {}

    try:
        parsed = json.loads(string_table[0][0])
    except (json.JSONDecodeError, TypeError):
        # Fallback: the payload was split on whitespace, so stitch it back
        # together before giving up.
        try:
            parsed = json.loads(
                " ".join(" ".join(row) for row in string_table)
            )
        except (json.JSONDecodeError, TypeError):
            return {}

    # Discovery and check index the section by node name, so anything but a
    # JSON object (list, number, string, null) is unusable.
    return parsed if isinstance(parsed, dict) else {}
# Registers the "tsm_backups" agent section with parse_tsm_backups as its
# parse function. The check plugin below consumes this section by name.
# NOTE(review): the Checkmk v2 plug-in loader is documented to pick up
# module-level variables by their "agent_section_" prefix - keep the name.
agent_section_tsm_backups = AgentSection(
name="tsm_backups",
parse_function=parse_tsm_backups,
)
# ============================================================================
# Check Plugin Registration
# ============================================================================
def discover_tsm_backups(section: Mapping[str, Any]) -> DiscoveryResult:
    """
    Discovery function - yields one service per node in the section.

    Every service carries these labels, derived from the node name and the
    node's schedule names:
      - backup_type: Detected from the node name
      - backup_category: database/virtualization/filesystem/application/other
      - backup_system: Always "tsm"
      - frequency: hourly/daily/weekly/monthly/unknown
      - backup_level: log/full/differential/incremental
      - error_handling: tolerant/strict
      - node_name: Original node name

    Args:
        section: Parsed agent data (node name -> backup data)
    Yields:
        Service: CheckMK service objects with labels
    """
    for node, data in section.items():
        backup_type = extract_backup_type(node)
        backup_level = extract_backup_level(data["schedules"])
        frequency = extract_frequency(data["schedules"])
        error_handling = get_error_handling(backup_type)
        category = get_backup_category(backup_type)

        # Insertion order of this dict is the order of the emitted labels.
        label_values = {
            "backup_type": backup_type,
            "backup_category": category,
            "backup_system": "tsm",
            "frequency": frequency,
            "backup_level": backup_level,
            "error_handling": error_handling,
            "node_name": node,
        }
        yield Service(
            item=node,
            labels=[
                ServiceLabel(label_name, label_value)
                for label_name, label_value in label_values.items()
            ],
        )
def check_tsm_backups(item: str, section: Mapping[str, Any]) -> CheckResult:
    """
    Check function - evaluates one node's backups and emits metrics.

    Steps:
      - Derive type, level, frequency, error handling and category
      - Compute the state from the job statuses (calculate_state)
      - If that state is OK, escalate to WARN/CRIT when the age of the last
        backup exceeds the type/level-specific thresholds
      - Emit the summary, the thresholds as a notice when the final state is
        WARN/CRIT, and the metrics backup_age and backup_jobs

    When no last-backup timestamp is available, a placeholder age of 999999
    seconds is used and the age is shown as "unknown".

    Args:
        item: Node name (service identifier)
        section: Parsed agent data
    Yields:
        Result: Check results with state
        Metric: Performance metrics
    """
    if item not in section:
        yield Result(
            state=State.UNKNOWN,
            summary=f"Backup {item} not found in data",
        )
        return

    data = section[item]

    # Metadata derived from the node name and its schedules.
    backup_type = extract_backup_type(item)
    backup_level = extract_backup_level(data["schedules"])
    frequency = extract_frequency(data["schedules"])
    error_handling = get_error_handling(backup_type)
    category = get_backup_category(backup_type)

    # State according to the job statuses alone.
    state, status_text = calculate_state(
        data["statuses"], data["last"], backup_type, error_handling
    )

    # Age of the last backup.
    last_backup = data["last"]
    if last_backup:
        age_seconds = int(datetime.now().timestamp()) - last_backup
        age_txt = render.timespan(age_seconds)
    else:
        age_seconds, age_txt = 999999, "unknown"

    levels = get_thresholds(backup_type, backup_level)
    warn_seconds, crit_seconds = levels["warn"], levels["crit"]

    # The age can only worsen a state that is still OK.
    if state == State.OK:
        if age_seconds > crit_seconds:
            state = State.CRIT
        elif age_seconds > warn_seconds:
            state = State.WARN

    yield Result(
        state=state,
        summary=(
            f"Type={backup_type.upper()} ({category}), "
            f"Level={backup_level.upper()}, "
            f"Freq={frequency}, "
            f"Status={status_text}, "
            f"Last={age_txt}, "
            f"Jobs={data['count']}"
        ),
    )

    # Show the applied thresholds whenever something is wrong.
    if state in (State.WARN, State.CRIT):
        yield Result(
            state=State.OK,
            notice=(
                f"Thresholds: WARN={render.timespan(warn_seconds)}, "
                f"CRIT={render.timespan(crit_seconds)}"
            ),
        )

    yield Metric(
        name="backup_age",
        value=age_seconds,
        levels=(warn_seconds, crit_seconds),
        boundaries=(0, None),
    )
    yield Metric(
        name="backup_jobs",
        value=data["count"],
    )
# Registers the "tsm_backups" check plugin: services are discovered by
# discover_tsm_backups, evaluated by check_tsm_backups and fed from the
# "tsm_backups" section. "%s" in service_name is the item placeholder
# (here: the node name passed as item by the discovery function).
# NOTE(review): the Checkmk v2 plug-in loader is documented to pick up
# module-level variables by their "check_plugin_" prefix - keep the name.
check_plugin_tsm_backups = CheckPlugin(
name="tsm_backups",
service_name="TSM Backup %s",
discovery_function=discover_tsm_backups,
check_function=check_tsm_backups,
sections=["tsm_backups"],
)