#!/usr/bin/env python3
|
|
"""
TSM Backup Status Check Plugin for CheckMK 2.3+
- Parses tsm_backups agent section
- Creates services with labels (backup_type, backup_category, backup_system,
  frequency, backup_level, error_handling, node_name)
- Backup type detection from the node name (fixed list of known types,
  everything else is reported as "unknown")
- Configurable type-specific thresholds
- Tolerant error handling for FILE/VIRTUAL/SCALE/DM/DATACENTER/MAIL backups

Installation: /omd/sites/monitoring/local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py
Check name: tsm_backups

Author: Marius Gielnik
Version: 4.1 - Fixed Label Detection & Comprehensive Documentation
"""
|
|
from typing import Any, Mapping
|
|
from datetime import datetime
|
|
import json
|
|
import re
|
|
|
|
from cmk.agent_based.v2 import (
|
|
AgentSection,
|
|
CheckPlugin,
|
|
CheckResult,
|
|
DiscoveryResult,
|
|
Result,
|
|
Service,
|
|
ServiceLabel,
|
|
State,
|
|
Metric,
|
|
render,
|
|
StringTable,
|
|
)
|
|
|
|
# ============================================================================
|
|
# Configuration Section
|
|
# ============================================================================
|
|
|
|
# Backup-age thresholds per backup type, in seconds.
# An entry only takes effect for a type that extract_backup_type() can
# actually detect; every other type falls back to "default". The "log" entry
# is selected by backup level, not by type (see get_thresholds()).
_SECONDS_PER_HOUR = 3600


def _age_levels(warn_hours: int, crit_hours: int) -> dict:
    """Build a fresh {"warn": ..., "crit": ...} dict (seconds) from hours."""
    return {
        "warn": warn_hours * _SECONDS_PER_HOUR,
        "crit": crit_hours * _SECONDS_PER_HOUR,
    }


THRESHOLDS = {
    "log": _age_levels(4, 8),            # hourly LOG backups
    "mssql": _age_levels(26, 48),        # daily DB backups
    "hana": _age_levels(26, 48),
    "db2": _age_levels(26, 48),
    "oracle": _age_levels(26, 48),
    "mysql": _age_levels(26, 48),
    "file": _age_levels(36, 72),         # more tolerant
    "virtual": _age_levels(36, 72),
    "mail": _age_levels(26, 48),
    "scale": _age_levels(36, 72),
    "dm": _age_levels(36, 72),
    "datacenter": _age_levels(36, 72),
    "default": _age_levels(26, 48),      # fallback for every other type
}

# Types with tolerant error handling:
# failed/missed backups raise WARNING instead of CRITICAL.
TOLERANT_TYPES = {
    "file",
    "virtual",
    "scale",
    "dm",
    "datacenter",
    "mail",
}

# Types classified as database backups.
DATABASE_TYPES = {
    "mssql",
    "hana",
    "db2",
    "oracle",
    "mysql",
}

# Types classified as virtualization backups.
VIRTUAL_TYPES = {
    "virtual",
}
|
|
|
|
|
|
# ============================================================================
|
|
# Helper Functions
|
|
# ============================================================================
|
|
|
|
def extract_backup_type(node: str) -> str:
    """
    Derive the backup type from a TSM node name.

    The name is split on underscores and the last segment is compared
    case-insensitively against a fixed list of known types. If the last
    segment is purely numeric (e.g. "_01"), the segment before it is used
    instead. Everything else is reported as "unknown".

    Examples:
        MYSERVER_MSSQL    -> mssql
        DATABASE_HANA_01  -> hana
        FILESERVER_FILE   -> file
        VM_HYPERV_123     -> unknown  (HYPERV is not in the known list)
        MYSERVER          -> unknown  (no underscore)

    Args:
        node: Node name from TSM

    Returns:
        str: Detected backup type in lowercase, or "unknown"
    """
    recognised = (
        'MSSQL', 'HANA', 'FILE', 'ORACLE', 'DB2', 'SCALE', 'DM',
        'DATACENTER', 'VIRTUAL', 'MAIL', 'MYSQL',
    )

    segments = node.upper().split('_')
    if len(segments) < 2:
        # No underscore at all: nothing to derive a type from.
        return "unknown"

    tail = segments[-1]
    # A numeric tail is an instance counter, so the type sits one segment earlier.
    candidate = segments[-2] if tail.isdigit() else tail

    return candidate.lower() if candidate in recognised else "unknown"
|
|
|
|
|
|
def extract_backup_level(schedules: list) -> str:
    """
    Determine the backup level from a list of schedule names.

    Each schedule name is upper-cased and classified by the first marker
    group that matches as a substring: "LOG", then "FULL", then
    "_INCR"/"INCREMENTAL", then "_DIFF"/"DIFFERENTIAL". Because this is a
    plain substring test, "LOG" or "FULL" anywhere in the name counts.

    Across all schedules the result is chosen by priority:
    log > full > differential > incremental.

    Args:
        schedules: List of schedule names

    Returns:
        str: Detected backup level; 'full' if no schedule matched
    """
    # (level, substring markers), in the order one schedule name is tested.
    markers_by_level = (
        ('log', ('LOG',)),
        ('full', ('FULL',)),
        ('incremental', ('_INCR', 'INCREMENTAL')),
        ('differential', ('_DIFF', 'DIFFERENTIAL')),
    )

    detected = set()
    for name in schedules:
        name_upper = name.upper()
        for level, markers in markers_by_level:
            if any(marker in name_upper for marker in markers):
                detected.add(level)
                break  # one level per schedule: first matching group wins

    for level in ('log', 'full', 'differential', 'incremental'):
        if level in detected:
            return level

    return 'full'  # Default
|
|
|
|
|
|
def extract_frequency(schedules: list) -> str:
    """
    Determine the backup frequency from a list of schedule names.

    Each schedule name is classified by the first keyword it contains
    (case-insensitive substring): HOURLY, DAILY, WEEKLY, MONTHLY.
    A name without any keyword that starts with a time stamp of the form
    "HH-MM-SS_" is treated as hourly when it contains "_LOG" and as daily
    otherwise.

    Across all schedules the result is chosen by priority:
    hourly > daily > weekly > monthly.

    Args:
        schedules: List of schedule names

    Returns:
        str: Detected frequency, or 'unknown' if no schedule matched
    """
    keywords = ('HOURLY', 'DAILY', 'WEEKLY', 'MONTHLY')

    seen = set()
    for name in schedules:
        name_upper = name.upper()
        keyword = next((kw for kw in keywords if kw in name_upper), None)
        if keyword is not None:
            seen.add(keyword.lower())
        elif re.match(r'^\d{2}-\d{2}-\d{2}_', name):
            # Time-prefixed schedule: LOG jobs count as hourly, every other
            # start time (midnight or not) counts as daily.
            seen.add('hourly' if '_LOG' in name_upper else 'daily')

    ranked = ('hourly', 'daily', 'weekly', 'monthly')
    return next((freq for freq in ranked if freq in seen), 'unknown')
|
|
|
|
|
|
def get_error_handling(backup_type: str) -> str:
    """
    Select the error handling strategy for a backup type.

    Types listed in TOLERANT_TYPES are handled tolerantly (failed backups
    raise WARNING); all other types are handled strictly (failed backups
    raise CRITICAL).

    Args:
        backup_type: Detected backup type

    Returns:
        str: 'tolerant' or 'strict'
    """
    if backup_type in TOLERANT_TYPES:
        return 'tolerant'
    return 'strict'
|
|
|
|
|
|
def get_backup_category(backup_type: str) -> str:
    """
    Map a backup type to a coarse category used for labeling.

    Categories: database, virtualization, filesystem, application, other

    Args:
        backup_type: Detected backup type

    Returns:
        str: Category name; 'other' when the type belongs to no category
    """
    # Checked top to bottom; the first group containing the type wins.
    members_by_category = (
        ('database', DATABASE_TYPES),
        ('virtualization', VIRTUAL_TYPES),
        ('filesystem', {'file', 'scale', 'dm', 'datacenter'}),
        ('application', {'mail', 'exchange'}),
    )

    for category, members in members_by_category:
        if backup_type in members:
            return category

    return 'other'
|
|
|
|
|
|
def get_thresholds(backup_type: str, backup_level: str) -> dict:
    """
    Look up the age thresholds for a backup type and level.

    Priority:
        1. Level 'log' always uses the 'log' thresholds
        2. Otherwise the thresholds configured for the type
        3. Otherwise the 'default' thresholds

    The returned dict is the entry stored in THRESHOLDS itself, not a copy.

    Args:
        backup_type: Detected backup type
        backup_level: Detected backup level

    Returns:
        dict: {"warn": seconds, "crit": seconds}
    """
    if backup_level == 'log':
        return THRESHOLDS['log']

    return THRESHOLDS.get(backup_type, THRESHOLDS['default'])
|
|
|
|
|
|
def calculate_state(statuses: list, last_time: int, backup_type: str,
                    error_handling: str) -> tuple:
    """
    Derive the CheckMK state from the list of backup job statuses.

    Statuses are compared case-insensitively. Rules, in order:
        - At least one "completed"                      -> OK
        - Only "pending"/"started", last run < 2h ago   -> OK
        - Only "pending"/"started", last run >= 2h ago  -> WARN
        - Only "pending"/"started", no timestamp        -> WARN
        - Any status containing "failed", or a "missed":
              tolerant -> WARN, strict -> CRIT
        - Anything else                                 -> CRIT

    An empty status list is treated like "only pending/started".

    Args:
        statuses: List of backup statuses
        last_time: Timestamp of last backup (epoch seconds); falsy if unknown
        backup_type: Detected backup type (not used by the current rules)
        error_handling: 'tolerant' or 'strict'

    Returns:
        tuple: (State, status_text)
    """
    normalized = [status.lower() for status in statuses]

    if "completed" in normalized:
        return (State.OK, "Completed")

    # True for an empty list as well (empty set is a subset of anything).
    still_running = set(normalized) <= {"pending", "started"}
    if still_running:
        if not last_time:
            return (State.WARN, "Pending")
        elapsed = int(datetime.now().timestamp()) - last_time
        if elapsed < 2 * 3600:  # 2 hours
            return (State.OK, "Pending/Started")
        return (State.WARN, "Pending (>2h)")

    # "failed" is matched as a substring, "missed" only as an exact status.
    failed_or_missed = (
        "missed" in normalized
        or any("failed" in status for status in normalized)
    )
    if not failed_or_missed:
        return (State.CRIT, "Unknown State")

    if error_handling == 'tolerant':
        return (State.WARN, "Failed (partial)")
    return (State.CRIT, "Failed/Missed")
|
|
|
|
|
|
# ============================================================================
|
|
# Agent Section Registration
|
|
# ============================================================================
|
|
|
|
def parse_tsm_backups(string_table: "StringTable") -> Mapping[str, Any]:
    """
    Parses the tsm_backups agent section.

    Expected format: one line holding a JSON object that maps node names to
    their backup data. Only the first line of the section is read.

    The first token of that line is tried on its own. If it is not a JSON
    object and the line consists of several tokens (i.e. the JSON text
    contained spaces and the line arrived split on whitespace), the tokens
    are re-joined with single spaces and parsed again.

    Args:
        string_table: Raw agent output

    Returns:
        dict: Parsed backup data per node; {} if the section is empty, is
        not valid JSON, or does not decode to a JSON object
    """
    if not string_table or not string_table[0]:
        return {}

    first_row = string_table[0]

    candidates = [first_row[0]]
    if len(first_row) > 1:
        candidates.append(" ".join(first_row))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (json.JSONDecodeError, TypeError):
            continue
        # Discovery and check index the section by node name, so anything
        # other than a JSON object (list, number, string) is unusable.
        if isinstance(parsed, dict):
            return parsed

    return {}
|
|
|
|
|
|
# Registers the "tsm_backups" agent section; its raw lines are converted by
# parse_tsm_backups() before discovery and check see them.
agent_section_tsm_backups = AgentSection(
    name="tsm_backups",
    parse_function=parse_tsm_backups,
)
|
|
|
|
|
|
# ============================================================================
|
|
# Check Plugin Registration
|
|
# ============================================================================
|
|
|
|
def discover_tsm_backups(section: Mapping[str, Any]) -> DiscoveryResult:
    """
    Discovery function - yields one service per node in the section.

    Every service carries these labels:
        - backup_type: Detected from the node name
        - backup_category: database/virtualization/filesystem/application/other
        - backup_system: Always "tsm"
        - frequency: hourly/daily/weekly/monthly/unknown
        - backup_level: log/full/differential/incremental
        - error_handling: tolerant/strict
        - node_name: Original node name

    Each node entry must provide a "schedules" list; a missing key raises
    KeyError.

    Args:
        section: Parsed agent data

    Yields:
        Service: CheckMK service objects with labels
    """
    for node_name, node_data in section.items():
        schedule_names = node_data["schedules"]
        detected_type = extract_backup_type(node_name)

        service_labels = [
            ServiceLabel("backup_type", detected_type),
            ServiceLabel("backup_category", get_backup_category(detected_type)),
            ServiceLabel("backup_system", "tsm"),
            ServiceLabel("frequency", extract_frequency(schedule_names)),
            ServiceLabel("backup_level", extract_backup_level(schedule_names)),
            ServiceLabel("error_handling", get_error_handling(detected_type)),
            ServiceLabel("node_name", node_name),
        ]

        yield Service(item=node_name, labels=service_labels)
|
|
|
|
|
|
def check_tsm_backups(item: str, section: Mapping[str, Any]) -> CheckResult:
    """
    Check function - evaluates the backup status of one node.

    Steps:
        - Derive type, level, frequency, error handling and category
        - Compute the state from the job statuses (calculate_state)
        - If that state is OK, escalate to WARN/CRIT when the backup age
          exceeds the type/level-specific thresholds
        - Report summary, thresholds (only on WARN/CRIT) and metrics

    When the node has no "last" timestamp, the age is set to the placeholder
    999999 seconds. That placeholder is larger than every crit threshold in
    THRESHOLDS and is also what the backup_age metric reports in that case.

    Args:
        item: Node name (service identifier)
        section: Parsed agent data

    Yields:
        Result: Check results with state
        Metric: Performance metrics (backup_age, backup_jobs)
    """
    if item not in section:
        yield Result(
            state=State.UNKNOWN,
            summary=f"Backup {item} not found in data"
        )
        return

    node_data = section[item]
    schedule_names = node_data["schedules"]

    # Metadata derived from node name and schedule names
    detected_type = extract_backup_type(item)
    level = extract_backup_level(schedule_names)
    frequency = extract_frequency(schedule_names)
    handling = get_error_handling(detected_type)
    category = get_backup_category(detected_type)

    last_backup = node_data["last"]
    state, status_text = calculate_state(
        node_data["statuses"],
        last_backup,
        detected_type,
        handling,
    )

    # Backup age; placeholder values apply when no timestamp is known
    age_seconds, age_txt = 999999, "unknown"
    if last_backup:
        age_seconds = int(datetime.now().timestamp()) - last_backup
        age_txt = render.timespan(age_seconds)

    limits = get_thresholds(detected_type, level)
    warn_seconds, crit_seconds = limits["warn"], limits["crit"]

    # Age only matters while the status-based state is still OK
    if state == State.OK:
        if age_seconds > crit_seconds:
            state = State.CRIT
        elif age_seconds > warn_seconds:
            state = State.WARN

    job_count = node_data["count"]
    summary = ", ".join([
        f"Type={detected_type.upper()} ({category})",
        f"Level={level.upper()}",
        f"Freq={frequency}",
        f"Status={status_text}",
        f"Last={age_txt}",
        f"Jobs={job_count}",
    ])

    # Main result
    yield Result(state=state, summary=summary)

    # Show the applied thresholds whenever the service is not OK
    if state in (State.WARN, State.CRIT):
        warn_txt = render.timespan(warn_seconds)
        crit_txt = render.timespan(crit_seconds)
        yield Result(
            state=State.OK,
            notice=f"Thresholds: WARN={warn_txt}, CRIT={crit_txt}",
        )

    # Metrics
    yield Metric(
        name="backup_age",
        value=age_seconds,
        levels=(warn_seconds, crit_seconds),
        boundaries=(0, None),
    )
    yield Metric(
        name="backup_jobs",
        value=job_count,
    )
|
|
|
|
|
|
# Registers the check plugin: one "TSM Backup <node>" service per discovered
# node, fed by the parsed "tsm_backups" section.
check_plugin_tsm_backups = CheckPlugin(
    name="tsm_backups",
    service_name="TSM Backup %s",
    discovery_function=discover_tsm_backups,
    check_function=check_tsm_backups,
    sections=["tsm_backups"],
)