add content to tsm_backup_check.py
parent e5962af62d
commit 91522703ad

TSM/tms_backup_check.py (new file, 503 lines)

@@ -0,0 +1,503 @@
#!/usr/bin/env python3
"""
TSM Backup Status Check Plugin for CheckMK 2.3+
- Parses tsm_backups agent section
- Creates services with labels (backup_type, frequency, level, error_handling)
- Dynamic backup type detection
- Configurable type-specific thresholds
- Tolerant error handling for FILE/VIRTUAL backups

Installation: /omd/sites/monitoring/local/lib/python3/cmk_addons/plugins/tsm/agent_based/tsm_backups.py
Check name: tsm_backups

Author: Marius Gielnik
Version: 4.1 - Fixed Label Detection & Comprehensive Documentation
"""
from typing import Any, Mapping
from datetime import datetime
import json
import re

from cmk.agent_based.v2 import (
    AgentSection,
    CheckPlugin,
    CheckResult,
    DiscoveryResult,
    Result,
    Service,
    ServiceLabel,
    State,
    Metric,
    render,
    StringTable,
)

# ============================================================================
# Configuration Section
# ============================================================================

# Type-specific thresholds (in seconds)
# New types can be added here dynamically without code changes
THRESHOLDS = {
    "log": {"warn": 4 * 3600, "crit": 8 * 3600},        # Hourly LOG backups
    "mssql": {"warn": 26 * 3600, "crit": 48 * 3600},    # Daily DB backups
    "hana": {"warn": 26 * 3600, "crit": 48 * 3600},
    "db2": {"warn": 26 * 3600, "crit": 48 * 3600},
    "oracle": {"warn": 26 * 3600, "crit": 48 * 3600},
    "mysql": {"warn": 26 * 3600, "crit": 48 * 3600},
    "file": {"warn": 36 * 3600, "crit": 72 * 3600},     # More tolerant
    "virtual": {"warn": 36 * 3600, "crit": 72 * 3600},
    "mail": {"warn": 26 * 3600, "crit": 48 * 3600},
    "scale": {"warn": 36 * 3600, "crit": 72 * 3600},
    "dm": {"warn": 36 * 3600, "crit": 72 * 3600},
    "datacenter": {"warn": 36 * 3600, "crit": 72 * 3600},
    "default": {"warn": 26 * 3600, "crit": 48 * 3600},  # Fallback
}
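
# Hypothetical example of extending the table: a new daily database type could
# be added with an entry such as
#   "postgres": {"warn": 26 * 3600, "crit": 48 * 3600},
# (for it to be detected it must also appear in known_types inside
# extract_backup_type below).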

# Types that should use tolerant error handling
# Failed/Missed backups result in WARNING instead of CRITICAL
TOLERANT_TYPES = {
    'file', 'virtual', 'scale', 'dm', 'datacenter', 'mail'
}

# Known database types (for better classification)
DATABASE_TYPES = {
    'mssql', 'hana', 'db2', 'oracle', 'mysql'
}

# Known virtualization types
VIRTUAL_TYPES = {
    'virtual'
}


# ============================================================================
# Helper Functions
# ============================================================================

def extract_backup_type(node: str) -> str:
    """
    Extracts the backup type from the node name.

    Known types are detected; everything else is marked as "unknown".

    Examples:
        MYSERVER_MSSQL -> mssql
        DATABASE_HANA_01 -> hana
        FILESERVER_FILE -> file
        VM_VIRTUAL_123 -> virtual

    Args:
        node: Node name from TSM

    Returns:
        str: Detected backup type in lowercase
    """
    if '_' not in node:
        return "unknown"

    parts = node.split('_')
    last = parts[-1].upper()

    # Known backup types (fixed list)
    known_types = [
        'MSSQL', 'HANA', 'FILE', 'ORACLE', 'DB2', 'SCALE', 'DM',
        'DATACENTER', 'VIRTUAL', 'MAIL', 'MYSQL'
    ]

    if last in known_types:
        return last.lower()

    # If the last segment is numeric, try the second-to-last one
    if last.isdigit() and len(parts) > 1:
        second_last = parts[-2].upper()
        if second_last in known_types:
            return second_last.lower()

    return "unknown"

def extract_backup_level(schedules: list) -> str:
    """
    Extracts backup level from schedule names.

    Priority: log > full > differential > incremental

    Args:
        schedules: List of schedule names

    Returns:
        str: Detected backup level
    """
    levels_found = set()

    for schedule in schedules:
        schedule_upper = schedule.upper()

        if '_LOG' in schedule_upper or 'LOG' in schedule_upper:
            levels_found.add('log')
        elif '_FULL' in schedule_upper or 'FULL' in schedule_upper:
            levels_found.add('full')
        elif '_INCR' in schedule_upper or 'INCREMENTAL' in schedule_upper:
            levels_found.add('incremental')
        elif '_DIFF' in schedule_upper or 'DIFFERENTIAL' in schedule_upper:
            levels_found.add('differential')

    # Return in priority order
    if 'log' in levels_found:
        return 'log'
    if 'full' in levels_found:
        return 'full'
    if 'differential' in levels_found:
        return 'differential'
    if 'incremental' in levels_found:
        return 'incremental'

    return 'full'  # Default

def extract_frequency(schedules: list) -> str:
    """
    Extracts backup frequency from schedule names.

    Priority: hourly > daily > weekly > monthly

    Args:
        schedules: List of schedule names

    Returns:
        str: Detected frequency
    """
    frequencies_found = set()

    for schedule in schedules:
        schedule_upper = schedule.upper()

        if 'HOURLY' in schedule_upper:
            frequencies_found.add('hourly')
        elif 'DAILY' in schedule_upper:
            frequencies_found.add('daily')
        elif 'WEEKLY' in schedule_upper:
            frequencies_found.add('weekly')
        elif 'MONTHLY' in schedule_upper:
            frequencies_found.add('monthly')
        # Time-based pattern detection (HH-MM-SS_)
        elif re.match(r'^\d{2}-\d{2}-\d{2}_', schedule):
            if '_LOG' in schedule_upper:
                frequencies_found.add('hourly')
            elif schedule.startswith('00-00-00'):
                frequencies_found.add('daily')
            else:
                frequencies_found.add('daily')

    # Return in priority order
    if 'hourly' in frequencies_found:
        return 'hourly'
    if 'daily' in frequencies_found:
        return 'daily'
    if 'weekly' in frequencies_found:
        return 'weekly'
    if 'monthly' in frequencies_found:
        return 'monthly'

    return 'unknown'

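# Illustrative example (hypothetical schedule names): for
# schedules = ["00-00-00_DAILY_FULL", "HOURLY_LOG"] the two helpers above
# return level "log" (log outranks full) and frequency "hourly"
# (hourly outranks daily).

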
def get_error_handling(backup_type: str) -> str:
    """
    Determines error handling strategy based on backup type.

    Tolerant types: Failed backups trigger WARNING instead of CRITICAL
    Strict types: Failed backups trigger CRITICAL

    Args:
        backup_type: Detected backup type

    Returns:
        str: 'tolerant' or 'strict'
    """
    return 'tolerant' if backup_type in TOLERANT_TYPES else 'strict'


def get_backup_category(backup_type: str) -> str:
    """
    Categorizes backup type for additional labeling.

    Categories: database, virtualization, filesystem, application, other

    Args:
        backup_type: Detected backup type

    Returns:
        str: Category name
    """
    if backup_type in DATABASE_TYPES:
        return 'database'
    elif backup_type in VIRTUAL_TYPES:
        return 'virtualization'
    elif backup_type in {'file', 'scale', 'dm', 'datacenter'}:
        return 'filesystem'
    elif backup_type in {'mail', 'exchange'}:
        return 'application'
    else:
        return 'other'


def get_thresholds(backup_type: str, backup_level: str) -> dict:
    """
    Returns type and level-specific thresholds.

    Priority:
    1. If level is 'log', use log thresholds
    2. If type has specific thresholds, use those
    3. Use default thresholds

    Args:
        backup_type: Detected backup type
        backup_level: Detected backup level

    Returns:
        dict: {"warn": seconds, "crit": seconds}
    """
    if backup_level == 'log':
        return THRESHOLDS['log']
    elif backup_type in THRESHOLDS:
        return THRESHOLDS[backup_type]
    else:
        return THRESHOLDS['default']

def calculate_state(statuses: list, last_time: int, backup_type: str,
                    error_handling: str) -> tuple:
    """
    Calculates CheckMK state from backup statuses.

    Logic:
    - At least 1x Completed -> OK
    - Only Pending/Started (recent) -> OK
    - Only Pending/Started (old) -> WARN
    - Failed/Missed + tolerant -> WARN
    - Failed/Missed + strict -> CRIT

    Args:
        statuses: List of backup statuses
        last_time: Timestamp of last backup
        backup_type: Detected backup type
        error_handling: 'tolerant' or 'strict'

    Returns:
        tuple: (State, status_text)
    """
    statuses_lower = [s.lower() for s in statuses]

    # At least one completed backup -> OK
    if "completed" in statuses_lower:
        return (State.OK, "Completed")

    # Only Pending/Started
    only_pending_started = all(s in ["pending", "started"] for s in statuses_lower)
    if only_pending_started:
        if last_time:
            age = int(datetime.now().timestamp()) - last_time
            if age < 2 * 3600:  # 2 hours
                return (State.OK, "Pending/Started")
            else:
                return (State.WARN, "Pending (>2h)")
        else:
            return (State.WARN, "Pending")

    # Failed or Missed backups
    has_failed = any("failed" in s for s in statuses_lower)
    has_missed = "missed" in statuses_lower

    if has_failed or has_missed:
        if error_handling == 'tolerant':
            return (State.WARN, "Failed (partial)")
        else:
            return (State.CRIT, "Failed/Missed")

    return (State.CRIT, "Unknown State")

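# Illustrative behaviour of calculate_state (hypothetical statuses): statuses
# ["Completed", "Failed 12"] yield OK because a single completed run wins;
# statuses of only ["Failed 12"] yield WARN "Failed (partial)" for a tolerant
# type (e.g. FILE) and CRIT "Failed/Missed" for a strict type (e.g. MSSQL).

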
# ============================================================================
# Agent Section Registration
# ============================================================================

def parse_tsm_backups(string_table: StringTable) -> Mapping[str, Any]:
    """
    Parses the tsm_backups agent section.

    Expected format: JSON string from agent plugin

    Args:
        string_table: Raw agent output

    Returns:
        dict: Parsed backup data per node
    """
    if not string_table or not string_table[0]:
        return {}

    try:
        json_str = string_table[0][0]
        return json.loads(json_str)
    except (json.JSONDecodeError, IndexError, KeyError):
        return {}

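# Assumed agent payload (illustrative values; the field names are exactly the
# keys read by the discovery and check functions below):
# <<<tsm_backups>>>
# {"MYSERVER_MSSQL": {"schedules": ["00-00-00_DAILY_FULL"],
#                     "statuses": ["Completed"],
#                     "last": 1714000000,
#                     "count": 1}}

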
agent_section_tsm_backups = AgentSection(
    name="tsm_backups",
    parse_function=parse_tsm_backups,
)


# ============================================================================
# Check Plugin Registration
# ============================================================================

def discover_tsm_backups(section: Mapping[str, Any]) -> DiscoveryResult:
    """
    Discovery function - creates one service per logical node.

    Services are created with dynamic labels:
    - backup_type: Dynamically detected from node name
    - backup_category: database/virtualization/filesystem/application/other
    - backup_system: Always "tsm"
    - frequency: hourly/daily/weekly/monthly/unknown
    - backup_level: log/full/differential/incremental
    - error_handling: tolerant/strict
    - node_name: Original node name

    Args:
        section: Parsed agent data

    Yields:
        Service: CheckMK service objects with labels
    """
    for node in section:
        data = section[node]

        # Extract metadata dynamically
        backup_type = extract_backup_type(node)
        backup_level = extract_backup_level(data["schedules"])
        frequency = extract_frequency(data["schedules"])
        error_handling = get_error_handling(backup_type)
        category = get_backup_category(backup_type)

        # Create service with dynamic labels
        yield Service(
            item=node,
            labels=[
                ServiceLabel("backup_type", backup_type),
                ServiceLabel("backup_category", category),
                ServiceLabel("backup_system", "tsm"),
                ServiceLabel("frequency", frequency),
                ServiceLabel("backup_level", backup_level),
                ServiceLabel("error_handling", error_handling),
                ServiceLabel("node_name", node),
            ]
        )


def check_tsm_backups(item: str, section: Mapping[str, Any]) -> CheckResult:
    """
    Check function - evaluates backup status and generates metrics.

    Checks:
    - Backup completion status
    - Backup age against type-specific thresholds
    - Number of backup jobs

    Args:
        item: Node name (service identifier)
        section: Parsed agent data

    Yields:
        Result: Check results with state
        Metric: Performance metrics
    """
    if item not in section:
        yield Result(
            state=State.UNKNOWN,
            summary=f"Backup {item} not found in data"
        )
        return

    data = section[item]

    # Extract metadata dynamically
    backup_type = extract_backup_type(item)
    backup_level = extract_backup_level(data["schedules"])
    frequency = extract_frequency(data["schedules"])
    error_handling = get_error_handling(backup_type)
    category = get_backup_category(backup_type)

    # Calculate state
    state, status_text = calculate_state(
        data["statuses"],
        data["last"],
        backup_type,
        error_handling
    )

    # Calculate backup age
    if data["last"]:
        age_seconds = int(datetime.now().timestamp()) - data["last"]
        age_txt = render.timespan(age_seconds)
    else:
        age_seconds = 999999
        age_txt = "unknown"

    # Get type-specific thresholds
    thresholds = get_thresholds(backup_type, backup_level)
    warn_seconds = thresholds["warn"]
    crit_seconds = thresholds["crit"]

    # Check age against thresholds (only if backup completed)
    if state == State.OK and age_seconds > crit_seconds:
        state = State.CRIT
    elif state == State.OK and age_seconds > warn_seconds:
        state = State.WARN

    # Build summary
    summary = (
        f"Type={backup_type.upper()} ({category}), "
        f"Level={backup_level.upper()}, "
        f"Freq={frequency}, "
        f"Status={status_text}, "
        f"Last={age_txt}, "
        f"Jobs={data['count']}"
    )

    # Main result
    yield Result(state=state, summary=summary)

    # Detailed information
    if state in [State.WARN, State.CRIT]:
        details = (
            f"Thresholds: WARN={render.timespan(warn_seconds)}, "
            f"CRIT={render.timespan(crit_seconds)}"
        )
        yield Result(state=State.OK, notice=details)

    # Metrics
    yield Metric(
        name="backup_age",
        value=age_seconds,
        levels=(warn_seconds, crit_seconds),
        boundaries=(0, None),
    )

    yield Metric(
        name="backup_jobs",
        value=data["count"],
    )


check_plugin_tsm_backups = CheckPlugin(
    name="tsm_backups",
    service_name="TSM Backup %s",
    discovery_function=discover_tsm_backups,
    check_function=check_tsm_backups,
    sections=["tsm_backups"],
)
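

# ----------------------------------------------------------------------------
# Manual test sketch (assumed workflow; host name and site are placeholders,
# adjust to your environment and CheckMK version):
#   omd su monitoring
#   cmk -vI MYHOST     # rediscover services; expect "TSM Backup <NODE>" items
#   cmk -v MYHOST      # run the checks and show the summary built above
# ----------------------------------------------------------------------------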