Scheduled TSM backups write to a log defined in the schedule's .opt file, typically dsmsched.log. On large file servers with long dsmsched.log retention periods, this file can easily grow beyond 1 gigabyte. Reading the entire dsmsched.log file to determine the success of the last backup will likely exceed the timeout of the Nagios check. To compensate for this, we need to tail the log file and retrieve the summary data from the last backup. In the check below, I do just that. If you pass the name of the schedule log file (you can run multiple schedules on a client, each with a different log file name), the check will look for it in the TSM install directory under the "Program Files" path stored in the environment variable. If no log file name is provided to the check, it will search the registry for the default log file name. If you are running custom install locations, this will need to be modified.
As you follow through the flow of the code, you will see what triggers are passed on to Nagios. They are fairly straightforward:
- If a backup has not completed in 24 hours, a critical alarm is generated
- If a certain number of failed file backups are reported, a warning alarm is generated
- If a backup is still running, a warning alarm is generated
- If a successful backup is detected, a normal return is generated
And remember, just because you have log files saying you have "good backups" that doesn't mean it's true. You need to test restores on a regular basis as a part of your disaster recovery practice. Ultimately, backups are only as good as their ability to be restored.
UPDATE: I have made some improvements to this code here.
# $tsmFile: file name of the TSM schedule log to inspect. When a name is given it is
# looked up under "%ProgramFiles%\Tivoli\TSM\baclient"; when it is empty or the
# literal "$", the script searches the registry for a scheduler whose log is dsmsched.log.
param([string]$tsmFile)
#--------------------------------------------------------------------------------------------------#
# Get-TSMInfo: collects one record per TSM scheduler service found in the registry of $server.
#   $server  - machine whose HKEY_LOCAL_MACHINE hive is opened remotely and whose
#              administrative share is used to read the scheduler's options file.
#   $tsmInfo - collection the records are appended to (callers in this file pass @()).
# Returns $tsmInfo with one PSObject appended per scheduler; each object carries the
# properties server, tsmVersion, installPath, tsmServer, tsmClientPort, scheduleName,
# clientNodeName, optionsFile, scheduleLog, errorLog and status.
Function Get-TSMInfo($server, $tsmInfo) {
$key = "SOFTWARE"
$hKey = [Microsoft.Win32.RegistryHive]::LocalMachine
$baseKey = [Microsoft.Win32.RegistryKey]::OpenRemoteBaseKey($hKey, $server)
# Walk the subkeys of HKLM\SOFTWARE looking for the IBM key.
foreach($rootKeyValue in ($baseKey.OpenSubKey($key)).GetSubKeyNames()) {
# Only continue when IBM\ADSM\CurrentVersion has more than two subkeys.
# NOTE(review): presumably this distinguishes a full backup-client install - confirm.
if($rootKeyValue -eq "IBM" -and ($baseKey.OpenSubKey("$key\IBM\ADSM\CurrentVersion")).SubKeyCount -gt 2) {
$tsmVersion = ($baseKey.OpenSubKey("$key\IBM\ADSM\CurrentVersion\BackupClient")).GetValue("PtfLevel")
$tsmPath = ($baseKey.OpenSubKey("$key\IBM\ADSM\CurrentVersion\BackupClient")).GetValue("Path")
# $key is re-pointed at the services branch from here on. The outer loop keeps
# running with this new value, but its registry lookup is only evaluated for a
# subkey named "IBM", because -and short-circuits on the name comparison.
$key = "SYSTEM\CurrentControlSet\Services"
if($tsmVersion -ne "" -and $tsmPath -ne "") {
# Inspect every service key and every subkey beneath it.
foreach($keyValue in ($baseKey.OpenSubKey($key)).GetSubKeyNames()) {
foreach($subKeyValue in ($baseKey.OpenSubKey("$key\$keyValue")).GetSubKeyNames()) {
# Reset per subkey so values from a previous service never leak into this one.
$clientNodeName = ""
$errorLog = ""
$optionsFile = ""
$scheduleLog = ""
# Start = 2 is the automatic start type in the Windows service configuration.
if(($baseKey.OpenSubKey("$key\$keyValue").GetValue("Start")) -eq "2") {
if($subKeyValue -eq "Parameters") {
# Pick the four TSM scheduler settings out of the Parameters values.
foreach($value in ($baseKey.OpenSubKey("$key\$keyValue\Parameters")).GetValueNames()) {
if($value -eq "clientNodeName") {
$clientNodeName = ($baseKey.OpenSubKey("$key\$keyValue\Parameters")).GetValue($value)
} elseif($value -eq "errorLog") {
$errorLog = ($baseKey.OpenSubKey("$key\$keyValue\Parameters")).GetValue($value)
} elseif($value -eq "optionsFile") {
$optionsFile = ($baseKey.OpenSubKey("$key\$keyValue\Parameters")).GetValue($value)
} elseif($value -eq "scheduleLog") {
$scheduleLog = ($baseKey.OpenSubKey("$key\$keyValue\Parameters")).GetValue($value)
}
}
}
}
# A service counts as a TSM scheduler only when all four settings were present.
if($clientNodeName -ne "" -and $errorLog -ne "" -and $optionsFile -ne "" -and $scheduleLog -ne "") {
# Turn the local path "X:\..." into the admin-share path "\\server\X$\...".
$optionsFileUncPath = ("\\$server\" + ($optionsFile.SubString(0,1) + "$" + $optionsFile.SubString(2)))
# "FAILED" is reported when the options file is unreadable or lacks the option.
$tsmServer = "FAILED"
$tsmClientPort = "FAILED"
if(Test-Path -path $optionsFileUncPath) {
foreach($line in (Get-Content -path $optionsFileUncPath)){
# NOTE(review): the match is not anchored, so any line containing the option
# name (a commented-out one included) is picked up - confirm this is acceptable.
if($line -match "TCPSERVERADDRESS") {
$tsmServer = ($line -replace "TCPSERVERADDRESS","").Trim()
}
if($line -match "TCPCLIENTPORT") {
$tsmClientPort = ($line -replace "TCPCLIENTPORT","").Trim()
}
}
}
$serviceStatus = $null
# NOTE(review): Get-Service is called without a computer name, so it lists the
# services of the machine running the script, and it compares the display name
# with the registry key name - confirm both are intended.
foreach($service in Get-Service) {
if($service.DisplayName -eq $keyValue) {
$serviceStatus = $service.Status
break
}
}
# Services in any other state (or not found at all) are left out of the result.
if($serviceStatus -eq "Running" -or $serviceStatus -eq "Stopped") {
$clientNodeInformation = New-Object -typeName PSObject
Add-Member -inputObject $clientNodeInformation -type NoteProperty -name "server" -value $server
Add-Member -inputObject $clientNodeInformation -type NoteProperty -name "tsmVersion" -value $tsmVersion
Add-Member -inputObject $clientNodeInformation -type NoteProperty -name "installPath" -value $tsmPath
Add-Member -inputObject $clientNodeInformation -type NoteProperty -name "tsmServer" -value $tsmServer
Add-Member -inputObject $clientNodeInformation -type NoteProperty -name "tsmClientPort" -value $tsmClientPort
Add-Member -inputObject $clientNodeInformation -type NoteProperty -name "scheduleName" -value $keyValue
Add-Member -inputObject $clientNodeInformation -type NoteProperty -name "clientNodeName" -value $clientNodeName
Add-Member -inputObject $clientNodeInformation -type NoteProperty -name "optionsFile" -value $optionsFile
Add-Member -inputObject $clientNodeInformation -type NoteProperty -name "scheduleLog" -value $scheduleLog
Add-Member -inputObject $clientNodeInformation -type NoteProperty -name "errorLog" -value $errorLog
Add-Member -inputObject $clientNodeInformation -type NoteProperty -name "status" -value $serviceStatus
$tsmInfo += $clientNodeInformation
}
}
}
}
}
}
}
return $tsmInfo
}
#--------------------------------------------------------------------------------------------------#
# Read-EndOfFileByByteChunk: returns the last lines of a text file without reading the
# whole file, by walking backwards from the end in fixed-size byte chunks.
#   $fileName           - path of the file to tail.
#   $totalNumberOfLines - maximum number of lines to return (values below 1 become 1).
#   $byteChunk          - bytes to read per step (values of 0 or less become 10240).
# Returns an ArrayList of strings. Lines are split on CR+LF and decoded as ASCII. The
# final element of the split (the empty string after a trailing CR+LF, or a line that
# is not yet terminated) is dropped whenever more than one element was read. When the
# file does not exist the list holds the single entry "[ERROR] <file> not found".
Function Read-EndOfFileByByteChunk($fileName,$totalNumberOfLines,$byteChunk) {
    if($totalNumberOfLines -lt 1) { $totalNumberOfLines = 1 }
    if($byteChunk -le 0) { $byteChunk = 10240 }
    $linesOfText = New-Object System.Collections.ArrayList
    if(-not [System.IO.File]::Exists($fileName)) {
        $linesOfText.Add("[ERROR] $fileName not found") | Out-Null
        return $linesOfText
    }

    # ReadWrite sharing lets the file be opened while another process is still writing to it.
    $fileStream = New-Object System.IO.FileStream($fileName,[System.IO.FileMode]::Open,[System.IO.FileAccess]::Read,[System.IO.FileShare]::ReadWrite)
    try {
        $asciiEncoding = New-Object System.Text.ASCIIEncoding
        # Snapshot the length once and seek from the beginning, so the offsets stay
        # valid even if the file grows while it is being read.
        [long]$unreadBytes = $fileStream.Length
        $tailText = ""
        $separatorsSeen = 0
        # N complete lines sit between N + 1 separators once the (possibly partial)
        # first element and the trailing element are discarded.
        $separatorsNeeded = [int]$totalNumberOfLines + 1

        while($unreadBytes -gt 0 -and $separatorsSeen -lt $separatorsNeeded) {
            [int]$readSize = [Math]::Min([long]$byteChunk, $unreadBytes)
            $unreadBytes -= $readSize
            [byte[]]$bytesRead = New-Object byte[] $readSize
            $fileStream.Seek($unreadBytes, [System.IO.SeekOrigin]::Begin) | Out-Null

            # Stream.Read may return fewer bytes than requested, so keep reading until
            # the chunk is full or the stream reports that nothing is left.
            $filled = 0
            while($filled -lt $readSize) {
                $count = $fileStream.Read($bytesRead, $filled, $readSize - $filled)
                if($count -le 0) { break }
                $filled += $count
            }
            $chunkText = $asciiEncoding.GetString($bytesRead, 0, $filled)

            # Count the separators contributed by this chunk. One character of the text
            # already read is appended so that a CR+LF split across the chunk boundary
            # is counted exactly once.
            $probe = $chunkText
            if($tailText.Length -gt 0) { $probe += $tailText.Substring(0,1) }
            $separatorsSeen += ([System.Text.RegularExpressions.Regex]::Matches($probe,"\r\n")).Count

            $tailText = $chunkText + $tailText
        }

        $linesOfText.AddRange(([System.Text.RegularExpressions.Regex]::Split($tailText,"\r\n")))
        # If the start of the file was not reached, the first element may be the tail
        # end of a longer line, so it is discarded.
        if($unreadBytes -gt 0) {
            $linesOfText.RemoveAt(0)
        }
    } finally {
        $fileStream.Close()
    }

    if($linesOfText.count -gt 1) {
        $linesOfText.RemoveAt($linesOfText.count-1)
    }
    $deltaLines = ($linesOfText.count - $totalNumberOfLines)
    if($deltaLines -gt 0) {
        $linesOfText.RemoveRange(0, $deltaLines)
    }
    return $linesOfText
}
#--------------------------------------------------------------------------------------------------#
# Nagios plugin return codes.
Set-Variable -name returnNormal -option Constant -value 0
Set-Variable -name returnWarning -option Constant -value 1
Set-Variable -name returnError -option Constant -value 2
Set-Variable -name returnUnknown -option Constant -value 3
# Fully qualified name of this machine. The domain suffix is appended only when one
# is configured, so a machine without a DNS domain does not end up as "host.".
$ipGlobalProperties = [System.Net.NetworkInformation.IPGlobalProperties]::GetIPGlobalProperties()
$localFqdn = $ipGlobalProperties.HostName
if($ipGlobalProperties.DomainName) {
    $localFqdn = $localFqdn + "." + $ipGlobalProperties.DomainName
}
Set-Variable -name computerFqdn -option Constant -value $localFqdn
# Tunables.
Set-Variable -name backupWindow -option Constant -value 24 # in Hours; log entries newer than this count as the current backup
Set-Variable -name deafService -option Constant -value 36 # in Hours; log silence after which the scheduler service is (re)started
Set-Variable -name enableRestarts -option Constant -value $true # Allow the check to (re)start the TSM scheduler service once it has been silent for $deafService hours
Set-Variable -name lookBack -option Constant -value 250 # Number of lines to tail
Set-Variable -name maximumFailures -option Constant -value 5 # Your tolerance for failed files
# Mutable state filled in while the log tail is analysed.
Set-Variable -name successfulBackup -value $false
Set-Variable -name todaysBackupFound -value $false
Set-Variable -name backupStillRunning -value $false
Set-Variable -name completionTime -value $null
Set-Variable -name totalFailed -value 0
Set-Variable -name logEntries -value @()
# Defaults reported if the script dies before reaching a verdict.
Set-Variable -name exitMessage -value "Massive Script Failure"
Set-Variable -name exitValue -value $returnError
#--------------------------------------------------------------------------------------------------#
# Main check: locate the schedule log, tail it, and translate what the last backup did
# into a Nagios status line and return code.
#
# Only the final status message is written to the output, because Nagios uses the
# first line a plugin prints as the status text.
$tsmInfo = $null
$tsmLogFile = $null
$lastLine = $null
$lastLineTime = $null

# Step 1: work out which log to read and how to recognise its scheduler later on.
if($tsmFile -eq '$' -or (!$tsmFile)) {
    # No usable argument: take the first registered scheduler that logs to dsmsched.log.
    $logPattern = "\\dsmsched.log"
    $tsmInfo = @(Get-TSMInfo $computerFqdn @())
    foreach($tsmInstance in $tsmInfo) {
        if($tsmInstance.scheduleLog -match $logPattern) {
            $tsmLogFile = $tsmInstance.scheduleLog
            break
        }
    }
} else {
    # The file name is escaped so that characters such as "." or "(" are matched
    # literally instead of being interpreted as a regular expression.
    $logPattern = [regex]::Escape($tsmFile)
    $tsmLogFile = ($env:programfiles + "\Tivoli\TSM\baclient\$tsmFile")
}

if($tsmLogFile -and (Test-Path -path $tsmLogFile)) {
    $now = Get-Date
    # Step 2: scan the tail of the log. Only lines that start with a 19-character
    # timestamp are considered.
    $logEntries = @(Read-EndOfFileByByteChunk $tsmLogFile $lookBack 1280)
    foreach($logEntry in $logEntries) {
        if($logEntry.Length -lt 19) { continue }
        $timeStamp = $logEntry.SubString(0,19)
        if(-not ($timeStamp -as [DateTime])) { continue }
        $entryTime = Get-Date $timeStamp
        if(($now - $entryTime).TotalHours -le $backupWindow) {
            if($logEntry -match "Scheduled event '(.*?)' completed successfully.") {
                $successfulBackup = $true
                $completionTime = $entryTime
            }
            if($logEntry -match "Total number of objects failed:") {
                [int]$totalFailed = ($logEntry -Replace "(.*)Total number of objects failed:", "").Trim()
            }
            $todaysBackupFound = $true
        }
        # Remember the newest dated line; it is reported when no verdict is found.
        $lastLine = $logEntry
        $lastLineTime = $entryTime
    }

    # Recent activity without a completion message means the backup may still be running.
    if($successfulBackup -eq $false -and $todaysBackupFound -eq $true) {
        if(($now - $lastLineTime).TotalMinutes -le 15) {
            $backupStillRunning = $true
        }
    }

    # Step 3: pick the verdict.
    if($null -eq $lastLine) {
        # Nothing in the tail carried a timestamp, so there is no date to reason about.
        $exitMessage = "Unable to find any time-stamped entries in the last $lookBack lines of $tsmLogFile"
        $exitValue = $returnError
    } elseif($todaysBackupFound -eq $false) {
        $timeSinceLastContact = ($now - $lastLineTime).TotalHours
        if($timeSinceLastContact -ge $deafService -and $enableRestarts -eq $true) {
            # The scheduler has been silent for too long: try to (re)start its service.
            # The registry scan from step 1 is reused when it already ran.
            if($null -eq $tsmInfo) {
                $tsmInfo = @(Get-TSMInfo $computerFqdn @())
            }
            $schedulerFound = $false
            foreach($tsmInstance in $tsmInfo) {
                if($tsmInstance.scheduleLog -match $logPattern) {
                    if($tsmInstance.status -eq "Running") {
                        Restart-Service -name $tsmInstance.scheduleName
                        $exitMessage = ("TSM Scheduler `"" + $tsmInstance.scheduleName + "`" has not contacted the TSM server in $deafService hours. Restarting service.")
                    } else {
                        Start-Service -name $tsmInstance.scheduleName
                        $exitMessage = ("TSM Scheduler `"" + $tsmInstance.scheduleName + "`" was stopped and hasn't contacted the TSM server in $deafService hours. Starting service.")
                    }
                    $schedulerFound = $true
                    $exitValue = $returnError
                    break
                }
            }
            if($schedulerFound -eq $false) {
                $exitMessage = ("Unable to find data in the last $backupWindow hours in $tsmLogFile and the client hasn't contacted the TSM Server in $timeSinceLastContact hours. Last Backup log date: " + $lastLineTime)
                $exitValue = $returnError
            }
        } else {
            $exitMessage = ("Unable to find data in the last $backupWindow hours in $tsmLogFile. Last Backup log date: " + $lastLineTime)
            $exitValue = $returnError
        }
    } elseif($totalFailed -ge $maximumFailures) {
        $exitMessage = "Backup completed with $totalFailed failed objects."
        $exitValue = $returnWarning
    } elseif($successfulBackup -eq $true) {
        $exitMessage = "Backup completed successfully: $completionTime"
        $exitValue = $returnNormal
    } elseif($backupStillRunning -eq $true) {
        # Backslashes are turned into forward slashes in the reported line.
        $exitMessage = ("Backup still running! Please allow to complete. Current status: " + $lastLine -Replace "\\","/")
        $exitValue = $returnWarning
    } else {
        $exitMessage = ("Unable to find a successful backup. Last status: " + $lastLine -Replace "\\","/")
        $exitValue = $returnError
    }
} elseif(-not $tsmLogFile) {
    # The registry search found no scheduler that logs to dsmsched.log.
    $exitMessage = "Unable to locate a TSM schedule log: no scheduler logging to dsmsched.log was found in the registry"
    $exitValue = $returnError
} else {
    $exitMessage = "Unable to locate $tsmLogFile"
    $exitValue = $returnError
}
Write-Host $exitMessage
$Host.SetShouldExit($exitValue)
How do I use this with Nagios? Please advise me.
ReplyDelete