Scheduled TSM backups write to a log defined in the schedule's .opt file, typically dsmsched.log. On large file servers with long dsmsched.log retention periods, this file can easily grow beyond 1 gigabyte. Reading the entire dsmsched.log file to determine the success of the last backup will likely exceed the timeout of the Nagios check. To compensate for this, we need to tail the log file and retrieve the summary data from the last backup. In the check below, I do just that. If you pass the name of the schedule log file (you can run multiple schedules on a client, each with a different log file name), the check will look for it in the TSM install directory under the "Program Files" path stored in the environment variable. If no log file name is provided to the check, it will search the registry for the default log filename. If you are running custom install locations, this will need to be modified.
As you follow through the flow of the code, you will see what triggers are passed on to Nagios. They are fairly straightforward:
- If a backup has not completed in 24 hours, a critical alarm is generated
- If a certain number of failed file backups are reported, a warning alarm is generated
- If a backup is still running, a warning alarm is generated
- If a successful backup is detected, a normal return is generated
And remember, just because you have log files saying you have "good backups" that doesn't mean it's true. You need to test restores on a regular basis as a part of your disaster recovery practice. Ultimately, backups are only as good as their ability to be restored.
UPDATE: I have made some improvements to this code here.
# Nagios check for IBM TSM (Tivoli Storage Manager) scheduled backups.
#
# Tails the TSM schedule log instead of reading it completely (the log can be
# very large) and reports the state of the most recent backup.
#
# Parameter:
#   $tsmFile - optional schedule log file name, resolved relative to
#              "%ProgramFiles%\Tivoli\TSM\baclient". When empty (or the literal
#              "$"), the registry is searched for a scheduler service that logs
#              to dsmsched.log.
#
# Exit values: 0 = OK, 1 = WARNING, 2 = CRITICAL (3 = UNKNOWN is defined but unused).
param([string]$tsmFile)

#--------------------------------------------------------------------------------------------------#
# Collects information about the TSM scheduler services registered on $server.
#
#   $server  - name of the computer whose registry is read
#   $tsmInfo - array to which one PSObject per scheduler service is appended
#
# Returns $tsmInfo with one object per auto-start service whose "Parameters"
# registry key holds clientNodeName, errorLog, optionsFile and scheduleLog values
# and whose service status is Running or Stopped. Nothing is appended unless
# SOFTWARE\IBM\ADSM\CurrentVersion exists with more than two subkeys.
Function Get-TSMInfo($server, $tsmInfo)
{
    $tsmRegistryPath = "SOFTWARE\IBM\ADSM\CurrentVersion"
    $servicesPath = "SYSTEM\CurrentControlSet\Services"
    $hKey = [Microsoft.Win32.RegistryHive]::LocalMachine
    $baseKey = [Microsoft.Win32.RegistryKey]::OpenRemoteBaseKey($hKey, $server)

    try
    {
        $tsmRootKey = $baseKey.OpenSubKey($tsmRegistryPath)
        if($null -eq $tsmRootKey) { return $tsmInfo }
        $tsmSubKeyCount = $tsmRootKey.SubKeyCount
        $tsmRootKey.Close()
        if($tsmSubKeyCount -le 2) { return $tsmInfo }

        $tsmVersion = $null
        $tsmPath = $null
        $backupClientKey = $baseKey.OpenSubKey("$tsmRegistryPath\BackupClient")
        if($null -ne $backupClientKey)
        {
            $tsmVersion = $backupClientKey.GetValue("PtfLevel")
            $tsmPath = $backupClientKey.GetValue("Path")
            $backupClientKey.Close()
        }
        if($tsmVersion -eq "" -or $tsmPath -eq "") { return $tsmInfo }

        $servicesRootKey = $baseKey.OpenSubKey($servicesPath)
        if($null -eq $servicesRootKey) { return $tsmInfo }
        $serviceKeyNames = $servicesRootKey.GetSubKeyNames()
        $servicesRootKey.Close()

        foreach($keyValue in $serviceKeyNames)
        {
            # A service key that cannot be opened is skipped rather than aborting the scan.
            $serviceKey = $null
            try { $serviceKey = $baseKey.OpenSubKey("$servicesPath\$keyValue") } catch { continue }
            if($null -eq $serviceKey) { continue }

            $clientNodeName = ""
            $errorLog = ""
            $optionsFile = ""
            $scheduleLog = ""
            try
            {
                # Start = 2 marks an automatically started service.
                if(-not ($serviceKey.GetValue("Start") -eq "2")) { continue }

                $parametersKey = $null
                try { $parametersKey = $serviceKey.OpenSubKey("Parameters") } catch { continue }
                if($null -eq $parametersKey) { continue }
                try
                {
                    $clientNodeName = $parametersKey.GetValue("clientNodeName", "")
                    $errorLog = $parametersKey.GetValue("errorLog", "")
                    $optionsFile = $parametersKey.GetValue("optionsFile", "")
                    $scheduleLog = $parametersKey.GetValue("scheduleLog", "")
                }
                finally
                {
                    $parametersKey.Close()
                }
            }
            finally
            {
                $serviceKey.Close()
            }

            if($clientNodeName -eq "" -or $errorLog -eq "" -or $optionsFile -eq "" -or $scheduleLog -eq "") { continue }

            # Turn "C:\path\dsm.opt" into "\\server\C$\path\dsm.opt".
            $optionsFileUncPath = ("\\$server\" + ($optionsFile.SubString(0,1) + '$' + $optionsFile.SubString(2)))
            $tsmServer = "FAILED"
            $tsmClientPort = "FAILED"
            if(Test-Path -path $optionsFileUncPath)
            {
                foreach($line in (Get-Content -path $optionsFileUncPath))
                {
                    if($line -match "TCPSERVERADDRESS") { $tsmServer = ($line -replace "TCPSERVERADDRESS","").Trim() }
                    if($line -match "TCPCLIENTPORT") { $tsmClientPort = ($line -replace "TCPCLIENTPORT","").Trim() }
                }
            }

            # The registry key name is the service name; the display name is still
            # accepted so installations that relied on it keep working.
            $serviceStatus = $null
            foreach($service in Get-Service)
            {
                if($service.Name -eq $keyValue -or $service.DisplayName -eq $keyValue)
                {
                    $serviceStatus = $service.Status
                    break
                }
            }

            if($serviceStatus -eq "Running" -or $serviceStatus -eq "Stopped")
            {
                $clientNodeInformation = New-Object -typeName PSObject
                $properties = @(
                    @("server", $server),
                    @("tsmVersion", $tsmVersion),
                    @("installPath", $tsmPath),
                    @("tsmServer", $tsmServer),
                    @("tsmClientPort", $tsmClientPort),
                    @("scheduleName", $keyValue),
                    @("clientNodeName", $clientNodeName),
                    @("optionsFile", $optionsFile),
                    @("scheduleLog", $scheduleLog),
                    @("errorLog", $errorLog),
                    @("status", $serviceStatus)
                )
                foreach($property in $properties)
                {
                    Add-Member -inputObject $clientNodeInformation -type NoteProperty -name $property[0] -value $property[1]
                }
                $tsmInfo += $clientNodeInformation
            }
        }
    }
    finally
    {
        $baseKey.Close()
    }

    return $tsmInfo
}

#--------------------------------------------------------------------------------------------------#
# Returns the last $totalNumberOfLines complete lines of $fileName without reading
# the whole file. The file is read backwards from its end, $byteChunk bytes at a
# time, decoded as ASCII and split on CRLF.
#
#   $fileName           - path of the file to tail
#   $totalNumberOfLines - number of lines wanted (values below 1 are treated as 1)
#   $byteChunk          - bytes read per step (values of 0 or less become 10240)
#
# The text after the final CRLF (empty, or a line still being written) is dropped
# whenever more than one line was found. If the file does not exist, the result
# holds the single entry "[ERROR] <fileName> not found".
Function Read-EndOfFileByByteChunk($fileName,$totalNumberOfLines,$byteChunk)
{
    if($totalNumberOfLines -lt 1) { $totalNumberOfLines = 1 }
    if($byteChunk -le 0) { $byteChunk = 10240 }

    $linesOfText = New-Object System.Collections.ArrayList

    if(-not [System.IO.File]::Exists($fileName))
    {
        $linesOfText.Add("[ERROR] $fileName not found") | Out-Null
        return $linesOfText
    }

    $asciiEncoding = New-Object System.Text.ASCIIEncoding
    $tailText = ""
    $position = 0
    # FileShare.ReadWrite lets the file be opened while another process writes to it.
    $fileStream = New-Object System.IO.FileStream($fileName,[System.IO.FileMode]::Open,[System.IO.FileAccess]::Read,[System.IO.FileShare]::ReadWrite)
    try
    {
        $position = $fileStream.Length
        do
        {
            # Every pass moves $position strictly towards the start of the file, so the
            # loop always terminates - even when one line is longer than $byteChunk.
            $readSize = [int][Math]::Min([long]$byteChunk, [long]$position)
            $position -= $readSize
            [byte[]] $buffer = New-Object byte[] $readSize
            $fileStream.Seek($position, [System.IO.SeekOrigin]::Begin) | Out-Null

            # Stream.Read may return fewer bytes than requested; keep reading until full.
            $bytesInBuffer = 0
            while($bytesInBuffer -lt $readSize)
            {
                $bytesJustRead = $fileStream.Read($buffer, $bytesInBuffer, ($readSize - $bytesInBuffer))
                if($bytesJustRead -le 0) { break }
                $bytesInBuffer += $bytesJustRead
            }

            $tailText = $asciiEncoding.GetString($buffer, 0, $bytesInBuffer) + $tailText
            $lineBreaks = ([System.Text.RegularExpressions.Regex]::Matches($tailText, "\r\n")).Count
        }
        # N wanted lines need N+1 line breaks: one extra for the (possibly partial)
        # first element and the trailing element after the last CRLF is discarded.
        while($position -gt 0 -and $lineBreaks -le $totalNumberOfLines)
    }
    finally
    {
        $fileStream.Close()
    }

    $linesOfText.AddRange(([System.Text.RegularExpressions.Regex]::Split($tailText,"\r\n")))

    # When the start of the file was not reached, the first element may begin mid-line.
    if($position -gt 0 -and $linesOfText.count -gt 1) { $linesOfText.RemoveAt(0) }

    if($linesOfText.count -gt 1) { $linesOfText.RemoveAt($linesOfText.count-1) }

    $deltaLines = ($linesOfText.count - $totalNumberOfLines)
    if($deltaLines -gt 0) { $linesOfText.RemoveRange(0, $deltaLines) }

    return $linesOfText
}

#--------------------------------------------------------------------------------------------------#
Set-Variable -name returnNormal -option Constant -value 0
Set-Variable -name returnWarning -option Constant -value 1
Set-Variable -name returnError -option Constant -value 2
Set-Variable -name returnUnknown -option Constant -value 3
Set-Variable -name computerFqdn -option Constant -value (([System.Net.NetworkInformation.IPGlobalProperties]::GetIPGlobalProperties()).HostName + "." + ([System.Net.NetworkInformation.IPGlobalProperties]::GetIPGlobalProperties()).DomainName)
Set-Variable -name backupWindow -option Constant -value 24 # in Hours
Set-Variable -name deafService -option Constant -value 36 # in Hours
Set-Variable -name enableRestarts -option Constant -value $true # Allow the check to (re)start the TSM scheduler service
Set-Variable -name lookBack -option Constant -value 250 # Number of lines to tail
Set-Variable -name maximumFailures -option Constant -value 5 # Your tolerance for failed files
Set-Variable -name successfulBackup -value $false
Set-Variable -name todaysBackupFound -value $false
Set-Variable -name backupStillRunning -value $false
Set-Variable -name completionTime -value $null
Set-Variable -name totalFailed -value 0
Set-Variable -name logEntries -value @()
Set-Variable -name exitMessage -value "Massive Script Failure"
Set-Variable -name exitValue -value $returnError
#--------------------------------------------------------------------------------------------------#

# An empty argument or a literal "$" (presumably an unexpanded argument macro from the
# monitoring agent - confirm against your agent configuration) means "no file name given".
$logNameProvided = -not ($tsmFile -eq "$" -or (!$tsmFile))
$tsmLogFile = $null

if($logNameProvided)
{
    $tsmLogFile = ($env:programfiles + "\Tivoli\TSM\baclient\$tsmFile")
}
else
{
    foreach($tsmInstance in @(Get-TSMInfo $computerFqdn @()))
    {
        if($tsmInstance.scheduleLog -match "\\dsmsched.log")
        {
            $tsmLogFile = $tsmInstance.scheduleLog
            break
        }
    }
}

# Only the status message is written to the output: Nagios displays the first output line.
if(-not $tsmLogFile)
{
    $exitMessage = "Unable to locate a TSM schedule log: no scheduler writing to dsmsched.log was found in the registry"
    $exitValue = $returnError
}
elseif(Test-Path -path $tsmLogFile)
{
    $now = Get-Date
    $lastLine = $null
    $lastLineTime = $null
    $logEntries = Read-EndOfFileByByteChunk $tsmLogFile $lookBack 1280

    foreach($logEntry in $logEntries)
    {
        # Entries of interest start with a 19 character timestamp.
        if($logEntry.Length -lt 19) { continue }
        $timeStamp = $logEntry.SubString(0,19)
        if(-not ($timeStamp -as [DateTime])) { continue }
        $entryTime = Get-Date $timeStamp

        if(($now - $entryTime).TotalHours -le $backupWindow)
        {
            if($logEntry -match "Scheduled event '(.*?)' completed successfully.")
            {
                $successfulBackup = $true
                $completionTime = $entryTime
            }
            if($logEntry -match "Total number of objects failed:")
            {
                [int]$totalFailed = ($logEntry -Replace "(.*)Total number of objects failed:", "").Trim()
            }
            $todaysBackupFound = $true
        }

        $lastLine = $logEntry
        $lastLineTime = $entryTime
    }

    # Recent activity without a completion message is treated as a backup in progress.
    if($successfulBackup -eq $false -and $todaysBackupFound -eq $true)
    {
        if(($now - $lastLineTime).TotalMinutes -le 15) { $backupStillRunning = $true }
    }

    if($null -eq $lastLine)
    {
        # No timestamped line in the tail at all (empty or unexpected log content).
        $exitMessage = "Unable to find any timestamped entries in the last $lookBack lines of $tsmLogFile"
        $exitValue = $returnError
    }
    elseif($todaysBackupFound -eq $false)
    {
        $timeSinceLastContact = ($now - $lastLineTime).TotalHours
        if($timeSinceLastContact -ge $deafService -and $enableRestarts -eq $true)
        {
            $schedulerFound = $false
            foreach($tsmInstance in @(Get-TSMInfo $computerFqdn @()))
            {
                # Only touch the scheduler that owns the log being checked.
                if($logNameProvided)
                {
                    $ownsLog = ($tsmInstance.scheduleLog -match [System.Text.RegularExpressions.Regex]::Escape($tsmFile))
                }
                else
                {
                    $ownsLog = ($tsmInstance.scheduleLog -eq $tsmLogFile)
                }
                if(-not $ownsLog) { continue }

                if($tsmInstance.status -eq "Running")
                {
                    Restart-Service -name $tsmInstance.scheduleName
                    $exitMessage = ("TSM Scheduler `"" + $tsmInstance.scheduleName + "`" has not contacted the TSM server in $deafService hours. Restarting service.")
                }
                else
                {
                    Start-Service -name $tsmInstance.scheduleName
                    $exitMessage = ("TSM Scheduler `"" + $tsmInstance.scheduleName + "`" was stopped and hasn't contacted the TSM server in $deafService hours. Starting service.")
                }
                $schedulerFound = $true
                $exitValue = $returnError
                break
            }

            if($schedulerFound -eq $false)
            {
                $exitMessage = ("Unable to find data in the last $backupWindow hours in $tsmLogFile and the client hasn't contacted the TSM Server in $timeSinceLastContact hours. Last Backup log date: " + $lastLineTime)
                $exitValue = $returnError
            }
        }
        else
        {
            $exitMessage = ("Unable to find data in the last $backupWindow hours in $tsmLogFile. Last Backup log date: " + $lastLineTime)
            $exitValue = $returnError
        }
    }
    elseif($totalFailed -ge $maximumFailures)
    {
        $exitMessage = "Backup completed with $totalFailed failed objects."
        $exitValue = $returnWarning
    }
    elseif($successfulBackup -eq $true)
    {
        $exitMessage = "Backup completed successfully: $completionTime"
        $exitValue = $returnNormal
    }
    elseif($backupStillRunning -eq $true)
    {
        $exitMessage = ("Backup still running! Please allow to complete. Current status: " + $lastLine -Replace "\\","/")
        $exitValue = $returnWarning
    }
    else
    {
        $exitMessage = ("Unable to find a successful backup. Last status: " + $lastLine -Replace "\\","/")
        $exitValue = $returnError
    }
}
else
{
    $exitMessage = "Unable to locate $tsmLogFile"
    $exitValue = $returnError
}

Write-Host $exitMessage
$Host.SetShouldExit($exitValue)
how to use this with Nagios? Please advice me.
ReplyDelete