#!/usr/bin/perl -w
+########################################################
+# plugin script for Nagios to monitor Lustre #
+# #
+# Sergey Yurevich #
+########################################################
+
use strict;
use warnings;
use IO::Handle;
use Time::Local;
use Time::localtime;
+use Getopt::Std;
# the following is for the status server
# to communicate with Nagios plugin
use IO::Socket;
use IO::Select;
+# Command-line switches filled in by Getopt::Std::getopts():
+#   $opt_s - also check the disk space used on Lustre
+#   $opt_h - print the help text and exit
+our ($opt_s, $opt_h);
+
+# getopts() returns false when it meets a switch it does not know.
+# Stop with the usage text instead of starting the monitor with a
+# mistyped option. Exit code 3 is the Nagios convention for UNKNOWN.
+unless( getopts('hs') ){
+    &showHelp();
+    exit(3);
+}
+
+if($opt_h){
+    &showHelp();
+    exit(0);
+}
+
my $status : shared = "OK";
my $time_ls : shared = &getTime();
-my $sleep_time : shared = 120;
+my $sleep_time : shared = 600; # 10 minutes
our $server_port = '50502';
our $protocol = 'tcp';
while (1)
{
- if( abs($time_du - $time_ls) > 60*60*24 ){
+ if( $opt_s && abs($time_du - $time_ls) > 60*60*24 ){
$lustre_size = &getTotalSize();
$time_du = &getTime();
}
- my $line =`ls -d /lustre_alpha/hades/beam/sep08`;
+ my $line =`ls -d /lustre_alpha/hades/user`;
chomp($line);
$time_ls = &getTime();
- if ($line eq "/lustre_alpha/hades/beam/sep08")
+ if ($line eq "/lustre_alpha/hades/user")
{
- if ($lustre_size > 30) {
- $status = "CRITICAL - Lustre hades/beam space used: $lustre_size TB";
- }
- elsif ($lustre_size > 25) {
- $status = "WARNING - Lustre hades/beam space used: $lustre_size TB";
+ if($opt_s){
+ $status = "OK - Lustre space used: $lustre_size TB";
+ &checkLustreSize($lustre_size);
}
- else {
- $status = "OK - Lustre hades/beam space used: $lustre_size TB";
+ else{
+ $status = "OK - Lustre is mounted";
}
}
else {
- $status = "CRITICAL - failure of Lustre file system!";
+ $status = "CRITICAL - Lustre mount failure!";
}
- sleep $sleep_time; # 2 minutes
+ sleep $sleep_time; # 10 minutes
}
}
IO::Socket::INET->new( LocalPort => $server_port,
Proto => 'tcp',
Listen => SOMAXCONN ) ))
- {
+ {
print "ERROR: Cannot start status server!\n";
}
while(1) {
- # wait 5 seconds for connections
+ #--- wait 5 seconds for connections
while (my @file_handles = $selector->can_read( 5 )) {
foreach my $file_handle (@file_handles) {
if($file_handle == $server_socket) {
- # create a new socket for this transaction
+ #--- create a new socket for this transaction
unless (defined( $client_socket = $server_socket->accept() ))
{
print "ERROR: Cannot open socket to send status!\n";
-#--- get total size of lustre beam dir
+#--- get total size of lustre hades dir
my $totsize = 0;
- my @size = `du -cms /lustre_alpha/hades/beam`;
+ my @size = `du -cms /lustre_alpha/hades`;
foreach my $line (@size){
if( $line =~ /total/ ){
return $totsize;
}
+#--- Raise the file-level $status when the used Lustre space crosses
+#    a limit.
+#
+#    Arguments:
+#      $lustre_size - used space; the value shown as "TB" in the
+#                     status text
+#      $warn_limit  - optional WARNING threshold (default 25)
+#      $crit_limit  - optional CRITICAL threshold (default 30)
+#
+#    $status is left untouched when the size is within the limits or
+#    undefined, so the caller has to set the OK text before calling.
+sub checkLustreSize
+{
+    my ($lustre_size, $warn_limit, $crit_limit) = @_;
+
+    $warn_limit = 25 unless defined $warn_limit;
+    $crit_limit = 30 unless defined $crit_limit;
+
+    # Nothing to compare; also avoids an "uninitialized value"
+    # warning under -w.
+    return unless defined $lustre_size;
+
+    if ($lustre_size > $crit_limit) {
+        $status = "CRITICAL - Lustre space used: $lustre_size TB";
+    }
+    elsif ($lustre_size > $warn_limit) {
+        $status = "WARNING - Lustre space used: $lustre_size TB";
+    }
+}
+
+#--- Print the usage text of this script to the currently selected
+#    output handle. Takes no arguments.
+sub showHelp
+{
+    # Single-quoted heredoc: the text is emitted verbatim, without
+    # variable interpolation.
+    my $usage = <<'EOF';
+
+ Nagios plugin
+
+ This script checks periodically a Lustre mount on
+ a local machine. The script also runs a status server
+ in a separate thread for reporting the status to Nagios.
+
+ Usage: check_lustre.pl [-s] [-h]
+
+ -s : Check also a disk space used on Lustre file system.
+ -h : Print this help.
+
+EOF
+    print $usage;
+}