--- /dev/null
+Plugin Location Comment
+--------------------------------------------------------------
+check_backup.pl hadeb07 checks backup on hadeb07.
+ Runs status server to report
+ status to Nagios.
+
+check_archiver.pl lxg0434 checks archiving process of
+ slow control data to Oracle.
+ Runs status server to report
+ status to Nagios.
+
+check_lustre.pl lxhadesdaq checks Lustre mount. Checks
+ used disk space on Lustre.
+ Runs status server to report
+ status to Nagios.
+
+my_check_eblog.pl lxhadesdaq checks discarded events in
+ eb_s.tcl written by EB.
+ Runs status server to report
+ status to Nagios.
+
+my_check_proc_status.pl hadesdaq run by Nagios to receive
+ status reports from remote
+ scripts.
\ No newline at end of file
--- /dev/null
+#!/usr/local/bin/perl -w
+
+use strict;
+use warnings;
+
+use FileHandle;
+use Net::FTP;
+use Data::Dumper;
+use IO::Handle;
+use Time::Local;
+
+# the following is for the status server
+# to communicate with Nagios plugin
+use threads;
+use threads::shared;
+use IO::Socket;
+use IO::Select;
+
+## oracle
+use DBI;
+
+############# oracle ############
+# Oracle connection settings: main() passes $database, $user and $pass to
+# DBI->connect and selects from $table.
+# NOTE(review): the credentials are hard-coded in the source.
+my $user = 'DAQ_PUB';
+my $pass = 'hades';
+my $database = 'db-hades';
+my $table = 'hades_scs.mon_channels_last_archived';
+# NOTE(review): $hostname is not referenced by main() or statusServer in
+# this script.
+my $hostname = 'pcora2.gsi.de';
+
+# $count and @screenPID hold the number and the PIDs of the processes that
+# main() found in its scan of the process list.
+my $count = 0;
+# Status line for Nagios; written by main() and sent out by the
+# statusServer thread, hence declared shared between threads.
+my $status : shared = "OK";
+my @screenPID;
+
+# Settings for the status server; statusServer listens on $server_port.
+our $server_port = '50501';
+our $protocol = 'tcp';
+
+# Run the status server in its own thread; it must be started after the
+# shared $status and the port have been set up above.
+threads->new( \&statusServer);
+
+# main() contains an endless loop, so exit(0) is only reached if it returns.
+&main();
+
+exit(0);
+
+# Monitoring loop of the archiver check (never returns).
+#
+# Once a minute it
+#   1. scans `ps axu` for SCREEN sessions running startArchiver and records
+#      their PIDs in the file-level @screenPID / $count,
+#   2. asks Oracle for the newest data_end timestamp in $table,
+#   3. stores the verdict in the shared $status, which the statusServer
+#      thread sends to whoever connects.
+#
+# Dies if the database connection or the statement cannot be set up.
+#
+# Both clock values are built with timegm() from broken-down local time, so
+# their difference is only meaningful if the Oracle timestamp is expressed
+# in this host's time zone -- NOTE(review): confirm.
+sub main {
+
+    #--- Connect the database
+    my $dbh = DBI->connect( "dbi:Oracle:$database", $user, $pass )
+        or die "Cannot connect to Oracle database $database: $DBI::errstr\n";
+
+    #--- Prepare select
+    my $sth = $dbh->prepare(
+        "SELECT to_char(max(data_end), 'YYYY-MM-DD hh24:mi:ss') from $table")
+        or die "Cannot prepare archiver query: " . $dbh->errstr . "\n";
+
+    while (1) {
+
+        # Start every scan from scratch; otherwise PIDs found by an earlier
+        # scan would survive in @screenPID and leak into the status line.
+        @screenPID = ();
+        $count     = 0;
+
+        foreach my $line (`ps axu`) {
+            next unless $line =~ /SCREEN(?:\s\w+|)\sstartArchiver/;
+
+            my @items = split( " ", $line );
+            $screenPID[$count] = $items[1];    # second column of the ps line
+            $count++;
+        }
+
+        my $localsec = timegm( (localtime)[ 0, 1, 2, 3, 4, 5 ] );
+
+        # Forget the previous answer so that a failed query cannot be
+        # mistaken for a fresh timestamp.
+        my $servertime;
+        if ( $sth->execute() ) {
+            while ( my @ora_answer = $sth->fetchrow_array ) {
+                $servertime = $ora_answer[0];
+            }
+        }
+
+        my @stamp =
+            defined $servertime
+            ? ( $servertime =~ /(\d+)-(\d+)-(\d+) (\d+):(\d+):(\d+)/ )
+            : ();
+
+        if ( @stamp != 6 ) {
+
+            # No row, a NULL maximum or an unexpected format: report it
+            # instead of crashing inside timegm().
+            $status = "CRITICAL - cannot read last archiving time from Oracle";
+        }
+        else {
+            my ( $yyyy, $mm, $dd, $hh, $mi, $ss ) = @stamp;
+            my $serversec =
+                timegm( $ss, $mi, $hh, $dd, $mm - 1, $yyyy - 1900 );
+            my $diffsec = $localsec - $serversec;
+
+            # A timestamp older than 300 s outranks the process check.
+            if ( $diffsec > 300 ) {
+                $status = "CRITICAL - no update since $servertime";
+            }
+            elsif ( $count == 0 ) {
+                $status = "CRITICAL - 0 processes found";
+            }
+            elsif ( $count > 1 ) {
+                $status = "WARNING -PID: @screenPID";
+            }
+            else {
+                $status = "OK - PID: @screenPID";
+            }
+        }
+
+        #print "status: $status\n";
+
+        sleep 60;
+    }
+}
+
+# Status server, run in its own thread.
+#
+# Listens on TCP port $server_port and answers every incoming connection
+# with the current value of the shared $status, then closes the connection.
+# Takes no arguments.  Returns only when the listening socket cannot be
+# created; otherwise it loops forever.
+sub statusServer {
+    my $server_socket = IO::Socket::INET->new(
+        LocalPort => $server_port,
+        Proto     => 'tcp',
+        Listen    => SOMAXCONN,
+        ReuseAddr => 1,    # allow an immediate restart of the script
+    );
+
+    unless ( defined $server_socket ) {
+        print "ERROR: Cannot start status server!\n";
+
+        # Without a listening socket there is nothing to serve.
+        return;
+    }
+
+    my $selector = IO::Select->new($server_socket);
+
+    while (1) {
+
+        # wait 5 seconds for connections
+        while ( my @file_handles = $selector->can_read(5) ) {
+
+            foreach my $file_handle (@file_handles) {
+                next unless $file_handle == $server_socket;
+
+                # create a new socket for this transaction
+                my $client_socket = $server_socket->accept();
+                unless ( defined $client_socket ) {
+                    print "ERROR: Cannot open socket to send status!\n";
+
+                    # Printing to an undefined handle would kill the thread.
+                    next;
+                }
+
+                print $client_socket $status;
+
+                close($client_socket);
+            }
+        }
+    }
+}
--- /dev/null
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+#########################################################
+# #
+# Nagios plugin running on hadeb07 (backup server) #
+# and checking the success of the recent backup. #
+# #
+#########################################################
+
+use FileHandle;
+use Net::FTP;
+use Data::Dumper;
+use IO::Handle;
+use Time::Local;
+
+# the following is for the status server
+# to communicate with Nagios plugin
+use threads;
+use threads::shared;
+use IO::Socket;
+use IO::Select;
+
+# Scratch variables used by main(): @items receives the fields of the
+# `ls -ltr` line for hourly.0, $count the number of expected directories
+# found in the snapshot.
+my @items;
+my $count = 0;
+# Status line for Nagios; written by main() and sent out by the
+# statusServer thread, hence declared shared between threads.
+my $status : shared = "OK";
+# NOTE(review): @screenPID is not referenced by main() or statusServer in
+# this script.
+my @screenPID;
+
+# Settings for the status server; statusServer listens on $server_port.
+our $server_port = '50501';
+our $protocol = 'tcp';
+
+# Run the status server in its own thread; it must be started after the
+# shared $status and the port have been set up above.
+threads->new( \&statusServer);
+
+# main() contains an endless loop, so exit(0) is only reached if it returns.
+&main();
+
+exit(0);
+
+
+# Monitoring loop of the backup check (never returns).
+#
+# Each pass lists /data/backup/.snapshots/hourly.0/ and counts the entries
+# matching the five expected host directories.  If all are present it takes
+# the time stamp of hourly.0 from `ls -ltr /data/backup/.snapshots/` and
+# compares it with the current time.  The verdict is stored in the shared
+# $status, which the statusServer thread sends to whoever connects.
+#
+# The parsing expects the date (YYYY-MM-DD) in the 6th and the time (HH:MM)
+# in the 7th field of the listing -- NOTE(review): this depends on the time
+# style `ls -l` uses on the backup host; confirm.
+sub main {
+
+    while (1) {
+
+        #--- check the presence of all 5 directories inside
+        my @array = `ls /data/backup/.snapshots/hourly.0/`;
+        #--- check time of only the latest dir: hourly.0
+        my @dir = `ls -ltr /data/backup/.snapshots/`;
+
+        #--- count directories
+        $count = 0;
+        foreach my $line (@array) {
+            $count++
+                if $line =~ /(\bhadesdaq\b|\blxhadesdaq\b|\bhadeb05\b|\bhades25\b|\bdepc234\b)/;
+        }
+
+        #--- backup must have 5 main directories inside!
+        if ( $count < 5 ) {
+            $status = "WARNING - backup directory missing";
+        }
+        else {
+
+            # Look the entry up afresh on every pass so that a line found by
+            # an earlier pass is never evaluated again.
+            my $servertime;
+            foreach my $dirline (@dir) {
+                $servertime = $dirline if $dirline =~ /\bhourly\.0\b/;
+            }
+
+            #--- get time difference between last update and current time
+            @items = defined $servertime ? split( " ", $servertime ) : ();
+            my ( $yyyy, $mm, $dd ) =
+                defined $items[5] ? ( $items[5] =~ /(\d+)-(\d+)-(\d+)/ ) : ();
+            my ( $hh, $mi ) =
+                defined $items[6] ? ( $items[6] =~ /(\d+):(\d+)/ ) : ();
+
+            if ( !defined $dd || !defined $mi ) {
+
+                # hourly.0 is not listed or its time stamp has an unexpected
+                # format: report it instead of crashing inside timegm().
+                $status = "WARNING - cannot determine time of last backup";
+            }
+            else {
+
+                # Both values are built with timegm() from broken-down local
+                # time, so the time zone cancels out in the difference.
+                my $serversec =
+                    timegm( 0, $mi, $hh, $dd, $mm - 1, $yyyy - 1900 );
+                my $localsec = timegm( (localtime)[ 0, 1, 2, 3, 4, 5 ] );
+                my $diffsec  = $localsec - $serversec;
+
+                if ( $diffsec > 93600 ) {    # must be below 26 hours
+                    $status = "WARNING - stopped updating at $items[5] $items[6]";
+                }
+                else {
+                    $status = "OK - last update at $items[5] $items[6]";
+                }
+            }
+        }
+
+        #print "$status\n";
+        sleep 50000;    # roughly 13.9 hours between two checks
+    }
+}
+
+# Status server, run in its own thread.
+#
+# Listens on TCP port $server_port and answers every incoming connection
+# with the current value of the shared $status, then closes the connection.
+# Takes no arguments.  Returns only when the listening socket cannot be
+# created; otherwise it loops forever.
+sub statusServer {
+    my $server_socket = IO::Socket::INET->new(
+        LocalPort => $server_port,
+        Proto     => 'tcp',
+        Listen    => SOMAXCONN,
+        ReuseAddr => 1,    # allow an immediate restart of the script
+    );
+
+    unless ( defined $server_socket ) {
+        print "ERROR: Cannot start status server!\n";
+
+        # Without a listening socket there is nothing to serve.
+        return;
+    }
+
+    my $selector = IO::Select->new($server_socket);
+
+    while (1) {
+
+        # wait 5 seconds for connections
+        while ( my @file_handles = $selector->can_read(5) ) {
+
+            foreach my $file_handle (@file_handles) {
+                next unless $file_handle == $server_socket;
+
+                # create a new socket for this transaction
+                my $client_socket = $server_socket->accept();
+                unless ( defined $client_socket ) {
+                    print "ERROR: Cannot open socket to send status!\n";
+
+                    # Printing to an undefined handle would kill the thread.
+                    next;
+                }
+
+                print $client_socket $status;
+
+                close($client_socket);
+            }
+        }
+    }
+}
--- /dev/null
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+use FileHandle;
+use Net::FTP;
+use Data::Dumper;
+use IO::Handle;
+use Time::Local;
+
+# the following is for the status server
+# to communicate with Nagios plugin
+use threads;
+use threads::shared;
+use IO::Socket;
+use IO::Select;
+
+# Status line for Nagios; written by main() and sent out by the
+# statusServer thread, hence declared shared between threads.
+my $status : shared = "OK";
+
+# Settings for the status server; statusServer listens on $server_port.
+our $server_port = '50502';
+our $protocol = 'tcp';
+
+# Run the status server in its own thread; it must be started after the
+# shared $status and the port have been set up above.
+threads->new( \&statusServer);
+# main() contains an endless loop, so exit(0) is only reached if it returns.
+&main();
+
+exit(0);
+
+# Monitoring loop of the Lustre check (never returns).
+#
+# Every 120 seconds it probes the mount by listing a directory below
+# /lustre_alpha and stores the verdict in the shared $status.  The used
+# space is measured with `du` on the first pass and then again after every
+# 720 passes (720 x 120 s = 24 hours); the passes in between reuse the
+# last measured value.
+sub main {
+    my $probe_dir   = "/lustre_alpha/hades/beam/sep08";
+    my $passes_left = 0;      # passes until the next du run
+    my $used_tb     = "0";    # last measured size in TB
+
+    while (1) {
+
+        unless ( $passes_left != 0 ) {
+
+            # du -cms reports megabytes; the grand total is on the line
+            # containing "total".
+            for my $du_line (`du -cms /lustre_alpha/hades`) {
+                next unless $du_line =~ /total/;
+
+                my ($megabytes) = split( " ", $du_line );
+                $used_tb = sprintf( "%.2f", $megabytes / 1024 / 1024 );
+            }
+
+            $passes_left = 720;
+
+            print "/lustre_alpha/hades total size: $used_tb TB\n";
+        }
+
+        # The file system counts as available when ls echoes the path back.
+        my $answer = `ls -d $probe_dir`;
+        chomp $answer;
+
+        if ( $answer ne $probe_dir ) {
+            $status = "CRITICAL - failure of Lustre file system!";
+        }
+        elsif ( $used_tb > 30 ) {
+            $status = "CRITICAL - Lustre disk space used: $used_tb TB";
+        }
+        elsif ( $used_tb > 25 ) {
+            $status = "WARNING - Lustre disk space used: $used_tb TB";
+        }
+        else {
+            $status = "OK - Lustre disk space used: $used_tb TB";
+        }
+
+        $passes_left--;
+
+        #print "$status \n";
+        sleep 120;
+    }
+}
+
+# Status server, run in its own thread.
+#
+# Listens on TCP port $server_port and answers every incoming connection
+# with the current value of the shared $status, then closes the connection.
+# Takes no arguments.  Returns only when the listening socket cannot be
+# created; otherwise it loops forever.
+sub statusServer {
+    my $server_socket = IO::Socket::INET->new(
+        LocalPort => $server_port,
+        Proto     => 'tcp',
+        Listen    => SOMAXCONN,
+        ReuseAddr => 1,    # allow an immediate restart of the script
+    );
+
+    unless ( defined $server_socket ) {
+        print "ERROR: Cannot start status server!\n";
+
+        # Without a listening socket there is nothing to serve.
+        return;
+    }
+
+    my $selector = IO::Select->new($server_socket);
+
+    while (1) {
+
+        # wait 5 seconds for connections
+        while ( my @file_handles = $selector->can_read(5) ) {
+
+            foreach my $file_handle (@file_handles) {
+                next unless $file_handle == $server_socket;
+
+                # create a new socket for this transaction
+                my $client_socket = $server_socket->accept();
+                unless ( defined $client_socket ) {
+                    print "ERROR: Cannot open socket to send status!\n";
+
+                    # Printing to an undefined handle would kill the thread.
+                    next;
+                }
+
+                print $client_socket $status;
+
+                close($client_socket);
+            }
+        }
+    }
+}
+
+
+
+
+
+
+
+
use Fcntl;
use IO::Handle;
+# some Nagios stuff
+use lib '/usr/local/nagios/libexec/';
+use utils qw($TIMEOUT %ERRORS &print_revision &support);
+
+# the following is for the status server
+# to communicate with Nagios plugin
+use threads;
+use threads::shared;
+use IO::Socket;
+use IO::Select;
+
my $i;
my @lines;
my $line;
my ($evtsComplete, $evtsDiscarded, $evtsDataError, $evtsTagError);
#- status info for Nagios
-my $status;
+my $status : shared = "OK - DAQ is running less than $last_minutes min.";
+my $state = $ERRORS{'OK'};
-#--- loop over all lines backward in the file2read
-for ( $i = $#lines; $i > 1; $i--){
-
- $line = $lines[$i];
-
- #- look for a line with "stopdate"
- if ( $line =~ /stopdate/){
-
- #- check the number of problematic events in file
- if ($filenum > 0 && $evtsComplete > 0) {
-
- #- estimate amount of discarded events
- my $ratio1 = $evtsDiscarded/$evtsComplete;
- my $ratio2 = $evtsDataError/$evtsComplete;
- my $ratio3 = $evtsTagError/$evtsComplete;
-
- #print "evtsComplete = $evtsComplete, evtsDiscarded = $evtsDiscarded, ratio1 = $ratio1\n";
-
- if ($ratio1 > 0.1) {
- $errfilenum1++;
- }
- if ($ratio2 > 0.1) {
- $errfilenum2++;
- }
- if ($ratio3 > 0.1) {
- $errfilenum3++;
- }
- }
+#- some variables needed for statusServer
+our $server_port = '50501';
+our $protocol = 'tcp';
- #- extract stop date from the line (format: "2007-05-05T19:32:53")
- my ($v1, $v2, $stop_date) = split(" ", $line);
-
- #- get rid of ""
- $stop_date =~ s/\"//g;
+threads->new( \&statusServer);
- #-get rid of "T"
- $stop_date =~ s/T/ /;
+&main();
+exit(0);
- #- get time difference (in minutes)
- my $time_diff = &timeDiff( date1 => $stop_date, date2 => $iso_now );
+sub main {
- #print "stop_date = $stop_date, time_diff = $time_diff\n";
+ while(1) {
- #- look for a recent hour
- if ( $time_diff > $last_minutes) {
-
- if ($filenum == 0) {
- $status = "OK - no new files in a log during last $last_minutes min.";
- last;
- }
- elsif ($filenum > 0) {
- my $persent1 = $errfilenum1/$filenum;
- my $persent2 = $errfilenum2/$filenum;
- my $persent3 = $errfilenum3/$filenum;
-
- # if number of files with discarded events above threshold
- # exceeds 10% -> send a WARNING
- if ($persent1 > 0.1) {
- $status = "WARNING - $persent1 files with discarded events during last $last_minutes min.";
- last;
+ #--- loop over all lines backward in the file2read
+ for ( $i = $#lines; $i > 1; $i--){
+
+ $line = $lines[$i];
+
+ #- look for a line with "stopdate"
+ if ( $line =~ /stopdate/){
+
+ #- check the number of problematic events in file
+ if ($filenum > 0 && $evtsComplete > 0) {
+
+ #- estimate amount of discarded events
+ my $ratio1 = $evtsDiscarded/$evtsComplete;
+ my $ratio2 = $evtsDataError/$evtsComplete;
+ my $ratio3 = $evtsTagError/$evtsComplete;
+
+ #print "evtsComplete = $evtsComplete, evtsDiscarded = $evtsDiscarded, ratio1 = $ratio1\n";
+
+ if ($ratio1 > 0.1) {
+ $errfilenum1++;
+ }
+ if ($ratio2 > 0.1) {
+ $errfilenum2++;
+ }
+ if ($ratio3 > 0.1) {
+ $errfilenum3++;
+ }
}
- elsif ($persent2 > 0.1) {
- $status = "WARNING - $persent2 files with data error during last $last_minutes min.";
- last;
+
+ #- extract stop date from the line (format: "2007-05-05T19:32:53")
+ my ($v1, $v2, $stop_date) = split(" ", $line);
+
+ #- get rid of ""
+ $stop_date =~ s/\"//g;
+
+ #-get rid of "T"
+ $stop_date =~ s/T/ /;
+
+ #- get time difference (in minutes)
+ my $time_diff = &timeDiff( date1 => $stop_date, date2 => $iso_now );
+
+ #print "stop_date = $stop_date, time_diff = $time_diff\n";
+
+ #- look for a recent hour
+ if ( $time_diff > $last_minutes) {
+
+ if ($filenum == 0) {
+ $status = "OK - no new files in a log during last $last_minutes min.";
+ $state = $ERRORS{'OK'};
+ last;
+ }
+ elsif ($filenum > 0) {
+ my $persent1 = $errfilenum1/$filenum;
+ my $persent2 = $errfilenum2/$filenum;
+ my $persent3 = $errfilenum3/$filenum;
+
+ # if number of files with discarded events above threshold
+ # exceeds 10% -> send a WARNING
+ if ($persent1 > 0.1) {
+ $status = "WARNING - $persent1 files with discarded events during last $last_minutes min.";
+ $state = $ERRORS{'WARNING'};
+ last;
+ }
+ elsif ($persent2 > 0.1) {
+ $status = "WARNING - $persent2 files with data error during last $last_minutes min.";
+ $state = $ERRORS{'WARNING'};
+ last;
+ }
+ elsif ($persent3 > 0.1) {
+ $status = "WARNING - $persent3 files with tag error during last $last_minutes min.";
+ $state = $ERRORS{'WARNING'};
+ last;
+ }
+ else {
+ $status = "OK - $persent1 files with discarded events during last $last_minutes min.";
+ $state = $ERRORS{'OK'};
+ last;
+ }
+ }
+ } #if ( $time_diff > 60.)
+ else {
+
+ #- increment filenum counter
+ $filenum++;
}
- elsif ($persent3 > 0.1) {
- $status = "WARNING - $persent3 files with tag error during last $last_minutes min.";
- last;
+ } #if ( $line =~ /stopdate/){
+ else {
+
+ my ($v1, $v2);
+
+ if($line =~ /evtsComplete/) {
+ ($v1, $v2, $evtsComplete) = split(" ", $line);
}
- else {
- $status = "OK - $persent1 files with discarded events during last $last_minutes min.";
- last;
+ if($line =~ /evtsDiscarded/) {
+ ($v1, $v2, $evtsDiscarded) = split(" ", $line);
+ }
+ if($line =~ /evtsDataError/) {
+ ($v1, $v2, $evtsDataError) = split(" ", $line);
+ }
+ if($line =~ /evtsTagError/) {
+ ($v1, $v2, $evtsTagError) = split(" ", $line);
}
}
- } #if ( $time_diff > 60.)
- else {
-
- #- increment filenum counter
- $filenum++;
- }
- } #if ( $line =~ /stopdate/){
- else {
-
- my ($v1, $v2);
-
- if($line =~ /evtsComplete/) {
- ($v1, $v2, $evtsComplete) = split(" ", $line);
}
- if($line =~ /evtsDiscarded/) {
- ($v1, $v2, $evtsDiscarded) = split(" ", $line);
- }
- if($line =~ /evtsDataError/) {
- ($v1, $v2, $evtsDataError) = split(" ", $line);
- }
- if($line =~ /evtsTagError/) {
- ($v1, $v2, $evtsTagError) = split(" ", $line);
- }
- }
-}
-
-print "status for Nagios: $status\n";
+
+ print "status for Nagios: $status\n";
+#exit $state;
+
+ sleep 1200; #20 min.
+ } #while(1)
+} #sub main
sub timeDiff (%) {
#
my $totminutes = int($ssDiff / 60);
"$totminutes";
}
+
+# Status server, run in its own thread.
+#
+# Listens on TCP port $server_port and answers every incoming connection
+# with the current value of the shared $status, then closes the connection.
+# Takes no arguments.  Returns only when the listening socket cannot be
+# created; otherwise it loops forever.
+sub statusServer
+{
+    my $server_socket = IO::Socket::INET->new(
+        LocalPort => $server_port,
+        Proto     => 'tcp',
+        Listen    => SOMAXCONN,
+        ReuseAddr => 1,    # allow an immediate restart of the script
+    );
+
+    unless ( defined $server_socket ) {
+        print "ERROR: Cannot start status server!\n";
+
+        # Without a listening socket there is nothing to serve.
+        return;
+    }
+
+    my $selector = IO::Select->new($server_socket);
+
+    while (1) {
+
+        # wait 5 seconds for connections
+        while ( my @file_handles = $selector->can_read(5) ) {
+
+            foreach my $file_handle (@file_handles) {
+                next unless $file_handle == $server_socket;
+
+                # create a new socket for this transaction
+                my $client_socket = $server_socket->accept();
+                unless ( defined $client_socket ) {
+                    print "ERROR: Cannot open socket to send status!\n";
+
+                    # Printing to an undefined handle would kill the thread.
+                    next;
+                }
+
+                # Copy the shared value once, then send the copy.
+                my $status_line = "$status";
+
+                print $client_socket $status_line;
+
+                close($client_socket);
+            }
+        }
+    }
+}