From: hadaq Date: Fri, 19 Sep 2008 13:18:56 +0000 (+0000) Subject: new scripts to check different services (see README). Sergey Yurevich. X-Git-Url: https://jspc29.x-matter.uni-frankfurt.de/git/?a=commitdiff_plain;h=1453dbd8d1613b5b4c0d83b8f7b62757fa33b6ec;p=hadesicinga.git new scripts to check different services (see README). Sergey Yurevich. --- diff --git a/plugins/README b/plugins/README new file mode 100644 index 0000000..672a08e --- /dev/null +++ b/plugins/README @@ -0,0 +1,24 @@ +Plugin Location Comment +-------------------------------------------------------------- +check_backup.pl hadeb07 checks backup on hadeb07. + Runs status server to report + status to Nagios. + +check_archiver.pl lxg0434 checks archiving process of + slow ctrl data to Oracle. + Runs status server to report + status to Nagios. + +check_lustre.pl lxhadesdaq checks Lustre mount. Checks + used disk space on Lustre. + Runs status server to report + status to Nagios. + +my_check_eblog.pl lxhadesdaq checks discarded events in + eb_s.tcl written by EB. + Runs status server to report + status to Nagios. + +my_check_proc_status.pl hadesdaq runs by Nagios to receive + status report from remote + scripts. \ No newline at end of file diff --git a/plugins/check_archiver.pl b/plugins/check_archiver.pl new file mode 100755 index 0000000..33d0d0c --- /dev/null +++ b/plugins/check_archiver.pl @@ -0,0 +1,131 @@ +#!/usr/local/bin/perl -w + +use strict; +use warnings; + +use FileHandle; +use Net::FTP; +use Data::Dumper; +use IO::Handle; +use Time::Local; + +# the following is for the status server +# to communicate with Nagios plugin +use threads; +use threads::shared; +use IO::Socket; +use IO::Select; + +## oracle +use DBI; + +############# oracle ############ +my $user = 'DAQ_PUB'; +my $pass = 'hades'; +my $database = 'db-hades'; +my $table = 'hades_scs.mon_channels_last_archived'; +my $hostname = 'pcora2.gsi.de'; + +my $count = 0; +my $status : shared = "OK"; +my @screenPID; + +our $server_port = '50501'; +our $protocol = 'tcp'; + +threads->new( \&statusServer); + +&main(); + +exit(0); + +sub main { + +#--- Connect the database +my $dbh = DBI->connect( "dbi:Oracle:$database", $user,$pass); + +#--- Prepare select +my $sth = $dbh->prepare("SELECT to_char(max(data_end), 'YYYY-MM-DD hh24:mi:ss') from $table"); + +my @ora_answer; my $servertime; my $yyyy; my $mm; my $dd; my $hh; my $mi; my $ss; + +my $localsec; my $serversec; my $diffsec; + +my @items; + while(1){ + my @array = `ps axu`; + + foreach my $line (@array) + { + if($line =~/SCREEN(?:\s\w+|)\sstartArchiver/) + { + @items = split(" ", $line); + $screenPID[$count]=$items[1]; + $count ++; + } + } + + $localsec=timegm((localtime)[0,1,2,3,4,5]); + $sth->execute(); + while( @ora_answer = $sth->fetchrow_array ){ + $servertime= $ora_answer[0]; + } + ($yyyy,$mm,$dd,$hh,$mi,$ss)=($servertime =~ /(\d+)-(\d+)-(\d+) (\d+):(\d+):(\d+)/); + $yyyy=$yyyy-1900; $mm=$mm-1; + $serversec= timelocal($ss,$mi,$hh,$dd,$mm,$yyyy); + + $serversec= timegm($ss,$mi,$hh,$dd,$mm,$yyyy); + $diffsec= $localsec - $serversec; + unless ($diffsec > 300){ + if ($count == 0) { $status = "CRITICAL - 0 processes found" } + elsif ($count>1){$status = "WARNING -PID: @screenPID" } + else { $status = "OK - PID: @screenPID"};} + else {$status = "CRITICAL - no update since $servertime"} + #print "status: $status\n"; + + $count=0; + sleep 60; + } +} + +sub statusServer{ my $server_socket; + my $client_socket; + my $selector; + + unless (defined( $server_socket = + IO::Socket::INET->new( LocalPort => $server_port, + Proto => 'tcp', + Listen => SOMAXCONN ) )) + { + print "ERROR: Cannot start status server!\n"; + } + + $selector = new IO::Select( $server_socket ); + + while(1) { + + # wait 5 seconds for connections + while (my @file_handles = $selector->can_read( 5 )) { + + foreach my $file_handle (@file_handles) { + + if($file_handle == $server_socket) { + + # create a new socket for this transaction + unless (defined( $client_socket = $server_socket->accept() ) +) + { + print "ERROR: Cannot open socket to send status!\n"; + } + + + print $client_socket $status; + + close( $client_socket ); + } + } + } + } + + +} diff --git a/plugins/check_backup.pl b/plugins/check_backup.pl new file mode 100755 index 0000000..4820c57 --- /dev/null +++ b/plugins/check_backup.pl @@ -0,0 +1,133 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; + +######################################################### +# # +# Nagios plugin running on hadeb07 (backup server) # +# and checking the success of the recent backup. # +# # +######################################################### + +use FileHandle; +use Net::FTP; +use Data::Dumper; +use IO::Handle; +use Time::Local; + +# the following is for the status server +# to communicate with Nagios plugin +use threads; +use threads::shared; +use IO::Socket; +use IO::Select; + +my @items; +my $count = 0; +my $status : shared = "OK"; +my @screenPID; + +our $server_port = '50501'; +our $protocol = 'tcp'; + +threads->new( \&statusServer); + +&main(); + +exit(0); + + +sub main { + + my ($servertime, $yyyy, $mm, $dd, $hh, $mi, $ss); + my ($localsec, $serversec, $diffsec); + + while(1){ + + #--- check the presence of all 5 directories inside + my @array = `ls /data/backup/.snapshots/hourly.0/`; + #--- check time of only the latest dir: hourly.0 + my @dir = `ls -ltr /data/backup/.snapshots/`; + + foreach my $line (@array) + { + #--- count directories + if( $line =~ /(\bhadesdaq\b|\blxhadesdaq\b|\bhadeb05\b|\bhades25\b|\bdepc234\b)/ ) { + $count ++; + } + } + + #--- backup must have 5 main directories inside! + if( $count < 5 ){ + $status = "WARNING - backup directory missing"; + } + else{ + foreach my $dirline (@dir) + { + if($dirline =~ /\bhourly.0\b/) + { $servertime = $dirline; } + } + + #--- get time difference between last update and current time + @items = split(" ", $servertime); + ($yyyy,$mm,$dd) = ($items[5] =~ /(\d+)-(\d+)-(\d+)/); + ($hh,$mi) = ($items[6] =~ /(\d+):(\d+)/); + $yyyy = $yyyy - 1900; + $mm = $mm - 1; + $serversec = timelocal(0,$mi,$hh,$dd,$mm,$yyyy); + $serversec = timegm(0,$mi,$hh,$dd,$mm,$yyyy); + $localsec = timegm((localtime)[0,1,2,3,4,5]); + $diffsec = $localsec - $serversec; + + if ($diffsec > 93600) { # must be below 26 hours + $status= "WARNING - stopped updating at $items[5] $items[6]"; + } + else { + $status = "OK - last update at $items[5] $items[6]"; + } + } + + #print "$status\n"; + $count = 0; + sleep 50000; + } +} + +sub statusServer{ my $server_socket; + my $client_socket; + my $selector; + + unless (defined( $server_socket = + IO::Socket::INET->new( LocalPort => $server_port, + Proto => 'tcp', + Listen => SOMAXCONN ) )) + { + print "ERROR: Cannot start status server!\n"; + } + + $selector = new IO::Select( $server_socket ); + + while(1) { + + # wait 5 seconds for connections + while (my @file_handles = $selector->can_read( 5 )) { + + foreach my $file_handle (@file_handles) { + + if($file_handle == $server_socket) { + + # create a new socket for this transaction + unless (defined( $client_socket = $server_socket->accept() )) + { + print "ERROR: Cannot open socket to send status!\n"; + } + + print $client_socket $status; + + close( $client_socket ); + } + } + } + } +} diff --git a/plugins/check_lustre.pl b/plugins/check_lustre.pl new file mode 100755 index 0000000..ed868a9 --- /dev/null +++ b/plugins/check_lustre.pl @@ -0,0 +1,129 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; + +use FileHandle; +use Net::FTP; +use Data::Dumper; +use IO::Handle; +use Time::Local; + +# the following is for the status server +# to communicate with Nagios plugin +use threads; +use threads::shared; +use IO::Socket; +use IO::Select; + +my $status : shared = "OK"; + +our $server_port = '50502'; +our $protocol = 'tcp'; + +threads->new( \&statusServer); +&main(); + +exit(0); + +sub main { + my $counter=0; + my @total; + my $statsize = "0"; + + while (1) + { + if ($counter == 0) + { + # every 24 hours + my @size = `du -cms /lustre_alpha/hades`; + + foreach my $li (@size) + { + if($li =~/total/) + { + @total = split(" ", $li); + $statsize = $total[0]; + $statsize =sprintf ("%.2f",$statsize/1024/1024); + } + } + $counter = 720; + + print "/lustre_alpha/hades total size: $statsize TB\n"; + } # should be 720 + + my $line =`ls -d /lustre_alpha/hades/beam/sep08`; + chomp($line); + + if ($line eq "/lustre_alpha/hades/beam/sep08") + { + if ($statsize > 30) { + $status = "CRITICAL - Lustre disk space used: $statsize TB"; + } + elsif ($statsize > 25) { + $status = "WARNING - Lustre disk space used: $statsize TB"; + } + else { + $status = "OK - Lustre disk space used: $statsize TB"; + } + } + else { + $status = "CRITICAL - failure of Lustre file system!"; + } + + $counter --; + + #print "$status \n"; + sleep 120; #should be 120 + } +} + +sub statusServer{ my $server_socket; + my $client_socket; + my $selector; + + unless (defined( $server_socket = + IO::Socket::INET->new( LocalPort => $server_port, + Proto => 'tcp', + Listen => SOMAXCONN ) )) + { + print "ERROR: Cannot start status server!\n"; + } + + $selector = new IO::Select( $server_socket ); + + while(1) { + + # wait 5 seconds for connections + while (my @file_handles = $selector->can_read( 5 )) { + + foreach my $file_handle (@file_handles) { + + if($file_handle == $server_socket) { + + # create a new socket for this transaction + unless (defined( $client_socket = $server_socket->accept() ) +) + { + print "ERROR: Cannot open socket to send status!\n"; + } + + + print $client_socket $status; + + close( $client_socket ); + } + } + } + } + + +} + + + + + + + + diff --git a/plugins/my_check_eblog.pl b/plugins/my_check_eblog.pl index a8e458b..9333c5b 100755 --- a/plugins/my_check_eblog.pl +++ b/plugins/my_check_eblog.pl @@ -16,6 +16,17 @@ use Tie::File; use Fcntl; use IO::Handle; +# some Nagios stuff +use lib '/usr/local/nagios/libexec/'; +use utils qw($TIMEOUT %ERRORS &print_revision &support); + +# the following is for the status server +# to communicate with Nagios plugin +use threads; +use threads::shared; +use IO::Socket; +use IO::Select; + my $i; my @lines; my $line; @@ -42,109 +53,133 @@ my $errfilenum3 = 0; #file with many evtsTagError my ($evtsComplete, $evtsDiscarded, $evtsDataError, $evtsTagError); #- status info for Nagios -my $status; +my $status : shared = "OK - DAQ is running less than $last_minutes min."; +my $state = $ERRORS{'OK'}; -#--- loop over all lines backward in the file2read -for ( $i = $#lines; $i > 1; $i--){ - - $line = $lines[$i]; - - #- look for a line with "stopdate" - if ( $line =~ /stopdate/){ - - #- check the number of problematic events in file - if ($filenum > 0 && $evtsComplete > 0) { - - #- estimate amount of discarded events - my $ratio1 = $evtsDiscarded/$evtsComplete; - my $ratio2 = $evtsDataError/$evtsComplete; - my $ratio3 = $evtsTagError/$evtsComplete; - - #print "evtsComplete = $evtsComplete, evtsDiscarded = $evtsDiscarded, ratio1 = $ratio1\n"; - - if ($ratio1 > 0.1) { - $errfilenum1++; - } - if ($ratio2 > 0.1) { - $errfilenum2++; - } - if ($ratio3 > 0.1) { - $errfilenum3++; - } - } +#- some variables needed for statusServer +our $server_port = '50501'; +our $protocol = 'tcp'; - #- extract stop date from the line (format: "2007-05-05T19:32:53") - my ($v1, $v2, $stop_date) = split(" ", $line); - - #- get rid of "" - $stop_date =~ s/\"//g; +threads->new( \&statusServer); - #-get rid of "T" - $stop_date =~ s/T/ /; +&main(); +exit(0); - #- get time difference (in minutes) - my $time_diff = &timeDiff( date1 => $stop_date, date2 => $iso_now ); +sub main { - #print "stop_date = $stop_date, time_diff = $time_diff\n"; + while(1) { - #- look for a recent hour - if ( $time_diff > $last_minutes) { - - if ($filenum == 0) { - $status = "OK - no new files in a log during last $last_minutes min."; - last; - } - elsif ($filenum > 0) { - my $persent1 = $errfilenum1/$filenum; - my $persent2 = $errfilenum2/$filenum; - my $persent3 = $errfilenum3/$filenum; - - # if number of files with discarded events above threshold - # exceeds 10% -> send a WARNING - if ($persent1 > 0.1) { - $status = "WARNING - $persent1 files with discarded events during last $last_minutes min."; - last; + #--- loop over all lines backward in the file2read + for ( $i = $#lines; $i > 1; $i--){ + + $line = $lines[$i]; + + #- look for a line with "stopdate" + if ( $line =~ /stopdate/){ + + #- check the number of problematic events in file + if ($filenum > 0 && $evtsComplete > 0) { + + #- estimate amount of discarded events + my $ratio1 = $evtsDiscarded/$evtsComplete; + my $ratio2 = $evtsDataError/$evtsComplete; + my $ratio3 = $evtsTagError/$evtsComplete; + + #print "evtsComplete = $evtsComplete, evtsDiscarded = $evtsDiscarded, ratio1 = $ratio1\n"; + + if ($ratio1 > 0.1) { + $errfilenum1++; + } + if ($ratio2 > 0.1) { + $errfilenum2++; + } + if ($ratio3 > 0.1) { + $errfilenum3++; + } } - elsif ($persent2 > 0.1) { - $status = "WARNING - $persent2 files with data error during last $last_minutes min."; - last; + + #- extract stop date from the line (format: "2007-05-05T19:32:53") + my ($v1, $v2, $stop_date) = split(" ", $line); + + #- get rid of "" + $stop_date =~ s/\"//g; + + #-get rid of "T" + $stop_date =~ s/T/ /; + + #- get time difference (in minutes) + my $time_diff = &timeDiff( date1 => $stop_date, date2 => $iso_now ); + + #print "stop_date = $stop_date, time_diff = $time_diff\n"; + + #- look for a recent hour + if ( $time_diff > $last_minutes) { + + if ($filenum == 0) { + $status = "OK - no new files in a log during last $last_minutes min."; + $state = $ERRORS{'OK'}; + last; + } + elsif ($filenum > 0) { + my $persent1 = $errfilenum1/$filenum; + my $persent2 = $errfilenum2/$filenum; + my $persent3 = $errfilenum3/$filenum; + + # if number of files with discarded events above threshold + # exceeds 10% -> send a WARNING + if ($persent1 > 0.1) { + $status = "WARNING - $persent1 files with discarded events during last $last_minutes min."; + $state = $ERRORS{'WARNING'}; + last; + } + elsif ($persent2 > 0.1) { + $status = "WARNING - $persent2 files with data error during last $last_minutes min."; + $state = $ERRORS{'WARNING'}; + last; + } + elsif ($persent3 > 0.1) { + $status = "WARNING - $persent3 files with tag error during last $last_minutes min."; + $state = $ERRORS{'WARNING'}; + last; + } + else { + $status = "OK - $persent1 files with discarded events during last $last_minutes min."; + $state = $ERRORS{'OK'}; + last; + } + } + } #if ( $time_diff > 60.) + else { + + #- increment filenum counter + $filenum++; } - elsif ($persent3 > 0.1) { - $status = "WARNING - $persent3 files with tag error during last $last_minutes min."; - last; + } #if ( $line =~ /stopdate/){ + else { + + my ($v1, $v2); + + if($line =~ /evtsComplete/) { + ($v1, $v2, $evtsComplete) = split(" ", $line); } - else { - $status = "OK - $persent1 files with discarded events during last $last_minutes min."; - last; + if($line =~ /evtsDiscarded/) { + ($v1, $v2, $evtsDiscarded) = split(" ", $line); + } + if($line =~ /evtsDataError/) { + ($v1, $v2, $evtsDataError) = split(" ", $line); + } + if($line =~ /evtsTagError/) { + ($v1, $v2, $evtsTagError) = split(" ", $line); } } - } #if ( $time_diff > 60.) - else { - - #- increment filenum counter - $filenum++; - } - } #if ( $line =~ /stopdate/){ - else { - - my ($v1, $v2); - - if($line =~ /evtsComplete/) { - ($v1, $v2, $evtsComplete) = split(" ", $line); } - if($line =~ /evtsDiscarded/) { - ($v1, $v2, $evtsDiscarded) = split(" ", $line); - } - if($line =~ /evtsDataError/) { - ($v1, $v2, $evtsDataError) = split(" ", $line); - } - if($line =~ /evtsTagError/) { - ($v1, $v2, $evtsTagError) = split(" ", $line); - } - } -} - -print "status for Nagios: $status\n"; + + print "status for Nagios: $status\n"; +#exit $state; + + sleep 1200; #20 min. + } #while(1) +} #sub main sub timeDiff (%) { # @@ -190,3 +225,45 @@ sub timeDiff (%) { my $totminutes = int($ssDiff / 60); "$totminutes"; } + +sub statusServer +{ + my $server_socket; + my $client_socket; + my $selector; + + unless (defined( $server_socket = + IO::Socket::INET->new( LocalPort => $server_port, + Proto => 'tcp', + Listen => SOMAXCONN ) )) + { + print "ERROR: Cannot start status server!\n"; + } + + $selector = new IO::Select( $server_socket ); + + while(1) { + + # wait 5 seconds for connections + while (my @file_handles = $selector->can_read( 5 )) { + + foreach my $file_handle (@file_handles) { + + if($file_handle == $server_socket) { + + # create a new socket for this transaction + unless (defined( $client_socket = $server_socket->accept() )) + { + print "ERROR: Cannot open socket to send status!\n"; + } + + my $status_line = "$status"; + + print $client_socket $status_line; + + close( $client_socket ); + } + } + } + } +}