]> jspc29.x-matter.uni-frankfurt.de Git - hadesicinga.git/commitdiff
new scripts to check different services (see README). Sergey Yurevich.
authorhadaq <hadaq>
Fri, 19 Sep 2008 13:18:56 +0000 (13:18 +0000)
committerhadaq <hadaq>
Fri, 19 Sep 2008 13:18:56 +0000 (13:18 +0000)
plugins/README [new file with mode: 0644]
plugins/check_archiver.pl [new file with mode: 0755]
plugins/check_backup.pl [new file with mode: 0755]
plugins/check_lustre.pl [new file with mode: 0755]
plugins/my_check_eblog.pl

diff --git a/plugins/README b/plugins/README
new file mode 100644 (file)
index 0000000..672a08e
--- /dev/null
@@ -0,0 +1,24 @@
+Plugin               Location     Comment
+--------------------------------------------------------------
+check_backup.pl      hadeb07      checks backup on hadeb07.
+                                  Runs status server to report
+                                  status to Nagios. 
+
+check_archiver.pl    lxg0434      checks archiving process of
+                                  slow ctrl data to Oracle.
+                                  Runs status server to report
+                                  status to Nagios.
+
+check_lustre.pl      lxhadesdaq   checks Lustre mount. Checks
+                                  used disk space on Lustre.
+                                  Runs status server to report
+                                  status to Nagios.
+
+my_check_eblog.pl    lxhadesdaq   checks discarded events in 
+                                  eb_s.tcl written by EB. 
+                                  Runs status server to report
+                                  status to Nagios.
+
+my_check_proc_status.pl hadesdaq  run by Nagios to receive
+                                  status reports from the
+                                  remote scripts.
\ No newline at end of file
diff --git a/plugins/check_archiver.pl b/plugins/check_archiver.pl
new file mode 100755 (executable)
index 0000000..33d0d0c
--- /dev/null
@@ -0,0 +1,131 @@
+#!/usr/local/bin/perl -w
+
+use strict;
+use warnings;
+
+use FileHandle;
+use Net::FTP;
+use Data::Dumper;
+use IO::Handle;
+use Time::Local;
+
+# the following is for the status server
+# to communicate with Nagios plugin
+use threads;
+use threads::shared;
+use IO::Socket;
+use IO::Select;
+
+## oracle
+use DBI;
+
+############# oracle ############
+# Connection settings for the database that is queried by main().
+# NOTE(review): the credentials are stored in clear text in this script.
+my ( $user, $pass ) = ( 'DAQ_PUB', 'hades' );
+my $database = 'db-hades';
+my $table    = 'hades_scs.mon_channels_last_archived';
+my $hostname = 'pcora2.gsi.de';    # not referenced anywhere in this script
+
+# Number of archiver processes found by main() and their PIDs.
+my $count = 0;
+my @screenPID;
+
+# Status text; shared between the main thread and the status server thread.
+my $status : shared = "OK";
+
+# Settings of the status server.
+our $server_port = '50501';
+our $protocol    = 'tcp';
+
+# The status server gets its own thread; the monitoring loop runs in the
+# main thread.
+threads->new( \&statusServer );
+
+main();
+
+exit(0);
+
+# Monitoring loop of the archiver check.
+#
+# Connects to Oracle once, then repeats every 60 seconds:
+#   - collect the PIDs of all "SCREEN ... startArchiver" processes
+#     listed by `ps axu`;
+#   - read max(data_end) from $table;
+#   - store the verdict in the shared $status string, which
+#     statusServer sends to whoever connects.
+#
+# Verdict:
+#   CRITICAL  the newest archived record is more than 300 s behind the
+#             local clock, its time stamp could not be read, or no
+#             archiver process was found;
+#   WARNING   more than one archiver process was found;
+#   OK        exactly one archiver process and a recent record.
+#
+# Dies when the database connection or the statement cannot be set up.
+# Takes no arguments and does not return.
+sub main {
+
+    #--- Connect the database
+    my $dbh = DBI->connect( "dbi:Oracle:$database", $user, $pass )
+        or die "ERROR: Cannot connect to database $database: $DBI::errstr\n";
+
+    #--- Prepare select
+    my $sth = $dbh->prepare(
+        "SELECT to_char(max(data_end), 'YYYY-MM-DD hh24:mi:ss') from $table")
+        or die "ERROR: Cannot prepare select on $table: " . $dbh->errstr . "\n";
+
+    while (1) {
+
+        # Forget the PIDs of the previous pass; otherwise PIDs of processes
+        # that are gone would still show up in the status text.
+        @screenPID = ();
+        $count     = 0;
+
+        foreach my $line (`ps axu`) {
+            if ( $line =~ /SCREEN(?:\s\w+|)\sstartArchiver/ ) {
+                my @items = split( " ", $line );
+                push @screenPID, $items[1];
+                $count++;
+            }
+        }
+
+        # The broken-down local time is fed to timegm on purpose: the
+        # database time stamp below is converted the same way, so the
+        # difference is a plain wall-clock difference.
+        # NOTE(review): assumes the database reports data_end in the time
+        # zone of this host - confirm.
+        my $localsec = timegm( (localtime)[ 0, 1, 2, 3, 4, 5 ] );
+
+        # Read the time stamp afresh on every pass so that a failed query
+        # is not hidden behind the value of an earlier pass.
+        my $servertime;
+        $sth->execute();
+        while ( my @ora_answer = $sth->fetchrow_array ) {
+            $servertime = $ora_answer[0];
+        }
+
+        my ( $yyyy, $mm, $dd, $hh, $mi, $ss );
+        if ( defined $servertime ) {
+            ( $yyyy, $mm, $dd, $hh, $mi, $ss ) =
+                ( $servertime =~ /(\d+)-(\d+)-(\d+) (\d+):(\d+):(\d+)/ );
+        }
+
+        if ( !defined $ss ) {
+            # No row, a NULL value or an unexpected format: timegm would
+            # die on undefined fields, so report the problem instead.
+            $status = "CRITICAL - cannot read time of last archived data";
+        }
+        else {
+            # timegm expects the month as 0..11 and the year as offset
+            # from 1900, like the fields returned by localtime.
+            my $serversec =
+                timegm( $ss, $mi, $hh, $dd, $mm - 1, $yyyy - 1900 );
+            my $diffsec = $localsec - $serversec;
+
+            if ( $diffsec > 300 ) {
+                $status = "CRITICAL - no update since $servertime";
+            }
+            elsif ( $count == 0 ) {
+                $status = "CRITICAL - 0 processes found";
+            }
+            elsif ( $count > 1 ) {
+                $status = "WARNING -PID: @screenPID";
+            }
+            else {
+                $status = "OK - PID: @screenPID";
+            }
+        }
+        #print "status: $status\n";
+
+        sleep 60;
+    }
+}
+
+# Status server; meant to run in a thread of its own.
+#
+# Listens on port $server_port with protocol $protocol. Every client
+# that connects is sent the current content of the shared $status string
+# and is disconnected right away; nothing is read from the client.
+#
+# Takes no arguments. Returns only when the listening socket cannot be
+# created (an error message is printed first); otherwise loops forever.
+sub statusServer {
+
+    # ReuseAddr lets a restarted script bind the port while connections
+    # of the previous instance are still in TIME_WAIT.
+    my $server_socket = IO::Socket::INET->new(
+        LocalPort => $server_port,
+        Proto     => $protocol,
+        Listen    => SOMAXCONN,
+        ReuseAddr => 1,
+    );
+
+    unless ( defined $server_socket ) {
+        print "ERROR: Cannot start status server!\n";
+        return;    # nothing to listen on
+    }
+
+    my $selector = IO::Select->new($server_socket);
+
+    while (1) {
+
+        # wait 5 seconds for connections
+        while ( my @file_handles = $selector->can_read(5) ) {
+
+            foreach my $file_handle (@file_handles) {
+
+                next unless $file_handle == $server_socket;
+
+                # create a new socket for this transaction
+                my $client_socket = $server_socket->accept();
+
+                unless ( defined $client_socket ) {
+                    print "ERROR: Cannot open socket to send status!\n";
+                    next;    # printing to an undefined handle would die
+                }
+
+                print $client_socket $status;
+
+                close($client_socket);
+            }
+        }
+    }
+}
diff --git a/plugins/check_backup.pl b/plugins/check_backup.pl
new file mode 100755 (executable)
index 0000000..4820c57
--- /dev/null
@@ -0,0 +1,133 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+#########################################################
+#                                                       #
+# Nagios plugin running on hadeb07 (backup server)      #
+# and checking the success of the recent backup.        #
+#                                                       #
+#########################################################
+
+use FileHandle;
+use Net::FTP;
+use Data::Dumper;
+use IO::Handle;
+use Time::Local;
+
+# the following is for the status server
+# to communicate with Nagios plugin
+use threads;
+use threads::shared;
+use IO::Socket;
+use IO::Select;
+
+# Scratch variables of the check.
+my @items;
+my $count = 0;
+my @screenPID;    # not referenced anywhere else in this script
+
+# Status text; shared between the main thread and the status server thread.
+my $status : shared = "OK";
+
+# Settings of the status server.
+our $server_port = '50501';
+our $protocol    = 'tcp';
+
+# The status server gets its own thread; the monitoring loop runs in the
+# main thread.
+threads->new( \&statusServer );
+
+main();
+
+exit(0);
+
+
+# Monitoring loop of the backup check.
+#
+# Every 50000 seconds it lists /data/backup/.snapshots/hourly.0/ and
+# /data/backup/.snapshots/ and stores a verdict in the shared $status
+# string, which statusServer sends to whoever connects:
+#   WARNING  fewer than 5 of the expected host directories are listed in
+#            hourly.0, the time stamp of hourly.0 cannot be read, or it
+#            is more than 93600 s (26 hours) old;
+#   OK       otherwise.
+#
+# Takes no arguments and does not return.
+sub main {
+
+    while (1) {
+
+        #--- check the presence of all 5 directories inside
+        my @array = `ls /data/backup/.snapshots/hourly.0/`;
+        #--- check time of only the latest dir: hourly.0
+        my @dir   = `ls -ltr /data/backup/.snapshots/`;
+
+        #--- count directories
+        $count = 0;
+        foreach my $line (@array) {
+            if ( $line =~ /(\bhadesdaq\b|\blxhadesdaq\b|\bhadeb05\b|\bhades25\b|\bdepc234\b)/ ) {
+                $count++;
+            }
+        }
+
+        #--- backup must have 5 main directories inside!
+        if ( $count < 5 ) {
+            $status = "WARNING - backup directory missing";
+        }
+        else {
+            # Look the entry up afresh on every pass so that a vanished
+            # hourly.0 is not hidden behind the line of an earlier pass.
+            my $servertime;
+            foreach my $dirline (@dir) {
+                if ( $dirline =~ /\bhourly\.0\b/ ) {
+                    $servertime = $dirline;
+                }
+            }
+
+            #--- get time difference between last update and current time
+            # NOTE(review): fields 5 and 6 are expected to look like
+            # "YYYY-MM-DD" and "HH:MM"; this depends on the time style
+            # used by ls on the backup server - confirm.
+            @items = defined $servertime ? split( " ", $servertime ) : ();
+
+            my ( $yyyy, $mm, $dd, $hh, $mi );
+            if ( defined $items[6] ) {
+                ( $yyyy, $mm, $dd ) = ( $items[5] =~ /(\d+)-(\d+)-(\d+)/ );
+                ( $hh, $mi )        = ( $items[6] =~ /(\d+):(\d+)/ );
+            }
+
+            if ( !defined $dd || !defined $mi ) {
+                # timegm would die on undefined fields; report instead.
+                $status = "WARNING - cannot read time of last backup";
+            }
+            else {
+                # Both times go through timegm, so the difference is a
+                # plain wall-clock difference. timegm expects the month
+                # as 0..11 and the year as offset from 1900.
+                my $serversec =
+                    timegm( 0, $mi, $hh, $dd, $mm - 1, $yyyy - 1900 );
+                my $localsec = timegm( (localtime)[ 0, 1, 2, 3, 4, 5 ] );
+                my $diffsec  = $localsec - $serversec;
+
+                if ( $diffsec > 93600 ) {    # must be below 26 hours
+                    $status = "WARNING - stopped updating at $items[5] $items[6]";
+                }
+                else {
+                    $status = "OK - last update at $items[5] $items[6]";
+                }
+            }
+        }
+
+        #print "$status\n";
+        sleep 50000;
+    }
+}
+
+# Status server; meant to run in a thread of its own.
+#
+# Listens on port $server_port with protocol $protocol. Every client
+# that connects is sent the current content of the shared $status string
+# and is disconnected right away; nothing is read from the client.
+#
+# Takes no arguments. Returns only when the listening socket cannot be
+# created (an error message is printed first); otherwise loops forever.
+sub statusServer {
+
+    # ReuseAddr lets a restarted script bind the port while connections
+    # of the previous instance are still in TIME_WAIT.
+    my $server_socket = IO::Socket::INET->new(
+        LocalPort => $server_port,
+        Proto     => $protocol,
+        Listen    => SOMAXCONN,
+        ReuseAddr => 1,
+    );
+
+    unless ( defined $server_socket ) {
+        print "ERROR: Cannot start status server!\n";
+        return;    # nothing to listen on
+    }
+
+    my $selector = IO::Select->new($server_socket);
+
+    while (1) {
+
+        # wait 5 seconds for connections
+        while ( my @file_handles = $selector->can_read(5) ) {
+
+            foreach my $file_handle (@file_handles) {
+
+                next unless $file_handle == $server_socket;
+
+                # create a new socket for this transaction
+                my $client_socket = $server_socket->accept();
+
+                unless ( defined $client_socket ) {
+                    print "ERROR: Cannot open socket to send status!\n";
+                    next;    # printing to an undefined handle would die
+                }
+
+                print $client_socket $status;
+
+                close($client_socket);
+            }
+        }
+    }
+}
diff --git a/plugins/check_lustre.pl b/plugins/check_lustre.pl
new file mode 100755 (executable)
index 0000000..ed868a9
--- /dev/null
@@ -0,0 +1,129 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+use FileHandle;
+use Net::FTP;
+use Data::Dumper;
+use IO::Handle;
+use Time::Local;
+
+# the following is for the status server
+# to communicate with Nagios plugin
+use threads;
+use threads::shared;
+use IO::Socket;
+use IO::Select;
+
+# Status text; shared between the main thread and the status server thread.
+my $status : shared = "OK";
+
+# Settings of the status server.
+our $server_port = '50502';
+our $protocol    = 'tcp';
+
+# The status server gets its own thread; the monitoring loop runs in the
+# main thread.
+threads->new( \&statusServer );
+main();
+
+exit(0);
+
+# Monitoring loop of the Lustre check.
+#
+# Every 120 seconds it checks that /lustre_alpha/hades/beam/sep08 can be
+# listed and stores a verdict in the shared $status string:
+#   CRITICAL  the directory cannot be listed, or more than 30 TB are used;
+#   WARNING   more than 25 TB are used;
+#   OK        otherwise.
+# The used space is measured with du on the first pass and then on every
+# 720th pass (720 x 120 s = 24 hours); the result is also printed.
+#
+# Takes no arguments and does not return.
+sub main {
+    my $statsize     = "0";    # used space in TB, two decimals
+    my $passes_to_go = 0;      # passes left until du is run again
+
+    for ( ;; ) {
+
+        if ( $passes_to_go == 0 ) {
+
+            # every 24 hours
+            # du -cms reports megabytes; the "total" line is converted to TB
+            foreach my $du_line ( grep { /total/ } `du -cms /lustre_alpha/hades` ) {
+                my ($mbytes) = split( " ", $du_line );
+                $statsize = sprintf( "%.2f", $mbytes / 1024 / 1024 );
+            }
+            $passes_to_go = 720;    # should be 720
+
+            print "/lustre_alpha/hades total size: $statsize TB\n";
+        }
+
+        # The file system counts as alive when ls echoes the path back.
+        chomp( my $listed = `ls -d /lustre_alpha/hades/beam/sep08` );
+
+        if ( $listed ne "/lustre_alpha/hades/beam/sep08" ) {
+            $status = "CRITICAL - failure of Lustre file system!";
+        }
+        elsif ( $statsize > 30 ) {
+            $status = "CRITICAL - Lustre disk space used: $statsize TB";
+        }
+        elsif ( $statsize > 25 ) {
+            $status = "WARNING - Lustre disk space used: $statsize TB";
+        }
+        else {
+            $status = "OK - Lustre disk space used: $statsize TB";
+        }
+
+        $passes_to_go--;
+
+        #print "$status \n";
+        sleep 120;    #should be 120
+    }
+}
+
+# Status server; meant to run in a thread of its own.
+#
+# Listens on port $server_port with protocol $protocol. Every client
+# that connects is sent the current content of the shared $status string
+# and is disconnected right away; nothing is read from the client.
+#
+# Takes no arguments. Returns only when the listening socket cannot be
+# created (an error message is printed first); otherwise loops forever.
+sub statusServer {
+
+    # ReuseAddr lets a restarted script bind the port while connections
+    # of the previous instance are still in TIME_WAIT.
+    my $server_socket = IO::Socket::INET->new(
+        LocalPort => $server_port,
+        Proto     => $protocol,
+        Listen    => SOMAXCONN,
+        ReuseAddr => 1,
+    );
+
+    unless ( defined $server_socket ) {
+        print "ERROR: Cannot start status server!\n";
+        return;    # nothing to listen on
+    }
+
+    my $selector = IO::Select->new($server_socket);
+
+    while (1) {
+
+        # wait 5 seconds for connections
+        while ( my @file_handles = $selector->can_read(5) ) {
+
+            foreach my $file_handle (@file_handles) {
+
+                next unless $file_handle == $server_socket;
+
+                # create a new socket for this transaction
+                my $client_socket = $server_socket->accept();
+
+                unless ( defined $client_socket ) {
+                    print "ERROR: Cannot open socket to send status!\n";
+                    next;    # printing to an undefined handle would die
+                }
+
+                print $client_socket $status;
+
+                close($client_socket);
+            }
+        }
+    }
+}
+
+
+
+
+
+
+
+
index a8e458b9cb12ce8284e74fa6aa141bffc11a17c5..9333c5b207fa1a606f4133863ea8e20e47661496 100755 (executable)
@@ -16,6 +16,17 @@ use Tie::File;
 use Fcntl;
 use IO::Handle;
 
+# some Nagios stuff
+use lib '/usr/local/nagios/libexec/';
+use utils qw($TIMEOUT %ERRORS &print_revision &support);
+
+# the following is for the status server
+# to communicate with Nagios plugin
+use threads;
+use threads::shared;
+use IO::Socket;
+use IO::Select;
+
 my $i;
 my @lines;
 my $line;
@@ -42,109 +53,133 @@ my $errfilenum3    = 0;  #file with many evtsTagError
 my ($evtsComplete, $evtsDiscarded, $evtsDataError, $evtsTagError);
 
 #- status info for Nagios
-my $status;
+my $status : shared = "OK - DAQ is running less than $last_minutes min.";
+my $state  = $ERRORS{'OK'};
 
-#--- loop over all lines backward in the file2read
-for ( $i = $#lines; $i > 1; $i--){
-    
-    $line = $lines[$i];
-    
-    #- look for a line with "stopdate"
-    if ( $line =~ /stopdate/){
-       
-       #- check the number of problematic events in file
-       if ($filenum > 0 && $evtsComplete > 0) {
-
-           #- estimate amount of discarded events
-           my $ratio1 = $evtsDiscarded/$evtsComplete;
-           my $ratio2 = $evtsDataError/$evtsComplete;
-           my $ratio3 = $evtsTagError/$evtsComplete;
-
-           #print "evtsComplete = $evtsComplete, evtsDiscarded = $evtsDiscarded, ratio1 = $ratio1\n";
-           
-           if ($ratio1 > 0.1) {
-               $errfilenum1++;
-           }
-           if ($ratio2 > 0.1) {
-               $errfilenum2++;
-           }
-           if ($ratio3 > 0.1) {
-               $errfilenum3++;
-           }
-       }
+#- some variables needed for statusServer
+our $server_port = '50501';
+our $protocol    = 'tcp';
 
-       #- extract stop date from the line (format: "2007-05-05T19:32:53")
-       my ($v1, $v2, $stop_date) = split(" ", $line);
-       
-       #- get rid of ""
-       $stop_date =~ s/\"//g;
+threads->new( \&statusServer);
 
-       #-get rid of "T"
-       $stop_date =~ s/T/ /;
+&main();
+exit(0);
 
-       #- get time difference (in minutes)
-       my $time_diff = &timeDiff( date1 => $stop_date, date2 => $iso_now );
+sub main {
 
-       #print "stop_date = $stop_date, time_diff = $time_diff\n";
+    while(1) {
 
-       #- look for a recent hour
-       if ( $time_diff > $last_minutes) {
-          
-           if ($filenum == 0) {
-               $status = "OK - no new files in a log during last $last_minutes min.";
-               last;
-           }
-           elsif ($filenum > 0) {
-               my $persent1 = $errfilenum1/$filenum;
-               my $persent2 = $errfilenum2/$filenum;
-               my $persent3 = $errfilenum3/$filenum;
-
-               # if number of files with discarded events above threshold
-               # exceeds 10% -> send a WARNING
-               if ($persent1 > 0.1) {
-                   $status = "WARNING - $persent1 files with discarded events during last $last_minutes min.";
-                   last;
+       #--- loop over all lines backward in the file2read
+       for ( $i = $#lines; $i > 1; $i--){
+    
+           $line = $lines[$i];
+           
+           #- look for a line with "stopdate"
+           if ( $line =~ /stopdate/){
+               
+               #- check the number of problematic events in file
+               if ($filenum > 0 && $evtsComplete > 0) {
+                   
+                   #- estimate amount of discarded events
+                   my $ratio1 = $evtsDiscarded/$evtsComplete;
+                   my $ratio2 = $evtsDataError/$evtsComplete;
+                   my $ratio3 = $evtsTagError/$evtsComplete;
+                   
+                   #print "evtsComplete = $evtsComplete, evtsDiscarded = $evtsDiscarded, ratio1 = $ratio1\n";
+                   
+                   if ($ratio1 > 0.1) {
+                       $errfilenum1++;
+                   }
+                   if ($ratio2 > 0.1) {
+                       $errfilenum2++;
+                   }
+                   if ($ratio3 > 0.1) {
+                       $errfilenum3++;
+                   }
                }
-               elsif ($persent2 > 0.1) {
-                   $status = "WARNING - $persent2 files with data error during last $last_minutes min.";
-                   last;  
+               
+               #- extract stop date from the line (format: "2007-05-05T19:32:53")
+               my ($v1, $v2, $stop_date) = split(" ", $line);
+               
+               #- get rid of ""
+               $stop_date =~ s/\"//g;
+               
+               #-get rid of "T"
+               $stop_date =~ s/T/ /;
+               
+               #- get time difference (in minutes)
+               my $time_diff = &timeDiff( date1 => $stop_date, date2 => $iso_now );
+               
+               #print "stop_date = $stop_date, time_diff = $time_diff\n";
+               
+               #- look for a recent hour
+               if ( $time_diff > $last_minutes) {
+                   
+                   if ($filenum == 0) {
+                       $status = "OK - no new files in a log during last $last_minutes min.";
+                       $state = $ERRORS{'OK'};
+                       last;
+                   }
+                   elsif ($filenum > 0) {
+                       my $persent1 = $errfilenum1/$filenum;
+                       my $persent2 = $errfilenum2/$filenum;
+                       my $persent3 = $errfilenum3/$filenum;
+                       
+                       # if number of files with discarded events above threshold
+                       # exceeds 10% -> send a WARNING
+                       if ($persent1 > 0.1) {
+                           $status = "WARNING - $persent1 files with discarded events during last $last_minutes min.";
+                           $state = $ERRORS{'WARNING'};
+                           last;
+                       }
+                       elsif ($persent2 > 0.1) {
+                           $status = "WARNING - $persent2 files with data error during last $last_minutes min.";
+                           $state = $ERRORS{'WARNING'};
+                           last;  
+                       }
+                       elsif ($persent3 > 0.1) {
+                           $status = "WARNING - $persent3 files with tag error during last $last_minutes min.";
+                           $state = $ERRORS{'WARNING'};
+                           last;  
+                       }
+                       else {
+                           $status = "OK - $persent1 files with discarded events during last $last_minutes min.";
+                           $state = $ERRORS{'OK'};
+                           last;
+                       }
+                   }
+               } #if ( $time_diff > 60.) 
+               else {
+                   
+                   #- increment filenum counter
+                   $filenum++;
                }
-               elsif ($persent3 > 0.1) {
-                   $status = "WARNING - $persent3 files with tag error during last $last_minutes min.";
-                   last;  
+           } #if ( $line =~ /stopdate/){
+           else {
+               
+               my ($v1, $v2);
+               
+               if($line =~ /evtsComplete/) { 
+                   ($v1, $v2, $evtsComplete) = split(" ", $line);
                }
-               else {
-                   $status = "OK - $persent1 files with discarded events during last $last_minutes min.";
-                   last;
+               if($line =~ /evtsDiscarded/) {
+                   ($v1, $v2, $evtsDiscarded) = split(" ", $line);
+               }
+               if($line =~ /evtsDataError/) {
+                   ($v1, $v2, $evtsDataError) = split(" ", $line);
+               }
+               if($line =~ /evtsTagError/) {
+                   ($v1, $v2, $evtsTagError) = split(" ", $line);
                }
            }
-       } #if ( $time_diff > 60.) 
-       else {
-           
-           #- increment filenum counter
-           $filenum++;
-       }
-    } #if ( $line =~ /stopdate/){
-    else {
-
-       my ($v1, $v2);
-
-       if($line =~ /evtsComplete/) { 
-           ($v1, $v2, $evtsComplete) = split(" ", $line);
        }
-       if($line =~ /evtsDiscarded/) {
-           ($v1, $v2, $evtsDiscarded) = split(" ", $line);
-       }
-       if($line =~ /evtsDataError/) {
-           ($v1, $v2, $evtsDataError) = split(" ", $line);
-       }
-       if($line =~ /evtsTagError/) {
-           ($v1, $v2, $evtsTagError) = split(" ", $line);
-       }
-    }
-}
-
-print "status for Nagios: $status\n";
+    
+       print "status for Nagios: $status\n";
+#exit $state;
+       
+       sleep 1200; #20 min.
+    } #while(1)
+} #sub main
 
 sub timeDiff (%) {
     #
@@ -190,3 +225,45 @@ sub timeDiff (%) {
        my $totminutes = int($ssDiff /    60);
        "$totminutes";
 }
+
+# Status server; meant to run in a thread of its own.
+#
+# Listens on port $server_port with protocol $protocol. Every client
+# that connects is sent the current content of the shared $status string
+# and is disconnected right away; nothing is read from the client.
+#
+# Takes no arguments. Returns only when the listening socket cannot be
+# created (an error message is printed first); otherwise loops forever.
+sub statusServer
+{
+    # ReuseAddr lets a restarted script bind the port while connections
+    # of the previous instance are still in TIME_WAIT.
+    my $server_socket = IO::Socket::INET->new(
+        LocalPort => $server_port,
+        Proto     => $protocol,
+        Listen    => SOMAXCONN,
+        ReuseAddr => 1,
+    );
+
+    unless ( defined $server_socket ) {
+        print "ERROR: Cannot start status server!\n";
+        return;    # nothing to listen on
+    }
+
+    my $selector = IO::Select->new($server_socket);
+
+    while (1) {
+
+        # wait 5 seconds for connections
+        while ( my @file_handles = $selector->can_read(5) ) {
+
+            foreach my $file_handle (@file_handles) {
+
+                next unless $file_handle == $server_socket;
+
+                # create a new socket for this transaction
+                my $client_socket = $server_socket->accept();
+
+                unless ( defined $client_socket ) {
+                    print "ERROR: Cannot open socket to send status!\n";
+                    next;    # printing to an undefined handle would die
+                }
+
+                my $status_line = "$status";
+
+                print $client_socket $status_line;
+
+                close($client_socket);
+            }
+        }
+    }
+}