Initial revision

author hadaq <hadaq>

Fri, 19 Sep 2008 10:02:08 +0000 (10:02 +0000)

committer hadaq <hadaq>

Fri, 19 Sep 2008 10:02:08 +0000 (10:02 +0000)
author hadaq <hadaq>
Fri, 19 Sep 2008 10:02:08 +0000 (10:02 +0000)
committer hadaq <hadaq>
Fri, 19 Sep 2008 10:02:08 +0000 (10:02 +0000)
diff --git a/config/commands.cfg b/config/commands.cfg

new file mode 100644 (file)

index 0000000..bcb79eb
--- /dev/null
+++ b/config/commands.cfg
@@ -0,0 +1,393 @@
+################################################################################
+# Sample command definitions for Nagios 2.6
+#
+# Read the documentation for more information on this configuration file.  I've
+# provided some comments here, but things may not be so clear without further
+# explanation, so make sure to read the HTML documentation!
+# 
+# Last Modified: 11-21-2006
+#
+################################################################################
+
+
+################################################################################
+# COMMAND DEFINITIONS
+#
+# SYNTAX:
+#
+#      define command{
+#               template      <templatename>
+#              name          <objectname>
+#               command_name  <commandname>
+#               command_line  <commandline>
+#               }
+#
+# WHERE:
+#
+# <templatename> = object name of another command definition that should be
+#                  used as a template for this definition (optional)
+# <objectname>   = object name of command definition, referenced by other
+#                  command definitions that use it as a template (optional)
+# <commandname>  = name of the command, as recognized/used by Nagios
+# <commandline>  = command line
+#
+################################################################################
+
+
+
+
+################################################################################
+#
+# SAMPLE SERVICE CHECK COMMANDS
+#
+# These are some example service check commands.  They may or may not work on
+# your system, as they must be modified for your plugins.  See the HTML 
+# documentation on the plugins for examples of how to configure command definitions.
+#
+################################################################################
+
+
+################################################################################
+# NOTE:  The following 'check_local_...' functions are designed to monitor
+#        various metrics on the host that Nagios is running on (i.e. this one).
+################################################################################
+
+# 'check_local_disk' command definition
+define command{
+        command_name    check_local_disk
+        command_line    /usr/local/nagios/libexec/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$
+        }
+
+
+# 'check_local_load' command definition
+define command{
+        command_name    check_local_load
+        command_line    /usr/local/nagios/libexec/check_load -w $ARG1$ -c $ARG2$
+        }
+
+
+# 'check_local_procs' command definition
+define command{
+        command_name    check_local_procs
+        command_line    /usr/local/nagios/libexec/check_procs -w $ARG1$ -c $ARG2$ -s $ARG3$
+        }
+
+
+# 'check_local_users' command definition
+define command{
+        command_name    check_local_users
+        command_line    /usr/local/nagios/libexec/check_users -w $ARG1$ -c $ARG2$
+        }
+
+# 'check_local_nmap' command definition: scans TCP port 22 on localhost; the exit status is grep's (0 only when an "open" line is printed)
+define command{
+        command_name    check_local_nmap
+        command_line    /usr/bin/nmap -sT -p22 -P0 localhost 2> /dev/null | grep open
+        }
+
+
+################################################################################
+# NOTE:  The following 'check_...' commands are used to monitor services on
+#        both local and remote hosts.
+################################################################################
+
+# 'check_dns' command definition
+define command{
+        command_name    check_dns
+        command_line    /usr/local/nagios/libexec/check_dns -H www.yahoo.com -s $HOSTADDRESS$
+        }
+
+
+# 'check_ftp' command definition
+define command{
+        command_name    check_ftp
+        command_line    /usr/local/nagios/libexec/check_ftp -H $HOSTADDRESS$
+        }
+
+
+# 'check_hpjd' command definition
+define command{
+        command_name    check_hpjd
+        command_line    /usr/local/nagios/libexec/check_hpjd -H $HOSTADDRESS$ -C public
+        }
+
+
+# 'check_http' command definition
+define command{
+        command_name    check_http
+        command_line    /usr/local/nagios/libexec/check_http -H $HOSTADDRESS$
+        }
+
+
+# 'check_nntp' command definition
+define command{
+        command_name    check_nntp
+        command_line    /usr/local/nagios/libexec/check_nntp -H $HOSTADDRESS$
+        }
+
+
+# 'check_ping' command definition
+define command{
+        command_name    check_ping
+        command_line    /usr/local/nagios/libexec/check_ping -H $HOSTADDRESS$ -w $ARG1$ -c $ARG2$ -p 5
+        }
+
+
+# 'check_pop' command definition
+define command{
+        command_name    check_pop
+        command_line    /usr/local/nagios/libexec/check_pop -H $HOSTADDRESS$
+        }
+
+
+# 'check_smtp' command definition
+define command{
+        command_name    check_smtp
+        command_line    /usr/local/nagios/libexec/check_smtp -H $HOSTADDRESS$
+        }
+
+
+# 'check_tcp' command definition
+define command{
+       command_name    check_tcp
+       command_line    /usr/local/nagios/libexec/check_tcp -H $HOSTADDRESS$ -p $ARG1$
+       }
+
+
+# 'check_telnet' command definition
+define command{
+        command_name    check_telnet
+        command_line    /usr/local/nagios/libexec/check_tcp -H $HOSTADDRESS$ -p 23
+        }
+
+
+# 'check_udp' command definition
+define command{
+       command_name    check_udp
+       command_line    /usr/local/nagios/libexec/check_udp -H $HOSTADDRESS$ -p $ARG1$
+       }
+
+# 'check_ssh' command definition
+define command{
+       command_name    check_ssh
+       command_line    /usr/local/nagios/libexec/check_ssh -t $ARG1$ $HOSTADDRESS$
+       }
+
+
+
+################################################################################
+#
+# SAMPLE HOST CHECK COMMANDS
+#
+################################################################################
+
+
+# This command checks to see if a host is "alive" by pinging it
+# The check must result in a 100% packet loss or 5 second (5000ms) round trip 
+# average time to produce a critical error.
+# Note: Only one ICMP echo packet is sent (determined by the '-p 1' argument)
+
+# 'check-host-alive' command definition
+define command{
+        command_name    check-host-alive
+        command_line    /usr/local/nagios/libexec/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 1
+        }
+
+
+
+
+################################################################################
+#
+# SAMPLE NOTIFICATION COMMANDS
+#
+# These are some example notification commands.  They may or may not work on
+# your system without modification.  As an example, some systems will require 
+# you to use "/usr/bin/mailx" instead of "/usr/bin/mail" in the commands below.
+#
+################################################################################
+
+
+# 'host-notify-by-email' command definition
+define command{
+       command_name    host-notify-by-email
+       command_line    /usr/bin/printf "%b" "***** Nagios 2.6 *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\n\nDate/Time: $LONGDATETIME$\n" | /usr/bin/mail -s "Host $HOSTSTATE$ alert for $HOSTNAME$!" $CONTACTEMAIL$
+       }
+
+
+# 'host-notify-by-epager' command definition
+define command{
+       command_name    host-notify-by-epager
+       command_line    /usr/bin/printf "%b" "Host '$HOSTALIAS$' is $HOSTSTATE$\nInfo: $HOSTOUTPUT$\nTime: $LONGDATETIME$" | /usr/bin/mail -s "$NOTIFICATIONTYPE$ alert - Host $HOSTNAME$ is $HOSTSTATE$" $CONTACTPAGER$
+       }
+
+# 'notify-by-email' command definition
+define command{
+       command_name    notify-by-email
+       command_line    /usr/bin/printf "%b" "***** Nagios 2.6 *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$" | /usr/bin/mail -s "** $NOTIFICATIONTYPE$ alert - $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
+       }
+
+
+# 'notify-by-epager' command definition
+define command{
+       command_name    notify-by-epager
+       command_line    /usr/bin/printf "%b" "Service: $SERVICEDESC$\nHost: $HOSTNAME$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\nInfo: $SERVICEOUTPUT$\nDate: $LONGDATETIME$" | /usr/bin/mail -s "$NOTIFICATIONTYPE$: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$" $CONTACTPAGER$
+       }
+
+
+
+
+
+################################################################################
+#
+# SAMPLE PERFORMANCE DATA COMMANDS
+#
+# These are sample performance data commands that can be used to send performance
+# data output to two text files (one for hosts, another for services).  If you
+# plan on simply writing performance data out to a file, consider using the 
+# host_perfdata_file and service_perfdata_file options in the main config file.
+#
+################################################################################
+
+
+# 'process-host-perfdata' command definition
+define command{
+       command_name    process-host-perfdata
+       command_line    /usr/bin/printf "%b" "$LASTHOSTCHECK$\t$HOSTNAME$\t$HOSTSTATE$\t$HOSTATTEMPT$\t$HOSTSTATETYPE$\t$HOSTEXECUTIONTIME$\t$HOSTOUTPUT$\t$HOSTPERFDATA$\n" >> /usr/local/nagios/var/host-perfdata.out
+       }
+
+
+# 'process-service-perfdata' command definition
+define command{
+       command_name    process-service-perfdata
+       command_line    /usr/bin/printf "%b" "$LASTSERVICECHECK$\t$HOSTNAME$\t$SERVICEDESC$\t$SERVICESTATE$\t$SERVICEATTEMPT$\t$SERVICESTATETYPE$\t$SERVICEEXECUTIONTIME$\t$SERVICELATENCY$\t$SERVICEOUTPUT$\t$SERVICEPERFDATA$\n" >> /usr/local/nagios/var/service-perfdata.out
+       }
+
+
+########################################################################
+#
+# Remote host check commands go first
+#
+########################################################################
+
+#check_proc_qa-dst_by_ssh
+define command {
+   command_name check_proc_qa-dst_by_ssh
+   command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/misc/hadaq/nagios/nagios-plugins-1.4.5/plugins-scripts/my_check_process_qa-dst.pl $ARG1$ $ARG2$ $ARG3$"
+
+}
+
+#check_proc_by_ssh
+define command {
+   command_name check_proc_by_ssh
+   command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/misc/hadaq/nagios/nagios-plugins-1.4.5/plugins-scripts/my_check_process.pl $ARG1$"
+
+}
+
+#check_proc2_by_ssh
+define command {
+   command_name check_proc2_by_ssh
+   command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/usr/local/nagios/libexec/my_check_process.pl $ARG1$"
+
+}
+
+#check_proc2
+define command {
+   command_name check_proc2
+   command_line /usr/local/nagios/libexec/my_check_process.pl $ARG1$
+}
+
+#check_by_ssh check_disk on lxg0447 (separate command because check_disk is installed under a different path on lxg0447)
+define command {
+   command_name check_disk_by_ssh_lxg0447
+   command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/misc/hadaq/nagios/nagios-plugins-1.4.5/plugins/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$"
+}
+
+#check_by_ssh check_disk on lxg0451 (separate command because check_disk is installed under a different path on lxg0451)
+define command {
+   command_name check_disk_by_ssh_lxg0451
+   command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/misc/hadaq/nagios/nagios-plugins-1.4.5/plugins/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$"
+}
+
+#check_by_ssh check_disk
+define command {
+   command_name check_disk_by_ssh
+   command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/usr/local/nagios/libexec/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$"
+}
+
+#check_by_ssh my_check_raid.pl
+define command {
+   command_name check_raid_by_ssh
+   command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/usr/local/nagios/libexec/my_check_raid.pl"
+}
+
+#check_by_ssh check_load
+define command {
+   command_name check_load_by_ssh
+   command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/usr/local/nagios/libexec/check_load -w $ARG1$,$ARG2$,$ARG3$ -c $ARG4$,$ARG5$,$ARG6$"
+}
+
+#check_load
+define command {
+   command_name check_load
+   command_line /usr/local/nagios/libexec/check_load -w $ARG1$,$ARG2$,$ARG3$ -c $ARG4$,$ARG5$,$ARG6$
+}
+
+#my_check_raid.pl
+define command {
+   command_name check_raid
+   command_line /usr/local/nagios/libexec/my_check_raid.pl
+}
+
+#my_check_archivist.pl
+define command {
+   command_name check_archivist
+   command_line /usr/local/nagios/libexec/my_check_archivist.pl $HOSTADDRESS$ $ARG1$   
+}
+
+#my_check_disk_smartctl.pl
+define command {
+   command_name check_disk_smartctl
+   command_line /usr/local/nagios/libexec/my_check_disk_smartctl.pl -d $ARG1$ -d $ARG2$
+}
+
+#my_check_disk_smartctl.pl run on the remote host via check_by_ssh. NOTE(review): unlike check_disk_smartctl, the two arguments are passed without '-d' here - confirm this is intended
+define command {
+   command_name check_disk_smartctl_by_ssh
+   command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/usr/local/nagios/libexec/my_check_disk_smartctl.pl $ARG1$ $ARG2$"
+}
+
+#my_check_disk_smartctl.pl with the -t option and -w/-c thresholds (check_disk_smartctl_temp)
+define command {
+   command_name check_disk_smartctl_temp
+   command_line /usr/local/nagios/libexec/my_check_disk_smartctl.pl -d $ARG1$ -d $ARG2$ -t -w $ARG3$ -c $ARG4$
+}
+
+#my_check_disk_smartctl.pl with the -t option and -w/-c thresholds, run on the remote host via check_by_ssh
+define command {
+   command_name check_disk_smartctl_temp_by_ssh
+   command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/usr/local/nagios/libexec/my_check_disk_smartctl.pl -d $ARG1$ -d $ARG2$ -t -w $ARG3$ -c $ARG4$"
+}
+
+#my_check_dhcp.pl
+define command {
+   command_name check_dhcp
+   command_line /usr/local/nagios/libexec/my_check_dhcp.pl $ARG1$ 
+}
+
+#my_check_proc_status.pl
+define command {
+   command_name check_proc_status
+   command_line /usr/local/nagios/libexec/my_check_proc_status.pl $HOSTADDRESS$ $ARG1$ $ARG2$
+}
+
+#my_check_eblog_status.pl
+define command {
+   command_name check_eblog    
+   command_line /usr/local/nagios/libexec/my_check_eblog_status.pl $HOSTADDRESS$ $ARG1$
+}
+
+#my_epics.sh
+define command {
+   command_name check_epics    
+   command_line /usr/local/nagios/libexec/my_epics.sh -pv $ARG1$
+}
+\ No newline at end of file
diff --git a/config/contactgroups.cfg b/config/contactgroups.cfg

new file mode 100644 (file)

index 0000000..1f856fa
--- /dev/null
+++ b/config/contactgroups.cfg
@@ -0,0 +1,13 @@
+# 'linux-admins' contact group definition
+define contactgroup{
+       contactgroup_name       linux-admins
+       alias                   Linux Administrators
+       members                 hadaq,Michael,Ingo
+       }
+
+# 'qa-dst-admins' contact group definition
+#define contactgroup{
+#       contactgroup_name       qa-dst-admins
+#       alias                   online QA/DST Administrators
+#       members                 hadaq,Jacek,Malgorzata
+#       }
+\ No newline at end of file
diff --git a/config/contacts.cfg b/config/contacts.cfg

new file mode 100644 (file)

index 0000000..0b4eeae
--- /dev/null
+++ b/config/contacts.cfg
@@ -0,0 +1,60 @@
+# 'hadaq' contact definition (Nagios admin)
+define contact{
+       contact_name                    hadaq
+       alias                           Nagios Admin
+       service_notification_period     24x7
+       host_notification_period        24x7
+       service_notification_options    w,u,c,r
+       host_notification_options       d,u,r
+       service_notification_commands   notify-by-email
+       host_notification_commands      host-notify-by-email
+       email                           s.yurevich@gsi.de
+       }
+
+define contact{
+       contact_name                    Michael
+       alias                           DAQ Expert
+       service_notification_period     24x7
+       host_notification_period        24x7
+       service_notification_options    w,u,c,r
+       host_notification_options       d,u,r
+       service_notification_commands   notify-by-email
+       host_notification_commands      host-notify-by-email
+       email                           m.traxler@gsi.de
+       }
+
+define contact{
+       contact_name                    Ingo
+       alias                           DAQ Expert 2
+       service_notification_period     24x7
+       host_notification_period        24x7
+       service_notification_options    w,u,c,r
+       host_notification_options       d,u,r
+       service_notification_commands   notify-by-email
+       host_notification_commands      host-notify-by-email
+       email                           froehlich@physik.uni-frankfurt.de
+       }
+
+define contact{
+       contact_name                    Jacek
+       alias                           QA/DST Expert 
+       service_notification_period     24x7
+       host_notification_period        24x7
+       service_notification_options    w,u,c,r
+       host_notification_options       d,u,r
+       service_notification_commands   notify-by-email
+       host_notification_commands      host-notify-by-email
+       email                           otwinow@hades2.if.uj.edu.pl
+       }
+
+define contact{
+       contact_name                    Malgorzata
+       alias                           QA/DST Expert 2
+       service_notification_period     24x7
+       host_notification_period        24x7
+       service_notification_options    w,u,c,r
+       host_notification_options       d,u,r
+       service_notification_commands   notify-by-email
+       host_notification_commands      host-notify-by-email
+       email                           M.Sudol@gsi.de
+       }
+\ No newline at end of file
diff --git a/config/hostgroups.cfg b/config/hostgroups.cfg

new file mode 100644 (file)

index 0000000..f38558e
--- /dev/null
+++ b/config/hostgroups.cfg
@@ -0,0 +1,39 @@
+# 'vmecpu-group' host group definition (VME CPUs)
+define hostgroup{
+       hostgroup_name  vmecpu-group
+       alias           VME CPUS
+#       contact_groups  <edit-this> ; This needs to be the same value as the value located in service.cfg file. Nagios 2.5 produces an error if you define this.
+       members         hadc01,hadc02,hadc03,hadc04,hadc05,hadc06,hadc07,hadc08,hadc09,hadc10,hadc11,hadc12,hadc13,hadc14,hadc15,hadc16,hadc17
+       }
+
+# 'hadeb-group' (EB servers) host group definition
+define hostgroup{
+       hostgroup_name  hadeb-group
+       alias           EB Servers
+#      contact_groups  <edit-this> ; This needs to be the same value as the value located in service.cfg file. Nagios 2.5 produces an error if you define this.
+       members         hadeb01,hadeb03,hadeb04,hadeb05,hadeb06a,hadeb07,lxhadesdaq
+       }
+
+# 'lxg-group' host group definition
+define hostgroup{
+       hostgroup_name  lxg-group
+       alias           lxg hosts
+#      contact_groups  <edit-this> ; This needs to be the same value as the value located in service.cfg file. Nagios 2.5 produces an error if you define this.
+       members         lxg0447,lxg0411,lxg0451,lxg0434,lxg0440,lxg0441,lxg0442,lxg0443,lxg0444,lxg0430,lxg0438,lxg0449,lxg0450
+       }
+
+# 'hades-group' host group definition
+define hostgroup{
+       hostgroup_name  hades-group
+       alias           hades hosts
+#      contact_groups  <edit-this> ; This needs to be the same value as the value located in service.cfg file. Nagios 2.5 produces an error if you define this.
+       members        hades25,hades17,hades27
+       }
+
+# 'scs-group' host group definition
+define hostgroup{
+       hostgroup_name  scs-group
+       alias           scs hosts
+#      contact_groups  <edit-this> ; This needs to be the same value as the value located in service.cfg file. Nagios 2.5 produces an error if you define this.
+       members        hadsc1
+       }
diff --git a/config/hosts.cfg b/config/hosts.cfg

new file mode 100644 (file)

index 0000000..464bf9d
--- /dev/null
+++ b/config/hosts.cfg
@@ -0,0 +1,542 @@
+# Generic host definition template
+define host{
+        name                            generic-host    ; The name of this host template
+        notifications_enabled           1       ; Host notifications are enabled
+        event_handler_enabled           1       ; Host event handler is enabled
+        flap_detection_enabled          1       ; Flap detection is enabled
+        process_perf_data               1       ; Process performance data
+        retain_status_information       1       ; Retain status information
+        retain_nonstatus_information    1       ; Retain non-status information
+
+        register                        0       ; DONT REGISTER, JUST A TEMPLATE!
+        }
+
+define host{
+        name                            vme-cpu    ; The name of this host template
+        notifications_enabled           1       ; Host notifications are enabled
+        event_handler_enabled           1       ; Host event handler is enabled
+        flap_detection_enabled          1       ; Flap detection is enabled
+        process_perf_data               1       ; Process performance data
+        retain_status_information       1       ; Retain status information
+        retain_nonstatus_information    1       ; Retain non-status information
+        check_command           check-host-alive
+        check_period            24x7          
+        contact_groups          linux-admins   
+        max_check_attempts      5
+        notification_interval   120
+        notification_period     24x7
+        notification_options    d,u,r
+
+        register                        0       ; DONT REGISTER, JUST A TEMPLATE!
+        }
+
+define host{
+        name                            hadeb-host    ; The name of this host template
+        notifications_enabled           1       ; Host notifications are enabled
+        event_handler_enabled           1       ; Host event handler is enabled
+        flap_detection_enabled          1       ; Flap detection is enabled
+        process_perf_data               1       ; Process performance data
+        retain_status_information       1       ; Retain status information
+        retain_nonstatus_information    1       ; Retain non-status information
+        check_command           check-host-alive
+        check_period            24x7          
+        contact_groups          linux-admins   
+        max_check_attempts      5
+        notification_interval   120
+        notification_period     24x7
+        notification_options    d,u,r
+
+        register                        0       ; DONT REGISTER, JUST A TEMPLATE!
+        }
+
+define host{
+        name                            lxg-host    ; The name of this host template
+        notifications_enabled           1       ; Host notifications are enabled
+        event_handler_enabled           1       ; Host event handler is enabled
+        flap_detection_enabled          1       ; Flap detection is enabled
+        process_perf_data               1       ; Process performance data
+        retain_status_information       1       ; Retain status information 
+        retain_nonstatus_information    1       ; Retain non-status information 
+        check_command           check-host-alive
+        check_period            24x7          
+        contact_groups          linux-admins   
+        max_check_attempts      5
+        notification_interval   120
+        notification_period     24x7
+        notification_options    d,u,r
+
+        register                        0       ; DONT REGISTER, JUST A TEMPLATE!
+        }
+
+define host{
+        name                            hades-host ; The name of this host template
+        notifications_enabled           1       ; Host notifications are enabled
+        event_handler_enabled           1       ; Host event handler is enabled
+        flap_detection_enabled          1       ; Flap detection is enabled
+        process_perf_data               1       ; Process performance data
+        retain_status_information       1       ; Retain status information
+        retain_nonstatus_information    1       ; Retain non-status information
+        check_command           check-host-alive
+        check_period            24x7          
+        contact_groups          linux-admins   
+        max_check_attempts      5
+        notification_interval   120
+        notification_period     24x7
+        notification_options    d,u,r
+
+        register                        0       ; DONT REGISTER, JUST A TEMPLATE!
+        }
+
+define host{
+        name                            scs-host ; The name of this host template
+        notifications_enabled           1       ; Host notifications are enabled
+        event_handler_enabled           1       ; Host event handler is enabled
+        flap_detection_enabled          1       ; Flap detection is enabled
+        process_perf_data               1       ; Process performance data
+        retain_status_information       1       ; Retain status information
+        retain_nonstatus_information    1       ; Retain non-status information
+        check_command           check-host-alive
+        check_period            24x7          
+        contact_groups          linux-admins   
+        max_check_attempts      5
+        notification_interval   120
+        notification_period     24x7
+        notification_options    d,u,r
+
+        register                        0       ; DONT REGISTER, JUST A TEMPLATE!
+        }
+
+# 'localhost' host definition
+define host{
+        name                    localhost
+        use                     generic-host    ; Name of host template to use
+        host_name               hadesdaq
+        alias                   nagios server
+        address                 127.0.0.1
+        check_command           check-host-alive
+        check_period            24x7     
+        contact_groups          linux-admins
+        max_check_attempts      5
+        notification_interval   120
+        notification_period     24x7
+        notification_options    d,u,r
+        register                1
+        }
+
+# hadeb01 host definition
+define host{
+        name                    hadeb01
+        use                     hadeb-host    ; Name of host template to use
+        host_name               hadeb01
+        alias                   old eb server
+        address                 140.181.96.30
+        register                1
+        }
+
+# hadeb03 host definition
+define host{
+        name                    hadeb03
+        use                     hadeb-host    ; Name of host template to use
+        host_name               hadeb03
+        alias                   backup server
+        address                 140.181.97.118
+        register                1
+        }
+
+# hadeb04 host definition
+define host{
+        name                    hadeb04
+        use                     hadeb-host    ; Name of host template to use
+        host_name               hadeb04
+        alias                   server
+        address                 140.181.83.152
+        register                1
+        }
+
+# hadeb05 host definition
+define host{
+        name                    hadeb05
+        use                     hadeb-host    ; Name of host template to use
+        host_name               hadeb05
+        alias                   server
+        address                 140.181.93.18
+        register                1
+        }
+
+# hadeb06a host definition
+define host{
+        name                    hadeb06a
+        use                     hadeb-host    ; Name of host template to use
+        host_name               hadeb06a
+        alias                   server
+        address                 140.181.93.112
+        register                1
+        }
+
+#hadeb07
+define host{
+        name                    hadeb07
+        use                     hadeb-host    ; Name of host template to use
+        host_name               hadeb07
+        alias                   backup server
+        address                 140.181.103.216
+        register                1
+        }
+
+#lxhadesdaq host definition
+define host{
+        name                    lxhadesdaq
+        use                     generic-host    ; Name of host template to use
+        host_name               lxhadesdaq
+        alias                   main server
+        address                 140.181.75.158
+        check_command           check-host-alive
+        check_period            24x7            ; new
+        contact_groups          linux-admins    ; new
+        max_check_attempts      5
+        notification_interval   120
+        notification_period     24x7
+        notification_options    d,u,r
+        register                1
+        }
+
+#hadc01
+define host{
+        name                    hadc01
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc01
+        alias                   ---
+        address                 140.181.82.98
+        register                1
+        }
+
+#hadc02
+define host{
+        name                    hadc02
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc02
+        alias                   MDC-1 readout in cave
+        address                 140.181.84.20
+        register                1
+        }
+
+#hadc03
+define host{
+        name                    hadc03
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc03
+        alias                   TOF-0 readout in cave
+        address                 140.181.87.78
+        register                1
+        }
+
+#hadc04
+define host{
+        name                    hadc04
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc04
+        alias                   TOF-1 readout in cave
+        address                 140.181.87.80
+        register                1
+        }
+
+#hadc05
+define host{
+        name                    hadc05
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc05
+        alias                   TOF-2 readout in cave
+        address                 140.181.87.82
+        register                1
+        }
+
+#hadc06
+define host{
+        name                    hadc06
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc06
+        alias                   TOF-3 readout in cave
+        address                 140.181.87.84
+        register                1
+        }
+
+#hadc07
+define host{
+        name                    hadc07
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc07
+        alias                   TOF-4 readout in cave
+        address                 140.181.87.86
+        register                1
+        }
+
+#hadc08
+define host{
+        name                    hadc08
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc08
+        alias                   Matching Unit
+        address                 140.181.87.88
+        register                1
+        }
+
+#hadc09
+define host{
+        name                    hadc09
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc09
+        alias                   Ingos lab in Frankfurt
+        address                 140.181.87.90
+        register                1
+        }
+
+#hadc10
+define host{
+        name                    hadc10
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc10
+        alias                   Shower
+        address                 140.181.87.92
+        register                1
+        }
+
+#hadc11
+define host{
+        name                    hadc11
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc11
+        alias                   RICH1, Torte TU-Munchen
+        address                 140.181.87.94
+        register                1
+        }
+
+#hadc12
+define host{
+        name                    hadc12
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc12
+        alias                   RICH1
+        address                 140.181.87.96
+        register                1
+        }
+
+#hadc13
+define host{
+        name                    hadc13
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc13
+        alias                   RICH2
+        address                 140.181.87.98
+        register                1
+        }
+
+#hadc14
+define host{
+        name                    hadc14
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc14
+        alias                   RICH3
+        address                 140.181.87.100
+        register                1
+        }
+
+#hadc15
+define host{
+        name                    hadc15
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc15
+        alias                   MDC-0 readout in cave
+        address                 140.181.87.102
+        register                1
+        }
+
+#hadc16
+define host{
+        name                    hadc16
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc16
+        alias                   EE-Lab, GSI
+        address                 140.181.87.104
+        register                1
+        }
+
+#hadc17
+define host{
+        name                    hadc17
+        use                     vme-cpu    ; Name of host template to use
+        host_name               hadc17
+        alias                   Lab in Giessen, Tiago
+        address                 140.181.87.106
+        register                1
+        }
+
+#lxg0411
+define host{
+        name                    lxg0411
+        use                     lxg-host    ; Name of host template to use
+        host_name               lxg0411
+        alias                   QA Server (Go4)
+        address                 140.181.74.222
+        register                1
+        }
+
+#lxg0447
+define host{
+        name                    lxg0447
+        use                     lxg-host    ; Name of host template to use
+        host_name               lxg0447
+        alias                   QA RAM-Disk
+        address                 140.181.92.234
+        register                1
+        }
+
+#lxg0430
+define host{
+        name                    lxg0430
+        use                     lxg-host    ; Name of host template to use
+        host_name               lxg0430
+        alias                   online DST 
+        address                 140.181.67.145
+        register                1
+        }
+
+#lxg0434
+define host{
+        name                    lxg0434
+        use                     lxg-host    ; Name of host template to use
+        host_name               lxg0434
+        alias                   EPICS Oracle
+        address                 140.181.84.32
+        register                1
+        }
+
+#lxg0438
+define host{
+        name                    lxg0438
+        use                     lxg-host    ; Name of host template to use
+        host_name               lxg0438
+        alias                   Rossendorf PC
+        address                 140.181.84.40
+        register                1
+        }
+
+
+#lxg0440
+define host{
+        name                    lxg0440
+        use                     lxg-host    ; Name of host template to use
+        host_name               lxg0440
+        alias                   RICH acc PC
+        address                 140.181.92.220
+        register                1
+        }
+
+#lxg0441
+define host{
+        name                    lxg0441
+        use                     lxg-host    ; Name of host template to use
+        host_name               lxg0441
+        alias                   MDC acc PC
+        address                 140.181.92.222
+        register                1
+        }
+
+#lxg0442
+define host{
+        name                    lxg0442
+        use                     lxg-host    ; Name of host template to use
+        host_name               lxg0442
+        alias                   Start/Veto/Trigger acc PC
+        address                 140.181.92.224
+        register                1
+        }
+
+#lxg0443
+define host{
+        name                    lxg0443
+        use                     lxg-host    ; Name of host template to use
+        host_name               lxg0443
+        alias                   TOF/TOFino acc PC
+        address                 140.181.92.226
+        register                1
+        }
+
+#lxg0444
+define host{
+        name                    lxg0444
+        use                     lxg-host    ; Name of host template to use
+        host_name               lxg0444
+        alias                   Shower acc PC
+        address                 140.181.92.228
+        register                1
+        }
+
+#lxg0449
+define host{
+        name                    lxg0449
+        use                     lxg-host    ; Name of host template to use
+        host_name               lxg0449
+        alias                   pc in cave
+        address                 140.181.102.238
+        register                1
+        }
+
+#lxg0450
+define host{
+        name                    lxg0450
+        use                     lxg-host    ; Name of host template to use
+        host_name               lxg0450
+        alias                   pc in cave
+        address                 140.181.102.240
+        register                1
+        }
+
+#lxg0451
+define host{
+        name                    lxg0451
+        use                     lxg-host    ; Name of host template to use
+        host_name               lxg0451
+        alias                   online DST PC1
+        address                 140.181.103.214
+        register                1
+        }
+
+#hades25
+define host{
+        name                    hades25
+        use                     hades-host    ; Name of host template to use
+        host_name               hades25
+        alias                   Slow Control System
+        address                 140.181.107.26
+        register                1
+        }
+
+#hades17 (used to be hades26)
+define host{
+        name                    hades17
+        use                     hades-host    ; Name of host template to use
+        host_name               hades17
+        alias                   - System
+        address                 140.181.100.181  ; used to be 140.181.107.28 (hades26)
+        register                1
+        }
+
+#hades27
+define host{
+        name                    hades27
+        use                     hades-host    ; Name of host template to use
+        host_name               hades27
+        alias                   - System
+        address                 140.181.107.30
+        register                1
+        }
+
+#hadsc1
+define host{
+        name                    hadsc1
+        use                     scs-host    ; Name of host template to use
+        host_name               hadsc1
+        alias                   - System
+        address                 140.181.111.196
+        register                1
+        }
diff --git a/config/nagios.cfg b/config/nagios.cfg

new file mode 100644 (file)

index 0000000..4a65928
--- /dev/null
+++ b/config/nagios.cfg
@@ -0,0 +1,948 @@
+##############################################################################
+#
+# NAGIOS.CFG - Sample Main Config File for Nagios 2.6
+#
+# Read the documentation for more information on this configuration
+# file.  I've provided some comments here, but things may not be so
+# clear without further explanation.
+#
+# Last Modified: 11-21-2006
+#
+##############################################################################
+
+
+# LOG FILE
+# This is the main log file where service and host events are logged
+# for historical purposes.  This should be the first option specified 
+# in the config file!!!
+
+#log_file=/usr/local/nagios/var/nagios.log
+log_file=/var/log/nagios/nagios.log
+
+# OBJECT CONFIGURATION FILE(S)
+# This is the configuration file in which you define hosts, host
+# groups, contacts, contact groups, services, etc.  I guess it would
+# be better called an object definition file, but for historical
+# reasons it isn't.  You can split object definitions into several
+# different config files by using multiple cfg_file statements here.
+# Nagios will read and process all the config files you define.
+# This can be very useful if you want to keep command definitions 
+# separate from host and contact definitions...
+
+# Command definitions
+cfg_file=/usr/local/nagios/etc/commands.cfg
+
+# Host and service definitions for monitoring this machine
+#cfg_file=/usr/local/nagios/etc/localhost.cfg
+
+
+# You can split other types of object definitions across several
+# config files if you wish (as done here), or keep them all in a
+# single config file.
+
+cfg_file=/usr/local/nagios/etc/contactgroups.cfg
+cfg_file=/usr/local/nagios/etc/contacts.cfg
+#cfg_file=/usr/local/nagios/etc/dependencies.cfg
+#cfg_file=/usr/local/nagios/etc/escalations.cfg
+cfg_file=/usr/local/nagios/etc/hostgroups.cfg
+cfg_file=/usr/local/nagios/etc/hosts.cfg
+cfg_file=/usr/local/nagios/etc/services.cfg
+#cfg_file=/usr/local/nagios/etc/services_qadst.cfg
+cfg_file=/usr/local/nagios/etc/servicegroups.cfg
+cfg_file=/usr/local/nagios/etc/timeperiods.cfg
+
+# Extended host/service info definitions are now stored along with
+# other object definitions:
+#cfg_file=/usr/local/nagios/etc/hostextinfo.cfg
+#cfg_file=/usr/local/nagios/etc/serviceextinfo.cfg
+
+# You can also tell Nagios to process all config files (with a .cfg
+# extension) in a particular directory by using the cfg_dir
+# directive as shown below:
+
+#cfg_dir=/usr/local/nagios/etc/servers
+#cfg_dir=/usr/local/nagios/etc/printers
+#cfg_dir=/usr/local/nagios/etc/switches
+#cfg_dir=/usr/local/nagios/etc/routers
+
+
+
+# OBJECT CACHE FILE
+# This option determines where object definitions are cached when
+# Nagios starts/restarts.  The CGIs read object definitions from 
+# this cache file (rather than looking at the object config files
+# directly) in order to prevent inconsistencies that can occur
+# when the config files are modified after Nagios starts.
+
+object_cache_file=/usr/local/nagios/var/objects.cache
+
+
+
+# RESOURCE FILE
+# This is an optional resource file that contains $USERx$ macro
+# definitions. Multiple resource files can be specified by using
+# multiple resource_file definitions.  The CGIs will not attempt to
+# read the contents of resource files, so information that is
+# considered to be sensitive (usernames, passwords, etc) can be
+# defined as macros in this file and restrictive permissions (600)
+# can be placed on this file.
+
+resource_file=/usr/local/nagios/etc/resource.cfg
+
+
+
+# STATUS FILE
+# This is where the current status of all monitored services and
+# hosts is stored.  Its contents are read and processed by the CGIs.
+# The contents of the status file are deleted every time Nagios
+#  restarts.
+
+#status_file=/usr/local/nagios/var/status.dat
+status_file=/var/log/nagios/status.dat
+
+
+# NAGIOS USER
+# This determines the effective user that Nagios should run as.  
+# You can either supply a username or a UID.
+
+nagios_user=hadaq
+
+
+
+# NAGIOS GROUP
+# This determines the effective group that Nagios should run as.  
+# You can either supply a group name or a GID.
+
+nagios_group=users
+
+
+
+# EXTERNAL COMMAND OPTION
+# This option allows you to specify whether or not Nagios should check
+# for external commands (in the command file defined below).  By default
+# Nagios will *not* check for external commands, just to be on the
+# cautious side.  If you want to be able to use the CGI command interface
+# you will have to enable this.  Setting this value to 0 disables command
+# checking (the default), other values enable it.
+
+check_external_commands=1
+
+
+
+# EXTERNAL COMMAND CHECK INTERVAL
+# This is the interval at which Nagios should check for external commands.
+# This value works off of the interval_length you specify later.  If you leave
+# that at its default value of 60 (seconds), a value of 1 here will cause
+# Nagios to check for external commands every minute.  If you specify a
+# number followed by an "s" (i.e. 15s), this will be interpreted to mean
+# actual seconds rather than a multiple of the interval_length variable.
+# Note: In addition to reading the external command file at regularly 
+# scheduled intervals, Nagios will also check for external commands after
+# event handlers are executed.
+# NOTE: Setting this value to -1 causes Nagios to check the external
+# command file as often as possible.
+
+#command_check_interval=15s
+command_check_interval=-1
+
+
+
+# EXTERNAL COMMAND FILE
+# This is the file that Nagios checks for external command requests.
+# It is also where the command CGI will write commands that are submitted
+# by users, so it must be writeable by the user that the web server
+# is running as (usually 'nobody').  Permissions should be set at the 
+# directory level instead of on the file, as the file is deleted every
+# time its contents are processed.
+
+command_file=/usr/local/nagios/var/rw/nagios.cmd
+
+
+
+# COMMENT FILE
+# This is the file that Nagios will use for storing host and service
+# comments.
+
+#comment_file=/usr/local/nagios/var/comments.dat
+comment_file=/var/log/nagios/comments.dat
+
+
+# DOWNTIME FILE
+# This is the file that Nagios will use for storing host and service
+# downtime data.
+
+#downtime_file=/usr/local/nagios/var/downtime.dat
+downtime_file=/var/log/nagios/downtime.dat
+
+
+# LOCK FILE
+# This is the lockfile that Nagios will use to store its PID number
+# in when it is running in daemon mode.
+
+#lock_file=/usr/local/nagios/var/nagios.lock
+lock_file=/var/log/nagios/nagios.lock
+
+
+# TEMP FILE
+# This is a temporary file that is used as scratch space when Nagios
+# updates the status log, cleans the comment file, etc.  This file
+# is created, used, and deleted throughout the time that Nagios is
+# running.
+
+temp_file=/usr/local/nagios/var/nagios.tmp
+
+
+
+# EVENT BROKER OPTIONS
+# Controls what (if any) data gets sent to the event broker.
+# Values:  0      = Broker nothing
+#         -1      = Broker everything
+#         <other> = See documentation
+
+event_broker_options=-1
+
+
+
+# EVENT BROKER MODULE(S)
+# This directive is used to specify an event broker module that should
+# be loaded by Nagios at startup.  Use multiple directives if you want
+# to load more than one module.  Arguments that should be passed to
+# the module at startup are separated from the module path by a space.
+#
+# Example:
+#
+#   broker_module=<modulepath> [moduleargs]
+
+#broker_module=/somewhere/module1.o
+#broker_module=/somewhere/module2.o arg1 arg2=3 debug=0
+
+
+
+
+# LOG ROTATION METHOD
+# This is the log rotation method that Nagios should use to rotate
+# the main log file. Values are as follows..
+#      n       = None - don't rotate the log
+#      h       = Hourly rotation (top of the hour)
+#      d       = Daily rotation (midnight every day)
+#      w       = Weekly rotation (midnight on Saturday evening)
+#      m       = Monthly rotation (midnight last day of month)
+
+log_rotation_method=d
+
+
+
+# LOG ARCHIVE PATH
+# This is the directory where archived (rotated) log files should be 
+# placed (assuming you've chosen to do log rotation).
+
+log_archive_path=/usr/local/nagios/var/archives
+
+
+
+# LOGGING OPTIONS
+# If you want messages logged to the syslog facility, as well as the
+# Nagios log file, set this option to 1.  If not, set it to 0.
+
+use_syslog=0
+
+
+
+# NOTIFICATION LOGGING OPTION
+# If you don't want notifications to be logged, set this value to 0.
+# If notifications should be logged, set the value to 1.
+
+log_notifications=0
+
+
+
+# SERVICE RETRY LOGGING OPTION
+# If you don't want service check retries to be logged, set this value
+# to 0.  If retries should be logged, set the value to 1.
+
+log_service_retries=1
+
+
+
+# HOST RETRY LOGGING OPTION
+# If you don't want host check retries to be logged, set this value to
+# 0.  If retries should be logged, set the value to 1.
+
+log_host_retries=1
+
+
+
+# EVENT HANDLER LOGGING OPTION
+# If you don't want host and service event handlers to be logged, set
+# this value to 0.  If event handlers should be logged, set the value
+# to 1.
+
+log_event_handlers=1
+
+
+
+# INITIAL STATES LOGGING OPTION
+# If you want Nagios to log all initial host and service states to
+# the main log file (the first time the service or host is checked)
+# you can enable this option by setting this value to 1.  If you
+# are not using an external application that does long term state
+# statistics reporting, you do not need to enable this option.  In
+# this case, set the value to 0.
+
+log_initial_states=0
+
+
+
+# EXTERNAL COMMANDS LOGGING OPTION
+# If you don't want Nagios to log external commands, set this value
+# to 0.  If external commands should be logged, set this value to 1.
+# Note: This option does not include logging of passive service
+# checks - see the option below for controlling whether or not
+# passive checks are logged.
+
+log_external_commands=1
+
+
+
+# PASSIVE CHECKS LOGGING OPTION
+# If you don't want Nagios to log passive host and service checks, set
+# this value to 0.  If passive checks should be logged, set
+# this value to 1.
+
+log_passive_checks=1
+
+
+
+# GLOBAL HOST AND SERVICE EVENT HANDLERS
+# These options allow you to specify a host and service event handler
+# command that is to be run for every host or service state change.
+# The global event handler is executed immediately prior to the event
+# handler that you have optionally specified in each host or
+# service definition. The command argument is the short name of a
+# command definition that you define in your object configuration file.
+# Read the HTML docs for more information.
+
+#global_host_event_handler=somecommand
+#global_service_event_handler=somecommand
+
+
+
+# SERVICE INTER-CHECK DELAY METHOD
+# This is the method that Nagios should use when initially
+# "spreading out" service checks when it starts monitoring.  The
+# default is to use smart delay calculation, which will try to
+# space all service checks out evenly to minimize CPU load.
+# Using no delay (the "n" setting) will cause all checks to be scheduled
+# at the same time (with no delay between them)!  This is not a
+# good thing for production, but is useful when testing the
+# parallelization functionality.
+#      n       = None - don't use any delay between checks
+#      d       = Use a "dumb" delay of 1 second between checks
+#      s       = Use "smart" inter-check delay calculation
+#       x.xx    = Use an inter-check delay of x.xx seconds
+
+service_inter_check_delay_method=s
+
+
+
+# MAXIMUM SERVICE CHECK SPREAD
+# This variable determines the timeframe (in minutes) from the
+# program start time that an initial check of all services should
+# be completed.  Default is 30 minutes.
+
+max_service_check_spread=30
+
+
+
+# SERVICE CHECK INTERLEAVE FACTOR
+# This variable determines how service checks are interleaved.
+# Interleaving the service checks allows for a more even
+# distribution of service checks and reduced load on remote
+# hosts.  Setting this value to 1 is equivalent to how versions
+# of Nagios previous to 0.0.5 did service checks.  Set this
+# value to s (smart) for automatic calculation of the interleave
+# factor unless you have a specific reason to change it.
+#       s       = Use "smart" interleave factor calculation
+#       x       = Use an interleave factor of x, where x is a
+#                 number greater than or equal to 1.
+
+service_interleave_factor=s
+
+
+
+# HOST INTER-CHECK DELAY METHOD
+# This is the method that Nagios should use when initially
+# "spreading out" host checks when it starts monitoring.  The
+# default is to use smart delay calculation, which will try to
+# space all host checks out evenly to minimize CPU load.
+# Using no delay (the "n" setting) will cause all checks to be scheduled
+# at the same time (with no delay between them)!
+#      n       = None - don't use any delay between checks
+#      d       = Use a "dumb" delay of 1 second between checks
+#      s       = Use "smart" inter-check delay calculation
+#       x.xx    = Use an inter-check delay of x.xx seconds
+
+host_inter_check_delay_method=s
+
+
+
+# MAXIMUM HOST CHECK SPREAD
+# This variable determines the timeframe (in minutes) from the
+# program start time that an initial check of all hosts should
+# be completed.  Default is 30 minutes.
+
+max_host_check_spread=30
+
+
+
+# MAXIMUM CONCURRENT SERVICE CHECKS
+# This option allows you to specify the maximum number of 
+# service checks that can be run in parallel at any given time.
+# Specifying a value of 1 for this variable essentially prevents
+# any service checks from being parallelized.  A value of 0
+# will not restrict the number of concurrent checks that are
+# being executed.
+
+max_concurrent_checks=0
+
+
+
+# SERVICE CHECK REAPER FREQUENCY
+# This is the frequency (in seconds!) that Nagios will process
+# the results of services that have been checked.
+
+service_reaper_frequency=10
+
+
+
+
+# AUTO-RESCHEDULING OPTION
+# This option determines whether or not Nagios will attempt to
+# automatically reschedule active host and service checks to
+# "smooth" them out over time.  This can help balance the load on
+# the monitoring server.  
+# WARNING: THIS IS AN EXPERIMENTAL FEATURE - IT CAN DEGRADE
+# PERFORMANCE, RATHER THAN INCREASE IT, IF USED IMPROPERLY
+
+auto_reschedule_checks=0
+
+
+
+# AUTO-RESCHEDULING INTERVAL
+# This option determines how often (in seconds) Nagios will
+# attempt to automatically reschedule checks.  This option only
+# has an effect if the auto_reschedule_checks option is enabled.
+# Default is 30 seconds.
+# WARNING: THIS IS AN EXPERIMENTAL FEATURE - IT CAN DEGRADE
+# PERFORMANCE, RATHER THAN INCREASE IT, IF USED IMPROPERLY
+
+auto_rescheduling_interval=30
+
+
+
+
+# AUTO-RESCHEDULING WINDOW
+# This option determines the "window" of time (in seconds) that
+# Nagios will look at when automatically rescheduling checks.
+# Only host and service checks that occur in the next X seconds
+# (determined by this variable) will be rescheduled. This option
+# only has an effect if the auto_reschedule_checks option is
+# enabled.  Default is 180 seconds (3 minutes).
+# WARNING: THIS IS AN EXPERIMENTAL FEATURE - IT CAN DEGRADE
+# PERFORMANCE, RATHER THAN INCREASE IT, IF USED IMPROPERLY
+
+auto_rescheduling_window=180
+
+
+
+# SLEEP TIME
+# This is the number of seconds to sleep between checking for system
+# events and service checks that need to be run.
+
+sleep_time=0.25
+
+
+
+# TIMEOUT VALUES
+# These options control how much time Nagios will allow various
+# types of commands to execute before killing them off.  Options
+# are available for controlling maximum time allotted for
+# service checks, host checks, event handlers, notifications, the
+# ocsp command, and performance data commands.  All values are in
+# seconds.
+
+service_check_timeout=60
+host_check_timeout=30
+event_handler_timeout=30
+notification_timeout=30
+ocsp_timeout=5
+perfdata_timeout=5
+
+
+
+# RETAIN STATE INFORMATION
+# This setting determines whether or not Nagios will save state
+# information for services and hosts before it shuts down.  Upon
+# startup Nagios will reload all saved service and host state
+# information before starting to monitor.  This is useful for 
+# maintaining long-term data on state statistics, etc, but will
+# slow Nagios down a bit when it (re)starts.  Since it's only
+# a one-time penalty, I think it's well worth the additional
+# startup delay.
+
+retain_state_information=1
+
+
+
+# STATE RETENTION FILE
+# This is the file that Nagios should use to store host and
+# service state information before it shuts down.  The state 
+# information in this file is also read immediately prior to
+# starting to monitor the network when Nagios is restarted.
+# This file is used only if the retain_state_information
+# variable is set to 1.
+
+state_retention_file=/usr/local/nagios/var/retention.dat
+
+
+
+# RETENTION DATA UPDATE INTERVAL
+# This setting determines how often (in minutes) that Nagios
+# will automatically save retention data during normal operation.
+# If you set this value to 0, Nagios will not save retention
+# data at regular intervals, but it will still save retention
+# data before shutting down or restarting.  If you have disabled
+# state retention, this option has no effect.
+
+retention_update_interval=60
+
+
+
+# USE RETAINED PROGRAM STATE
+# This setting determines whether or not Nagios will set 
+# program status variables based on the values saved in the
+# retention file.  If you want to use retained program status
+# information, set this value to 1.  If not, set this value
+# to 0.
+
+use_retained_program_state=1
+
+
+
+# USE RETAINED SCHEDULING INFO
+# This setting determines whether or not Nagios will retain
+# the scheduling info (next check time) for hosts and services
+# based on the values saved in the retention file.
+# If you want to use retained scheduling info, set this
+# value to 1.  If not, set this value to 0.
+
+use_retained_scheduling_info=0
+
+
+
+# INTERVAL LENGTH
+# This is the seconds per unit interval as used in the
+# host/contact/service configuration files.  Setting this to 60 means
+# that each interval is one minute long (60 seconds).  Other settings
+# have not been tested much, so your mileage is likely to vary...
+
+interval_length=60
+
+
+
+# AGGRESSIVE HOST CHECKING OPTION
+# If you don't want to turn on aggressive host checking features, set
+# this value to 0 (the default).  Otherwise set this value to 1 to
+# enable the aggressive check option.  Read the docs for more info
+# on what aggressive host check is or check out the source code in
+# base/checks.c
+
+use_aggressive_host_checking=0
+
+
+
+# SERVICE CHECK EXECUTION OPTION
+# This determines whether or not Nagios will actively execute
+# service checks when it initially starts.  If this option is 
+# disabled, checks are not actively made, but Nagios can still
+# receive and process passive check results that come in.  Unless
+# you're implementing redundant hosts or have a special need for
+# disabling the execution of service checks, leave this enabled!
+# Values: 1 = enable checks, 0 = disable checks
+
+execute_service_checks=1
+
+
+
+# PASSIVE SERVICE CHECK ACCEPTANCE OPTION
+# This determines whether or not Nagios will accept passive
+# service check results when it initially (re)starts.
+# Values: 1 = accept passive checks, 0 = reject passive checks
+
+accept_passive_service_checks=1
+
+
+
+# HOST CHECK EXECUTION OPTION
+# This determines whether or not Nagios will actively execute
+# host checks when it initially starts.  If this option is 
+# disabled, checks are not actively made, but Nagios can still
+# receive and process passive check results that come in.  Unless
+# you're implementing redundant hosts or have a special need for
+# disabling the execution of host checks, leave this enabled!
+# Values: 1 = enable checks, 0 = disable checks
+
+execute_host_checks=1
+
+
+
+# PASSIVE HOST CHECK ACCEPTANCE OPTION
+# This determines whether or not Nagios will accept passive
+# host check results when it initially (re)starts.
+# Values: 1 = accept passive checks, 0 = reject passive checks
+
+accept_passive_host_checks=1
+
+
+
+# NOTIFICATIONS OPTION
+# This determines whether or not Nagios will send out any host or
+# service notifications when it is initially (re)started.
+# Values: 1 = enable notifications, 0 = disable notifications
+
+enable_notifications=0
+
+
+
+# EVENT HANDLER USE OPTION
+# This determines whether or not Nagios will run any host or
+# service event handlers when it is initially (re)started.  Unless
+# you're implementing redundant hosts, leave this option enabled.
+# Values: 1 = enable event handlers, 0 = disable event handlers
+
+enable_event_handlers=1
+
+
+
+# PROCESS PERFORMANCE DATA OPTION
+# This determines whether or not Nagios will process performance
+# data returned from service and host checks.  If this option is
+# enabled, host performance data will be processed using the
+# host_perfdata_command (defined below) and service performance
+# data will be processed using the service_perfdata_command (also
+# defined below).  Read the HTML docs for more information on
+# performance data.
+# Values: 1 = process performance data, 0 = do not process performance data
+
+process_performance_data=0
+
+
+
+# HOST AND SERVICE PERFORMANCE DATA PROCESSING COMMANDS
+# These commands are run after every host and service check is
+# performed.  These commands are executed only if the
+# process_performance_data option (above) is set to 1.  The command
+# argument is the short name of a command definition that you
+# define in your object configuration file.  Read the HTML docs for
+# more information on performance data.
+
+#host_perfdata_command=process-host-perfdata
+#service_perfdata_command=process-service-perfdata
+
+
+
+# HOST AND SERVICE PERFORMANCE DATA FILES
+# These files are used to store host and service performance data.
+# Performance data is only written to these files if the
+# process_performance_data option (above) is set to 1.
+
+#host_perfdata_file=/tmp/host-perfdata
+#service_perfdata_file=/tmp/service-perfdata
+
+
+
+# HOST AND SERVICE PERFORMANCE DATA FILE TEMPLATES
+# These options determine what data is written (and how) to the
+# performance data files.  The templates may contain macros, special
+# characters (\t for tab, \r for carriage return, \n for newline)
+# and plain text.  A newline is automatically added after each write
+# to the performance data file.  Some examples of what you can do are
+# shown below.
+
+#host_perfdata_file_template=[HOSTPERFDATA]\t$TIMET$\t$HOSTNAME$\t$HOSTEXECUTIONTIME$\t$HOSTOUTPUT$\t$HOSTPERFDATA$
+#service_perfdata_file_template=[SERVICEPERFDATA]\t$TIMET$\t$HOSTNAME$\t$SERVICEDESC$\t$SERVICEEXECUTIONTIME$\t$SERVICELATENCY$\t$SERVICEOUTPUT$\t$SERVICEPERFDATA$
+
+
+
+
+# HOST AND SERVICE PERFORMANCE DATA FILE MODES
+# This option determines whether or not the host and service
+# performance data files are opened in write ("w") or append ("a")
+# mode.  Unless the files are named pipes, you will probably
+# want to use the default mode of append ("a").
+
+#host_perfdata_file_mode=a
+#service_perfdata_file_mode=a
+
+
+
+# HOST AND SERVICE PERFORMANCE DATA FILE PROCESSING INTERVAL
+# These options determine how often (in seconds) the host and service
+# performance data files are processed using the commands defined
+# below.  A value of 0 indicates the files should not be periodically
+# processed.
+
+#host_perfdata_file_processing_interval=0
+#service_perfdata_file_processing_interval=0
+
+
+
+# HOST AND SERVICE PERFORMANCE DATA FILE PROCESSING COMMANDS
+# These commands are used to periodically process the host and
+# service performance data files.  The interval at which the
+# processing occurs is determined by the options above.
+
+#host_perfdata_file_processing_command=process-host-perfdata-file
+#service_perfdata_file_processing_command=process-service-perfdata-file
+
+
+
+# OBSESS OVER SERVICE CHECKS OPTION
+# This determines whether or not Nagios will obsess over service
+# checks and run the ocsp_command defined below.  Unless you're
+# planning on implementing distributed monitoring, do not enable
+# this option.  Read the HTML docs for more information on
+# implementing distributed monitoring.
+# Values: 1 = obsess over services, 0 = do not obsess (default)
+
+obsess_over_services=0
+
+
+
+# OBSESSIVE COMPULSIVE SERVICE PROCESSOR COMMAND
+# This is the command that is run for every service check that is
+# processed by Nagios.  This command is executed only if the
+# obsess_over_services option (above) is set to 1.  The command
+# argument is the short name of a command definition that you
+# define in your host configuration file. Read the HTML docs for
+# more information on implementing distributed monitoring.
+
+#ocsp_command=somecommand
+
+
+
+# ORPHANED SERVICE CHECK OPTION
+# This determines whether or not Nagios will periodically 
+# check for orphaned services.  Since service checks are not
+# rescheduled until the results of their previous execution 
+# instance are processed, there exists a possibility that some
+# checks may never get rescheduled.  This seems to be a rare
+# problem and should not happen under normal circumstances.
+# If you have problems with service checks never getting
+# rescheduled, you might want to try enabling this option.
+# Values: 1 = enable checks, 0 = disable checks
+
+check_for_orphaned_services=1
+
+
+
+# SERVICE FRESHNESS CHECK OPTION
+# This option determines whether or not Nagios will periodically
+# check the "freshness" of service results.  Enabling this option
+# is useful for ensuring passive checks are received in a timely
+# manner.
+# Values: 1 = enable freshness checking, 0 = disable freshness checking
+
+check_service_freshness=1
+
+
+
+# SERVICE FRESHNESS CHECK INTERVAL
+# This setting determines how often (in seconds) Nagios will
+# check the "freshness" of service check results.  If you have
+# disabled service freshness checking, this option has no effect.
+
+service_freshness_check_interval=60
+
+
+
+# HOST FRESHNESS CHECK OPTION
+# This option determines whether or not Nagios will periodically
+# check the "freshness" of host results.  Enabling this option
+# is useful for ensuring passive checks are received in a timely
+# manner.
+# Values: 1 = enable freshness checking, 0 = disable freshness checking
+
+check_host_freshness=0
+
+
+
+# HOST FRESHNESS CHECK INTERVAL
+# This setting determines how often (in seconds) Nagios will
+# check the "freshness" of host check results.  If you have
+# disabled host freshness checking, this option has no effect.
+
+host_freshness_check_interval=60
+
+
+
+# AGGREGATED STATUS UPDATES
+# This option determines whether or not Nagios will 
+# aggregate updates of host, service, and program status
+# data.  Normally, status data is updated immediately when
+# a change occurs.  This can result in high CPU loads if
+# you are monitoring a lot of services.  If you want Nagios
+# to only refresh status data every few seconds, enable
+# this option.
+# Values: 1 = enable aggregate updates, 0 = disable aggregate updates
+
+aggregate_status_updates=1
+
+
+
+# AGGREGATED STATUS UPDATE INTERVAL
+# Combined with the aggregate_status_updates option,
+# this option determines the frequency (in seconds!) that
+# Nagios will periodically dump program, host, and 
+# service status data.  If you are not using aggregated
+# status data updates, this option has no effect.
+
+status_update_interval=15
+
+
+
+# FLAP DETECTION OPTION
+# This option determines whether or not Nagios will try
+# and detect hosts and services that are "flapping".  
+# Flapping occurs when a host or service changes between
+# states too frequently.  When Nagios detects that a 
+# host or service is flapping, it will temporarily suppress
+# notifications for that host/service until it stops
+# flapping.  Flap detection is very experimental, so read
+# the HTML documentation before enabling this feature!
+# Values: 1 = enable flap detection
+#         0 = disable flap detection (default)
+
+enable_flap_detection=0
+
+
+
+# FLAP DETECTION THRESHOLDS FOR HOSTS AND SERVICES
+# Read the HTML documentation on flap detection for
+# an explanation of what this option does.  This option
+# has no effect if flap detection is disabled.
+
+low_service_flap_threshold=5.0
+high_service_flap_threshold=20.0
+low_host_flap_threshold=5.0
+high_host_flap_threshold=20.0
+
+
+
+# DATE FORMAT OPTION
+# This option determines how short dates are displayed. Valid options
+# include:
+#      us              (MM-DD-YYYY HH:MM:SS)
+#      euro            (DD-MM-YYYY HH:MM:SS)
+#      iso8601         (YYYY-MM-DD HH:MM:SS)
+#      strict-iso8601  (YYYY-MM-DDTHH:MM:SS)
+#
+
+date_format=iso8601
+
+
+
+# P1.PL FILE LOCATION
+# This value determines where the p1.pl perl script (used by the
+# embedded Perl interpreter) is located.  If you didn't compile
+# Nagios with embedded Perl support, this option has no effect.
+
+p1_file=/usr/local/nagios/bin/p1.pl
+
+
+
+# ILLEGAL OBJECT NAME CHARACTERS
+# This option allows you to specify illegal characters that cannot
+# be used in host names, service descriptions, or names of other
+# object types.
+
+illegal_object_name_chars=`~!$%^&*|'"<>?,()=
+
+
+
+# ILLEGAL MACRO OUTPUT CHARACTERS
+# This option allows you to specify illegal characters that are
+# stripped from macros before being used in notifications, event
+# handlers, etc.  This DOES NOT affect macros used in service or
+# host check commands.
+# The following macros are stripped of the characters you specify:
+#      $HOSTOUTPUT$
+#      $HOSTPERFDATA$
+#      $HOSTACKAUTHOR$
+#      $HOSTACKCOMMENT$
+#      $SERVICEOUTPUT$
+#      $SERVICEPERFDATA$
+#      $SERVICEACKAUTHOR$
+#      $SERVICEACKCOMMENT$
+
+illegal_macro_output_chars=`~$&|'"<>
+
+
+
+# REGULAR EXPRESSION MATCHING
+# This option controls whether or not regular expression matching
+# takes place in the object config files.  Regular expression
+# matching is used to match host, hostgroup, service, and service
+# group names/descriptions in some fields of various object types.
+# Values: 1 = enable regexp matching, 0 = disable regexp matching
+
+use_regexp_matching=0
+
+
+
+# "TRUE" REGULAR EXPRESSION MATCHING
+# This option controls whether or not "true" regular expression 
+# matching takes place in the object config files.  This option
+# only has an effect if regular expression matching is enabled
+# (see above).  If this option is DISABLED, regular expression
+# matching only occurs if a string contains wildcard characters
+# (* and ?).  If the option is ENABLED, regexp matching occurs
+# all the time (which can be annoying).
+# Values: 1 = enable true matching, 0 = disable true matching
+
+use_true_regexp_matching=0
+
+
+
+
+# ADMINISTRATOR EMAIL ADDRESS
+# The email address of the administrator of *this* machine (the one
+# doing the monitoring).  Nagios never uses this value itself, but
+# you can access this value by using the $ADMINEMAIL$ macro in your
+# notification commands.
+
+admin_email=s.yurevich@gsi.de
+
+
+# ADMINISTRATOR PAGER NUMBER/ADDRESS
+# The pager number/address for the administrator of *this* machine.
+# Nagios never uses this value itself, but you can access this
+# value by using the $ADMINPAGER$ macro in your notification
+# commands.
+
+admin_pager=pagehadaq
+
+
+
+# DAEMON CORE DUMP OPTION
+# This option determines whether or not Nagios is allowed to create
+# a core dump when it runs as a daemon.  Note that it is generally
+# considered bad form to allow this, but it may be useful for
+# debugging purposes.
+# Values: 1 - Allow core dumps
+#         0 - Do not allow core dumps (default)
+
+daemon_dumps_core=0
+
+
+
diff --git a/config/servicegroups.cfg b/config/servicegroups.cfg

new file mode 100644 (file)

index 0000000..44714f7
--- /dev/null
+++ b/config/servicegroups.cfg
@@ -0,0 +1,20 @@
+# SOUND SERVER service group.
+# 'members' is a flat, comma-separated list of host,service pairs; here the
+# SOUND_SERVER service on hadesdaq, lxhadesdaq and hadc08.
+define servicegroup{
+        servicegroup_name       soundserver-group
+        alias                   SOUND SERVER
+        members                 hadesdaq,SOUND_SERVER,lxhadesdaq,SOUND_SERVER,hadc08,SOUND_SERVER
+        }
+
+# HARD DISK TEST service group.
+# Collects the disk-test and RAID1 services of hadesdaq, hadeb07, hades17,
+# hades25 and hades27 ('members' is a flat list of host,service pairs).
+define servicegroup{
+        servicegroup_name       harddisk-group
+        alias                   DISK TEST
+        members                 hadesdaq,DISK TEST,hadesdaq,RAID1,hadeb07,DISK_AB TEST,hadeb07,DISK_CD TEST,hades17,DISK TEST,hades17,RAID1,hades25,DISK TEST,hades25,RAID1,hades27,DISK TEST,hades27,RAID1
+        }
+
+# online QA/DST service group
+#define servicegroup{
+#       servicegroup_name  onlinedst-group
+#       alias           online QA/DST
+#       members lxg0411,updateQA,lxg0411,updateDST,lxg0430,runPairDST,lxg0440,runPairDST,lxg0441,runPairDST,lxg0442,runPairDST,lxg0443,runPairDST,lxg0444,runPairDST,lxg0451,runQA,lxg0452,runPairDST;
+#       }
+\ No newline at end of file
diff --git a/config/services.cfg b/config/services.cfg

new file mode 100644 (file)

index 0000000..d723892
--- /dev/null
+++ b/config/services.cfg
@@ -0,0 +1,923 @@
+# generic-service: base template pulled in (via 'use') by the service
+# definitions below.  This is NOT a real service, just a template.
+
+define service{
+# Identity: template name; register 0 keeps Nagios from registering the
+# definition as a real service.
+        name                            generic-service
+        register                        0
+
+# Check handling: active and passive checks are both enabled, active checks
+# are parallelized (disabling this can lead to major performance problems),
+# the service is obsessed over (if necessary), freshness is NOT checked by
+# default, and the service is not volatile.
+        active_checks_enabled           1
+        passive_checks_enabled          1
+        parallelize_check               1
+        obsess_over_service             1
+        check_freshness                 0
+        is_volatile                     0
+
+# Features: notifications, event handler, flap detection, failure prediction
+# and performance-data processing are all enabled.
+        notifications_enabled           1
+        event_handler_enabled           1
+        flap_detection_enabled          1
+        failure_prediction_enabled      1
+        process_perf_data               1
+
+# Retention: keep status and non-status information across program restarts.
+        retain_status_information       1
+        retain_nonstatus_information    1
+        }
+
+# PING-SERVICE: template (register 0) built on generic-service.
+# Checked every 5 time units, retried every 1 time unit, up to 3 attempts.
+define service{
+        name                            ping-service
+        use                             generic-service
+        register                        0
+        hostgroups                      *
+        check_period                    24x7
+        normal_check_interval           5
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins    ; Make sure that the value here is also located in the contactgroup.cfg
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# SSH-SERVICE: template (register 0) built on generic-service.
+# Checked every 10 time units, retried every 1 time unit, up to 3 attempts.
+define service{
+        name                            ssh-service
+        use                             generic-service
+        register                        0
+        hostgroups                      *
+        check_period                    24x7
+        normal_check_interval           10
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# PING-SERVICE for lxg hosts: template (register 0) built on generic-service.
+# Same as ping-service except that the normal check interval is 60 time units.
+define service{
+        name                            ping-service-lxg
+        use                             generic-service
+        register                        0
+        hostgroups                      *
+        check_period                    24x7
+        normal_check_interval           60
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins    ; Make sure that the value here is also located in the contactgroup.cfg
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# SSH-SERVICE for lxg hosts: template (register 0) built on generic-service.
+# Same as ssh-service except that the normal check interval is 60 time units.
+define service{
+        name                            ssh-service-lxg
+        use                             generic-service
+        register                        0
+        hostgroups                      *
+        check_period                    24x7
+        normal_check_interval           60
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# PING: ping check for the hosts of vmecpu-group and hadeb-group,
+# scheduled according to the ping-service template.
+define service{
+        use                             ping-service         ; Name of service template to use
+        service_description             PING
+        hostgroup_name                  vmecpu-group,hadeb-group
+#        host_name                       *
+        check_command                   check_ping!100.0,20%!500.0,60%
+        }
+
+
+# SSH: ssh check for the hosts of vmecpu-group and hadeb-group,
+# scheduled according to the ssh-service template.
+define service{
+        use                             ssh-service
+        service_description             SSH
+        hostgroup_name                  vmecpu-group,hadeb-group
+#        host_name                       *
+        check_command                   check_ssh!2
+        }
+
+############# COMMON SERVICES FOR LXG04**
+# PING: ping check for the hosts of lxg-group and hades-group,
+# scheduled according to the ping-service-lxg template.
+define service{
+        use                             ping-service-lxg         ; Name of service template to use
+        service_description             PING
+        hostgroup_name                  lxg-group,hades-group
+#        host_name                       *
+        check_command                   check_ping!100.0,20%!500.0,60%
+        }
+
+
+# SSH: ssh check for the hosts of lxg-group and hades-group,
+# scheduled according to the ssh-service-lxg template.
+define service{
+        use                             ssh-service-lxg
+        service_description             SSH
+        hostgroup_name                  lxg-group,hades-group
+#        host_name                       *
+        check_command                   check_ssh!2
+        }
+
+####################### hadesdaq ##########################
+# hadesdaq: local RAID check (check_raid), every 30 time units.
+define service{
+        use                             generic-service
+        host_name                       hadesdaq
+        service_description             RAID1
+        check_command                   check_raid
+        check_period                    24x7
+        normal_check_interval           30
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# hadesdaq: local CPU load check (check_load), every 5 time units.
+define service{
+        use                             generic-service
+        host_name                       hadesdaq
+        service_description             CPU LOAD
+        check_command                   check_load!10!10!0.9!20!20!1.5!
+        check_period                    24x7
+        normal_check_interval           5
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# hadesdaq: local disk test (check_disk_smartctl_temp on /dev/sda and
+# /dev/sdb), every 480 time units; renotification interval is 480 as well.
+define service{
+        use                             generic-service
+        host_name                       hadesdaq
+        service_description             DISK TEST
+        check_command                   check_disk_smartctl_temp!/dev/sda!/dev/sdb!50!60!
+        check_period                    24x7
+        normal_check_interval           480
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           480
+        notification_options            c,r
+        }
+
+# hadesdaq: check for the sound_server.pl process (check_proc2),
+# every 5 time units; renotification interval is 480.
+define service{
+        use                             generic-service
+        host_name                       hadesdaq
+        service_description             SOUND_SERVER
+        check_command                   check_proc2!sound_server.pl!
+        check_period                    24x7
+        normal_check_interval           5
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           480
+        notification_options            c,r
+        }
+
+################### lxhadesdaq ########################
+# lxhadesdaq: disk space of /data (check_disk_by_ssh), every 30 time units.
+define service{
+        use                             generic-service
+        host_name                       lxhadesdaq
+        service_description             /DATA
+        check_command                   check_disk_by_ssh!20%!10%!/data!
+        check_period                    24x7
+        normal_check_interval           30
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# lxhadesdaq: disk space of /var (check_disk_by_ssh), every 60 time units.
+define service{
+        use                             generic-service
+        host_name                       lxhadesdaq
+        service_description             /VAR
+        check_command                   check_disk_by_ssh!8%!4%!/var!
+        check_period                    24x7
+        normal_check_interval           60
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# lxhadesdaq: remote CPU load check (check_load_by_ssh), every 5 time units.
+define service{
+        use                             generic-service
+        host_name                       lxhadesdaq
+        service_description             CPU LOAD
+        check_command                   check_load_by_ssh!10!10!2.5!20!20!3.5!
+        check_period                    24x7
+        normal_check_interval           5
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# lxhadesdaq: archivist check (check_archivist), every 5 time units.
+define service{
+        use                             generic-service
+        host_name                       lxhadesdaq
+        service_description             ARCHIVIST
+        check_command                   check_archivist!1978!
+        check_period                    24x7
+        normal_check_interval           5
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# lxhadesdaq: check for the runinfo2ora.pl process (check_proc2_by_ssh),
+# every 10 time units.
+define service{
+        use                             generic-service
+        host_name                       lxhadesdaq
+        service_description             RUNINFO2ORA
+        check_command                   check_proc2_by_ssh!runinfo2ora.pl!
+        check_period                    24x7
+        normal_check_interval           10
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# lxhadesdaq: check for the sound_server.pl process (check_proc2_by_ssh),
+# every 10 time units.
+define service{
+        use                             generic-service
+        host_name                       lxhadesdaq
+        service_description             SOUND_SERVER
+        check_command                   check_proc2_by_ssh!sound_server.pl!
+        check_period                    24x7
+        normal_check_interval           10
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# lxhadesdaq: DHCP service check (check_dhcp), every 120 time units.
+define service{
+        use                             generic-service
+        host_name                       lxhadesdaq
+        service_description             DHCP
+        check_command                   check_dhcp!140.181.75.158!
+        check_period                    24x7
+        normal_check_interval           120
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# lxhadesdaq: check the EB log file for discarded events (check_eblog),
+# every 60 time units.
+define service{
+        use                             generic-service
+        host_name                       lxhadesdaq
+        service_description             DISCARDED EVTS
+        check_command                   check_eblog!50501!
+        check_period                    24x7
+        normal_check_interval           60
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# lxhadesdaq: LUSTRE status via check_proc_status (check_lustre),
+# every 2 time units.
+define service{
+        use                             generic-service
+        host_name                       lxhadesdaq
+        service_description             LUSTRE
+        check_command                   check_proc_status!50502!check_lustre!
+        check_period                    24x7
+        normal_check_interval           2
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+####################### hadeb01 ###########################
+# remote disk check
+#define service{
+#       use                             generic-service
+#       host_name                       hadeb01
+#       service_description             /VAR
+#       is_volatile                   0
+#       check_period                    24x7
+#       max_check_attempts              3
+#       normal_check_interval           60
+#       retry_check_interval            1
+#       contact_groups                  linux-admins
+#       notification_interval           120
+#       notification_period             24x7
+#       notification_options            c,r
+#       check_command                   check_disk_by_ssh!20%!10%!/var!
+#       }
+
+####################### hadeb03 ###########################
+# hadeb03: remote disk check of /d/hadeb03 (check_disk_by_ssh),
+# every 60 time units.
+define service{
+        use                             generic-service
+        host_name                       hadeb03
+        service_description             /D/HADEB03
+        check_command                   check_disk_by_ssh!20%!10%!/d/hadeb03!
+        check_period                    24x7
+        normal_check_interval           60
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# hadeb03: remote disk check of /d/hadeb03b (check_disk_by_ssh),
+# every 60 time units.
+define service{
+        use                             generic-service
+        host_name                       hadeb03
+        service_description             /D/HADEB03B
+        check_command                   check_disk_by_ssh!20%!10%!/d/hadeb03b!
+        check_period                    24x7
+        normal_check_interval           60
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# hadeb03: remote RAID check (check_raid_by_ssh), every 60 time units.
+define service{
+        use                             generic-service
+        host_name                       hadeb03
+        service_description             RAID1
+        check_command                   check_raid_by_ssh
+        check_period                    24x7
+        normal_check_interval           60
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+####################### hadeb04 ###########################
+# hadeb04: remote disk check of /data/hadeb04 (check_disk_by_ssh),
+# every 60 time units.
+define service{
+        use                             generic-service
+        host_name                       hadeb04
+        service_description             /DATA/HADEB04
+        check_command                   check_disk_by_ssh!20%!10%!/data/hadeb04!
+        check_period                    24x7
+        normal_check_interval           60
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+####################### hadeb05 ###########################
+# hadeb05: remote disk check of / (check_disk_by_ssh), every 60 time units.
+define service{
+        use                             generic-service
+        host_name                       hadeb05
+        service_description             /
+        check_command                   check_disk_by_ssh!20%!10%!/!
+        check_period                    24x7
+        normal_check_interval           60
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# hadeb05: remote CPU load check (check_load_by_ssh), every 5 time units.
+define service{
+        use                             generic-service
+        host_name                       hadeb05
+        service_description             CPU LOAD
+        check_command                   check_load_by_ssh!10!10!1.0!20!20!1.5!
+        check_period                    24x7
+        normal_check_interval           5
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+####################### hadeb06 ###########################
+# hadeb06a: remote disk check of /data/hadeb06 (check_disk_by_ssh),
+# every 60 time units.
+define service{
+        use                             generic-service
+        host_name                       hadeb06a
+        service_description             /DATA/HADEB06
+        check_command                   check_disk_by_ssh!20%!10%!/data/hadeb06!
+        check_period                    24x7
+        normal_check_interval           60
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+#define service{
+#       use                             generic-service
+#       host_name                       hadeb06a
+#       service_description             CONNECT_RES_RAM
+#       is_volatile                     0
+#       check_period                    24x7
+#       max_check_attempts              3
+#       normal_check_interval           10
+#       retry_check_interval            1
+#       contact_groups                  linux-admins
+#       notification_interval           120
+#       notification_period             24x7
+#       notification_options            c,r
+#       check_command                   check_proc2_by_ssh!connect_res_ram!
+#       }
+
+#define service{
+#       use                             generic-service
+#       host_name                       hadeb06a
+#       service_description             GET_HLD_RAMDISK
+#       is_volatile                     0
+#       check_period                    24x7
+#       max_check_attempts              3
+#       normal_check_interval           10
+#       retry_check_interval            1
+#       contact_groups                  linux-admins
+#       notification_interval           120
+#       notification_period             24x7
+#       notification_options            c,r
+#       check_command                   check_proc2_by_ssh!get_hld_ramdisk!
+#       }
+
+# hadeb06a: status of the connect_res_ram process via check_proc_status,
+# every 10 time units.
+define service{
+        use                             generic-service
+        host_name                       hadeb06a
+        service_description             CONNECT_RES_RAM
+        check_command                   check_proc_status!50501!connect_res_ram!
+        check_period                    24x7
+        normal_check_interval           10
+        retry_check_interval            1
+        max_check_attempts              3
+        is_volatile                     0
+        contact_groups                  linux-admins
+        notification_period             24x7
+        notification_interval           120
+        notification_options            c,r
+        }
+
+# remote process (get_hld_ramdisk) check status
+define service{
+       use                             generic-service
+       host_name                       hadeb06a
+       service_description             GET_HLD_RAMDISK
+       is_volatile                     0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           10
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           120
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_proc_status!50502!get_hld_ramdisk!
+       }
+
+####################### lxg0434 ###########################
+# remote process (check_archiver) check status
+define service{
+       use                             generic-service
+       host_name                       lxg0434
+       service_description             ARCHIVER
+       is_volatile                     0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           5
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           120
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_proc_status!50501!check_archiver!
+       }  
+
+####################### lxg0447 ###########################
+# remote disk check
+define service{
+       use                             generic-service
+       host_name                       lxg0447
+       service_description             /DATA.LOCAL2
+       is_volatile                     0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           60
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           120
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_disk_by_ssh_lxg0447!30%!20%!/data.local2!
+       }
+
+# remote process (connect_res) check
+define service{
+       use                             generic-service
+       host_name                       lxg0447
+       service_description             CONNECT_RES
+       is_volatile                     0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           60
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           120
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_proc_by_ssh!connect_res!
+       }
+
+# remote process (connect_res) check status
+define service{
+       use                             generic-service
+       host_name                       lxg0447
+       service_description             CONNECT_RES STATUS
+       is_volatile                     0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           10
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           120
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_proc_status!50501!connect_res!
+       }
+
+####################### lxg0451 ###########################
+# remote disk check
+define service{
+       use                             generic-service
+       host_name                       lxg0451
+       service_description             /DATA.LOCAL2
+       is_volatile                     0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           60
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           120
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_disk_by_ssh_lxg0451!15%!10%!/data.local2!
+       }
+
+# remote process (connect_res) check
+define service{
+       use                             generic-service
+       host_name                       lxg0451
+       service_description             CONNECT_RES
+       is_volatile                     0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           60
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           120
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_proc_by_ssh!connect_res!
+       }
+
+# remote process (connect_res) check status
+define service{
+       use                             generic-service
+       host_name                       lxg0451
+       service_description             CONNECT_RES STATUS
+       is_volatile                     0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           10
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           120
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_proc_status!50501!connect_res!
+       }
+
+####################### hadeb07 ###########################
+# remote disk check
+define service{
+       use                             generic-service
+       host_name                       hadeb07
+       service_description             DISK_AB TEST
+       is_volatile                    0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           1440
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           1440
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_disk_smartctl_temp_by_ssh!/dev/sda!/dev/sdb!50!60!
+       }
+
+define service{
+       use                             generic-service
+       host_name                       hadeb07
+       service_description             DISK_CD TEST
+       is_volatile                    0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           1440
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           1440
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_disk_smartctl_temp_by_ssh!/dev/sdc!/dev/sdd!50!60!
+       }
+
+define service{
+       use                             generic-service
+       host_name                       hadeb07
+       service_description             BACKUP
+       is_volatile                     0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           1440
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           1440
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_proc_status!50501!check_backup!
+       }       
+
+###################### hadc08 #############################
+#check process: sound_server.pl
+define service{
+       use                             generic-service
+       host_name                       hadc08
+       service_description             SOUND_SERVER
+       is_volatile                    0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           10
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           120
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_proc2_by_ssh!sound_server.pl!
+       }
+
+###################### hades25 ############################
+## remote disk test
+define service{
+       use                             generic-service
+       host_name                       hades25
+       service_description             DISK TEST
+       is_volatile                    0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           1440
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           1440
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_disk_smartctl_temp_by_ssh!/dev/sda!/dev/sdb!50!60!
+       }
+
+# remote raid check
+define service{
+       use                             generic-service
+       host_name                       hades25
+       service_description             RAID1
+       is_volatile                    0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           1440
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           1440
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_raid_by_ssh
+       }
+
+#check disk space
+define service{
+       use                             generic-service
+       host_name                       hades25
+       service_description             /
+       is_volatile                    0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           1440
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           1440
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_disk_by_ssh!20%!10%!/!
+       }
+
+###################### hades17 ############################
+## remote disk test
+define service{
+       use                             generic-service
+       host_name                       hades17
+       service_description             DISK TEST
+       is_volatile                    0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           1440
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           1440
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_disk_smartctl_temp_by_ssh!/dev/sda!/dev/sdb!60!70!
+       }
+
+# remote raid check
+define service{
+       use                             generic-service
+       host_name                       hades17
+       service_description             RAID1
+       is_volatile                    0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           1440
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           1440
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_raid_by_ssh
+       }
+
+#check disk space
+define service{
+       use                             generic-service
+       host_name                       hades17
+       service_description             /
+       is_volatile                    0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           1440
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           1440
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_disk_by_ssh!20%!10%!/!
+       }
+
+###################### hades27 ############################
+## remote disk test
+define service{
+       use                             generic-service
+       host_name                       hades27
+       service_description             DISK TEST
+       is_volatile                    0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           1440
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           1440
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_disk_smartctl_temp_by_ssh!/dev/sda!/dev/sdb!60!70!
+       }
+
+# remote raid check
+define service{
+       use                             generic-service
+       host_name                       hades27
+       service_description             RAID1
+       is_volatile                    0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           1440
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           1440
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_raid_by_ssh
+       }
+
+#check disk space
+define service{
+       use                             generic-service
+       host_name                       hades27
+       service_description             /
+       is_volatile                    0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           1440
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           1440
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_disk_by_ssh!20%!10%!/!
+       }
+
+#check EPICS
+define service{
+       use                             generic-service
+       host_name                       hadsc1
+       service_description             EPICS test
+       is_volatile                    0
+       check_period                    24x7
+       max_check_attempts              3
+       normal_check_interval           1440
+       retry_check_interval            1
+       contact_groups                  linux-admins
+       notification_interval           1440
+       notification_period             24x7
+       notification_options            c,r
+       check_command                   check_epics!HAD:hadsc1:scan1!
+       }
diff --git a/plugins/check_archivist.pl b/plugins/check_archivist.pl

new file mode 100755 (executable)

index 0000000..3e96d1c
--- /dev/null
+++ b/plugins/check_archivist.pl
@@ -0,0 +1,90 @@
+#!/usr/bin/perl -w
+# ---------------------------------------------------------------------------
+# File Name:            check_archivist.pl
+# Author:               Sergey Yurevich
+# Date:                 16/01/2007
+# Version:              0.1
+# Description:          Nagios plugin: connects to the archivist status
+#                       port, reads one status line and maps it to a
+#                       Nagios state (OK/WARNING/CRITICAL/UNKNOWN).
+# Usage:                check_archivist.pl host_ip host_port
+# ---------------------------------------------------------------------------
+
+use strict;
+use warnings;
+use IO::Socket;
+use lib '/usr/local/nagios/libexec/';
+use utils qw($TIMEOUT %ERRORS &print_revision &support);
+
+# A usage error is reported as UNKNOWN; a bare die would exit with 255,
+# which is not a valid Nagios plugin state.
+if (@ARGV != 2) {
+    print "UNKNOWN - usage: check_archivist.pl host_ip host_port\n";
+    exit $ERRORS{'UNKNOWN'};
+}
+
+my ($remote_host, $remote_port) = @ARGV;
+
+#my $remote_host = 'lxhadesdaq.gsi.de';
+#my $remote_port = '60006';
+my $protocol    = 'tcp';
+my $state;
+my $answer;
+
+# Just in case of problems, let's not hang Nagios
+$SIG{'ALRM'} = sub {
+    print "CRITICAL - timeout talking to archivist at $remote_host:$remote_port\n";
+    exit $ERRORS{'CRITICAL'};
+};
+alarm($TIMEOUT);
+
+my $socket = IO::Socket::INET->new(PeerAddr => $remote_host,
+                                  PeerPort => $remote_port,
+                                  Proto    => $protocol,
+                                  Type     => SOCK_STREAM);
+
+if (!$socket) {
+    $answer = "CRITICAL - no response from archivist at $remote_host:$remote_port";
+    $state  = $ERRORS{'CRITICAL'};
+}
+else {
+    # Read a single status line from the server.
+    $answer = <$socket>;
+    close($socket);
+
+    # <$socket> returns undef when the peer closes without sending anything.
+    # Without this guard $state stayed undefined and the plugin exited 0 (OK).
+    $answer = "" unless defined $answer;
+    $answer =~ s/[\r\n]+$//;
+
+    if ($answer =~ /OK/) {
+        $state = $ERRORS{'OK'};
+    }
+    elsif ($answer =~ /WARNING/) {
+        $state = $ERRORS{'WARNING'};
+    }
+    elsif ($answer =~ /CRITICAL/) {
+        $state = $ERRORS{'CRITICAL'};
+    }
+    else {
+        # Empty or unrecognised message.
+        $answer = "empty response from archivist at $remote_host:$remote_port"
+            if $answer eq "";
+        $state = $ERRORS{'UNKNOWN'};
+    }
+}
+alarm(0);
+
+# A message that matched above already contains its state keyword.
+if ($state == $ERRORS{'UNKNOWN'}) {
+    print "UNKNOWN - $answer\n";
+}
+else {
+    print "$answer\n";
+}
+
+exit $state;
+
+
+
+
diff --git a/plugins/check_raid.pl b/plugins/check_raid.pl

new file mode 100755 (executable)

index 0000000..ebbc205
--- /dev/null
+++ b/plugins/check_raid.pl
@@ -0,0 +1,114 @@
+#!/usr/bin/perl -w
+# ------------------------------------------------------------------------------
+# File Name:            check_raid.pl
+# Author:               Thomas Nilsen - Norway
+# Date:                 14/06/2003
+# Version:              0.1
+# Description:          This script will check to see if any software raid
+#                       devices are down.
+# Email:                thomas.nilsen@doc-s.co.uk
+# WWW:                  www.doc-s.co.uk
+# ------------------------------------------------------------------------------
+# Copyright 2003 (c) Thomas Nilsen
+# Credits go to Ethan Galstad for coding Nagios
+# License GPL
+# ------------------------------------------------------------------------------
+# Date          Author          Reason
+# ----          ------          ------
+# 14/06/2003    TN              Initial Release
+#                               - Format of mdstat assumed to be "2 line" per
+#                                 device with [??] on the second line.
+# ------------------------------------------------------------------------------
+
+use strict;
+use warnings;
+use Getopt::Long;
+use vars qw($opt_V $opt_h $opt_t $opt_F $PROGNAME);
+use lib '/usr/local/nagios/libexec/';
+use utils qw($TIMEOUT %ERRORS &print_revision &support);
+
+$PROGNAME="check_raid";
+
+$ENV{'PATH'}='';
+$ENV{'BASH_ENV'}='';
+$ENV{'ENV'}='';
+my ( $line, $prevline, $stat, $state ,@device, $msg, $status, $timeout);
+
+$stat="/proc/mdstat";
+
+#Option checking
+# -F/--filename and -t/--timeout take a value.  Declared as plain flags,
+# "-t 30" used to set the timeout to 1 second and leave "30" in @ARGV.
+Getopt::Long::Configure('bundling');
+$status = GetOptions(
+                "V|version"    => \$opt_V,
+                "h|help"       => \$opt_h,
+                "F|filename=s" => \$opt_F,
+                "t|timeout=i"  => \$opt_t);
+if (!$status) {
+        print "UNKNOWN - usage: $PROGNAME [-F <mdstat file>] [-t <timeout>] [-V] [-h]\n";
+        exit $ERRORS{'UNKNOWN'};
+}
+# Version
+if ($opt_V) {
+        print_revision($PROGNAME,'$Revision: 1.1 $');
+        exit $ERRORS{'OK'};
+}
+# Help
+if ($opt_h) {
+        print "usage: $PROGNAME [-F <mdstat file>] [-t <timeout>] [-V] [-h]\n";
+        exit $ERRORS{'OK'};
+}
+# Filename supplied
+if (defined $opt_F) {
+       # Capture idiom kept from the original (presumably to untaint the path).
+       $stat = $1 if ($opt_F =~ /^(.*)$/);
+
+       if ( ! -r $stat ) {
+               print "Invalid mdstat file: $opt_F\n";
+               exit $ERRORS{'UNKNOWN'};
+       }
+}
+
+$timeout = $TIMEOUT;
+$timeout = $opt_t if (defined $opt_t && $opt_t > 0);
+
+# Just in case of problems, let's not hang Nagios
+$SIG{'ALRM'} = sub {
+        print ("ERROR: No response (alarm)\n");
+        exit $ERRORS{'UNKNOWN'};
+};
+alarm($timeout);
+
+# Start checking the file...
+# An unreadable mdstat file must not be reported as "All devices are online".
+if (!open (FH, '<', $stat)) {
+        print "UNKNOWN - cannot open $stat: $!\n";
+        exit $ERRORS{'UNKNOWN'};
+}
+$state = $ERRORS{'OK'};
+$msg ="";
+$prevline = "";
+
+# Now check the mdstat file..
+while (<FH>) {
+       $line= $_;
+       # An underscore in the [UU]-style marker is reported as a failed device.
+       if( $line =~ / \[_|_\]|U_|_U /) {
+               $state = $ERRORS{'CRITICAL'};
+               # The device name is the first word of the previous line.
+               @device = split(/ /,$prevline);
+               $msg = $msg . (defined $device[0] ? $device[0] : "?") . ": - ";
+       }
+       $prevline = $line;
+}
+close (FH);
+alarm(0);
+
+if ( $state == $ERRORS{'CRITICAL'} ) {
+       print "CRITICAL - Device(s) $msg have failed\n";
+} elsif ( $state == $ERRORS{'OK'} )
+        { print "OK - All devices are online\n"; }
+exit $state;
+
+
diff --git a/plugins/my_check_dhcp.pl b/plugins/my_check_dhcp.pl

new file mode 100755 (executable)

index 0000000..469d2f3
--- /dev/null
+++ b/plugins/my_check_dhcp.pl
@@ -0,0 +1,56 @@
+#!/usr/bin/perl -w
+# ----------------------------------------------------------------------------
+# File Name:            my_check_dhcp.pl
+# Author:               Sergey Yurevich
+# Date:                 05/04/2007
+# Version:              0.1
+# Description:          script will check if the dhcp server answers a
+#                       dhcping request
+# ----------------------------------------------------------------------------
+
+use strict;
+use warnings;
+use lib '/usr/local/nagios/libexec/';
+use utils qw($TIMEOUT %ERRORS &print_revision &support);
+
+# Usage errors are reported as UNKNOWN (a bare die would exit with 255).
+if (@ARGV != 1) {
+    print "UNKNOWN - usage: my_check_dhcp.pl ip\n";
+    exit $ERRORS{'UNKNOWN'};
+}
+
+my ($ip) = @ARGV;
+
+# $ip is interpolated into a shell command below, so accept nothing but a
+# plain host name or dotted address.
+unless ($ip =~ /^[A-Za-z0-9][A-Za-z0-9.-]*$/) {
+    print "UNKNOWN - invalid server address: $ip\n";
+    exit $ERRORS{'UNKNOWN'};
+}
+
+#dhcping                - check dhcp service
+#-c 140.181.67.143      - from  hadesdaq (140.181.67.143)
+#-s 140.181.75.158      - on lxhadesdaq (140.181.75.158)
+#-h 00:40:9E:00:99:E2   - if one gets an answer from MAC address (00:40:9E:00:99:E2)
+
+my $answer = `dhcping -c 140.181.67.143 -s $ip -h 00:40:9E:00:99:E2`;
+# Backticks return undef when the command could not be started at all.
+$answer = "" unless defined $answer;
+chomp($answer);
+
+my $state;
+
+if($answer =~/Got answer from/){
+    $state = $ERRORS{'OK'};
+    print "OK - dhcp is running\n";
+}
+else{
+    $state = $ERRORS{'CRITICAL'};
+    print "CRITICAL - dhcp is not running!\n";
+}
+
+exit $state;
+
+
+
+
diff --git a/plugins/my_check_disk_smartctl.pl b/plugins/my_check_disk_smartctl.pl

new file mode 100755 (executable)

index 0000000..5ddb147
--- /dev/null
+++ b/plugins/my_check_disk_smartctl.pl
@@ -0,0 +1,188 @@
+#! /usr/bin/perl -w
+# ----------------------------------------------------------------------------
+# File Name:            my_check_disk_smartctl.pl
+# Author:               Sergey Yurevich
+# Date:                 16/01/2007
+# Version:              0.1
+# Description:          script will perform SMART overall-health
+#                       self-assessment test + temperature check.
+# ----------------------------------------------------------------------------
+
+use strict;
+use Data::Dumper;
+use lib '/usr/local/nagios/libexec/' ;
+use utils qw($TIMEOUT %ERRORS &print_revision &support &usage);
+use Getopt::Long;
+
+Getopt::Long::Configure( 'bundling' );
+
+my (@devices, $temper, $warntemp, $crittemp);
+
+GetOptions( "d|devices=s" => \@devices,
+           "t|temper"    => \$temper,
+           "w|warning=i"   => \$warntemp,
+           "c|critical=i"  => \$crittemp );
+
+my $SMARTCTL = "/usr/sbin/smartctl";
+my $state;
+
+#- one flag per severity, set as soon as any disk reports it (-1 = not seen)
+my $stateCrit = -1;
+my $stateWarn = -1;
+my $stateUnkn = -1;
+
+#- usage errors are reported as UNKNOWN (a bare die would exit with 255)
+my $usage_msg = "usage: my_check_disk_smartctl.pl [--temper -w 50 -c 60] -d /dev/sda -d /dev/sdb ...\n";
+unless (@devices) {
+    print $usage_msg;
+    exit $ERRORS{'UNKNOWN'};
+}
+
+#- without both thresholds every temperature above 0 would count as CRITICAL
+if ($temper && !(defined $warntemp && defined $crittemp)) {
+    print "ERROR: --temper needs -w and -c. $usage_msg";
+    exit $ERRORS{'UNKNOWN'};
+}
+
+#- loop over disks
+foreach my $disk (@devices)
+{
+    $state = -1;
+
+    #- valid devices: /dev/hda,..., /dev/sda,...
+    #  anchored on both ends: $disk is interpolated into shell commands
+    unless ($disk =~ /^\/dev\/[hs]d[0-9a-z]+$/){
+       print "ERROR: Invalid disk: $disk\n";
+       exit $ERRORS{'UNKNOWN'};
+    }
+
+    if( $temper )
+    {
+       my $temperature = &check_temperature( $disk );
+
+       #- validate first so that the numeric comparisons see a number
+       if( !defined $temperature || $temperature eq "" )
+       {
+           print "CRITICAL! No output from smartctl -A $disk ";
+           $stateCrit = $ERRORS{'CRITICAL'};
+       }
+       elsif( $temperature !~ /^\d+$/ )
+       {
+           #- includes the -1 returned when no temperature line was found
+           print "CRITICAL! Temperature check failed! ";
+           $stateCrit = $ERRORS{'CRITICAL'};
+       }
+       elsif( $temperature > $crittemp )
+       {
+           print "CRITICAL! $disk: temperature is $temperature ";
+           $stateCrit = $ERRORS{'CRITICAL'};
+       }
+       elsif( $temperature > $warntemp )
+       {
+           print "WARNING! $disk: temperature is $temperature ";
+           $stateWarn = $ERRORS{'WARNING'};
+       }
+       else
+       {
+           print "OK! $disk: temperature = $temperature ";
+       }
+    }
+
+    my $command = "$SMARTCTL -H $disk";
+    my $status  = `$command`;
+    $status = "" unless defined $status;
+
+    #$status = "hgftrefsd FAIL";
+
+    if ($status eq "") {
+       print "ERROR: no output from '$command' ";
+       $state = $ERRORS{'CRITICAL'};
+    }
+
+    #- List-context matches replace the former "my \$x = \$1 if MATCH" form,
+    #  whose behaviour Perl leaves undefined when the condition is false
+    #  (the variable may keep the value from the previous loop pass).
+
+    #- $ok gets equal the last line of smartctl output if PASSED...
+    my ($ok) = $status =~ /\n(.*?test result: PASSED\n)/i;
+    ($ok) = $status =~ /\n(.*?Sense: Ok!\n)/i unless $ok;
+    if ($ok){
+       #print "$disk: $ok";
+       print "OK! $disk: SMART health test: PASSED ";
+       $state = $ERRORS{'OK'};
+    }
+
+    #- $fail gets equal the last line of smartctl output if FAIL...
+    #  ([^WHEN_] and [^ED] skip the WHEN_FAILED column title, but also the
+    #  word FAILED, so a FAILED test result is matched explicitly)
+    my ($fail) = $status =~ /\n(.*?[^WHEN_]FAIL[^ED][^\n]*)/i;
+    ($fail) = $status =~ /\n(.*?test result: FAILED[^\n]*)/i unless $fail;
+    if ($fail){
+       print "CRITICAL! $disk: $fail ";
+       $state = $ERRORS{'CRITICAL'};
+    }
+
+    #- $old gets equal the last line of smartctl output if OLD...
+    my ($old) = $status =~ /\n(.*?OLD[^_age][^\n]*)/i;
+    if ($old){
+       #- no newline here: everything is reported on one output line
+       print "WARNING! $disk: $old ";
+       #- never downgrade a CRITICAL found above
+       $state = $ERRORS{'WARNING'} unless $state == $ERRORS{'CRITICAL'};
+    }
+
+    if($state == $ERRORS{'CRITICAL'}){
+       $stateCrit = $ERRORS{'CRITICAL'};
+    }
+    elsif($state == $ERRORS{'WARNING'}){
+       $stateWarn = $ERRORS{'WARNING'};
+    }
+    elsif($state == -1){
+       print "UNKNOWN! Check manually: $SMARTCTL -H $disk ";
+       $stateUnkn = $ERRORS{'UNKNOWN'};
+    }
+}
+
+#- terminate the single output line
+print "\n";
+
+if($stateCrit == $ERRORS{'CRITICAL'}){
+    exit $stateCrit;
+}
+elsif($stateUnkn == $ERRORS{'UNKNOWN'}){
+    exit $stateUnkn;
+}
+elsif($stateWarn == $ERRORS{'WARNING'}){
+    exit $stateWarn;
+}
+else{
+    exit $ERRORS{'OK'};
+}
+
+sub check_temperature
+{
+    #- returns the 10th space-separated field of the "194 Temperature_Celsius"
+    #  line of "smartctl -A <disk>", or -1 when no such line is found
+    my ($disk) = @_;
+
+    my $command = "$SMARTCTL -A $disk";
+    my $temperature = -1;
+
+    my @status  = `$command`;
+
+    #print Dumper @status;
+
+    foreach my $line ( @status )
+    {
+       chomp( $line );
+       if( $line =~ /194 Temperature_Celsius/ )
+       {
+           my @words = split(/ +/, $line);
+           $temperature = $words[9];
+       }
+    }
+
+    return $temperature;
+}
+
+
diff --git a/plugins/my_check_eblog.pl b/plugins/my_check_eblog.pl

new file mode 100755 (executable)

index 0000000..a8e458b
--- /dev/null
+++ b/plugins/my_check_eblog.pl
@@ -0,0 +1,233 @@
+#!/usr/bin/perl -w
+
+########################################################
+#
+# Author: S.Y.
+#
+# This script checks Event Builder log file and
+# estimates the number of files with discarded events
+# above a given threshold
+#
+########################################################
+
+use strict;
+use Data::Dumper;
+use Tie::File;
+use Fcntl;
+use IO::Handle;
+
+my $i;
+my @lines;
+my $line;
+
+#- the log file name is built from the DAQ_SETUP environment variable
+defined $ENV{DAQ_SETUP}
+        or die "Environment variable DAQ_SETUP is not set\n";
+my $file2read    = sprintf("%s_s.tcl", $ENV{DAQ_SETUP});
+
+tie(@lines, 'Tie::File', $file2read, mode => O_RDONLY)
+        or die "Cannot tie file $file2read: $!\n";
+
+
+#- the file info is searched only for files created during:
+my $last_minutes = 60; #last 60 minutes
+
+#- get current time in iso format
+my ($y, $m, $d, $hh, $mm, $ss) = (localtime)[5,4,3,2,1,0]; $y += 1900; $m++;
+my $iso_now = sprintf("%d-%02d-%02d %02d:%02d:%02d", $y, $m, $d, $hh, $mm, $ss);
+
+#- init counters
+my $filenum        = 0;
+my $errfilenum1    = 0;  #file with many evtsDiscarded
+my $errfilenum2    = 0;  #file with many evtsDataError
+my $errfilenum3    = 0;  #file with many evtsTagError
+
+#- latest event counts read from the log (0 until a value has been read)
+my ($evtsComplete, $evtsDiscarded, $evtsDataError, $evtsTagError) = (0, 0, 0, 0);
+
+#- status info for Nagios
+my $status;
+
+#--- loop over all lines backward in the file2read
+#    (the first two lines of the file are not visited)
+for ( $i = $#lines; $i > 1; $i--){
+
+    $line = $lines[$i];
+
+    #- look for a line with "stopdate"
+    if ( $line =~ /stopdate/){
+
+       #- check the number of problematic events in file
+       if ($filenum > 0 && $evtsComplete > 0) {
+
+           #- estimate amount of discarded events
+           my $ratio1 = $evtsDiscarded/$evtsComplete;
+           my $ratio2 = $evtsDataError/$evtsComplete;
+           my $ratio3 = $evtsTagError/$evtsComplete;
+
+           #print "evtsComplete = $evtsComplete, evtsDiscarded = $evtsDiscarded, ratio1 = $ratio1\n";
+
+           if ($ratio1 > 0.1) {
+               $errfilenum1++;
+           }
+           if ($ratio2 > 0.1) {
+               $errfilenum2++;
+           }
+           if ($ratio3 > 0.1) {
+               $errfilenum3++;
+           }
+       }
+
+       #- extract stop date from the line (format: "2007-05-05T19:32:53")
+       my ($v1, $v2, $stop_date) = split(" ", $line);
+
+       #- a "stopdate" line without a third field cannot be dated: skip it
+       next unless defined $stop_date;
+
+       #- get rid of ""
+       $stop_date =~ s/\"//g;
+
+       #-get rid of "T"
+       $stop_date =~ s/T/ /;
+
+       #- skip a stop date that is not in the expected format
+       next unless $stop_date =~ /^\d{4}-\d\d-\d\d/;
+
+       #- get time difference (in minutes)
+       my $time_diff = &timeDiff( date1 => $stop_date, date2 => $iso_now );
+
+       #print "stop_date = $stop_date, time_diff = $time_diff\n";
+
+       #- look for a recent hour
+       if ( $time_diff > $last_minutes) {
+
+           #- first stop date outside the time window: report and stop
+           $status = &fileStatus($filenum, $errfilenum1, $errfilenum2,
+                                 $errfilenum3, $last_minutes);
+           last;
+       } #if ( $time_diff > 60.)
+       else {
+
+           #- increment filenum counter
+           $filenum++;
+       }
+    } #if ( $line =~ /stopdate/){
+    else {
+
+       my ($v1, $v2);
+
+       if($line =~ /evtsComplete/) {
+           ($v1, $v2, $evtsComplete) = split(" ", $line);
+       }
+       if($line =~ /evtsDiscarded/) {
+           ($v1, $v2, $evtsDiscarded) = split(" ", $line);
+       }
+       if($line =~ /evtsDataError/) {
+           ($v1, $v2, $evtsDataError) = split(" ", $line);
+       }
+       if($line =~ /evtsTagError/) {
+           ($v1, $v2, $evtsTagError) = split(" ", $line);
+       }
+
+       #- a line without a third field yields undef: count it as 0
+       foreach my $count ($evtsComplete, $evtsDiscarded,
+                          $evtsDataError, $evtsTagError) {
+           $count = 0 unless defined $count;
+       }
+    }
+}
+
+#- the loop ran out of lines before reaching a stop date outside the time
+#  window; $status used to stay undefined in that case
+$status = &fileStatus($filenum, $errfilenum1, $errfilenum2,
+                      $errfilenum3, $last_minutes) unless defined $status;
+
+untie @lines;
+
+print "status for Nagios: $status\n";
+
+sub fileStatus {
+    #
+    # this subroutine builds the status text for Nagios
+    # usage: $status = &fileStatus( $filenum, $errfilenum1, $errfilenum2,
+    #                               $errfilenum3, $last_minutes );
+    # the three error counts are the numbers of files flagged for
+    # discarded events, data errors and tag errors
+    #
+
+       my ($nfiles, $ndiscarded, $ndataerr, $ntagerr, $minutes) = @_;
+
+       if ($nfiles == 0) {
+           return "OK - no new files in a log during last $minutes min.";
+       }
+
+       my $persent1 = $ndiscarded/$nfiles;
+       my $persent2 = $ndataerr/$nfiles;
+       my $persent3 = $ntagerr/$nfiles;
+
+       # if number of files with discarded events above threshold
+       # exceeds 10% -> send a WARNING
+       if ($persent1 > 0.1) {
+           return "WARNING - $persent1 files with discarded events during last $minutes min.";
+       }
+       elsif ($persent2 > 0.1) {
+           return "WARNING - $persent2 files with data error during last $minutes min.";
+       }
+       elsif ($persent3 > 0.1) {
+           return "WARNING - $persent3 files with tag error during last $minutes min.";
+       }
+
+       return "OK - $persent1 files with discarded events during last $minutes min.";
+}
+
+sub timeDiff (%) {
+    #
+    # this subroutine calculates the time difference in minutes
+    # usage: $timeDiffStr = &timeDiff( date1 => $sale_time, date2 => $iso_now );
+    # time format: $sale_time = "2007-05-05 19:32:53";
+    # returns date2 - date1 in whole minutes, as a string; the dates are
+    # taken as plain calendar times (no time zone or daylight-saving shift)
+    #
+
+       my %args = @_;
+
+       my $seconds1 = &dateToSeconds($args{'date1'});
+       my $seconds2 = &dateToSeconds($args{'date2'});
+
+       my $totminutes = int(($seconds2 - $seconds1) / 60);
+       "$totminutes";
+}
+
+sub dateToSeconds {
+    #
+    # this subroutine converts "YYYY-MM-DD[ hh:mm[:ss]]" into a number of
+    # seconds; only differences between two results are meaningful
+    #
+
+       my ($date) = @_;
+       $date = "" unless defined $date;
+
+       #- days before the first day of each month in a non-leap year
+       my @offset_days = qw(0 31 59 90 120 151 181 212 243 273 304 334);
+
+       #- fields that cannot be parsed keep these defaults
+       my ($year, $month, $day, $hh, $mm, $ss) = (0, 1, 0, 0, 0, 0);
+       if ($date =~ /^(\d{4})-(\d\d)-(\d\d)(?:.(\d\d):(\d\d)(?::(\d\d))?)?/) {
+           ($year, $month, $day) = ($1, $2, $3);
+           $hh = $4 if defined $4;
+           $mm = $5 if defined $5;
+           $ss = $6 if defined $6;
+       }
+       $month = 1 if ($month < 1 || $month > 12);
+
+       #- leap days of all years before $year, plus this year's own leap day
+       #  once the date is past February (the old code ignored leap years)
+       my $prev  = $year - 1;
+       my $leaps = int($prev / 4) - int($prev / 100) + int($prev / 400);
+       $leaps++ if ($month > 2 &&
+                    $year % 4 == 0 && ($year % 100 != 0 || $year % 400 == 0));
+
+       my $total_days = 365 * $year + $leaps + $offset_days[$month - 1] + $day;
+
+       return $total_days * 86400 + $hh * 3600 + $mm * 60 + $ss;
+}
diff --git a/plugins/my_check_eblog_status.pl b/plugins/my_check_eblog_status.pl

new file mode 100755 (executable)

index 0000000..bf24e2d
--- /dev/null
+++ b/plugins/my_check_eblog_status.pl
@@ -0,0 +1,74 @@
+#!/usr/bin/perl -w
+# ------------------------------------------------------------------------------
+# File Name:            my_check_eblog_status.pl
+# Author:               Sergey Yurevich
+# Date:                 16/01/2007
+# Version:              0.1
+# Description:          script checks the status of the process (alive/dead)
+# ------------------------------------------------------------------------------
+
+use strict;
+use warnings;
+use IO::Socket;
+use lib '/usr/local/nagios/libexec/';
+use utils qw($TIMEOUT %ERRORS &print_revision &support);
+
+# Both the host and the port of the status server are mandatory.
+@ARGV == 2 or die "usage: my_check_eblog_status.pl host_ip host_port\n";
+
+my ($remote_host, $remote_port) = @ARGV;
+
+# Example values: host 'lxhadesdaq.gsi.de', port '60006'.
+my $protocol = 'tcp';
+my $state;
+my $answer;
+
+# Open a TCP connection to the status server.
+my $socket = IO::Socket::INET->new(PeerAddr => $remote_host,
+                                   PeerPort => $remote_port,
+                                   Proto    => $protocol,
+                                   Type     => SOCK_STREAM);
+
+if(!$socket){
+    # Connection failed: this check reports that as WARNING.
+    $answer = "WARNING - no response from my_check_eblog at $remote_host:$remote_port";
+    $state  = $ERRORS{'WARNING'};
+}
+else{
+    # Only the first line sent by the server is evaluated.
+    $answer = <$socket>;
+    close($socket);
+
+    # The peer may close the connection without sending anything, which makes
+    # the read return undef; continue with an empty reply in that case.
+    $answer = '' unless defined $answer;
+    $answer =~ s/[\r\n]+\z//;   # the final print adds the newline back
+
+    # Map the keyword found in the reply onto the Nagios exit code.
+    if($answer =~ /OK/){
+        $state = $ERRORS{'OK'};
+    }
+    elsif($answer =~ /WARNING/){
+        $state = $ERRORS{'WARNING'};
+    }
+    elsif($answer =~ /CRITICAL/){
+        $state = $ERRORS{'CRITICAL'};
+    }
+    else{
+        # Unrecognised or empty reply.  An empty reply used to leave $state
+        # undefined, so the plugin exited 0 (OK) although nothing was received.
+        my $detail = length($answer) ? $answer
+                                     : "empty reply from $remote_host:$remote_port";
+        $state  = $ERRORS{'UNKNOWN'};
+        $answer = "UNKNOWN - $detail";
+    }
+}
+
+# Every state prints exactly one line (the UNKNOWN prefix was added above)
+# and exits with the matching Nagios code.
+print "$answer\n";
+exit $state;
+
+
+
+
diff --git a/plugins/my_check_ping.pl b/plugins/my_check_ping.pl

new file mode 100755 (executable)

index 0000000..f72d095
--- /dev/null
+++ b/plugins/my_check_ping.pl
@@ -0,0 +1,80 @@
+#!/usr/bin/perl -w
+
+#BEGIN{
+
+#      push @INC, "/usr/lib/perl5/site_perl/5.8.0/i586-linux-thread-multi";
+
+#}
+
+=head1 NAME
+
+check_ping.pl - pings a host and returns statistics data.
+
+=head1 VERSION
+
+Version 1.0
+
+=head1 AUTHOR
+
+(c) 2003 Hannes Schulz <mail@hannes-schulz.de>
+
+=head1 SYNOPSIS
+
+  ./check_ping.pl --host <host> --loss <warn>,<crit> --rta <warn>,<crit> 
+                  [--timeout <seconds>] [--packages <packages>]
+
+=head1 DESCRIPTION
+
+This pings a host via the C<Net::Ping> module from CPAN and returns 
+RTA and loss.
+
+=cut
+
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+use Net::Ping;
+
+my ($host,$aloss,$arta,$timeout,$pack);
+GetOptions(
+       "H|host=s",    \$host,
+       "l|loss=s",    \$aloss,
+       "r|rta=s",     \$arta,
+       "t|timeout=i", \$timeout,
+       "p|packages=i",\$pack
+);
+
+# Usage problems must exit 3 (Nagios UNKNOWN); pod2usage's default exit value
+# of 2 would be reported by Nagios as CRITICAL.
+sub usage_error { pod2usage(-message => "$0: $_[0]\n", -exitval => 3) }
+usage_error("No host given!") unless($host);
+usage_error("Parameter syntax error!") unless(defined $aloss and $aloss =~ /^\d+,\d+$/);
+usage_error("Parameter syntax error!") unless(defined $arta  and $arta  =~ /^\d+,\d+$/);
+
+my ($wloss,$closs) = split /,/,$aloss;
+my ($wrta,$crta) = split /,/,$arta;
+usage_error("Warning > Critical!") unless($wloss<$closs);
+usage_error("Warning > Critical!") unless($wrta<$crta);
+
+# Missing, zero or negative counts/timeouts fall back to the defaults.
+$pack    = 5           unless($pack    and $pack    > 0);
+$timeout = ($pack*3.5) unless($timeout and $timeout > 0);
+
+my $p = Net::Ping->new("tcp",$timeout/$pack);
+$p->hires(1);
+
+my ($nok, $dur) = (0, 0);
+for(1..$pack){
+       my ($ret, $duration) = $p->ping($host);
+       $nok++ if(!$ret);
+       $dur += $duration if(defined $duration);   # ping() may return no duration
+       $p->close();
+}
+
+my $rta  = 1000 * $dur/$pack;
+my $loss = 100  * $nok/$pack;
+printf("PING - Packet loss = %i%%, RTA = %.2f ms\n", $loss, $rta);
+exit(2) if($rta>$crta or $loss>$closs);  # Nagios: Critical
+exit(1) if($rta>$wrta or $loss>$wloss);  # Nagios: Warning
+exit(0);                                 # Nagios: OK
diff --git a/plugins/my_check_proc_status.pl b/plugins/my_check_proc_status.pl

new file mode 100755 (executable)

index 0000000..d5ed355
--- /dev/null
+++ b/plugins/my_check_proc_status.pl
@@ -0,0 +1,71 @@
+#!/usr/bin/perl -w
+# ----------------------------------------------------------------------------
+# File Name:            my_check_proc_status.pl
+# Author:               Sergey Yurevich
+# Date:                 16/01/2007
+# Version:              0.1
+# Description:          script checks the status of the process (alive/dead)
+# ----------------------------------------------------------------------------
+
+use strict;
+use warnings;
+use IO::Socket;
+use lib '/usr/local/nagios/libexec/';
+use utils qw($TIMEOUT %ERRORS &print_revision &support);
+
+@ARGV == 3 or die "usage: my_check_proc_status.pl host_ip host_port proc_name\n";
+
+my ($remote_host, $remote_port, $proc_name) = @ARGV;
+
+# Example values: host 'lxhadesdaq.gsi.de', port '60006'.
+my $protocol = 'tcp';
+my $state;
+my $answer;
+
+my $socket = IO::Socket::INET->new(PeerAddr => $remote_host,
+                                   PeerPort => $remote_port,
+                                   Proto    => $protocol,
+                                   Type     => SOCK_STREAM);
+
+if(!$socket){
+    # Connection failed: this check reports that as CRITICAL.
+    $answer = "CRITICAL - no response from $proc_name at $remote_host:$remote_port";
+    $state  = $ERRORS{'CRITICAL'};
+}
+else{
+    # Only the first line sent by the server is evaluated.
+    $answer = <$socket>;
+    close($socket);
+
+    # The peer may close the connection without sending anything, which makes
+    # the read return undef; continue with an empty reply in that case.
+    $answer = '' unless defined $answer;
+    $answer =~ s/[\r\n]+\z//;   # the final print adds the newline back
+
+    # Map the keyword found in the reply onto the Nagios exit code.
+    if($answer =~ /OK/){
+        $state = $ERRORS{'OK'};
+    }
+    elsif($answer =~ /WARNING/){
+        $state = $ERRORS{'WARNING'};
+    }
+    elsif($answer =~ /CRITICAL/){
+        $state = $ERRORS{'CRITICAL'};
+    }
+    else{
+        # Unrecognised or empty reply.  An empty reply used to leave $state
+        # undefined, so the plugin exited 0 (OK) although nothing was received.
+        my $detail = length($answer) ? $answer
+                                     : "empty reply from $remote_host:$remote_port";
+        $state  = $ERRORS{'UNKNOWN'};
+        $answer = "UNKNOWN - $detail";
+    }
+}
+
+# Every state prints exactly one line and exits with the matching Nagios code.
+print "$answer\n";
+exit $state;
+
+
+
+
diff --git a/plugins/my_check_process.pl b/plugins/my_check_process.pl

new file mode 100755 (executable)

index 0000000..9dd3bd1
--- /dev/null
+++ b/plugins/my_check_process.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/perl -w
+# ----------------------------------------------------------------------------
+# File Name:            my_check_process.pl
+# Author:               Sergey Yurevich
+# Date:                 05/04/2007
+# Version:              0.1
+# Description:          script will check if there is a process running
+# ----------------------------------------------------------------------------
+
+use strict;
+use warnings;
+use lib '/usr/local/nagios/libexec/';
+use utils qw($TIMEOUT %ERRORS &print_revision &support);
+
+@ARGV == 1 or die "usage: my_check_process.pl process_name\n";
+
+my ($process_name) = @ARGV;
+
+# Run pidof directly (list-form pipe open) so the process name is passed as
+# a single argument and never interpreted by a shell.
+my $pids = '';
+if(open(my $pidof, '-|', 'pidof', '-x', $process_name)){
+    $pids = <$pidof>;
+    close($pidof);
+    $pids = '' unless defined $pids;
+}
+# chomp strips only the line ending; chop would eat a digit of the last pid
+# if the output ever lacked a trailing newline.
+chomp($pids);
+
+if($pids){
+    print "OK - pid of $process_name is $pids\n";
+    exit $ERRORS{'OK'};
+}
+
+# pidof printed nothing (or could not be run): no matching process.
+print "CRITICAL - $process_name is not running!\n";
+exit $ERRORS{'CRITICAL'};
+
+
+
+
diff --git a/plugins/my_check_process_qa-dst.pl b/plugins/my_check_process_qa-dst.pl

new file mode 100755 (executable)

index 0000000..3c12b09
--- /dev/null
+++ b/plugins/my_check_process_qa-dst.pl
@@ -0,0 +1,85 @@
+#!/usr/bin/perl -w
+# ----------------------------------------------------------------------------
+# File Name:            my_check_process_qa-dst.pl
+# Author:               Sergey Yurevich
+# Date:                 05/04/2007
+# Version:              0.1
+# Description:          script will check if there is a process running
+# ----------------------------------------------------------------------------
+
+use strict;
+use warnings;
+#use lib '/usr/local/nagios/libexec/';
+use lib '/misc/hadaq/nagios/nagios-plugins-1.4.5/plugins-scripts/';
+use utils qw($TIMEOUT %ERRORS &print_revision &support);
+
+@ARGV == 3 or die "usage: my_check_process_qa-dst.pl process_name number_of_processes status_level\n";
+
+# process_name        - name of process to be checked.
+# number_of_processes - exact number of running processes to be checked.
+# status_level        - the return STATUS (WARNING or CRITICAL) if the number
+#                       of running processes is not the expected one.
+
+my ($process_name, $proc_num, $status_level) = @ARGV;
+
+# ---------------------------------------------------------------------------
+# running_pids($name)
+#   Returns the pids printed by "pidof -x $name" as one blank-separated
+#   string without the line ending, or '' when pidof prints nothing or
+#   cannot be started.
+# ---------------------------------------------------------------------------
+sub running_pids {
+    my ($name) = @_;
+
+    # List-form pipe open: the name is handed to pidof as one argument and
+    # is never interpreted by a shell.
+    open(my $pidof, '-|', 'pidof', '-x', $name) or return '';
+    my $line = <$pidof>;
+    close($pidof);
+    return '' unless defined $line;
+
+    # chomp, not chop: remove the line ending only, never a digit.
+    chomp($line);
+    return $line;
+}
+
+# ---------------------------------------------------------------------------
+# fail_exit($message)
+#   Prints "<status_level> - <message>" and exits with the Nagios code of
+#   status_level.  Only WARNING and CRITICAL are accepted; any other value
+#   used to fall off the end of the script and exit 0 (OK), so it now exits
+#   with the UNKNOWN code.
+# ---------------------------------------------------------------------------
+sub fail_exit {
+    my ($message) = @_;
+
+    print "$status_level - $message\n";
+    exit $ERRORS{$status_level}
+        if($status_level eq "CRITICAL" or $status_level eq "WARNING");
+    exit $ERRORS{'UNKNOWN'};
+}
+
+# Collect the pids of the process and count them.
+my $pids         = running_pids($process_name);
+my @pid_list     = split(' ',$pids);
+my $run_proc_num = scalar(@pid_list); #number of running processes with name $process_name
+
+# No pid at all is always a failure, whatever count was requested.
+fail_exit("$process_name is not running!") unless($pids);
+
+# Otherwise the number of pids must match the requested count exactly.
+if($proc_num == $run_proc_num){
+    print "OK - pid of $process_name is $pids\n";
+    exit $ERRORS{'OK'};
+}
+elsif($proc_num > $run_proc_num){
+    fail_exit("pid of $process_name is $pids, too few processes!");
+}
+else{
+    fail_exit("pid of $process_name is $pids, too many processes!");
+}
+
+
+
+
+
diff --git a/plugins/my_epics.sh b/plugins/my_epics.sh

new file mode 100755 (executable)

index 0000000..ec68089
--- /dev/null
+++ b/plugins/my_epics.sh
@@ -0,0 +1,314 @@
+#!/bin/sh
+#
+##############################################################################
+##############################################################################
+##                 Nagios plugin to check EPICS PV Status                   ##
+##############################################################################
+##############################################################################
+#
+# Script to retrieve EPICS PV Name status using the "caget" command.
+# Written by Mauro Giacchini (mauro.giacchini@lnl.infn.it)
+# Last Modified: 17-11-2007
+#
+# Usage: ./check_caget.sh -pv <PV name>
+#
+# Description:
+#      This script uses caget command to retrieve the PV status. 
+#
+# Limitations:
+#      This script has been tested on Linux Fedora Core 6.
+#
+# Output:
+#      The output contains "te", the time elapsed between the PV's
+#      timestamp and the current time reported by the Linux "date" command
+#      (suggestion: synchronise the IOCs and the Nagios server with a common
+#      NTP server). The STATUS of the service (i.e. of the PV) follows these
+#      severity rules:
+#
+# Severity (none) >>>> STATE_OK                # OK = green
+#
+# Severity MINOR  >>>> STATE_WARNING   # WARNING = yellow
+#
+# Severity MAJOR  >>>> STATE_CRITICAL  # CRITICAL = red
+#
+# PV not found    >>>> STATE_UNKNOWN   # UNKNOWN = orange
+#
+# In case of Severity (none) it shows the output of
+# "caget -a" with the "te" appended.
+#
+# Other notes:
+#  Firefox Plugin : A Firefox extension is available to monitor the Nagios server.
+#  https://addons.mozilla.org/it/firefox/addon/3607
+#
+# Nagios configuration setup: 
+#      You need to add the command to commands.cfg
+# 
+# define command{
+#      command_name    check_caget
+#      command_line    $USER1$/check_caget.sh -pv $ARG1$
+#      }
+#
+#      And, you need to add the service to services.cfg
+#
+# define service{
+#        use                   generic-service ;
+#        host_name             IOC_Example     ;
+#        service_description           aiExample       ;
+#        is_volatile                   0               ;
+#        check_period          24x7            ;
+#        max_check_attempts            3               ;
+#        normal_check_interval         3               ;
+#        retry_check_interval          1               ;
+#        contact_groups                admins          ;
+#        notification_interval         120             ;
+#        notification_period           24x7            ;
+#        notification_options          w,u,c,r         ;
+#        check_command                 check_caget!rootHost:aiExample  ;
+#        }
+#
+# then place this script in the /usr/lib/nagios/plugins/ 
+# on the Nagios box server.
+# Don't forget to set the right execution permission to this file.
+#
+# Threshold and ranges: please, have a look at:
+# http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT
+#
+# Last: This script still needs debugging and fixups (exercise for reader) :-)
+#
+##############################################################################
+# DEBUGGING OPTION
+# This option determines whether or not debugging messages are showed 
+# Values: 0=debugging off, 1=debugging on
+
+DEBUG="0"
+
+
+##############################################################################
+# CAGET LOCATION
+# Full path of the caget executable; verify_caget_presence aborts the check
+# with CRITICAL if it cannot be found there.
+# To use a system-wide /usr/bin/caget instead, let root create a symbolic
+# link, e.g.: ln -s /opt/epics/base-3.14.9/bin/linux-x86/caget /usr/bin/caget
+
+CAGET_LOCATION=/home/scs/epics/base-3.14.9/bin/linux-x86/caget
+
+
+##############################################################################
+# Script exit status
+
+STATE_OK=0             # OK = green
+
+STATE_WARNING=1                # WARNING = yellow
+
+STATE_CRITICAL=2       # CRITICAL = red
+
+STATE_UNKNOWN=3        # UNKNOWN = orange
+
+VERSION="v1.3"         # printed by print_revision
+
+##############################################################################
+# print_revision() function
+
+print_revision() {
+    # One-line version banner; the version number comes from $VERSION.
+    printf '%s\n' "Check_caget (nagios-plugins 1.4 to nagios 2.9) (EPICS base 3.14.9) $VERSION"
+}
+
+##############################################################################
+# print_usage() function
+
+print_usage() {
+
+       echo ""
+       echo "Usage: check_caget_dev_gw -pv <PV name> "
+       echo "Usage: check_caget_dev_gw -pv <PV name> -H <EPICS_CA_ADDR_LIST>"
+       echo "Usage: check_caget_dev_gw -pv <PV name> -p <EPICS_CA_SERVER_PORT>"
+       echo "Usage: check_caget_dev_gw -pv <PV name> -expval <EXPECTED VALUE>"
+       echo "Usage: check_caget_dev_gw [-h] [--help]"
+       echo "Usage: check_caget_dev_gw [-V]"
+       echo ""
+}
+
+#####################################################################################
+# print_help() function
+
+print_help() {
+       echo ""
+       print_usage
+       echo ""
+       echo "Script to retrieve the PV status for EPICS control systems."
+       echo ""
+       echo "This plugin not developped by the Nagios Plugin group."
+       echo "Please do not e-mail them for support on this plugin, since"
+       echo "they won't know what you're talking about :P"
+       echo ""
+       echo "For contact info: mauro.giacchini@lnl.infn.it"
+       echo "Download : http://www.lnl.infn.it/~epics/"
+       echo ""
+}
+
+##############################################################################
+# Check the caget presence.
+
+
+verify_caget_presence() {
+
+
+if ! type $CAGET_LOCATION >/dev/null 2>&1; then
+
+       echo "STATUS CRITICAL: caget not found (Did you set up the rigth one Nagios USERn? _or_ caget not found!)"
+       exit $STATE_CRITICAL
+fi
+}
+
+
+##############################################################################
+# Control caget plugin input parameters
+
+EXPVAL=""
+EPICS_CA_ADDR_LIST=""  # Default YES
+EPICS_CA_SERVER_PORT="" # Default 5064 _and_   value > 5000
+EPICS_CA_SERVER_PORT_MIN="5000"
+
+while test -n "$1"; do
+
+    case "$1" in
+
+       --help)
+       print_help
+       exit $STATE_OK
+       ;;
+
+       -h)
+       print_help
+       exit $STATE_OK
+       ;;
+
+       -V)
+       print_revision
+       exit $STATE_OK
+       ;;
+
+       -pv)
+       PVNAME=$2
+       shift
+       ;;
+
+       -expval)
+       EXPVAL=$2
+       if [ -z $EXPVAL ]; then
+               echo "STATUS CRITICAL: Expected value absent"
+               exit $STATE_CRITICAL
+       fi
+       shift
+       ;;
+
+       -H)
+       EPICS_CA_ADDR_LIST=$2
+       if [ -z $EPICS_CA_ADDR_LIST ]; then
+               echo "STATUS CRITICAL: Expected EPICS_CA_ADDR_LIST absent"
+               exit $STATE_CRITICAL
+       fi
+       export EPICS_CA_ADDR_LIST
+       EPICS_CA_AUTO_ADDR_LIST="NO"
+       export EPICS_CA_AUTO_ADDR_LIST
+       shift
+       ;;
+
+       -p)
+       EPICS_CA_SERVER_PORT=$2
+       if [ -z $EPICS_CA_SERVER_PORT ]; then
+               echo "STATUS CRITICAL: Expected EPICS_CA_SERVER_PORT absent"
+               exit $STATE_CRITICAL
+       fi
+       if [ $EPICS_CA_SERVER_PORT -le $EPICS_CA_SERVER_PORT_MIN ]; then
+               echo "STATUS CRITICAL: Expected EPICS_CA_SERVER_PORT minor than allowed (5001)"
+               exit $STATE_CRITICAL
+       fi
+       export EPICS_CA_SERVER_PORT
+       shift
+       ;;
+
+       *)
+       echo ""
+       echo "Unknow argument: $1"
+       print_usage
+       exit $STATE_UNKNOWN
+       ;;
+ 
+esac
+shift
+done
+
+
+verify_caget_presence
+
+if [ -z $PVNAME ]; then
+
+    echo "STATUS CRITICAL: PV Name not specified"
+    exit $STATE_CRITICAL
+fi
+
+#####################################################################################
+# FINALLY... RETRIEVING THE VALUES (caget)
+
+
+#CAGET_REPLY=`caget -a $PVNAME`
+CAGET_REPLY=`$CAGET_LOCATION -a $PVNAME`
+
+IFS=" "
+read pvname date time value status severity<<END
+$CAGET_REPLY
+END
+
+if [ -z $pvname ]; then
+
+    echo "STATE_UNKNOWN: $PVNAME not found"
+    exit $STATE_UNKNOWN
+ fi
+
+##############################################################################
+# Calculus difference between the PV timestamp and the actual time
+
+       SPACE=" "
+       dte1=$(date --date "$date$SPACE$time" +%s)
+       dte2=$(date +%s)
+       diffSec=$((dte2-dte1))
+       if ((diffSec < 0)); then abs=-1; else abs=1; fi
+       te=$((diffSec/abs))
+#      echo "Time elapsed (sec.): $te"
+
+##############################################################################
+# Output the NAGIOS status using an expected value
+
+if [ $EXPVAL ]; then
+
+               if  [[ $value -eq $EXPVAL ]] ;
+                       then echo "STATE_OK: Expected value ($EXPVAL) to $pvname match; te: $te sec."
+                       exit $STATE_OK;
+                       else  echo "STATUS CRITICAL: Expected value ($EXPVAL) to $pvname didn't match"
+                       exit $STATE_CRITICAL; 
+               fi
+fi
+
+##############################################################################
+# Output the NAGIOS status using the Severity field
+case $severity in
+
+       MAJOR)
+       echo "STATUS CRITICAL: $pvname in MAJOR severity status; te: $te sec."
+       exit $STATE_CRITICAL
+       ;;
+
+       MINOR)
+       echo "STATE_WARNING: $pvname in MINOR severity status; te: $te sec."
+       exit $STATE_WARNING
+       ;;
+
+       *)
+       echo "STATE_OK: $pvname $value $date $time $status ; te: $te sec."
+       exit $STATE_OK
+       ;;
+esac
+
+
author	hadaq <hadaq>
	Fri, 19 Sep 2008 10:02:08 +0000 (10:02 +0000)
committer	hadaq <hadaq>
	Fri, 19 Sep 2008 10:02:08 +0000 (10:02 +0000)
config/commands.cfg	[new file with mode: 0644]	patch \| blob
config/contactgroups.cfg	[new file with mode: 0644]	patch \| blob
config/contacts.cfg	[new file with mode: 0644]	patch \| blob
config/hostgroups.cfg	[new file with mode: 0644]	patch \| blob
config/hosts.cfg	[new file with mode: 0644]	patch \| blob
config/nagios.cfg	[new file with mode: 0644]	patch \| blob
config/servicegroups.cfg	[new file with mode: 0644]	patch \| blob
config/services.cfg	[new file with mode: 0644]	patch \| blob
plugins/check_archivist.pl	[new file with mode: 0755]	patch \| blob
plugins/check_raid.pl	[new file with mode: 0755]	patch \| blob
plugins/my_check_dhcp.pl	[new file with mode: 0755]	patch \| blob
plugins/my_check_disk_smartctl.pl	[new file with mode: 0755]	patch \| blob
plugins/my_check_eblog.pl	[new file with mode: 0755]	patch \| blob
plugins/my_check_eblog_status.pl	[new file with mode: 0755]	patch \| blob
plugins/my_check_ping.pl	[new file with mode: 0755]	patch \| blob
plugins/my_check_proc_status.pl	[new file with mode: 0755]	patch \| blob
plugins/my_check_process.pl	[new file with mode: 0755]	patch \| blob
plugins/my_check_process_qa-dst.pl	[new file with mode: 0755]	patch \| blob
plugins/my_epics.sh	[new file with mode: 0755]	patch \| blob