From 714bf2b5ea7181be5f8195c4d4b02b84afadcb8f Mon Sep 17 00:00:00 2001 From: hadaq Date: Fri, 19 Sep 2008 10:02:08 +0000 Subject: [PATCH 1/1] Initial revision --- config/commands.cfg | 393 ++++++++++++ config/contactgroups.cfg | 13 + config/contacts.cfg | 60 ++ config/hostgroups.cfg | 39 ++ config/hosts.cfg | 542 +++++++++++++++++ config/nagios.cfg | 948 +++++++++++++++++++++++++++++ config/servicegroups.cfg | 20 + config/services.cfg | 923 ++++++++++++++++++++++++++++ plugins/check_archivist.pl | 72 +++ plugins/check_raid.pl | 100 +++ plugins/my_check_dhcp.pl | 47 ++ plugins/my_check_disk_smartctl.pl | 158 +++++ plugins/my_check_eblog.pl | 192 ++++++ plugins/my_check_eblog_status.pl | 74 +++ plugins/my_check_ping.pl | 80 +++ plugins/my_check_proc_status.pl | 71 +++ plugins/my_check_process.pl | 42 ++ plugins/my_check_process_qa-dst.pl | 85 +++ plugins/my_epics.sh | 314 ++++++++++ 19 files changed, 4173 insertions(+) create mode 100644 config/commands.cfg create mode 100644 config/contactgroups.cfg create mode 100644 config/contacts.cfg create mode 100644 config/hostgroups.cfg create mode 100644 config/hosts.cfg create mode 100644 config/nagios.cfg create mode 100644 config/servicegroups.cfg create mode 100644 config/services.cfg create mode 100755 plugins/check_archivist.pl create mode 100755 plugins/check_raid.pl create mode 100755 plugins/my_check_dhcp.pl create mode 100755 plugins/my_check_disk_smartctl.pl create mode 100755 plugins/my_check_eblog.pl create mode 100755 plugins/my_check_eblog_status.pl create mode 100755 plugins/my_check_ping.pl create mode 100755 plugins/my_check_proc_status.pl create mode 100755 plugins/my_check_process.pl create mode 100755 plugins/my_check_process_qa-dst.pl create mode 100755 plugins/my_epics.sh diff --git a/config/commands.cfg b/config/commands.cfg new file mode 100644 index 0000000..bcb79eb --- /dev/null +++ b/config/commands.cfg @@ -0,0 +1,393 @@ 
+################################################################################ +# Sample command definitions for Nagios 2.6 +# +# Read the documentation for more information on this configuration file. I've +# provided some comments here, but things may not be so clear without further +# explanation, so make sure to read the HTML documentation! +# +# Last Modified: 11-21-2006 +# +################################################################################ + + +################################################################################ +# COMMAND DEFINITIONS +# +# SYNTAX: +# +# define command{ +# template <templatename> +# name <name> +# command_name <command_name> +# command_line <command_line> +# } +# +# WHERE: +# +# <templatename> = object name of another command definition that should be +# used as a template for this definition (optional) +# <name> = object name of command definition, referenced by other +# command definitions that use it as a template (optional) +# <command_name> = name of the command, as recognized/used by Nagios +# <command_line> = command line +# +################################################################################ + + + + +################################################################################ +# +# SAMPLE SERVICE CHECK COMMANDS +# +# These are some example service check commands. They may or may not work on +# your system, as they must be modified for your plugins. See the HTML +# documentation on the plugins for examples of how to configure command definitions. +# +################################################################################ + + +################################################################################ +# NOTE: The following 'check_local_...' functions are designed to monitor +# various metrics on the host that Nagios is running on (i.e. this one). 
+################################################################################ + +# 'check_local_disk' command definition +define command{ + command_name check_local_disk + command_line /usr/local/nagios/libexec/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$ + } + + +# 'check_local_load' command definition +define command{ + command_name check_local_load + command_line /usr/local/nagios/libexec/check_load -w $ARG1$ -c $ARG2$ + } + + +# 'check_local_procs' command definition +define command{ + command_name check_local_procs + command_line /usr/local/nagios/libexec/check_procs -w $ARG1$ -c $ARG2$ -s $ARG3$ + } + + +# 'check_local_users' command definition +define command{ + command_name check_local_users + command_line /usr/local/nagios/libexec/check_users -w $ARG1$ -c $ARG2$ + } + +# 'check_local_nmap' command definition +define command{ + command_name check_local_nmap + command_line /usr/bin/nmap -sT -p22 -P0 localhost| grep open 2> /dev/null + } + + +################################################################################ +# NOTE: The following 'check_...' commands are used to monitor services on +# both local and remote hosts. 
+################################################################################ + +# 'check_dns' command definition +define command{ + command_name check_dns + command_line /usr/local/nagios/libexec/check_dns -H www.yahoo.com -s $HOSTADDRESS$ + } + + +# 'check_ftp' command definition +define command{ + command_name check_ftp + command_line /usr/local/nagios/libexec/check_ftp -H $HOSTADDRESS$ + } + + +# 'check_hpjd' command definition +define command{ + command_name check_hpjd + command_line /usr/local/nagios/libexec/check_hpjd -H $HOSTADDRESS$ -C public + } + + +# 'check_http' command definition +define command{ + command_name check_http + command_line /usr/local/nagios/libexec/check_http -H $HOSTADDRESS$ + } + + +# 'check_nntp' command definition +define command{ + command_name check_nntp + command_line /usr/local/nagios/libexec/check_nntp -H $HOSTADDRESS$ + } + + +# 'check_ping' command definition +define command{ + command_name check_ping + command_line /usr/local/nagios/libexec/check_ping -H $HOSTADDRESS$ -w $ARG1$ -c $ARG2$ -p 5 + } + + +# 'check_pop' command definition +define command{ + command_name check_pop + command_line /usr/local/nagios/libexec/check_pop -H $HOSTADDRESS$ + } + + +# 'check_smtp' command definition +define command{ + command_name check_smtp + command_line /usr/local/nagios/libexec/check_smtp -H $HOSTADDRESS$ + } + + +# 'check_tcp' command definition +define command{ + command_name check_tcp + command_line /usr/local/nagios/libexec/check_tcp -H $HOSTADDRESS$ -p $ARG1$ + } + + +# 'check_telnet' command definition +define command{ + command_name check_telnet + command_line /usr/local/nagios/libexec/check_tcp -H $HOSTADDRESS$ -p 23 + } + + +# 'check_udp' command definition +define command{ + command_name check_udp + command_line /usr/local/nagios/libexec/check_udp -H $HOSTADDRESS$ -p $ARG1$ + } + +# 'check_ssh' command definition +define command{ + command_name check_ssh + command_line /usr/local/nagios/libexec/check_ssh -t $ARG1$ 
$HOSTADDRESS$ + } + + + +################################################################################ +# +# SAMPLE HOST CHECK COMMANDS +# +################################################################################ + + +# This command checks to see if a host is "alive" by pinging it +# The check must result in a 100% packet loss or 5 second (5000ms) round trip +# average time to produce a critical error. +# Note: Only one ICMP echo packet is sent (determined by the '-p 1' argument) + +# 'check-host-alive' command definition +define command{ + command_name check-host-alive + command_line /usr/local/nagios/libexec/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 1 + } + + + + +################################################################################ +# +# SAMPLE NOTIFICATION COMMANDS +# +# These are some example notification commands. They may or may not work on +# your system without modification. As an example, some systems will require +# you to use "/usr/bin/mailx" instead of "/usr/bin/mail" in the commands below. +# +################################################################################ + + +# 'host-notify-by-email' command definition +define command{ + command_name host-notify-by-email + command_line /usr/bin/printf "%b" "***** Nagios 2.6 *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\n\nDate/Time: $LONGDATETIME$\n" | /usr/bin/mail -s "Host $HOSTSTATE$ alert for $HOSTNAME$!" 
$CONTACTEMAIL$ + } + + +# 'host-notify-by-epager' command definition +define command{ + command_name host-notify-by-epager + command_line /usr/bin/printf "%b" "Host '$HOSTALIAS$' is $HOSTSTATE$\nInfo: $HOSTOUTPUT$\nTime: $LONGDATETIME$" | /usr/bin/mail -s "$NOTIFICATIONTYPE$ alert - Host $HOSTNAME$ is $HOSTSTATE$" $CONTACTPAGER$ + } + +# 'notify-by-email' command definition +define command{ + command_name notify-by-email + command_line /usr/bin/printf "%b" "***** Nagios 2.6 *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$" | /usr/bin/mail -s "** $NOTIFICATIONTYPE$ alert - $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$ + } + + +# 'notify-by-epager' command definition +define command{ + command_name notify-by-epager + command_line /usr/bin/printf "%b" "Service: $SERVICEDESC$\nHost: $HOSTNAME$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\nInfo: $SERVICEOUTPUT$\nDate: $LONGDATETIME$" | /usr/bin/mail -s "$NOTIFICATIONTYPE$: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$" $CONTACTPAGER$ + } + + + + + +################################################################################ +# +# SAMPLE PERFORMANCE DATA COMMANDS +# +# These are sample performance data commands that can be used to send performance +# data output to two text files (one for hosts, another for services). If you +# plan on simply writing performance data out to a file, consider using the +# host_perfdata_file and service_perfdata_file options in the main config file. 
+# +################################################################################ + + +# 'process-host-perfdata' command definition +define command{ + command_name process-host-perfdata + command_line /usr/bin/printf "%b" "$LASTHOSTCHECK$\t$HOSTNAME$\t$HOSTSTATE$\t$HOSTATTEMPT$\t$HOSTSTATETYPE$\t$HOSTEXECUTIONTIME$\t$HOSTOUTPUT$\t$HOSTPERFDATA$\n" >> /usr/local/nagios/var/host-perfdata.out + } + + +# 'process-service-perfdata' command definition +define command{ + command_name process-service-perfdata + command_line /usr/bin/printf "%b" "$LASTSERVICECHECK$\t$HOSTNAME$\t$SERVICEDESC$\t$SERVICESTATE$\t$SERVICEATTEMPT$\t$SERVICESTATETYPE$\t$SERVICEEXECUTIONTIME$\t$SERVICELATENCY$\t$SERVICEOUTPUT$\t$SERVICEPERFDATA$\n" >> /usr/local/nagios/var/service-perfdata.out + } + + +######################################################################## +# +# Remote host check commands go first +# +######################################################################## + +#check_proc_qa-dst_by_ssh +define command { + command_name check_proc_qa-dst_by_ssh + command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/misc/hadaq/nagios/nagios-plugins-1.4.5/plugins-scripts/my_check_process_qa-dst.pl $ARG1$ $ARG2$ $ARG3$" + +} + +#check_proc_by_ssh +define command { + command_name check_proc_by_ssh + command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/misc/hadaq/nagios/nagios-plugins-1.4.5/plugins-scripts/my_check_process.pl $ARG1$" + +} + +#check_proc2_by_ssh +define command { + command_name check_proc2_by_ssh + command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/usr/local/nagios/libexec/my_check_process.pl $ARG1$" + +} + +#check_proc2 +define command { + command_name check_proc2 + command_line /usr/local/nagios/libexec/my_check_process.pl $ARG1$ +} + +#check_by_ssh check_disk on lxg0447 (special command because of a path to check_disk on lxg0447) +define command { + command_name check_disk_by_ssh_lxg0447 + command_line 
/usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/misc/hadaq/nagios/nagios-plugins-1.4.5/plugins/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$" +} + +#check_by_ssh check_disk on lxg0451 (special command because of a path to check_disk on lxg0451) +define command { + command_name check_disk_by_ssh_lxg0451 + command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/misc/hadaq/nagios/nagios-plugins-1.4.5/plugins/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$" +} + +#check_by_ssh check_disk +define command { + command_name check_disk_by_ssh + command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/usr/local/nagios/libexec/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$" +} + +#check_by_ssh my_check_raid.pl +define command { + command_name check_raid_by_ssh + command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/usr/local/nagios/libexec/my_check_raid.pl" +} + +#check_by_ssh check_load +define command { + command_name check_load_by_ssh + command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/usr/local/nagios/libexec/check_load -w $ARG1$,$ARG2$,$ARG3$ -c $ARG4$,$ARG5$,$ARG6$" +} + +#check_load +define command { + command_name check_load + command_line /usr/local/nagios/libexec/check_load -w $ARG1$,$ARG2$,$ARG3$ -c $ARG4$,$ARG5$,$ARG6$ +} + +#my_check_raid.pl +define command { + command_name check_raid + command_line /usr/local/nagios/libexec/my_check_raid.pl +} + +#my_check_archivist.pl +define command { + command_name check_archivist + command_line /usr/local/nagios/libexec/my_check_archivist.pl $HOSTADDRESS$ $ARG1$ +} + +#my_check_disk_smartctl.pl +define command { + command_name check_disk_smartctl + command_line /usr/local/nagios/libexec/my_check_disk_smartctl.pl -d $ARG1$ -d $ARG2$ +} + +#my_check_disk_smartctl.pl +define command { + command_name check_disk_smartctl_by_ssh + command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/usr/local/nagios/libexec/my_check_disk_smartctl.pl 
$ARG1$ $ARG2$" +} + +#my_check_disk_smartctl.pl +define command { + command_name check_disk_smartctl_temp + command_line /usr/local/nagios/libexec/my_check_disk_smartctl.pl -d $ARG1$ -d $ARG2$ -t -w $ARG3$ -c $ARG4$ +} + +#my_check_disk_smartctl.pl +define command { + command_name check_disk_smartctl_temp_by_ssh + command_line /usr/local/nagios/libexec/check_by_ssh -H $HOSTADDRESS$ -C "/usr/local/nagios/libexec/my_check_disk_smartctl.pl -d $ARG1$ -d $ARG2$ -t -w $ARG3$ -c $ARG4$" +} + +#my_check_dhcp.pl +define command { + command_name check_dhcp + command_line /usr/local/nagios/libexec/my_check_dhcp.pl $ARG1$ +} + +#my_check_proc_status.pl +define command { + command_name check_proc_status + command_line /usr/local/nagios/libexec/my_check_proc_status.pl $HOSTADDRESS$ $ARG1$ $ARG2$ +} + +#my_check_eblog_status.pl +define command { + command_name check_eblog + command_line /usr/local/nagios/libexec/my_check_eblog_status.pl $HOSTADDRESS$ $ARG1$ +} + +#my_epics.sh +define command { + command_name check_epics + command_line /usr/local/nagios/libexec/my_epics.sh -pv $ARG1$ +} \ No newline at end of file diff --git a/config/contactgroups.cfg b/config/contactgroups.cfg new file mode 100644 index 0000000..1f856fa --- /dev/null +++ b/config/contactgroups.cfg @@ -0,0 +1,13 @@ +# 'linux-admins' contact group definition +define contactgroup{ + contactgroup_name linux-admins + alias Linux Administrators + members hadaq,Michael,Ingo + } + +# 'qa-dst-admins' contact group definition +#define contactgroup{ +# contactgroup_name qa-dst-admins +# alias online QA/DST Administrators +# members hadaq,Jacek,Malgorzata +# } \ No newline at end of file diff --git a/config/contacts.cfg b/config/contacts.cfg new file mode 100644 index 0000000..0b4eeae --- /dev/null +++ b/config/contacts.cfg @@ -0,0 +1,60 @@ +# 'nagios' contact definition +define contact{ + contact_name hadaq + alias Nagios Admin + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options 
w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email s.yurevich@gsi.de + } + +define contact{ + contact_name Michael + alias DAQ Expert + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email m.traxler@gsi.de + } + +define contact{ + contact_name Ingo + alias DAQ Expert 2 + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email froehlich@physik.uni-frankfurt.de + } + +define contact{ + contact_name Jacek + alias QA/DST Expert + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email otwinow@hades2.if.uj.edu.pl + } + +define contact{ + contact_name Malgorzata + alias QA/DST Expert 2 + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email M.Sudol@gsi.de + } \ No newline at end of file diff --git a/config/hostgroups.cfg b/config/hostgroups.cfg new file mode 100644 index 0000000..f38558e --- /dev/null +++ b/config/hostgroups.cfg @@ -0,0 +1,39 @@ +# 'vmecpu-group' host group definition +define hostgroup{ + hostgroup_name vmecpu-group + alias VME CPUS +# contact_groups ; This needs to be the same value as the value located in the services.cfg file. Nagios 2.5 produces an error if you define this. 
+ members hadc01,hadc02,hadc03,hadc04,hadc05,hadc06,hadc07,hadc08,hadc09,hadc10,hadc11,hadc12,hadc13,hadc14,hadc15,hadc16,hadc17; + } + +# 'hadeb-group' host group definition +define hostgroup{ + hostgroup_name hadeb-group + alias EB Servers +# contact_groups ; This needs to be the same value as the value located in the services.cfg file. Nagios 2.5 produces an error if you define this. + members hadeb01,hadeb03,hadeb04,hadeb05,hadeb06a,hadeb07,lxhadesdaq + } + +# 'lxg-group' host group definition +define hostgroup{ + hostgroup_name lxg-group + alias lxg hosts +# contact_groups ; This needs to be the same value as the value located in the services.cfg file. Nagios 2.5 produces an error if you define this. + members lxg0447,lxg0411,lxg0451,lxg0434,lxg0440,lxg0441,lxg0442,lxg0443,lxg0444,lxg0430,lxg0438,lxg0449,lxg0450 + } + +# 'hades-group' host group definition +define hostgroup{ + hostgroup_name hades-group + alias hades hosts +# contact_groups ; This needs to be the same value as the value located in the services.cfg file. Nagios 2.5 produces an error if you define this. + members hades25,hades17,hades27 + } + +# 'scs-group' host group definition +define hostgroup{ + hostgroup_name scs-group + alias scs hosts +# contact_groups ; This needs to be the same value as the value located in the services.cfg file. Nagios 2.5 produces an error if you define this. 
+ members hadsc1 + } diff --git a/config/hosts.cfg b/config/hosts.cfg new file mode 100644 index 0000000..464bf9d --- /dev/null +++ b/config/hosts.cfg @@ -0,0 +1,542 @@ +# Generic host definition template +define host{ + name generic-host ; The name of this host template + notifications_enabled 1 ; Host notifications are enabled + event_handler_enabled 1 ; Host event handler is enabled + flap_detection_enabled 1 ; Flap detection is enabled + process_perf_data 1 ; Process performance data + retain_status_information 1 ; Retain status information + retain_nonstatus_information 1 ; Retain non-status information + + register 0 ; DONT REGISTER, JUST A TEMPLATE! + } + +define host{ + name vme-cpu ; The name of this host template + notifications_enabled 1 ; Host notifications are enabled + event_handler_enabled 1 ; Host event handler is enabled + flap_detection_enabled 1 ; Flap detection is enabled + process_perf_data 1 ; Process performance data + retain_status_information 1 ; Retain status information + retain_nonstatus_information 1 ; Retain non-status information + check_command check-host-alive + check_period 24x7 + contact_groups linux-admins + max_check_attempts 5 + notification_interval 120 + notification_period 24x7 + notification_options d,u,r + + register 0 ; DONT REGISTER, JUST A TEMPLATE! + } + +define host{ + name hadeb-host ; The name of this host template + notifications_enabled 1 ; Host notifications are enabled + event_handler_enabled 1 ; Host event handler is enabled + flap_detection_enabled 1 ; Flap detection is enabled + process_perf_data 1 ; Process performance data + retain_status_information 1 ; Retain status information + retain_nonstatus_information 1 ; Retain non-status information + check_command check-host-alive + check_period 24x7 + contact_groups linux-admins + max_check_attempts 5 + notification_interval 120 + notification_period 24x7 + notification_options d,u,r + + register 0 ; DONT REGISTER, JUST A TEMPLATE! 
+ } + +define host{ + name lxg-host ; The name of this host template + notifications_enabled 1 ; Host notifications are enabled + event_handler_enabled 1 ; Host event handler is enabled + flap_detection_enabled 1 ; Flap detection is enabled + process_perf_data 1 ; Process performance data + retain_status_information 1 ; Retain status information + retain_nonstatus_information 1 ; Retain non-status information + check_command check-host-alive + check_period 24x7 + contact_groups linux-admins + max_check_attempts 5 + notification_interval 120 + notification_period 24x7 + notification_options d,u,r + + register 0 ; DONT REGISTER, JUST A TEMPLATE! + } + +define host{ + name hades-host ; The name of this host template + notifications_enabled 1 ; Host notifications are enabled + event_handler_enabled 1 ; Host event handler is enabled + flap_detection_enabled 1 ; Flap detection is enabled + process_perf_data 1 ; Process performance data + retain_status_information 1 ; Retain status information + retain_nonstatus_information 1 ; Retain non-status information + check_command check-host-alive + check_period 24x7 + contact_groups linux-admins + max_check_attempts 5 + notification_interval 120 + notification_period 24x7 + notification_options d,u,r + + register 0 ; DONT REGISTER, JUST A TEMPLATE! + } + +define host{ + name scs-host ; The name of this host template + notifications_enabled 1 ; Host notifications are enabled + event_handler_enabled 1 ; Host event handler is enabled + flap_detection_enabled 1 ; Flap detection is enabled + process_perf_data 1 ; Process performance data + retain_status_information 1 ; Retain status information + retain_nonstatus_information 1 ; Retain non-status information + check_command check-host-alive + check_period 24x7 + contact_groups linux-admins + max_check_attempts 5 + notification_interval 120 + notification_period 24x7 + notification_options d,u,r + + register 0 ; DONT REGISTER, JUST A TEMPLATE! 
+ } + +# 'localhost' host definition +define host{ + name localhost + use generic-host ; Name of host template to use + host_name hadesdaq + alias nagios server + address 127.0.0.1 + check_command check-host-alive + check_period 24x7 + contact_groups linux-admins + max_check_attempts 5 + notification_interval 120 + notification_period 24x7 + notification_options d,u,r + register 1 + } + +# hadeb01 host definition +define host{ + name hadeb01 + use hadeb-host ; Name of host template to use + host_name hadeb01 + alias old eb server + address 140.181.96.30 + register 1 + } + +# hadeb03 host definition +define host{ + name hadeb03 + use hadeb-host ; Name of host template to use + host_name hadeb03 + alias backup server + address 140.181.97.118 + register 1 + } + +# hadeb04 host definition +define host{ + name hadeb04 + use hadeb-host ; Name of host template to use + host_name hadeb04 + alias server + address 140.181.83.152 + register 1 + } + +# hadeb05 host definition +define host{ + name hadeb05 + use hadeb-host ; Name of host template to use + host_name hadeb05 + alias server + address 140.181.93.18 + register 1 + } + +# hadeb06 host definition +define host{ + name hadeb06a + use hadeb-host ; Name of host template to use + host_name hadeb06a + alias server + address 140.181.93.112 + register 1 + } + +#hadeb07 +define host{ + name hadeb07 + use hadeb-host ; Name of host template to use + host_name hadeb07 + alias backup server + address 140.181.103.216 + register 1 + } + +#lxhadesdaq host definition +define host{ + name lxhadesdaq + use generic-host ; Name of host template to use + host_name lxhadesdaq + alias main server + address 140.181.75.158 + check_command check-host-alive + check_period 24x7 ; new + contact_groups linux-admins ; new + max_check_attempts 5 + notification_interval 120 + notification_period 24x7 + notification_options d,u,r + register 1 + } + +#hadc01 +define host{ + name hadc01 + use vme-cpu ; Name of host template to use + host_name hadc01 + 
alias --- + address 140.181.82.98 + register 1 + } + +#hadc02 +define host{ + name hadc02 + use vme-cpu ; Name of host template to use + host_name hadc02 + alias MDC-1 readout in cave + address 140.181.84.20 + register 1 + } + +#hadc03 +define host{ + name hadc03 + use vme-cpu ; Name of host template to use + host_name hadc03 + alias TOF-0 readout in cave + address 140.181.87.78 + register 1 + } + +#hadc04 +define host{ + name hadc04 + use vme-cpu ; Name of host template to use + host_name hadc04 + alias TOF-1 readout in cave + address 140.181.87.80 + register 1 + } + +#hadc05 +define host{ + name hadc05 + use vme-cpu ; Name of host template to use + host_name hadc05 + alias TOF-2 readout in cave + address 140.181.87.82 + register 1 + } + +#hadc06 +define host{ + name hadc06 + use vme-cpu ; Name of host template to use + host_name hadc06 + alias TOF-3 readout in cave + address 140.181.87.84 + register 1 + } + +#hadc07 +define host{ + name hadc07 + use vme-cpu ; Name of host template to use + host_name hadc07 + alias TOF-4 readout in cave + address 140.181.87.86 + register 1 + } + +#hadc08 +define host{ + name hadc08 + use vme-cpu ; Name of host template to use + host_name hadc08 + alias Matching Unit + address 140.181.87.88 + register 1 + } + +#hadc09 +define host{ + name hadc09 + use vme-cpu ; Name of host template to use + host_name hadc09 + alias Ingos lab in Frankfurt + address 140.181.87.90 + register 1 + } + +#hadc10 +define host{ + name hadc10 + use vme-cpu ; Name of host template to use + host_name hadc10 + alias Shower + address 140.181.87.92 + register 1 + } + +#hadc11 +define host{ + name hadc11 + use vme-cpu ; Name of host template to use + host_name hadc11 + alias RICH1, Torte TU-Munchen + address 140.181.87.94 + register 1 + } + +#hadc12 +define host{ + name hadc12 + use vme-cpu ; Name of host template to use + host_name hadc12 + alias RICH1 + address 140.181.87.96 + register 1 + } + +#hadc13 +define host{ + name hadc13 + use vme-cpu ; Name of host 
template to use + host_name hadc13 + alias RICH2 + address 140.181.87.98 + register 1 + } + +#hadc14 +define host{ + name hadc14 + use vme-cpu ; Name of host template to use + host_name hadc14 + alias RICH3 + address 140.181.87.100 + register 1 + } + +#hadc15 +define host{ + name hadc15 + use vme-cpu ; Name of host template to use + host_name hadc15 + alias MDC-0 readout in cave + address 140.181.87.102 + register 1 + } + +#hadc16 +define host{ + name hadc16 + use vme-cpu ; Name of host template to use + host_name hadc16 + alias EE-Lab, GSI + address 140.181.87.104 + register 1 + } + +#hadc17 +define host{ + name hadc17 + use vme-cpu ; Name of host template to use + host_name hadc17 + alias Lab in Giessen, Tiago + address 140.181.87.106 + register 1 + } + +#lxg0411 +define host{ + name lxg0411 + use lxg-host ; Name of host template to use + host_name lxg0411 + alias QA Server (Go4) + address 140.181.74.222 + register 1 + } + +#lxg0447 +define host{ + name lxg0447 + use lxg-host ; Name of host template to use + host_name lxg0447 + alias QA RAM-Disk + address 140.181.92.234 + register 1 + } + +#lxg0430 +define host{ + name lxg0430 + use lxg-host ; Name of host template to use + host_name lxg0430 + alias online DST + address 140.181.67.145 + register 1 + } + +#lxg0434 +define host{ + name lxg0434 + use lxg-host ; Name of host template to use + host_name lxg0434 + alias EPICS Oracle + address 140.181.84.32 + register 1 + } + +#lxg0438 +define host{ + name lxg0438 + use lxg-host ; Name of host template to use + host_name lxg0438 + alias Rossendorf PC + address 140.181.84.40 + register 1 + } + + +#lxg0440 +define host{ + name lxg0440 + use lxg-host ; Name of host template to use + host_name lxg0440 + alias RICH acc PC + address 140.181.92.220 + register 1 + } + +#lxg0441 +define host{ + name lxg0441 + use lxg-host ; Name of host template to use + host_name lxg0441 + alias MDC acc PC + address 140.181.92.222 + register 1 + } + +#lxg0442 +define host{ + name lxg0442 + use 
lxg-host ; Name of host template to use + host_name lxg0442 + alias Start/Veto/Trigger acc PC + address 140.181.92.224 + register 1 + } + +#lxg0443 +define host{ + name lxg0443 + use lxg-host ; Name of host template to use + host_name lxg0443 + alias TOF/TOFino acc PC + address 140.181.92.226 + register 1 + } + +#lxg0444 +define host{ + name lxg0444 + use lxg-host ; Name of host template to use + host_name lxg0444 + alias Shower acc PC + address 140.181.92.228 + register 1 + } + +#lxg0449 +define host{ + name lxg0449 + use lxg-host ; Name of host template to use + host_name lxg0449 + alias pc in cave + address 140.181.102.238 + register 1 + } + +#lxg0450 +define host{ + name lxg0450 + use lxg-host ; Name of host template to use + host_name lxg0450 + alias pc in cave + address 140.181.102.240 + register 1 + } + +#lxg0451 +define host{ + name lxg0451 + use lxg-host ; Name of host template to use + host_name lxg0451 + alias online DST PC1 + address 140.181.103.214 + register 1 + } + +#hades25 +define host{ + name hades25 + use hades-host ; Name of host template to use + host_name hades25 + alias Slow Control System + address 140.181.107.26 + register 1 + } + +#hades17 (used to be hades26) +define host{ + name hades17 + use hades-host ; Name of host template to use + host_name hades17 + alias - System + address 140.181.100.181 ; used to be 140.181.107.28 (hades26) + register 1 + } + +#hades27 +define host{ + name hades27 + use hades-host ; Name of host template to use + host_name hades27 + alias - System + address 140.181.107.30 + register 1 + } + +#hadsc1 +define host{ + name hadsc1 + use scs-host ; Name of host template to use + host_name hadsc1 + alias - System + address 140.181.111.196 + register 1 + } diff --git a/config/nagios.cfg b/config/nagios.cfg new file mode 100644 index 0000000..4a65928 --- /dev/null +++ b/config/nagios.cfg @@ -0,0 +1,948 @@ +############################################################################## +# +# NAGIOS.CFG - Sample Main 
Config File for Nagios 2.6 +# +# Read the documentation for more information on this configuration +# file. I've provided some comments here, but things may not be so +# clear without further explanation. +# +# Last Modified: 11-21-2006 +# +############################################################################## + + +# LOG FILE +# This is the main log file where service and host events are logged +# for historical purposes. This should be the first option specified +# in the config file!!! + +#log_file=/usr/local/nagios/var/nagios.log +log_file=/var/log/nagios/nagios.log + +# OBJECT CONFIGURATION FILE(S) +# This is the configuration file in which you define hosts, host +# groups, contacts, contact groups, services, etc. I guess it would +# be better called an object definition file, but for historical +# reasons it isn't. You can split object definitions into several +# different config files by using multiple cfg_file statements here. +# Nagios will read and process all the config files you define. +# This can be very useful if you want to keep command definitions +# separate from host and contact definitions... + +# Command definitions +cfg_file=/usr/local/nagios/etc/commands.cfg + +# Host and service definitions for monitoring this machine +#cfg_file=/usr/local/nagios/etc/localhost.cfg + + +# You can split other types of object definitions across several +# config files if you wish (as done here), or keep them all in a +# single config file. 
+ +cfg_file=/usr/local/nagios/etc/contactgroups.cfg +cfg_file=/usr/local/nagios/etc/contacts.cfg +#cfg_file=/usr/local/nagios/etc/dependencies.cfg +#cfg_file=/usr/local/nagios/etc/escalations.cfg +cfg_file=/usr/local/nagios/etc/hostgroups.cfg +cfg_file=/usr/local/nagios/etc/hosts.cfg +cfg_file=/usr/local/nagios/etc/services.cfg +#cfg_file=/usr/local/nagios/etc/services_qadst.cfg +cfg_file=/usr/local/nagios/etc/servicegroups.cfg +cfg_file=/usr/local/nagios/etc/timeperiods.cfg + +# Extended host/service info definitions are now stored along with +# other object definitions: +#cfg_file=/usr/local/nagios/etc/hostextinfo.cfg +#cfg_file=/usr/local/nagios/etc/serviceextinfo.cfg + +# You can also tell Nagios to process all config files (with a .cfg +# extension) in a particular directory by using the cfg_dir +# directive as shown below: + +#cfg_dir=/usr/local/nagios/etc/servers +#cfg_dir=/usr/local/nagios/etc/printers +#cfg_dir=/usr/local/nagios/etc/switches +#cfg_dir=/usr/local/nagios/etc/routers + + + +# OBJECT CACHE FILE +# This option determines where object definitions are cached when +# Nagios starts/restarts. The CGIs read object definitions from +# this cache file (rather than looking at the object config files +# directly) in order to prevent inconsistencies that can occur +# when the config files are modified after Nagios starts. + +object_cache_file=/usr/local/nagios/var/objects.cache + + + +# RESOURCE FILE +# This is an optional resource file that contains $USERx$ macro +# definitions. Multiple resource files can be specified by using +# multiple resource_file definitions. The CGIs will not attempt to +# read the contents of resource files, so information that is +# considered to be sensitive (usernames, passwords, etc) can be +# defined as macros in this file and restrictive permissions (600) +# can be placed on this file. 
+ +resource_file=/usr/local/nagios/etc/resource.cfg + + + +# STATUS FILE +# This is where the current status of all monitored services and +# hosts is stored. Its contents are read and processed by the CGIs. +# The contents of the status file are deleted every time Nagios +# restarts. + +#status_file=/usr/local/nagios/var/status.dat +status_file=/var/log/nagios/status.dat + + +# NAGIOS USER +# This determines the effective user that Nagios should run as. +# You can either supply a username or a UID. + +nagios_user=hadaq + + + +# NAGIOS GROUP +# This determines the effective group that Nagios should run as. +# You can either supply a group name or a GID. + +nagios_group=users + + + +# EXTERNAL COMMAND OPTION +# This option allows you to specify whether or not Nagios should check +# for external commands (in the command file defined below). By default +# Nagios will *not* check for external commands, just to be on the +# cautious side. If you want to be able to use the CGI command interface +# you will have to enable this. Setting this value to 0 disables command +# checking (the default), other values enable it. + +check_external_commands=1 + + + +# EXTERNAL COMMAND CHECK INTERVAL +# This is the interval at which Nagios should check for external commands. +# This value works off the interval_length you specify later. If you leave +# that at its default value of 60 (seconds), a value of 1 here will cause +# Nagios to check for external commands every minute. If you specify a +# number followed by an "s" (e.g. 15s), this will be interpreted to mean +# actual seconds rather than a multiple of the interval_length variable. +# Note: In addition to reading the external command file at regularly +# scheduled intervals, Nagios will also check for external commands after +# event handlers are executed. +# NOTE: Setting this value to -1 causes Nagios to check the external +# command file as often as possible.
+ +#command_check_interval=15s +command_check_interval=-1 + + + +# EXTERNAL COMMAND FILE +# This is the file that Nagios checks for external command requests. +# It is also where the command CGI will write commands that are submitted +# by users, so it must be writeable by the user that the web server +# is running as (usually 'nobody'). Permissions should be set at the +# directory level instead of on the file, as the file is deleted every +# time its contents are processed. + +command_file=/usr/local/nagios/var/rw/nagios.cmd + + + +# COMMENT FILE +# This is the file that Nagios will use for storing host and service +# comments. + +#comment_file=/usr/local/nagios/var/comments.dat +comment_file=/var/log/nagios/comments.dat + + +# DOWNTIME FILE +# This is the file that Nagios will use for storing host and service +# downtime data. + +#downtime_file=/usr/local/nagios/var/downtime.dat +downtime_file=/var/log/nagios/downtime.dat + + +# LOCK FILE +# This is the lockfile that Nagios will use to store its PID number +# in when it is running in daemon mode. + +#lock_file=/usr/local/nagios/var/nagios.lock +lock_file=/var/log/nagios/nagios.lock + + +# TEMP FILE +# This is a temporary file that is used as scratch space when Nagios +# updates the status log, cleans the comment file, etc. This file +# is created, used, and deleted throughout the time that Nagios is +# running. + +temp_file=/usr/local/nagios/var/nagios.tmp + + + +# EVENT BROKER OPTIONS +# Controls what (if any) data gets sent to the event broker. +# Values: 0 = Broker nothing +# -1 = Broker everything +# = See documentation + +event_broker_options=-1 + + + +# EVENT BROKER MODULE(S) +# This directive is used to specify an event broker module that should +# be loaded by Nagios at startup. Use multiple directives if you want +# to load more than one module. Arguments that should be passed to +# the module at startup are separated from the module path by a space.
+# +# Example: +# +# broker_module= [moduleargs] + +#broker_module=/somewhere/module1.o +#broker_module=/somewhere/module2.o arg1 arg2=3 debug=0 + + + + +# LOG ROTATION METHOD +# This is the log rotation method that Nagios should use to rotate +# the main log file. Values are as follows.. +# n = None - don't rotate the log +# h = Hourly rotation (top of the hour) +# d = Daily rotation (midnight every day) +# w = Weekly rotation (midnight on Saturday evening) +# m = Monthly rotation (midnight last day of month) + +log_rotation_method=d + + + +# LOG ARCHIVE PATH +# This is the directory where archived (rotated) log files should be +# placed (assuming you've chosen to do log rotation). + +log_archive_path=/usr/local/nagios/var/archives + + + +# LOGGING OPTIONS +# If you want messages logged to the syslog facility, as well as the +# NetAlarm log file set this option to 1. If not, set it to 0. + +use_syslog=0 + + + +# NOTIFICATION LOGGING OPTION +# If you don't want notifications to be logged, set this value to 0. +# If notifications should be logged, set the value to 1. + +log_notifications=0 + + + +# SERVICE RETRY LOGGING OPTION +# If you don't want service check retries to be logged, set this value +# to 0. If retries should be logged, set the value to 1. + +log_service_retries=1 + + + +# HOST RETRY LOGGING OPTION +# If you don't want host check retries to be logged, set this value to +# 0. If retries should be logged, set the value to 1. + +log_host_retries=1 + + + +# EVENT HANDLER LOGGING OPTION +# If you don't want host and service event handlers to be logged, set +# this value to 0. If event handlers should be logged, set the value +# to 1. + +log_event_handlers=1 + + + +# INITIAL STATES LOGGING OPTION +# If you want Nagios to log all initial host and service states to +# the main log file (the first time the service or host is checked) +# you can enable this option by setting this value to 1. 
If you +# are not using an external application that does long term state +# statistics reporting, you do not need to enable this option. In +# this case, set the value to 0. + +log_initial_states=0 + + + +# EXTERNAL COMMANDS LOGGING OPTION +# If you don't want Nagios to log external commands, set this value +# to 0. If external commands should be logged, set this value to 1. +# Note: This option does not include logging of passive service +# checks - see the option below for controlling whether or not +# passive checks are logged. + +log_external_commands=1 + + + +# PASSIVE CHECKS LOGGING OPTION +# If you don't want Nagios to log passive host and service checks, set +# this value to 0. If passive checks should be logged, set +# this value to 1. + +log_passive_checks=1 + + + +# GLOBAL HOST AND SERVICE EVENT HANDLERS +# These options allow you to specify a host and service event handler +# command that is to be run for every host or service state change. +# The global event handler is executed immediately prior to the event +# handler that you have optionally specified in each host or +# service definition. The command argument is the short name of a +# command definition that you define in your host configuration file. +# Read the HTML docs for more information. + +#global_host_event_handler=somecommand +#global_service_event_handler=somecommand + + + +# SERVICE INTER-CHECK DELAY METHOD +# This is the method that Nagios should use when initially +# "spreading out" service checks when it starts monitoring. The +# default is to use smart delay calculation, which will try to +# space all service checks out evenly to minimize CPU load. +# Using no delay (n) will cause all checks to be scheduled +# at the same time (with no delay between them)! This is not a +# good thing for production, but is useful when testing the +# parallelization functionality.
+# n = None - don't use any delay between checks +# d = Use a "dumb" delay of 1 second between checks +# s = Use "smart" inter-check delay calculation +# x.xx = Use an inter-check delay of x.xx seconds + +service_inter_check_delay_method=s + + + +# MAXIMUM SERVICE CHECK SPREAD +# This variable determines the timeframe (in minutes) from the +# program start time that an initial check of all services should +# be completed. Default is 30 minutes. + +max_service_check_spread=30 + + + +# SERVICE CHECK INTERLEAVE FACTOR +# This variable determines how service checks are interleaved. +# Interleaving the service checks allows for a more even +# distribution of service checks and reduced load on remote +# hosts. Setting this value to 1 is equivalent to how versions +# of Nagios previous to 0.0.5 did service checks. Set this +# value to s (smart) for automatic calculation of the interleave +# factor unless you have a specific reason to change it. +# s = Use "smart" interleave factor calculation +# x = Use an interleave factor of x, where x is a +# number greater than or equal to 1. + +service_interleave_factor=s + + + +# HOST INTER-CHECK DELAY METHOD +# This is the method that Nagios should use when initially +# "spreading out" host checks when it starts monitoring. The +# default is to use smart delay calculation, which will try to +# space all host checks out evenly to minimize CPU load. +# Using no delay (n) will cause all checks to be scheduled +# at the same time (with no delay between them)! +# n = None - don't use any delay between checks +# d = Use a "dumb" delay of 1 second between checks +# s = Use "smart" inter-check delay calculation +# x.xx = Use an inter-check delay of x.xx seconds + +host_inter_check_delay_method=s + + + +# MAXIMUM HOST CHECK SPREAD +# This variable determines the timeframe (in minutes) from the +# program start time that an initial check of all hosts should +# be completed. Default is 30 minutes.
+ +max_host_check_spread=30 + + + +# MAXIMUM CONCURRENT SERVICE CHECKS +# This option allows you to specify the maximum number of +# service checks that can be run in parallel at any given time. +# Specifying a value of 1 for this variable essentially prevents +# any service checks from being parallelized. A value of 0 +# will not restrict the number of concurrent checks that are +# being executed. + +max_concurrent_checks=0 + + + +# SERVICE CHECK REAPER FREQUENCY +# This is the frequency (in seconds!) that Nagios will process +# the results of services that have been checked. + +service_reaper_frequency=10 + + + + +# AUTO-RESCHEDULING OPTION +# This option determines whether or not Nagios will attempt to +# automatically reschedule active host and service checks to +# "smooth" them out over time. This can help balance the load on +# the monitoring server. +# WARNING: THIS IS AN EXPERIMENTAL FEATURE - IT CAN DEGRADE +# PERFORMANCE, RATHER THAN INCREASE IT, IF USED IMPROPERLY + +auto_reschedule_checks=0 + + + +# AUTO-RESCHEDULING INTERVAL +# This option determines how often (in seconds) Nagios will +# attempt to automatically reschedule checks. This option only +# has an effect if the auto_reschedule_checks option is enabled. +# Default is 30 seconds. +# WARNING: THIS IS AN EXPERIMENTAL FEATURE - IT CAN DEGRADE +# PERFORMANCE, RATHER THAN INCREASE IT, IF USED IMPROPERLY + +auto_rescheduling_interval=30 + + + + +# AUTO-RESCHEDULING WINDOW +# This option determines the "window" of time (in seconds) that +# Nagios will look at when automatically rescheduling checks. +# Only host and service checks that occur in the next X seconds +# (determined by this variable) will be rescheduled. This option +# only has an effect if the auto_reschedule_checks option is +# enabled. Default is 180 seconds (3 minutes). 
+# WARNING: THIS IS AN EXPERIMENTAL FEATURE - IT CAN DEGRADE +# PERFORMANCE, RATHER THAN INCREASE IT, IF USED IMPROPERLY + +auto_rescheduling_window=180 + + + +# SLEEP TIME +# This is the number of seconds to sleep between checking for system +# events and service checks that need to be run. + +sleep_time=0.25 + + + +# TIMEOUT VALUES +# These options control how much time Nagios will allow various +# types of commands to execute before killing them off. Options +# are available for controlling maximum time allotted for +# service checks, host checks, event handlers, notifications, the +# ocsp command, and performance data commands. All values are in +# seconds. + +service_check_timeout=60 +host_check_timeout=30 +event_handler_timeout=30 +notification_timeout=30 +ocsp_timeout=5 +perfdata_timeout=5 + + + +# RETAIN STATE INFORMATION +# This setting determines whether or not Nagios will save state +# information for services and hosts before it shuts down. Upon +# startup Nagios will reload all saved service and host state +# information before starting to monitor. This is useful for +# maintaining long-term data on state statistics, etc, but will +# slow Nagios down a bit when it (re)starts. Since it's only +# a one-time penalty, I think it's well worth the additional +# startup delay. + +retain_state_information=1 + + + +# STATE RETENTION FILE +# This is the file that Nagios should use to store host and +# service state information before it shuts down. The state +# information in this file is also read immediately prior to +# starting to monitor the network when Nagios is restarted. +# This file is used only if the retain_state_information +# variable is set to 1. + +state_retention_file=/usr/local/nagios/var/retention.dat + + + +# RETENTION DATA UPDATE INTERVAL +# This setting determines how often (in minutes) Nagios +# will automatically save retention data during normal operation.
+# If you set this value to 0, Nagios will not save retention +# data at regular intervals, but it will still save retention +# data before shutting down or restarting. If you have disabled +# state retention, this option has no effect. + +retention_update_interval=60 + + + +# USE RETAINED PROGRAM STATE +# This setting determines whether or not Nagios will set +# program status variables based on the values saved in the +# retention file. If you want to use retained program status +# information, set this value to 1. If not, set this value +# to 0. + +use_retained_program_state=1 + + + +# USE RETAINED SCHEDULING INFO +# This setting determines whether or not Nagios will retain +# the scheduling info (next check time) for hosts and services +# based on the values saved in the retention file. +# If you want to use retained scheduling info, set this +# value to 1. If not, set this value to 0. + +use_retained_scheduling_info=0 + + + +# INTERVAL LENGTH +# This is the number of seconds per unit interval as used in the +# host/contact/service configuration files. Setting this to 60 means +# that each interval is one minute long (60 seconds). Other settings +# have not been tested much, so your mileage is likely to vary... + +interval_length=60 + + + +# AGGRESSIVE HOST CHECKING OPTION +# If you don't want to turn on aggressive host checking features, set +# this value to 0 (the default). Otherwise set this value to 1 to +# enable the aggressive check option. Read the docs for more info +# on what aggressive host check is or check out the source code in +# base/checks.c + +use_aggressive_host_checking=0 + + + +# SERVICE CHECK EXECUTION OPTION +# This determines whether or not Nagios will actively execute +# service checks when it initially starts. If this option is +# disabled, checks are not actively made, but Nagios can still +# receive and process passive check results that come in.
Unless +# you're implementing redundant hosts or have a special need for +# disabling the execution of service checks, leave this enabled! +# Values: 1 = enable checks, 0 = disable checks + +execute_service_checks=1 + + + +# PASSIVE SERVICE CHECK ACCEPTANCE OPTION +# This determines whether or not Nagios will accept passive +# service check results when it initially (re)starts. +# Values: 1 = accept passive checks, 0 = reject passive checks + +accept_passive_service_checks=1 + + + +# HOST CHECK EXECUTION OPTION +# This determines whether or not Nagios will actively execute +# host checks when it initially starts. If this option is +# disabled, checks are not actively made, but Nagios can still +# receive and process passive check results that come in. Unless +# you're implementing redundant hosts or have a special need for +# disabling the execution of host checks, leave this enabled! +# Values: 1 = enable checks, 0 = disable checks + +execute_host_checks=1 + + + +# PASSIVE HOST CHECK ACCEPTANCE OPTION +# This determines whether or not Nagios will accept passive +# host check results when it initially (re)starts. +# Values: 1 = accept passive checks, 0 = reject passive checks + +accept_passive_host_checks=1 + + + +# NOTIFICATIONS OPTION +# This determines whether or not Nagios will send out any host or +# service notifications when it is initially (re)started. +# Values: 1 = enable notifications, 0 = disable notifications + +enable_notifications=0 + + + +# EVENT HANDLER USE OPTION +# This determines whether or not Nagios will run any host or +# service event handlers when it is initially (re)started. Unless +# you're implementing redundant hosts, leave this option enabled. +# Values: 1 = enable event handlers, 0 = disable event handlers + +enable_event_handlers=1 + + + +# PROCESS PERFORMANCE DATA OPTION +# This determines whether or not Nagios will process performance +# data returned from service and host checks.
If this option is +# enabled, host performance data will be processed using the +# host_perfdata_command (defined below) and service performance +# data will be processed using the service_perfdata_command (also +# defined below). Read the HTML docs for more information on +# performance data. +# Values: 1 = process performance data, 0 = do not process performance data + +process_performance_data=0 + + + +# HOST AND SERVICE PERFORMANCE DATA PROCESSING COMMANDS +# These commands are run after every host and service check is +# performed. These commands are executed only if the +# process_performance_data option (above) is set to 1. The command +# argument is the short name of a command definition that you +# define in your host configuration file. Read the HTML docs for +# more information on performance data. + +#host_perfdata_command=process-host-perfdata +#service_perfdata_command=process-service-perfdata + + + +# HOST AND SERVICE PERFORMANCE DATA FILES +# These files are used to store host and service performance data. +# Performance data is only written to these files if the +# process_performance_data option (above) is set to 1. + +#host_perfdata_file=/tmp/host-perfdata +#service_perfdata_file=/tmp/service-perfdata + + + +# HOST AND SERVICE PERFORMANCE DATA FILE TEMPLATES +# These options determine what data is written (and how) to the +# performance data files. The templates may contain macros, special +# characters (\t for tab, \r for carriage return, \n for newline) +# and plain text. A newline is automatically added after each write +# to the performance data file. Some examples of what you can do are +# shown below.
+ +#host_perfdata_file_template=[HOSTPERFDATA]\t$TIMET$\t$HOSTNAME$\t$HOSTEXECUTIONTIME$\t$HOSTOUTPUT$\t$HOSTPERFDATA$ +#service_perfdata_file_template=[SERVICEPERFDATA]\t$TIMET$\t$HOSTNAME$\t$SERVICEDESC$\t$SERVICEEXECUTIONTIME$\t$SERVICELATENCY$\t$SERVICEOUTPUT$\t$SERVICEPERFDATA$ + + + + +# HOST AND SERVICE PERFORMANCE DATA FILE MODES +# This option determines whether the host and service +# performance data files are opened in write ("w") or append ("a") +# mode. Unless the files are named pipes, you will probably +# want to use the default mode of append ("a"). + +#host_perfdata_file_mode=a +#service_perfdata_file_mode=a + + + +# HOST AND SERVICE PERFORMANCE DATA FILE PROCESSING INTERVAL +# These options determine how often (in seconds) the host and service +# performance data files are processed using the commands defined +# below. A value of 0 indicates the files should not be periodically +# processed. + +#host_perfdata_file_processing_interval=0 +#service_perfdata_file_processing_interval=0 + + + +# HOST AND SERVICE PERFORMANCE DATA FILE PROCESSING COMMANDS +# These commands are used to periodically process the host and +# service performance data files. The interval at which the +# processing occurs is determined by the options above. + +#host_perfdata_file_processing_command=process-host-perfdata-file +#service_perfdata_file_processing_command=process-service-perfdata-file + + + +# OBSESS OVER SERVICE CHECKS OPTION +# This determines whether or not Nagios will obsess over service +# checks and run the ocsp_command defined below. Unless you're +# planning on implementing distributed monitoring, do not enable +# this option. Read the HTML docs for more information on +# implementing distributed monitoring. +# Values: 1 = obsess over services, 0 = do not obsess (default) + +obsess_over_services=0 + + + +# OBSESSIVE COMPULSIVE SERVICE PROCESSOR COMMAND +# This is the command that is run for every service check that is +# processed by Nagios.
This command is executed only if the +# obsess_over_services option (above) is set to 1. The command +# argument is the short name of a command definition that you +# define in your host configuration file. Read the HTML docs for +# more information on implementing distributed monitoring. + +#ocsp_command=somecommand + + + +# ORPHANED SERVICE CHECK OPTION +# This determines whether or not Nagios will periodically +# check for orphaned services. Since service checks are not +# rescheduled until the results of their previous execution +# instance are processed, there exists a possibility that some +# checks may never get rescheduled. This seems to be a rare +# problem and should not happen under normal circumstances. +# If you have problems with service checks never getting +# rescheduled, you might want to try enabling this option. +# Values: 1 = enable checks, 0 = disable checks + +check_for_orphaned_services=1 + + + +# SERVICE FRESHNESS CHECK OPTION +# This option determines whether or not Nagios will periodically +# check the "freshness" of service results. Enabling this option +# is useful for ensuring passive checks are received in a timely +# manner. +# Values: 1 = enable freshness checking, 0 = disable freshness checking + +check_service_freshness=1 + + + +# SERVICE FRESHNESS CHECK INTERVAL +# This setting determines how often (in seconds) Nagios will +# check the "freshness" of service check results. If you have +# disabled service freshness checking, this option has no effect. + +service_freshness_check_interval=60 + + + +# HOST FRESHNESS CHECK OPTION +# This option determines whether or not Nagios will periodically +# check the "freshness" of host results. Enabling this option +# is useful for ensuring passive checks are received in a timely +# manner.
+# Values: 1 = enable freshness checking, 0 = disable freshness checking + +check_host_freshness=0 + + + +# HOST FRESHNESS CHECK INTERVAL +# This setting determines how often (in seconds) Nagios will +# check the "freshness" of host check results. If you have +# disabled host freshness checking, this option has no effect. + +host_freshness_check_interval=60 + + + +# AGGREGATED STATUS UPDATES +# This option determines whether or not Nagios will +# aggregate updates of host, service, and program status +# data. Normally, status data is updated immediately when +# a change occurs. This can result in high CPU loads if +# you are monitoring a lot of services. If you want Nagios +# to only refresh status data every few seconds, enable +# this option. +# Values: 1 = enable aggregate updates, 0 = disable aggregate updates + +aggregate_status_updates=1 + + + +# AGGREGATED STATUS UPDATE INTERVAL +# Combined with the aggregate_status_updates option, +# this option determines the frequency (in seconds!) that +# Nagios will periodically dump program, host, and +# service status data. If you are not using aggregated +# status data updates, this option has no effect. + +status_update_interval=15 + + + +# FLAP DETECTION OPTION +# This option determines whether or not Nagios will try +# and detect hosts and services that are "flapping". +# Flapping occurs when a host or service changes between +# states too frequently. When Nagios detects that a +# host or service is flapping, it will temporarily suppress +# notifications for that host/service until it stops +# flapping. Flap detection is very experimental, so read +# the HTML documentation before enabling this feature! +# Values: 1 = enable flap detection +# 0 = disable flap detection (default) + +enable_flap_detection=0 + + + +# FLAP DETECTION THRESHOLDS FOR HOSTS AND SERVICES +# Read the HTML documentation on flap detection for +# an explanation of what this option does.
This option +# has no effect if flap detection is disabled. + +low_service_flap_threshold=5.0 +high_service_flap_threshold=20.0 +low_host_flap_threshold=5.0 +high_host_flap_threshold=20.0 + + + +# DATE FORMAT OPTION +# This option determines how short dates are displayed. Valid options +# include: +# us (MM-DD-YYYY HH:MM:SS) +# euro (DD-MM-YYYY HH:MM:SS) +# iso8601 (YYYY-MM-DD HH:MM:SS) +# strict-iso8601 (YYYY-MM-DDTHH:MM:SS) +# + +date_format=iso8601 + + + +# P1.PL FILE LOCATION +# This value determines where the p1.pl perl script (used by the +# embedded Perl interpreter) is located. If you didn't compile +# Nagios with embedded Perl support, this option has no effect. + +p1_file=/usr/local/nagios/bin/p1.pl + + + +# ILLEGAL OBJECT NAME CHARACTERS +# This option allows you to specify illegal characters that cannot +# be used in host names, service descriptions, or names of other +# object types. + +illegal_object_name_chars=`~!$%^&*|'"<>?,()= + + + +# ILLEGAL MACRO OUTPUT CHARACTERS +# This option allows you to specify illegal characters that are +# stripped from macros before being used in notifications, event +# handlers, etc. This DOES NOT affect macros used in service or +# host check commands. +# The following macros are stripped of the characters you specify: +# $HOSTOUTPUT$ +# $HOSTPERFDATA$ +# $HOSTACKAUTHOR$ +# $HOSTACKCOMMENT$ +# $SERVICEOUTPUT$ +# $SERVICEPERFDATA$ +# $SERVICEACKAUTHOR$ +# $SERVICEACKCOMMENT$ + +illegal_macro_output_chars=`~$&|'"<> + + + +# REGULAR EXPRESSION MATCHING +# This option controls whether or not regular expression matching +# takes place in the object config files. Regular expression +# matching is used to match host, hostgroup, service, and service +# group names/descriptions in some fields of various object types. 
+# Values: 1 = enable regexp matching, 0 = disable regexp matching + +use_regexp_matching=0 + + + +# "TRUE" REGULAR EXPRESSION MATCHING +# This option controls whether or not "true" regular expression +# matching takes place in the object config files. This option +# only has an effect if regular expression matching is enabled +# (see above). If this option is DISABLED, regular expression +# matching only occurs if a string contains wildcard characters +# (* and ?). If the option is ENABLED, regexp matching occurs +# all the time (which can be annoying). +# Values: 1 = enable true matching, 0 = disable true matching + +use_true_regexp_matching=0 + + + + +# ADMINISTRATOR EMAIL ADDRESS +# The email address of the administrator of *this* machine (the one +# doing the monitoring). Nagios never uses this value itself, but +# you can access this value by using the $ADMINEMAIL$ macro in your +# notification commands. + +admin_email=s.yurevich@gsi.de + + +# ADMINISTRATOR PAGER NUMBER/ADDRESS +# The pager number/address for the administrator of *this* machine. +# Nagios never uses this value itself, but you can access this +# value by using the $ADMINPAGER$ macro in your notification +# commands. + +admin_pager=pagehadaq + + + +# DAEMON CORE DUMP OPTION +# This option determines whether or not Nagios is allowed to create +# a core dump when it runs as a daemon. Note that it is generally +# considered bad form to allow this, but it may be useful for +# debugging purposes. 
+# Values: 1 - Allow core dumps +# 0 - Do not allow core dumps (default) + +daemon_dumps_core=0 + + + diff --git a/config/servicegroups.cfg b/config/servicegroups.cfg new file mode 100644 index 0000000..44714f7 --- /dev/null +++ b/config/servicegroups.cfg @@ -0,0 +1,20 @@ +# SOUND SERVER service group +define servicegroup{ + servicegroup_name soundserver-group + alias SOUND SERVER + members hadesdaq,SOUND_SERVER,lxhadesdaq,SOUND_SERVER,hadc08,SOUND_SERVER; + } + +# HARD DISK TEST service group +define servicegroup{ + servicegroup_name harddisk-group + alias DISK TEST + members hadesdaq,DISK TEST,hadesdaq,RAID1,hadeb07,DISK_AB TEST,hadeb07,DISK_CD TEST,hades17,DISK TEST,hades17,RAID1,hades25,DISK TEST,hades25,RAID1,hades27,DISK TEST,hades27,RAID1; + } + +# online QA/DST service group +#define servicegroup{ +# servicegroup_name onlinedst-group +# alias online QA/DST +# members lxg0411,updateQA,lxg0411,updateDST,lxg0430,runPairDST,lxg0440,runPairDST,lxg0441,runPairDST,lxg0442,runPairDST,lxg0443,runPairDST,lxg0444,runPairDST,lxg0451,runQA,lxg0452,runPairDST; +# } \ No newline at end of file diff --git a/config/services.cfg b/config/services.cfg new file mode 100644 index 0000000..d723892 --- /dev/null +++ b/config/services.cfg @@ -0,0 +1,923 @@ +# Generic service definition template - This is NOT a real service, just a template! 
+ +define service{ + name generic-service ; The 'name' of this service template + active_checks_enabled 1 ; Active service checks are enabled + passive_checks_enabled 1 ; Passive service checks are enabled/accepted + parallelize_check 1 ; Active service checks should be parallelized (disabling this can lead to major performance problems) + obsess_over_service 1 ; We should obsess over this service (if necessary) + check_freshness 0 ; Default is to NOT check service 'freshness' + notifications_enabled 1 ; Service notifications are enabled + event_handler_enabled 1 ; Service event handler is enabled + flap_detection_enabled 1 ; Flap detection is enabled + failure_prediction_enabled 1 ; Failure prediction is enabled + process_perf_data 1 ; Process performance data + retain_status_information 1 ; Retain status information across program restarts + retain_nonstatus_information 1 ; Retain non-status information across program restarts + is_volatile 0 ; The service is not volatile + register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE! 
+ } + +# PING-SERVICE +define service{ + use generic-service + name ping-service + hostgroups * + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 5 + retry_check_interval 1 + contact_groups linux-admins ; Make sure that the value here is also located in the contactgroup.cfg + notification_interval 120 + notification_period 24x7 + notification_options c,r + + register 0 + } + +# SSH-SERVICE +define service{ + use generic-service + name ssh-service + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 10 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + hostgroups * + + register 0 + } + +# PING-SERVICE for lxg hosts +define service{ + use generic-service + name ping-service-lxg + hostgroups * + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 60 + retry_check_interval 1 + contact_groups linux-admins ; Make sure that the value here is also located in the contactgroup.cfg + notification_interval 120 + notification_period 24x7 + notification_options c,r + + register 0 + } + +# SSH-SERVICE for lxg hosts +define service{ + use generic-service + name ssh-service-lxg + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 60 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + hostgroups * + + register 0 + } + +# PING +define service{ + use ping-service ; Name of service template to use + hostgroup_name vmecpu-group,hadeb-group +# host_name * + service_description PING + check_command check_ping!100.0,20%!500.0,60% + } + + +# SSH +define service{ + use ssh-service +# host_name * + hostgroup_name vmecpu-group,hadeb-group + service_description SSH + check_command check_ssh!2 +} + +############# COMMON SERVICES FOR LXG04** +# PING +define service{ + use ping-service-lxg ; Name of 
service template to use + hostgroup_name lxg-group,hades-group +# host_name * + service_description PING + check_command check_ping!100.0,20%!500.0,60% + } + + +# SSH +define service{ + use ssh-service-lxg +# host_name * + hostgroup_name lxg-group,hades-group + service_description SSH + check_command check_ssh!2 +} + +####################### hadesdaq ########################## +# local raid +define service{ + use generic-service + host_name hadesdaq + service_description RAID1 + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 30 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_raid + } + +# local check load +define service{ + use generic-service + host_name hadesdaq + service_description CPU LOAD + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 5 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_load!10!10!0.9!20!20!1.5! + } + +# local disk test +define service{ + use generic-service + host_name hadesdaq + service_description DISK TEST + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 480 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 480 + notification_period 24x7 + notification_options c,r + check_command check_disk_smartctl_temp!/dev/sda!/dev/sdb!50!60! + } + +# check process: sound_server.pl +define service{ + use generic-service + host_name hadesdaq + service_description SOUND_SERVER + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 5 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 480 + notification_period 24x7 + notification_options c,r + check_command check_proc2!sound_server.pl! 
+ } + +################### lxhadesdaq ######################## +#check disk space +define service{ + use generic-service + host_name lxhadesdaq + service_description /DATA + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 30 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_disk_by_ssh!20%!10%!/data! + } + +#check disk space +define service{ + use generic-service + host_name lxhadesdaq + service_description /VAR + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 60 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_disk_by_ssh!8%!4%!/var! + } + +# remote cpu load +define service{ + use generic-service + host_name lxhadesdaq + service_description CPU LOAD + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 5 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_load_by_ssh!10!10!2.5!20!20!3.5! + } + +#check archivist +define service{ + use generic-service + host_name lxhadesdaq + service_description ARCHIVIST + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 5 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_archivist!1978! 
+ } + +#check process: runinfo2ora.pl +define service{ + use generic-service + host_name lxhadesdaq + service_description RUNINFO2ORA + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 10 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_proc2_by_ssh!runinfo2ora.pl! + } + +#check process: sound_server.pl +define service{ + use generic-service + host_name lxhadesdaq + service_description SOUND_SERVER + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 10 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_proc2_by_ssh!sound_server.pl! + } + +#check process: dhcp service +define service{ + use generic-service + host_name lxhadesdaq + service_description DHCP + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 120 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_dhcp!140.181.75.158! + } + +#check EB log file for discardred events +define service{ + use generic-service + host_name lxhadesdaq + service_description DISCARDED EVTS + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 60 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_eblog!50501! 
+ } + +#check LUSTRE status (check_lustre via check_proc_status) +define service{ + use generic-service + host_name lxhadesdaq + service_description LUSTRE + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 2 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_proc_status!50502!check_lustre! + } + +####################### hadeb01 ########################### +# remote disk check +#define service{ +# use generic-service +# host_name hadeb01 +# service_description /VAR +# is_volatile 0 +# check_period 24x7 +# max_check_attempts 3 +# normal_check_interval 60 +# retry_check_interval 1 +# contact_groups linux-admins +# notification_interval 120 +# notification_period 24x7 +# notification_options c,r +# check_command check_disk_by_ssh!20%!10%!/var! +# } + +####################### hadeb03 ########################### +# remote disk check +define service{ + use generic-service + host_name hadeb03 + service_description /D/HADEB03 + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 60 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_disk_by_ssh!20%!10%!/d/hadeb03! + } + +# remote disk check +define service{ + use generic-service + host_name hadeb03 + service_description /D/HADEB03B + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 60 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_disk_by_ssh!20%!10%!/d/hadeb03b! 
+ } + +# remote raid check +define service{ + use generic-service + host_name hadeb03 + service_description RAID1 + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 60 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_raid_by_ssh + } + +####################### hadeb04 ########################### +# remote disk check +define service{ + use generic-service + host_name hadeb04 + service_description /DATA/HADEB04 + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 60 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_disk_by_ssh!20%!10%!/data/hadeb04! + } + +####################### hadeb05 ########################### +# remote disk check +define service{ + use generic-service + host_name hadeb05 + service_description / + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 60 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_disk_by_ssh!20%!10%!/! + } + +# remote cpu load +define service{ + use generic-service + host_name hadeb05 + service_description CPU LOAD + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 5 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_load_by_ssh!10!10!1.0!20!20!1.5! 
+ } + +####################### hadeb06 ########################### +# remote disk check +define service{ + use generic-service + host_name hadeb06a + service_description /DATA/HADEB06 + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 60 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_disk_by_ssh!20%!10%!/data/hadeb06! + } + +#define service{ +# use generic-service +# host_name hadeb06a +# service_description CONNECT_RES_RAM +# is_volatile 0 +# check_period 24x7 +# max_check_attempts 3 +# normal_check_interval 10 +# retry_check_interval 1 +# contact_groups linux-admins +# notification_interval 120 +# notification_period 24x7 +# notification_options c,r +# check_command check_proc2_by_ssh!connect_res_ram! +# } + +#define service{ +# use generic-service +# host_name hadeb06a +# service_description GET_HLD_RAMDISK +# is_volatile 0 +# check_period 24x7 +# max_check_attempts 3 +# normal_check_interval 10 +# retry_check_interval 1 +# contact_groups linux-admins +# notification_interval 120 +# notification_period 24x7 +# notification_options c,r +# check_command check_proc2_by_ssh!get_hld_ramdisk! +# } + +# remote process (connect_res) check status +define service{ + use generic-service + host_name hadeb06a + service_description CONNECT_RES_RAM + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 10 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_proc_status!50501!connect_res_ram! 
+ } + +# remote process (get_hld_ramdisk) check status +define service{ + use generic-service + host_name hadeb06a + service_description GET_HLD_RAMDISK + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 10 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_proc_status!50502!get_hld_ramdisk! + } + +####################### lxg0434 ########################### +# remote process (check_archiver) check status +define service{ + use generic-service + host_name lxg0434 + service_description ARCHIVER + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 5 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_proc_status!50501!check_archiver! + } + +####################### lxg0447 ########################### +# remote disk check +define service{ + use generic-service + host_name lxg0447 + service_description /DATA.LOCAL2 + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 60 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_disk_by_ssh_lxg0447!30%!20%!/data.local2! + } + +# remote process (connect_res) check +define service{ + use generic-service + host_name lxg0447 + service_description CONNECT_RES + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 60 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_proc_by_ssh!connect_res! 
+ } + +# remote process (connect_res) check status +define service{ + use generic-service + host_name lxg0447 + service_description CONNECT_RES STATUS + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 10 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_proc_status!50501!connect_res! + } + +####################### lxg0451 ########################### +# remote disk check +define service{ + use generic-service + host_name lxg0451 + service_description /DATA.LOCAL2 + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 60 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_disk_by_ssh_lxg0451!15%!10%!/data.local2! + } + +# remote process (connect_res) check +define service{ + use generic-service + host_name lxg0451 + service_description CONNECT_RES + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 60 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_proc_by_ssh!connect_res! + } + +# remote process (connect_res) check status +define service{ + use generic-service + host_name lxg0451 + service_description CONNECT_RES STATUS + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 10 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_proc_status!50501!connect_res! 
+ } + +####################### hadeb07 ########################### +# remote disk check +define service{ + use generic-service + host_name hadeb07 + service_description DISK_AB TEST + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 1440 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 1440 + notification_period 24x7 + notification_options c,r + check_command check_disk_smartctl_temp_by_ssh!/dev/sda!/dev/sdb!50!60! + } + +define service{ + use generic-service + host_name hadeb07 + service_description DISK_CD TEST + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 1440 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 1440 + notification_period 24x7 + notification_options c,r + check_command check_disk_smartctl_temp_by_ssh!/dev/sdc!/dev/sdd!50!60! + } + +define service{ + use generic-service + host_name hadeb07 + service_description BACKUP + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 1440 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 1440 + notification_period 24x7 + notification_options c,r + check_command check_proc_status!50501!check_backup! + } + +###################### hadc08 ############################# +#check process: sound_server.pl +define service{ + use generic-service + host_name hadc08 + service_description SOUND_SERVER + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 10 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 120 + notification_period 24x7 + notification_options c,r + check_command check_proc2_by_ssh!sound_server.pl! 
+ } + +###################### hades25 ############################ +## local disk test +define service{ + use generic-service + host_name hades25 + service_description DISK TEST + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 1440 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 1440 + notification_period 24x7 + notification_options c,r + check_command check_disk_smartctl_temp_by_ssh!/dev/sda!/dev/sdb!50!60! + } + +# remote raid check +define service{ + use generic-service + host_name hades25 + service_description RAID1 + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 1440 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 1440 + notification_period 24x7 + notification_options c,r + check_command check_raid_by_ssh + } + +#check disk space +define service{ + use generic-service + host_name hades25 + service_description / + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 1440 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 1440 + notification_period 24x7 + notification_options c,r + check_command check_disk_by_ssh!20%!10%!/! + } + +###################### hades17 ############################ +## local disk test +define service{ + use generic-service + host_name hades17 + service_description DISK TEST + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 1440 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 1440 + notification_period 24x7 + notification_options c,r + check_command check_disk_smartctl_temp_by_ssh!/dev/sda!/dev/sdb!60!70! 
+ } + +# remote raid check +define service{ + use generic-service + host_name hades17 + service_description RAID1 + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 1440 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 1440 + notification_period 24x7 + notification_options c,r + check_command check_raid_by_ssh + } + +#check disk space +define service{ + use generic-service + host_name hades17 + service_description / + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 1440 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 1440 + notification_period 24x7 + notification_options c,r + check_command check_disk_by_ssh!20%!10%!/! + } + +###################### hades27 ############################ +## remote disk test +define service{ + use generic-service + host_name hades27 + service_description DISK TEST + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 1440 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 1440 + notification_period 24x7 + notification_options c,r + check_command check_disk_smartctl_temp_by_ssh!/dev/sda!/dev/sdb!60!70! + } + +# remote raid check +define service{ + use generic-service + host_name hades27 + service_description RAID1 + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 1440 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 1440 + notification_period 24x7 + notification_options c,r + check_command check_raid_by_ssh + } + +#check disk space +define service{ + use generic-service + host_name hades27 + service_description / + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 1440 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 1440 + notification_period 24x7 + notification_options c,r + check_command check_disk_by_ssh!20%!10%!/! 
+ } + +#check EPICS +define service{ + use generic-service + host_name hadsc1 + service_description EPICS test + is_volatile 0 + check_period 24x7 + max_check_attempts 3 + normal_check_interval 1440 + retry_check_interval 1 + contact_groups linux-admins + notification_interval 1440 + notification_period 24x7 + notification_options c,r + check_command check_epics!HAD:hadsc1:scan1! + } diff --git a/plugins/check_archivist.pl b/plugins/check_archivist.pl new file mode 100755 index 0000000..3e96d1c --- /dev/null +++ b/plugins/check_archivist.pl @@ -0,0 +1,72 @@ +#!/usr/bin/perl -w +# --------------------------------------------------------------------------- +# File Name: my_check_archivist.pl +# Author: Sergey Yurevich +# Date: 16/01/2007 +# Version: 0.1 +# Description: script will check to see if there +# is a message from archivist +# --------------------------------------------------------------------------- + +use strict; +use warnings; +use IO::Socket; +use lib '/usr/local/nagios/libexec/'; +use utils qw($TIMEOUT %ERRORS &print_revision &support); + +@ARGV == 2 or die "usage: my_check_archivist.pl host_ip host_port\n"; + +my ($remote_host, $remote_port) = @ARGV; + +#my $remote_host = 'lxhadesdaq.gsi.de'; +#my $remote_port = '60006'; +my $protocol = 'tcp'; +my $state; +my $answer = ""; + +my $socket = IO::Socket::INET->new(PeerAddr => $remote_host, + PeerPort => $remote_port, + Proto => $protocol, + Type => SOCK_STREAM) + or $answer = "CRITICAL - no response from archivist at $remote_host:$remote_port"; + +if($answer){ + $state = $ERRORS{'CRITICAL'}; +} +else{ + $answer = <$socket>; + $answer = "" unless defined $answer; # peer closed the connection without sending a line + close($socket); + + if($answer =~/OK/){ + $state = $ERRORS{'OK'}; + } + elsif($answer =~/WARNING/){ + $state = $ERRORS{'WARNING'}; + } + elsif($answer =~/CRITICAL/){ + $state = $ERRORS{'CRITICAL'}; + } + else{ # empty or unrecognised reply: never leave $state undefined (would exit 0 = OK) + $state = $ERRORS{'UNKNOWN'}; + } +} + +if($state == $ERRORS{'OK'}){ + print "$answer\n"; +} +elsif($state == $ERRORS{'WARNING'}){ + print "$answer\n"; +} 
+elsif($state == $ERRORS{'CRITICAL'}){ + print "$answer\n"; +} +elsif($state == $ERRORS{'UNKNOWN'}){ + print "UNKNOWN - $answer\n"; +} + +exit $state; + + + + diff --git a/plugins/check_raid.pl b/plugins/check_raid.pl new file mode 100755 index 0000000..ebbc205 --- /dev/null +++ b/plugins/check_raid.pl @@ -0,0 +1,100 @@ +#!/usr/bin/perl -w +# ------------------------------------------------------------------------------ +# File Name: chech_raid.pl +# Author: Thomas Nilsen - Norway +# Date: 14/06/2003 +# Version: 0.1 +# Description: This script will check to see if any software raid +# devices are down. +# Email: thomas.nilsen@doc-s.co.uk +# WWW: www.doc-s.co.uk +# ------------------------------------------------------------------------------ +# Copyright 2003 (c) Thomas Nilsen +# Credits go to Ethan Galstad for coding Nagios +# License GPL +# ------------------------------------------------------------------------------ +# Date Author Reason +# ---- ------ ------ +# 14/06/2003 TN Initial Release +# - Format of mdstat assumed to be "2 line" per +# device with [??] on the second line. 
+# ------------------------------------------------------------------------------ + +use strict; +use warnings; +use Getopt::Long;; +use vars qw($opt_V $opt_h $opt_t $opt_F $PROGNAME); +use lib '/usr/local/nagios/libexec/'; +use utils qw($TIMEOUT %ERRORS &print_revision &support); + +$PROGNAME="check_raid"; + +$ENV{'PATH'}=''; +$ENV{'BASH_ENV'}=''; +$ENV{'ENV'}=''; +my ( $line, $prevline, $stat, $state ,@device, $msg, $status, $timeout); + +$stat="/proc/mdstat"; + +#Option checking (-t/--timeout take an integer number of seconds) +Getopt::Long::Configure('bundling'); +$status = GetOptions( + "V" => \$opt_V, "version" => \$opt_V, + "h" => \$opt_h, "help" => \$opt_h, + "F" => \$opt_F, "filename" => \$opt_F, + "t=i" => \$opt_t, "timeout=i" => \$opt_t); +# Version +if ($opt_V) { + print_revision($PROGNAME,'$Revision: 1.1 $'); + exit $ERRORS{'OK'}; +} +# Help +if ($opt_h) { + #print_help(); + exit $ERRORS{'OK'}; +} +# Filename supplied (-F is a plain flag; the path is the next remaining argument) +if ($opt_F) { + $opt_F = shift; + $stat = $1 if ($opt_F =~ /^(.*)$/); + + if ( ! -r $stat ) { + print "Invalid mdstat file: $opt_F\n"; + exit $ERRORS{'UNKNOWN'}; + } +} + +$timeout = $TIMEOUT; +($opt_t) && ($opt_t =~ /^([0-9]+)$/) && ($timeout = $1); + +# Just in case of problems, let's not hang Nagios +$SIG{'ALRM'} = sub { + print ("ERROR: No response (alarm)\n"); + exit $ERRORS{'UNKNOWN'}; +}; +alarm($timeout); + +# Start checking the file... (an unreadable file must not be reported as OK) +open (FH, '<', $stat) or do { print "UNKNOWN - cannot open $stat: $!\n"; exit $ERRORS{'UNKNOWN'}; }; +$state = $ERRORS{'OK'}; +$msg =""; + +# Now check the mdstat file, line by line; the device name is on the line before the [U_] status +while (<FH>) { + $line= $_; + if( $line =~ / \[_|_\]|U_|_U /) { + $state = $ERRORS{'CRITICAL'}; + @device = split(/ /,$prevline); + $msg = $msg . $device[0] . 
": - "; + } + $prevline = $line; +} +close (FH); + +if ( $state == $ERRORS{'CRITICAL'} ) { + print "CRITICAL - Device(s) $msg have failed\n"; +} elsif ( $state == $ERRORS{'OK'} ) + { print "OK - All devices are online\n"; } +exit $state; + + diff --git a/plugins/my_check_dhcp.pl b/plugins/my_check_dhcp.pl new file mode 100755 index 0000000..469d2f3 --- /dev/null +++ b/plugins/my_check_dhcp.pl @@ -0,0 +1,47 @@ +#!/usr/bin/perl -w +# ---------------------------------------------------------------------------- +# File Name: my_check_process.pl +# Author: Sergey Yurevich +# Date: 05/04/2007 +# Version: 0.1 +# Description: script will check if there is a process running +# ---------------------------------------------------------------------------- + +use strict; +use warnings; +use lib '/usr/local/nagios/libexec/'; +use utils qw($TIMEOUT %ERRORS &print_revision &support); + +@ARGV == 1 or die "usage: my_check_dhcp.pl ip\n"; + +my ($ip) = @ARGV; + +#dhcping - check dhcp service +#-c 140.181.67.143 - from hadesdaq (140.181.67.143) +#-s 140.181.75.158 - on lxhadesdaq (140.181.75.158) +#-h 00:40:9E:00:99:E2 - if one gets an answer from MAC address (00:40:9E:00:99:E2) + +my $answer = `dhcping -c 140.181.67.143 -s $ip -h 00:40:9E:00:99:E2`; +chop($answer); + +my $state; + +if($answer =~/Got answer from/){ + $state = $ERRORS{'OK'}; +} +else{ + $state = $ERRORS{'CRITICAL'}; +} + +if($state == $ERRORS{'OK'}){ + print "OK - dhcp is running\n"; +} +elsif($state == $ERRORS{'CRITICAL'}){ + print "CRITICAL - dhcp is not running!\n"; +} + +exit $state; + + + + diff --git a/plugins/my_check_disk_smartctl.pl b/plugins/my_check_disk_smartctl.pl new file mode 100755 index 0000000..5ddb147 --- /dev/null +++ b/plugins/my_check_disk_smartctl.pl @@ -0,0 +1,158 @@ +#! 
/usr/bin/perl -w +# ---------------------------------------------------------------------------- +# File Name: my_check_disk_smartctl.pl +# Author: Sergey Yurevich +# Date: 16/01/2007 +# Version: 0.1 +# Description: script will perform SMART overall-health +# self-assessment test + temperature check. +# ---------------------------------------------------------------------------- + +use strict; +use Data::Dumper; +use lib '/usr/local/nagios/libexec/' ; +use utils qw($TIMEOUT %ERRORS &print_revision &support &usage); +use Getopt::Long; + +Getopt::Long::Configure( 'bundling' ); + +my (@devices, $temper, $warntemp, $crittemp); + +GetOptions( "d|devices=s" => \@devices, + "t|temper" => \$temper, + "w|warning=i" => \$warntemp, + "c|critical=i" => \$crittemp ); + +my $SMARTCTL = "/usr/sbin/smartctl"; +my $state; + +my $stateCrit = -1; +my $stateWarn = -1; +my $stateUnkn = -1; + +die "usage: my_check_disk_smartctl.pl [--temper -w 50 -c 60] -d /dev/sda -d /dev/sdb ...\n" unless @devices; + +#- loop over disks +foreach my $disk (@devices) +{ + $state = -1; + + #- valid devices: /dev/hda,..., /dev/sda,... (whole argument must match: it is interpolated into a shell command) + unless ($disk =~ /\A(\/dev\/[hs]d[0-9a-z]+)\z/){ + print "ERROR: Invalid disk: $disk\n"; + exit $ERRORS{'UNKNOWN'}; + } + + if( $temper ) + { + my $temperature = &check_temperature( $disk ); + + if( $temperature > $crittemp ) + { + print "CRITICAL! $disk: temperature is $temperature "; + $stateCrit = $ERRORS{'CRITICAL'}; + } + elsif( $temperature > $warntemp ) + { + print "WARNING! $disk: temperature is $temperature "; + $stateWarn = $ERRORS{'WARNING'}; + } + elsif( $temperature eq "" ) + { + print "CRITICAL! No output from smartctl -A $disk "; + $stateCrit = $ERRORS{'CRITICAL'}; + } + elsif( $temperature == -1 ) + { + print "CRITICAL! Temperature check failed! "; + $stateCrit = $ERRORS{'CRITICAL'}; + } + else + { + print "OK! 
$disk: temperature = $temperature "; + } + } + + my $command = "$SMARTCTL -H $disk"; + my $status = `$command`; + + #$status = "hgftrefsd FAIL"; + + if ($status eq "") { + print "ERROR: no output from '$command'\n"; + $state = $ERRORS{'CRITICAL'}; + } + + #- $ok gets equal the last line of smartctl output if PASSED... + my $ok = $1 if $status =~ /\n(.*?test result: PASSED\n)/i or + $status =~ /\n(.*?Sense: Ok!\n)/i; + if ($ok){ + #print "$disk: $ok"; + print "OK! $disk: SMART health test: PASSED "; + $state = $ERRORS{'OK'}; + } + + #- $fail gets equal the last line of smartctl output if FAIL... + my $fail = $1 if $status =~ /\n(.*?[^WHEN_]FAIL[^ED][^\n]*)/i; + if ($fail){ + print "CRITICAL! $disk: $fail"; + $state = $ERRORS{'CRITICAL'}; + } + + #- $old gets equal the last line of smartctl output if OLD... + my $old = $1 if $status =~ /\n(.*?OLD[^_age][^\n]*)/i; + if ($old){ + print "WARNING! $disk: $old\n"; + $state = $ERRORS{'WARNING'}; + } + + if($state == $ERRORS{'CRITICAL'}){ + $stateCrit = $ERRORS{'CRITICAL'}; + } + elsif($state == $ERRORS{'WARNING'}){ + $stateWarn = $ERRORS{'WARNING'}; + } + elsif($state == -1){ + print "UNKNOWN! 
Check manually: $SMARTCTL -H $disk "; + $stateUnkn = $ERRORS{'UNKNOWN'}; + } +} + +if($stateCrit == $ERRORS{'CRITICAL'}){ + exit $stateCrit; +} +elsif($stateUnkn == $ERRORS{'UNKNOWN'}){ + exit $stateUnkn; +} +elsif($stateWarn == $ERRORS{'WARNING'}){ + exit $stateWarn; +} +else{ + exit $ERRORS{'OK'}; +} + +sub check_temperature +{ + my ($disk) = @_; + + my $command = "$SMARTCTL -A $disk"; + my $temperature = -1; + + my @status = `$command`; + + #print Dumper @status; + + foreach my $line ( @status ) + { + chop( $line ); + if( $line =~ "194 Temperature_Celsius" ) + { + my @words = split(/ +/, $line); + $temperature = $words[9]; + } + } + + return $temperature; +} + + diff --git a/plugins/my_check_eblog.pl b/plugins/my_check_eblog.pl new file mode 100755 index 0000000..a8e458b --- /dev/null +++ b/plugins/my_check_eblog.pl @@ -0,0 +1,192 @@ +#!/usr/bin/perl -w + +######################################################## +# +# Author: S.Y. +# +# This script checks Event Builder log file and +# estimates the number of files with discarded events +# above a given threshold +# +######################################################## + +use strict; +use Data::Dumper; +use Tie::File; +use Fcntl; +use IO::Handle; + +my $i; +my @lines; +my $line; + +my $file2read = sprintf("%s_s.tcl", $ENV{DAQ_SETUP}); + +tie(@lines, 'Tie::File', $file2read, mode => O_RDONLY) + or die "Cannot tie file $file2read: $!\n"; + + +#- the file info is searched only for files created during: +my $last_minutes = 60; #last 60 minutes + +#- get current time in iso format +my ($y, $m, $d, $hh, $mm, $ss) = (localtime)[5,4,3,2,1,0]; $y += 1900; $m++; +my $iso_now = sprintf("%d-%02d-%02d %02d:%02d:%02d", $y, $m, $d, $hh, $mm, $ss); + +#- init counters +my $filenum = 0; +my $errfilenum1 = 0; #file with many evtsDiscarded +my $errfilenum2 = 0; #file with many evtsDataError +my $errfilenum3 = 0; #file with many evtsTagError + +my ($evtsComplete, $evtsDiscarded, $evtsDataError, $evtsTagError); + +#- status info 
for Nagios
+my $status;
+
+#- true while event counters have been read that no "stopdate" line has
+#- evaluated yet (only needed when the loop runs into the top of the log)
+my $pending = 0;
+
+#--- loop over all lines backward in the file2read
+#--- (indices 0 and 1, the first two lines of the log, are never examined)
+for ( $i = $#lines; $i > 1; $i--){
+
+    $line = $lines[$i];
+
+    #- look for a line with "stopdate"
+    if ( $line =~ /stopdate/){
+
+        #- check the number of problematic events in file
+        count_problem_file() if $filenum > 0;
+        $pending = 0;
+
+        #- extract stop date from the line (format: "2007-05-05T19:32:53")
+        my ($v1, $v2, $stop_date) = split(" ", $line);
+
+        #- a line that mentions stopdate but carries no third field
+        #- cannot be dated; skip it
+        next unless defined $stop_date;
+
+        #- get rid of ""
+        $stop_date =~ s/\"//g;
+
+        #-get rid of "T"
+        $stop_date =~ s/T/ /;
+
+        #- get time difference (in minutes)
+        my $time_diff = timeDiff( date1 => $stop_date, date2 => $iso_now );
+
+        #print "stop_date = $stop_date, time_diff = $time_diff\n";
+
+        #- a stop date older than the window ends the search
+        if ( $time_diff > $last_minutes) {
+            $status = build_status();
+            last;
+        }
+
+        #- increment filenum counter
+        $filenum++;
+    } #if ( $line =~ /stopdate/){
+    else {
+
+        my ($v1, $v2);
+
+        if($line =~ /evtsComplete/) {
+            ($v1, $v2, $evtsComplete) = split(" ", $line);
+            $pending = 1;
+        }
+        if($line =~ /evtsDiscarded/) {
+            ($v1, $v2, $evtsDiscarded) = split(" ", $line);
+        }
+        if($line =~ /evtsDataError/) {
+            ($v1, $v2, $evtsDataError) = split(" ", $line);
+        }
+        if($line =~ /evtsTagError/) {
+            ($v1, $v2, $evtsTagError) = split(" ", $line);
+        }
+    }
+}
+
+#- The top of the log was reached without meeting a stop date older than the
+#- window. Without this branch $status stayed undefined and the line printed
+#- below carried no state at all.
+unless (defined $status) {
+    count_problem_file() if $pending and $filenum > 0;
+    $status = build_status();
+}
+
+print "status for Nagios: $status\n";
+
+#- count_problem_file()
+#- Looks at the event counters read last and increments $errfilenum1..3 for
+#- every kind of problem that affects more than 10% of the complete events.
+#- Does nothing if no positive evtsComplete value is known.
+sub count_problem_file {
+    return unless defined $evtsComplete and $evtsComplete > 0;
+
+    $errfilenum1++ if defined $evtsDiscarded and $evtsDiscarded/$evtsComplete > 0.1;
+    $errfilenum2++ if defined $evtsDataError and $evtsDataError/$evtsComplete > 0.1;
+    $errfilenum3++ if defined $evtsTagError  and $evtsTagError/$evtsComplete  > 0.1;
+
+    return;
+}
+
+#- build_status()
+#- Returns the status text for the files counted so far: WARNING if more
+#- than 10% of them had discarded events, data errors or tag errors (tested
+#- in that order), OK otherwise. The number in the text is the fraction of
+#- affected files (0..1).
+sub build_status {
+    if ($filenum == 0) {
+        return "OK - no new files in a log during last $last_minutes min.";
+    }
+
+    my $share1 = $errfilenum1/$filenum;
+    my $share2 = $errfilenum2/$filenum;
+    my $share3 = $errfilenum3/$filenum;
+
+    if ($share1 > 0.1) {
+        return "WARNING - $share1 files with discarded events during last $last_minutes min.";
+    }
+    if ($share2 > 0.1) {
+        return "WARNING - $share2 files with data error during last $last_minutes min.";
+    }
+    if ($share3 > 0.1) {
+        return "WARNING - $share3 files with tag error during last $last_minutes min.";
+    }
+    return "OK - $share1 files with discarded events during last $last_minutes min.";
+}
+
+#
+# timeDiff( date1 => $earlier, date2 => $later )
+#
+# this subroutine calculates the time difference in minutes
+# usage: $timeDiffStr = timeDiff( date1 => $sale_time, date2 => $iso_now );
+# time format: $sale_time = "2007-05-05 19:32:53";
+# (hours, minutes and seconds may be missing and then count as 0)
+#
+# Returns date2 - date1 in whole minutes, truncated towards zero. Both
+# dates are taken as plain calendar values; no time zone or daylight
+# saving rule is applied.
+#
+sub timeDiff {
+    my %args = @_;
+
+    my $seconds1 = _date_to_seconds($args{'date1'});
+    my $seconds2 = _date_to_seconds($args{'date2'});
+
+    return int(($seconds2 - $seconds1) / 60);
+}
+
+#- _date_to_seconds( "YYYY-MM-DD hh:mm:ss" )
+#- Number of seconds since a fixed (arbitrary) origin; only differences of
+#- two results are meaningful. Leap days are counted, so Feb 29 and Mar 1
+#- of a leap year no longer fall on the same day number.
+sub _date_to_seconds {
+    my ($date) = @_;
+
+    #- days before the first of each month in a non-leap year
+    my @offset_days = qw(0 31 59 90 120 151 181 212 243 273 304 334);
+
+    #- optional fields: 0 when the string is too short to contain them
+    my $field = sub {
+        my ($offset, $len) = @_;
+        return 0 if length($date) <= $offset;
+        return substr($date, $offset, $len) || 0;
+    };
+
+    my $year  = substr($date, 0, 4);
+    my $month = substr($date, 5, 2);
+    my $day   = substr($date, 8, 2);
+    my $hh    = $field->(11, 2);
+    my $mm    = $field->(14, 2);
+    my $ss    = $field->(17, 2);
+
+    my $is_leap = (($year % 4 == 0 && $year % 100 != 0) || $year % 400 == 0);
+
+    #- leap days contained in all years before $year
+    my $prev = $year - 1;
+    my $leap_days = int($prev / 4) - int($prev / 100) + int($prev / 400);
+
+    my $total_days = 365 * $year + $leap_days + $offset_days[$month - 1] + $day;
+    $total_days++ if $is_leap && $month > 2;
+
+    return $total_days * 86400 + $hh * 3600 + $mm * 60 + $ss;
+}
diff --git a/plugins/my_check_eblog_status.pl b/plugins/my_check_eblog_status.pl
new file mode 100755
index 0000000..bf24e2d
--- /dev/null
+++ b/plugins/my_check_eblog_status.pl
@@ -0,0 +1,74 @@
+#!/usr/bin/perl -w
+# ------------------------------------------------------------------------------
+# File Name: my_check_eblog_status.pl
+# Author: Sergey Yurevich
+# Date: 16/01/2007
+# Version: 0.1
+# Description: script checks the status of the process (alive/dead)
+# ------------------------------------------------------------------------------
+
+use strict;
+use warnings;
+use IO::Socket;
+use lib '/usr/local/nagios/libexec/';
+use utils qw($TIMEOUT %ERRORS &print_revision &support);
+
+@ARGV == 2 or die "usage: my_check_eblog_status.pl host_ip host_port\n";
+
+my ($remote_host, $remote_port) = @ARGV;
+
+#my $remote_host = 'lxhadesdaq.gsi.de';
+#my $remote_port = '60006';
+my $protocol = 'tcp';
+my $state;
+my $answer = "";
+
+my $socket = IO::Socket::INET->new(PeerAddr => $remote_host,
+                                   PeerPort => $remote_port,
+                                   Proto    => $protocol,
+                                   Type     => SOCK_STREAM)
+    #or die "Couldn't connect to $remote_host:$remote_port : $@\n";
+    or $answer = "WARNING - no response from my_check_eblog at $remote_host:$remote_port";
+
+if($answer){
+    $state = $ERRORS{'WARNING'};
+}
+else{
+    $answer = <$socket>;
+
+    close($socket);
+
+    #- the peer may close the connection without sending a line; an
+    #- undefined answer used to leave $state undefined, which made the
+    #- plugin exit with 0 (OK)
+    $answer = "" unless defined $answer;
+    chomp($answer);
+
+    if($answer =~/OK/){
+        #nagios exit code 0 = status OK = green
+        $state = $ERRORS{'OK'};
+    }
+    elsif($answer =~/WARNING/){
+        $state = $ERRORS{'WARNING'};
+    }
+    elsif($answer =~/CRITICAL/){
+        $state = $ERRORS{'CRITICAL'};
+    }
+    else{
+        #empty or unrecognised answer: nagios exit code 3 = status UNKNOWN
+        $state = $ERRORS{'UNKNOWN'};
+    }
+}
+
+if($state == $ERRORS{'OK'}){
+    print "$answer\n";
+}
+elsif($state == $ERRORS{'WARNING'}){
+    print "$answer\n";
+}
+elsif($state == $ERRORS{'CRITICAL'}){
+    print "$answer\n";
+}
+elsif($state == $ERRORS{'UNKNOWN'}){
+    print "UNKNOWN - $answer\n";
+}
+
+exit $state;
+
+
+
+
diff --git a/plugins/my_check_ping.pl b/plugins/my_check_ping.pl
new file mode 100755
index 0000000..f72d095
--- /dev/null
+++ b/plugins/my_check_ping.pl
@@ -0,0 +1,80 @@
+#!/usr/bin/perl -w
+
+#BEGIN{
+
+# push @INC, "/usr/lib/perl5/site_perl/5.8.0/i586-linux-thread-multi";
+
+#}
+
+=head1 NAME
+
+check_ping.pl - pings a host and returns statistics data.
+
+=head1 VERSION
+
+Version 1.0
+
+=head1 AUTHOR
+
+(c) 2003 Hannes Schulz
+
+=head1 SYNOPSIS
+
+ ./check_ping.pl --host <host> --loss <warn>,<crit> --rta <warn>,<crit>
+                 [--timeout <seconds>] [--packages <count>]
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-H>, B<--host> <host>
+
+Host to ping (required).
+
+=item B<-l>, B<--loss> <warn>,<crit>
+
+Packet loss thresholds in percent; <warn> must be smaller than <crit>.
+
+=item B<-r>, B<--rta> <warn>,<crit>
+
+Round trip average thresholds in milliseconds; <warn> must be smaller
+than <crit>.
+
+=item B<-t>, B<--timeout> <seconds>
+
+Total timeout, shared equally by all pings (default: 3.5 seconds per ping).
+
+=item B<-p>, B<--packages> <count>
+
+Number of pings to send (default: 5).
+
+=back
+
+=head1 DESCRIPTION
+
+This pings a host via the C<Net::Ping> module from CPAN and returns
+RTA and loss.
+
+Exit code: 2 (critical) if RTA or loss exceeds its critical threshold,
+1 (warning) if one exceeds its warning threshold, 0 otherwise.
+
+=cut
+
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+use Net::Ping;
+
+my ($host,$aloss,$arta,$timeout,$pack);
+GetOptions(
+    "H|host=s",     \$host,
+    "l|loss=s",     \$aloss,
+    "r|rta=s",      \$arta,
+    "t|timeout=i",  \$timeout,
+    "p|packages=i", \$pack
+);
+
+# A missing --loss/--rta is a syntax error like a malformed one; test for
+# definedness first so that no undefined value is pattern-matched.
+pod2usage("$0: No host given!\n") unless($host);
+pod2usage("$0: Parameter syntax error!\n") unless(defined $aloss and $aloss =~ /^\d+,\d+$/);
+pod2usage("$0: Parameter syntax error!\n") unless(defined $arta and $arta =~ /^\d+,\d+$/);
+
+my ($wloss,$closs) = split /,/,$aloss;
+my ($wrta,$crta) = split /,/,$arta;
+
+pod2usage("$0: Warning > Critical!\n") unless($wloss<$closs);
+pod2usage("$0: Warning > Critical!\n") unless($wrta<$crta);
+
+$pack ||= 5;
+$timeout ||= ($pack*3.5);
+
+# TCP ping; every single ping gets an equal share of the total timeout.
+my $p = Net::Ping->new("tcp",$timeout/$pack);
+$p->hires(1);
+
+my ($ret, $duration, $ip, $nok, $dur);
+$nok = 0; $dur = 0;
+for(1..$pack){
+    ($ret, $duration, $ip) = $p->ping($host);
+    $nok++ if(!$ret);
+    # ping() returns an empty list when the host cannot be resolved
+    $dur += $duration if(defined $duration);
+    $p->close();
+}
+
+my $rta = 1000 * $dur/$pack;    # average over all pings, in ms
+my $loss = 100 * $nok/$pack;    # failed pings, in percent
+
+printf("PING - Packet loss = %i%%, RTA = %.2f ms\n", $loss, $rta);
+
+exit(2) if($rta>$crta or $loss>$closs); # Nagios: Critical
+exit(1) if($rta>$wrta or $loss>$wloss); # Nagios: Warning
+exit(0); # Nagios: OK
diff --git a/plugins/my_check_proc_status.pl
b/plugins/my_check_proc_status.pl
new file mode 100755
index 0000000..d5ed355
--- /dev/null
+++ b/plugins/my_check_proc_status.pl
@@ -0,0 +1,71 @@
+#!/usr/bin/perl -w
+# ----------------------------------------------------------------------------
+# File Name: my_check_proc_status.pl
+# Author: Sergey Yurevich
+# Date: 16/01/2007
+# Version: 0.1
+# Description: script checks the status of the process (alive/dead)
+#
+# Connects to host_ip:host_port, reads one line and derives the Nagios state
+# from the word OK, WARNING or CRITICAL found in it. No connection is
+# CRITICAL; an empty or unrecognised line is UNKNOWN.
+# ----------------------------------------------------------------------------
+
+use strict;
+use warnings;
+use IO::Socket;
+use lib '/usr/local/nagios/libexec/';
+use utils qw($TIMEOUT %ERRORS &print_revision &support);
+
+@ARGV == 3 or die "usage: my_check_proc_status.pl host_ip host_port proc_name\n";
+
+my ($remote_host, $remote_port, $proc_name) = @ARGV;
+
+#my $remote_host = 'lxhadesdaq.gsi.de';
+#my $remote_port = '60006';
+my $protocol = 'tcp';
+my $state;
+my $answer = "";
+
+my $socket = IO::Socket::INET->new(PeerAddr => $remote_host,
+                                   PeerPort => $remote_port,
+                                   Proto    => $protocol,
+                                   Type     => SOCK_STREAM)
+    or $answer = "CRITICAL - no response from $proc_name at $remote_host:$remote_port";
+
+if($answer){
+    $state = $ERRORS{'CRITICAL'};
+}
+else{
+    $answer = <$socket>;
+
+    close($socket);
+
+    #- the peer may close the connection without sending a line; an
+    #- undefined answer used to leave $state undefined, which made the
+    #- plugin exit with 0 (OK)
+    $answer = "" unless defined $answer;
+    chomp($answer);
+
+    if($answer =~/OK/){
+        $state = $ERRORS{'OK'};
+    }
+    elsif($answer =~/WARNING/){
+        $state = $ERRORS{'WARNING'};
+    }
+    elsif($answer =~/CRITICAL/){
+        $state = $ERRORS{'CRITICAL'};
+    }
+    else{
+        $state = $ERRORS{'UNKNOWN'};
+    }
+}
+
+#- OK, WARNING and CRITICAL answers already start with their state word
+if($state == $ERRORS{'UNKNOWN'}){
+    print "UNKNOWN - $answer\n";
+}
+else{
+    print "$answer\n";
+}
+
+exit $state;
+
+
+
+
diff --git a/plugins/my_check_process.pl b/plugins/my_check_process.pl
new file mode 100755
index 0000000..9dd3bd1
--- /dev/null
+++ b/plugins/my_check_process.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/perl -w
+# 
----------------------------------------------------------------------------
+# File Name: my_check_process.pl
+# Author: Sergey Yurevich
+# Date: 05/04/2007
+# Version: 0.1
+# Description: script will check if there is a process running
+#              (OK with the pid list if pidof finds it, CRITICAL otherwise)
+# ----------------------------------------------------------------------------
+
+use strict;
+use warnings;
+use lib '/usr/local/nagios/libexec/';
+use utils qw($TIMEOUT %ERRORS &print_revision &support);
+
+@ARGV == 1 or die "usage: my_check_process.pl process_name\n";
+
+my ($process_name) = @ARGV;
+
+# List-form pipe open: pidof is executed directly, so the process name is
+# never parsed by a shell.
+my $pids = '';
+if (open(my $pidof, '-|', 'pidof', '-x', $process_name)) {
+    local $/;                       # read the whole output at once
+    my $output = <$pidof>;
+    $pids = $output if defined $output;
+    close($pidof);
+}
+chomp($pids);
+
+# pidof prints nothing when no process of that name exists
+my $state = $pids ? $ERRORS{'OK'} : $ERRORS{'CRITICAL'};
+
+if ($state == $ERRORS{'OK'}) {
+    print "OK - pid of $process_name is $pids\n";
+}
+else {
+    print "CRITICAL - $process_name is not running!\n";
+}
+
+exit $state;
+
+
+
+
diff --git a/plugins/my_check_process_qa-dst.pl b/plugins/my_check_process_qa-dst.pl
new file mode 100755
index 0000000..3c12b09
--- /dev/null
+++ b/plugins/my_check_process_qa-dst.pl
@@ -0,0 +1,85 @@
+#!/usr/bin/perl -w
+# ----------------------------------------------------------------------------
+# File Name: my_check_process_qa-dst.pl
+# Author: Sergey Yurevich
+# Date: 05/04/2007
+# Version: 0.1
+# Description: script will check if there is a process running
+# ----------------------------------------------------------------------------
+
+use strict;
+use warnings;
+#use lib '/usr/local/nagios/libexec/';
+use lib '/misc/hadaq/nagios/nagios-plugins-1.4.5/plugins-scripts/';
+use utils qw($TIMEOUT %ERRORS &print_revision &support);
+
+@ARGV == 3 or die "usage: my_check_process.pl process_name number_of_processes status_level\n";
+
+# process_name - name of process to be checked.
+# status_level - the return STATUS if process is not found.
+# number_of_processes - exact number of running processes to be checked.
+
+my ($process_name, $proc_num, $status_level) = @ARGV;
+
+# status_level decides the exit code of every problem reported below. Any
+# value other than CRITICAL or WARNING used to fall through to the end of the
+# script, i.e. to exit code 0 (OK), even though a problem had been printed.
+unless ($status_level eq "CRITICAL" or $status_level eq "WARNING") {
+    print "UNKNOWN - invalid status_level '$status_level' (expected CRITICAL or WARNING)\n";
+    exit $ERRORS{'UNKNOWN'};
+}
+
+# number_of_processes is compared numerically further down
+unless ($proc_num =~ /\A\d+\z/) {
+    print "UNKNOWN - invalid number_of_processes '$proc_num'\n";
+    exit $ERRORS{'UNKNOWN'};
+}
+
+# List-form pipe open: pidof is executed directly, so the process name is
+# never parsed by a shell.
+my $pids = '';
+if (open(my $pidof, '-|', 'pidof', '-x', $process_name)) {
+    local $/;                       # read the whole output at once
+    my $output = <$pidof>;
+    $pids = $output if defined $output;
+    close($pidof);
+}
+chomp($pids);
+
+#print "pids = $pids\n";
+my @pid_list = split(' ',$pids);
+
+my $run_proc_num = scalar(@pid_list); #number of running processes with name $process_name
+
+# nothing found at all
+if ($run_proc_num == 0) {
+    print "$status_level - $process_name is not running!\n";
+    exit $ERRORS{$status_level};
+}
+
+# exactly the expected number of processes
+if ($proc_num == $run_proc_num) {
+    print "OK - pid of $process_name is $pids\n";
+    exit $ERRORS{'OK'};
+}
+
+# running, but not in the expected number
+my $problem = ($proc_num > $run_proc_num) ? "too few processes!" : "too many processes!";
+print "$status_level - pid of $process_name is $pids, $problem\n";
+exit $ERRORS{$status_level};
+
+
+
+
+
diff --git a/plugins/my_epics.sh b/plugins/my_epics.sh
new file mode 100755
index 0000000..ec68089
--- /dev/null
+++ b/plugins/my_epics.sh
@@ -0,0 +1,314 @@
+#!/bin/sh
+#
+##############################################################################
+##############################################################################
+## Nagios plugin to check EPICS PV Status ##
+##############################################################################
+##############################################################################
+#
+# Script to retrieve EPICS PV Name status using the "caget" command.
+# Written by Mauro Giacchini (mauro.giacchini@lnl.infn.it)
+# Last Modified: 17-11-2007
+#
+# Usage: ./check_caget.sh -pv
+#
+# Description:
+# This script uses caget command to retrieve the PV status.
+#
+# Limitations:
+# This script has been tested on Linux Fedora Core 6.
+#
+# Output:
+# The output contains the "te" time elapsed
+# calculated like a difference from PV's
+# timestamp and the linux "date" command (suggestion: use ntp common server
+# to IOCs and Nagios server box). The STATUS of the service (..of the PV)
+# follow the severity rules:
+#
+# Severity (none) >>>> STATE_OK # OK = green
+#
+# Severity MINOR >>>> STATE_WARNING # WARNING = yellow
+#
+# Severity MAJOR >>>> STATE_CRITICAL # CRITICAL = red
+#
+# PV not found >>>> STATE_UNKNOWN # UNKNOWNN = orange
+#
+# In case of Severity (none) it show the stdout of
+# "caget -a" with appended the "te".
+#
+# Other notes:
+# Firefox Plugin : A FireFox extension is avilable to monitor Nagios server.
+# https://addons.mozilla.org/it/firefox/addon/3607 +# +# Nagios configuration setup: +# You need to add the command to commands.cfg +# +# define command{ +# command_name check_caget +# command_line $USER1$/check_caget.sh -pv $ARG1$ +# } +# +# And, you need to add the service to services.cfg +# +# define service{ +# use generic-service ; +# host_name IOC_Example ; +# service_description aiExample ; +# is_volatile 0 ; +# check_period 24x7 ; +# max_check_attempts 3 ; +# normal_check_interval 3 ; +# retry_check_interval 1 ; +# contact_groups admins ; +# notification_interval 120 ; +# notification_period 24x7 ; +# notification_options w,u,c,r ; +# check_command check_caget!rootHost:aiExample ; +# } +# +# then place this script in the /usr/lib/nagios/plugins/ +# on the Nagios box server. +# Don't forget to set the right execution permission to this file. +# +# Threshold and ranges: please, have a look at: +# http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT +# +# Last: This script still needs debugging and fixups (exercise for reader) :-) +# +############################################################################## +# DEBUGGING OPTION +# This option determines whether or not debugging messages are showed +# Values: 0=debugging off, 1=debugging on + +DEBUG="0" + + +############################################################################## +# CAGET LOCATION +# This option determines where the caget executable is located. 
+# The default /usr/bin/caget should be made with a symbolic link +# made by root (i.e.): ln -s /opt/epics/base-3.14.9/bin/linux-x86/caget /usr/bin/caget + + +CAGET_LOCATION=/home/scs/epics/base-3.14.9/bin/linux-x86/caget + + +############################################################################## +# Script exit status + +STATE_OK=0 # OK = green + +STATE_WARNING=1 # WARNING = yellow + +STATE_CRITICAL=2 # CRITICAL = red + +STATE_UNKNOWN=3 # UNKNOWNN = orange + +VERSION="v1.3" + +############################################################################## +# print_revision() function + +print_revision (){ + + echo "Check_caget (nagios-plugins 1.4 to nagios 2.9) (EPICS base 3.14.9) $VERSION" +} + +############################################################################## +# print_usage() function + +print_usage() { + + echo "" + echo "Usage: check_caget_dev_gw -pv " + echo "Usage: check_caget_dev_gw -pv -H " + echo "Usage: check_caget_dev_gw -pv -p " + echo "Usage: check_caget_dev_gw -pv -expval " + echo "Usage: check_caget_dev_gw [-h] [--help]" + echo "Usage: check_caget_dev_gw [-V]" + echo "" +} + +##################################################################################### +# print_help() function + +print_help() { + echo "" + print_usage + echo "" + echo "Script to retrieve the PV status for EPICS control systems." + echo "" + echo "This plugin not developped by the Nagios Plugin group." + echo "Please do not e-mail them for support on this plugin, since" + echo "they won't know what you're talking about :P" + echo "" + echo "For contact info: mauro.giacchini@lnl.infn.it" + echo "Download : http://www.lnl.infn.it/~epics/" + echo "" +} + +############################################################################## +# Check the caget presence. + + +verify_caget_presence() { + + +if ! type $CAGET_LOCATION >/dev/null 2>&1; then + + echo "STATUS CRITICAL: caget not found (Did you set up the rigth one Nagios USERn? 
_or_ caget not found!)" + exit $STATE_CRITICAL +fi +} + + +############################################################################## +# Control caget plugin input parameters + +EXPVAL="" +EPICS_CA_ADDR_LIST="" # Default YES +EPICS_CA_SERVER_PORT="" # Default 5064 _and_ value > 5000 +EPICS_CA_SERVER_PORT_MIN="5000" + +while test -n "$1"; do + + case "$1" in + + --help) + print_help + exit $STATE_OK + ;; + + -h) + print_help + exit $STATE_OK + ;; + + -V) + print_revision + exit $STATE_OK + ;; + + -pv) + PVNAME=$2 + shift + ;; + + -expval) + EXPVAL=$2 + if [ -z $EXPVAL ]; then + echo "STATUS CRITICAL: Expected value absent" + exit $STATE_CRITICAL + fi + shift + ;; + + -H) + EPICS_CA_ADDR_LIST=$2 + if [ -z $EPICS_CA_ADDR_LIST ]; then + echo "STATUS CRITICAL: Expected EPICS_CA_ADDR_LIST absent" + exit $STATE_CRITICAL + fi + export EPICS_CA_ADDR_LIST + EPICS_CA_AUTO_ADDR_LIST="NO" + export EPICS_CA_AUTO_ADDR_LIST + shift + ;; + + -p) + EPICS_CA_SERVER_PORT=$2 + if [ -z $EPICS_CA_SERVER_PORT ]; then + echo "STATUS CRITICAL: Expected EPICS_CA_SERVER_PORT absent" + exit $STATE_CRITICAL + fi + if [ $EPICS_CA_SERVER_PORT -le $EPICS_CA_SERVER_PORT_MIN ]; then + echo "STATUS CRITICAL: Expected EPICS_CA_SERVER_PORT minor than allowed (5001)" + exit $STATE_CRITICAL + fi + export EPICS_CA_SERVER_PORT + shift + ;; + + *) + echo "" + echo "Unknow argument: $1" + print_usage + exit $STATE_UNKNOWN + ;; + +esac +shift +done + + +verify_caget_presence + +if [ -z $PVNAME ]; then + + echo "STATUS CRITICAL: PV Name not specified" + exit $STATE_CRITICAL +fi + +##################################################################################### +# FINALLY... RETRIEVING THE VALUES (caget) + + +#CAGET_REPLY=`caget -a $PVNAME` +CAGET_REPLY=`$CAGET_LOCATION -a $PVNAME` + +IFS=" " +read pvname date time value status severity<