From 6b06b5e0853973063fdb93503e9e3e2b1408cd00 Mon Sep 17 00:00:00 2001 From: hadaq Date: Mon, 13 Feb 2012 12:36:43 +0000 Subject: [PATCH] JAM: Added eventhandler for runinfo2oracle and epics ioc restart --- icinga/objects/commands.cfg | 16 ++++++++++++++++ icinga/objects/hosts_eb_servers.cfg | 12 +++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/icinga/objects/commands.cfg b/icinga/objects/commands.cfg index 21b54b4..0f8e9a2 100644 --- a/icinga/objects/commands.cfg +++ b/icinga/objects/commands.cfg @@ -308,9 +308,25 @@ define command { command_line $USER1$/check_by_ssh -l $ARG1$ -H $HOSTADDRESS$ -C "/home/hadaq/nagios/plugins/my_epics.sh -pv $ARG2$ -H 192.168.103.255 -expval $ARG3$" } +# restart any process by name if state is critical after 3 attempts define command { command_name restart_process command_line $USER1$/my_restart_handler.pl -r -m $ARG1$@$HOSTADDRESS$ -s $ARG2$ -a $ARG3$ -x $SERVICESTATE$ -y $SERVICESTATETYPE$ -z $SERVICEATTEMPT$ } +# restart eventbuilder epics iocs if state is critical after 3 attempts +define command { + command_name restart_eb_epics + command_line $USER1$/my_restart_handler.pl -m hadaq@lxhadesdaq -s /home/hadaq/trbsoft/daq/evtbuild/start_eb_gbe.pl -a "-i start -n 1-16" -x $SERVICESTATE$ -y $SERVICESTATETYPE$ -z $SERVICEATTEMPT$ +} + +# restart runinfo2ora oracle export processes if state is critical after 3 attempts +define command { + command_name restart_run2oracle + command_line $USER1$/my_restart_handler.pl -m hadaq@lxhadesdaq -s /home/hadaq/trbsoft/daq/oracle/runinfo2orastart_parallel.sh -a "" -x $SERVICESTATE$ -y $SERVICESTATETYPE$ -z $SERVICEATTEMPT$ +} + + + + diff --git a/icinga/objects/hosts_eb_servers.cfg b/icinga/objects/hosts_eb_servers.cfg index 2f875fd..5862f9b 100644 --- a/icinga/objects/hosts_eb_servers.cfg +++ b/icinga/objects/hosts_eb_servers.cfg @@ -136,6 +136,7 @@ define service{ # } # runinfo2ora with multiple processes: +# if not all 16 processes are there, the event handler restarts everything. 
define service{ use remote-service @@ -145,7 +146,8 @@ define service{ retry_check_interval 1 notification_interval 30 notification_options c,r - check_command check_multi_proc_by_ssh!hadaq!runinfo2ora.pl!16!WARNING! + check_command check_multi_proc_by_ssh!hadaq!runinfo2ora.pl!16!CRITICAL! + event_handler restart_run2oracle } @@ -265,7 +267,8 @@ define service{ ####### here check iocs for eventbuilders: - +# there must be exactly 4 SCREEN processes on any active EB server +# if not, the event handler restarts _all_ iocs on all machines define service{ use remote-service hostgroup_name eb-servers-active @@ -274,12 +277,15 @@ define service{ retry_check_interval 1 notification_interval 30 notification_options c,w,r - check_command check_multi_proc_by_ssh!hadaq!SCREEN!4!WARNING! + check_command check_multi_proc_by_ssh!hadaq!SCREEN!4!CRITICAL! + event_handler restart_eb_epics } ## direct check of running iocs with fine granularity: +# note: these will give critical if ioc is there, but eb process is not +# these will give unknown if ioc is not available -- 2.43.0