From 75e62f9447ed6b70fc90bffc067a0fbbeb5440d3 Mon Sep 17 00:00:00 2001 From: hadaq Date: Tue, 9 Aug 2011 11:33:20 +0000 Subject: [PATCH] Bugfix in eventbuilding: when master stream had lost a trigger sequence number (possibly by UDP layer), eventbuilder could not recover subevent synchronization -> increase of discarded events until regular program exit. This was observed during the Aug 2011 beamtime several times per day with cosmic and test runs. Workaround with a delayed pop of master stream queue. JAM (Joern Adamczewski-Musch) --- hadaq/evtbuild.c | 52 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/hadaq/evtbuild.c b/hadaq/evtbuild.c index 5fb2bf7..9de629c 100644 --- a/hadaq/evtbuild.c +++ b/hadaq/evtbuild.c @@ -1,8 +1,8 @@ -static char *rcsId = "$Header: /misc/hadesprojects/daq/cvsroot/eventbuilder/hadaq/evtbuild.c,v 6.147 2011-08-05 08:31:39 hadaq Exp $"; +static char *rcsId = "$Header: /misc/hadesprojects/daq/cvsroot/eventbuilder/hadaq/evtbuild.c,v 6.148 2011-08-09 11:33:20 hadaq Exp $"; #define _POSIX_C_SOURCE 199309L #define SYSLOG_NAMES -/* #define RFIO */ +#define RFIO #include #include @@ -489,6 +489,8 @@ static int openFile(TheArgs *theArgs, TheStats *theStats) static char outLustrePath[_POSIX_PATH_MAX]; static char sec_path[_POSIX_PATH_MAX]; static once = 1; + /*static unsigned long old_runNr=0; + static unsigned open_count=0; */ seqNr = 0; @@ -517,6 +519,26 @@ static int openFile(TheArgs *theArgs, TheStats *theStats) iocTime = theArgs->runNr + TIMEOFFSET; strftime(fileName + strlen(fileName), 18, "%y%j%H%M%S", localtime(&iocTime)); + + + /* JAM: check here if new run id from epics is our old run id + * this means we closed our file because of reaching file size limit + * before master ioc assigned new run id. In this case, we need to modify file name! + * Still disabled to ensure old naming convention for analysis scripts...
+ * */ + /*if(old_runNr!=0 && (theArgs->runNr > old_runNr)) + { + open_count=0; we have a real new run nr, reset file count for this run nr + } + else if(old_runNr!=0 && (theArgs->runNr ==old_runNr)) + { + sprintf(msglog, " openFile: begin next file %d for run id: %s", open_count,fileName); + storeLogInfo(theArgs, msglog); + } + sprintf(fileName, "%s_%02d_", fileName, open_count++); file name is always run id + file count + */ + + /* if ebnum == 0 then we assume that there is only 1 EB, * RUNID distribution by IOC is not needed and file name * can be in the usual format without EB identificator. @@ -860,6 +882,8 @@ int main(int argc, char *argv[]) uint32_t currTrigTag; uint32_t currId; + uint32_t failTrigNr = 0; + signal(SIGINT, sigHandler); signal(SIGTERM, sigHandler); signal(SIGHUP, sigHandler); @@ -1080,6 +1104,7 @@ int main(int argc, char *argv[]) unsigned long popCnt = 0; currId = 0; + int popMaster = 0; while (setjmp(terminateJmp) == 0) { void *evt; @@ -1157,7 +1182,7 @@ int main(int argc, char *argv[]) evt = newEvt(EvtDecoding_64bitAligned, EvtId_data); popCnt = 0; /* Counter of popped up subevents */ - + popMaster = 1; /* flag to pop subevent from master stream i=0 */ for (i = 0; i < theArgs->nrOfMsgs && !evtIsBroken; i += step) { uint32_t trigNr; uint32_t trigTag; @@ -1245,8 +1270,9 @@ int main(int argc, char *argv[]) evt = Evt_appendSubEvt(evt, subEvt); } - - HadTuQueue_pop(hadTuQueue[i]); + if (i > 0) { + HadTuQueue_pop(hadTuQueue[i]); /* delay pop of master stream until broken checked */ + } step = 1; } else if (trigNr < currTrigNr) { if (theArgs->debugOptsCnt) { @@ -1257,7 +1283,8 @@ int main(int argc, char *argv[]) /* BUGBUG subevt discarded, not in statistic */ HadTuQueue_pop(hadTuQueue[i]); step = 0; - + popMaster = 0; /* do not pop subevent of master channel in this case. 
needed for next cycle */ + failTrigNr = trigNr; /* remember skipped trigger number for debug output */ popCnt++; if (popCnt > 10000) { if (theArgs->debugOptsCnt) @@ -1278,7 +1305,20 @@ int main(int argc, char *argv[]) ShmTrans_free(shmTrans[i]); } } + + /* New JAM: treat case that master stream has lost a packet */ + if (popMaster) { + /* Regular case: delayed pop of last master stream entry after check ok */ + HadTuQueue_pop(hadTuQueue[0]); + } else { + /* display error and keep last event in hope for later synchronization */ + sprintf(msglog, " Master message stream has missed trigger sequence nr:%d. Kept subevent nr %d in master queue", + failTrigNr, currTrigNr); + storeLogInfo(theArgs, msglog); + } + if (!evtIsBroken) { + if (theArgs->debugOptsCnt) { if (evtBrokenFlag) { Debug_print(theArgs, theStats, theDebug); -- 2.43.0