From 239d2ea41067db00465fbff85909949444a42cdb Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Fri, 24 Sep 2021 17:08:01 +0200 Subject: [PATCH] HA: after heartbeat expiry, stop writing to database and hand over If it's not possible for Icinga DB to write through the heartbeat within its validity period it cannot signal to other instances that it is still alive, so it has to hand over. There's also no point in retrying for this individual heartbeat any longer. --- pkg/icingadb/ha.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pkg/icingadb/ha.go b/pkg/icingadb/ha.go index 64f7c8ff..3d3daa72 100644 --- a/pkg/icingadb/ha.go +++ b/pkg/icingadb/ha.go @@ -146,9 +146,18 @@ func (h *HA) controller() { shouldLog = true default: } - if err = h.realize(s, t, shouldLog); err != nil { + + realizeCtx, cancelRealizeCtx := context.WithDeadline(h.ctx, m.ExpiryTime()) + err = h.realize(realizeCtx, s, t, shouldLog) + cancelRealizeCtx() + if errors.Is(err, context.DeadlineExceeded) { + h.signalHandover() + continue + } + if err != nil { h.abort(err) } + if !oldInstancesRemoved { go h.removeOldInstances(s) oldInstancesRemoved = true @@ -169,13 +178,13 @@ func (h *HA) controller() { } } -func (h *HA) realize(s *icingaredisv1.IcingaStatus, t *types.UnixMilli, shouldLog bool) error { +func (h *HA) realize(ctx context.Context, s *icingaredisv1.IcingaStatus, t *types.UnixMilli, shouldLog bool) error { boff := backoff.NewExponentialWithJitter(time.Millisecond*256, time.Second*3) for attempt := 0; true; attempt++ { sleep := boff(uint64(attempt)) time.Sleep(sleep) - ctx, cancelCtx := context.WithCancel(h.ctx) + ctx, cancelCtx := context.WithCancel(ctx) tx, err := h.db.BeginTxx(ctx, &sql.TxOptions{ Isolation: sql.LevelSerializable, })