HA: after heartbeat expiry, stop writing to database and hand over

If Icinga DB cannot write the heartbeat through to the database within its
validity period, it cannot signal to other instances that it is still alive,
so it has to hand over. There is also no point in retrying this individual
heartbeat any longer.
This commit is contained in:
Julian Brost 2021-09-24 17:08:01 +02:00
parent a34aef4fc5
commit 239d2ea410

View file

@ -146,9 +146,18 @@ func (h *HA) controller() {
shouldLog = true
default:
}
if err = h.realize(s, t, shouldLog); err != nil {
realizeCtx, cancelRealizeCtx := context.WithDeadline(h.ctx, m.ExpiryTime())
err = h.realize(realizeCtx, s, t, shouldLog)
cancelRealizeCtx()
if errors.Is(err, context.DeadlineExceeded) {
h.signalHandover()
continue
}
if err != nil {
h.abort(err)
}
if !oldInstancesRemoved {
go h.removeOldInstances(s)
oldInstancesRemoved = true
@ -169,13 +178,13 @@ func (h *HA) controller() {
}
}
func (h *HA) realize(s *icingaredisv1.IcingaStatus, t *types.UnixMilli, shouldLog bool) error {
func (h *HA) realize(ctx context.Context, s *icingaredisv1.IcingaStatus, t *types.UnixMilli, shouldLog bool) error {
boff := backoff.NewExponentialWithJitter(time.Millisecond*256, time.Second*3)
for attempt := 0; true; attempt++ {
sleep := boff(uint64(attempt))
time.Sleep(sleep)
ctx, cancelCtx := context.WithCancel(h.ctx)
ctx, cancelCtx := context.WithCancel(ctx)
tx, err := h.db.BeginTxx(ctx, &sql.TxOptions{
Isolation: sql.LevelSerializable,
})