Skip to content

Commit 8aabac6

Browse files
add backoff retry on service check (#2262)
* add backoff retry on service check to allow time to pass if the network is not set up yet * - push errors directly to zui after running each check - move the retrying to an upper level so the whole check is retried, including the push to zui * cleaner err check * run each health check in a separate goroutine
1 parent 40a1746 commit 8aabac6

1 file changed

Lines changed: 41 additions & 13 deletions

File tree

pkg/perf/healthcheck/healthcheck.go

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@ import (
44
"context"
55
"fmt"
66
"os"
7+
"sync"
8+
"time"
79

10+
"github.com/cenkalti/backoff"
811
"github.com/rs/zerolog/log"
912
"github.com/threefoldtech/zos/pkg/app"
1013
"github.com/threefoldtech/zos/pkg/perf"
@@ -60,24 +63,49 @@ func (h *healthcheckTask) Run(ctx context.Context) (interface{}, error) {
6063
log.Debug().Msg("starting health check task")
6164
errs := make(map[string][]string)
6265

66+
cl := perf.GetZbusClient(ctx)
67+
zui := stubs.NewZUIStub(cl)
68+
69+
var wg sync.WaitGroup
70+
var mut sync.Mutex
6371
for label, check := range h.checks {
64-
errors := check(ctx)
65-
if len(errors) == 0 {
66-
continue
67-
}
72+
wg.Add(1)
6873

69-
errs[label] = errorsToStrings(errors)
70-
}
74+
go func(label string, check checkFunc) {
75+
defer wg.Done()
7176

72-
cl := perf.GetZbusClient(ctx)
73-
zui := stubs.NewZUIStub(cl)
77+
op := func() error {
78+
errors := check(ctx)
7479

75-
for label := range h.checks {
76-
err := zui.PushErrors(ctx, label, errs[label])
77-
if err != nil {
78-
return nil, err
79-
}
80+
mut.Lock()
81+
defer mut.Unlock()
82+
errs[label] = errorsToStrings(errors)
83+
84+
if err := zui.PushErrors(ctx, label, errs[label]); err != nil {
85+
return err
86+
}
87+
88+
if len(errors) != 0 {
89+
return fmt.Errorf("failed health check")
90+
}
91+
92+
return nil
93+
}
94+
95+
notify := func(err error, t time.Duration) {
96+
log.Error().Err(err).Str("check", label).Dur("retry-in", t).Msg("failed health check. retrying")
97+
}
98+
99+
bo := backoff.NewExponentialBackOff()
100+
bo.InitialInterval = 30 * time.Second
101+
bo.MaxInterval = 30 * time.Second
102+
bo.MaxElapsedTime = 10 * time.Minute
103+
104+
_ = backoff.RetryNotify(op, bo, notify)
105+
}(label, check)
80106
}
107+
wg.Wait()
108+
81109
return errs, nil
82110
}
83111

0 commit comments

Comments
 (0)