Add periodic background snapshot reconcile

Interval is configurable with new etcd-snapshot-reconcile-interval flag

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
This commit is contained in:
Brad Davidson 2025-03-04 23:05:47 +00:00 committed by Brad Davidson
parent bed1f66880
commit d694dd1db9
6 changed files with 57 additions and 33 deletions

View file

@ -92,6 +92,7 @@ type Server struct {
EtcdExposeMetrics bool
EtcdSnapshotDir string
EtcdSnapshotCron string
EtcdSnapshotReconcile time.Duration
EtcdSnapshotRetention int
EtcdSnapshotCompress bool
EtcdListFormat string
@ -390,6 +391,12 @@ var ServerFlags = []cli.Flag{
Destination: &ServerConfig.EtcdSnapshotCron,
Value: "0 */12 * * *",
},
&cli.DurationFlag{
Name: "etcd-snapshot-reconcile-interval",
Usage: "(db) Snapshot reconcile interval",
Destination: &ServerConfig.EtcdSnapshotReconcile,
Value: 10 * time.Minute,
},
&cli.IntFlag{
Name: "etcd-snapshot-retention",
Usage: "(db) Number of snapshots to retain",

View file

@ -184,12 +184,19 @@ func run(app *cli.Context, cfg *cmds.Server, leaderControllers server.CustomCont
serverConfig.ControlConfig.VModule = cmds.LogConfig.VModule
if !cfg.EtcdDisableSnapshots || cfg.ClusterReset {
if cfg.EtcdSnapshotReconcile <= 0 {
return errors.New("etcd-snapshot-reconcile-interval must be greater than 0s")
}
serverConfig.ControlConfig.EtcdSnapshotCompress = cfg.EtcdSnapshotCompress
serverConfig.ControlConfig.EtcdSnapshotName = cfg.EtcdSnapshotName
serverConfig.ControlConfig.EtcdSnapshotCron = cfg.EtcdSnapshotCron
serverConfig.ControlConfig.EtcdSnapshotDir = cfg.EtcdSnapshotDir
serverConfig.ControlConfig.EtcdSnapshotReconcile = metav1.Duration{Duration: cfg.EtcdSnapshotReconcile}
serverConfig.ControlConfig.EtcdSnapshotRetention = cfg.EtcdSnapshotRetention
if cfg.EtcdS3 {
if cfg.EtcdS3Timeout <= 0 {
return errors.New("etcd-s3-timeout must be greater than 0s")
}
serverConfig.ControlConfig.EtcdS3 = &config.EtcdS3{
AccessKey: cfg.EtcdS3AccessKey,
Bucket: cfg.EtcdS3BucketName,

View file

@ -3,7 +3,6 @@ package cluster
import (
"context"
"net/url"
"runtime"
"strings"
"time"
@ -44,53 +43,61 @@ func (c *Cluster) Start(ctx context.Context) (<-chan struct{}, error) {
return ready, nil
}
// start managed database (if necessary)
// start managed etcd database; when kine is in use this is a no-op.
if err := c.start(ctx); err != nil {
return nil, pkgerrors.WithMessage(err, "start managed database")
}
// get the wait channel for testing managed database readiness
ready, err := c.testClusterDB(ctx)
if err != nil {
return nil, err
}
// get the wait channel for testing etcd server readiness; when kine is in
// use the channel is closed immediately.
ready := c.testClusterDB(ctx)
// set c.config.Datastore and c.config.Runtime.EtcdConfig with values
// necessary to build etcd clients, and start kine listener if necessary.
if err := c.startStorage(ctx, false); err != nil {
return nil, err
}
// if necessary, store bootstrap data to datastore
// if necessary, store bootstrap data to datastore. saveBootstrap is only set
// when using kine, so this can be done before the ready channel has been closed.
if c.saveBootstrap {
if err := Save(ctx, c.config, false); err != nil {
return nil, err
}
}
// at this point, if etcd is in use, its bootstrapping is complete,
// so save the bootstrap data. This requires etcd to be up. If
// the save call returns an error, we panic since subsequent etcd
// snapshots will be empty.
if c.managedDB != nil {
go func() {
for {
select {
case <-ready:
// always save to managed etcd, to ensure that any files modified locally are in sync with the datastore.
// this will panic if multiple keys exist, to prevent nodes from running with different bootstrap data.
if err := Save(ctx, c.config, false); err != nil {
panic(err)
}
if !c.config.EtcdDisableSnapshots {
_ = wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) {
err := c.managedDB.ReconcileSnapshotData(ctx)
if err != nil {
// do an initial reconcile of snapshots with a fast retry until it succeeds
wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) {
if err := c.managedDB.ReconcileSnapshotData(ctx); err != nil {
logrus.Errorf("Failed to record snapshots for cluster: %v", err)
return false, nil
}
return true, nil
})
// continue reconciling snapshots in the background at the configured interval.
// each interval is lengthened by a random amount of up to 5% to avoid all nodes reconciling at the same time.
wait.JitterUntilWithContext(ctx, func(ctx context.Context) {
if err := c.managedDB.ReconcileSnapshotData(ctx); err != nil {
logrus.Errorf("Failed to record snapshots for cluster: %v", err)
}
return err == nil, nil
})
}, c.config.EtcdSnapshotReconcile.Duration, 0.05, false)
}
return
default:
runtime.Gosched()
case <-ctx.Done():
return
}
}
}()

View file

@ -25,11 +25,11 @@ import (
// testClusterDB returns a channel that will be closed when the datastore connection is available.
// The datastore is tested for readiness every 5 seconds until the test succeeds.
func (c *Cluster) testClusterDB(ctx context.Context) (<-chan struct{}, error) {
func (c *Cluster) testClusterDB(ctx context.Context) <-chan struct{} {
result := make(chan struct{})
if c.managedDB == nil {
close(result)
return result, nil
return result
}
go func() {
@ -50,7 +50,7 @@ func (c *Cluster) testClusterDB(ctx context.Context) (<-chan struct{}, error) {
}
}()
return result, nil
return result
}
// start starts the database, unless a cluster reset has been requested, in which case

View file

@ -235,17 +235,18 @@ type Control struct {
ClusterResetRestorePath string
MinTLSVersion string
CipherSuites []string
TLSMinVersion uint16 `json:"-"`
TLSCipherSuites []uint16 `json:"-"`
EtcdSnapshotName string `json:"-"`
EtcdDisableSnapshots bool `json:"-"`
EtcdExposeMetrics bool `json:"-"`
EtcdSnapshotDir string `json:"-"`
EtcdSnapshotCron string `json:"-"`
EtcdSnapshotRetention int `json:"-"`
EtcdSnapshotCompress bool `json:"-"`
EtcdListFormat string `json:"-"`
EtcdS3 *EtcdS3 `json:"-"`
TLSMinVersion uint16 `json:"-"`
TLSCipherSuites []uint16 `json:"-"`
EtcdSnapshotName string `json:"-"`
EtcdDisableSnapshots bool `json:"-"`
EtcdExposeMetrics bool `json:"-"`
EtcdSnapshotDir string `json:"-"`
EtcdSnapshotCron string `json:"-"`
EtcdSnapshotReconcile metav1.Duration `json:"-"`
EtcdSnapshotRetention int `json:"-"`
EtcdSnapshotCompress bool `json:"-"`
EtcdListFormat string `json:"-"`
EtcdS3 *EtcdS3 `json:"-"`
ServerNodeName string
VLevel int
VModule string

View file

@ -31,6 +31,7 @@ import (
healthpb "google.golang.org/grpc/health/grpc_health_v1"
"google.golang.org/grpc/reflection"
"google.golang.org/grpc/status"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilnet "k8s.io/apimachinery/pkg/util/net"
"k8s.io/apimachinery/pkg/util/wait"
)
@ -67,6 +68,7 @@ func generateTestConfig() *config.Control {
DataDir: "/tmp/k3s/", // Different than the default value
EtcdSnapshotName: "etcd-snapshot",
EtcdSnapshotCron: "0 */12 * * *",
EtcdSnapshotReconcile: metav1.Duration{Duration: 10 * time.Minute},
EtcdSnapshotRetention: 5,
EtcdS3: &config.EtcdS3{
Endpoint: "s3.amazonaws.com",