From d694dd1db91cf8016d033b24ac69c4daf01223e1 Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Tue, 4 Mar 2025 23:05:47 +0000 Subject: [PATCH] Add periodic background snapshot reconcile Interval is configurable with new etcd-snapshot-reconcile-interval flag Signed-off-by: Brad Davidson --- pkg/cli/cmds/server.go | 7 ++++++ pkg/cli/server/server.go | 7 ++++++ pkg/cluster/cluster.go | 45 +++++++++++++++++++++---------------- pkg/cluster/managed.go | 6 ++--- pkg/daemons/config/types.go | 23 ++++++++++--------- pkg/etcd/etcd_linux_test.go | 2 ++ 6 files changed, 57 insertions(+), 33 deletions(-) diff --git a/pkg/cli/cmds/server.go b/pkg/cli/cmds/server.go index b30cb6ff324..37fcb131ced 100644 --- a/pkg/cli/cmds/server.go +++ b/pkg/cli/cmds/server.go @@ -92,6 +92,7 @@ type Server struct { EtcdExposeMetrics bool EtcdSnapshotDir string EtcdSnapshotCron string + EtcdSnapshotReconcile time.Duration EtcdSnapshotRetention int EtcdSnapshotCompress bool EtcdListFormat string @@ -390,6 +391,12 @@ var ServerFlags = []cli.Flag{ Destination: &ServerConfig.EtcdSnapshotCron, Value: "0 */12 * * *", }, + &cli.DurationFlag{ + Name: "etcd-snapshot-reconcile-interval", + Usage: "(db) Snapshot reconcile interval", + Destination: &ServerConfig.EtcdSnapshotReconcile, + Value: 10 * time.Minute, + }, &cli.IntFlag{ Name: "etcd-snapshot-retention", Usage: "(db) Number of snapshots to retain", diff --git a/pkg/cli/server/server.go b/pkg/cli/server/server.go index 652b42727f9..5d6af3aea07 100644 --- a/pkg/cli/server/server.go +++ b/pkg/cli/server/server.go @@ -184,12 +184,19 @@ func run(app *cli.Context, cfg *cmds.Server, leaderControllers server.CustomCont serverConfig.ControlConfig.VModule = cmds.LogConfig.VModule if !cfg.EtcdDisableSnapshots || cfg.ClusterReset { + if cfg.EtcdSnapshotReconcile <= 0 { + return errors.New("etcd-snapshot-reconcile-interval must be greater than 0s") + } serverConfig.ControlConfig.EtcdSnapshotCompress = cfg.EtcdSnapshotCompress serverConfig.ControlConfig.EtcdSnapshotName = cfg.EtcdSnapshotName serverConfig.ControlConfig.EtcdSnapshotCron = cfg.EtcdSnapshotCron serverConfig.ControlConfig.EtcdSnapshotDir = cfg.EtcdSnapshotDir + serverConfig.ControlConfig.EtcdSnapshotReconcile = metav1.Duration{Duration: cfg.EtcdSnapshotReconcile} serverConfig.ControlConfig.EtcdSnapshotRetention = cfg.EtcdSnapshotRetention if cfg.EtcdS3 { + if cfg.EtcdS3Timeout <= 0 { + return errors.New("etcd-s3-timeout must be greater than 0s") + } serverConfig.ControlConfig.EtcdS3 = &config.EtcdS3{ AccessKey: cfg.EtcdS3AccessKey, Bucket: cfg.EtcdS3BucketName, diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 96baebc05b0..67c58b92a83 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -3,7 +3,6 @@ package cluster import ( "context" "net/url" - "runtime" "strings" "time" @@ -44,53 +43,61 @@ func (c *Cluster) Start(ctx context.Context) (<-chan struct{}, error) { return ready, nil } - // start managed database (if necessary) + // start managed etcd database; when kine is in use this is a no-op. if err := c.start(ctx); err != nil { return nil, pkgerrors.WithMessage(err, "start managed database") } - // get the wait channel for testing managed database readiness - ready, err := c.testClusterDB(ctx) - if err != nil { - return nil, err - } + // get the wait channel for testing etcd server readiness; when kine is in + // use the channel is closed immediately. + ready := c.testClusterDB(ctx) + // set c.config.Datastore and c.config.Runtime.EtcdConfig with values + // necessary to build etcd clients, and start kine listener if necessary. if err := c.startStorage(ctx, false); err != nil { return nil, err } - // if necessary, store bootstrap data to datastore + // if necessary, store bootstrap data to datastore. saveBootstrap is only set + // when using kine, so this can be done before the ready channel has been closed. if c.saveBootstrap { if err := Save(ctx, c.config, false); err != nil { return nil, err } } - // at this point, if etcd is in use, it's bootstrapping is complete - // so save the bootstrap data. We will need for etcd to be up. If - // the save call returns an error, we panic since subsequent etcd - // snapshots will be empty. if c.managedDB != nil { go func() { for { select { case <-ready: + // always save to managed etcd, to ensure that any file modified locally are in sync with the datastore. + // this will panic if multiple keys exist, to prevent nodes from running with different bootstrap data. if err := Save(ctx, c.config, false); err != nil { panic(err) } if !c.config.EtcdDisableSnapshots { - _ = wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) { - err := c.managedDB.ReconcileSnapshotData(ctx) - if err != nil { + // do an initial reconcile of snapshots with a fast retry until it succeeds + wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) { + if err := c.managedDB.ReconcileSnapshotData(ctx); err != nil { + logrus.Errorf("Failed to record snapshots for cluster: %v", err) + return false, nil + } + return true, nil + }) + + // continue reconciling snapshots in the background at the configured interval. + // the interval is jittered by 5% to avoid all nodes reconciling at the same time. + wait.JitterUntilWithContext(ctx, func(ctx context.Context) { + if err := c.managedDB.ReconcileSnapshotData(ctx); err != nil { logrus.Errorf("Failed to record snapshots for cluster: %v", err) } - return err == nil, nil - }) + }, c.config.EtcdSnapshotReconcile.Duration, 0.05, false) } return - default: - runtime.Gosched() + case <-ctx.Done(): + return } } }() diff --git a/pkg/cluster/managed.go b/pkg/cluster/managed.go index fb289becaf9..79a3d13906a 100644 --- a/pkg/cluster/managed.go +++ b/pkg/cluster/managed.go @@ -25,11 +25,11 @@ import ( // testClusterDB returns a channel that will be closed when the datastore connection is available. // The datastore is tested for readiness every 5 seconds until the test succeeds. -func (c *Cluster) testClusterDB(ctx context.Context) (<-chan struct{}, error) { +func (c *Cluster) testClusterDB(ctx context.Context) <-chan struct{} { result := make(chan struct{}) if c.managedDB == nil { close(result) - return result, nil + return result } go func() { @@ -50,7 +50,7 @@ func (c *Cluster) testClusterDB(ctx context.Context) (<-chan struct{}, error) { } }() - return result, nil + return result } // start starts the database, unless a cluster reset has been requested, in which case diff --git a/pkg/daemons/config/types.go b/pkg/daemons/config/types.go index 1f7c0a2a5f0..2c7976fc732 100644 --- a/pkg/daemons/config/types.go +++ b/pkg/daemons/config/types.go @@ -235,17 +235,18 @@ type Control struct { ClusterResetRestorePath string MinTLSVersion string CipherSuites []string - TLSMinVersion uint16 `json:"-"` - TLSCipherSuites []uint16 `json:"-"` - EtcdSnapshotName string `json:"-"` - EtcdDisableSnapshots bool `json:"-"` - EtcdExposeMetrics bool `json:"-"` - EtcdSnapshotDir string `json:"-"` - EtcdSnapshotCron string `json:"-"` - EtcdSnapshotRetention int `json:"-"` - EtcdSnapshotCompress bool `json:"-"` - EtcdListFormat string `json:"-"` - EtcdS3 *EtcdS3 `json:"-"` + TLSMinVersion uint16 `json:"-"` + TLSCipherSuites []uint16 `json:"-"` + EtcdSnapshotName string `json:"-"` + EtcdDisableSnapshots bool `json:"-"` + EtcdExposeMetrics bool `json:"-"` + EtcdSnapshotDir string `json:"-"` + EtcdSnapshotCron string `json:"-"` + EtcdSnapshotReconcile metav1.Duration `json:"-"` + EtcdSnapshotRetention int `json:"-"` + EtcdSnapshotCompress bool `json:"-"` + EtcdListFormat string `json:"-"` + EtcdS3 *EtcdS3 `json:"-"` ServerNodeName string VLevel int VModule string diff --git a/pkg/etcd/etcd_linux_test.go b/pkg/etcd/etcd_linux_test.go index 737342ff69a..c6f6b6860c1 100644 --- a/pkg/etcd/etcd_linux_test.go +++ b/pkg/etcd/etcd_linux_test.go @@ -31,6 +31,7 @@ import ( healthpb "google.golang.org/grpc/health/grpc_health_v1" "google.golang.org/grpc/reflection" "google.golang.org/grpc/status" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" utilnet "k8s.io/apimachinery/pkg/util/net" "k8s.io/apimachinery/pkg/util/wait" ) @@ -67,6 +68,7 @@ func generateTestConfig() *config.Control { DataDir: "/tmp/k3s/", // Different than the default value EtcdSnapshotName: "etcd-snapshot", EtcdSnapshotCron: "0 */12 * * *", + EtcdSnapshotReconcile: metav1.Duration{Duration: 10 * time.Minute}, EtcdSnapshotRetention: 5, EtcdS3: &config.EtcdS3{ Endpoint: "s3.amazonaws.com",