mirror of
https://github.com/k3s-io/k3s.git
synced 2026-05-28 04:34:19 -04:00
Add periodic background snapshot reconcile
Interval is configurable with new etcd-snapshot-reconcile-interval flag Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
This commit is contained in:
parent
bed1f66880
commit
d694dd1db9
6 changed files with 57 additions and 33 deletions
|
|
@ -92,6 +92,7 @@ type Server struct {
|
|||
EtcdExposeMetrics bool
|
||||
EtcdSnapshotDir string
|
||||
EtcdSnapshotCron string
|
||||
EtcdSnapshotReconcile time.Duration
|
||||
EtcdSnapshotRetention int
|
||||
EtcdSnapshotCompress bool
|
||||
EtcdListFormat string
|
||||
|
|
@ -390,6 +391,12 @@ var ServerFlags = []cli.Flag{
|
|||
Destination: &ServerConfig.EtcdSnapshotCron,
|
||||
Value: "0 */12 * * *",
|
||||
},
|
||||
&cli.DurationFlag{
|
||||
Name: "etcd-snapshot-reconcile-interval",
|
||||
Usage: "(db) Snapshot reconcile interval",
|
||||
Destination: &ServerConfig.EtcdSnapshotReconcile,
|
||||
Value: 10 * time.Minute,
|
||||
},
|
||||
&cli.IntFlag{
|
||||
Name: "etcd-snapshot-retention",
|
||||
Usage: "(db) Number of snapshots to retain",
|
||||
|
|
|
|||
|
|
@ -184,12 +184,19 @@ func run(app *cli.Context, cfg *cmds.Server, leaderControllers server.CustomCont
|
|||
serverConfig.ControlConfig.VModule = cmds.LogConfig.VModule
|
||||
|
||||
if !cfg.EtcdDisableSnapshots || cfg.ClusterReset {
|
||||
if cfg.EtcdSnapshotReconcile <= 0 {
|
||||
return errors.New("etcd-snapshot-reconcile-interval must be greater than 0s")
|
||||
}
|
||||
serverConfig.ControlConfig.EtcdSnapshotCompress = cfg.EtcdSnapshotCompress
|
||||
serverConfig.ControlConfig.EtcdSnapshotName = cfg.EtcdSnapshotName
|
||||
serverConfig.ControlConfig.EtcdSnapshotCron = cfg.EtcdSnapshotCron
|
||||
serverConfig.ControlConfig.EtcdSnapshotDir = cfg.EtcdSnapshotDir
|
||||
serverConfig.ControlConfig.EtcdSnapshotReconcile = metav1.Duration{Duration: cfg.EtcdSnapshotReconcile}
|
||||
serverConfig.ControlConfig.EtcdSnapshotRetention = cfg.EtcdSnapshotRetention
|
||||
if cfg.EtcdS3 {
|
||||
if cfg.EtcdS3Timeout <= 0 {
|
||||
return errors.New("etcd-s3-timeout must be greater than 0s")
|
||||
}
|
||||
serverConfig.ControlConfig.EtcdS3 = &config.EtcdS3{
|
||||
AccessKey: cfg.EtcdS3AccessKey,
|
||||
Bucket: cfg.EtcdS3BucketName,
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@ package cluster
|
|||
import (
|
||||
"context"
|
||||
"net/url"
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
|
|
@ -44,53 +43,61 @@ func (c *Cluster) Start(ctx context.Context) (<-chan struct{}, error) {
|
|||
return ready, nil
|
||||
}
|
||||
|
||||
// start managed database (if necessary)
|
||||
// start managed etcd database; when kine is in use this is a no-op.
|
||||
if err := c.start(ctx); err != nil {
|
||||
return nil, pkgerrors.WithMessage(err, "start managed database")
|
||||
}
|
||||
|
||||
// get the wait channel for testing managed database readiness
|
||||
ready, err := c.testClusterDB(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// get the wait channel for testing etcd server readiness; when kine is in
|
||||
// use the channel is closed immediately.
|
||||
ready := c.testClusterDB(ctx)
|
||||
|
||||
// set c.config.Datastore and c.config.Runtime.EtcdConfig with values
|
||||
// necessary to build etcd clients, and start kine listener if necessary.
|
||||
if err := c.startStorage(ctx, false); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// if necessary, store bootstrap data to datastore
|
||||
// if necessary, store bootstrap data to datastore. saveBootstrap is only set
|
||||
// when using kine, so this can be done before the ready channel has been closed.
|
||||
if c.saveBootstrap {
|
||||
if err := Save(ctx, c.config, false); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// at this point, if etcd is in use, it's bootstrapping is complete
|
||||
// so save the bootstrap data. We will need for etcd to be up. If
|
||||
// the save call returns an error, we panic since subsequent etcd
|
||||
// snapshots will be empty.
|
||||
if c.managedDB != nil {
|
||||
go func() {
|
||||
for {
|
||||
select {
|
||||
case <-ready:
|
||||
// always save to managed etcd, to ensure that any file modified locally are in sync with the datastore.
|
||||
// this will panic if multiple keys exist, to prevent nodes from running with different bootstrap data.
|
||||
if err := Save(ctx, c.config, false); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
if !c.config.EtcdDisableSnapshots {
|
||||
_ = wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) {
|
||||
err := c.managedDB.ReconcileSnapshotData(ctx)
|
||||
if err != nil {
|
||||
// do an initial reconcile of snapshots with a fast retry until it succeeds
|
||||
wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) {
|
||||
if err := c.managedDB.ReconcileSnapshotData(ctx); err != nil {
|
||||
logrus.Errorf("Failed to record snapshots for cluster: %v", err)
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
})
|
||||
|
||||
// continue reconciling snapshots in the background at the configured interval.
|
||||
// the interval is jittered by 5% to avoid all nodes reconciling at the same time.
|
||||
wait.JitterUntilWithContext(ctx, func(ctx context.Context) {
|
||||
if err := c.managedDB.ReconcileSnapshotData(ctx); err != nil {
|
||||
logrus.Errorf("Failed to record snapshots for cluster: %v", err)
|
||||
}
|
||||
return err == nil, nil
|
||||
})
|
||||
}, c.config.EtcdSnapshotReconcile.Duration, 0.05, false)
|
||||
}
|
||||
return
|
||||
default:
|
||||
runtime.Gosched()
|
||||
case <-ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
|
|
|||
|
|
@ -25,11 +25,11 @@ import (
|
|||
|
||||
// testClusterDB returns a channel that will be closed when the datastore connection is available.
|
||||
// The datastore is tested for readiness every 5 seconds until the test succeeds.
|
||||
func (c *Cluster) testClusterDB(ctx context.Context) (<-chan struct{}, error) {
|
||||
func (c *Cluster) testClusterDB(ctx context.Context) <-chan struct{} {
|
||||
result := make(chan struct{})
|
||||
if c.managedDB == nil {
|
||||
close(result)
|
||||
return result, nil
|
||||
return result
|
||||
}
|
||||
|
||||
go func() {
|
||||
|
|
@ -50,7 +50,7 @@ func (c *Cluster) testClusterDB(ctx context.Context) (<-chan struct{}, error) {
|
|||
}
|
||||
}()
|
||||
|
||||
return result, nil
|
||||
return result
|
||||
}
|
||||
|
||||
// start starts the database, unless a cluster reset has been requested, in which case
|
||||
|
|
|
|||
|
|
@ -235,17 +235,18 @@ type Control struct {
|
|||
ClusterResetRestorePath string
|
||||
MinTLSVersion string
|
||||
CipherSuites []string
|
||||
TLSMinVersion uint16 `json:"-"`
|
||||
TLSCipherSuites []uint16 `json:"-"`
|
||||
EtcdSnapshotName string `json:"-"`
|
||||
EtcdDisableSnapshots bool `json:"-"`
|
||||
EtcdExposeMetrics bool `json:"-"`
|
||||
EtcdSnapshotDir string `json:"-"`
|
||||
EtcdSnapshotCron string `json:"-"`
|
||||
EtcdSnapshotRetention int `json:"-"`
|
||||
EtcdSnapshotCompress bool `json:"-"`
|
||||
EtcdListFormat string `json:"-"`
|
||||
EtcdS3 *EtcdS3 `json:"-"`
|
||||
TLSMinVersion uint16 `json:"-"`
|
||||
TLSCipherSuites []uint16 `json:"-"`
|
||||
EtcdSnapshotName string `json:"-"`
|
||||
EtcdDisableSnapshots bool `json:"-"`
|
||||
EtcdExposeMetrics bool `json:"-"`
|
||||
EtcdSnapshotDir string `json:"-"`
|
||||
EtcdSnapshotCron string `json:"-"`
|
||||
EtcdSnapshotReconcile metav1.Duration `json:"-"`
|
||||
EtcdSnapshotRetention int `json:"-"`
|
||||
EtcdSnapshotCompress bool `json:"-"`
|
||||
EtcdListFormat string `json:"-"`
|
||||
EtcdS3 *EtcdS3 `json:"-"`
|
||||
ServerNodeName string
|
||||
VLevel int
|
||||
VModule string
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ import (
|
|||
healthpb "google.golang.org/grpc/health/grpc_health_v1"
|
||||
"google.golang.org/grpc/reflection"
|
||||
"google.golang.org/grpc/status"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
utilnet "k8s.io/apimachinery/pkg/util/net"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
)
|
||||
|
|
@ -67,6 +68,7 @@ func generateTestConfig() *config.Control {
|
|||
DataDir: "/tmp/k3s/", // Different than the default value
|
||||
EtcdSnapshotName: "etcd-snapshot",
|
||||
EtcdSnapshotCron: "0 */12 * * *",
|
||||
EtcdSnapshotReconcile: metav1.Duration{Duration: 10 * time.Minute},
|
||||
EtcdSnapshotRetention: 5,
|
||||
EtcdS3: &config.EtcdS3{
|
||||
Endpoint: "s3.amazonaws.com",
|
||||
|
|
|
|||
Loading…
Reference in a new issue