Merge pull request #18200 from roidelapluie/roidelapluie/retention-validation
Some checks are pending
buf.build / lint and publish (push) Waiting to run
CI / Go tests (push) Waiting to run
CI / More Go tests (push) Waiting to run
CI / Go tests with previous Go version (push) Waiting to run
CI / UI tests (push) Waiting to run
CI / Go tests on Windows (push) Waiting to run
CI / Mixins tests (push) Waiting to run
CI / Compliance testing (push) Waiting to run
CI / Build Prometheus for common architectures (push) Waiting to run
CI / Build Prometheus for all architectures (push) Waiting to run
CI / Report status of build Prometheus for all architectures (push) Blocked by required conditions
CI / Check generated parser (push) Waiting to run
CI / golangci-lint (push) Waiting to run
CI / fuzzing (push) Waiting to run
CI / codeql (push) Waiting to run
CI / Publish main branch artifacts (push) Blocked by required conditions
CI / Publish release artefacts (push) Blocked by required conditions
CI / Publish UI on npm Registry (push) Blocked by required conditions
Scorecards supply-chain security / Scorecards analysis (push) Waiting to run

Multiple fixes in retention configuration
This commit is contained in:
Julien 2026-03-20 12:27:37 +01:00 committed by GitHub
commit 16876bab95
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 173 additions and 48 deletions

View file

@ -675,6 +675,18 @@ func main() {
os.Exit(2)
}
// Set TSDB retention defaults from CLI flags before any config file is loaded.
// This makes CLI flags act as the default when no retention section is present.
cliRetentionDuration := cfg.tsdb.RetentionDuration
cliMaxBytes := cfg.tsdb.MaxBytes
if cliRetentionDuration == 0 && cliMaxBytes == 0 {
cliRetentionDuration = defaultRetentionDuration
}
config.DefaultTSDBRetentionConfig = config.TSDBRetentionConfig{
Time: cliRetentionDuration,
Size: cliMaxBytes,
}
// Throw error for invalid config before starting other components.
var cfgFile *config.Config
if cfgFile, err = config.LoadFile(cfg.configFile, agentMode, promslog.NewNopLogger()); err != nil {
@ -716,21 +728,11 @@ func main() {
logger.Warn("The option --storage.tsdb.block-reload-interval is set to a value less than 1s. Setting it to 1s to avoid overload.")
cfg.tsdb.BlockReloadInterval = model.Duration(1 * time.Second)
}
if cfgFile.StorageConfig.TSDBConfig != nil {
cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
cfg.tsdb.StaleSeriesCompactionThreshold = cfgFile.StorageConfig.TSDBConfig.StaleSeriesCompactionThreshold
if cfgFile.StorageConfig.TSDBConfig.Retention != nil {
if cfgFile.StorageConfig.TSDBConfig.Retention.Time > 0 {
cfg.tsdb.RetentionDuration = cfgFile.StorageConfig.TSDBConfig.Retention.Time
}
if cfgFile.StorageConfig.TSDBConfig.Retention.Size > 0 {
cfg.tsdb.MaxBytes = cfgFile.StorageConfig.TSDBConfig.Retention.Size
}
if cfgFile.StorageConfig.TSDBConfig.Retention.Percentage > 0 {
cfg.tsdb.MaxPercentage = cfgFile.StorageConfig.TSDBConfig.Retention.Percentage
}
}
}
cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
cfg.tsdb.StaleSeriesCompactionThreshold = cfgFile.StorageConfig.TSDBConfig.StaleSeriesCompactionThreshold
cfg.tsdb.RetentionDuration = cfgFile.StorageConfig.TSDBConfig.Retention.Time
cfg.tsdb.MaxBytes = cfgFile.StorageConfig.TSDBConfig.Retention.Size
cfg.tsdb.MaxPercentage = cfgFile.StorageConfig.TSDBConfig.Retention.Percentage
// Set Go runtime parameters before we get too far into initialization.
updateGoGC(cfgFile, logger)
@ -782,11 +784,6 @@ func main() {
cfg.web.RoutePrefix = "/" + strings.Trim(cfg.web.RoutePrefix, "/")
if !agentMode {
if cfg.tsdb.RetentionDuration == 0 && cfg.tsdb.MaxBytes == 0 && cfg.tsdb.MaxPercentage == 0 {
cfg.tsdb.RetentionDuration = defaultRetentionDuration
logger.Info("No time, size or percentage retention was set so using the default time retention", "duration", defaultRetentionDuration)
}
// Check for overflows. This limits our max retention to 100y.
if cfg.tsdb.RetentionDuration < 0 {
y, err := model.ParseDuration("100y")
@ -1035,8 +1032,29 @@ func main() {
reloaders := []reloader{
{
name: "db_storage",
reloader: localStorage.ApplyConfig,
name: "db_storage",
// Wrap localStorage.ApplyConfig so that a change in the TSDB retention
// settings is logged. The function literal is invoked immediately; the
// reloader it returns keeps lastTSDBRetention alive between invocations.
reloader: func() func(*config.Config) error {
// Starts as the zero value, so the first applied config whose retention is
// not all-zero is also reported as an update.
lastTSDBRetention := config.TSDBRetentionConfig{}
return func(cfg *config.Config) error {
err := localStorage.ApplyConfig(cfg)
// Return the ApplyConfig result unchanged (error or nil) when applying
// failed, when running in agent mode, or when no retention config is present.
if err != nil || agentMode || cfg.StorageConfig.TSDBConfig == nil || cfg.StorageConfig.TSDBConfig.Retention == nil {
return err
}
curr := cfg.StorageConfig.TSDBConfig.Retention
// Stay silent when the retention equals the one last logged.
if *curr == lastTSDBRetention {
return nil
}
logger.Info("TSDB retention updated",
"duration", curr.Time,
"size", curr.Size,
"percentage", curr.Percentage,
)
// Remember the logged value (by copy) for the next comparison.
lastTSDBRetention = *curr
return nil
}
}(),
}, {
name: "remote_storage",
reloader: remoteStorage.ApplyConfig,

View file

@ -83,6 +83,13 @@ func Load(s string, logger *slog.Logger) (*Config, error) {
return nil, err
}
// When the config body is empty, UnmarshalYAML is never called, so
// TSDBConfig may still be nil.
if cfg.StorageConfig.TSDBConfig == nil {
retention := DefaultTSDBRetentionConfig
cfg.StorageConfig.TSDBConfig = &TSDBConfig{Retention: &retention}
}
b := labels.NewScratchBuilder(0)
cfg.GlobalConfig.ExternalLabels.Range(func(v labels.Label) {
newV := os.Expand(v.Value, func(s string) string {
@ -276,6 +283,9 @@ var (
// For backwards compatibility.
LabelNamePreserveMultipleUnderscores: true,
}
// DefaultTSDBRetentionConfig is the default TSDB retention configuration.
DefaultTSDBRetentionConfig TSDBRetentionConfig
)
// Config is the top-level configuration for Prometheus's config files.
@ -405,6 +415,13 @@ func (c *Config) UnmarshalYAML(unmarshal func(any) error) error {
c.Runtime = DefaultRuntimeConfig
}
// If no storage.tsdb section is present, TSDBConfig is nil and its
// UnmarshalYAML never runs. Inject the default retention here.
if c.StorageConfig.TSDBConfig == nil {
retention := DefaultTSDBRetentionConfig
c.StorageConfig.TSDBConfig = &TSDBConfig{Retention: &retention}
}
for _, rf := range c.RuleFiles {
if !patRulePath.MatchString(rf) {
return fmt.Errorf("invalid rule file path %q", rf)
@ -1097,6 +1114,22 @@ type TSDBRetentionConfig struct {
Percentage uint `yaml:"percentage,omitempty"`
}
// UnmarshalYAML implements the yaml.Unmarshaler interface.
//
// The receiver is reset to its zero value before decoding, so no previously
// held field value survives the call. Once decoding succeeds, a negative size
// or a percentage above 100 is rejected with an error.
func (t *TSDBRetentionConfig) UnmarshalYAML(unmarshal func(any) error) error {
	// rawRetention has the same fields as TSDBRetentionConfig but none of its
	// methods; this is the usual way to keep the decoder from calling
	// UnmarshalYAML recursively.
	type rawRetention TSDBRetentionConfig

	*t = TSDBRetentionConfig{}
	err := unmarshal((*rawRetention)(t))

	switch {
	case err != nil:
		return err
	case t.Size < 0:
		return fmt.Errorf("'storage.tsdb.retention.size' must be greater than or equal to 0, got %v", t.Size)
	case t.Percentage > 100:
		return fmt.Errorf("'storage.tsdb.retention.percentage' must be in the range [0, 100], got %v", t.Percentage)
	}
	return nil
}
// TSDBConfig configures runtime reloadable configuration options.
type TSDBConfig struct {
// OutOfOrderTimeWindow sets how long back in time an out-of-order sample can be inserted
@ -1127,6 +1160,11 @@ func (t *TSDBConfig) UnmarshalYAML(unmarshal func(any) error) error {
t.OutOfOrderTimeWindow = time.Duration(t.OutOfOrderTimeWindowFlag).Milliseconds()
if t.Retention == nil {
retention := DefaultTSDBRetentionConfig
t.Retention = &retention
}
return nil
}

View file

@ -20,9 +20,10 @@ const ruleFilesConfigFile = "testdata/rules_abs_path.good.yml"
var ruleFilesExpectedConf = &Config{
loaded: true,
GlobalConfig: DefaultGlobalConfig,
Runtime: DefaultRuntimeConfig,
OTLPConfig: DefaultOTLPConfig,
GlobalConfig: DefaultGlobalConfig,
Runtime: DefaultRuntimeConfig,
OTLPConfig: DefaultOTLPConfig,
StorageConfig: StorageConfig{TSDBConfig: &TSDBConfig{Retention: &TSDBRetentionConfig{}}},
RuleFiles: []string{
"testdata/first.rules",
"testdata/rules/second.rules",

View file

@ -2626,6 +2626,22 @@ var expectedErrors = []struct {
filename: "stackit_endpoint.bad.yml",
errMsg: "invalid endpoint",
},
{
filename: "tsdb_retention_time.bad.yml",
errMsg: `not a valid duration string: "-1h"`,
},
{
filename: "tsdb_retention_size.bad.yml",
errMsg: `'storage.tsdb.retention.size' must be greater than or equal to 0`,
},
{
filename: "tsdb_retention_percentage.bad.yml",
errMsg: `'storage.tsdb.retention.percentage' must be in the range [0, 100]`,
},
{
filename: "tsdb_retention_percentage_negative.bad.yml",
errMsg: "cannot unmarshal !!int `-1` into uint",
},
}
func TestBadConfigs(t *testing.T) {
@ -2649,6 +2665,8 @@ func TestEmptyConfig(t *testing.T) {
require.NoError(t, err)
exp := DefaultConfig
exp.loaded = true
retention := DefaultTSDBRetentionConfig
exp.StorageConfig.TSDBConfig = &TSDBConfig{Retention: &retention}
require.Equal(t, exp, *c)
require.Equal(t, 75, c.Runtime.GoGC)
}
@ -2700,6 +2718,10 @@ func TestGlobalConfig(t *testing.T) {
require.NoError(t, err)
exp := DefaultConfig
exp.loaded = true
// TSDBConfig is always injected by Config.UnmarshalYAML even when no
// storage.tsdb section is present, so the expected config must include it.
retention := DefaultTSDBRetentionConfig
exp.StorageConfig.TSDBConfig = &TSDBConfig{Retention: &retention}
require.Equal(t, exp, *c)
})

View file

@ -18,8 +18,9 @@ const ruleFilesConfigFile = "testdata/rules_abs_path_windows.good.yml"
var ruleFilesExpectedConf = &Config{
loaded: true,
GlobalConfig: DefaultGlobalConfig,
Runtime: DefaultRuntimeConfig,
GlobalConfig: DefaultGlobalConfig,
Runtime: DefaultRuntimeConfig,
StorageConfig: StorageConfig{TSDBConfig: &TSDBConfig{Retention: &TSDBRetentionConfig{}}},
RuleFiles: []string{
"testdata\\first.rules",
"testdata\\rules\\second.rules",

View file

@ -0,0 +1,4 @@
storage:
tsdb:
retention:
percentage: 101

View file

@ -0,0 +1,4 @@
storage:
tsdb:
retention:
percentage: -1

View file

@ -0,0 +1,4 @@
storage:
tsdb:
retention:
size: -1GB

View file

@ -0,0 +1,4 @@
storage:
tsdb:
retention:
time: -1h

View file

@ -3877,9 +3877,9 @@ with this feature.
# or when a compaction completes, whichever comes first.
[ retention: <retention> ] :
# How long to retain samples in storage. If neither this option nor the size option
# is set, the retention time defaults to 15d. Units Supported: y, w, d, h, m, s, ms.
# is set, the retention time defaults to 15d. Setting this to 0 disables time-based retention.
# This option takes precedence over the deprecated command-line flag --storage.tsdb.retention.time.
[ time: <duration> | default = 15d ]
[ time: <duration> ]
# Maximum number of bytes that can be stored for blocks. A unit is required,
# supported units: B, KB, MB, GB, TB, PB, EB. Ex: "512MB". Based on powers-of-2, so 1KB is 1024B.

View file

@ -1286,18 +1286,12 @@ func (db *DB) ApplyConfig(conf *config.Config) error {
// Update retention configuration if provided.
if conf.StorageConfig.TSDBConfig.Retention != nil {
db.retentionMtx.Lock()
if conf.StorageConfig.TSDBConfig.Retention.Time > 0 {
db.opts.RetentionDuration = int64(conf.StorageConfig.TSDBConfig.Retention.Time)
db.metrics.retentionDuration.Set((time.Duration(db.opts.RetentionDuration) * time.Millisecond).Seconds())
}
if conf.StorageConfig.TSDBConfig.Retention.Size > 0 {
db.opts.MaxBytes = int64(conf.StorageConfig.TSDBConfig.Retention.Size)
db.metrics.maxBytes.Set(float64(db.opts.MaxBytes))
}
if conf.StorageConfig.TSDBConfig.Retention.Percentage > 0 {
db.opts.MaxPercentage = conf.StorageConfig.TSDBConfig.Retention.Percentage
db.metrics.maxPercentage.Set(float64(db.opts.MaxPercentage))
}
db.opts.RetentionDuration = int64(time.Duration(conf.StorageConfig.TSDBConfig.Retention.Time) / time.Millisecond)
db.metrics.retentionDuration.Set((time.Duration(db.opts.RetentionDuration) * time.Millisecond).Seconds())
db.opts.MaxBytes = int64(conf.StorageConfig.TSDBConfig.Retention.Size)
db.metrics.maxBytes.Set(float64(db.opts.MaxBytes))
db.opts.MaxPercentage = conf.StorageConfig.TSDBConfig.Retention.Percentage
db.metrics.maxPercentage.Set(float64(db.opts.MaxPercentage))
db.retentionMtx.Unlock()
}
} else {

View file

@ -1748,7 +1748,7 @@ func TestRuntimeRetentionConfigChange(t *testing.T) {
StorageConfig: config.StorageConfig{
TSDBConfig: &config.TSDBConfig{
Retention: &config.TSDBRetentionConfig{
Time: model.Duration(shorterRetentionDuration),
Time: model.Duration(time.Duration(shorterRetentionDuration) * time.Millisecond),
},
},
},
@ -1777,6 +1777,31 @@ func TestRuntimeRetentionConfigChange(t *testing.T) {
require.Positive(t, int(prom_testutil.ToFloat64(db.metrics.timeRetentionCount)), "time retention count should be incremented")
}
// TestApplyConfigRetentionDurationMetricUnit applies a one-hour retention
// through ApplyConfig and checks both units involved: the value returned by
// getRetentionDuration is compared in milliseconds, and the
// prometheus_tsdb_retention_limit_seconds metric is compared in seconds.
func TestApplyConfigRetentionDurationMetricUnit(t *testing.T) {
	const retention = time.Hour
	wantMillis := retention.Milliseconds()

	db := newTestDB(t, withOpts(&Options{RetentionDuration: wantMillis}))

	// The config expresses the same one-hour retention as a model.Duration.
	retentionCfg := &config.TSDBRetentionConfig{Time: model.Duration(retention)}
	cfg := &config.Config{}
	cfg.StorageConfig.TSDBConfig = &config.TSDBConfig{Retention: retentionCfg}

	require.NoError(t, db.ApplyConfig(cfg))

	// Retention read back from the DB must still be one hour in milliseconds.
	require.Equal(t, wantMillis, db.getRetentionDuration())

	// The gauge must report that same hour in seconds.
	wantSeconds := retention.Seconds()
	gotSeconds := prom_testutil.ToFloat64(db.metrics.retentionDuration)
	require.Equal(t, wantSeconds, gotSeconds)
}
func TestNotMatcherSelectsLabelsUnsetSeries(t *testing.T) {
db := newTestDB(t)

View file

@ -253,6 +253,11 @@ func (h *Handler) ApplyConfig(conf *config.Config) error {
defer h.mtx.Unlock()
h.config = conf
if conf.StorageConfig.TSDBConfig != nil && conf.StorageConfig.TSDBConfig.Retention != nil {
h.options.TSDBRetentionDuration = conf.StorageConfig.TSDBConfig.Retention.Time
h.options.TSDBMaxBytes = conf.StorageConfig.TSDBConfig.Retention.Size
h.options.TSDBMaxPercentage = conf.StorageConfig.TSDBConfig.Retention.Percentage
}
return nil
}
@ -866,20 +871,25 @@ func (h *Handler) runtimeInfo() (api_v1.RuntimeInfo, error) {
status.Hostname = hostname
status.ServerTime = time.Now().UTC()
if h.options.TSDBRetentionDuration != 0 {
status.StorageRetention = h.options.TSDBRetentionDuration.String()
h.mtx.RLock()
tsdbRetentionDuration := h.options.TSDBRetentionDuration
tsdbMaxBytes := h.options.TSDBMaxBytes
tsdbMaxPercentage := h.options.TSDBMaxPercentage
h.mtx.RUnlock()
if tsdbRetentionDuration != 0 {
status.StorageRetention = tsdbRetentionDuration.String()
}
if h.options.TSDBMaxBytes != 0 {
if tsdbMaxBytes != 0 {
if status.StorageRetention != "" {
status.StorageRetention += " or "
}
status.StorageRetention += h.options.TSDBMaxBytes.String()
status.StorageRetention += tsdbMaxBytes.String()
}
if h.options.TSDBMaxPercentage != 0 {
if tsdbMaxPercentage != 0 {
if status.StorageRetention != "" {
status.StorageRetention += " or "
}
status.StorageRetention = status.StorageRetention + strconv.FormatUint(uint64(h.options.TSDBMaxPercentage), 10) + "%"
status.StorageRetention = status.StorageRetention + strconv.FormatUint(uint64(tsdbMaxPercentage), 10) + "%"
}
metrics, err := h.gatherer.Gather()