From 19170ecbcf14d9275aa11b18333aaad87e1c10c4 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Wed, 23 Mar 2022 15:37:28 +0100 Subject: [PATCH] SLA reporting: history retention for SLA tables --- cmd/icingadb/main.go | 11 +-- config.yml.example | 10 ++- doc/03-Configuration.md | 15 ++-- pkg/config/config.go | 10 +-- pkg/config/history_retention.go | 15 ++-- pkg/icingadb/history/retention.go | 127 +++++++++++++++++++++------- tests/cleanup_and_retention_test.go | 49 +++++++---- 7 files changed, 162 insertions(+), 75 deletions(-) diff --git a/cmd/icingadb/main.go b/cmd/icingadb/main.go index 164a43db..8dd91653 100644 --- a/cmd/icingadb/main.go +++ b/cmd/icingadb/main.go @@ -133,11 +133,12 @@ func run() int { ods := overdue.NewSync(db, rc, logs.GetChildLogger("overdue-sync")) ret := history.NewRetention( db, - cmd.Config.HistoryRetention.Days, - cmd.Config.HistoryRetention.Interval, - cmd.Config.HistoryRetention.Count, - cmd.Config.HistoryRetention.Options, - logs.GetChildLogger("history-retention"), + cmd.Config.Retention.HistoryDays, + cmd.Config.Retention.SlaDays, + cmd.Config.Retention.Interval, + cmd.Config.Retention.Count, + cmd.Config.Retention.Options, + logs.GetChildLogger("retention"), ) sig := make(chan os.Signal, 1) diff --git a/config.yml.example b/config.yml.example index 963aee0b..5c74bb15 100644 --- a/config.yml.example +++ b/config.yml.example @@ -33,15 +33,17 @@ logging: # dump-signals: # heartbeat: # high-availability: -# history-retention: # history-sync: # overdue-sync: # redis: +# retention: # runtime-updates: -history-retention: - # Number of days to retain historical data. By default, historical data is retained forever. -# days: +retention: + # Number of days to retain full historical data. By default, historical data is retained forever. +# history-days: + # Number of days to retain historical data for SLA reporting. By default, it is retained forever. 
+# sla-days: # Map of history category to number of days to retain its data in order to # enable retention only for specific categories or to override the number that has been configured in days. options: diff --git a/doc/03-Configuration.md b/doc/03-Configuration.md index 19def87f..7c58ddb6 100644 --- a/doc/03-Configuration.md +++ b/doc/03-Configuration.md @@ -57,10 +57,10 @@ database | Database connection status and queries. dump-signals | Dump signals received from Icinga. heartbeat | Icinga heartbeats received through Redis. high-availability | Manages responsibility of Icinga DB instances. -history-retention | Deletes historical data that exceed their configured retention period. history-sync | Synchronization of history entries from Redis to MySQL. overdue-sync | Calculation and synchronization of the overdue status of checkables. redis | Redis connection status and queries. +retention | Deletes historical data that exceed their configured retention period. runtime-updates | Runtime updates of config objects after the initial config synchronization. ### Duration String @@ -68,12 +68,15 @@ runtime-updates | Runtime updates of config objects after the initial c A duration string is a sequence of decimal numbers and a unit suffix, such as `"20s"`. Valid units are `"ms"`, `"s"`, `"m"` and `"h"`. -## History Retention +## Retention By default, no historical data is deleted, which means that the longer the data is retained, the more disk space is required to store it. History retention is an optional feature that allows you to limit the number of days that historical data is available for each history category. +There are separate options for the full history tables used to display history information in the web interface and +SLA tables which store the minimal information required for SLA reporting, allowing you to keep this information for longer with a smaller storage footprint. 
-| Option | Description | -|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| days | **Optional.** Number of days to retain historical data for all history categories. Use `options` in order to enable retention only for specific categories or to override the retention days configured here. | -| options | **Optional.** Map of history category to number of days to retain its data. Available categories are `acknowledgement`, `comment`, `downtime`, `flapping`, `notification` and `state`. | +| Option | Description | +|--------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| history-days | **Optional.** Number of days to retain historical data for all history categories. Use `options` in order to enable retention only for specific categories or to override the retention days configured here. | +| sla-days | **Optional.** Number of days to retain historical data for SLA reporting. | +| options | **Optional.** Map of history category to number of days to retain its data. Available categories are `acknowledgement`, `comment`, `downtime`, `flapping`, `notification` and `state`. | diff --git a/pkg/config/config.go b/pkg/config/config.go index bb341abe..89e040b4 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -14,10 +14,10 @@ import ( // Config defines Icinga DB config. 
type Config struct { - Database Database `yaml:"database"` - Redis Redis `yaml:"redis"` - Logging Logging `yaml:"logging"` - HistoryRetention HistoryRetention `yaml:"history-retention"` + Database Database `yaml:"database"` + Redis Redis `yaml:"redis"` + Logging Logging `yaml:"logging"` + Retention Retention `yaml:"retention"` } // Validate checks constraints in the supplied configuration and returns an error if they are violated. @@ -31,7 +31,7 @@ func (c *Config) Validate() error { if err := c.Logging.Validate(); err != nil { return err } - if err := c.HistoryRetention.Validate(); err != nil { + if err := c.Retention.Validate(); err != nil { return err } diff --git a/pkg/config/history_retention.go b/pkg/config/history_retention.go index 6709cd61..d4373b70 100644 --- a/pkg/config/history_retention.go +++ b/pkg/config/history_retention.go @@ -6,17 +6,18 @@ import ( "time" ) -// HistoryRetention defines configuration for history retention. -type HistoryRetention struct { - Days uint64 `yaml:"days"` - Interval time.Duration `yaml:"interval" default:"1h"` - Count uint64 `yaml:"count" default:"5000"` - Options history.RetentionOptions `yaml:"options"` +// Retention defines configuration for history retention. +type Retention struct { + HistoryDays uint64 `yaml:"history-days"` + SlaDays uint64 `yaml:"sla-days"` + Interval time.Duration `yaml:"interval" default:"1h"` + Count uint64 `yaml:"count" default:"5000"` + Options history.RetentionOptions `yaml:"options"` } // Validate checks constraints in the supplied retention configuration and // returns an error if they are violated. 
-func (r *HistoryRetention) Validate() error { +func (r *Retention) Validate() error { if r.Interval <= 0 { return errors.New("retention interval must be positive") } diff --git a/pkg/icingadb/history/retention.go b/pkg/icingadb/history/retention.go index 920b79c2..6ebe97c8 100644 --- a/pkg/icingadb/history/retention.go +++ b/pkg/icingadb/history/retention.go @@ -11,39 +11,85 @@ import ( "time" ) +type RetentionType int + +const ( + RetentionHistory RetentionType = iota + RetentionSla +) + +type retentionStatement struct { + icingadb.CleanupStmt + RetentionType + Category string +} + // RetentionStatements maps history categories with corresponding cleanup statements. -var RetentionStatements = map[string]icingadb.CleanupStmt{ - "acknowledgement": { +var RetentionStatements = []retentionStatement{{ + RetentionType: RetentionHistory, + Category: "acknowledgement", + CleanupStmt: icingadb.CleanupStmt{ Table: "acknowledgement_history", PK: "id", Column: "clear_time", }, - "comment": { +}, { + RetentionType: RetentionHistory, + Category: "comment", + CleanupStmt: icingadb.CleanupStmt{ Table: "comment_history", PK: "comment_id", Column: "remove_time", }, - "downtime": { +}, { + RetentionType: RetentionHistory, + Category: "downtime", + CleanupStmt: icingadb.CleanupStmt{ Table: "downtime_history", PK: "downtime_id", Column: "end_time", }, - "flapping": { +}, { + RetentionType: RetentionHistory, + Category: "flapping", + CleanupStmt: icingadb.CleanupStmt{ Table: "flapping_history", PK: "id", Column: "end_time", }, - "notification": { +}, { + RetentionType: RetentionHistory, + Category: "notification", + CleanupStmt: icingadb.CleanupStmt{ Table: "notification_history", PK: "id", Column: "send_time", }, - "state": { +}, { + RetentionType: RetentionHistory, + Category: "state", + CleanupStmt: icingadb.CleanupStmt{ Table: "state_history", PK: "id", Column: "event_time", }, -} +}, { + RetentionType: RetentionSla, + Category: "sla_downtime", + CleanupStmt: icingadb.CleanupStmt{ 
+ Table: "sla_history_downtime", + PK: "downtime_id", + Column: "downtime_end", + }, +}, { + RetentionType: RetentionSla, + Category: "sla_state", + CleanupStmt: icingadb.CleanupStmt{ + Table: "sla_history_state", + PK: "id", + Column: "event_time", + }, +}} // RetentionOptions defines the non-default mapping of history categories with their retention period in days. type RetentionOptions map[string]uint64 @@ -51,8 +97,15 @@ type RetentionOptions map[string]uint64 // Validate checks constraints in the supplied retention options and // returns an error if they are violated. func (o RetentionOptions) Validate() error { + allowedCategories := make(map[string]struct{}) + for _, stmt := range RetentionStatements { + if stmt.RetentionType == RetentionHistory { + allowedCategories[stmt.Category] = struct{}{} + } + } + for category := range o { - if _, ok := RetentionStatements[category]; !ok { + if _, ok := allowedCategories[category]; !ok { return errors.Errorf("invalid key %s for history retention", category) } } @@ -62,23 +115,28 @@ func (o RetentionOptions) Validate() error { // Retention deletes rows from history tables that exceed their configured retention period. type Retention struct { - db *icingadb.DB - logger *logging.Logger - days uint64 - interval time.Duration - count uint64 - options RetentionOptions + db *icingadb.DB + logger *logging.Logger + historyDays uint64 + slaDays uint64 + interval time.Duration + count uint64 + options RetentionOptions } // NewRetention returns a new Retention. 
-func NewRetention(db *icingadb.DB, days uint64, interval time.Duration, count uint64, options RetentionOptions, logger *logging.Logger) *Retention { +func NewRetention( + db *icingadb.DB, historyDays uint64, slaDays uint64, interval time.Duration, + count uint64, options RetentionOptions, logger *logging.Logger, +) *Retention { return &Retention{ - db: db, - logger: logger, - days: days, - interval: interval, - count: count, - options: options, + db: db, + logger: logger, + historyDays: historyDays, + slaDays: slaDays, + interval: interval, + count: count, + options: options, } } @@ -94,32 +152,39 @@ func (r *Retention) StartWithCallback(ctx context.Context, c func(table string, errs := make(chan error, 1) - for category, stmt := range RetentionStatements { - days, ok := r.options[category] - if !ok { - days = r.days + for _, stmt := range RetentionStatements { + var days uint64 + switch stmt.RetentionType { + case RetentionHistory: + if d, ok := r.options[stmt.Category]; ok { + days = d + } else { + days = r.historyDays + } + case RetentionSla: + days = r.slaDays } if days < 1 { - r.logger.Debugf("Skipping history retention for category %s", category) + r.logger.Debugf("Skipping history retention for category %s", stmt.Category) continue } r.logger.Debugw( - fmt.Sprintf("Starting history retention for category %s", category), + fmt.Sprintf("Starting history retention for category %s", stmt.Category), zap.Uint64("count", r.count), zap.Duration("interval", r.interval), zap.Uint64("retention-days", days), ) - category := category stmt := stmt periodic.Start(ctx, r.interval, func(tick periodic.Tick) { olderThan := tick.Time.AddDate(0, 0, -int(days)) - r.logger.Debugf("Cleaning up historical data for category %s older than %s", category, olderThan) + r.logger.Debugf("Cleaning up historical data for category %s from table %s older than %s", + stmt.Category, stmt.Table, olderThan) - rs, err := r.db.CleanupOlderThan(ctx, stmt, r.count, olderThan) + rs, err := 
r.db.CleanupOlderThan(ctx, stmt.CleanupStmt, r.count, olderThan) if err != nil { select { case errs <- err: diff --git a/tests/cleanup_and_retention_test.go b/tests/cleanup_and_retention_test.go index a3049398..b6ba49b1 100644 --- a/tests/cleanup_and_retention_test.go +++ b/tests/cleanup_and_retention_test.go @@ -9,6 +9,7 @@ import ( "github.com/jmoiron/sqlx" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "strings" "testing" "time" ) @@ -22,7 +23,8 @@ func TestCleanupAndRetention(t *testing.T) { t.Cleanup(func() { _ = db.Close() }) reten := retention{ - Days: 7, + HistoryDays: 7, + SlaDays: 30, Options: map[string]int{ "acknowledgement": 0, // No cleanup. "comment": 1, @@ -31,6 +33,16 @@ func TestCleanupAndRetention(t *testing.T) { }, } + daysForCategory := func(category string) int { + if strings.HasPrefix(category, "sla_") { + return reten.SlaDays + } else if d, ok := reten.Options[category]; ok { + return d + } else { + return reten.HistoryDays + } + } + rowsToDelete := 10000 rowsToSpare := 1000 @@ -38,11 +50,7 @@ func TestCleanupAndRetention(t *testing.T) { err := dropNotNullColumns(db, stmt) assert.NoError(t, err) - retentionDays, ok := reten.Options[category] - if !ok { - retentionDays = reten.Days - } - + retentionDays := daysForCategory(category) start := time.Now().AddDate(0, 0, -retentionDays).Add(-1 * time.Millisecond * time.Duration(rowsToDelete)) startMilli := start.UnixMilli() @@ -75,18 +83,14 @@ func TestCleanupAndRetention(t *testing.T) { i.Reload() waitForDumpDoneSignal(t, r, 20*time.Second, 100*time.Millisecond) config, err := yaml.Marshal(struct { - Retention retention `yaml:"history-retention"` + Retention retention `yaml:"retention"` }{reten}) assert.NoError(t, err) it.IcingaDbInstanceT(t, r, rdb, services.WithIcingaDbConfig(string(config))) eventually.Assert(t, func(t require.TestingT) { for category, stmt := range retentionStatements { - retentionDays, ok := reten.Options[category] - if !ok { - retentionDays = 
reten.Days - } - + retentionDays := daysForCategory(category) threshold := time.Now().AddDate(0, 0, -retentionDays) thresholdMilli := threshold.UnixMilli() @@ -106,10 +110,10 @@ func TestCleanupAndRetention(t *testing.T) { if retentionDays == 0 { // No cleanup. - assert.Equal(t, rowsToDelete+rowsToSpare, rowsLeft+rowsSpared, "all rows should still be there") + assert.Equal(t, rowsToDelete+rowsToSpare, rowsLeft+rowsSpared, "all rows should still be there for %s", category) } else { - assert.Equal(t, 0, rowsLeft, "rows left in retention period") - assert.Equal(t, rowsToSpare, rowsSpared, "rows spared") + assert.Equal(t, 0, rowsLeft, "rows left in retention period for %s", category) + assert.Equal(t, rowsToSpare, rowsSpared, "rows spared for %s", category) } } }, time.Minute, time.Second) @@ -122,8 +126,9 @@ type cleanupStmt struct { } type retention struct { - Days int `yaml:"days"` - Options map[string]int `yaml:"options"` + HistoryDays int `yaml:"history-days"` + SlaDays int `yaml:"sla-days"` + Options map[string]int `yaml:"options"` } var retentionStatements = map[string]cleanupStmt{ @@ -157,6 +162,16 @@ var retentionStatements = map[string]cleanupStmt{ PK: "id", Column: "event_time", }, + "sla_downtime": { + Table: "sla_history_downtime", + PK: "downtime_id", + Column: "downtime_end", + }, + "sla_state": { + Table: "sla_history_state", + PK: "id", + Column: "event_time", + }, } // dropNotNullColumns drops all columns with a NOT NULL constraint that are not