Merge pull request #247 from Icinga/feature/sla-reporting

SLA reporting
This commit is contained in:
Julian Brost 2022-05-13 12:21:47 +02:00 committed by GitHub
commit da230d4f92
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
24 changed files with 1765 additions and 108 deletions

58
.github/workflows/sql.yml vendored Normal file
View file

@ -0,0 +1,58 @@
name: SQL
on:
push:
branches:
- master
pull_request: {}
jobs:
sql:
name: ${{ matrix.database.name }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
database:
- {type: MYSQL, name: MySQL 5.5, image: "icinga/icingadb-mysql:5.5"}
- {type: MYSQL, name: MySQL 5.6, image: "icinga/icingadb-mysql:5.6"}
- {type: MYSQL, name: MySQL 5.7, image: "mysql:5.7"}
- {type: MYSQL, name: MySQL latest, image: "mysql:latest"}
- {type: MYSQL, name: MariaDB 10.1, image: "mariadb:10.1"}
- {type: MYSQL, name: MariaDB 10.2, image: "mariadb:10.2"}
- {type: MYSQL, name: MariaDB 10.3, image: "mariadb:10.3"}
- {type: MYSQL, name: MariaDB 10.4, image: "mariadb:10.4"}
- {type: MYSQL, name: MariaDB 10.5, image: "mariadb:10.5"}
- {type: MYSQL, name: MariaDB 10.6, image: "mariadb:10.6"}
- {type: MYSQL, name: MariaDB 10.7, image: "mariadb:10.7"}
- {type: MYSQL, name: MariaDB latest, image: "mariadb:latest"}
- {type: PGSQL, name: PostgreSQL 9.6, image: "postgres:9.6"}
- {type: PGSQL, name: PostgreSQL 10, image: "postgres:10"}
- {type: PGSQL, name: PostgreSQL 11, image: "postgres:11"}
- {type: PGSQL, name: PostgreSQL 12, image: "postgres:12"}
- {type: PGSQL, name: PostgreSQL 13, image: "postgres:13"}
- {type: PGSQL, name: PostgreSQL latest, image: "postgres:latest"}
steps:
- name: Setup Go
uses: actions/setup-go@v1
with:
go-version: '^1.16'
- name: Checkout code
uses: actions/checkout@v2
- name: Download dependencies
run: go get -v -t -d ./...
working-directory: tests/
- name: Run tests
env:
ICINGADB_TESTS_DATABASE_TYPE: ${{ matrix.database.type }}
ICINGA_TESTING_${{ matrix.database.type }}_IMAGE: ${{ matrix.database.image }}
ICINGA_TESTING_ICINGADB_SCHEMA_MYSQL: ${{ github.workspace }}/schema/mysql/schema.sql
ICINGA_TESTING_ICINGADB_SCHEMA_PGSQL: ${{ github.workspace }}/schema/pgsql/schema.sql
timeout-minutes: 10
run: go test -v -timeout 5m ./sql
working-directory: tests/

View file

@ -133,11 +133,12 @@ func run() int {
ods := overdue.NewSync(db, rc, logs.GetChildLogger("overdue-sync"))
ret := history.NewRetention(
db,
cmd.Config.HistoryRetention.Days,
cmd.Config.HistoryRetention.Interval,
cmd.Config.HistoryRetention.Count,
cmd.Config.HistoryRetention.Options,
logs.GetChildLogger("history-retention"),
cmd.Config.Retention.HistoryDays,
cmd.Config.Retention.SlaDays,
cmd.Config.Retention.Interval,
cmd.Config.Retention.Count,
cmd.Config.Retention.Options,
logs.GetChildLogger("retention"),
)
sig := make(chan os.Signal, 1)

View file

@ -33,15 +33,17 @@ logging:
# dump-signals:
# heartbeat:
# high-availability:
# history-retention:
# history-sync:
# overdue-sync:
# redis:
# retention:
# runtime-updates:
history-retention:
# Number of days to retain historical data. By default, historical data is retained forever.
# days:
retention:
# Number of days to retain full historical data. By default, historical data is retained forever.
# history-days:
# Number of days to retain historical data for SLA reporting. By default, it is retained forever.
# sla-days:
# Map of history category to number of days to retain its data in order to
# enable retention only for specific categories or to override the number that has been configured in history-days.
options:

View file

@ -3,7 +3,7 @@
## Requirements <a id="installation-requirements"></a>
* Local Redis instance (Will be installed during this documentation)
* MySQL/MariaDB/PostgreSQL database `icingadb`, user and schema imports (Will be set up during this documentation)
* MySQL (≥5.5), MariaDB (≥10.1), or PostgreSQL (≥9.6): database, user and schema imports (Will be set up during this documentation)
## Setting up Icinga DB <a id="setting-up-icingadb"></a>
@ -176,7 +176,6 @@ psql icingadb <<<'CREATE EXTENSION IF NOT EXISTS citext;'
```
The CREATE EXTENSION command requires the postgresql-contrib package.
(On RHEL/CentOS 7: rh-postgresql95-postgresql-contrib)
Edit `pg_hba.conf`, insert the following before everything else:
@ -187,7 +186,6 @@ host all icingadb ::/0 md5
```
To apply those changes, run `systemctl reload postgresql`.
(On RHEL/CentOS 7 the service is called "rh-postgresql95-postgresql".)
After creating the database you can import the Icinga DB schema using the
following command. Enter the password when asked.
@ -196,9 +194,6 @@ following command. Enter the password when asked.
psql -U icingadb icingadb < /usr/share/icingadb/schema/pgsql/schema.sql
```
On RHEL/CentOS 7 prefix "createuser", "createdb" and "psql" with
"/opt/rh/rh-postgresql95/root/usr/bin/".
### Running Icinga DB <a id="running-icingadb"></a>
Foreground:

View file

@ -57,10 +57,10 @@ database | Database connection status and queries.
dump-signals | Dump signals received from Icinga.
heartbeat | Icinga heartbeats received through Redis.
high-availability | Manages responsibility of Icinga DB instances.
history-retention | Deletes historical data that exceed their configured retention period.
history-sync | Synchronization of history entries from Redis to the database.
overdue-sync | Calculation and synchronization of the overdue status of checkables.
redis | Redis connection status and queries.
retention | Deletes historical data that exceed their configured retention period.
runtime-updates | Runtime updates of config objects after the initial config synchronization.
### Duration String <a id="duration-string"></a>
@ -68,12 +68,15 @@ runtime-updates | Runtime updates of config objects after the initial c
A duration string is a sequence of decimal numbers and a unit suffix, such as `"20s"`.
Valid units are `"ms"`, `"s"`, `"m"` and `"h"`.
## History Retention <a id="configuration-history-retention"></a>
## Retention <a id="configuration-retention"></a>
By default, no historical data is deleted, which means that the longer the data is retained, the more disk space is required to store it.
History retention is an optional feature that allows you to limit the number of days that historical data is available for each history category.
There are separate options for the full history tables used to display history information in the web interface and
SLA tables, which store only the minimal information required for SLA reporting, allowing you to keep this information for longer with a smaller storage footprint.
| Option | Description |
|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| days | **Optional.** Number of days to retain historical data for all history categories. Use `options` in order to enable retention only for specific categories or to override the retention days configured here. |
| options | **Optional.** Map of history category to number of days to retain its data. Available categories are `acknowledgement`, `comment`, `downtime`, `flapping`, `notification` and `state`. |
| Option | Description |
|--------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| history-days | **Optional.** Number of days to retain historical data for all history categories. Use `options` in order to enable retention only for specific categories or to override the retention days configured here. |
| sla-days | **Optional.** Number of days to retain historical data for SLA reporting. |
| options | **Optional.** Map of history category to number of days to retain its data. Available categories are `acknowledgement`, `comment`, `downtime`, `flapping`, `notification` and `state`. Retention of SLA data is configured via `sla-days` only. |

View file

@ -14,10 +14,10 @@ import (
// Config defines Icinga DB config.
type Config struct {
Database Database `yaml:"database"`
Redis Redis `yaml:"redis"`
Logging Logging `yaml:"logging"`
HistoryRetention HistoryRetention `yaml:"history-retention"`
Database Database `yaml:"database"`
Redis Redis `yaml:"redis"`
Logging Logging `yaml:"logging"`
Retention Retention `yaml:"retention"`
}
// Validate checks constraints in the supplied configuration and returns an error if they are violated.
@ -31,7 +31,7 @@ func (c *Config) Validate() error {
if err := c.Logging.Validate(); err != nil {
return err
}
if err := c.HistoryRetention.Validate(); err != nil {
if err := c.Retention.Validate(); err != nil {
return err
}

View file

@ -6,17 +6,18 @@ import (
"time"
)
// HistoryRetention defines configuration for history retention.
type HistoryRetention struct {
Days uint64 `yaml:"days"`
Interval time.Duration `yaml:"interval" default:"1h"`
Count uint64 `yaml:"count" default:"5000"`
Options history.RetentionOptions `yaml:"options"`
// Retention defines configuration for history retention.
type Retention struct {
HistoryDays uint64 `yaml:"history-days"`
SlaDays uint64 `yaml:"sla-days"`
Interval time.Duration `yaml:"interval" default:"1h"`
Count uint64 `yaml:"count" default:"5000"`
Options history.RetentionOptions `yaml:"options"`
}
// Validate checks constraints in the supplied retention configuration and
// returns an error if they are violated.
func (r *HistoryRetention) Validate() error {
func (r *Retention) Validate() error {
if r.Interval <= 0 {
return errors.New("retention interval must be positive")
}

View file

@ -11,39 +11,85 @@ import (
"time"
)
// RetentionType tells which kind of data a cleanup statement applies to
// and thereby which configured retention period is used for it.
type RetentionType int

// Supported retention types.
const (
	// RetentionHistory marks cleanup statements for the full history tables.
	RetentionHistory RetentionType = 0
	// RetentionSla marks cleanup statements for the SLA reporting tables.
	RetentionSla RetentionType = 1
)
// retentionStatement couples a cleanup statement with the retention type
// and the category it belongs to.
type retentionStatement struct {
// CleanupStmt names the table, primary key and time column to clean up.
icingadb.CleanupStmt
// RetentionType selects which configured retention period applies to the statement.
RetentionType
// Category is the name used in log messages and, for history statements,
// the key looked up in the retention options.
Category string
}
// RetentionStatements lists the cleanup statements of all history and SLA tables along with their category and retention type.
var RetentionStatements = map[string]icingadb.CleanupStmt{
"acknowledgement": {
var RetentionStatements = []retentionStatement{{
RetentionType: RetentionHistory,
Category: "acknowledgement",
CleanupStmt: icingadb.CleanupStmt{
Table: "acknowledgement_history",
PK: "id",
Column: "clear_time",
},
"comment": {
}, {
RetentionType: RetentionHistory,
Category: "comment",
CleanupStmt: icingadb.CleanupStmt{
Table: "comment_history",
PK: "comment_id",
Column: "remove_time",
},
"downtime": {
}, {
RetentionType: RetentionHistory,
Category: "downtime",
CleanupStmt: icingadb.CleanupStmt{
Table: "downtime_history",
PK: "downtime_id",
Column: "end_time",
},
"flapping": {
}, {
RetentionType: RetentionHistory,
Category: "flapping",
CleanupStmt: icingadb.CleanupStmt{
Table: "flapping_history",
PK: "id",
Column: "end_time",
},
"notification": {
}, {
RetentionType: RetentionHistory,
Category: "notification",
CleanupStmt: icingadb.CleanupStmt{
Table: "notification_history",
PK: "id",
Column: "send_time",
},
"state": {
}, {
RetentionType: RetentionHistory,
Category: "state",
CleanupStmt: icingadb.CleanupStmt{
Table: "state_history",
PK: "id",
Column: "event_time",
},
}
}, {
RetentionType: RetentionSla,
Category: "sla_downtime",
CleanupStmt: icingadb.CleanupStmt{
Table: "sla_history_downtime",
PK: "downtime_id",
Column: "downtime_end",
},
}, {
RetentionType: RetentionSla,
Category: "sla_state",
CleanupStmt: icingadb.CleanupStmt{
Table: "sla_history_state",
PK: "id",
Column: "event_time",
},
}}
// RetentionOptions defines the non-default mapping of history categories with their retention period in days.
type RetentionOptions map[string]uint64
@ -51,8 +97,15 @@ type RetentionOptions map[string]uint64
// Validate checks constraints in the supplied retention options and
// returns an error if they are violated.
func (o RetentionOptions) Validate() error {
allowedCategories := make(map[string]struct{})
for _, stmt := range RetentionStatements {
if stmt.RetentionType == RetentionHistory {
allowedCategories[stmt.Category] = struct{}{}
}
}
for category := range o {
if _, ok := RetentionStatements[category]; !ok {
if _, ok := allowedCategories[category]; !ok {
return errors.Errorf("invalid key %s for history retention", category)
}
}
@ -62,23 +115,28 @@ func (o RetentionOptions) Validate() error {
// Retention deletes rows from history tables that exceed their configured retention period.
type Retention struct {
db *icingadb.DB
logger *logging.Logger
days uint64
interval time.Duration
count uint64
options RetentionOptions
db *icingadb.DB
logger *logging.Logger
historyDays uint64
slaDays uint64
interval time.Duration
count uint64
options RetentionOptions
}
// NewRetention returns a new Retention.
func NewRetention(db *icingadb.DB, days uint64, interval time.Duration, count uint64, options RetentionOptions, logger *logging.Logger) *Retention {
func NewRetention(
db *icingadb.DB, historyDays uint64, slaDays uint64, interval time.Duration,
count uint64, options RetentionOptions, logger *logging.Logger,
) *Retention {
return &Retention{
db: db,
logger: logger,
days: days,
interval: interval,
count: count,
options: options,
db: db,
logger: logger,
historyDays: historyDays,
slaDays: slaDays,
interval: interval,
count: count,
options: options,
}
}
@ -94,32 +152,39 @@ func (r *Retention) StartWithCallback(ctx context.Context, c func(table string,
errs := make(chan error, 1)
for category, stmt := range RetentionStatements {
days, ok := r.options[category]
if !ok {
days = r.days
for _, stmt := range RetentionStatements {
var days uint64
switch stmt.RetentionType {
case RetentionHistory:
if d, ok := r.options[stmt.Category]; ok {
days = d
} else {
days = r.historyDays
}
case RetentionSla:
days = r.slaDays
}
if days < 1 {
r.logger.Debugf("Skipping history retention for category %s", category)
r.logger.Debugf("Skipping history retention for category %s", stmt.Category)
continue
}
r.logger.Debugw(
fmt.Sprintf("Starting history retention for category %s", category),
fmt.Sprintf("Starting history retention for category %s", stmt.Category),
zap.Uint64("count", r.count),
zap.Duration("interval", r.interval),
zap.Uint64("retention-days", days),
)
category := category
stmt := stmt
periodic.Start(ctx, r.interval, func(tick periodic.Tick) {
olderThan := tick.Time.AddDate(0, 0, -int(days))
r.logger.Debugf("Cleaning up historical data for category %s older than %s", category, olderThan)
r.logger.Debugf("Cleaning up historical data for category %s from table %s older than %s",
stmt.Category, stmt.Table, olderThan)
rs, err := r.db.CleanupOlderThan(ctx, stmt, r.count, olderThan)
rs, err := r.db.CleanupOlderThan(ctx, stmt.CleanupStmt, r.count, olderThan)
if err != nil {
select {
case errs <- err:

View file

@ -0,0 +1,26 @@
package history
import (
"github.com/go-redis/redis/v8"
"github.com/icinga/icingadb/pkg/icingadb/v1/history"
"github.com/icinga/icingadb/pkg/structify"
"github.com/icinga/icingadb/pkg/types"
"reflect"
)
var slaStateStructify = structify.MakeMapStructifier(reflect.TypeOf((*history.SlaHistoryState)(nil)).Elem(), "json")
// stateHistoryToSlaEntity converts a state history stream message into the
// entities to upsert for SLA reporting. Only hard states are relevant for the
// SLA history, so any other message yields no entities (and no error).
func stateHistoryToSlaEntity(entry redis.XMessage) ([]history.UpserterEntity, error) {
	structified, err := slaStateStructify(entry.Values)
	if err != nil {
		return nil, err
	}

	switch state := structified.(*history.SlaHistoryState); state.StateType {
	case types.StateHard:
		return []history.UpserterEntity{state}, nil
	default:
		// Everything but a hard state is discarded.
		return nil, nil
	}
}

View file

@ -359,12 +359,14 @@ var syncPipelines = map[string][]stageFunc{
writeOneEntityStage((*v1.HistoryNotification)(nil)), // history (depends on notification_history)
},
"state": {
writeOneEntityStage((*v1.StateHistory)(nil)), // state_history
writeOneEntityStage((*v1.HistoryState)(nil)), // history (depends on state_history)
writeOneEntityStage((*v1.StateHistory)(nil)), // state_history
writeOneEntityStage((*v1.HistoryState)(nil)), // history (depends on state_history)
writeMultiEntityStage(stateHistoryToSlaEntity), // sla_history_state
},
"downtime": {
writeOneEntityStage((*v1.DowntimeHistory)(nil)), // downtime_history
writeOneEntityStage((*v1.HistoryDowntime)(nil)), // history (depends on downtime_history)
writeOneEntityStage((*v1.DowntimeHistory)(nil)), // downtime_history
writeOneEntityStage((*v1.HistoryDowntime)(nil)), // history (depends on downtime_history)
writeOneEntityStage((*v1.SlaHistoryDowntime)(nil)), // sla_history_downtime
},
"comment": {
writeOneEntityStage((*v1.CommentHistory)(nil)), // comment_history

View file

@ -80,6 +80,30 @@ func (*HistoryDowntime) TableName() string {
return "history"
}
// SlaHistoryDowntime is the entity the downtime history sync pipeline writes to sla_history_downtime.
type SlaHistoryDowntime struct {
DowntimeHistoryEntity `json:",inline"`
HistoryTableMeta `json:",inline"`
// SlaHistoryDowntimeUpserter contributes the computed DowntimeEnd field.
SlaHistoryDowntimeUpserter `json:",inline"`
// DowntimeStart is read from the "start_time" JSON key.
DowntimeStart types.UnixMilli `json:"start_time"`
// The following fields are tagged db:"-" and are only read by
// SlaDowntimeEndTime.Value to derive the effective end of the downtime.
HasBeenCancelled types.Bool `json:"has_been_cancelled" db:"-"`
CancelTime types.UnixMilli `json:"cancel_time" db:"-"`
EndTime types.UnixMilli `json:"end_time" db:"-"`
}
// Init implements the contracts.Initer interface.
// It hands DowntimeEnd a reference back to s, which SlaDowntimeEndTime.Value
// dereferences to choose between the cancel time and the end time.
func (s *SlaHistoryDowntime) Init() {
s.DowntimeEnd.History = s
}
// SlaHistoryDowntimeUpserter holds the part of SlaHistoryDowntime that is returned by Upsert.
type SlaHistoryDowntimeUpserter struct {
// DowntimeEnd is excluded from JSON decoding (json:"-"); its database value
// is computed by SlaDowntimeEndTime.Value.
DowntimeEnd SlaDowntimeEndTime `json:"-"`
}
// Upsert implements the contracts.Upserter interface.
// NOTE(review): presumably the returned struct determines which columns are
// updated when the row already exists, i.e. only DowntimeEnd - confirm against contracts.Upserter.
func (h *SlaHistoryDowntimeUpserter) Upsert() interface{} {
return h
}
type DowntimeEventTime struct {
History *HistoryDowntime `db:"-"`
}
@ -109,6 +133,19 @@ func (et DowntimeEventTime) Value() (driver.Value, error) {
}
}
// SlaDowntimeEndTime computes the effective end time of a downtime for SLA reporting
// from the SlaHistoryDowntime it refers to.
type SlaDowntimeEndTime struct {
// History is the owning entity; it is set by SlaHistoryDowntime.Init and tagged db:"-".
History *SlaHistoryDowntime `db:"-"`
}
// Value implements the driver.Valuer interface.
// It yields the cancel time if the downtime has been cancelled
// and the regular end time in every other case.
func (et SlaDowntimeEndTime) Value() (driver.Value, error) {
	downtime := et.History

	cancelled := downtime.HasBeenCancelled
	if !cancelled.Valid || !cancelled.Bool {
		return downtime.EndTime.Value()
	}

	return downtime.CancelTime.Value()
}
// Assert interface compliance.
var (
_ contracts.Entity = (*DowntimeHistoryEntity)(nil)
@ -117,5 +154,8 @@ var (
_ contracts.Initer = (*HistoryDowntime)(nil)
_ contracts.TableNamer = (*HistoryDowntime)(nil)
_ UpserterEntity = (*HistoryDowntime)(nil)
_ contracts.Initer = (*SlaHistoryDowntime)(nil)
_ UpserterEntity = (*SlaHistoryDowntime)(nil)
_ driver.Valuer = DowntimeEventTime{}
_ driver.Valuer = SlaDowntimeEndTime{}
)

View file

@ -33,9 +33,19 @@ func (*HistoryState) TableName() string {
return "history"
}
// SlaHistoryState is the entity the state history sync pipeline writes to sla_history_state.
type SlaHistoryState struct {
HistoryTableEntity `json:",inline"`
HistoryTableMeta `json:",inline"`
EventTime types.UnixMilli `json:"event_time"`
// StateType is tagged db:"-"; it is only inspected to discard entries that are not hard states.
StateType types.StateType `json:"state_type" db:"-"`
// HardState is read from the "hard_state" JSON key.
HardState uint8 `json:"hard_state"`
// PreviousHardState is read from the "previous_hard_state" JSON key.
PreviousHardState uint8 `json:"previous_hard_state"`
}
// Assert interface compliance.
var (
_ UpserterEntity = (*StateHistory)(nil)
_ contracts.TableNamer = (*HistoryState)(nil)
_ UpserterEntity = (*HistoryState)(nil)
_ UpserterEntity = (*SlaHistoryState)(nil)
)

View file

@ -46,10 +46,15 @@ func badStateType(t interface{}) error {
return errors.Errorf("bad state type: %#v", t)
}
// Valid StateType values.
const (
	// StateSoft is the StateType whose SQL representation is "soft".
	StateSoft StateType = 0
	// StateHard is the StateType whose SQL representation is "hard".
	StateHard StateType = 1
)
// stateTypes maps all valid StateType values to their SQL representation.
var stateTypes = map[StateType]string{
0: "soft",
1: "hard",
StateSoft: "soft",
StateHard: "hard",
}
// Assert interface compliance.

View file

@ -3,6 +3,164 @@
SET SESSION sql_mode = 'STRICT_ALL_TABLES,NO_ENGINE_SUBSTITUTION';
SET SESSION innodb_strict_mode = 1;
DROP FUNCTION IF EXISTS get_sla_ok_percent;
DELIMITER //
-- get_sla_ok_percent returns, in percent, the share of the SLA interval between
-- in_start_time and in_end_time that does not count as problem time.
-- Problem time is time spent in a hard state > 0 (hosts) or > 1 (services)
-- while no downtime is active.
--
-- in_host_id:     id of the host (or of the host the service belongs to)
-- in_service_id:  id of the service, or NULL to evaluate the host itself
-- in_start_time:  start of the SLA interval, in the unit of event_time/downtime_start/downtime_end
-- in_end_time:    end of the SLA interval, must be greater than in_start_time
--
-- Raises SQLSTATE 45000 if in_end_time <= in_start_time.
CREATE FUNCTION get_sla_ok_percent(
in_host_id binary(20),
in_service_id binary(20),
in_start_time bigint unsigned,
in_end_time bigint unsigned
)
RETURNS decimal(7, 4)
READS SQL DATA
BEGIN
DECLARE result decimal(7, 4);
-- Columns of the row currently fetched from the cursor.
DECLARE row_event_time bigint unsigned;
DECLARE row_event_type enum('state_change', 'downtime_start', 'downtime_end', 'end');
DECLARE row_event_prio int;
DECLARE row_hard_state tinyint unsigned;
DECLARE row_previous_hard_state tinyint unsigned;
-- State in effect since the previously processed event.
DECLARE last_event_time bigint unsigned;
DECLARE last_hard_state tinyint unsigned;
DECLARE active_downtimes int unsigned;
-- Accumulators the result is computed from.
DECLARE problem_time bigint unsigned;
DECLARE total_time bigint unsigned;
DECLARE done int;
DECLARE cur CURSOR FOR
(
-- all downtime_start events before the end of the SLA interval
-- for downtimes that overlap the SLA interval in any way
SELECT
GREATEST(downtime_start, in_start_time) AS event_time,
'downtime_start' AS event_type,
1 AS event_prio,
NULL AS hard_state,
NULL AS previous_hard_state
FROM sla_history_downtime d
WHERE d.host_id = in_host_id
AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id)
AND d.downtime_start < in_end_time
AND d.downtime_end >= in_start_time
) UNION ALL (
-- all downtime_end events before the end of the SLA interval
-- for downtimes that overlap the SLA interval in any way
SELECT
downtime_end AS event_time,
'downtime_end' AS event_type,
2 AS event_prio,
NULL AS hard_state,
NULL AS previous_hard_state
FROM sla_history_downtime d
WHERE d.host_id = in_host_id
AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id)
AND d.downtime_start < in_end_time
AND d.downtime_end >= in_start_time
AND d.downtime_end < in_end_time
) UNION ALL (
-- all state events strictly in interval
SELECT
event_time,
'state_change' AS event_type,
0 AS event_prio,
hard_state,
previous_hard_state
FROM sla_history_state s
WHERE s.host_id = in_host_id
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
AND s.event_time > in_start_time
AND s.event_time < in_end_time
) UNION ALL (
-- end event to keep loop simple, values are not used
SELECT
in_end_time AS event_time,
'end' AS event_type,
3 AS event_prio,
NULL AS hard_state,
NULL AS previous_hard_state
)
-- event_prio breaks ties between events with the same timestamp:
-- state_change (0), downtime_start (1), downtime_end (2), end (3).
ORDER BY event_time, event_prio;
-- Besides the end of the cursor, this handler also fires for the SELECT ... INTO
-- statements below when they match no row, which is why done is reset right before OPEN.
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = 1;
IF in_end_time <= in_start_time THEN
SIGNAL SQLSTATE '45000' SET MESSAGE_TEXT = 'end time must be greater than start time';
END IF;
-- Use the latest event at or before the beginning of the SLA interval as the initial state.
SELECT hard_state INTO last_hard_state
FROM sla_history_state s
WHERE s.host_id = in_host_id
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
AND s.event_time <= in_start_time
ORDER BY s.event_time DESC
LIMIT 1;
-- If this does not exist, use the previous state from the first event after the beginning of the SLA interval.
IF last_hard_state IS NULL THEN
SELECT previous_hard_state INTO last_hard_state
FROM sla_history_state s
WHERE s.host_id = in_host_id
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
AND s.event_time > in_start_time
ORDER BY s.event_time ASC
LIMIT 1;
END IF;
-- If this also does not exist, use the current host/service state.
IF last_hard_state IS NULL THEN
IF in_service_id IS NULL THEN
SELECT hard_state INTO last_hard_state
FROM host_state s
WHERE s.host_id = in_host_id;
ELSE
SELECT hard_state INTO last_hard_state
FROM service_state s
WHERE s.host_id = in_host_id
AND s.service_id = in_service_id;
END IF;
END IF;
-- Without any known state, fall back to state 0, which never counts as problem time below.
IF last_hard_state IS NULL THEN
SET last_hard_state = 0;
END IF;
SET problem_time = 0;
SET total_time = in_end_time - in_start_time;
SET last_event_time = in_start_time;
SET active_downtimes = 0;
-- Must happen after the SELECT ... INTO statements above as they may have set done = 1.
SET done = 0;
OPEN cur;
read_loop: LOOP
FETCH cur INTO row_event_time, row_event_type, row_event_prio, row_hard_state, row_previous_hard_state;
IF done THEN
LEAVE read_loop;
END IF;
-- Attribute the span since the previous event using the state that was in effect
-- during it, i.e. before the current event is applied further down.
-- NOTE(review): 99 presumably denotes the pending state (no check result yet) - confirm.
-- The span preceding a state change away from 99 is removed from the total time.
IF row_previous_hard_state = 99 THEN
SET total_time = total_time - (row_event_time - last_event_time);
ELSEIF ((in_service_id IS NULL AND last_hard_state > 0) OR (in_service_id IS NOT NULL AND last_hard_state > 1))
AND last_hard_state != 99
AND active_downtimes = 0
THEN
SET problem_time = problem_time + row_event_time - last_event_time;
END IF;
-- Apply the current event so that it is in effect for the next span.
SET last_event_time = row_event_time;
IF row_event_type = 'state_change' THEN
SET last_hard_state = row_hard_state;
ELSEIF row_event_type = 'downtime_start' THEN
SET active_downtimes = active_downtimes + 1;
ELSEIF row_event_type = 'downtime_end' THEN
SET active_downtimes = active_downtimes - 1;
END IF;
END LOOP;
CLOSE cur;
SET result = 100 * (total_time - problem_time) / total_time;
RETURN result;
END//
DELIMITER ;
CREATE TABLE host (
id binary(20) NOT NULL COMMENT 'sha1(environment.id + name)',
environment_id binary(20) NOT NULL COMMENT 'environment.id',
@ -1124,6 +1282,39 @@ CREATE TABLE history (
INDEX idx_history_host_service_id (host_id, service_id, event_time) COMMENT 'Host/service history detail filter'
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC;
-- Hard state history of hosts and services as read by get_sla_ok_percent.
CREATE TABLE sla_history_state (
id binary(20) NOT NULL COMMENT 'state_history.id (may reference already deleted rows)',
environment_id binary(20) NOT NULL COMMENT 'environment.id',
endpoint_id binary(20) DEFAULT NULL COMMENT 'endpoint.id',
object_type enum('host', 'service') NOT NULL,
host_id binary(20) NOT NULL COMMENT 'host.id',
service_id binary(20) DEFAULT NULL COMMENT 'service.id',
event_time bigint unsigned NOT NULL COMMENT 'unix timestamp the event occurred',
hard_state TINYINT UNSIGNED NOT NULL COMMENT 'hard state after this event',
previous_hard_state TINYINT UNSIGNED NOT NULL COMMENT 'hard state before this event',
PRIMARY KEY (id),
-- Covers the host_id/service_id/event_time conditions used by get_sla_ok_percent.
INDEX idx_sla_history_state_event (host_id, service_id, event_time)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC;
-- Start and end times of host and service downtimes as read by get_sla_ok_percent.
CREATE TABLE sla_history_downtime (
environment_id binary(20) NOT NULL COMMENT 'environment.id',
endpoint_id binary(20) DEFAULT NULL COMMENT 'endpoint.id',
object_type enum('host', 'service') NOT NULL,
host_id binary(20) NOT NULL COMMENT 'host.id',
service_id binary(20) DEFAULT NULL COMMENT 'service.id',
downtime_id binary(20) NOT NULL COMMENT 'downtime.id (may reference already deleted rows)',
downtime_start BIGINT UNSIGNED NOT NULL COMMENT 'start time of the downtime',
downtime_end BIGINT UNSIGNED NOT NULL COMMENT 'end time of the downtime',
PRIMARY KEY (downtime_id),
-- Covers the host_id/service_id/downtime_start/downtime_end conditions used by get_sla_ok_percent.
INDEX idx_sla_history_downtime_event (host_id, service_id, downtime_start, downtime_end)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC;
CREATE TABLE icingadb_schema (
id int unsigned NOT NULL AUTO_INCREMENT,
version smallint unsigned NOT NULL,

View file

@ -1,3 +1,161 @@
DROP FUNCTION IF EXISTS get_sla_ok_percent;
DELIMITER //
-- get_sla_ok_percent returns, in percent, the share of the SLA interval between
-- in_start_time and in_end_time that does not count as problem time.
-- Problem time is time spent in a hard state > 0 (hosts) or > 1 (services)
-- while no downtime is active.
--
-- in_host_id:     id of the host (or of the host the service belongs to)
-- in_service_id:  id of the service, or NULL to evaluate the host itself
-- in_start_time:  start of the SLA interval, in the unit of event_time/downtime_start/downtime_end
-- in_end_time:    end of the SLA interval, must be greater than in_start_time
--
-- Raises SQLSTATE 45000 if in_end_time <= in_start_time.
CREATE FUNCTION get_sla_ok_percent(
in_host_id binary(20),
in_service_id binary(20),
in_start_time bigint unsigned,
in_end_time bigint unsigned
)
RETURNS decimal(7, 4)
READS SQL DATA
BEGIN
DECLARE result decimal(7, 4);
-- Columns of the row currently fetched from the cursor.
DECLARE row_event_time bigint unsigned;
DECLARE row_event_type enum('state_change', 'downtime_start', 'downtime_end', 'end');
DECLARE row_event_prio int;
DECLARE row_hard_state tinyint unsigned;
DECLARE row_previous_hard_state tinyint unsigned;
-- State in effect since the previously processed event.
DECLARE last_event_time bigint unsigned;
DECLARE last_hard_state tinyint unsigned;
DECLARE active_downtimes int unsigned;
-- Accumulators the result is computed from.
DECLARE problem_time bigint unsigned;
DECLARE total_time bigint unsigned;
DECLARE done int;
DECLARE cur CURSOR FOR
(
-- all downtime_start events before the end of the SLA interval
-- for downtimes that overlap the SLA interval in any way
SELECT
GREATEST(downtime_start, in_start_time) AS event_time,
'downtime_start' AS event_type,
1 AS event_prio,
NULL AS hard_state,
NULL AS previous_hard_state
FROM sla_history_downtime d
WHERE d.host_id = in_host_id
AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id)
AND d.downtime_start < in_end_time
AND d.downtime_end >= in_start_time
) UNION ALL (
-- all downtime_end events before the end of the SLA interval
-- for downtimes that overlap the SLA interval in any way
SELECT
downtime_end AS event_time,
'downtime_end' AS event_type,
2 AS event_prio,
NULL AS hard_state,
NULL AS previous_hard_state
FROM sla_history_downtime d
WHERE d.host_id = in_host_id
AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id)
AND d.downtime_start < in_end_time
AND d.downtime_end >= in_start_time
AND d.downtime_end < in_end_time
) UNION ALL (
-- all state events strictly in interval
SELECT
event_time,
'state_change' AS event_type,
0 AS event_prio,
hard_state,
previous_hard_state
FROM sla_history_state s
WHERE s.host_id = in_host_id
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
AND s.event_time > in_start_time
AND s.event_time < in_end_time
) UNION ALL (
-- end event to keep loop simple, values are not used
SELECT
in_end_time AS event_time,
'end' AS event_type,
3 AS event_prio,
NULL AS hard_state,
NULL AS previous_hard_state
)
-- event_prio breaks ties between events with the same timestamp:
-- state_change (0), downtime_start (1), downtime_end (2), end (3).
ORDER BY event_time, event_prio;
-- Besides the end of the cursor, this handler also fires for the SELECT ... INTO
-- statements below when they match no row, which is why done is reset right before OPEN.
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = 1;
IF in_end_time <= in_start_time THEN
SIGNAL SQLSTATE '45000' SET MESSAGE_TEXT = 'end time must be greater than start time';
END IF;
-- Use the latest event at or before the beginning of the SLA interval as the initial state.
SELECT hard_state INTO last_hard_state
FROM sla_history_state s
WHERE s.host_id = in_host_id
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
AND s.event_time <= in_start_time
ORDER BY s.event_time DESC
LIMIT 1;
-- If this does not exist, use the previous state from the first event after the beginning of the SLA interval.
IF last_hard_state IS NULL THEN
SELECT previous_hard_state INTO last_hard_state
FROM sla_history_state s
WHERE s.host_id = in_host_id
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
AND s.event_time > in_start_time
ORDER BY s.event_time ASC
LIMIT 1;
END IF;
-- If this also does not exist, use the current host/service state.
IF last_hard_state IS NULL THEN
IF in_service_id IS NULL THEN
SELECT hard_state INTO last_hard_state
FROM host_state s
WHERE s.host_id = in_host_id;
ELSE
SELECT hard_state INTO last_hard_state
FROM service_state s
WHERE s.host_id = in_host_id
AND s.service_id = in_service_id;
END IF;
END IF;
-- Without any known state, fall back to state 0, which never counts as problem time below.
IF last_hard_state IS NULL THEN
SET last_hard_state = 0;
END IF;
SET problem_time = 0;
SET total_time = in_end_time - in_start_time;
SET last_event_time = in_start_time;
SET active_downtimes = 0;
-- Must happen after the SELECT ... INTO statements above as they may have set done = 1.
SET done = 0;
OPEN cur;
read_loop: LOOP
FETCH cur INTO row_event_time, row_event_type, row_event_prio, row_hard_state, row_previous_hard_state;
IF done THEN
LEAVE read_loop;
END IF;
-- Attribute the span since the previous event using the state that was in effect
-- during it, i.e. before the current event is applied further down.
-- NOTE(review): 99 presumably denotes the pending state (no check result yet) - confirm.
-- The span preceding a state change away from 99 is removed from the total time.
IF row_previous_hard_state = 99 THEN
SET total_time = total_time - (row_event_time - last_event_time);
ELSEIF ((in_service_id IS NULL AND last_hard_state > 0) OR (in_service_id IS NOT NULL AND last_hard_state > 1))
AND last_hard_state != 99
AND active_downtimes = 0
THEN
SET problem_time = problem_time + row_event_time - last_event_time;
END IF;
-- Apply the current event so that it is in effect for the next span.
SET last_event_time = row_event_time;
IF row_event_type = 'state_change' THEN
SET last_hard_state = row_hard_state;
ELSEIF row_event_type = 'downtime_start' THEN
SET active_downtimes = active_downtimes + 1;
ELSEIF row_event_type = 'downtime_end' THEN
SET active_downtimes = active_downtimes - 1;
END IF;
END LOOP;
CLOSE cur;
SET result = 100 * (total_time - problem_time) / total_time;
RETURN result;
END//
DELIMITER ;
ALTER TABLE hostgroup
DROP INDEX idx_hostroup_name,
ADD INDEX idx_hostgroup_name (name) COMMENT 'Host/service/host group list filtered by host group name';
@ -47,5 +205,52 @@ ALTER TABLE customvar
-- Redefine customvar_flat.flatname as a NOT NULL varchar(512) using the utf8mb4_unicode_ci collation.
ALTER TABLE customvar_flat
MODIFY flatname varchar(512) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT 'Path converted with `.` and `[ ]`';
-- Hard state changes per host/service, kept as input for the SLA calculation
-- (get_sla_ok_percent reads this table). Rows reuse the id of the corresponding state_history
-- row but, per the column comment, may outlive it.
CREATE TABLE sla_history_state (
id binary(20) NOT NULL COMMENT 'state_history.id (may reference already deleted rows)',
environment_id binary(20) NOT NULL COMMENT 'environment.id',
endpoint_id binary(20) DEFAULT NULL COMMENT 'endpoint.id',
object_type enum('host', 'service') NOT NULL,
host_id binary(20) NOT NULL COMMENT 'host.id',
service_id binary(20) DEFAULT NULL COMMENT 'service.id',
event_time bigint unsigned NOT NULL COMMENT 'unix timestamp the event occurred',
hard_state TINYINT UNSIGNED NOT NULL COMMENT 'hard state after this event',
previous_hard_state TINYINT UNSIGNED NOT NULL COMMENT 'hard state before this event',
PRIMARY KEY (id),
-- Matches the SLA queries, which filter by host_id/service_id and order or bound by event_time.
INDEX idx_sla_history_state_event (host_id, service_id, event_time)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC;
-- Backfill sla_history_state from the hard state changes already recorded in state_history.
-- The ON DUPLICATE KEY UPDATE clause assigns id to itself, so a row whose id already exists
-- is left unchanged instead of aborting the statement with a duplicate-key error.
INSERT INTO sla_history_state
(id, environment_id, endpoint_id, object_type, host_id, service_id, event_time, hard_state, previous_hard_state)
SELECT id, environment_id, endpoint_id, object_type, host_id, service_id, event_time, hard_state, previous_hard_state
FROM state_history
WHERE state_type = 'hard'
ON DUPLICATE KEY UPDATE sla_history_state.id = sla_history_state.id;
-- Downtime periods per host/service, kept as input for the SLA calculation.
-- There is one row per downtime (primary key downtime_id) holding its start and end time.
CREATE TABLE sla_history_downtime (
environment_id binary(20) NOT NULL COMMENT 'environment.id',
endpoint_id binary(20) DEFAULT NULL COMMENT 'endpoint.id',
object_type enum('host', 'service') NOT NULL,
host_id binary(20) NOT NULL COMMENT 'host.id',
service_id binary(20) DEFAULT NULL COMMENT 'service.id',
downtime_id binary(20) NOT NULL COMMENT 'downtime.id (may reference already deleted rows)',
downtime_start BIGINT UNSIGNED NOT NULL COMMENT 'start time of the downtime',
downtime_end BIGINT UNSIGNED NOT NULL COMMENT 'end time of the downtime',
PRIMARY KEY (downtime_id),
-- Supports lookups of an object's downtimes by host_id/service_id and start/end time range.
INDEX idx_sla_history_downtime_event (host_id, service_id, downtime_start, downtime_end)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC;
-- Backfill sla_history_downtime from downtime_history.
-- For cancelled downtimes (has_been_cancelled = 'y') the cancel time is stored as the end,
-- otherwise the end_time column is used.
-- The ON DUPLICATE KEY UPDATE clause assigns downtime_id to itself, so a downtime that is
-- already present is left unchanged instead of causing a duplicate-key error.
INSERT INTO sla_history_downtime
(environment_id, endpoint_id, object_type, host_id, service_id, downtime_id, downtime_start, downtime_end)
SELECT environment_id, endpoint_id, object_type, host_id, service_id, downtime_id,
start_time AS downtime_start, IF(has_been_cancelled = 'y', cancel_time, end_time) AS downtime_end
FROM downtime_history
ON DUPLICATE KEY UPDATE sla_history_downtime.downtime_id = sla_history_downtime.downtime_id;
-- Record that the schema has been upgraded to version 3.
-- UNIX_TIMESTAMP() returns the seconds since the Unix epoch, so multiplying by 1000 yields
-- epoch milliseconds (presumably the unit expected by the timestamp column, as the factor
-- suggests). CURRENT_TIMESTAMP() must not be used here: in numeric context it evaluates to
-- a number of the form YYYYMMDDhhmmss, not to an epoch value.
INSERT INTO icingadb_schema (version, TIMESTAMP)
  VALUES (3, UNIX_TIMESTAMP() * 1000);

View file

@ -21,6 +21,146 @@ CREATE TYPE comment_type AS ENUM ( 'comment', 'ack' );
-- Enumerated type listing the notification kinds.
CREATE TYPE notification_type AS ENUM ( 'downtime_start', 'downtime_end', 'downtime_removed', 'custom', 'acknowledgement', 'problem', 'recovery', 'flapping_start', 'flapping_end' );
-- Enumerated type listing the history event kinds.
CREATE TYPE history_type AS ENUM ( 'notification', 'state_change', 'downtime_start', 'downtime_end', 'comment_add', 'comment_remove', 'flapping_start', 'flapping_end', 'ack_set', 'ack_clear' );
-- get_sla_ok_percent computes, for one host or service, the percentage of the interval
-- between in_start_time and in_end_time that does not count as problem time.
--
-- Parameters:
--   in_host_id     host to evaluate
--   in_service_id  service to evaluate, or NULL to evaluate the host itself
--   in_start_time  start of the SLA interval
--   in_end_time    end of the SLA interval, must be greater than in_start_time
--
-- Time counts as problem time while the last hard state is > 0 (host) or > 1 (service),
-- is not 99, and no downtime is active. Time before a state change whose
-- previous_hard_state is 99 is removed from the considered total
-- (NOTE(review): 99 presumably marks a not-yet-checked/pending object - confirm).
--
-- Returns 100 * (total - problem) / total as decimal(7, 4).
-- Raises an exception if in_end_time <= in_start_time.
CREATE OR REPLACE FUNCTION get_sla_ok_percent(
  in_host_id bytea20,
  in_service_id bytea20,
  in_start_time biguint,
  in_end_time biguint
)
RETURNS decimal(7, 4)
LANGUAGE plpgsql
STABLE
PARALLEL RESTRICTED
AS $$
DECLARE
  last_event_time biguint := in_start_time;
  last_hard_state tinyuint;
  active_downtimes uint := 0;
  problem_time biguint := 0;
  total_time biguint;
  row record;
BEGIN
  IF in_end_time <= in_start_time THEN
    RAISE 'end time must be greater than start time';
  END IF;

  total_time := in_end_time - in_start_time;

  -- Use the latest event at or before the beginning of the SLA interval as the initial state.
  SELECT hard_state INTO last_hard_state
  FROM sla_history_state s
  WHERE s.host_id = in_host_id
    AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
    AND s.event_time <= in_start_time
  ORDER BY s.event_time DESC
  LIMIT 1;

  -- If this does not exist, use the previous state from the first event after the beginning of the SLA interval.
  IF last_hard_state IS NULL THEN
    SELECT previous_hard_state INTO last_hard_state
    FROM sla_history_state s
    WHERE s.host_id = in_host_id
      AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
      AND s.event_time > in_start_time
    ORDER BY s.event_time ASC
    LIMIT 1;
  END IF;

  -- If this also does not exist, use the current host/service state.
  IF last_hard_state IS NULL THEN
    IF in_service_id IS NULL THEN
      SELECT hard_state INTO last_hard_state
      FROM host_state s
      WHERE s.host_id = in_host_id;
    ELSE
      SELECT hard_state INTO last_hard_state
      FROM service_state s
      WHERE s.host_id = in_host_id
        AND s.service_id = in_service_id;
    END IF;
  END IF;

  -- Without any known state, fall back to state 0.
  IF last_hard_state IS NULL THEN
    last_hard_state := 0;
  END IF;

  -- Walk through all relevant events in chronological order. For events with the same
  -- time, event_prio decides: state changes first, then downtime starts, downtime ends
  -- and finally the synthetic end event.
  FOR row IN
    (
      -- all downtime_start events before the end of the SLA interval
      -- for downtimes that overlap the SLA interval in any way
      SELECT
        GREATEST(downtime_start, in_start_time) AS event_time,
        'downtime_start' AS event_type,
        1 AS event_prio,
        NULL::tinyuint AS hard_state,
        NULL::tinyuint AS previous_hard_state
      FROM sla_history_downtime d
      WHERE d.host_id = in_host_id
        AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id)
        AND d.downtime_start < in_end_time
        AND d.downtime_end >= in_start_time
    ) UNION ALL (
      -- all downtime_end events before the end of the SLA interval
      -- for downtimes that overlap the SLA interval in any way
      SELECT
        downtime_end AS event_time,
        'downtime_end' AS event_type,
        2 AS event_prio,
        NULL::tinyuint AS hard_state,
        NULL::tinyuint AS previous_hard_state
      FROM sla_history_downtime d
      WHERE d.host_id = in_host_id
        AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id)
        AND d.downtime_start < in_end_time
        AND d.downtime_end >= in_start_time
        AND d.downtime_end < in_end_time
    ) UNION ALL (
      -- all state events strictly in interval
      SELECT
        event_time,
        'state_change' AS event_type,
        0 AS event_prio,
        hard_state,
        previous_hard_state
      FROM sla_history_state s
      WHERE s.host_id = in_host_id
        AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
        AND s.event_time > in_start_time
        AND s.event_time < in_end_time
    ) UNION ALL (
      -- end event to keep loop simple, values are not used
      SELECT
        in_end_time AS event_time,
        'end' AS event_type,
        3 AS event_prio,
        NULL::tinyuint AS hard_state,
        NULL::tinyuint AS previous_hard_state
    )
    ORDER BY event_time, event_prio
  LOOP
    -- Account for the time span between the previous event and this one, using the state
    -- that was in effect before this event is applied.
    IF row.previous_hard_state = 99 THEN
      -- The span ended with a state change away from 99: exclude it from the total.
      total_time := total_time - (row.event_time - last_event_time);
    ELSEIF ((in_service_id IS NULL AND last_hard_state > 0) OR (in_service_id IS NOT NULL AND last_hard_state > 1))
      AND last_hard_state != 99
      AND active_downtimes = 0
    THEN
      problem_time := problem_time + row.event_time - last_event_time;
    END IF;

    -- Apply the event so that it is in effect for the next time span.
    last_event_time := row.event_time;
    IF row.event_type = 'state_change' THEN
      last_hard_state := row.hard_state;
    ELSEIF row.event_type = 'downtime_start' THEN
      active_downtimes := active_downtimes + 1;
    ELSEIF row.event_type = 'downtime_end' THEN
      active_downtimes := active_downtimes - 1;
    END IF;
  END LOOP;

  -- Cast to decimal before dividing: dividing two integer values truncates the quotient,
  -- which would discard the fractional part of the percentage.
  RETURN (100 * (total_time - problem_time)::decimal / total_time)::decimal(7, 4);
END;
$$;
CREATE TABLE host (
id bytea20 NOT NULL,
environment_id bytea20 NOT NULL,
@ -1894,6 +2034,68 @@ COMMENT ON COLUMN history.acknowledgement_history_id IS 'acknowledgement_history
COMMENT ON INDEX idx_history_event_time IS 'History filtered/ordered by event_time';
COMMENT ON INDEX idx_history_host_service_id IS 'Host/service history detail filter';
-- Hard state changes per host/service, kept as input for the SLA calculation
-- (get_sla_ok_percent reads this table).
CREATE TABLE sla_history_state (
id bytea20 NOT NULL,
environment_id bytea20 NOT NULL,
endpoint_id bytea20 DEFAULT NULL,
object_type checkable_type NOT NULL,
host_id bytea20 NOT NULL,
service_id bytea20 DEFAULT NULL,
event_time biguint NOT NULL,
hard_state tinyuint NOT NULL,
previous_hard_state tinyuint NOT NULL,
CONSTRAINT pk_sla_history_state PRIMARY KEY (id)
);
-- Store the ID columns inline and uncompressed (PLAIN storage disables TOAST for them).
ALTER TABLE sla_history_state ALTER COLUMN id SET STORAGE PLAIN;
ALTER TABLE sla_history_state ALTER COLUMN environment_id SET STORAGE PLAIN;
ALTER TABLE sla_history_state ALTER COLUMN endpoint_id SET STORAGE PLAIN;
ALTER TABLE sla_history_state ALTER COLUMN host_id SET STORAGE PLAIN;
ALTER TABLE sla_history_state ALTER COLUMN service_id SET STORAGE PLAIN;
-- Matches the SLA queries, which filter by host_id/service_id and order or bound by event_time.
CREATE INDEX idx_sla_history_state_event ON sla_history_state(host_id, service_id, event_time);
COMMENT ON COLUMN sla_history_state.id IS 'state_history.id (may reference already deleted rows)';
COMMENT ON COLUMN sla_history_state.environment_id IS 'environment.id';
COMMENT ON COLUMN sla_history_state.endpoint_id IS 'endpoint.id';
COMMENT ON COLUMN sla_history_state.host_id IS 'host.id';
COMMENT ON COLUMN sla_history_state.service_id IS 'service.id';
COMMENT ON COLUMN sla_history_state.event_time IS 'unix timestamp the event occurred';
COMMENT ON COLUMN sla_history_state.hard_state IS 'hard state after this event';
COMMENT ON COLUMN sla_history_state.previous_hard_state IS 'hard state before this event';
-- Downtime periods per host/service, kept as input for the SLA calculation
-- (get_sla_ok_percent reads this table). One row per downtime, keyed by downtime_id.
CREATE TABLE sla_history_downtime (
environment_id bytea20 NOT NULL,
endpoint_id bytea20 DEFAULT NULL,
object_type checkable_type NOT NULL,
host_id bytea20 NOT NULL,
service_id bytea20 DEFAULT NULL,
downtime_id bytea20 NOT NULL,
downtime_start biguint NOT NULL,
downtime_end biguint NOT NULL,
CONSTRAINT pk_sla_history_downtime PRIMARY KEY (downtime_id)
);
-- Store the ID columns inline and uncompressed (PLAIN storage disables TOAST for them).
ALTER TABLE sla_history_downtime ALTER COLUMN environment_id SET STORAGE PLAIN;
ALTER TABLE sla_history_downtime ALTER COLUMN endpoint_id SET STORAGE PLAIN;
ALTER TABLE sla_history_downtime ALTER COLUMN host_id SET STORAGE PLAIN;
ALTER TABLE sla_history_downtime ALTER COLUMN service_id SET STORAGE PLAIN;
ALTER TABLE sla_history_downtime ALTER COLUMN downtime_id SET STORAGE PLAIN;
-- Matches the SLA queries, which filter by host_id/service_id and by downtime start/end time.
CREATE INDEX idx_sla_history_downtime_event ON sla_history_downtime(host_id, service_id, downtime_start, downtime_end);
COMMENT ON COLUMN sla_history_downtime.environment_id IS 'environment.id';
COMMENT ON COLUMN sla_history_downtime.endpoint_id IS 'endpoint.id';
COMMENT ON COLUMN sla_history_downtime.host_id IS 'host.id';
COMMENT ON COLUMN sla_history_downtime.service_id IS 'service.id';
COMMENT ON COLUMN sla_history_downtime.downtime_id IS 'downtime.id (may reference already deleted rows)';
COMMENT ON COLUMN sla_history_downtime.downtime_start IS 'start time of the downtime';
COMMENT ON COLUMN sla_history_downtime.downtime_end IS 'end time of the downtime';
CREATE SEQUENCE icingadb_schema_id_seq;
CREATE TABLE icingadb_schema (

View file

@ -9,6 +9,7 @@ import (
"github.com/jmoiron/sqlx"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"strings"
"testing"
"time"
)
@ -22,7 +23,8 @@ func TestCleanupAndRetention(t *testing.T) {
t.Cleanup(func() { _ = db.Close() })
reten := retention{
Days: 7,
HistoryDays: 7,
SlaDays: 30,
Options: map[string]int{
"acknowledgement": 0, // No cleanup.
"comment": 1,
@ -31,6 +33,16 @@ func TestCleanupAndRetention(t *testing.T) {
},
}
daysForCategory := func(category string) int {
if strings.HasPrefix(category, "sla_") {
return reten.SlaDays
} else if d, ok := reten.Options[category]; ok {
return d
} else {
return reten.HistoryDays
}
}
rowsToDelete := 10000
rowsToSpare := 1000
@ -38,11 +50,7 @@ func TestCleanupAndRetention(t *testing.T) {
err := dropNotNullColumns(db, stmt)
assert.NoError(t, err)
retentionDays, ok := reten.Options[category]
if !ok {
retentionDays = reten.Days
}
retentionDays := daysForCategory(category)
start := time.Now().AddDate(0, 0, -retentionDays).Add(-1 * time.Millisecond * time.Duration(rowsToDelete))
startMilli := start.UnixMilli()
@ -75,18 +83,14 @@ func TestCleanupAndRetention(t *testing.T) {
i.Reload()
waitForDumpDoneSignal(t, r, 20*time.Second, 100*time.Millisecond)
config, err := yaml.Marshal(struct {
Retention retention `yaml:"history-retention"`
Retention retention `yaml:"retention"`
}{reten})
assert.NoError(t, err)
it.IcingaDbInstanceT(t, r, rdb, services.WithIcingaDbConfig(string(config)))
eventually.Assert(t, func(t require.TestingT) {
for category, stmt := range retentionStatements {
retentionDays, ok := reten.Options[category]
if !ok {
retentionDays = reten.Days
}
retentionDays := daysForCategory(category)
threshold := time.Now().AddDate(0, 0, -retentionDays)
thresholdMilli := threshold.UnixMilli()
@ -106,10 +110,10 @@ func TestCleanupAndRetention(t *testing.T) {
if retentionDays == 0 {
// No cleanup.
assert.Equal(t, rowsToDelete+rowsToSpare, rowsLeft+rowsSpared, "all rows should still be there")
assert.Equal(t, rowsToDelete+rowsToSpare, rowsLeft+rowsSpared, "all rows should still be there for %s", category)
} else {
assert.Equal(t, 0, rowsLeft, "rows left in retention period")
assert.Equal(t, rowsToSpare, rowsSpared, "rows spared")
assert.Equal(t, 0, rowsLeft, "rows left in retention period for %s", category)
assert.Equal(t, rowsToSpare, rowsSpared, "rows spared for %s", category)
}
}
}, time.Minute, time.Second)
@ -122,8 +126,9 @@ type cleanupStmt struct {
}
type retention struct {
Days int `yaml:"days"`
Options map[string]int `yaml:"options"`
HistoryDays int `yaml:"history-days"`
SlaDays int `yaml:"sla-days"`
Options map[string]int `yaml:"options"`
}
var retentionStatements = map[string]cleanupStmt{
@ -157,6 +162,16 @@ var retentionStatements = map[string]cleanupStmt{
PK: "id",
Column: "event_time",
},
"sla_downtime": {
Table: "sla_history_downtime",
PK: "downtime_id",
Column: "downtime_end",
},
"sla_state": {
Table: "sla_history_state",
PK: "id",
Column: "event_time",
},
}
// dropNotNullColumns drops all columns with a NOT NULL constraint that are not

View file

@ -5,10 +5,12 @@ go 1.16
require (
github.com/containerd/containerd v1.5.6 // indirect
github.com/go-redis/redis/v8 v8.11.4
github.com/go-sql-driver/mysql v1.6.0
github.com/goccy/go-yaml v1.9.5
github.com/google/uuid v1.3.0
github.com/icinga/icinga-testing v0.0.0-20220503150619-1c215361234c
github.com/icinga/icinga-testing v0.0.0-20220513095329-9c98d3145b01
github.com/jmoiron/sqlx v1.3.4
github.com/lib/pq v1.10.5
github.com/stretchr/testify v1.7.0
go.uber.org/zap v1.21.0
golang.org/x/net v0.0.0-20211020060615-d418f374d309 // indirect

View file

@ -37,6 +37,8 @@ github.com/Azure/go-autorest/logger v0.2.0/go.mod h1:T9E3cAhj2VqvPOtCYAvby9aBXkZ
github.com/Azure/go-autorest/tracing v0.6.0/go.mod h1:+vhtPC754Xsa23ID7GlGsrdKBpUA79WCAKPPZVC2DeU=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/Icinga/go-libs v0.0.0-20220420130327-ef58ad52edd8 h1:hG4Y/LPERK9i+P8/jnYlq9PeDd9deIkwEWOIimDU3uk=
github.com/Icinga/go-libs v0.0.0-20220420130327-ef58ad52edd8/go.mod h1:xlgU55MKs/vIg1fMlAEBSrslahYayZNwjXvf3w1dvyA=
github.com/Microsoft/go-winio v0.4.11/go.mod h1:VhR8bwka0BXejwEJY73c50VrPtXAaKcyvVC4A4RozmA=
github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jBhyzoq1bpyYA=
github.com/Microsoft/go-winio v0.4.15-0.20190919025122-fc70bd9a86b5/go.mod h1:tTuCMEN+UleMWgg9dVx4Hu52b1bJo+59jBh3ajtinzw=
@ -60,6 +62,7 @@ github.com/Microsoft/hcsshim/test v0.0.0-20201218223536-d3e5debf77da/go.mod h1:5
github.com/Microsoft/hcsshim/test v0.0.0-20210227013316-43a75bb4edd3/go.mod h1:mw7qgWloBUl75W/gVH3cQszUg1+gUITj7D6NY7ywVnY=
github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ=
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ=
@ -68,6 +71,7 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuy
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/alexflint/go-filemutex v0.0.0-20171022225611-72bdc8eae2ae/go.mod h1:CgnQgUtFrFz9mxFNtED3jI5tLDjKlOM+oUF/sTk6ps0=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
@ -402,8 +406,8 @@ github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/icinga/icinga-testing v0.0.0-20220503150619-1c215361234c h1:jJVPK9dvsZ99Xl/xDOBM+DEj+7i7En51/04ckc/lTZc=
github.com/icinga/icinga-testing v0.0.0-20220503150619-1c215361234c/go.mod h1:W9pLmq2dsgLSag568N/LDHNu4oah6qWvjT05Drz2RYw=
github.com/icinga/icinga-testing v0.0.0-20220513095329-9c98d3145b01 h1:0dwlZFGWPnmmhvHr2P7chxMwzbW7+R3iX6SyeFBd+WM=
github.com/icinga/icinga-testing v0.0.0-20220513095329-9c98d3145b01/go.mod h1:ZP0pyqhmrRwwQ6FpAfz7UZMgmH7i3vOjEOm9JcFwOw0=
github.com/imdario/mergo v0.3.5/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA=
github.com/imdario/mergo v0.3.8/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA=
github.com/imdario/mergo v0.3.10/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA=
@ -444,8 +448,9 @@ github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/leodido/go-urn v1.2.0 h1:hpXL4XnriNwQ/ABnpepYM/1vCLWNDfUNts8dX3xTG6Y=
github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII=
github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/lib/pq v1.10.4 h1:SO9z7FRPzA03QhHKJrH5BXA6HU1rS4V2nIVrrNC1iYk=
github.com/lib/pq v1.10.4/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/lib/pq v1.10.5 h1:J+gdV2cUmX7ZqL2B0lFcW0m+egaHC2V3lpO8nWxyYiQ=
github.com/lib/pq v1.10.5/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
@ -459,6 +464,7 @@ github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHX
github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU=
github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
github.com/mattn/go-shellwords v1.0.3/go.mod h1:3xCvwCdWdlDJUrvuMn7Wuy9eWs4pE8vqg+NOMyg4B2o=
github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus=
github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRUbg=
github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
@ -725,6 +731,7 @@ golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzB
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=

View file

@ -0,0 +1,28 @@
package utils
import (
"fmt"
"github.com/icinga/icinga-testing"
"github.com/icinga/icinga-testing/services"
"os"
"strings"
"testing"
)
// GetDatabase returns the relational database service selected by the
// ICINGADB_TESTS_DATABASE_TYPE environment variable. The value is compared
// case-insensitively: "mysql" yields it.MysqlDatabaseT(t) and "pgsql" yields
// it.PostgresqlDatabaseT(t). Any other value, including an unset variable,
// makes the function panic.
func GetDatabase(it *icingatesting.IT, t testing.TB) services.RelationalDatabase {
	const envKey = "ICINGADB_TESTS_DATABASE_TYPE"

	switch dbType := strings.ToLower(os.Getenv(envKey)); dbType {
	case "mysql":
		return it.MysqlDatabaseT(t)
	case "pgsql":
		return it.PostgresqlDatabaseT(t)
	default:
		panic(fmt.Sprintf(`unknown database in %s environment variable: %q (must be "mysql" or "pgsql")`, envKey, dbType))
	}
}

View file

@ -1,10 +1,9 @@
package icingadb_test
import (
"fmt"
"github.com/icinga/icinga-testing"
"github.com/icinga/icinga-testing/services"
"os"
"github.com/icinga/icingadb/tests/internal/utils"
"testing"
)
@ -26,19 +25,5 @@ func getDatabase(t testing.TB) services.RelationalDatabase {
}
func getEmptyDatabase(t testing.TB) services.RelationalDatabase {
k := "ICINGADB_TESTS_DATABASE_TYPE"
v := os.Getenv(k)
var rdb services.RelationalDatabase
switch v {
case "mysql":
rdb = it.MysqlDatabaseT(t)
case "pgsql":
rdb = it.PostgresqlDatabaseT(t)
default:
panic(fmt.Sprintf(`unknown database in %s environment variable: %q (must be "mysql" or "pgsql")`, k, v))
}
return rdb
return utils.GetDatabase(it, t)
}

385
tests/sla_test.go Normal file
View file

@ -0,0 +1,385 @@
package icingadb_test
import (
"bytes"
"context"
"encoding/json"
"fmt"
"github.com/icinga/icinga-testing/utils"
"github.com/icinga/icinga-testing/utils/eventually"
"github.com/jmoiron/sqlx"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/zap"
"math"
"net/http"
"testing"
"time"
)
func TestSla(t *testing.T) {
m := it.MysqlDatabaseT(t)
m.ImportIcingaDbSchema()
r := it.RedisServerT(t)
i := it.Icinga2NodeT(t, "master")
i.EnableIcingaDb(r)
err := i.Reload()
require.NoError(t, err, "icinga2 should reload without error")
it.IcingaDbInstanceT(t, r, m)
client := i.ApiClient()
t.Run("StateEvents", func(t *testing.T) {
t.Parallel()
hostname := utils.UniqueName(t, "host")
client.CreateHost(t, hostname, map[string]interface{}{
"attrs": map[string]interface{}{
"enable_active_checks": false,
"enable_passive_checks": true,
"check_command": "dummy",
"max_check_attempts": 3,
},
})
type StateChange struct {
Time float64
State int
}
var stateChanges []StateChange
processCheckResult := func(exitStatus int, isHard bool) *ObjectsHostsResponse {
time.Sleep(10 * time.Millisecond) // ensure there is a bit of difference in ms resolution
output := utils.UniqueName(t, "output")
data := ActionsProcessCheckResultRequest{
Type: "Host",
Filter: fmt.Sprintf(`host.name==%q`, hostname),
ExitStatus: exitStatus,
PluginOutput: output,
}
dataJson, err := json.Marshal(data)
require.NoError(t, err, "marshal request")
response, err := client.PostJson("/v1/actions/process-check-result", bytes.NewBuffer(dataJson))
require.NoError(t, err, "process-check-result")
require.Equal(t, 200, response.StatusCode, "process-check-result")
response, err = client.GetJson("/v1/objects/hosts/" + hostname)
require.NoError(t, err, "get host: request")
require.Equal(t, 200, response.StatusCode, "get host: request")
var hosts ObjectsHostsResponse
err = json.NewDecoder(response.Body).Decode(&hosts)
require.NoError(t, err, "get host: parse response")
require.Equal(t, 1, len(hosts.Results), "there must be one host in the response")
host := hosts.Results[0]
require.Equal(t, output, host.Attrs.LastCheckResult.Output,
"last check result should be visible in host object")
require.Equal(t, exitStatus, host.Attrs.State, "soft state should match check result")
if isHard {
require.Equal(t, exitStatus, host.Attrs.LastHardState, "hard state should match check result")
if len(stateChanges) > 0 {
require.Greater(t, host.Attrs.LastHardStateChange, stateChanges[len(stateChanges)-1].Time,
"last_hard_state_change_time of host should have changed")
}
stateChanges = append(stateChanges, StateChange{
Time: host.Attrs.LastHardStateChange,
State: exitStatus,
})
} else {
require.NotEmpty(t, stateChanges, "there should be a hard state change prior to a soft one")
require.Equal(t, stateChanges[len(stateChanges)-1].Time, host.Attrs.LastHardStateChange,
"check result should not lead to a hard state change, i.e. last_hard_state_change should not change")
}
return &hosts
}
processCheckResult(0, true) // hard (UNKNOWN -> UP)
processCheckResult(1, false) // soft
processCheckResult(1, false) // soft
processCheckResult(1, true) // hard (UP -> DOWN)
processCheckResult(1, false) // hard
processCheckResult(0, true) // hard (DOWN -> UP)
processCheckResult(0, false) // hard
assert.Equal(t, 3, len(stateChanges), "there should be three hard state changes")
db, err := sqlx.Connect("mysql", m.DSN())
require.NoError(t, err, "connecting to mysql")
defer func() { _ = db.Close() }()
type Row struct {
Time int64 `db:"event_time"`
State int `db:"hard_state"`
}
eventually.Assert(t, func(t require.TestingT) {
var rows []Row
err = db.Select(&rows, db.Rebind("SELECT s.event_time, s.hard_state FROM sla_history_state s "+
"JOIN host ON host.id = s.host_id WHERE host.name = ? ORDER BY event_time ASC"), hostname)
require.NoError(t, err, "select sla_history_state")
assert.Equal(t, len(stateChanges), len(rows), "number of sla_history_state entries")
for i := range rows {
assert.WithinDuration(t, time.UnixMilli(int64(stateChanges[i].Time*1000)), time.UnixMilli(rows[i].Time),
time.Millisecond, "event time should match state change time")
assert.Equal(t, stateChanges[i].State, rows[i].State, "hard state should match")
}
}, 5*time.Second, 200*time.Millisecond)
redis := r.Open()
defer func() { _ = redis.Close() }()
logger := it.Logger(t)
logger.Debug("redis state history", zap.Bool("before", true))
eventually.Assert(t, func(t require.TestingT) {
result, err := redis.XRange(context.Background(), "icinga:history:stream:state", "-", "+").Result()
require.NoError(t, err, "reading state history stream should not fail")
logger.Debug("redis state history", zap.Any("values", result))
assert.Empty(t, result, "redis state history stream should be drained")
}, 5*time.Second, 10*time.Millisecond)
logger.Debug("redis state history", zap.Bool("after", true))
})
t.Run("DowntimeEvents", func(t *testing.T) {
t.Parallel()
type Options struct {
Fixed bool // Whether to schedule a fixed or flexible downtime.
Cancel bool // Whether to cancel the downtime or let it expire.
}
downtimeTest := func(t *testing.T, o Options) {
hostname := utils.UniqueName(t, "host")
client.CreateHost(t, hostname, map[string]interface{}{
"attrs": map[string]interface{}{
"enable_active_checks": false,
"enable_passive_checks": true,
"check_command": "dummy",
"max_check_attempts": 1,
},
})
processCheckResult := func(status int) time.Time {
output := utils.RandomString(8)
reqBody, err := json.Marshal(ActionsProcessCheckResultRequest{
Type: "Host",
Filter: fmt.Sprintf(`host.name==%q`, hostname),
ExitStatus: status,
PluginOutput: output,
})
require.NoError(t, err, "marshal request")
response, err := client.PostJson("/v1/actions/process-check-result", bytes.NewBuffer(reqBody))
require.NoError(t, err, "process-check-result")
require.Equal(t, 200, response.StatusCode, "process-check-result")
response, err = client.GetJson("/v1/objects/hosts/" + hostname)
require.NoError(t, err, "get host: request")
require.Equal(t, 200, response.StatusCode, "get host: request")
var hosts ObjectsHostsResponse
err = json.NewDecoder(response.Body).Decode(&hosts)
require.NoError(t, err, "get host: parse response")
require.Equal(t, 1, len(hosts.Results), "there must be one host in the response")
host := hosts.Results[0]
require.Equal(t, output, host.Attrs.LastCheckResult.Output,
"last check result should be visible in host object")
require.Equal(t, 1, host.Attrs.StateType, "host should be in hard state")
require.Equal(t, status, host.Attrs.State, "state should match check result")
sec, nsec := math.Modf(host.Attrs.LastCheckResult.ExecutionEnd)
return time.Unix(int64(sec), int64(nsec*1e9))
}
// Ensure that host is in UP state.
processCheckResult(0)
refTime := time.Now().Truncate(time.Second)
// Schedule the downtime start in the past so that we would notice if Icinga 2/DB would
// use the current time somewhere where we expect the scheduled start time.
downtimeStart := refTime.Add(-1 * time.Hour)
var downtimeEnd time.Time
if o.Cancel || !o.Fixed {
// Downtimes we will cancel can expire long in the future as we don't have to wait for it.
// Same for flexible downtimes as for these, we don't have to wait until the scheduled end but only
// for their duration.
downtimeEnd = refTime.Add(1 * time.Hour)
} else {
// Let all other downtimes expire soon (fixed downtimes where we wait for expiry).
downtimeEnd = refTime.Add(5 * time.Second)
}
var duration time.Duration
if !o.Fixed {
duration = 10 * time.Second
}
req, err := json.Marshal(ActionsScheduleDowntimeRequest{
Type: "Host",
Filter: fmt.Sprintf(`host.name==%q`, hostname),
StartTime: downtimeStart.Unix(),
EndTime: downtimeEnd.Unix(),
Fixed: o.Fixed,
Duration: duration.Seconds(),
Author: utils.RandomString(8),
Comment: utils.RandomString(8),
})
require.NoError(t, err, "marshal request")
response, err := client.PostJson("/v1/actions/schedule-downtime", bytes.NewBuffer(req))
require.NoError(t, err, "schedule-downtime")
require.Equal(t, 200, response.StatusCode, "schedule-downtime")
var scheduleResponse ActionsScheduleDowntimeResponse
err = json.NewDecoder(response.Body).Decode(&scheduleResponse)
require.NoError(t, err, "decode schedule-downtime response")
require.Equal(t, 1, len(scheduleResponse.Results), "schedule-downtime should return 1 result")
require.Equal(t, http.StatusOK, scheduleResponse.Results[0].Code, "schedule-downtime should return 1 result")
downtimeName := scheduleResponse.Results[0].Name
type Row struct {
Start int64 `db:"downtime_start"`
End int64 `db:"downtime_end"`
}
db, err := sqlx.Connect("mysql", m.DSN())
require.NoError(t, err, "connecting to mysql")
defer func() { _ = db.Close() }()
if !o.Fixed {
// Give Icinga 2 and Icinga DB some time that if they would generate an SLA history event in error,
// they have a chance to do so before we check for its absence.
time.Sleep(10 * time.Second)
var count int
err = db.Get(&count, db.Rebind("SELECT COUNT(*) FROM sla_history_downtime s "+
"JOIN host ON host.id = s.host_id WHERE host.name = ?"), hostname)
require.NoError(t, err, "select sla_history_state")
assert.Zero(t, count, "there should be no event in sla_history_downtime when scheduling a flexible downtime on an UP host")
}
// Bring host into DOWN state.
criticalTime := processCheckResult(1)
eventually.Assert(t, func(t require.TestingT) {
var rows []Row
err = db.Select(&rows, db.Rebind("SELECT s.downtime_start, s.downtime_end FROM sla_history_downtime s "+
"JOIN host ON host.id = s.host_id WHERE host.name = ?"), hostname)
require.NoError(t, err, "select sla_history_state")
require.Equal(t, 1, len(rows), "there should be exactly one sla_history_downtime row")
if o.Fixed {
assert.Equal(t, downtimeStart, time.UnixMilli(rows[0].Start),
"downtime_start should match scheduled start time")
assert.Equal(t, downtimeEnd, time.UnixMilli(rows[0].End),
"downtime_end should match scheduled end time")
} else {
assert.WithinDuration(t, criticalTime, time.UnixMilli(rows[0].Start), time.Second,
"downtime_start should match time of host state change")
assert.Equal(t, duration, time.UnixMilli(rows[0].End).Sub(time.UnixMilli(rows[0].Start)),
"downtime_end - downtime_start duration should match scheduled duration")
}
}, 5*time.Second, 200*time.Millisecond)
redis := r.Open()
defer func() { _ = redis.Close() }()
eventually.Assert(t, func(t require.TestingT) {
result, err := redis.XRange(context.Background(), "icinga:history:stream:downtime", "-", "+").Result()
require.NoError(t, err, "reading downtime history stream should not fail")
assert.Empty(t, result, "redis downtime history stream should be drained")
}, 5*time.Second, 10*time.Millisecond)
if o.Cancel {
req, err = json.Marshal(ActionsRemoveDowntimeRequest{
Downtime: downtimeName,
})
require.NoError(t, err, "marshal remove-downtime request")
response, err = client.PostJson("/v1/actions/remove-downtime", bytes.NewBuffer(req))
require.NoError(t, err, "remove-downtime")
require.Equal(t, 200, response.StatusCode, "remove-downtime")
}
downtimeCancel := time.Now()
if !o.Cancel {
// Wait for downtime to expire + a few extra seconds. The row should not be updated, give
// enough time to have a chance catching if Icinga DB updates it nonetheless.
if !o.Fixed {
time.Sleep(duration + 5*time.Second)
} else {
d := time.Until(downtimeEnd) + 5*time.Second
require.Less(t, d, time.Minute, "bug in tests: don't wait too long")
time.Sleep(d)
}
}
eventually.Assert(t, func(t require.TestingT) {
var rows []Row
err = db.Select(&rows, db.Rebind("SELECT s.downtime_start, s.downtime_end FROM sla_history_downtime s "+
"JOIN host ON host.id = s.host_id WHERE host.name = ?"), hostname)
require.NoError(t, err, "select sla_history_state")
require.Equal(t, 1, len(rows), "there should be exactly one sla_history_downtime row")
if o.Fixed {
assert.Equal(t, downtimeStart, time.UnixMilli(rows[0].Start),
"downtime_start should match scheduled start")
} else {
assert.WithinDuration(t, criticalTime, time.UnixMilli(rows[0].Start), time.Second,
"downtime_start should match critical time")
}
if o.Cancel {
// Allow more delta for the end time after cancel as we did not choose the exact time.
assert.WithinDuration(t, downtimeCancel, time.UnixMilli(rows[0].End), time.Second,
"downtime_end should match cancel time")
} else if o.Fixed {
assert.Equal(t, downtimeEnd, time.UnixMilli(rows[0].End),
"downtime_start should match scheduled end")
} else {
assert.Equal(t, duration, time.UnixMilli(rows[0].End).Sub(time.UnixMilli(rows[0].Start)),
"downtime_end - downtime_start duration should match scheduled duration")
}
}, 5*time.Second, 200*time.Millisecond)
eventually.Assert(t, func(t require.TestingT) {
result, err := redis.XRange(context.Background(), "icinga:history:stream:downtime", "-", "+").Result()
require.NoError(t, err, "reading downtime history stream should not fail")
assert.Empty(t, result, "redis downtime history stream should be drained")
}, 5*time.Second, 10*time.Millisecond)
}
t.Run("Fixed", func(t *testing.T) {
t.Parallel()
t.Run("Cancel", func(t *testing.T) {
t.Parallel()
downtimeTest(t, Options{Fixed: true, Cancel: true})
})
t.Run("Expire", func(t *testing.T) {
t.Parallel()
downtimeTest(t, Options{Fixed: true, Cancel: false})
})
})
t.Run("Flexible", func(t *testing.T) {
t.Parallel()
t.Run("Cancel", func(t *testing.T) {
t.Parallel()
downtimeTest(t, Options{Fixed: false, Cancel: true})
})
t.Run("Expire", func(t *testing.T) {
t.Parallel()
downtimeTest(t, Options{Fixed: false, Cancel: false})
})
})
})
}

23
tests/sql/main_test.go Normal file
View file

@ -0,0 +1,23 @@
package sql_test
import (
"github.com/icinga/icinga-testing"
"github.com/icinga/icinga-testing/services"
"github.com/icinga/icingadb/tests/internal/utils"
"testing"
)
// it is the package-wide icinga-testing instance. It is assigned in TestMain and passed to
// utils.GetDatabase by getDatabase.
var it *icingatesting.IT
// TestMain creates the icinga-testing instance stored in the package-level variable it, runs the
// tests of this package and cleans the instance up afterwards.
func TestMain(m *testing.M) {
it = icingatesting.NewIT()
// Deferred so that Cleanup runs after m.Run has returned.
defer it.Cleanup()
// The result of m.Run is intentionally not passed to os.Exit: os.Exit would skip the deferred
// cleanup. NOTE(review): this relies on Go >= 1.15, where the test binary exits with the result
// of m.Run once TestMain returns.
m.Run()
}
// getDatabase obtains a relational database for the given test via utils.GetDatabase and imports
// the Icinga DB schema into it before handing it to the caller.
func getDatabase(t testing.TB) services.RelationalDatabase {
	database := utils.GetDatabase(it, t)
	database.ImportIcingaDbSchema()

	return database
}

406
tests/sql/sla_test.go Normal file
View file

@ -0,0 +1,406 @@
package sql_test
import (
"crypto/rand"
"database/sql/driver"
"fmt"
"github.com/go-sql-driver/mysql"
"github.com/jmoiron/sqlx"
"github.com/lib/pq"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"testing"
)
// TestSla exercises the SQL function get_sla_ok_percent.
//
// For every entry of the test table, the listed events are written to the database for a freshly
// generated host and service (see testSla) and the value returned by the SQL function for the
// interval Start..End is compared with Expected. The "Invalid" subtests verify that the function
// rejects intervals whose end is not greater than their start with a database-specific error.
func TestSla(t *testing.T) {
	rdb := getDatabase(t)

	db, err := sqlx.Open(rdb.Driver(), rdb.DSN())
	require.NoError(t, err, "connect to database")
	// Release the connection pool when the test (including all subtests) is done.
	defer func() { _ = db.Close() }()

	// TestData describes a single SLA scenario.
	type TestData struct {
		Name     string            // Name of the subtest.
		Events   []SlaHistoryEvent // Events written to the database before querying the SLA.
		Start    uint64            // Start of the SLA interval passed to the SQL function.
		End      uint64            // End of the SLA interval passed to the SQL function.
		Expected float64           // Expected SLA value in percent.
	}

	tests := []TestData{{
		Name: "EmptyHistory",
		// Empty history implies no previous problem state, therefore SLA should be 100%
		Events:   nil,
		Start:    1000,
		End:      2000,
		Expected: 100.0,
	}, {
		Name: "MultipleStateChanges",
		// Some flapping, test that all changes are considered.
		Events: []SlaHistoryEvent{
			&State{Time: 1000, State: 2, PreviousState: 99}, // -10%
			&State{Time: 1100, State: 0, PreviousState: 2},
			&State{Time: 1300, State: 2, PreviousState: 0}, // -10%
			&State{Time: 1400, State: 0, PreviousState: 2},
			&State{Time: 1600, State: 2, PreviousState: 0}, // -10%
			&State{Time: 1700, State: 0, PreviousState: 2},
			&State{Time: 1900, State: 2, PreviousState: 0}, // -10%
		},
		Start:    1000,
		End:      2000,
		Expected: 60.0,
	}, {
		Name: "OverlappingDowntimesAndProblems",
		// SLA should be 90%:
		//  1000..1100: OK, no downtime
		//  1100..1200: OK, in downtime
		//  1200..1300: CRITICAL, in downtime
		//  1300..1400: CRITICAL, no downtime (only period counting for SLA, -10%)
		//  1400..1500: CRITICAL, in downtime
		//  1500..1600: OK, in downtime
		//  1600..2000: OK, no downtime
		Events: []SlaHistoryEvent{
			&Downtime{Start: 1100, End: 1300},
			&Downtime{Start: 1400, End: 1600},
			&State{Time: 1200, State: 2, PreviousState: 0},
			&State{Time: 1500, State: 0, PreviousState: 2},
		},
		Start:    1000,
		End:      2000,
		Expected: 90.0,
	}, {
		Name: "CriticalBeforeInterval",
		// If there is no event within the SLA interval, the last state from before the interval should be used.
		Events: []SlaHistoryEvent{
			&State{Time: 0, State: 2, PreviousState: 99},
		},
		Start:    1000,
		End:      2000,
		Expected: 0.0,
	}, {
		Name: "CriticalBeforeIntervalWithDowntime",
		// State change and downtime start from before the SLA interval should be considered if still relevant.
		Events: []SlaHistoryEvent{
			&State{Time: 800, State: 2, PreviousState: 99},
			&Downtime{Start: 600, End: 1800},
		},
		Start:    1000,
		End:      2000,
		Expected: 80.0,
	}, {
		Name: "CriticalBeforeIntervalWithOverlappingDowntimes",
		// Test that overlapping downtimes are properly accounted for.
		Events: []SlaHistoryEvent{
			&State{Time: 800, State: 2, PreviousState: 99},
			&Downtime{Start: 600, End: 1000},
			&Downtime{Start: 800, End: 1200},
			&Downtime{Start: 1000, End: 1400},
			// Everything except 1400-1600 is covered by downtimes, -20%
			&Downtime{Start: 1600, End: 2000},
			&Downtime{Start: 1800, End: 2200},
		},
		Start:    1000,
		End:      2000,
		Expected: 80.0,
	}, {
		Name: "FallbackToPreviousState",
		// If there is no state event from before the SLA interval, the previous hard state from the first event
		// after the beginning of the SLA interval should be used as the initial state.
		Events: []SlaHistoryEvent{
			&State{Time: 1200, State: 0, PreviousState: 2},
		},
		Start:    1000,
		End:      2000,
		Expected: 80.0,
	}, {
		Name: "FallbackToCurrentState",
		// If there are no state history events, the current state of the checkable should be used.
		Events: []SlaHistoryEvent{
			&CurrentState{State: 2},
		},
		Start:    1000,
		End:      2000,
		Expected: 0.0,
	}, {
		Name: "PreferInitialStateFromBeforeOverLaterState",
		// The previous_hard_state should only be used as a fallback when there is no event from before the
		// SLA interval. Therefore, the latter should be preferred if there is conflicting information.
		Events: []SlaHistoryEvent{
			&State{Time: 800, State: 2, PreviousState: 99},
			&State{Time: 1200, State: 0, PreviousState: 0},
		},
		Start:    1000,
		End:      2000,
		Expected: 80.0,
	}, {
		Name: "PreferInitialStateFromBeforeOverCurrentState",
		// The current state should only be used as a fallback when there is no state history event.
		// Therefore, the latter should be preferred if there is conflicting information.
		Events: []SlaHistoryEvent{
			&State{Time: 800, State: 2, PreviousState: 99},
			&CurrentState{State: 0},
		},
		Start:    1000,
		End:      2000,
		Expected: 0.0,
	}, {
		Name: "PreferLaterStateOverCurrentState",
		// The current state should only be used as a fallback when there is no state history event.
		// Therefore, the latter should be preferred if there is conflicting information.
		Events: []SlaHistoryEvent{
			&State{Time: 1200, State: 0, PreviousState: 2},
			&CurrentState{State: 2},
		},
		Start:    1000,
		End:      2000,
		Expected: 80.0,
	}, {
		Name: "InitialUnknownReducesTotalTime",
		// The expected value is consistent with 1000..1500 (before the first event, whose previous
		// state is 99) not counting towards the total: 300 OK out of the remaining 500.
		Events: []SlaHistoryEvent{
			&State{Time: 1500, State: 2, PreviousState: 99},
			&State{Time: 1700, State: 0, PreviousState: 2},
			&CurrentState{State: 0},
		},
		Start:    1000,
		End:      2000,
		Expected: 60,
	}, {
		Name: "IntermediateUnknownReducesTotalTime",
		// The expected value is consistent with 1100..1600 (ended by an event whose previous state
		// is 99) not counting towards the total: 300 OK out of the remaining 500.
		Events: []SlaHistoryEvent{
			&State{Time: 1000, State: 0, PreviousState: 2},
			&State{Time: 1100, State: 2, PreviousState: 0},
			&State{Time: 1600, State: 0, PreviousState: 99},
			&State{Time: 1800, State: 2, PreviousState: 0},
			&CurrentState{State: 0},
		},
		Start:    1000,
		End:      2000,
		Expected: 60,
	}}

	for _, test := range tests {
		t.Run(test.Name, func(t *testing.T) {
			testSla(t, db, test.Events, test.Start, test.End, test.Expected, "unexpected SLA value")
		})
	}

	t.Run("Invalid", func(t *testing.T) {
		// No rows are written for these IDs; the subtests only check that the interval is validated.
		m := SlaHistoryMeta{
			EnvironmentId: make([]byte, 20),
			EndpointId:    make([]byte, 20),
			ObjectType:    "host",
			HostId:        make([]byte, 20),
		}

		// checkErr asserts that err is the driver-specific error carrying the custom message
		// for an invalid SLA interval.
		checkErr := func(t *testing.T, err error) {
			require.Error(t, err, "SLA function should return an error")

			switch d := db.DriverName(); d {
			case "mysql":
				var mysqlErr *mysql.MySQLError
				require.ErrorAs(t, err, &mysqlErr, "SLA function should return a MySQL error")
				// https://dev.mysql.com/doc/mysql-errors/8.0/en/server-error-reference.html#error_er_signal_exception
				assert.Equal(t, uint16(1644), mysqlErr.Number, "MySQL error should be ER_SIGNAL_EXCEPTION")
				assert.Equal(t, "end time must be greater than start time", mysqlErr.Message,
					"MySQL error should contain custom message")
			case "postgres":
				var pqErr *pq.Error
				require.ErrorAs(t, err, &pqErr, "SLA function should return a PostgreSQL error")
				// https://www.postgresql.org/docs/current/errcodes-appendix.html (P0001 = raise_exception)
				assert.Equal(t, pq.ErrorCode("P0001"), pqErr.Code, "PostgreSQL error should be raise_exception")
				assert.Equal(t, "end time must be greater than start time", pqErr.Message,
					"PostgreSQL error should contain custom message")
			default:
				panic(fmt.Sprintf("unknown database driver %q", d))
			}
		}

		t.Run("ZeroDuration", func(t *testing.T) {
			_, err := execSqlSlaFunc(db, &m, 1000, 1000)
			checkErr(t, err)
		})

		t.Run("NegativeDuration", func(t *testing.T) {
			_, err := execSqlSlaFunc(db, &m, 2000, 1000)
			checkErr(t, err)
		})
	})
}
// execSqlSlaFunc runs the SQL function get_sla_ok_percent with the host and service ID from m and
// the given start and end values. It returns the value scanned from the query together with any
// error reported by the database.
func execSqlSlaFunc(db *sqlx.DB, m *SlaHistoryMeta, start uint64, end uint64) (float64, error) {
	query := db.Rebind("SELECT get_sla_ok_percent(?, ?, ?, ?)")

	var percent float64
	err := db.Get(&percent, query, m.HostId, m.ServiceId, start, end)

	return percent, err
}
// testSla runs the given SLA scenario twice as subtests: first as "Host" for a host object, then
// as "Service" for a service object. All other arguments are forwarded to testSlaWithObjectType.
func testSla(t *testing.T, db *sqlx.DB, events []SlaHistoryEvent, start uint64, end uint64, expected float64, msg string) {
	variants := []struct {
		name    string
		service bool
	}{
		{name: "Host", service: false},
		{name: "Service", service: true},
	}

	for _, variant := range variants {
		asService := variant.service
		t.Run(variant.name, func(t *testing.T) {
			testSlaWithObjectType(t, db, events, asService, start, end, expected, msg)
		})
	}
}
// testSlaWithObjectType writes the given events for a checkable with freshly generated random IDs
// and asserts that the SQL SLA function returns expected for the start/end values.
//
// If service is true the checkable is a service (object type "service" with a service ID),
// otherwise it is a host (object type "host" without a service ID). msg is used as the failure
// message of the final comparison.
func testSlaWithObjectType(t *testing.T, db *sqlx.DB,
	events []SlaHistoryEvent, service bool, start uint64, end uint64, expected float64, msg string,
) {
	// randomId returns 20 random bytes and aborts the test if they cannot be generated.
	randomId := func() []byte {
		buf := make([]byte, 20)
		_, err := rand.Read(buf)
		require.NoError(t, err, "generating random id failed")

		return buf
	}

	meta := SlaHistoryMeta{
		EnvironmentId: randomId(),
		EndpointId:    randomId(),
		ObjectType:    "host",
		HostId:        randomId(),
	}
	if service {
		meta.ObjectType = "service"
		meta.ServiceId = randomId()
	}

	for i := range events {
		err := events[i].WriteSlaEventToDatabase(db, &meta)
		require.NoErrorf(t, err, "Inserting SLA history for %#v failed", events[i])
	}

	actual, err := execSqlSlaFunc(db, &meta, start, end)
	require.NoError(t, err, "SLA query should not fail")
	assert.Equal(t, expected, actual, msg)
}
// SlaHistoryMeta contains the columns identifying the object that test rows belong to. It is
// embedded into the row structs used for the INSERT statements in this file, and its HostId and
// ServiceId are passed to the SQL SLA function by execSqlSlaFunc.
type SlaHistoryMeta struct {
EnvironmentId NullableBytes `db:"environment_id"`
EndpointId NullableBytes `db:"endpoint_id"`
// ObjectType is set to either "host" or "service" by the tests in this file.
ObjectType string `db:"object_type"`
HostId NullableBytes `db:"host_id"`
// ServiceId is left nil for hosts; a nil NullableBytes is written as NULL.
ServiceId NullableBytes `db:"service_id"`
}
// SlaHistoryEvent is implemented by the test fixtures (State, CurrentState, Downtime) that can
// insert themselves into the database for the object described by a SlaHistoryMeta.
type SlaHistoryEvent interface {
// WriteSlaEventToDatabase inserts the event into db for the object identified by m and
// returns any error from doing so.
WriteSlaEventToDatabase(db *sqlx.DB, m *SlaHistoryMeta) error
}
// State is a state change fixture that is written as a row to the sla_history_state table.
type State struct {
// Time is written to the event_time column.
Time uint64
// State is written to the hard_state column.
State uint8
// PreviousState is written to the previous_hard_state column.
PreviousState uint8
}
// Assert at compile time that *State implements SlaHistoryEvent.
var _ SlaHistoryEvent = (*State)(nil)
// WriteSlaEventToDatabase inserts the state change as a new sla_history_state row for the object
// described by m. The row gets a random 20 byte id. An error from generating the id or from the
// INSERT statement is returned unchanged.
func (s *State) WriteSlaEventToDatabase(db *sqlx.DB, m *SlaHistoryMeta) error {
	// row provides the named parameters of the INSERT statement below.
	type row struct {
		*SlaHistoryMeta
		Id                []byte `db:"id"`
		EventTime         uint64 `db:"event_time"`
		HardState         uint8  `db:"hard_state"`
		PreviousHardState uint8  `db:"previous_hard_state"`
	}

	const stmt = "INSERT INTO sla_history_state" +
		" (id, environment_id, endpoint_id, object_type, host_id, service_id, event_time, hard_state, previous_hard_state)" +
		" VALUES (:id, :environment_id, :endpoint_id, :object_type, :host_id, :service_id, :event_time, :hard_state, :previous_hard_state)"

	eventId := make([]byte, 20)
	if _, err := rand.Read(eventId); err != nil {
		return err
	}

	record := row{
		SlaHistoryMeta:    m,
		Id:                eventId,
		EventTime:         s.Time,
		HardState:         s.State,
		PreviousHardState: s.PreviousState,
	}

	_, err := db.NamedExec(stmt, &record)

	return err
}
// CurrentState is a fixture for the current state of an object. It is written to the host_state
// or service_state table rather than to an SLA history table.
type CurrentState struct {
// State is used for the soft_state, previous_soft_state, hard_state and previous_hard_state
// columns alike.
State uint8
}
// WriteSlaEventToDatabase inserts the current state for the object described by m. If m has no
// service ID, a host_state row is written, otherwise a service_state row. All soft/hard state
// columns are set to c.State, the properties checksum to 20 zero bytes and the remaining columns
// of the statement to 0.
func (c *CurrentState) WriteSlaEventToDatabase(db *sqlx.DB, m *SlaHistoryMeta) error {
	// row provides the named parameters of the INSERT statements below.
	type row struct {
		*SlaHistoryMeta
		State              uint8         `db:"state"`
		PropertiesChecksum NullableBytes `db:"properties_checksum"`
	}

	const (
		hostStmt = "INSERT INTO host_state" +
			" (id, host_id, environment_id, properties_checksum, soft_state, previous_soft_state," +
			" hard_state, previous_hard_state, attempt, severity, last_state_change, next_check, next_update)" +
			" VALUES (:host_id, :host_id, :environment_id, :properties_checksum, :state, :state, :state, :state, 0, 0, 0, 0, 0)"

		serviceStmt = "INSERT INTO service_state" +
			" (id, host_id, service_id, environment_id, properties_checksum, soft_state, previous_soft_state," +
			" hard_state, previous_hard_state, attempt, severity, last_state_change, next_check, next_update)" +
			" VALUES (:service_id, :host_id, :service_id, :environment_id, :properties_checksum, :state, :state, :state, :state, 0, 0, 0, 0, 0)"
	)

	record := row{
		SlaHistoryMeta:     m,
		State:              c.State,
		PropertiesChecksum: make([]byte, 20),
	}

	// An object without a service ID is a host.
	stmt := serviceStmt
	if len(m.ServiceId) == 0 {
		stmt = hostStmt
	}

	_, err := db.NamedExec(stmt, &record)

	return err
}

// Assert at compile time that *CurrentState implements SlaHistoryEvent.
var _ SlaHistoryEvent = (*CurrentState)(nil)
// Downtime is a downtime fixture that is written as a row to the sla_history_downtime table.
type Downtime struct {
// Start is written to the downtime_start column.
Start uint64
// End is written to the downtime_end column.
End uint64
}
// Assert at compile time that *Downtime implements SlaHistoryEvent.
var _ SlaHistoryEvent = (*Downtime)(nil)
// slaHistoryDowntime provides the named parameters for the INSERT INTO sla_history_downtime
// statement issued by Downtime.WriteSlaEventToDatabase: the embedded object metadata plus the
// downtime-specific columns.
type slaHistoryDowntime struct {
*SlaHistoryMeta
DowntimeId []byte `db:"downtime_id"`
DowntimeStart uint64 `db:"downtime_start"`
DowntimeEnd uint64 `db:"downtime_end"`
}
// WriteSlaEventToDatabase inserts the downtime as a new sla_history_downtime row for the object
// described by m. The row gets a random 20 byte downtime id. An error from generating the id or
// from the INSERT statement is returned unchanged.
func (d *Downtime) WriteSlaEventToDatabase(db *sqlx.DB, m *SlaHistoryMeta) error {
	const stmt = "INSERT INTO sla_history_downtime" +
		" (environment_id, endpoint_id, object_type, host_id, service_id, downtime_id, downtime_start, downtime_end)" +
		" VALUES (:environment_id, :endpoint_id, :object_type, :host_id," +
		" :service_id, :downtime_id, :downtime_start, :downtime_end)"

	id := make([]byte, 20)
	if _, err := rand.Read(id); err != nil {
		return err
	}

	record := slaHistoryDowntime{
		SlaHistoryMeta: m,
		DowntimeId:     id,
		DowntimeStart:  d.Start,
		DowntimeEnd:    d.End,
	}

	_, err := db.NamedExec(stmt, &record)

	return err
}
// NullableBytes is a byte slice for binary database columns that may be NULL: a nil slice is
// written as NULL, any other slice (including an empty one) as its bytes.
type NullableBytes []byte

// Value implements the database/sql/driver.Valuer interface.
func (b NullableBytes) Value() (driver.Value, error) {
	if b == nil {
		// Return an untyped nil: any(nil) is treated as NULL, whereas []byte(nil) would be a
		// non-NULL byte sequence of length 0.
		return nil, nil
	}

	return []byte(b), nil
}