From 7cd6fd98ce8e05e4939df6fb30f96fce5810fe5d Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Mon, 23 Aug 2021 16:15:39 +0200 Subject: [PATCH 1/8] SLA reporting: SQL schema --- schema/mysql/schema.sql | 33 ++++++++++++++++++ schema/mysql/upgrades/1.0.0.sql | 33 ++++++++++++++++++ schema/pgsql/schema.sql | 62 +++++++++++++++++++++++++++++++++ 3 files changed, 128 insertions(+) diff --git a/schema/mysql/schema.sql b/schema/mysql/schema.sql index ebce21e9..0cecb7ea 100644 --- a/schema/mysql/schema.sql +++ b/schema/mysql/schema.sql @@ -1124,6 +1124,39 @@ CREATE TABLE history ( INDEX idx_history_host_service_id (host_id, service_id, event_time) COMMENT 'Host/service history detail filter' ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC; +CREATE TABLE sla_history_state ( + id binary(20) NOT NULL COMMENT 'state_history.id (may reference already deleted rows)', + environment_id binary(20) NOT NULL COMMENT 'environment.id', + endpoint_id binary(20) DEFAULT NULL COMMENT 'endpoint.id', + object_type enum('host', 'service') NOT NULL, + host_id binary(20) NOT NULL COMMENT 'host.id', + service_id binary(20) DEFAULT NULL COMMENT 'service.id', + + event_time bigint unsigned NOT NULL COMMENT 'unix timestamp the event occurred', + hard_state TINYINT UNSIGNED NOT NULL COMMENT 'hard state after this event', + previous_hard_state TINYINT UNSIGNED NOT NULL COMMENT 'hard state before this event', + + PRIMARY KEY (id), + + INDEX idx_sla_history_state_event (host_id, service_id, event_time) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC; + +CREATE TABLE sla_history_downtime ( + environment_id binary(20) NOT NULL COMMENT 'environment.id', + endpoint_id binary(20) DEFAULT NULL COMMENT 'endpoint.id', + object_type enum('host', 'service') NOT NULL, + host_id binary(20) NOT NULL COMMENT 'host.id', + service_id binary(20) DEFAULT NULL COMMENT 'service.id', + + downtime_id binary(20) NOT NULL COMMENT 'downtime.id (may reference 
already deleted rows)', + downtime_start BIGINT UNSIGNED NOT NULL COMMENT 'start time of the downtime', + downtime_end BIGINT UNSIGNED NOT NULL COMMENT 'end time of the downtime', + + PRIMARY KEY (downtime_id), + + INDEX idx_sla_history_downtime_event (host_id, service_id, downtime_start, downtime_end) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC; + CREATE TABLE icingadb_schema ( id int unsigned NOT NULL AUTO_INCREMENT, version smallint unsigned NOT NULL, diff --git a/schema/mysql/upgrades/1.0.0.sql b/schema/mysql/upgrades/1.0.0.sql index 6991e212..ec347d9e 100644 --- a/schema/mysql/upgrades/1.0.0.sql +++ b/schema/mysql/upgrades/1.0.0.sql @@ -47,5 +47,38 @@ ALTER TABLE customvar ALTER TABLE customvar_flat MODIFY flatname varchar(512) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT 'Path converted with `.` and `[ ]`'; +CREATE TABLE sla_history_state ( + id binary(20) NOT NULL COMMENT 'state_history.id (may reference already deleted rows)', + environment_id binary(20) NOT NULL COMMENT 'environment.id', + endpoint_id binary(20) DEFAULT NULL COMMENT 'endpoint.id', + object_type enum('host', 'service') NOT NULL, + host_id binary(20) NOT NULL COMMENT 'host.id', + service_id binary(20) DEFAULT NULL COMMENT 'service.id', + + event_time bigint unsigned NOT NULL COMMENT 'unix timestamp the event occurred', + hard_state TINYINT UNSIGNED NOT NULL COMMENT 'hard state after this event', + previous_hard_state TINYINT UNSIGNED NOT NULL COMMENT 'hard state before this event', + + PRIMARY KEY (id), + + INDEX idx_sla_history_state_event (host_id, service_id, event_time) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC; + +CREATE TABLE sla_history_downtime ( + environment_id binary(20) NOT NULL COMMENT 'environment.id', + endpoint_id binary(20) DEFAULT NULL COMMENT 'endpoint.id', + object_type enum('host', 'service') NOT NULL, + host_id binary(20) NOT NULL COMMENT 'host.id', + service_id binary(20) DEFAULT NULL COMMENT 
'service.id', + + downtime_id binary(20) NOT NULL COMMENT 'downtime.id (may reference already deleted rows)', + downtime_start BIGINT UNSIGNED NOT NULL COMMENT 'start time of the downtime', + downtime_end BIGINT UNSIGNED NOT NULL COMMENT 'end time of the downtime', + + PRIMARY KEY (downtime_id), + + INDEX idx_sla_history_downtime_event (host_id, service_id, downtime_start, downtime_end) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC; + INSERT INTO icingadb_schema (version, TIMESTAMP) VALUES (3, CURRENT_TIMESTAMP() * 1000); diff --git a/schema/pgsql/schema.sql b/schema/pgsql/schema.sql index de650190..8a2de033 100644 --- a/schema/pgsql/schema.sql +++ b/schema/pgsql/schema.sql @@ -1894,6 +1894,68 @@ COMMENT ON COLUMN history.acknowledgement_history_id IS 'acknowledgement_history COMMENT ON INDEX idx_history_event_time IS 'History filtered/ordered by event_time'; COMMENT ON INDEX idx_history_host_service_id IS 'Host/service history detail filter'; +CREATE TABLE sla_history_state ( + id bytea20 NOT NULL, + environment_id bytea20 NOT NULL, + endpoint_id bytea20 DEFAULT NULL, + object_type checkable_type NOT NULL, + host_id bytea20 NOT NULL, + service_id bytea20 DEFAULT NULL, + + event_time biguint NOT NULL, + hard_state tinyuint NOT NULL, + previous_hard_state tinyuint NOT NULL, + + CONSTRAINT pk_sla_history_state PRIMARY KEY (id) +); + +ALTER TABLE sla_history_state ALTER COLUMN id SET STORAGE PLAIN; +ALTER TABLE sla_history_state ALTER COLUMN environment_id SET STORAGE PLAIN; +ALTER TABLE sla_history_state ALTER COLUMN endpoint_id SET STORAGE PLAIN; +ALTER TABLE sla_history_state ALTER COLUMN host_id SET STORAGE PLAIN; +ALTER TABLE sla_history_state ALTER COLUMN service_id SET STORAGE PLAIN; + +CREATE INDEX idx_sla_history_state_event ON sla_history_state(host_id, service_id, event_time); + +COMMENT ON COLUMN sla_history_state.id IS 'state_history.id (may reference already deleted rows)'; +COMMENT ON COLUMN 
sla_history_state.environment_id IS 'environment.id'; +COMMENT ON COLUMN sla_history_state.endpoint_id IS 'endpoint.id'; +COMMENT ON COLUMN sla_history_state.host_id IS 'host.id'; +COMMENT ON COLUMN sla_history_state.service_id IS 'service.id'; +COMMENT ON COLUMN sla_history_state.event_time IS 'unix timestamp the event occurred'; +COMMENT ON COLUMN sla_history_state.hard_state IS 'hard state after this event'; +COMMENT ON COLUMN sla_history_state.previous_hard_state IS 'hard state before this event'; + +CREATE TABLE sla_history_downtime ( + environment_id bytea20 NOT NULL, + endpoint_id bytea20 DEFAULT NULL, + object_type checkable_type NOT NULL, + host_id bytea20 NOT NULL, + service_id bytea20 DEFAULT NULL, + + downtime_id bytea20 NOT NULL, + downtime_start biguint NOT NULL, + downtime_end biguint NOT NULL, + + CONSTRAINT pk_sla_history_downtime PRIMARY KEY (downtime_id) +); + +ALTER TABLE sla_history_downtime ALTER COLUMN environment_id SET STORAGE PLAIN; +ALTER TABLE sla_history_downtime ALTER COLUMN endpoint_id SET STORAGE PLAIN; +ALTER TABLE sla_history_downtime ALTER COLUMN host_id SET STORAGE PLAIN; +ALTER TABLE sla_history_downtime ALTER COLUMN service_id SET STORAGE PLAIN; +ALTER TABLE sla_history_downtime ALTER COLUMN downtime_id SET STORAGE PLAIN; + +CREATE INDEX idx_sla_history_downtime_event ON sla_history_downtime(host_id, service_id, downtime_start, downtime_end); + +COMMENT ON COLUMN sla_history_downtime.environment_id IS 'environment.id'; +COMMENT ON COLUMN sla_history_downtime.endpoint_id IS 'endpoint.id'; +COMMENT ON COLUMN sla_history_downtime.host_id IS 'host.id'; +COMMENT ON COLUMN sla_history_downtime.service_id IS 'service.id'; +COMMENT ON COLUMN sla_history_downtime.downtime_id IS 'downtime.id (may reference already deleted rows)'; +COMMENT ON COLUMN sla_history_downtime.downtime_start IS 'start time of the downtime'; +COMMENT ON COLUMN sla_history_downtime.downtime_end IS 'end time of the downtime'; + CREATE SEQUENCE 
icingadb_schema_id_seq; CREATE TABLE icingadb_schema ( From e9bd5cd40f36ea581271d085185d05a185a35dc9 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Tue, 24 Aug 2021 11:16:39 +0200 Subject: [PATCH 2/8] SLA reporting: additionally write relevant history events to dedicated SLA tables --- pkg/icingadb/history/sla.go | 26 +++++++++++++++++++ pkg/icingadb/history/sync.go | 10 +++++--- pkg/icingadb/v1/history/downtime.go | 40 +++++++++++++++++++++++++++++ pkg/icingadb/v1/history/state.go | 10 ++++++++ pkg/types/state_type.go | 9 +++++-- 5 files changed, 89 insertions(+), 6 deletions(-) create mode 100644 pkg/icingadb/history/sla.go diff --git a/pkg/icingadb/history/sla.go b/pkg/icingadb/history/sla.go new file mode 100644 index 00000000..79d22c72 --- /dev/null +++ b/pkg/icingadb/history/sla.go @@ -0,0 +1,26 @@ +package history + +import ( + "github.com/go-redis/redis/v8" + "github.com/icinga/icingadb/pkg/icingadb/v1/history" + "github.com/icinga/icingadb/pkg/structify" + "github.com/icinga/icingadb/pkg/types" + "reflect" +) + +var slaStateStructify = structify.MakeMapStructifier(reflect.TypeOf((*history.SlaHistoryState)(nil)).Elem(), "json") + +func stateHistoryToSlaEntity(entry redis.XMessage) ([]history.UpserterEntity, error) { + slaStateInterface, err := slaStateStructify(entry.Values) + if err != nil { + return nil, err + } + slaState := slaStateInterface.(*history.SlaHistoryState) + + if slaState.StateType != types.StateHard { + // only hard state changes are relevant for SLA history, discard all others + return nil, nil + } + + return []history.UpserterEntity{slaState}, nil +} diff --git a/pkg/icingadb/history/sync.go b/pkg/icingadb/history/sync.go index e1d6160b..d5bebd65 100644 --- a/pkg/icingadb/history/sync.go +++ b/pkg/icingadb/history/sync.go @@ -359,12 +359,14 @@ var syncPipelines = map[string][]stageFunc{ writeOneEntityStage((*v1.HistoryNotification)(nil)), // history (depends on notification_history) }, "state": { - 
writeOneEntityStage((*v1.StateHistory)(nil)), // state_history - writeOneEntityStage((*v1.HistoryState)(nil)), // history (depends on state_history) + writeOneEntityStage((*v1.StateHistory)(nil)), // state_history + writeOneEntityStage((*v1.HistoryState)(nil)), // history (depends on state_history) + writeMultiEntityStage(stateHistoryToSlaEntity), // sla_history_state }, "downtime": { - writeOneEntityStage((*v1.DowntimeHistory)(nil)), // downtime_history - writeOneEntityStage((*v1.HistoryDowntime)(nil)), // history (depends on downtime_history) + writeOneEntityStage((*v1.DowntimeHistory)(nil)), // downtime_history + writeOneEntityStage((*v1.HistoryDowntime)(nil)), // history (depends on downtime_history) + writeOneEntityStage((*v1.SlaHistoryDowntime)(nil)), // sla_history_downtime }, "comment": { writeOneEntityStage((*v1.CommentHistory)(nil)), // comment_history diff --git a/pkg/icingadb/v1/history/downtime.go b/pkg/icingadb/v1/history/downtime.go index 448c9ecf..99f93f6a 100644 --- a/pkg/icingadb/v1/history/downtime.go +++ b/pkg/icingadb/v1/history/downtime.go @@ -80,6 +80,30 @@ func (*HistoryDowntime) TableName() string { return "history" } +type SlaHistoryDowntime struct { + DowntimeHistoryEntity `json:",inline"` + HistoryTableMeta `json:",inline"` + SlaHistoryDowntimeUpserter `json:",inline"` + DowntimeStart types.UnixMilli `json:"start_time"` + HasBeenCancelled types.Bool `json:"has_been_cancelled" db:"-"` + CancelTime types.UnixMilli `json:"cancel_time" db:"-"` + EndTime types.UnixMilli `json:"end_time" db:"-"` +} + +// Init implements the contracts.Initer interface. +func (s *SlaHistoryDowntime) Init() { + s.DowntimeEnd.History = s +} + +type SlaHistoryDowntimeUpserter struct { + DowntimeEnd SlaDowntimeEndTime `json:"-"` +} + +// Upsert implements the contracts.Upserter interface. 
+func (h *SlaHistoryDowntimeUpserter) Upsert() interface{} { + return h +} + type DowntimeEventTime struct { History *HistoryDowntime `db:"-"` } @@ -109,6 +133,19 @@ func (et DowntimeEventTime) Value() (driver.Value, error) { } } +type SlaDowntimeEndTime struct { + History *SlaHistoryDowntime `db:"-"` +} + +// Value implements the driver.Valuer interface. +func (et SlaDowntimeEndTime) Value() (driver.Value, error) { + if et.History.HasBeenCancelled.Valid && et.History.HasBeenCancelled.Bool { + return et.History.CancelTime.Value() + } else { + return et.History.EndTime.Value() + } +} + // Assert interface compliance. var ( _ contracts.Entity = (*DowntimeHistoryEntity)(nil) @@ -117,5 +154,8 @@ var ( _ contracts.Initer = (*HistoryDowntime)(nil) _ contracts.TableNamer = (*HistoryDowntime)(nil) _ UpserterEntity = (*HistoryDowntime)(nil) + _ contracts.Initer = (*SlaHistoryDowntime)(nil) + _ UpserterEntity = (*SlaHistoryDowntime)(nil) _ driver.Valuer = DowntimeEventTime{} + _ driver.Valuer = SlaDowntimeEndTime{} ) diff --git a/pkg/icingadb/v1/history/state.go b/pkg/icingadb/v1/history/state.go index ad5f4703..8a913812 100644 --- a/pkg/icingadb/v1/history/state.go +++ b/pkg/icingadb/v1/history/state.go @@ -33,9 +33,19 @@ func (*HistoryState) TableName() string { return "history" } +type SlaHistoryState struct { + HistoryTableEntity `json:",inline"` + HistoryTableMeta `json:",inline"` + EventTime types.UnixMilli `json:"event_time"` + StateType types.StateType `json:"state_type" db:"-"` + HardState uint8 `json:"hard_state"` + PreviousHardState uint8 `json:"previous_hard_state"` +} + // Assert interface compliance. 
var ( _ UpserterEntity = (*StateHistory)(nil) _ contracts.TableNamer = (*HistoryState)(nil) _ UpserterEntity = (*HistoryState)(nil) + _ UpserterEntity = (*SlaHistoryState)(nil) ) diff --git a/pkg/types/state_type.go b/pkg/types/state_type.go index 8a24819e..f0cc69af 100644 --- a/pkg/types/state_type.go +++ b/pkg/types/state_type.go @@ -46,10 +46,15 @@ func badStateType(t interface{}) error { return errors.Errorf("bad state type: %#v", t) } +const ( + StateSoft = StateType(0) + StateHard = StateType(1) +) + // stateTypes maps all valid StateType values to their SQL representation. var stateTypes = map[StateType]string{ - 0: "soft", - 1: "hard", + StateSoft: "soft", + StateHard: "hard", } // Assert interface compliance. From a0b1bc31962d132189daadef945ae0259fd615b8 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Fri, 6 May 2022 12:42:23 +0200 Subject: [PATCH 3/8] List required RDBMS versions and increase PostgreSQL minimum MySQL minimum versions are based on what was used in Icinga DB 1.0.0 RC1. The required PostgreSQL version is bumped to at least 9.6 as this allows labeling functions as `PARALLEL RESTRICTED`, as is done by the SLA reporting. This or newer versions are available for both CentOS/RHEL 8 and SLES 12 which were the primary constraints for the previous version requirement. For CentOS/RHEL, don't specify some additional paths and package names with a specific version, as any newer version is also fine and there's a choice of multiple versions. Also, we don't guide the user through actually installing the PostgreSQL server, so they should already be familiar with the differing names if their distribution requires them. 
[1] https://github.com/Icinga/icingadb/blob/v1.0.0-rc1/.github/workflows/go.yml --- doc/02-Installation.md | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/doc/02-Installation.md b/doc/02-Installation.md index 5bfcddfa..414843d1 100644 --- a/doc/02-Installation.md +++ b/doc/02-Installation.md @@ -3,7 +3,7 @@ ## Requirements * Local Redis instance (Will be installed during this documentation) -* MySQL/MariaDB/PostgreSQL database `icingadb`, user and schema imports (Will be set up during this documentation) +* MySQL (≥5.5), MariaDB (≥10.1), or PostgreSQL (≥9.6): database, user and schema imports (Will be set up during this documentation) ## Setting up Icinga DB @@ -176,7 +176,6 @@ psql icingadb <<<'CREATE EXTENSION IF NOT EXISTS citext;' ``` The CREATE EXTENSION command requires the postgresql-contrib package. -(On RHEL/CentOS 7: rh-postgresql95-postgresql-contrib) Edit `pg_hba.conf`, insert the following before everything else: @@ -187,7 +186,6 @@ host all icingadb ::/0 md5 ``` To apply those changes, run `systemctl reload postgresql`. -(On RHEL/CentOS 7 the service is called "rh-postgresql95-postgresql".) After creating the database you can import the Icinga DB schema using the following command. Enter the password when asked. @@ -196,9 +194,6 @@ following command. Enter the password when asked. psql -U icingadb icingadb < /usr/share/icingadb/schema/pgsql/schema.sql ``` -On RHEL/CentOS 7 prefix "createuser", "createdb" and "psql" with -"/opt/rh/rh-postgresql95/root/usr/bin/". 
- ### Running Icinga DB Foreground: From b81392857a4f5b499f9f2f2a1670885514bf58e4 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Tue, 24 Aug 2021 11:59:58 +0200 Subject: [PATCH 4/8] SLA reporting: add SQL stored function to calculate SLA value --- schema/mysql/schema.sql | 158 ++++++++++++++++++++++++++++++++ schema/mysql/upgrades/1.0.0.sql | 158 ++++++++++++++++++++++++++++++++ schema/pgsql/schema.sql | 140 ++++++++++++++++++++++++++++ 3 files changed, 456 insertions(+) diff --git a/schema/mysql/schema.sql b/schema/mysql/schema.sql index 0cecb7ea..8f36c697 100644 --- a/schema/mysql/schema.sql +++ b/schema/mysql/schema.sql @@ -3,6 +3,164 @@ SET SESSION sql_mode = 'STRICT_ALL_TABLES,NO_ENGINE_SUBSTITUTION'; SET SESSION innodb_strict_mode = 1; +DROP FUNCTION IF EXISTS get_sla_ok_percent; +DELIMITER // +CREATE FUNCTION get_sla_ok_percent( + in_host_id binary(20), + in_service_id binary(20), + in_start_time bigint unsigned, + in_end_time bigint unsigned +) +RETURNS decimal(7, 4) +READS SQL DATA +BEGIN + DECLARE result decimal(7, 4); + DECLARE row_event_time bigint unsigned; + DECLARE row_event_type enum('state_change', 'downtime_start', 'downtime_end', 'end'); + DECLARE row_event_prio int; + DECLARE row_hard_state tinyint unsigned; + DECLARE row_previous_hard_state tinyint unsigned; + DECLARE last_event_time bigint unsigned; + DECLARE last_hard_state tinyint unsigned; + DECLARE active_downtimes int unsigned; + DECLARE problem_time bigint unsigned; + DECLARE total_time bigint unsigned; + DECLARE done int; + DECLARE cur CURSOR FOR + ( + -- all downtime_start events before the end of the SLA interval + -- for downtimes that overlap the SLA interval in any way + SELECT + GREATEST(downtime_start, in_start_time) AS event_time, + 'downtime_start' AS event_type, + 1 AS event_prio, + NULL AS hard_state, + NULL AS previous_hard_state + FROM sla_history_downtime d + WHERE d.host_id = in_host_id + AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = 
in_service_id) + AND d.downtime_start < in_end_time + AND d.downtime_end >= in_start_time + ) UNION ALL ( + -- all downtime_end events before the end of the SLA interval + -- for downtimes that overlap the SLA interval in any way + SELECT + downtime_end AS event_time, + 'downtime_end' AS event_type, + 2 AS event_prio, + NULL AS hard_state, + NULL AS previous_hard_state + FROM sla_history_downtime d + WHERE d.host_id = in_host_id + AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id) + AND d.downtime_start < in_end_time + AND d.downtime_end >= in_start_time + AND d.downtime_end < in_end_time + ) UNION ALL ( + -- all state events strictly in interval + SELECT + event_time, + 'state_change' AS event_type, + 0 AS event_prio, + hard_state, + previous_hard_state + FROM sla_history_state s + WHERE s.host_id = in_host_id + AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id) + AND s.event_time > in_start_time + AND s.event_time < in_end_time + ) UNION ALL ( + -- end event to keep loop simple, values are not used + SELECT + in_end_time AS event_time, + 'end' AS event_type, + 3 AS event_prio, + NULL AS hard_state, + NULL AS previous_hard_state + ) + ORDER BY event_time, event_prio; + DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = 1; + + IF in_end_time <= in_start_time THEN + SIGNAL SQLSTATE '45000' SET MESSAGE_TEXT = 'end time must be greater than start time'; + END IF; + + -- Use the latest event at or before the beginning of the SLA interval as the initial state. + SELECT hard_state INTO last_hard_state + FROM sla_history_state s + WHERE s.host_id = in_host_id + AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id) + AND s.event_time <= in_start_time + ORDER BY s.event_time DESC + LIMIT 1; + + -- If this does not exist, use the previous state from the first event after the beginning of the SLA interval. 
+ IF last_hard_state IS NULL THEN + SELECT previous_hard_state INTO last_hard_state + FROM sla_history_state s + WHERE s.host_id = in_host_id + AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id) + AND s.event_time > in_start_time + ORDER BY s.event_time ASC + LIMIT 1; + END IF; + + -- If this also does not exist, use the current host/service state. + IF last_hard_state IS NULL THEN + IF in_service_id IS NULL THEN + SELECT hard_state INTO last_hard_state + FROM host_state s + WHERE s.host_id = in_host_id; + ELSE + SELECT hard_state INTO last_hard_state + FROM service_state s + WHERE s.host_id = in_host_id + AND s.service_id = in_service_id; + END IF; + END IF; + + IF last_hard_state IS NULL THEN + SET last_hard_state = 0; + END IF; + + SET problem_time = 0; + SET total_time = in_end_time - in_start_time; + SET last_event_time = in_start_time; + SET active_downtimes = 0; + + SET done = 0; + OPEN cur; + read_loop: LOOP + FETCH cur INTO row_event_time, row_event_type, row_event_prio, row_hard_state, row_previous_hard_state; + IF done THEN + LEAVE read_loop; + END IF; + + IF row_previous_hard_state = 99 THEN + SET total_time = total_time - (row_event_time - last_event_time); + ELSEIF ((in_service_id IS NULL AND last_hard_state > 0) OR (in_service_id IS NOT NULL AND last_hard_state > 1)) + AND last_hard_state != 99 + AND active_downtimes = 0 + THEN + SET problem_time = problem_time + row_event_time - last_event_time; + END IF; + + SET last_event_time = row_event_time; + IF row_event_type = 'state_change' THEN + SET last_hard_state = row_hard_state; + ELSEIF row_event_type = 'downtime_start' THEN + SET active_downtimes = active_downtimes + 1; + ELSEIF row_event_type = 'downtime_end' THEN + SET active_downtimes = active_downtimes - 1; + END IF; + END LOOP; + CLOSE cur; + + SET result = 100 * (total_time - problem_time) / total_time; + RETURN result; +END// +DELIMITER ; + CREATE TABLE host ( id binary(20) NOT NULL COMMENT 
'sha1(environment.id + name)', environment_id binary(20) NOT NULL COMMENT 'environment.id', diff --git a/schema/mysql/upgrades/1.0.0.sql b/schema/mysql/upgrades/1.0.0.sql index ec347d9e..0409e528 100644 --- a/schema/mysql/upgrades/1.0.0.sql +++ b/schema/mysql/upgrades/1.0.0.sql @@ -1,3 +1,161 @@ +DROP FUNCTION IF EXISTS get_sla_ok_percent; +DELIMITER // +CREATE FUNCTION get_sla_ok_percent( + in_host_id binary(20), + in_service_id binary(20), + in_start_time bigint unsigned, + in_end_time bigint unsigned +) +RETURNS decimal(7, 4) +READS SQL DATA +BEGIN + DECLARE result decimal(7, 4); + DECLARE row_event_time bigint unsigned; + DECLARE row_event_type enum('state_change', 'downtime_start', 'downtime_end', 'end'); + DECLARE row_event_prio int; + DECLARE row_hard_state tinyint unsigned; + DECLARE row_previous_hard_state tinyint unsigned; + DECLARE last_event_time bigint unsigned; + DECLARE last_hard_state tinyint unsigned; + DECLARE active_downtimes int unsigned; + DECLARE problem_time bigint unsigned; + DECLARE total_time bigint unsigned; + DECLARE done int; + DECLARE cur CURSOR FOR + ( + -- all downtime_start events before the end of the SLA interval + -- for downtimes that overlap the SLA interval in any way + SELECT + GREATEST(downtime_start, in_start_time) AS event_time, + 'downtime_start' AS event_type, + 1 AS event_prio, + NULL AS hard_state, + NULL AS previous_hard_state + FROM sla_history_downtime d + WHERE d.host_id = in_host_id + AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id) + AND d.downtime_start < in_end_time + AND d.downtime_end >= in_start_time + ) UNION ALL ( + -- all downtime_end events before the end of the SLA interval + -- for downtimes that overlap the SLA interval in any way + SELECT + downtime_end AS event_time, + 'downtime_end' AS event_type, + 2 AS event_prio, + NULL AS hard_state, + NULL AS previous_hard_state + FROM sla_history_downtime d + WHERE d.host_id = in_host_id + AND ((in_service_id IS NULL AND 
d.service_id IS NULL) OR d.service_id = in_service_id) + AND d.downtime_start < in_end_time + AND d.downtime_end >= in_start_time + AND d.downtime_end < in_end_time + ) UNION ALL ( + -- all state events strictly in interval + SELECT + event_time, + 'state_change' AS event_type, + 0 AS event_prio, + hard_state, + previous_hard_state + FROM sla_history_state s + WHERE s.host_id = in_host_id + AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id) + AND s.event_time > in_start_time + AND s.event_time < in_end_time + ) UNION ALL ( + -- end event to keep loop simple, values are not used + SELECT + in_end_time AS event_time, + 'end' AS event_type, + 3 AS event_prio, + NULL AS hard_state, + NULL AS previous_hard_state + ) + ORDER BY event_time, event_prio; + DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = 1; + + IF in_end_time <= in_start_time THEN + SIGNAL SQLSTATE '45000' SET MESSAGE_TEXT = 'end time must be greater than start time'; + END IF; + + -- Use the latest event at or before the beginning of the SLA interval as the initial state. + SELECT hard_state INTO last_hard_state + FROM sla_history_state s + WHERE s.host_id = in_host_id + AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id) + AND s.event_time <= in_start_time + ORDER BY s.event_time DESC + LIMIT 1; + + -- If this does not exist, use the previous state from the first event after the beginning of the SLA interval. + IF last_hard_state IS NULL THEN + SELECT previous_hard_state INTO last_hard_state + FROM sla_history_state s + WHERE s.host_id = in_host_id + AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id) + AND s.event_time > in_start_time + ORDER BY s.event_time ASC + LIMIT 1; + END IF; + + -- If this also does not exist, use the current host/service state. 
+ IF last_hard_state IS NULL THEN + IF in_service_id IS NULL THEN + SELECT hard_state INTO last_hard_state + FROM host_state s + WHERE s.host_id = in_host_id; + ELSE + SELECT hard_state INTO last_hard_state + FROM service_state s + WHERE s.host_id = in_host_id + AND s.service_id = in_service_id; + END IF; + END IF; + + IF last_hard_state IS NULL THEN + SET last_hard_state = 0; + END IF; + + SET problem_time = 0; + SET total_time = in_end_time - in_start_time; + SET last_event_time = in_start_time; + SET active_downtimes = 0; + + SET done = 0; + OPEN cur; + read_loop: LOOP + FETCH cur INTO row_event_time, row_event_type, row_event_prio, row_hard_state, row_previous_hard_state; + IF done THEN + LEAVE read_loop; + END IF; + + IF row_previous_hard_state = 99 THEN + SET total_time = total_time - (row_event_time - last_event_time); + ELSEIF ((in_service_id IS NULL AND last_hard_state > 0) OR (in_service_id IS NOT NULL AND last_hard_state > 1)) + AND last_hard_state != 99 + AND active_downtimes = 0 + THEN + SET problem_time = problem_time + row_event_time - last_event_time; + END IF; + + SET last_event_time = row_event_time; + IF row_event_type = 'state_change' THEN + SET last_hard_state = row_hard_state; + ELSEIF row_event_type = 'downtime_start' THEN + SET active_downtimes = active_downtimes + 1; + ELSEIF row_event_type = 'downtime_end' THEN + SET active_downtimes = active_downtimes - 1; + END IF; + END LOOP; + CLOSE cur; + + SET result = 100 * (total_time - problem_time) / total_time; + RETURN result; +END// +DELIMITER ; + ALTER TABLE hostgroup DROP INDEX idx_hostroup_name, ADD INDEX idx_hostgroup_name (name) COMMENT 'Host/service/host group list filtered by host group name'; diff --git a/schema/pgsql/schema.sql b/schema/pgsql/schema.sql index 8a2de033..95486b34 100644 --- a/schema/pgsql/schema.sql +++ b/schema/pgsql/schema.sql @@ -21,6 +21,146 @@ CREATE TYPE comment_type AS ENUM ( 'comment', 'ack' ); CREATE TYPE notification_type AS ENUM ( 'downtime_start', 
'downtime_end', 'downtime_removed', 'custom', 'acknowledgement', 'problem', 'recovery', 'flapping_start', 'flapping_end' ); CREATE TYPE history_type AS ENUM ( 'notification', 'state_change', 'downtime_start', 'downtime_end', 'comment_add', 'comment_remove', 'flapping_start', 'flapping_end', 'ack_set', 'ack_clear' );
+-- get_sla_ok_percent computes, for the host identified by in_host_id (in_service_id IS NULL) or the
+-- service identified by in_host_id and in_service_id, the percentage of the interval between
+-- in_start_time and in_end_time that does not count as problem time.
+--
+-- A span counts as problem time if the hard state in effect is a problem state (> 0 for hosts,
+-- > 1 for services), is not 99, and no downtime is active. A span that directly precedes a state
+-- event whose previous_hard_state is 99 is removed from the total time instead.
+--
+-- Raises an exception if in_end_time is not greater than in_start_time.
+CREATE OR REPLACE FUNCTION get_sla_ok_percent(
+  in_host_id bytea20,
+  in_service_id bytea20,
+  in_start_time biguint,
+  in_end_time biguint
+)
+RETURNS decimal(7, 4)
+LANGUAGE plpgsql
+STABLE
+PARALLEL RESTRICTED
+AS $$
+DECLARE
+  last_event_time biguint := in_start_time;
+  last_hard_state tinyuint;
+  active_downtimes uint := 0;
+  problem_time biguint := 0;
+  total_time biguint;
+  row record;
+BEGIN
+  IF in_end_time <= in_start_time THEN
+    RAISE 'end time must be greater than start time';
+  END IF;
+
+  total_time := in_end_time - in_start_time;
+
+  -- Use the latest event at or before the beginning of the SLA interval as the initial state.
+  SELECT hard_state INTO last_hard_state
+  FROM sla_history_state s
+  WHERE s.host_id = in_host_id
+    AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
+    AND s.event_time <= in_start_time
+  ORDER BY s.event_time DESC
+  LIMIT 1;
+
+  -- If this does not exist, use the previous state from the first event after the beginning of the SLA interval.
+  IF last_hard_state IS NULL THEN
+    SELECT previous_hard_state INTO last_hard_state
+    FROM sla_history_state s
+    WHERE s.host_id = in_host_id
+      AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
+      AND s.event_time > in_start_time
+    ORDER BY s.event_time ASC
+    LIMIT 1;
+  END IF;
+
+  -- If this also does not exist, use the current host/service state.
+  IF last_hard_state IS NULL THEN
+    IF in_service_id IS NULL THEN
+      SELECT hard_state INTO last_hard_state
+      FROM host_state s
+      WHERE s.host_id = in_host_id;
+    ELSE
+      SELECT hard_state INTO last_hard_state
+      FROM service_state s
+      WHERE s.host_id = in_host_id
+        AND s.service_id = in_service_id;
+    END IF;
+  END IF;
+
+  -- Without any state information, treat the object as being in state 0.
+  IF last_hard_state IS NULL THEN
+    last_hard_state := 0;
+  END IF;
+
+  -- Walk all events in chronological order. For events at the same time, event_prio decides:
+  -- state changes (0) first, then downtime starts (1), downtime ends (2) and the end marker (3).
+  FOR row IN
+    (
+      -- all downtime_start events before the end of the SLA interval
+      -- for downtimes that overlap the SLA interval in any way
+      SELECT
+        GREATEST(downtime_start, in_start_time) AS event_time,
+        'downtime_start' AS event_type,
+        1 AS event_prio,
+        NULL::tinyuint AS hard_state,
+        NULL::tinyuint AS previous_hard_state
+      FROM sla_history_downtime d
+      WHERE d.host_id = in_host_id
+        AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id)
+        AND d.downtime_start < in_end_time
+        AND d.downtime_end >= in_start_time
+    ) UNION ALL (
+      -- all downtime_end events before the end of the SLA interval
+      -- for downtimes that overlap the SLA interval in any way
+      SELECT
+        downtime_end AS event_time,
+        'downtime_end' AS event_type,
+        2 AS event_prio,
+        NULL::tinyuint AS hard_state,
+        NULL::tinyuint AS previous_hard_state
+      FROM sla_history_downtime d
+      WHERE d.host_id = in_host_id
+        AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id)
+        AND d.downtime_start < in_end_time
+        AND d.downtime_end >= in_start_time
+        AND d.downtime_end < in_end_time
+    ) UNION ALL (
+      -- all state events strictly in interval
+      SELECT
+        event_time,
+        'state_change' AS event_type,
+        0 AS event_prio,
+        hard_state,
+        previous_hard_state
+      FROM sla_history_state s
+      WHERE s.host_id = in_host_id
+        AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
+        AND s.event_time > in_start_time
+        AND s.event_time < in_end_time
+    ) UNION ALL (
+      -- end event to keep loop simple, values are not used
+      SELECT
+        in_end_time AS event_time,
+        'end' AS event_type,
+        3 AS event_prio,
+        NULL::tinyuint AS hard_state,
+        NULL::tinyuint AS previous_hard_state
+    )
+    ORDER BY event_time, event_prio
+  LOOP
+    -- Account for the span between the previous event and this one before applying this event.
+    IF row.previous_hard_state = 99 THEN
+      total_time := total_time - (row.event_time - last_event_time);
+    ELSEIF ((in_service_id IS NULL AND last_hard_state > 0) OR (in_service_id IS NOT NULL AND last_hard_state > 1))
+      AND last_hard_state != 99
+      AND active_downtimes = 0
+    THEN
+      problem_time := problem_time + row.event_time - last_event_time;
+    END IF;
+
+    last_event_time := row.event_time;
+    IF row.event_type = 'state_change' THEN
+      last_hard_state := row.hard_state;
+    ELSEIF row.event_type = 'downtime_start' THEN
+      active_downtimes := active_downtimes + 1;
+    ELSEIF row.event_type = 'downtime_end' THEN
+      active_downtimes := active_downtimes - 1;
+    END IF;
+  END LOOP;
+
+  -- Divide as decimal: with integer operands "/" truncates the quotient, which would drop the
+  -- fractional part of the percentage before it is converted to decimal(7, 4)
+  -- (assumes biguint is an integer domain; the cast is harmless otherwise).
+  RETURN (100 * (total_time - problem_time)::decimal / total_time)::decimal(7, 4);
+END;
+$$;
+
CREATE TABLE host ( id bytea20 NOT NULL, environment_id bytea20 NOT NULL, From 5ea82188dc6ccc0fc9a2049b60c822a1871186eb Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Tue, 24 Aug 2021 15:12:31 +0200 Subject: [PATCH 5/8] SLA reporting: tests for the SQL stored function --- .github/workflows/sql.yml | 58 +++++ tests/go.mod | 4 +- tests/go.sum | 13 +- tests/internal/utils/database.go | 28 +++ tests/main_test.go | 19 +- tests/sql/main_test.go | 23 ++ tests/sql/sla_test.go | 406 +++++++++++++++++++++++++++++++ 7 files changed, 530 insertions(+), 21 deletions(-) create mode 100644 .github/workflows/sql.yml create mode 100644 tests/internal/utils/database.go create mode 100644 tests/sql/main_test.go create mode 100644 tests/sql/sla_test.go diff --git a/.github/workflows/sql.yml b/.github/workflows/sql.yml new file mode 100644 index 00000000..c5e4304f --- /dev/null +++ b/.github/workflows/sql.yml @@ -0,0 +1,58 @@ +name: SQL + +on: + push: + branches: + - master + pull_request: {} + +jobs: + sql: + name: ${{ matrix.database.name }} + runs-on: ubuntu-latest + +
strategy: + fail-fast: false + matrix: + database: + - {type: MYSQL, name: MySQL 5.5, image: "icinga/icingadb-mysql:5.5"} + - {type: MYSQL, name: MySQL 5.6, image: "icinga/icingadb-mysql:5.6"} + - {type: MYSQL, name: MySQL 5.7, image: "mysql:5.7"} + - {type: MYSQL, name: MySQL latest, image: "mysql:latest"} + - {type: MYSQL, name: MariaDB 10.1, image: "mariadb:10.1"} + - {type: MYSQL, name: MariaDB 10.2, image: "mariadb:10.2"} + - {type: MYSQL, name: MariaDB 10.3, image: "mariadb:10.3"} + - {type: MYSQL, name: MariaDB 10.4, image: "mariadb:10.4"} + - {type: MYSQL, name: MariaDB 10.5, image: "mariadb:10.5"} + - {type: MYSQL, name: MariaDB 10.6, image: "mariadb:10.6"} + - {type: MYSQL, name: MariaDB 10.7, image: "mariadb:10.7"} + - {type: MYSQL, name: MariaDB latest, image: "mariadb:latest"} + - {type: PGSQL, name: PostgreSQL 9.6, image: "postgres:9.6"} + - {type: PGSQL, name: PostgreSQL 10, image: "postgres:10"} + - {type: PGSQL, name: PostgreSQL 11, image: "postgres:11"} + - {type: PGSQL, name: PostgreSQL 12, image: "postgres:12"} + - {type: PGSQL, name: PostgreSQL 13, image: "postgres:13"} + - {type: PGSQL, name: PostgreSQL latest, image: "postgres:latest"} + + steps: + - name: Setup Go + uses: actions/setup-go@v1 + with: + go-version: '^1.16' + + - name: Checkout code + uses: actions/checkout@v2 + + - name: Download dependencies + run: go get -v -t -d ./... 
+ working-directory: tests/ + + - name: Run tests + env: + ICINGADB_TESTS_DATABASE_TYPE: ${{ matrix.database.type }} + ICINGA_TESTING_${{ matrix.database.type }}_IMAGE: ${{ matrix.database.image }} + ICINGA_TESTING_ICINGADB_SCHEMA_MYSQL: ${{ github.workspace }}/schema/mysql/schema.sql + ICINGA_TESTING_ICINGADB_SCHEMA_PGSQL: ${{ github.workspace }}/schema/pgsql/schema.sql + timeout-minutes: 10 + run: go test -v -timeout 5m ./sql + working-directory: tests/ diff --git a/tests/go.mod b/tests/go.mod index eda66e40..a0184ea3 100644 --- a/tests/go.mod +++ b/tests/go.mod @@ -5,10 +5,12 @@ go 1.16 require ( github.com/containerd/containerd v1.5.6 // indirect github.com/go-redis/redis/v8 v8.11.4 + github.com/go-sql-driver/mysql v1.6.0 github.com/goccy/go-yaml v1.9.5 github.com/google/uuid v1.3.0 - github.com/icinga/icinga-testing v0.0.0-20220503150619-1c215361234c + github.com/icinga/icinga-testing v0.0.0-20220513095329-9c98d3145b01 github.com/jmoiron/sqlx v1.3.4 + github.com/lib/pq v1.10.5 github.com/stretchr/testify v1.7.0 go.uber.org/zap v1.21.0 golang.org/x/net v0.0.0-20211020060615-d418f374d309 // indirect diff --git a/tests/go.sum b/tests/go.sum index 4477fc77..bfb5934f 100644 --- a/tests/go.sum +++ b/tests/go.sum @@ -37,6 +37,8 @@ github.com/Azure/go-autorest/logger v0.2.0/go.mod h1:T9E3cAhj2VqvPOtCYAvby9aBXkZ github.com/Azure/go-autorest/tracing v0.6.0/go.mod h1:+vhtPC754Xsa23ID7GlGsrdKBpUA79WCAKPPZVC2DeU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/Icinga/go-libs v0.0.0-20220420130327-ef58ad52edd8 h1:hG4Y/LPERK9i+P8/jnYlq9PeDd9deIkwEWOIimDU3uk= +github.com/Icinga/go-libs v0.0.0-20220420130327-ef58ad52edd8/go.mod h1:xlgU55MKs/vIg1fMlAEBSrslahYayZNwjXvf3w1dvyA= github.com/Microsoft/go-winio v0.4.11/go.mod h1:VhR8bwka0BXejwEJY73c50VrPtXAaKcyvVC4A4RozmA= github.com/Microsoft/go-winio v0.4.14/go.mod 
h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jBhyzoq1bpyYA= github.com/Microsoft/go-winio v0.4.15-0.20190919025122-fc70bd9a86b5/go.mod h1:tTuCMEN+UleMWgg9dVx4Hu52b1bJo+59jBh3ajtinzw= @@ -60,6 +62,7 @@ github.com/Microsoft/hcsshim/test v0.0.0-20201218223536-d3e5debf77da/go.mod h1:5 github.com/Microsoft/hcsshim/test v0.0.0-20210227013316-43a75bb4edd3/go.mod h1:mw7qgWloBUl75W/gVH3cQszUg1+gUITj7D6NY7ywVnY= github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= +github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ= @@ -68,6 +71,7 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuy github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alexflint/go-filemutex v0.0.0-20171022225611-72bdc8eae2ae/go.mod h1:CgnQgUtFrFz9mxFNtED3jI5tLDjKlOM+oUF/sTk6ps0= +github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= @@ -402,8 +406,8 @@ github.com/hashicorp/golang-lru v0.5.1/go.mod 
h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= -github.com/icinga/icinga-testing v0.0.0-20220503150619-1c215361234c h1:jJVPK9dvsZ99Xl/xDOBM+DEj+7i7En51/04ckc/lTZc= -github.com/icinga/icinga-testing v0.0.0-20220503150619-1c215361234c/go.mod h1:W9pLmq2dsgLSag568N/LDHNu4oah6qWvjT05Drz2RYw= +github.com/icinga/icinga-testing v0.0.0-20220513095329-9c98d3145b01 h1:0dwlZFGWPnmmhvHr2P7chxMwzbW7+R3iX6SyeFBd+WM= +github.com/icinga/icinga-testing v0.0.0-20220513095329-9c98d3145b01/go.mod h1:ZP0pyqhmrRwwQ6FpAfz7UZMgmH7i3vOjEOm9JcFwOw0= github.com/imdario/mergo v0.3.5/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= github.com/imdario/mergo v0.3.8/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= github.com/imdario/mergo v0.3.10/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= @@ -444,8 +448,9 @@ github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/leodido/go-urn v1.2.0 h1:hpXL4XnriNwQ/ABnpepYM/1vCLWNDfUNts8dX3xTG6Y= github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII= github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= -github.com/lib/pq v1.10.4 h1:SO9z7FRPzA03QhHKJrH5BXA6HU1rS4V2nIVrrNC1iYk= github.com/lib/pq v1.10.4/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/lib/pq v1.10.5 h1:J+gdV2cUmX7ZqL2B0lFcW0m+egaHC2V3lpO8nWxyYiQ= +github.com/lib/pq v1.10.5/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson 
v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= @@ -459,6 +464,7 @@ github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHX github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= github.com/mattn/go-shellwords v1.0.3/go.mod h1:3xCvwCdWdlDJUrvuMn7Wuy9eWs4pE8vqg+NOMyg4B2o= +github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus= github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRUbg= github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= @@ -725,6 +731,7 @@ golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzB golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= diff --git a/tests/internal/utils/database.go b/tests/internal/utils/database.go new file mode 100644 index 00000000..c32566b3 --- /dev/null +++ b/tests/internal/utils/database.go @@ -0,0 +1,28 @@
+package utils
+
+import (
+	"fmt"
+	"github.com/icinga/icinga-testing"
+	"github.com/icinga/icinga-testing/services"
+	"os"
+	"strings"
+	"testing"
+)
+
+// GetDatabase returns a database for the test t, obtained from it. The flavor is selected by the
+// ICINGADB_TESTS_DATABASE_TYPE environment variable, compared case-insensitively: "mysql" yields
+// it.MysqlDatabaseT(t) and "pgsql" yields it.PostgresqlDatabaseT(t). Any other value panics.
+func GetDatabase(it *icingatesting.IT, t testing.TB) services.RelationalDatabase {
+	envKey := "ICINGADB_TESTS_DATABASE_TYPE"
+
+	switch dbType := strings.ToLower(os.Getenv(envKey)); dbType {
+	case "mysql":
+		return it.MysqlDatabaseT(t)
+	case "pgsql":
+		return it.PostgresqlDatabaseT(t)
+	default:
+		panic(fmt.Sprintf(`unknown database in %s environment variable: %q (must be "mysql" or "pgsql")`, envKey, dbType))
+	}
+}
diff --git a/tests/main_test.go b/tests/main_test.go index 69bf4124..bccd9473 100644 --- a/tests/main_test.go +++ b/tests/main_test.go @@ -1,10 +1,9 @@ package icingadb_test import ( - "fmt" "github.com/icinga/icinga-testing" "github.com/icinga/icinga-testing/services" - "os" + "github.com/icinga/icingadb/tests/internal/utils" "testing" ) @@ -26,19 +25,5 @@ func getDatabase(t testing.TB) services.RelationalDatabase { } func getEmptyDatabase(t testing.TB) services.RelationalDatabase { - k := "ICINGADB_TESTS_DATABASE_TYPE" - v := os.Getenv(k) - - var rdb services.RelationalDatabase - - switch v { - case "mysql": - rdb = it.MysqlDatabaseT(t) - case "pgsql": - rdb = it.PostgresqlDatabaseT(t) - default: - panic(fmt.Sprintf(`unknown database in %s environment variable: %q (must be "mysql" or "pgsql")`, k, v)) - } - - return rdb + return utils.GetDatabase(it, t) } diff --git a/tests/sql/main_test.go b/tests/sql/main_test.go new file mode 100644 index 00000000..a2951259 --- /dev/null +++ b/tests/sql/main_test.go @@ -0,0 +1,23 @@
+package sql_test
+
+import (
+	"github.com/icinga/icinga-testing"
+	"github.com/icinga/icinga-testing/services"
+	"github.com/icinga/icingadb/tests/internal/utils"
+	"testing"
+)
+
+// it is the testing environment shared by all tests of this package; it is set up in TestMain.
+var it *icingatesting.IT
+
+// TestMain creates the shared testing environment, runs the tests and cleans up when returning.
+func TestMain(m *testing.M) {
+	it = icingatesting.NewIT()
+	defer it.Cleanup()
+
+	m.Run()
+}
+
+// getDatabase returns a database for the test t with the Icinga DB schema imported.
+func getDatabase(t testing.TB) services.RelationalDatabase {
+	db := utils.GetDatabase(it, t)
+	db.ImportIcingaDbSchema()
+
+	return db
+}
diff --git a/tests/sql/sla_test.go b/tests/sql/sla_test.go new file mode 100644 index
00000000..b87a32d6 --- /dev/null +++ b/tests/sql/sla_test.go @@ -0,0 +1,406 @@
+package sql_test
+
+import (
+	"crypto/rand"
+	"database/sql/driver"
+	"fmt"
+	"github.com/go-sql-driver/mysql"
+	"github.com/jmoiron/sqlx"
+	"github.com/lib/pq"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"testing"
+)
+
+// TestSla exercises the get_sla_ok_percent SQL function. Each table entry writes its events to the
+// database and compares the function result for the interval Start..End with Expected, once for a
+// host and once for a service. The "Invalid" subtest checks the error raised for intervals whose
+// end is not greater than their start.
+func TestSla(t *testing.T) {
+	rdb := getDatabase(t)
+	db, err := sqlx.Open(rdb.Driver(), rdb.DSN())
+	require.NoError(t, err, "connect to database")
+	// All subtests below run synchronously, so the handle can be released when this function returns.
+	defer func() { _ = db.Close() }()
+
+	type TestData struct {
+		Name     string
+		Events   []SlaHistoryEvent
+		Start    uint64
+		End      uint64
+		Expected float64
+	}
+
+	tests := []TestData{{
+		Name: "EmptyHistory",
+		// Empty history implies no previous problem state, therefore SLA should be 100%
+		Events:   nil,
+		Start:    1000,
+		End:      2000,
+		Expected: 100.0,
+	}, {
+		Name: "MultipleStateChanges",
+		// Some flapping, test that all changes are considered.
+		Events: []SlaHistoryEvent{
+			&State{Time: 1000, State: 2, PreviousState: 99}, // -10%
+			&State{Time: 1100, State: 0, PreviousState: 2},
+			&State{Time: 1300, State: 2, PreviousState: 0}, // -10%
+			&State{Time: 1400, State: 0, PreviousState: 2},
+			&State{Time: 1600, State: 2, PreviousState: 0}, // -10%
+			&State{Time: 1700, State: 0, PreviousState: 2},
+			&State{Time: 1900, State: 2, PreviousState: 0}, // -10%
+		},
+		Start:    1000,
+		End:      2000,
+		Expected: 60.0,
+	}, {
+		Name: "OverlappingDowntimesAndProblems",
+		// SLA should be 90%:
+		// 1000..1100: OK, no downtime
+		// 1100..1200: OK, in downtime
+		// 1200..1300: CRITICAL, in downtime
+		// 1300..1400: CRITICAL, no downtime (only period counting for SLA, -10%)
+		// 1400..1500: CRITICAL, in downtime
+		// 1500..1600: OK, in downtime
+		// 1600..2000: OK, no downtime
+		Events: []SlaHistoryEvent{
+			&Downtime{Start: 1100, End: 1300},
+			&Downtime{Start: 1400, End: 1600},
+			&State{Time: 1200, State: 2, PreviousState: 0},
+			&State{Time: 1500, State: 0, PreviousState: 2},
+		},
+		Start:    1000,
+		End:      2000,
+		Expected: 90.0,
+	}, {
+		Name: "CriticalBeforeInterval",
+		// If there is no event within the SLA interval, the last state from before the interval should be used.
+		Events: []SlaHistoryEvent{
+			&State{Time: 0, State: 2, PreviousState: 99},
+		},
+		Start:    1000,
+		End:      2000,
+		Expected: 0.0,
+	}, {
+		Name: "CriticalBeforeIntervalWithDowntime",
+		// State change and downtime start from before the SLA interval should be considered if still relevant.
+		Events: []SlaHistoryEvent{
+			&State{Time: 800, State: 2, PreviousState: 99},
+			&Downtime{Start: 600, End: 1800},
+		},
+		Start:    1000,
+		End:      2000,
+		Expected: 80.0,
+	}, {
+		Name: "CriticalBeforeIntervalWithOverlappingDowntimes",
+		// Test that overlapping downtimes are properly accounted for.
+		Events: []SlaHistoryEvent{
+			&State{Time: 800, State: 2, PreviousState: 99},
+			&Downtime{Start: 600, End: 1000},
+			&Downtime{Start: 800, End: 1200},
+			&Downtime{Start: 1000, End: 1400},
+			// Everything except 1400-1600 is covered by downtimes, -20%
+			&Downtime{Start: 1600, End: 2000},
+			&Downtime{Start: 1800, End: 2200},
+		},
+		Start:    1000,
+		End:      2000,
+		Expected: 80.0,
+	}, {
+		Name: "FallbackToPreviousState",
+		// If there is no state event from before the SLA interval, the previous hard state from the first event
+		// after the beginning of the SLA interval should be used as the initial state.
+		Events: []SlaHistoryEvent{
+			&State{Time: 1200, State: 0, PreviousState: 2},
+		},
+		Start:    1000,
+		End:      2000,
+		Expected: 80.0,
+	}, {
+		Name: "FallbackToCurrentState",
+		// If there are no state history events, the current state of the checkable should be used.
+		Events: []SlaHistoryEvent{
+			&CurrentState{State: 2},
+		},
+		Start:    1000,
+		End:      2000,
+		Expected: 0.0,
+	}, {
+		Name: "PreferInitialStateFromBeforeOverLaterState",
+		// The previous_hard_state should only be used as a fallback when there is no event from before the
+		// SLA interval. Therefore, the latter should be preferred if there is conflicting information.
+		Events: []SlaHistoryEvent{
+			&State{Time: 800, State: 2, PreviousState: 99},
+			&State{Time: 1200, State: 0, PreviousState: 0},
+		},
+		Start:    1000,
+		End:      2000,
+		Expected: 80.0,
+	}, {
+		Name: "PreferInitialStateFromBeforeOverCurrentState",
+		// The current state should only be used as a fallback when there is no state history event.
+		// Therefore, the latter should be preferred if there is conflicting information.
+		Events: []SlaHistoryEvent{
+			&State{Time: 800, State: 2, PreviousState: 99},
+			&CurrentState{State: 0},
+		},
+		Start:    1000,
+		End:      2000,
+		Expected: 0.0,
+	}, {
+		Name: "PreferLaterStateOverCurrentState",
+		// The current state should only be used as a fallback when there is no state history event.
+		// Therefore, the latter should be preferred if there is conflicting information.
+		Events: []SlaHistoryEvent{
+			&State{Time: 1200, State: 0, PreviousState: 2},
+			&CurrentState{State: 2},
+		},
+		Start:    1000,
+		End:      2000,
+		Expected: 80.0,
+	}, {
+		Name: "InitialUnknownReducesTotalTime",
+		Events: []SlaHistoryEvent{
+			&State{Time: 1500, State: 2, PreviousState: 99},
+			&State{Time: 1700, State: 0, PreviousState: 2},
+			&CurrentState{State: 0},
+		},
+		Start:    1000,
+		End:      2000,
+		Expected: 60,
+	}, {
+		Name: "IntermediateUnknownReducesTotalTime",
+		Events: []SlaHistoryEvent{
+			&State{Time: 1000, State: 0, PreviousState: 2},
+			&State{Time: 1100, State: 2, PreviousState: 0},
+			&State{Time: 1600, State: 0, PreviousState: 99},
+			&State{Time: 1800, State: 2, PreviousState: 0},
+			&CurrentState{State: 0},
+		},
+		Start:    1000,
+		End:      2000,
+		Expected: 60,
+	}}
+
+	for _, test := range tests {
+		t.Run(test.Name, func(t *testing.T) {
+			testSla(t, db, test.Events, test.Start, test.End, test.Expected, "unexpected SLA value")
+		})
+	}
+
+	t.Run("Invalid", func(t *testing.T) {
+		m := SlaHistoryMeta{
+			EnvironmentId: make([]byte, 20),
+			EndpointId:    make([]byte, 20),
+			ObjectType:    "host",
+			HostId:        make([]byte, 20),
+		}
+
+		// checkErr asserts that err is the driver-specific error carrying the custom message
+		// raised by the SLA function.
+		checkErr := func(t *testing.T, err error) {
+			require.Error(t, err, "SLA function should return an error")
+
+			switch d := db.DriverName(); d {
+			case "mysql":
+				var mysqlErr *mysql.MySQLError
+				require.ErrorAs(t, err, &mysqlErr, "SLA function should return a MySQL error")
+				// https://dev.mysql.com/doc/mysql-errors/8.0/en/server-error-reference.html#error_er_signal_exception
+				assert.Equal(t, uint16(1644), mysqlErr.Number, "MySQL error should be ER_SIGNAL_EXCEPTION")
+				assert.Equal(t, "end time must be greater than start time", mysqlErr.Message,
+					"MySQL error should contain custom message")
+
+			case "postgres":
+				var pqErr *pq.Error
+				require.ErrorAs(t, err, &pqErr, "SLA function should return a PostgreSQL error")
+				// https://www.postgresql.org/docs/current/errcodes-appendix.html (P0001 = raise_exception)
+				assert.Equal(t, pq.ErrorCode("P0001"), pqErr.Code, "PostgreSQL error should be raise_exception")
+				assert.Equal(t, "end time must be greater than start time", pqErr.Message,
+					"PostgreSQL error should contain custom message")
+
+			default:
+				panic(fmt.Sprintf("unknown database driver %q", d))
+			}
+		}
+
+		t.Run("ZeroDuration", func(t *testing.T) {
+			_, err := execSqlSlaFunc(db, &m, 1000, 1000)
+			checkErr(t, err)
+		})
+
+		t.Run("NegativeDuration", func(t *testing.T) {
+			_, err := execSqlSlaFunc(db, &m, 2000, 1000)
+			checkErr(t, err)
+		})
+	})
+}
+
+// execSqlSlaFunc calls get_sla_ok_percent for the host/service IDs in m and the interval
+// start..end and returns its result.
+func execSqlSlaFunc(db *sqlx.DB, m *SlaHistoryMeta, start uint64, end uint64) (float64, error) {
+	var result float64
+	err := db.Get(&result, db.Rebind("SELECT get_sla_ok_percent(?, ?, ?, ?)"),
+		m.HostId, m.ServiceId, start, end)
+	return result, err
+}
+
+// testSla runs the given scenario twice as subtests: once for a host and once for a service.
+func testSla(t *testing.T, db *sqlx.DB, events []SlaHistoryEvent, start uint64, end uint64, expected float64, msg string) {
+	t.Run("Host", func(t *testing.T) {
+		testSlaWithObjectType(t, db, events, false, start, end, expected, msg)
+	})
+	t.Run("Service", func(t *testing.T) {
+		testSlaWithObjectType(t, db, events, true, start, end, expected, msg)
+	})
+}
+
+// testSlaWithObjectType writes events for an object with freshly generated random IDs (a service
+// if service is true, a host otherwise) and asserts that get_sla_ok_percent returns expected for
+// the interval start..end.
+func testSlaWithObjectType(t *testing.T, db *sqlx.DB,
+	events []SlaHistoryEvent, service bool, start uint64, end uint64, expected float64, msg string,
+) {
+	makeId := func() []byte {
+		id := make([]byte, 20)
+		_, err := rand.Read(id)
+		require.NoError(t, err, "generating random id failed")
+		return id
+	}
+
+	meta := SlaHistoryMeta{
+		EnvironmentId: makeId(),
+		EndpointId:    makeId(),
+		HostId:        makeId(),
+	}
+	if service {
+		meta.ObjectType = "service"
+		meta.ServiceId = makeId()
+	} else {
+		// ServiceId stays nil for hosts and is therefore written as NULL (see NullableBytes).
+		meta.ObjectType = "host"
+	}
+
+	for _, event := range events {
+		err := event.WriteSlaEventToDatabase(db, &meta)
+		require.NoErrorf(t, err, "Inserting SLA history for %#v failed", event)
+	}
+
+	r, err := execSqlSlaFunc(db, &meta, start, end)
+	require.NoError(t, err, "SLA query should not fail")
+	assert.Equal(t, expected, r, msg)
+}
+
+// SlaHistoryMeta holds the columns identifying the object that an SLA history row belongs to.
+type SlaHistoryMeta struct {
+	EnvironmentId NullableBytes `db:"environment_id"`
+	EndpointId    NullableBytes `db:"endpoint_id"`
+	ObjectType    string        `db:"object_type"`
+	HostId        NullableBytes `db:"host_id"`
+	ServiceId     NullableBytes `db:"service_id"`
+}
+
+// SlaHistoryEvent is a piece of test data that can write itself to the database for the object
+// described by m.
+type SlaHistoryEvent interface {
+	WriteSlaEventToDatabase(db *sqlx.DB, m *SlaHistoryMeta) error
+}
+
+// State is a hard state change that is written to the sla_history_state table.
+type State struct {
+	Time          uint64
+	State         uint8
+	PreviousState uint8
+}
+
+var _ SlaHistoryEvent = (*State)(nil)
+
+// WriteSlaEventToDatabase inserts s into sla_history_state using a random id.
+func (s *State) WriteSlaEventToDatabase(db *sqlx.DB, m *SlaHistoryMeta) error {
+	type values struct {
+		*SlaHistoryMeta
+		Id                []byte `db:"id"`
+		EventTime         uint64 `db:"event_time"`
+		HardState         uint8  `db:"hard_state"`
+		PreviousHardState uint8  `db:"previous_hard_state"`
+	}
+
+	id := make([]byte, 20)
+	_, err := rand.Read(id)
+	if err != nil {
+		return err
+	}
+
+	_, err = db.NamedExec("INSERT INTO sla_history_state"+
+		" (id, environment_id, endpoint_id, object_type, host_id, service_id, event_time, hard_state, previous_hard_state)"+
+		" VALUES (:id, :environment_id, :endpoint_id, :object_type, :host_id, :service_id, :event_time, :hard_state, :previous_hard_state)",
+		&values{
+			SlaHistoryMeta:    m,
+			Id:                id,
+			EventTime:         s.Time,
+			HardState:         s.State,
+			PreviousHardState: s.PreviousState,
+		})
+	return err
+}
+
+// CurrentState is the current state of the checkable. It is written to the host_state or
+// service_state table rather than to an SLA history table.
+type CurrentState struct {
+	State uint8
+}
+
+// WriteSlaEventToDatabase inserts c into host_state if m has no service ID and into service_state
+// otherwise. The soft, hard and previous state columns are all set to c.State.
+func (c *CurrentState) WriteSlaEventToDatabase(db *sqlx.DB, m *SlaHistoryMeta) error {
+	type values struct {
+		*SlaHistoryMeta
+		State              uint8         `db:"state"`
+		PropertiesChecksum NullableBytes `db:"properties_checksum"`
+	}
+
+	v := values{
+		SlaHistoryMeta:     m,
+		State:              c.State,
+		PropertiesChecksum: make([]byte, 20),
+	}
+
+	query := "INSERT INTO host_state" +
+		" (id, host_id, environment_id, properties_checksum, soft_state, previous_soft_state," +
+		" hard_state, previous_hard_state, attempt, severity, last_state_change, next_check, next_update)" +
+		" VALUES (:host_id, :host_id, :environment_id, :properties_checksum, :state, :state, :state, :state, 0, 0, 0, 0, 0)"
+	if len(m.ServiceId) != 0 {
+		query = "INSERT INTO service_state" +
+			" (id, host_id, service_id, environment_id, properties_checksum, soft_state, previous_soft_state," +
+			" hard_state, previous_hard_state, attempt, severity, last_state_change, next_check, next_update)" +
+			" VALUES (:service_id, :host_id, :service_id, :environment_id, :properties_checksum, :state, :state, :state, :state, 0, 0, 0, 0, 0)"
+	}
+
+	_, err := db.NamedExec(query, &v)
+	return err
+}
+
+var _ SlaHistoryEvent = (*CurrentState)(nil)
+
+// Downtime is a downtime that is written to the sla_history_downtime table.
+type Downtime struct {
+	Start uint64
+	End   uint64
+}
+
+var _ SlaHistoryEvent = (*Downtime)(nil)
+
+// slaHistoryDowntime is the set of named parameters for inserting a row into sla_history_downtime.
+type slaHistoryDowntime struct {
+	*SlaHistoryMeta
+	DowntimeId    []byte `db:"downtime_id"`
+	DowntimeStart uint64 `db:"downtime_start"`
+	DowntimeEnd   uint64 `db:"downtime_end"`
+}
+
+// WriteSlaEventToDatabase inserts d into sla_history_downtime using a random downtime_id.
+func (d *Downtime) WriteSlaEventToDatabase(db *sqlx.DB, m *SlaHistoryMeta) error {
+	downtimeId := make([]byte, 20)
+	_, err := rand.Read(downtimeId)
+	if err != nil {
+		return err
+	}
+
+	_, err = db.NamedExec("INSERT INTO sla_history_downtime"+
+		" (environment_id, endpoint_id, object_type, host_id, service_id, downtime_id, downtime_start, downtime_end)"+
+		" VALUES (:environment_id, :endpoint_id, :object_type, :host_id,"+
+		" :service_id, :downtime_id, :downtime_start, :downtime_end)",
+		&slaHistoryDowntime{
+			SlaHistoryMeta: m,
+			DowntimeId:     downtimeId,
+			DowntimeStart:  d.Start,
+			DowntimeEnd:    d.End,
+		})
+	return err
+}
+
+// NullableBytes allows writing to binary columns in a database with support for NULL.
+type NullableBytes []byte
+
+// Value implements the database/sql/driver.Valuer interface.
+func (b NullableBytes) Value() (driver.Value, error) {
+	if b != nil {
+		return []byte(b), nil
+	}
+
+	// any(nil) is treated as NULL in contrast to []byte(nil) which is a non-NULL byte sequence of length 0.
+	return nil, nil
+}
From d78ecdf994d3d164d53ce141c163c869f38e137c Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Mon, 6 Sep 2021 17:08:54 +0200 Subject: [PATCH 6/8] SLA reporting: integration tests --- tests/sla_test.go | 385 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 385 insertions(+) create mode 100644 tests/sla_test.go diff --git a/tests/sla_test.go b/tests/sla_test.go new file mode 100644 index 00000000..6fa3a0e9 --- /dev/null +++ b/tests/sla_test.go @@ -0,0 +1,385 @@ +package icingadb_test + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "github.com/icinga/icinga-testing/utils" + "github.com/icinga/icinga-testing/utils/eventually" + "github.com/jmoiron/sqlx" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + "math" + "net/http" + "testing" + "time" +) + +func TestSla(t *testing.T) { + m := it.MysqlDatabaseT(t) + m.ImportIcingaDbSchema() + + r := it.RedisServerT(t) + i := it.Icinga2NodeT(t, "master") + i.EnableIcingaDb(r) + err := i.Reload() + require.NoError(t, err, "icinga2 should reload without error") + it.IcingaDbInstanceT(t, r, m) + + client := i.ApiClient() + + t.Run("StateEvents", func(t *testing.T) { + t.Parallel() + + hostname := utils.UniqueName(t, "host") + client.CreateHost(t, hostname, map[string]interface{}{ + "attrs": map[string]interface{}{ + "enable_active_checks": false, +
"enable_passive_checks": true, + "check_command": "dummy", + "max_check_attempts": 3, + }, + }) + + type StateChange struct { + Time float64 + State int + } + + var stateChanges []StateChange + + processCheckResult := func(exitStatus int, isHard bool) *ObjectsHostsResponse { + time.Sleep(10 * time.Millisecond) // ensure there is a bit of difference in ms resolution + + output := utils.UniqueName(t, "output") + data := ActionsProcessCheckResultRequest{ + Type: "Host", + Filter: fmt.Sprintf(`host.name==%q`, hostname), + ExitStatus: exitStatus, + PluginOutput: output, + } + dataJson, err := json.Marshal(data) + require.NoError(t, err, "marshal request") + response, err := client.PostJson("/v1/actions/process-check-result", bytes.NewBuffer(dataJson)) + require.NoError(t, err, "process-check-result") + require.Equal(t, 200, response.StatusCode, "process-check-result") + + response, err = client.GetJson("/v1/objects/hosts/" + hostname) + require.NoError(t, err, "get host: request") + require.Equal(t, 200, response.StatusCode, "get host: request") + + var hosts ObjectsHostsResponse + err = json.NewDecoder(response.Body).Decode(&hosts) + require.NoError(t, err, "get host: parse response") + + require.Equal(t, 1, len(hosts.Results), "there must be one host in the response") + host := hosts.Results[0] + require.Equal(t, output, host.Attrs.LastCheckResult.Output, + "last check result should be visible in host object") + require.Equal(t, exitStatus, host.Attrs.State, "soft state should match check result") + + if isHard { + require.Equal(t, exitStatus, host.Attrs.LastHardState, "hard state should match check result") + if len(stateChanges) > 0 { + require.Greater(t, host.Attrs.LastHardStateChange, stateChanges[len(stateChanges)-1].Time, + "last_hard_state_change_time of host should have changed") + } + stateChanges = append(stateChanges, StateChange{ + Time: host.Attrs.LastHardStateChange, + State: exitStatus, + }) + } else { + require.NotEmpty(t, stateChanges, "there should 
be a hard state change prior to a soft one") + require.Equal(t, stateChanges[len(stateChanges)-1].Time, host.Attrs.LastHardStateChange, + "check result should not lead to a hard state change, i.e. last_hard_state_change should not change") + } + + return &hosts + } + + processCheckResult(0, true) // hard (UNKNOWN -> UP) + processCheckResult(1, false) // soft + processCheckResult(1, false) // soft + processCheckResult(1, true) // hard (UP -> DOWN) + processCheckResult(1, false) // hard + processCheckResult(0, true) // hard (DOWN -> UP) + processCheckResult(0, false) // hard + + assert.Equal(t, 3, len(stateChanges), "there should be three hard state changes") + + db, err := sqlx.Connect("mysql", m.DSN()) + require.NoError(t, err, "connecting to mysql") + defer func() { _ = db.Close() }() + + type Row struct { + Time int64 `db:"event_time"` + State int `db:"hard_state"` + } + + eventually.Assert(t, func(t require.TestingT) { + var rows []Row + err = db.Select(&rows, db.Rebind("SELECT s.event_time, s.hard_state FROM sla_history_state s "+ + "JOIN host ON host.id = s.host_id WHERE host.name = ? 
ORDER BY event_time ASC"), hostname) + require.NoError(t, err, "select sla_history_state") + + assert.Equal(t, len(stateChanges), len(rows), "number of sla_history_state entries") + + for i := range rows { + assert.WithinDuration(t, time.UnixMilli(int64(stateChanges[i].Time*1000)), time.UnixMilli(rows[i].Time), + time.Millisecond, "event time should match state change time") + assert.Equal(t, stateChanges[i].State, rows[i].State, "hard state should match") + } + }, 5*time.Second, 200*time.Millisecond) + + redis := r.Open() + defer func() { _ = redis.Close() }() + + logger := it.Logger(t) + + logger.Debug("redis state history", zap.Bool("before", true)) + eventually.Assert(t, func(t require.TestingT) { + result, err := redis.XRange(context.Background(), "icinga:history:stream:state", "-", "+").Result() + require.NoError(t, err, "reading state history stream should not fail") + logger.Debug("redis state history", zap.Any("values", result)) + assert.Empty(t, result, "redis state history stream should be drained") + }, 5*time.Second, 10*time.Millisecond) + logger.Debug("redis state history", zap.Bool("after", true)) + }) + + t.Run("DowntimeEvents", func(t *testing.T) { + t.Parallel() + + type Options struct { + Fixed bool // Whether to schedule a fixed or flexible downtime. + Cancel bool // Whether to cancel the downtime or let it expire. 
+ } + + downtimeTest := func(t *testing.T, o Options) { + hostname := utils.UniqueName(t, "host") + client.CreateHost(t, hostname, map[string]interface{}{ + "attrs": map[string]interface{}{ + "enable_active_checks": false, + "enable_passive_checks": true, + "check_command": "dummy", + "max_check_attempts": 1, + }, + }) + + processCheckResult := func(status int) time.Time { + output := utils.RandomString(8) + reqBody, err := json.Marshal(ActionsProcessCheckResultRequest{ + Type: "Host", + Filter: fmt.Sprintf(`host.name==%q`, hostname), + ExitStatus: status, + PluginOutput: output, + }) + require.NoError(t, err, "marshal request") + response, err := client.PostJson("/v1/actions/process-check-result", bytes.NewBuffer(reqBody)) + require.NoError(t, err, "process-check-result") + require.Equal(t, 200, response.StatusCode, "process-check-result") + + response, err = client.GetJson("/v1/objects/hosts/" + hostname) + require.NoError(t, err, "get host: request") + require.Equal(t, 200, response.StatusCode, "get host: request") + + var hosts ObjectsHostsResponse + err = json.NewDecoder(response.Body).Decode(&hosts) + require.NoError(t, err, "get host: parse response") + + require.Equal(t, 1, len(hosts.Results), "there must be one host in the response") + host := hosts.Results[0] + require.Equal(t, output, host.Attrs.LastCheckResult.Output, + "last check result should be visible in host object") + require.Equal(t, 1, host.Attrs.StateType, "host should be in hard state") + require.Equal(t, status, host.Attrs.State, "state should match check result") + + sec, nsec := math.Modf(host.Attrs.LastCheckResult.ExecutionEnd) + return time.Unix(int64(sec), int64(nsec*1e9)) + } + + // Ensure that host is in UP state. + processCheckResult(0) + + refTime := time.Now().Truncate(time.Second) + // Schedule the downtime start in the past so that we would notice if Icinga 2/DB would + // use the current time somewhere where we expect the scheduled start time. 
+ downtimeStart := refTime.Add(-1 * time.Hour) + var downtimeEnd time.Time + if o.Cancel || !o.Fixed { + // Downtimes we will cancel can expire long in the future as we don't have to wait for it. + // Same for flexible downtimes as for these, we don't have to wait until the scheduled end but only + // for their duration. + downtimeEnd = refTime.Add(1 * time.Hour) + } else { + // Let all other downtimes expire soon (fixed downtimes where we wait for expiry). + downtimeEnd = refTime.Add(5 * time.Second) + } + + var duration time.Duration + if !o.Fixed { + duration = 10 * time.Second + } + req, err := json.Marshal(ActionsScheduleDowntimeRequest{ + Type: "Host", + Filter: fmt.Sprintf(`host.name==%q`, hostname), + StartTime: downtimeStart.Unix(), + EndTime: downtimeEnd.Unix(), + Fixed: o.Fixed, + Duration: duration.Seconds(), + Author: utils.RandomString(8), + Comment: utils.RandomString(8), + }) + require.NoError(t, err, "marshal request") + response, err := client.PostJson("/v1/actions/schedule-downtime", bytes.NewBuffer(req)) + require.NoError(t, err, "schedule-downtime") + require.Equal(t, 200, response.StatusCode, "schedule-downtime") + + var scheduleResponse ActionsScheduleDowntimeResponse + err = json.NewDecoder(response.Body).Decode(&scheduleResponse) + require.NoError(t, err, "decode schedule-downtime response") + require.Equal(t, 1, len(scheduleResponse.Results), "schedule-downtime should return 1 result") + require.Equal(t, http.StatusOK, scheduleResponse.Results[0].Code, "schedule-downtime should return 1 result") + downtimeName := scheduleResponse.Results[0].Name + + type Row struct { + Start int64 `db:"downtime_start"` + End int64 `db:"downtime_end"` + } + + db, err := sqlx.Connect("mysql", m.DSN()) + require.NoError(t, err, "connecting to mysql") + defer func() { _ = db.Close() }() + + if !o.Fixed { + // Give Icinga 2 and Icinga DB some time that if they would generate an SLA history event in error, + // they have a chance to do so before we check for its 
absence. + time.Sleep(10 * time.Second) + + var count int + err = db.Get(&count, db.Rebind("SELECT COUNT(*) FROM sla_history_downtime s "+ + "JOIN host ON host.id = s.host_id WHERE host.name = ?"), hostname) + require.NoError(t, err, "select sla_history_state") + assert.Zero(t, count, "there should be no event in sla_history_downtime when scheduling a flexible downtime on an UP host") + } + + // Bring host into DOWN state. + criticalTime := processCheckResult(1) + + eventually.Assert(t, func(t require.TestingT) { + var rows []Row + err = db.Select(&rows, db.Rebind("SELECT s.downtime_start, s.downtime_end FROM sla_history_downtime s "+ + "JOIN host ON host.id = s.host_id WHERE host.name = ?"), hostname) + require.NoError(t, err, "select sla_history_state") + + require.Equal(t, 1, len(rows), "there should be exactly one sla_history_downtime row") + if o.Fixed { + assert.Equal(t, downtimeStart, time.UnixMilli(rows[0].Start), + "downtime_start should match scheduled start time") + assert.Equal(t, downtimeEnd, time.UnixMilli(rows[0].End), + "downtime_end should match scheduled end time") + } else { + assert.WithinDuration(t, criticalTime, time.UnixMilli(rows[0].Start), time.Second, + "downtime_start should match time of host state change") + assert.Equal(t, duration, time.UnixMilli(rows[0].End).Sub(time.UnixMilli(rows[0].Start)), + "downtime_end - downtime_start duration should match scheduled duration") + } + }, 5*time.Second, 200*time.Millisecond) + + redis := r.Open() + defer func() { _ = redis.Close() }() + + eventually.Assert(t, func(t require.TestingT) { + result, err := redis.XRange(context.Background(), "icinga:history:stream:downtime", "-", "+").Result() + require.NoError(t, err, "reading downtime history stream should not fail") + assert.Empty(t, result, "redis downtime history stream should be drained") + }, 5*time.Second, 10*time.Millisecond) + + if o.Cancel { + req, err = json.Marshal(ActionsRemoveDowntimeRequest{ + Downtime: downtimeName, + }) + 
require.NoError(t, err, "marshal remove-downtime request") + response, err = client.PostJson("/v1/actions/remove-downtime", bytes.NewBuffer(req)) + require.NoError(t, err, "remove-downtime") + require.Equal(t, 200, response.StatusCode, "remove-downtime") + } + + downtimeCancel := time.Now() + + if !o.Cancel { + // Wait for downtime to expire + a few extra seconds. The row should not be updated, give + // enough time to have a chance catching if Icinga DB updates it nonetheless. + if !o.Fixed { + time.Sleep(duration + 5*time.Second) + } else { + d := time.Until(downtimeEnd) + 5*time.Second + require.Less(t, d, time.Minute, "bug in tests: don't wait too long") + time.Sleep(d) + } + } + + eventually.Assert(t, func(t require.TestingT) { + var rows []Row + err = db.Select(&rows, db.Rebind("SELECT s.downtime_start, s.downtime_end FROM sla_history_downtime s "+ + "JOIN host ON host.id = s.host_id WHERE host.name = ?"), hostname) + require.NoError(t, err, "select sla_history_state") + + require.Equal(t, 1, len(rows), "there should be exactly one sla_history_downtime row") + if o.Fixed { + assert.Equal(t, downtimeStart, time.UnixMilli(rows[0].Start), + "downtime_start should match scheduled start") + } else { + assert.WithinDuration(t, criticalTime, time.UnixMilli(rows[0].Start), time.Second, + "downtime_start should match critical time") + } + if o.Cancel { + // Allow more delta for the end time after cancel as we did not choose the exact time. 
+ assert.WithinDuration(t, downtimeCancel, time.UnixMilli(rows[0].End), time.Second, + "downtime_end should match cancel time") + } else if o.Fixed { + assert.Equal(t, downtimeEnd, time.UnixMilli(rows[0].End), + "downtime_start should match scheduled end") + } else { + assert.Equal(t, duration, time.UnixMilli(rows[0].End).Sub(time.UnixMilli(rows[0].Start)), + "downtime_end - downtime_start duration should match scheduled duration") + } + }, 5*time.Second, 200*time.Millisecond) + + eventually.Assert(t, func(t require.TestingT) { + result, err := redis.XRange(context.Background(), "icinga:history:stream:downtime", "-", "+").Result() + require.NoError(t, err, "reading downtime history stream should not fail") + assert.Empty(t, result, "redis downtime history stream should be drained") + }, 5*time.Second, 10*time.Millisecond) + } + + t.Run("Fixed", func(t *testing.T) { + t.Parallel() + + t.Run("Cancel", func(t *testing.T) { + t.Parallel() + downtimeTest(t, Options{Fixed: true, Cancel: true}) + }) + + t.Run("Expire", func(t *testing.T) { + t.Parallel() + downtimeTest(t, Options{Fixed: true, Cancel: false}) + }) + }) + + t.Run("Flexible", func(t *testing.T) { + t.Parallel() + + t.Run("Cancel", func(t *testing.T) { + t.Parallel() + downtimeTest(t, Options{Fixed: false, Cancel: true}) + }) + + t.Run("Expire", func(t *testing.T) { + t.Parallel() + downtimeTest(t, Options{Fixed: false, Cancel: false}) + }) + }) + }) +} From d119be0da5a244b4268ff3107f820d3dd2e09edf Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Thu, 18 Nov 2021 11:03:34 +0100 Subject: [PATCH 7/8] SLA reporting: fill new tables from history in schema migration --- schema/mysql/upgrades/1.0.0.sql | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/schema/mysql/upgrades/1.0.0.sql b/schema/mysql/upgrades/1.0.0.sql index 0409e528..b0fcef10 100644 --- a/schema/mysql/upgrades/1.0.0.sql +++ b/schema/mysql/upgrades/1.0.0.sql @@ -222,6 +222,13 @@ CREATE TABLE sla_history_state ( INDEX 
idx_sla_history_state_event (host_id, service_id, event_time) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC; +INSERT INTO sla_history_state + (id, environment_id, endpoint_id, object_type, host_id, service_id, event_time, hard_state, previous_hard_state) + SELECT id, environment_id, endpoint_id, object_type, host_id, service_id, event_time, hard_state, previous_hard_state + FROM state_history + WHERE state_type = 'hard' + ON DUPLICATE KEY UPDATE sla_history_state.id = sla_history_state.id; + CREATE TABLE sla_history_downtime ( environment_id binary(20) NOT NULL COMMENT 'environment.id', endpoint_id binary(20) DEFAULT NULL COMMENT 'endpoint.id', @@ -238,5 +245,12 @@ CREATE TABLE sla_history_downtime ( INDEX idx_sla_history_downtime_event (host_id, service_id, downtime_start, downtime_end) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC; +INSERT INTO sla_history_downtime + (environment_id, endpoint_id, object_type, host_id, service_id, downtime_id, downtime_start, downtime_end) + SELECT environment_id, endpoint_id, object_type, host_id, service_id, downtime_id, + start_time AS downtime_start, IF(has_been_cancelled = 'y', cancel_time, end_time) AS downtime_end + FROM downtime_history + ON DUPLICATE KEY UPDATE sla_history_downtime.downtime_id = sla_history_downtime.downtime_id; + INSERT INTO icingadb_schema (version, TIMESTAMP) VALUES (3, CURRENT_TIMESTAMP() * 1000); From 19170ecbcf14d9275aa11b18333aaad87e1c10c4 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Wed, 23 Mar 2022 15:37:28 +0100 Subject: [PATCH 8/8] SLA reporting: history retention for SLA tables --- cmd/icingadb/main.go | 11 +-- config.yml.example | 10 ++- doc/03-Configuration.md | 15 ++-- pkg/config/config.go | 10 +-- pkg/config/history_retention.go | 15 ++-- pkg/icingadb/history/retention.go | 127 +++++++++++++++++++++------- tests/cleanup_and_retention_test.go | 49 +++++++---- 7 files changed, 162 insertions(+), 75 deletions(-) diff --git 
a/cmd/icingadb/main.go b/cmd/icingadb/main.go index 164a43db..8dd91653 100644 --- a/cmd/icingadb/main.go +++ b/cmd/icingadb/main.go @@ -133,11 +133,12 @@ func run() int { ods := overdue.NewSync(db, rc, logs.GetChildLogger("overdue-sync")) ret := history.NewRetention( db, - cmd.Config.HistoryRetention.Days, - cmd.Config.HistoryRetention.Interval, - cmd.Config.HistoryRetention.Count, - cmd.Config.HistoryRetention.Options, - logs.GetChildLogger("history-retention"), + cmd.Config.Retention.HistoryDays, + cmd.Config.Retention.SlaDays, + cmd.Config.Retention.Interval, + cmd.Config.Retention.Count, + cmd.Config.Retention.Options, + logs.GetChildLogger("retention"), ) sig := make(chan os.Signal, 1) diff --git a/config.yml.example b/config.yml.example index 963aee0b..5c74bb15 100644 --- a/config.yml.example +++ b/config.yml.example @@ -33,15 +33,17 @@ logging: # dump-signals: # heartbeat: # high-availability: -# history-retention: # history-sync: # overdue-sync: # redis: +# retention: # runtime-updates: -history-retention: - # Number of days to retain historical data. By default, historical data is retained forever. -# days: +retention: + # Number of days to retain full historical data. By default, historical data is retained forever. +# history-days: + # Number of days to retain historical data for SLA reporting. By default, it is retained forever. +# sla-days: # Map of history category to number of days to retain its data in order to # enable retention only for specific categories or to override the number that has been configured in days. options: diff --git a/doc/03-Configuration.md b/doc/03-Configuration.md index 19def87f..7c58ddb6 100644 --- a/doc/03-Configuration.md +++ b/doc/03-Configuration.md @@ -57,10 +57,10 @@ database | Database connection status and queries. dump-signals | Dump signals received from Icinga. heartbeat | Icinga heartbeats received through Redis. high-availability | Manages responsibility of Icinga DB instances. 
-history-retention | Deletes historical data that exceed their configured retention period. history-sync | Synchronization of history entries from Redis to MySQL. overdue-sync | Calculation and synchronization of the overdue status of checkables. redis | Redis connection status and queries. +retention | Deletes historical data that exceed their configured retention period. runtime-updates | Runtime updates of config objects after the initial config synchronization. ### Duration String @@ -68,12 +68,15 @@ runtime-updates | Runtime updates of config objects after the initial c A duration string is a sequence of decimal numbers and a unit suffix, such as `"20s"`. Valid units are `"ms"`, `"s"`, `"m"` and `"h"`. -## History Retention +## Retention By default, no historical data is deleted, which means that the longer the data is retained, the more disk space is required to store it. History retention is an optional feature that allows you to limit the number of days that historical data is available for each history category. +There are separate options for the full history tables used to display history information in the web interface and +for the SLA tables, which store only the minimal information required for SLA reporting. This allows SLA information to be kept for longer with a smaller storage footprint. -| Option | Description | -|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| days | **Optional.** Number of days to retain historical data for all history categories. Use `options` in order to enable retention only for specific categories or to override the retention days configured here. | -| options | **Optional.** Map of history category to number of days to retain its data. Available categories are `acknowledgement`, `comment`, `downtime`, `flapping`, `notification` and `state`.
| +| Option | Description | +|--------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| history-days | **Optional.** Number of days to retain historical data for all history categories. Use `options` in order to enable retention only for specific categories or to override the retention days configured here. | +| sla-days | **Optional.** Number of days to retain historical data for SLA reporting. | +| options | **Optional.** Map of history category to number of days to retain its data. Available categories are `acknowledgement`, `comment`, `downtime`, `flapping`, `notification` and `state`. SLA data is not a category here; its retention is configured with `sla-days` only. | diff --git a/pkg/config/config.go b/pkg/config/config.go index bb341abe..89e040b4 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -14,10 +14,10 @@ import ( // Config defines Icinga DB config. type Config struct { - Database Database `yaml:"database"` - Redis Redis `yaml:"redis"` - Logging Logging `yaml:"logging"` - HistoryRetention HistoryRetention `yaml:"history-retention"` + Database Database `yaml:"database"` + Redis Redis `yaml:"redis"` + Logging Logging `yaml:"logging"` + Retention Retention `yaml:"retention"` } // Validate checks constraints in the supplied configuration and returns an error if they are violated. @@ -31,7 +31,7 @@ func (c *Config) Validate() error { if err := c.Logging.Validate(); err != nil { return err } - if err := c.HistoryRetention.Validate(); err != nil { + if err := c.Retention.Validate(); err != nil { return err } diff --git a/pkg/config/history_retention.go b/pkg/config/history_retention.go index 6709cd61..d4373b70 100644 --- a/pkg/config/history_retention.go +++ b/pkg/config/history_retention.go @@ -6,17 +6,18 @@ import ( "time" ) -// HistoryRetention defines configuration for history retention.
-type HistoryRetention struct { - Days uint64 `yaml:"days"` - Interval time.Duration `yaml:"interval" default:"1h"` - Count uint64 `yaml:"count" default:"5000"` - Options history.RetentionOptions `yaml:"options"` +// Retention defines configuration for history retention. +type Retention struct { + HistoryDays uint64 `yaml:"history-days"` + SlaDays uint64 `yaml:"sla-days"` + Interval time.Duration `yaml:"interval" default:"1h"` + Count uint64 `yaml:"count" default:"5000"` + Options history.RetentionOptions `yaml:"options"` } // Validate checks constraints in the supplied retention configuration and // returns an error if they are violated. -func (r *HistoryRetention) Validate() error { +func (r *Retention) Validate() error { if r.Interval <= 0 { return errors.New("retention interval must be positive") } diff --git a/pkg/icingadb/history/retention.go b/pkg/icingadb/history/retention.go index 920b79c2..6ebe97c8 100644 --- a/pkg/icingadb/history/retention.go +++ b/pkg/icingadb/history/retention.go @@ -11,39 +11,85 @@ import ( "time" ) +type RetentionType int + +const ( + RetentionHistory RetentionType = iota + RetentionSla +) + +type retentionStatement struct { + icingadb.CleanupStmt + RetentionType + Category string +} + // RetentionStatements maps history categories with corresponding cleanup statements. 
-var RetentionStatements = map[string]icingadb.CleanupStmt{ - "acknowledgement": { +var RetentionStatements = []retentionStatement{{ + RetentionType: RetentionHistory, + Category: "acknowledgement", + CleanupStmt: icingadb.CleanupStmt{ Table: "acknowledgement_history", PK: "id", Column: "clear_time", }, - "comment": { +}, { + RetentionType: RetentionHistory, + Category: "comment", + CleanupStmt: icingadb.CleanupStmt{ Table: "comment_history", PK: "comment_id", Column: "remove_time", }, - "downtime": { +}, { + RetentionType: RetentionHistory, + Category: "downtime", + CleanupStmt: icingadb.CleanupStmt{ Table: "downtime_history", PK: "downtime_id", Column: "end_time", }, - "flapping": { +}, { + RetentionType: RetentionHistory, + Category: "flapping", + CleanupStmt: icingadb.CleanupStmt{ Table: "flapping_history", PK: "id", Column: "end_time", }, - "notification": { +}, { + RetentionType: RetentionHistory, + Category: "notification", + CleanupStmt: icingadb.CleanupStmt{ Table: "notification_history", PK: "id", Column: "send_time", }, - "state": { +}, { + RetentionType: RetentionHistory, + Category: "state", + CleanupStmt: icingadb.CleanupStmt{ Table: "state_history", PK: "id", Column: "event_time", }, -} +}, { + RetentionType: RetentionSla, + Category: "sla_downtime", + CleanupStmt: icingadb.CleanupStmt{ + Table: "sla_history_downtime", + PK: "downtime_id", + Column: "downtime_end", + }, +}, { + RetentionType: RetentionSla, + Category: "sla_state", + CleanupStmt: icingadb.CleanupStmt{ + Table: "sla_history_state", + PK: "id", + Column: "event_time", + }, +}} // RetentionOptions defines the non-default mapping of history categories with their retention period in days. type RetentionOptions map[string]uint64 @@ -51,8 +97,15 @@ type RetentionOptions map[string]uint64 // Validate checks constraints in the supplied retention options and // returns an error if they are violated. 
func (o RetentionOptions) Validate() error { + allowedCategories := make(map[string]struct{}) + for _, stmt := range RetentionStatements { + if stmt.RetentionType == RetentionHistory { + allowedCategories[stmt.Category] = struct{}{} + } + } + for category := range o { - if _, ok := RetentionStatements[category]; !ok { + if _, ok := allowedCategories[category]; !ok { return errors.Errorf("invalid key %s for history retention", category) } } @@ -62,23 +115,28 @@ func (o RetentionOptions) Validate() error { // Retention deletes rows from history tables that exceed their configured retention period. type Retention struct { - db *icingadb.DB - logger *logging.Logger - days uint64 - interval time.Duration - count uint64 - options RetentionOptions + db *icingadb.DB + logger *logging.Logger + historyDays uint64 + slaDays uint64 + interval time.Duration + count uint64 + options RetentionOptions } // NewRetention returns a new Retention. -func NewRetention(db *icingadb.DB, days uint64, interval time.Duration, count uint64, options RetentionOptions, logger *logging.Logger) *Retention { +func NewRetention( + db *icingadb.DB, historyDays uint64, slaDays uint64, interval time.Duration, + count uint64, options RetentionOptions, logger *logging.Logger, +) *Retention { return &Retention{ - db: db, - logger: logger, - days: days, - interval: interval, - count: count, - options: options, + db: db, + logger: logger, + historyDays: historyDays, + slaDays: slaDays, + interval: interval, + count: count, + options: options, } } @@ -94,32 +152,39 @@ func (r *Retention) StartWithCallback(ctx context.Context, c func(table string, errs := make(chan error, 1) - for category, stmt := range RetentionStatements { - days, ok := r.options[category] - if !ok { - days = r.days + for _, stmt := range RetentionStatements { + var days uint64 + switch stmt.RetentionType { + case RetentionHistory: + if d, ok := r.options[stmt.Category]; ok { + days = d + } else { + days = r.historyDays + } + case 
RetentionSla: + days = r.slaDays } if days < 1 { - r.logger.Debugf("Skipping history retention for category %s", category) + r.logger.Debugf("Skipping history retention for category %s", stmt.Category) continue } r.logger.Debugw( - fmt.Sprintf("Starting history retention for category %s", category), + fmt.Sprintf("Starting history retention for category %s", stmt.Category), zap.Uint64("count", r.count), zap.Duration("interval", r.interval), zap.Uint64("retention-days", days), ) - category := category stmt := stmt periodic.Start(ctx, r.interval, func(tick periodic.Tick) { olderThan := tick.Time.AddDate(0, 0, -int(days)) - r.logger.Debugf("Cleaning up historical data for category %s older than %s", category, olderThan) + r.logger.Debugf("Cleaning up historical data for category %s from table %s older than %s", + stmt.Category, stmt.Table, olderThan) - rs, err := r.db.CleanupOlderThan(ctx, stmt, r.count, olderThan) + rs, err := r.db.CleanupOlderThan(ctx, stmt.CleanupStmt, r.count, olderThan) if err != nil { select { case errs <- err: diff --git a/tests/cleanup_and_retention_test.go b/tests/cleanup_and_retention_test.go index a3049398..b6ba49b1 100644 --- a/tests/cleanup_and_retention_test.go +++ b/tests/cleanup_and_retention_test.go @@ -9,6 +9,7 @@ import ( "github.com/jmoiron/sqlx" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "strings" "testing" "time" ) @@ -22,7 +23,8 @@ func TestCleanupAndRetention(t *testing.T) { t.Cleanup(func() { _ = db.Close() }) reten := retention{ - Days: 7, + HistoryDays: 7, + SlaDays: 30, Options: map[string]int{ "acknowledgement": 0, // No cleanup. 
"comment": 1, @@ -31,6 +33,16 @@ func TestCleanupAndRetention(t *testing.T) { }, } + daysForCategory := func(category string) int { + if strings.HasPrefix(category, "sla_") { + return reten.SlaDays + } else if d, ok := reten.Options[category]; ok { + return d + } else { + return reten.HistoryDays + } + } + rowsToDelete := 10000 rowsToSpare := 1000 @@ -38,11 +50,7 @@ func TestCleanupAndRetention(t *testing.T) { err := dropNotNullColumns(db, stmt) assert.NoError(t, err) - retentionDays, ok := reten.Options[category] - if !ok { - retentionDays = reten.Days - } - + retentionDays := daysForCategory(category) start := time.Now().AddDate(0, 0, -retentionDays).Add(-1 * time.Millisecond * time.Duration(rowsToDelete)) startMilli := start.UnixMilli() @@ -75,18 +83,14 @@ func TestCleanupAndRetention(t *testing.T) { i.Reload() waitForDumpDoneSignal(t, r, 20*time.Second, 100*time.Millisecond) config, err := yaml.Marshal(struct { - Retention retention `yaml:"history-retention"` + Retention retention `yaml:"retention"` }{reten}) assert.NoError(t, err) it.IcingaDbInstanceT(t, r, rdb, services.WithIcingaDbConfig(string(config))) eventually.Assert(t, func(t require.TestingT) { for category, stmt := range retentionStatements { - retentionDays, ok := reten.Options[category] - if !ok { - retentionDays = reten.Days - } - + retentionDays := daysForCategory(category) threshold := time.Now().AddDate(0, 0, -retentionDays) thresholdMilli := threshold.UnixMilli() @@ -106,10 +110,10 @@ func TestCleanupAndRetention(t *testing.T) { if retentionDays == 0 { // No cleanup. 
- assert.Equal(t, rowsToDelete+rowsToSpare, rowsLeft+rowsSpared, "all rows should still be there") + assert.Equal(t, rowsToDelete+rowsToSpare, rowsLeft+rowsSpared, "all rows should still be there for %s", category) } else { - assert.Equal(t, 0, rowsLeft, "rows left in retention period") - assert.Equal(t, rowsToSpare, rowsSpared, "rows spared") + assert.Equal(t, 0, rowsLeft, "rows left in retention period for %s", category) + assert.Equal(t, rowsToSpare, rowsSpared, "rows spared for %s", category) } } }, time.Minute, time.Second) @@ -122,8 +126,9 @@ type cleanupStmt struct { } type retention struct { - Days int `yaml:"days"` - Options map[string]int `yaml:"options"` + HistoryDays int `yaml:"history-days"` + SlaDays int `yaml:"sla-days"` + Options map[string]int `yaml:"options"` } var retentionStatements = map[string]cleanupStmt{ @@ -157,6 +162,16 @@ var retentionStatements = map[string]cleanupStmt{ PK: "id", Column: "event_time", }, + "sla_downtime": { + Table: "sla_history_downtime", + PK: "downtime_id", + Column: "downtime_end", + }, + "sla_state": { + Table: "sla_history_state", + PK: "id", + Column: "event_time", + }, } // dropNotNullColumns drops all columns with a NOT NULL constraint that are not