mirror of
https://github.com/Icinga/icingadb.git
synced 2026-05-28 04:35:54 -04:00
Merge pull request #247 from Icinga/feature/sla-reporting
SLA reporting
This commit is contained in:
commit
da230d4f92
24 changed files with 1765 additions and 108 deletions
58
.github/workflows/sql.yml
vendored
Normal file
58
.github/workflows/sql.yml
vendored
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
name: SQL
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
pull_request: {}
|
||||
|
||||
jobs:
|
||||
sql:
|
||||
name: ${{ matrix.database.name }}
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
database:
|
||||
- {type: MYSQL, name: MySQL 5.5, image: "icinga/icingadb-mysql:5.5"}
|
||||
- {type: MYSQL, name: MySQL 5.6, image: "icinga/icingadb-mysql:5.6"}
|
||||
- {type: MYSQL, name: MySQL 5.7, image: "mysql:5.7"}
|
||||
- {type: MYSQL, name: MySQL latest, image: "mysql:latest"}
|
||||
- {type: MYSQL, name: MariaDB 10.1, image: "mariadb:10.1"}
|
||||
- {type: MYSQL, name: MariaDB 10.2, image: "mariadb:10.2"}
|
||||
- {type: MYSQL, name: MariaDB 10.3, image: "mariadb:10.3"}
|
||||
- {type: MYSQL, name: MariaDB 10.4, image: "mariadb:10.4"}
|
||||
- {type: MYSQL, name: MariaDB 10.5, image: "mariadb:10.5"}
|
||||
- {type: MYSQL, name: MariaDB 10.6, image: "mariadb:10.6"}
|
||||
- {type: MYSQL, name: MariaDB 10.7, image: "mariadb:10.7"}
|
||||
- {type: MYSQL, name: MariaDB latest, image: "mariadb:latest"}
|
||||
- {type: PGSQL, name: PostgreSQL 9.6, image: "postgres:9.6"}
|
||||
- {type: PGSQL, name: PostgreSQL 10, image: "postgres:10"}
|
||||
- {type: PGSQL, name: PostgreSQL 11, image: "postgres:11"}
|
||||
- {type: PGSQL, name: PostgreSQL 12, image: "postgres:12"}
|
||||
- {type: PGSQL, name: PostgreSQL 13, image: "postgres:13"}
|
||||
- {type: PGSQL, name: PostgreSQL latest, image: "postgres:latest"}
|
||||
|
||||
steps:
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v1
|
||||
with:
|
||||
go-version: '^1.16'
|
||||
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Download dependencies
|
||||
run: go get -v -t -d ./...
|
||||
working-directory: tests/
|
||||
|
||||
- name: Run tests
|
||||
env:
|
||||
ICINGADB_TESTS_DATABASE_TYPE: ${{ matrix.database.type }}
|
||||
ICINGA_TESTING_${{ matrix.database.type }}_IMAGE: ${{ matrix.database.image }}
|
||||
ICINGA_TESTING_ICINGADB_SCHEMA_MYSQL: ${{ github.workspace }}/schema/mysql/schema.sql
|
||||
ICINGA_TESTING_ICINGADB_SCHEMA_PGSQL: ${{ github.workspace }}/schema/pgsql/schema.sql
|
||||
timeout-minutes: 10
|
||||
run: go test -v -timeout 5m ./sql
|
||||
working-directory: tests/
|
||||
|
|
@ -133,11 +133,12 @@ func run() int {
|
|||
ods := overdue.NewSync(db, rc, logs.GetChildLogger("overdue-sync"))
|
||||
ret := history.NewRetention(
|
||||
db,
|
||||
cmd.Config.HistoryRetention.Days,
|
||||
cmd.Config.HistoryRetention.Interval,
|
||||
cmd.Config.HistoryRetention.Count,
|
||||
cmd.Config.HistoryRetention.Options,
|
||||
logs.GetChildLogger("history-retention"),
|
||||
cmd.Config.Retention.HistoryDays,
|
||||
cmd.Config.Retention.SlaDays,
|
||||
cmd.Config.Retention.Interval,
|
||||
cmd.Config.Retention.Count,
|
||||
cmd.Config.Retention.Options,
|
||||
logs.GetChildLogger("retention"),
|
||||
)
|
||||
|
||||
sig := make(chan os.Signal, 1)
|
||||
|
|
|
|||
|
|
@ -33,15 +33,17 @@ logging:
|
|||
# dump-signals:
|
||||
# heartbeat:
|
||||
# high-availability:
|
||||
# history-retention:
|
||||
# history-sync:
|
||||
# overdue-sync:
|
||||
# redis:
|
||||
# retention:
|
||||
# runtime-updates:
|
||||
|
||||
history-retention:
|
||||
# Number of days to retain historical data. By default, historical data is retained forever.
|
||||
# days:
|
||||
retention:
|
||||
# Number of days to retain full historical data. By default, historical data is retained forever.
|
||||
# history-days:
|
||||
# Number of days to retain historical data for SLA reporting. By default, it is retained forever.
|
||||
# sla-days:
|
||||
# Map of history category to number of days to retain its data in order to
|
||||
# enable retention only for specific categories or to override the number that has been configured in history-days.
|
||||
options:
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
## Requirements <a id="installation-requirements"></a>
|
||||
|
||||
* Local Redis instance (Will be installed during this documentation)
|
||||
* MySQL/MariaDB/PostgreSQL database `icingadb`, user and schema imports (Will be set up during this documentation)
|
||||
* MySQL (≥5.5), MariaDB (≥10.1), or PostgreSQL (≥9.6): database, user and schema imports (Will be set up during this documentation)
|
||||
|
||||
## Setting up Icinga DB <a id="setting-up-icingadb"></a>
|
||||
|
||||
|
|
@ -176,7 +176,6 @@ psql icingadb <<<'CREATE EXTENSION IF NOT EXISTS citext;'
|
|||
```
|
||||
|
||||
The CREATE EXTENSION command requires the postgresql-contrib package.
|
||||
(On RHEL/CentOS 7: rh-postgresql95-postgresql-contrib)
|
||||
|
||||
Edit `pg_hba.conf`, insert the following before everything else:
|
||||
|
||||
|
|
@ -187,7 +186,6 @@ host all icingadb ::/0 md5
|
|||
```
|
||||
|
||||
To apply those changes, run `systemctl reload postgresql`.
|
||||
(On RHEL/CentOS 7 the service is called "rh-postgresql95-postgresql".)
|
||||
|
||||
After creating the database you can import the Icinga DB schema using the
|
||||
following command. Enter the password when asked.
|
||||
|
|
@ -196,9 +194,6 @@ following command. Enter the password when asked.
|
|||
psql -U icingadb icingadb < /usr/share/icingadb/schema/pgsql/schema.sql
|
||||
```
|
||||
|
||||
On RHEL/CentOS 7 prefix "createuser", "createdb" and "psql" with
|
||||
"/opt/rh/rh-postgresql95/root/usr/bin/".
|
||||
|
||||
### Running Icinga DB <a id="running-icingadb"></a>
|
||||
|
||||
Foreground:
|
||||
|
|
|
|||
|
|
@ -57,10 +57,10 @@ database | Database connection status and queries.
|
|||
dump-signals | Dump signals received from Icinga.
|
||||
heartbeat | Icinga heartbeats received through Redis.
|
||||
high-availability | Manages responsibility of Icinga DB instances.
|
||||
history-retention | Deletes historical data that exceed their configured retention period.
|
||||
history-sync | Synchronization of history entries from Redis to the database.
|
||||
overdue-sync | Calculation and synchronization of the overdue status of checkables.
|
||||
redis | Redis connection status and queries.
|
||||
retention | Deletes historical data that exceed their configured retention period.
|
||||
runtime-updates | Runtime updates of config objects after the initial config synchronization.
|
||||
|
||||
### Duration String <a id="duration-string"></a>
|
||||
|
|
@ -68,12 +68,15 @@ runtime-updates | Runtime updates of config objects after the initial c
|
|||
A duration string is a sequence of decimal numbers and a unit suffix, such as `"20s"`.
|
||||
Valid units are `"ms"`, `"s"`, `"m"` and `"h"`.
|
||||
|
||||
## History Retention <a id="configuration-history-retention"></a>
|
||||
## Retention <a id="configuration-retention"></a>
|
||||
|
||||
By default, no historical data is deleted, which means that the longer the data is retained, the more disk space is required to store it.
|
||||
History retention is an optional feature that allows you to limit the number of days that historical data is available for each history category.
|
||||
There are separate options for the full history tables used to display history information in the web interface and
|
||||
SLA tables which store the minimal information required for SLA reporting, allowing you to keep this information for longer with a smaller storage footprint.
|
||||
|
||||
| Option | Description |
|
||||
|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| days | **Optional.** Number of days to retain historical data for all history categories. Use `options` in order to enable retention only for specific categories or to override the retention days configured here. |
|
||||
| options | **Optional.** Map of history category to number of days to retain its data. Available categories are `acknowledgement`, `comment`, `downtime`, `flapping`, `notification` and `state`. |
|
||||
| Option | Description |
|
||||
|--------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| history-days | **Optional.** Number of days to retain historical data for all history categories. Use `options` in order to enable retention only for specific categories or to override the retention days configured here. |
|
||||
| sla-days | **Optional.** Number of days to retain historical data for SLA reporting. |
|
||||
| options | **Optional.** Map of history category to number of days to retain its data. Available categories are `acknowledgement`, `comment`, `downtime`, `flapping`, `notification` and `state`. |
|
||||
|
|
|
|||
|
|
@ -14,10 +14,10 @@ import (
|
|||
|
||||
// Config defines Icinga DB config.
|
||||
type Config struct {
|
||||
Database Database `yaml:"database"`
|
||||
Redis Redis `yaml:"redis"`
|
||||
Logging Logging `yaml:"logging"`
|
||||
HistoryRetention HistoryRetention `yaml:"history-retention"`
|
||||
Database Database `yaml:"database"`
|
||||
Redis Redis `yaml:"redis"`
|
||||
Logging Logging `yaml:"logging"`
|
||||
Retention Retention `yaml:"retention"`
|
||||
}
|
||||
|
||||
// Validate checks constraints in the supplied configuration and returns an error if they are violated.
|
||||
|
|
@ -31,7 +31,7 @@ func (c *Config) Validate() error {
|
|||
if err := c.Logging.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := c.HistoryRetention.Validate(); err != nil {
|
||||
if err := c.Retention.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -6,17 +6,18 @@ import (
|
|||
"time"
|
||||
)
|
||||
|
||||
// HistoryRetention defines configuration for history retention.
|
||||
type HistoryRetention struct {
|
||||
Days uint64 `yaml:"days"`
|
||||
Interval time.Duration `yaml:"interval" default:"1h"`
|
||||
Count uint64 `yaml:"count" default:"5000"`
|
||||
Options history.RetentionOptions `yaml:"options"`
|
||||
// Retention defines configuration for history and SLA retention.
|
||||
type Retention struct {
|
||||
HistoryDays uint64 `yaml:"history-days"`
|
||||
SlaDays uint64 `yaml:"sla-days"`
|
||||
Interval time.Duration `yaml:"interval" default:"1h"`
|
||||
Count uint64 `yaml:"count" default:"5000"`
|
||||
Options history.RetentionOptions `yaml:"options"`
|
||||
}
|
||||
|
||||
// Validate checks constraints in the supplied retention configuration and
|
||||
// returns an error if they are violated.
|
||||
func (r *HistoryRetention) Validate() error {
|
||||
func (r *Retention) Validate() error {
|
||||
if r.Interval <= 0 {
|
||||
return errors.New("retention interval must be positive")
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,39 +11,85 @@ import (
|
|||
"time"
|
||||
)
|
||||
|
||||
// RetentionType distinguishes the kinds of retention a cleanup statement belongs to.
// It decides where the retention period of a statement is taken from.
type RetentionType int
|
||||
|
||||
const (
|
||||
// RetentionHistory marks statements for the full history tables. Their retention period
// is looked up per category in the retention options, falling back to the configured history days.
RetentionHistory RetentionType = iota
|
||||
// RetentionSla marks statements for the SLA tables. Their retention period is the configured SLA days.
RetentionSla
|
||||
)
|
||||
|
||||
// retentionStatement couples a cleanup statement with the retention type that selects
// its retention period and the category name used for the options lookup and in log messages.
type retentionStatement struct {
|
||||
icingadb.CleanupStmt
|
||||
RetentionType
|
||||
Category string
|
||||
}
|
||||
|
||||
// RetentionStatements lists the cleanup statements for all history and SLA categories.
|
||||
var RetentionStatements = map[string]icingadb.CleanupStmt{
|
||||
"acknowledgement": {
|
||||
var RetentionStatements = []retentionStatement{{
|
||||
RetentionType: RetentionHistory,
|
||||
Category: "acknowledgement",
|
||||
CleanupStmt: icingadb.CleanupStmt{
|
||||
Table: "acknowledgement_history",
|
||||
PK: "id",
|
||||
Column: "clear_time",
|
||||
},
|
||||
"comment": {
|
||||
}, {
|
||||
RetentionType: RetentionHistory,
|
||||
Category: "comment",
|
||||
CleanupStmt: icingadb.CleanupStmt{
|
||||
Table: "comment_history",
|
||||
PK: "comment_id",
|
||||
Column: "remove_time",
|
||||
},
|
||||
"downtime": {
|
||||
}, {
|
||||
RetentionType: RetentionHistory,
|
||||
Category: "downtime",
|
||||
CleanupStmt: icingadb.CleanupStmt{
|
||||
Table: "downtime_history",
|
||||
PK: "downtime_id",
|
||||
Column: "end_time",
|
||||
},
|
||||
"flapping": {
|
||||
}, {
|
||||
RetentionType: RetentionHistory,
|
||||
Category: "flapping",
|
||||
CleanupStmt: icingadb.CleanupStmt{
|
||||
Table: "flapping_history",
|
||||
PK: "id",
|
||||
Column: "end_time",
|
||||
},
|
||||
"notification": {
|
||||
}, {
|
||||
RetentionType: RetentionHistory,
|
||||
Category: "notification",
|
||||
CleanupStmt: icingadb.CleanupStmt{
|
||||
Table: "notification_history",
|
||||
PK: "id",
|
||||
Column: "send_time",
|
||||
},
|
||||
"state": {
|
||||
}, {
|
||||
RetentionType: RetentionHistory,
|
||||
Category: "state",
|
||||
CleanupStmt: icingadb.CleanupStmt{
|
||||
Table: "state_history",
|
||||
PK: "id",
|
||||
Column: "event_time",
|
||||
},
|
||||
}
|
||||
}, {
|
||||
RetentionType: RetentionSla,
|
||||
Category: "sla_downtime",
|
||||
CleanupStmt: icingadb.CleanupStmt{
|
||||
Table: "sla_history_downtime",
|
||||
PK: "downtime_id",
|
||||
Column: "downtime_end",
|
||||
},
|
||||
}, {
|
||||
RetentionType: RetentionSla,
|
||||
Category: "sla_state",
|
||||
CleanupStmt: icingadb.CleanupStmt{
|
||||
Table: "sla_history_state",
|
||||
PK: "id",
|
||||
Column: "event_time",
|
||||
},
|
||||
}}
|
||||
|
||||
// RetentionOptions defines the non-default mapping of history categories with their retention period in days.
|
||||
type RetentionOptions map[string]uint64
|
||||
|
|
@ -51,8 +97,15 @@ type RetentionOptions map[string]uint64
|
|||
// Validate checks constraints in the supplied retention options and
|
||||
// returns an error if they are violated.
|
||||
func (o RetentionOptions) Validate() error {
|
||||
allowedCategories := make(map[string]struct{})
|
||||
for _, stmt := range RetentionStatements {
|
||||
if stmt.RetentionType == RetentionHistory {
|
||||
allowedCategories[stmt.Category] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
for category := range o {
|
||||
if _, ok := RetentionStatements[category]; !ok {
|
||||
if _, ok := allowedCategories[category]; !ok {
|
||||
return errors.Errorf("invalid key %s for history retention", category)
|
||||
}
|
||||
}
|
||||
|
|
@ -62,23 +115,28 @@ func (o RetentionOptions) Validate() error {
|
|||
|
||||
// Retention deletes rows from history tables that exceed their configured retention period.
|
||||
type Retention struct {
|
||||
db *icingadb.DB
|
||||
logger *logging.Logger
|
||||
days uint64
|
||||
interval time.Duration
|
||||
count uint64
|
||||
options RetentionOptions
|
||||
db *icingadb.DB
|
||||
logger *logging.Logger
|
||||
historyDays uint64
|
||||
slaDays uint64
|
||||
interval time.Duration
|
||||
count uint64
|
||||
options RetentionOptions
|
||||
}
|
||||
|
||||
// NewRetention returns a new Retention.
|
||||
func NewRetention(db *icingadb.DB, days uint64, interval time.Duration, count uint64, options RetentionOptions, logger *logging.Logger) *Retention {
|
||||
func NewRetention(
|
||||
db *icingadb.DB, historyDays uint64, slaDays uint64, interval time.Duration,
|
||||
count uint64, options RetentionOptions, logger *logging.Logger,
|
||||
) *Retention {
|
||||
return &Retention{
|
||||
db: db,
|
||||
logger: logger,
|
||||
days: days,
|
||||
interval: interval,
|
||||
count: count,
|
||||
options: options,
|
||||
db: db,
|
||||
logger: logger,
|
||||
historyDays: historyDays,
|
||||
slaDays: slaDays,
|
||||
interval: interval,
|
||||
count: count,
|
||||
options: options,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -94,32 +152,39 @@ func (r *Retention) StartWithCallback(ctx context.Context, c func(table string,
|
|||
|
||||
errs := make(chan error, 1)
|
||||
|
||||
for category, stmt := range RetentionStatements {
|
||||
days, ok := r.options[category]
|
||||
if !ok {
|
||||
days = r.days
|
||||
for _, stmt := range RetentionStatements {
|
||||
var days uint64
|
||||
switch stmt.RetentionType {
|
||||
case RetentionHistory:
|
||||
if d, ok := r.options[stmt.Category]; ok {
|
||||
days = d
|
||||
} else {
|
||||
days = r.historyDays
|
||||
}
|
||||
case RetentionSla:
|
||||
days = r.slaDays
|
||||
}
|
||||
|
||||
if days < 1 {
|
||||
r.logger.Debugf("Skipping history retention for category %s", category)
|
||||
r.logger.Debugf("Skipping history retention for category %s", stmt.Category)
|
||||
continue
|
||||
}
|
||||
|
||||
r.logger.Debugw(
|
||||
fmt.Sprintf("Starting history retention for category %s", category),
|
||||
fmt.Sprintf("Starting history retention for category %s", stmt.Category),
|
||||
zap.Uint64("count", r.count),
|
||||
zap.Duration("interval", r.interval),
|
||||
zap.Uint64("retention-days", days),
|
||||
)
|
||||
|
||||
category := category
|
||||
stmt := stmt
|
||||
periodic.Start(ctx, r.interval, func(tick periodic.Tick) {
|
||||
olderThan := tick.Time.AddDate(0, 0, -int(days))
|
||||
|
||||
r.logger.Debugf("Cleaning up historical data for category %s older than %s", category, olderThan)
|
||||
r.logger.Debugf("Cleaning up historical data for category %s from table %s older than %s",
|
||||
stmt.Category, stmt.Table, olderThan)
|
||||
|
||||
rs, err := r.db.CleanupOlderThan(ctx, stmt, r.count, olderThan)
|
||||
rs, err := r.db.CleanupOlderThan(ctx, stmt.CleanupStmt, r.count, olderThan)
|
||||
if err != nil {
|
||||
select {
|
||||
case errs <- err:
|
||||
|
|
|
|||
26
pkg/icingadb/history/sla.go
Normal file
26
pkg/icingadb/history/sla.go
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
package history
|
||||
|
||||
import (
|
||||
"github.com/go-redis/redis/v8"
|
||||
"github.com/icinga/icingadb/pkg/icingadb/v1/history"
|
||||
"github.com/icinga/icingadb/pkg/structify"
|
||||
"github.com/icinga/icingadb/pkg/types"
|
||||
"reflect"
|
||||
)
|
||||
|
||||
// slaStateStructify is the map structifier for history.SlaHistoryState, built once at package level
// with the "json" tag. stateHistoryToSlaEntity feeds it the values of a stream entry and
// type-asserts the result to *history.SlaHistoryState.
var slaStateStructify = structify.MakeMapStructifier(reflect.TypeOf((*history.SlaHistoryState)(nil)).Elem(), "json")
|
||||
|
||||
func stateHistoryToSlaEntity(entry redis.XMessage) ([]history.UpserterEntity, error) {
|
||||
slaStateInterface, err := slaStateStructify(entry.Values)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
slaState := slaStateInterface.(*history.SlaHistoryState)
|
||||
|
||||
if slaState.StateType != types.StateHard {
|
||||
// only hard state changes are relevant for SLA history, discard all others
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return []history.UpserterEntity{slaState}, nil
|
||||
}
|
||||
|
|
@ -359,12 +359,14 @@ var syncPipelines = map[string][]stageFunc{
|
|||
writeOneEntityStage((*v1.HistoryNotification)(nil)), // history (depends on notification_history)
|
||||
},
|
||||
"state": {
|
||||
writeOneEntityStage((*v1.StateHistory)(nil)), // state_history
|
||||
writeOneEntityStage((*v1.HistoryState)(nil)), // history (depends on state_history)
|
||||
writeOneEntityStage((*v1.StateHistory)(nil)), // state_history
|
||||
writeOneEntityStage((*v1.HistoryState)(nil)), // history (depends on state_history)
|
||||
writeMultiEntityStage(stateHistoryToSlaEntity), // sla_history_state
|
||||
},
|
||||
"downtime": {
|
||||
writeOneEntityStage((*v1.DowntimeHistory)(nil)), // downtime_history
|
||||
writeOneEntityStage((*v1.HistoryDowntime)(nil)), // history (depends on downtime_history)
|
||||
writeOneEntityStage((*v1.DowntimeHistory)(nil)), // downtime_history
|
||||
writeOneEntityStage((*v1.HistoryDowntime)(nil)), // history (depends on downtime_history)
|
||||
writeOneEntityStage((*v1.SlaHistoryDowntime)(nil)), // sla_history_downtime
|
||||
},
|
||||
"comment": {
|
||||
writeOneEntityStage((*v1.CommentHistory)(nil)), // comment_history
|
||||
|
|
|
|||
|
|
@ -80,6 +80,30 @@ func (*HistoryDowntime) TableName() string {
|
|||
return "history"
|
||||
}
|
||||
|
||||
// SlaHistoryDowntime is the downtime entity written by the sla_history_downtime stage
// of the downtime history pipeline. It keeps the start of a downtime and, through the
// embedded SlaHistoryDowntimeUpserter, its effective end.
type SlaHistoryDowntime struct {
|
||||
DowntimeHistoryEntity `json:",inline"`
|
||||
HistoryTableMeta `json:",inline"`
|
||||
SlaHistoryDowntimeUpserter `json:",inline"`
|
||||
// DowntimeStart is filled from the "start_time" key.
DowntimeStart types.UnixMilli `json:"start_time"`
|
||||
// HasBeenCancelled, CancelTime and EndTime are tagged db:"-". They are read by
// SlaDowntimeEndTime.Value in order to derive the value of DowntimeEnd.
HasBeenCancelled types.Bool `json:"has_been_cancelled" db:"-"`
|
||||
CancelTime types.UnixMilli `json:"cancel_time" db:"-"`
|
||||
EndTime types.UnixMilli `json:"end_time" db:"-"`
|
||||
}
|
||||
|
||||
// Init implements the contracts.Initer interface.
// It points DowntimeEnd back at s, as SlaDowntimeEndTime.Value dereferences
// that pointer to read the cancel and end times of s.
|
||||
func (s *SlaHistoryDowntime) Init() {
|
||||
s.DowntimeEnd.History = s
|
||||
}
|
||||
|
||||
// SlaHistoryDowntimeUpserter holds the DowntimeEnd value of a SlaHistoryDowntime
// and is what Upsert returns.
type SlaHistoryDowntimeUpserter struct {
|
||||
// DowntimeEnd is not filled from JSON (json:"-"); its value is computed by SlaDowntimeEndTime.Value.
DowntimeEnd SlaDowntimeEndTime `json:"-"`
|
||||
}
|
||||
|
||||
// Upsert implements the contracts.Upserter interface.
// It returns h itself, i.e. the struct holding only DowntimeEnd.
// NOTE(review): presumably this restricts the columns updated on conflict to the end time - confirm against the upsert statement builder.
|
||||
func (h *SlaHistoryDowntimeUpserter) Upsert() interface{} {
|
||||
return h
|
||||
}
|
||||
|
||||
type DowntimeEventTime struct {
|
||||
History *HistoryDowntime `db:"-"`
|
||||
}
|
||||
|
|
@ -109,6 +133,19 @@ func (et DowntimeEventTime) Value() (driver.Value, error) {
|
|||
}
|
||||
}
|
||||
|
||||
// SlaDowntimeEndTime is a driver.Valuer yielding the effective end time of the
// SlaHistoryDowntime it refers to.
type SlaDowntimeEndTime struct {
|
||||
// History is the downtime the end time is computed from. It is set by SlaHistoryDowntime.Init
// and must be non-nil before Value is called, as Value dereferences it.
History *SlaHistoryDowntime `db:"-"`
|
||||
}
|
||||
|
||||
// Value implements the driver.Valuer interface.
|
||||
func (et SlaDowntimeEndTime) Value() (driver.Value, error) {
|
||||
if et.History.HasBeenCancelled.Valid && et.History.HasBeenCancelled.Bool {
|
||||
return et.History.CancelTime.Value()
|
||||
} else {
|
||||
return et.History.EndTime.Value()
|
||||
}
|
||||
}
|
||||
|
||||
// Assert interface compliance.
|
||||
var (
|
||||
_ contracts.Entity = (*DowntimeHistoryEntity)(nil)
|
||||
|
|
@ -117,5 +154,8 @@ var (
|
|||
_ contracts.Initer = (*HistoryDowntime)(nil)
|
||||
_ contracts.TableNamer = (*HistoryDowntime)(nil)
|
||||
_ UpserterEntity = (*HistoryDowntime)(nil)
|
||||
_ contracts.Initer = (*SlaHistoryDowntime)(nil)
|
||||
_ UpserterEntity = (*SlaHistoryDowntime)(nil)
|
||||
_ driver.Valuer = DowntimeEventTime{}
|
||||
_ driver.Valuer = SlaDowntimeEndTime{}
|
||||
)
|
||||
|
|
|
|||
|
|
@ -33,9 +33,19 @@ func (*HistoryState) TableName() string {
|
|||
return "history"
|
||||
}
|
||||
|
||||
// SlaHistoryState is the state change entity produced by stateHistoryToSlaEntity
// for the sla_history_state stage of the state history pipeline.
type SlaHistoryState struct {
|
||||
HistoryTableEntity `json:",inline"`
|
||||
HistoryTableMeta `json:",inline"`
|
||||
EventTime types.UnixMilli `json:"event_time"`
|
||||
// StateType is tagged db:"-"; stateHistoryToSlaEntity reads it to discard
// everything but hard state changes.
StateType types.StateType `json:"state_type" db:"-"`
|
||||
HardState uint8 `json:"hard_state"`
|
||||
PreviousHardState uint8 `json:"previous_hard_state"`
|
||||
}
|
||||
|
||||
// Assert interface compliance.
|
||||
var (
|
||||
_ UpserterEntity = (*StateHistory)(nil)
|
||||
_ contracts.TableNamer = (*HistoryState)(nil)
|
||||
_ UpserterEntity = (*HistoryState)(nil)
|
||||
_ UpserterEntity = (*SlaHistoryState)(nil)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -46,10 +46,15 @@ func badStateType(t interface{}) error {
|
|||
return errors.Errorf("bad state type: %#v", t)
|
||||
}
|
||||
|
||||
// Named StateType values, used as the keys of stateTypes.
const (
|
||||
// StateSoft is the StateType with the SQL representation "soft".
StateSoft = StateType(0)
|
||||
// StateHard is the StateType with the SQL representation "hard".
StateHard = StateType(1)
|
||||
)
|
||||
|
||||
// stateTypes maps all valid StateType values to their SQL representation.
|
||||
var stateTypes = map[StateType]string{
|
||||
0: "soft",
|
||||
1: "hard",
|
||||
StateSoft: "soft",
|
||||
StateHard: "hard",
|
||||
}
|
||||
|
||||
// Assert interface compliance.
|
||||
|
|
|
|||
|
|
@ -3,6 +3,164 @@
|
|||
SET SESSION sql_mode = 'STRICT_ALL_TABLES,NO_ENGINE_SUBSTITUTION';
|
||||
SET SESSION innodb_strict_mode = 1;
|
||||
|
||||
DROP FUNCTION IF EXISTS get_sla_ok_percent;
|
||||
DELIMITER //
|
||||
CREATE FUNCTION get_sla_ok_percent(
|
||||
in_host_id binary(20),
|
||||
in_service_id binary(20),
|
||||
in_start_time bigint unsigned,
|
||||
in_end_time bigint unsigned
|
||||
)
|
||||
RETURNS decimal(7, 4)
|
||||
READS SQL DATA
|
||||
BEGIN
|
||||
DECLARE result decimal(7, 4);
|
||||
DECLARE row_event_time bigint unsigned;
|
||||
DECLARE row_event_type enum('state_change', 'downtime_start', 'downtime_end', 'end');
|
||||
DECLARE row_event_prio int;
|
||||
DECLARE row_hard_state tinyint unsigned;
|
||||
DECLARE row_previous_hard_state tinyint unsigned;
|
||||
DECLARE last_event_time bigint unsigned;
|
||||
DECLARE last_hard_state tinyint unsigned;
|
||||
DECLARE active_downtimes int unsigned;
|
||||
DECLARE problem_time bigint unsigned;
|
||||
DECLARE total_time bigint unsigned;
|
||||
DECLARE done int;
|
||||
DECLARE cur CURSOR FOR
|
||||
(
|
||||
-- all downtime_start events before the end of the SLA interval
|
||||
-- for downtimes that overlap the SLA interval in any way
|
||||
SELECT
|
||||
GREATEST(downtime_start, in_start_time) AS event_time,
|
||||
'downtime_start' AS event_type,
|
||||
1 AS event_prio,
|
||||
NULL AS hard_state,
|
||||
NULL AS previous_hard_state
|
||||
FROM sla_history_downtime d
|
||||
WHERE d.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id)
|
||||
AND d.downtime_start < in_end_time
|
||||
AND d.downtime_end >= in_start_time
|
||||
) UNION ALL (
|
||||
-- all downtime_end events before the end of the SLA interval
|
||||
-- for downtimes that overlap the SLA interval in any way
|
||||
SELECT
|
||||
downtime_end AS event_time,
|
||||
'downtime_end' AS event_type,
|
||||
2 AS event_prio,
|
||||
NULL AS hard_state,
|
||||
NULL AS previous_hard_state
|
||||
FROM sla_history_downtime d
|
||||
WHERE d.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id)
|
||||
AND d.downtime_start < in_end_time
|
||||
AND d.downtime_end >= in_start_time
|
||||
AND d.downtime_end < in_end_time
|
||||
) UNION ALL (
|
||||
-- all state events strictly in interval
|
||||
SELECT
|
||||
event_time,
|
||||
'state_change' AS event_type,
|
||||
0 AS event_prio,
|
||||
hard_state,
|
||||
previous_hard_state
|
||||
FROM sla_history_state s
|
||||
WHERE s.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
|
||||
AND s.event_time > in_start_time
|
||||
AND s.event_time < in_end_time
|
||||
) UNION ALL (
|
||||
-- end event to keep loop simple, values are not used
|
||||
SELECT
|
||||
in_end_time AS event_time,
|
||||
'end' AS event_type,
|
||||
3 AS event_prio,
|
||||
NULL AS hard_state,
|
||||
NULL AS previous_hard_state
|
||||
)
|
||||
ORDER BY event_time, event_prio;
|
||||
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = 1;
|
||||
|
||||
IF in_end_time <= in_start_time THEN
|
||||
SIGNAL SQLSTATE '45000' SET MESSAGE_TEXT = 'end time must be greater than start time';
|
||||
END IF;
|
||||
|
||||
-- Use the latest event at or before the beginning of the SLA interval as the initial state.
|
||||
SELECT hard_state INTO last_hard_state
|
||||
FROM sla_history_state s
|
||||
WHERE s.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
|
||||
AND s.event_time <= in_start_time
|
||||
ORDER BY s.event_time DESC
|
||||
LIMIT 1;
|
||||
|
||||
-- If this does not exist, use the previous state from the first event after the beginning of the SLA interval.
|
||||
IF last_hard_state IS NULL THEN
|
||||
SELECT previous_hard_state INTO last_hard_state
|
||||
FROM sla_history_state s
|
||||
WHERE s.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
|
||||
AND s.event_time > in_start_time
|
||||
ORDER BY s.event_time ASC
|
||||
LIMIT 1;
|
||||
END IF;
|
||||
|
||||
-- If this also does not exist, use the current host/service state.
|
||||
IF last_hard_state IS NULL THEN
|
||||
IF in_service_id IS NULL THEN
|
||||
SELECT hard_state INTO last_hard_state
|
||||
FROM host_state s
|
||||
WHERE s.host_id = in_host_id;
|
||||
ELSE
|
||||
SELECT hard_state INTO last_hard_state
|
||||
FROM service_state s
|
||||
WHERE s.host_id = in_host_id
|
||||
AND s.service_id = in_service_id;
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
IF last_hard_state IS NULL THEN
|
||||
SET last_hard_state = 0;
|
||||
END IF;
|
||||
|
||||
SET problem_time = 0;
|
||||
SET total_time = in_end_time - in_start_time;
|
||||
SET last_event_time = in_start_time;
|
||||
SET active_downtimes = 0;
|
||||
|
||||
SET done = 0;
|
||||
OPEN cur;
|
||||
read_loop: LOOP
|
||||
FETCH cur INTO row_event_time, row_event_type, row_event_prio, row_hard_state, row_previous_hard_state;
|
||||
IF done THEN
|
||||
LEAVE read_loop;
|
||||
END IF;
|
||||
|
||||
IF row_previous_hard_state = 99 THEN
|
||||
SET total_time = total_time - (row_event_time - last_event_time);
|
||||
ELSEIF ((in_service_id IS NULL AND last_hard_state > 0) OR (in_service_id IS NOT NULL AND last_hard_state > 1))
|
||||
AND last_hard_state != 99
|
||||
AND active_downtimes = 0
|
||||
THEN
|
||||
SET problem_time = problem_time + row_event_time - last_event_time;
|
||||
END IF;
|
||||
|
||||
SET last_event_time = row_event_time;
|
||||
IF row_event_type = 'state_change' THEN
|
||||
SET last_hard_state = row_hard_state;
|
||||
ELSEIF row_event_type = 'downtime_start' THEN
|
||||
SET active_downtimes = active_downtimes + 1;
|
||||
ELSEIF row_event_type = 'downtime_end' THEN
|
||||
SET active_downtimes = active_downtimes - 1;
|
||||
END IF;
|
||||
END LOOP;
|
||||
CLOSE cur;
|
||||
|
||||
SET result = 100 * (total_time - problem_time) / total_time;
|
||||
RETURN result;
|
||||
END//
|
||||
DELIMITER ;
|
||||
|
||||
CREATE TABLE host (
|
||||
id binary(20) NOT NULL COMMENT 'sha1(environment.id + name)',
|
||||
environment_id binary(20) NOT NULL COMMENT 'environment.id',
|
||||
|
|
@ -1124,6 +1282,39 @@ CREATE TABLE history (
|
|||
INDEX idx_history_host_service_id (host_id, service_id, event_time) COMMENT 'Host/service history detail filter'
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC;
|
||||
|
||||
CREATE TABLE sla_history_state (
|
||||
id binary(20) NOT NULL COMMENT 'state_history.id (may reference already deleted rows)',
|
||||
environment_id binary(20) NOT NULL COMMENT 'environment.id',
|
||||
endpoint_id binary(20) DEFAULT NULL COMMENT 'endpoint.id',
|
||||
object_type enum('host', 'service') NOT NULL,
|
||||
host_id binary(20) NOT NULL COMMENT 'host.id',
|
||||
service_id binary(20) DEFAULT NULL COMMENT 'service.id',
|
||||
|
||||
event_time bigint unsigned NOT NULL COMMENT 'unix timestamp the event occurred',
|
||||
hard_state TINYINT UNSIGNED NOT NULL COMMENT 'hard state after this event',
|
||||
previous_hard_state TINYINT UNSIGNED NOT NULL COMMENT 'hard state before this event',
|
||||
|
||||
PRIMARY KEY (id),
|
||||
|
||||
INDEX idx_sla_history_state_event (host_id, service_id, event_time)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC;
|
||||
|
||||
CREATE TABLE sla_history_downtime (
|
||||
environment_id binary(20) NOT NULL COMMENT 'environment.id',
|
||||
endpoint_id binary(20) DEFAULT NULL COMMENT 'endpoint.id',
|
||||
object_type enum('host', 'service') NOT NULL,
|
||||
host_id binary(20) NOT NULL COMMENT 'host.id',
|
||||
service_id binary(20) DEFAULT NULL COMMENT 'service.id',
|
||||
|
||||
downtime_id binary(20) NOT NULL COMMENT 'downtime.id (may reference already deleted rows)',
|
||||
downtime_start BIGINT UNSIGNED NOT NULL COMMENT 'start time of the downtime',
|
||||
downtime_end BIGINT UNSIGNED NOT NULL COMMENT 'end time of the downtime',
|
||||
|
||||
PRIMARY KEY (downtime_id),
|
||||
|
||||
INDEX idx_sla_history_downtime_event (host_id, service_id, downtime_start, downtime_end)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC;
|
||||
|
||||
CREATE TABLE icingadb_schema (
|
||||
id int unsigned NOT NULL AUTO_INCREMENT,
|
||||
version smallint unsigned NOT NULL,
|
||||
|
|
|
|||
|
|
@ -1,3 +1,161 @@
|
|||
-- get_sla_ok_percent returns, as a percentage with four decimal places, the share
-- of the interval between in_start_time and in_end_time during which the given
-- host (in_service_id IS NULL) or service was not in a problem state.
--
-- Parameters:
--   in_host_id     host.id of the checkable (for services: the host it belongs to)
--   in_service_id  service.id, or NULL to evaluate the host itself
--   in_start_time  start of the SLA interval, in the unit of sla_history_state.event_time
--   in_end_time    end of the SLA interval, must be greater than in_start_time
--
-- Time counts as problem time while the last known hard state is greater than 0
-- (hosts) or greater than 1 (services), is not 99, and no downtime is active.
-- Time directly preceding a state change whose previous_hard_state is 99 is
-- removed from the total instead (NOTE(review): 99 presumably marks a checkable
-- without any check result yet - confirm).
--
-- Signals SQLSTATE 45000 if in_end_time is not greater than in_start_time.
DROP FUNCTION IF EXISTS get_sla_ok_percent;
|
||||
DELIMITER //
|
||||
CREATE FUNCTION get_sla_ok_percent(
|
||||
in_host_id binary(20),
|
||||
in_service_id binary(20),
|
||||
in_start_time bigint unsigned,
|
||||
in_end_time bigint unsigned
|
||||
)
|
||||
RETURNS decimal(7, 4)
|
||||
READS SQL DATA
|
||||
BEGIN
|
||||
DECLARE result decimal(7, 4);
|
||||
DECLARE row_event_time bigint unsigned;
|
||||
DECLARE row_event_type enum('state_change', 'downtime_start', 'downtime_end', 'end');
|
||||
DECLARE row_event_prio int;
|
||||
DECLARE row_hard_state tinyint unsigned;
|
||||
DECLARE row_previous_hard_state tinyint unsigned;
|
||||
DECLARE last_event_time bigint unsigned;
|
||||
DECLARE last_hard_state tinyint unsigned;
|
||||
DECLARE active_downtimes int unsigned;
|
||||
DECLARE problem_time bigint unsigned;
|
||||
DECLARE total_time bigint unsigned;
|
||||
DECLARE done int;
|
||||
-- The cursor yields one chronological event stream for the interval. Events with
-- the same event_time are ordered by event_prio: state_change (0), downtime_start (1),
-- downtime_end (2), end (3).
DECLARE cur CURSOR FOR
|
||||
(
|
||||
-- all downtime_start events before the end of the SLA interval
|
||||
-- for downtimes that overlap the SLA interval in any way
|
||||
SELECT
|
||||
GREATEST(downtime_start, in_start_time) AS event_time,
|
||||
'downtime_start' AS event_type,
|
||||
1 AS event_prio,
|
||||
NULL AS hard_state,
|
||||
NULL AS previous_hard_state
|
||||
FROM sla_history_downtime d
|
||||
WHERE d.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id)
|
||||
AND d.downtime_start < in_end_time
|
||||
AND d.downtime_end >= in_start_time
|
||||
) UNION ALL (
|
||||
-- all downtime_end events before the end of the SLA interval
|
||||
-- for downtimes that overlap the SLA interval in any way
|
||||
SELECT
|
||||
downtime_end AS event_time,
|
||||
'downtime_end' AS event_type,
|
||||
2 AS event_prio,
|
||||
NULL AS hard_state,
|
||||
NULL AS previous_hard_state
|
||||
FROM sla_history_downtime d
|
||||
WHERE d.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id)
|
||||
AND d.downtime_start < in_end_time
|
||||
AND d.downtime_end >= in_start_time
|
||||
AND d.downtime_end < in_end_time
|
||||
) UNION ALL (
|
||||
-- all state events strictly in interval
|
||||
SELECT
|
||||
event_time,
|
||||
'state_change' AS event_type,
|
||||
0 AS event_prio,
|
||||
hard_state,
|
||||
previous_hard_state
|
||||
FROM sla_history_state s
|
||||
WHERE s.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
|
||||
AND s.event_time > in_start_time
|
||||
AND s.event_time < in_end_time
|
||||
) UNION ALL (
|
||||
-- end event to keep loop simple, values are not used
|
||||
SELECT
|
||||
in_end_time AS event_time,
|
||||
'end' AS event_type,
|
||||
3 AS event_prio,
|
||||
NULL AS hard_state,
|
||||
NULL AS previous_hard_state
|
||||
)
|
||||
ORDER BY event_time, event_prio;
|
||||
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = 1;
|
||||
|
||||
IF in_end_time <= in_start_time THEN
|
||||
SIGNAL SQLSTATE '45000' SET MESSAGE_TEXT = 'end time must be greater than start time';
|
||||
|
||||
END IF;
|
||||
|
||||
-- Use the latest event at or before the beginning of the SLA interval as the initial state.
|
||||
SELECT hard_state INTO last_hard_state
|
||||
FROM sla_history_state s
|
||||
WHERE s.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
|
||||
AND s.event_time <= in_start_time
|
||||
ORDER BY s.event_time DESC
|
||||
LIMIT 1;
|
||||
|
||||
-- If this does not exist, use the previous state from the first event after the beginning of the SLA interval.
|
||||
IF last_hard_state IS NULL THEN
|
||||
SELECT previous_hard_state INTO last_hard_state
|
||||
FROM sla_history_state s
|
||||
WHERE s.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
|
||||
AND s.event_time > in_start_time
|
||||
ORDER BY s.event_time ASC
|
||||
LIMIT 1;
|
||||
END IF;
|
||||
|
||||
-- If this also does not exist, use the current host/service state.
|
||||
IF last_hard_state IS NULL THEN
|
||||
IF in_service_id IS NULL THEN
|
||||
SELECT hard_state INTO last_hard_state
|
||||
FROM host_state s
|
||||
WHERE s.host_id = in_host_id;
|
||||
ELSE
|
||||
SELECT hard_state INTO last_hard_state
|
||||
FROM service_state s
|
||||
WHERE s.host_id = in_host_id
|
||||
AND s.service_id = in_service_id;
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
-- Last resort when no state is known at all: start from state 0.
IF last_hard_state IS NULL THEN
|
||||
SET last_hard_state = 0;
|
||||
END IF;
|
||||
|
||||
SET problem_time = 0;
|
||||
SET total_time = in_end_time - in_start_time;
|
||||
SET last_event_time = in_start_time;
|
||||
SET active_downtimes = 0;
|
||||
|
||||
-- The NOT FOUND handler also fires for the SELECT ... INTO lookups above when they
-- match no row, so the flag has to be reset right before the cursor loop.
SET done = 0;
|
||||
OPEN cur;
|
||||
read_loop: LOOP
|
||||
FETCH cur INTO row_event_time, row_event_type, row_event_prio, row_hard_state, row_previous_hard_state;
|
||||
IF done THEN
|
||||
LEAVE read_loop;
|
||||
END IF;
|
||||
|
||||
-- Account for the span between the previous event and this one before applying
-- the state carried by this event.
IF row_previous_hard_state = 99 THEN
|
||||
SET total_time = total_time - (row_event_time - last_event_time);
|
||||
ELSEIF ((in_service_id IS NULL AND last_hard_state > 0) OR (in_service_id IS NOT NULL AND last_hard_state > 1))
|
||||
AND last_hard_state != 99
|
||||
AND active_downtimes = 0
|
||||
THEN
|
||||
SET problem_time = problem_time + row_event_time - last_event_time;
|
||||
END IF;
|
||||
|
||||
SET last_event_time = row_event_time;
|
||||
IF row_event_type = 'state_change' THEN
|
||||
SET last_hard_state = row_hard_state;
|
||||
ELSEIF row_event_type = 'downtime_start' THEN
|
||||
SET active_downtimes = active_downtimes + 1;
|
||||
ELSEIF row_event_type = 'downtime_end' THEN
|
||||
-- active_downtimes is unsigned. Every downtime_end row has a matching downtime_start
-- row that sorts first, assuming downtime_start <= downtime_end (TODO confirm).
SET active_downtimes = active_downtimes - 1;
|
||||
END IF;
|
||||
END LOOP;
|
||||
CLOSE cur;
|
||||
|
||||
SET result = 100 * (total_time - problem_time) / total_time;
|
||||
RETURN result;
|
||||
END//
|
||||
DELIMITER ;
|
||||
|
||||
-- Replace the misspelled index idx_hostroup_name with idx_hostgroup_name on (name).
ALTER TABLE hostgroup
|
||||
DROP INDEX idx_hostroup_name,
|
||||
ADD INDEX idx_hostgroup_name (name) COMMENT 'Host/service/host group list filtered by host group name';
|
||||
|
|
@ -47,5 +205,52 @@ ALTER TABLE customvar
|
|||
-- Redefine flatname as varchar(512) with the utf8mb4_unicode_ci collation.
ALTER TABLE customvar_flat
|
||||
MODIFY flatname varchar(512) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT 'Path converted with `.` and `[ ]`';
|
||||
|
||||
-- sla_history_state keeps one row per hard state change of a host or service
-- (service_id is NULL for host rows). The id is the id of the originating
-- state_history row, which may already have been deleted there.
CREATE TABLE sla_history_state (
|
||||
id binary(20) NOT NULL COMMENT 'state_history.id (may reference already deleted rows)',
|
||||
environment_id binary(20) NOT NULL COMMENT 'environment.id',
|
||||
endpoint_id binary(20) DEFAULT NULL COMMENT 'endpoint.id',
|
||||
object_type enum('host', 'service') NOT NULL,
|
||||
host_id binary(20) NOT NULL COMMENT 'host.id',
|
||||
service_id binary(20) DEFAULT NULL COMMENT 'service.id',
|
||||
|
||||
event_time bigint unsigned NOT NULL COMMENT 'unix timestamp the event occurred',
|
||||
hard_state TINYINT UNSIGNED NOT NULL COMMENT 'hard state after this event',
|
||||
previous_hard_state TINYINT UNSIGNED NOT NULL COMMENT 'hard state before this event',
|
||||
|
||||
PRIMARY KEY (id),
|
||||
|
||||
INDEX idx_sla_history_state_event (host_id, service_id, event_time)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC;
|
||||
|
||||
-- Backfill the new table from the hard state changes already recorded in
-- state_history. The self-assignment in ON DUPLICATE KEY UPDATE turns a
-- primary key collision into a no-op instead of an error.
INSERT INTO sla_history_state
|
||||
(id, environment_id, endpoint_id, object_type, host_id, service_id, event_time, hard_state, previous_hard_state)
|
||||
SELECT id, environment_id, endpoint_id, object_type, host_id, service_id, event_time, hard_state, previous_hard_state
|
||||
FROM state_history
|
||||
WHERE state_type = 'hard'
|
||||
ON DUPLICATE KEY UPDATE sla_history_state.id = sla_history_state.id;
|
||||
|
||||
-- sla_history_downtime keeps one row per downtime of a host or service
-- (service_id is NULL for host rows), keyed by the id of the downtime, which
-- may already have been deleted from the downtime table.
CREATE TABLE sla_history_downtime (
|
||||
environment_id binary(20) NOT NULL COMMENT 'environment.id',
|
||||
endpoint_id binary(20) DEFAULT NULL COMMENT 'endpoint.id',
|
||||
object_type enum('host', 'service') NOT NULL,
|
||||
host_id binary(20) NOT NULL COMMENT 'host.id',
|
||||
service_id binary(20) DEFAULT NULL COMMENT 'service.id',
|
||||
|
||||
downtime_id binary(20) NOT NULL COMMENT 'downtime.id (may reference already deleted rows)',
|
||||
downtime_start BIGINT UNSIGNED NOT NULL COMMENT 'start time of the downtime',
|
||||
downtime_end BIGINT UNSIGNED NOT NULL COMMENT 'end time of the downtime',
|
||||
|
||||
PRIMARY KEY (downtime_id),
|
||||
|
||||
INDEX idx_sla_history_downtime_event (host_id, service_id, downtime_start, downtime_end)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC;
|
||||
|
||||
-- Backfill the new table from downtime_history. A cancelled downtime ends at its
-- cancel_time, any other downtime at its end_time. The self-assignment in
-- ON DUPLICATE KEY UPDATE turns a primary key collision into a no-op instead of an error.
INSERT INTO sla_history_downtime
|
||||
(environment_id, endpoint_id, object_type, host_id, service_id, downtime_id, downtime_start, downtime_end)
|
||||
SELECT environment_id, endpoint_id, object_type, host_id, service_id, downtime_id,
|
||||
start_time AS downtime_start, IF(has_been_cancelled = 'y', cancel_time, end_time) AS downtime_end
|
||||
FROM downtime_history
|
||||
ON DUPLICATE KEY UPDATE sla_history_downtime.downtime_id = sla_history_downtime.downtime_id;
|
||||
|
||||
-- Record schema version 3 together with the time the upgrade was applied.
-- UNIX_TIMESTAMP() returns the seconds since the Unix epoch, so multiplying by
-- 1000 yields a millisecond timestamp. CURRENT_TIMESTAMP() must not be used for
-- this: in numeric context it evaluates to a YYYYMMDDhhmmss number, not to a
-- Unix timestamp.
INSERT INTO icingadb_schema (version, TIMESTAMP)
|
||||
VALUES (3, UNIX_TIMESTAMP() * 1000);
|
||||
|
|
|
|||
|
|
@ -21,6 +21,146 @@ CREATE TYPE comment_type AS ENUM ( 'comment', 'ack' );
|
|||
CREATE TYPE notification_type AS ENUM ( 'downtime_start', 'downtime_end', 'downtime_removed', 'custom', 'acknowledgement', 'problem', 'recovery', 'flapping_start', 'flapping_end' );
|
||||
CREATE TYPE history_type AS ENUM ( 'notification', 'state_change', 'downtime_start', 'downtime_end', 'comment_add', 'comment_remove', 'flapping_start', 'flapping_end', 'ack_set', 'ack_clear' );
|
||||
|
||||
-- get_sla_ok_percent returns, as a percentage, the share of the interval between
-- in_start_time and in_end_time during which the given host (in_service_id IS NULL)
-- or service was not in a problem state.
--
-- Parameters:
--   in_host_id     host.id of the checkable (for services: the host it belongs to)
--   in_service_id  service.id, or NULL to evaluate the host itself
--   in_start_time  start of the SLA interval, in the unit of sla_history_state.event_time
--   in_end_time    end of the SLA interval, must be greater than in_start_time
--
-- Time counts as problem time while the last known hard state is greater than 0
-- (hosts) or greater than 1 (services), is not 99, and no downtime is active.
-- Time directly preceding a state change whose previous_hard_state is 99 is
-- removed from the total instead (NOTE(review): 99 presumably marks a checkable
-- without any check result yet - confirm).
--
-- Raises an exception if in_end_time is not greater than in_start_time.
CREATE OR REPLACE FUNCTION get_sla_ok_percent(
|
||||
in_host_id bytea20,
|
||||
in_service_id bytea20,
|
||||
in_start_time biguint,
|
||||
in_end_time biguint
|
||||
)
|
||||
RETURNS decimal(7, 4)
|
||||
LANGUAGE plpgsql
|
||||
STABLE
|
||||
PARALLEL RESTRICTED
|
||||
AS $$
|
||||
DECLARE
|
||||
last_event_time biguint := in_start_time;
|
||||
last_hard_state tinyuint;
|
||||
active_downtimes uint := 0;
|
||||
problem_time biguint := 0;
|
||||
total_time biguint;
|
||||
row record;
|
||||
BEGIN
|
||||
IF in_end_time <= in_start_time THEN
|
||||
RAISE 'end time must be greater than start time';
|
||||
END IF;
|
||||
|
||||
total_time := in_end_time - in_start_time;
|
||||
|
||||
-- Use the latest event at or before the beginning of the SLA interval as the initial state.
|
||||
SELECT hard_state INTO last_hard_state
|
||||
FROM sla_history_state s
|
||||
WHERE s.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
|
||||
AND s.event_time <= in_start_time
|
||||
ORDER BY s.event_time DESC
|
||||
LIMIT 1;
|
||||
|
||||
-- If this does not exist, use the previous state from the first event after the beginning of the SLA interval.
|
||||
IF last_hard_state IS NULL THEN
|
||||
SELECT previous_hard_state INTO last_hard_state
|
||||
FROM sla_history_state s
|
||||
WHERE s.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
|
||||
AND s.event_time > in_start_time
|
||||
ORDER BY s.event_time ASC
|
||||
LIMIT 1;
|
||||
END IF;
|
||||
|
||||
-- If this also does not exist, use the current host/service state.
|
||||
IF last_hard_state IS NULL THEN
|
||||
IF in_service_id IS NULL THEN
|
||||
SELECT hard_state INTO last_hard_state
|
||||
FROM host_state s
|
||||
WHERE s.host_id = in_host_id;
|
||||
ELSE
|
||||
SELECT hard_state INTO last_hard_state
|
||||
FROM service_state s
|
||||
WHERE s.host_id = in_host_id
|
||||
AND s.service_id = in_service_id;
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
-- Last resort when no state is known at all: start from state 0.
IF last_hard_state IS NULL THEN
|
||||
last_hard_state := 0;
|
||||
END IF;
|
||||
|
||||
-- Iterate over one chronological event stream for the interval. Events with the
-- same event_time are ordered by event_prio: state_change (0), downtime_start (1),
-- downtime_end (2), end (3).
FOR row IN
|
||||
(
|
||||
-- all downtime_start events before the end of the SLA interval
|
||||
-- for downtimes that overlap the SLA interval in any way
|
||||
SELECT
|
||||
GREATEST(downtime_start, in_start_time) AS event_time,
|
||||
'downtime_start' AS event_type,
|
||||
1 AS event_prio,
|
||||
NULL::tinyuint AS hard_state,
|
||||
NULL::tinyuint AS previous_hard_state
|
||||
FROM sla_history_downtime d
|
||||
WHERE d.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id)
|
||||
AND d.downtime_start < in_end_time
|
||||
AND d.downtime_end >= in_start_time
|
||||
) UNION ALL (
|
||||
-- all downtime_end events before the end of the SLA interval
|
||||
-- for downtimes that overlap the SLA interval in any way
|
||||
SELECT
|
||||
downtime_end AS event_time,
|
||||
'downtime_end' AS event_type,
|
||||
2 AS event_prio,
|
||||
NULL::tinyuint AS hard_state,
|
||||
NULL::tinyuint AS previous_hard_state
|
||||
FROM sla_history_downtime d
|
||||
WHERE d.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id)
|
||||
AND d.downtime_start < in_end_time
|
||||
AND d.downtime_end >= in_start_time
|
||||
AND d.downtime_end < in_end_time
|
||||
) UNION ALL (
|
||||
-- all state events strictly in interval
|
||||
SELECT
|
||||
event_time,
|
||||
'state_change' AS event_type,
|
||||
0 AS event_prio,
|
||||
hard_state,
|
||||
previous_hard_state
|
||||
FROM sla_history_state s
|
||||
WHERE s.host_id = in_host_id
|
||||
AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id)
|
||||
AND s.event_time > in_start_time
|
||||
AND s.event_time < in_end_time
|
||||
) UNION ALL (
|
||||
-- end event to keep loop simple, values are not used
|
||||
SELECT
|
||||
in_end_time AS event_time,
|
||||
'end' AS event_type,
|
||||
3 AS event_prio,
|
||||
NULL::tinyuint AS hard_state,
|
||||
NULL::tinyuint AS previous_hard_state
|
||||
)
|
||||
ORDER BY event_time, event_prio
|
||||
LOOP
|
||||
-- Account for the span between the previous event and this one before applying
-- the state carried by this event.
IF row.previous_hard_state = 99 THEN
|
||||
total_time := total_time - (row.event_time - last_event_time);
|
||||
ELSEIF ((in_service_id IS NULL AND last_hard_state > 0) OR (in_service_id IS NOT NULL AND last_hard_state > 1))
|
||||
AND last_hard_state != 99
|
||||
AND active_downtimes = 0
|
||||
THEN
|
||||
problem_time := problem_time + row.event_time - last_event_time;
|
||||
END IF;
|
||||
|
||||
last_event_time := row.event_time;
|
||||
IF row.event_type = 'state_change' THEN
|
||||
last_hard_state := row.hard_state;
|
||||
ELSEIF row.event_type = 'downtime_start' THEN
|
||||
active_downtimes := active_downtimes + 1;
|
||||
ELSEIF row.event_type = 'downtime_end' THEN
|
||||
active_downtimes := active_downtimes - 1;
|
||||
END IF;
|
||||
END LOOP;
|
||||
|
||||
RETURN 100 * (total_time - problem_time) / total_time;
|
||||
END;
|
||||
$$;
|
||||
|
||||
CREATE TABLE host (
|
||||
id bytea20 NOT NULL,
|
||||
environment_id bytea20 NOT NULL,
|
||||
|
|
@ -1894,6 +2034,68 @@ COMMENT ON COLUMN history.acknowledgement_history_id IS 'acknowledgement_history
|
|||
COMMENT ON INDEX idx_history_event_time IS 'History filtered/ordered by event_time';
|
||||
COMMENT ON INDEX idx_history_host_service_id IS 'Host/service history detail filter';
|
||||
|
||||
-- sla_history_state keeps one row per hard state change of a host or service
-- (service_id is NULL for host rows). The id is the id of the originating
-- state_history row, which may already have been deleted there.
CREATE TABLE sla_history_state (
|
||||
id bytea20 NOT NULL,
|
||||
environment_id bytea20 NOT NULL,
|
||||
endpoint_id bytea20 DEFAULT NULL,
|
||||
object_type checkable_type NOT NULL,
|
||||
host_id bytea20 NOT NULL,
|
||||
service_id bytea20 DEFAULT NULL,
|
||||
|
||||
event_time biguint NOT NULL,
|
||||
hard_state tinyuint NOT NULL,
|
||||
previous_hard_state tinyuint NOT NULL,
|
||||
|
||||
CONSTRAINT pk_sla_history_state PRIMARY KEY (id)
|
||||
);
|
||||
|
||||
-- PLAIN storage keeps the id columns inline and uncompressed.
ALTER TABLE sla_history_state ALTER COLUMN id SET STORAGE PLAIN;
|
||||
ALTER TABLE sla_history_state ALTER COLUMN environment_id SET STORAGE PLAIN;
|
||||
ALTER TABLE sla_history_state ALTER COLUMN endpoint_id SET STORAGE PLAIN;
|
||||
ALTER TABLE sla_history_state ALTER COLUMN host_id SET STORAGE PLAIN;
|
||||
ALTER TABLE sla_history_state ALTER COLUMN service_id SET STORAGE PLAIN;
|
||||
|
||||
-- Covers lookups by host_id, service_id and event_time.
CREATE INDEX idx_sla_history_state_event ON sla_history_state(host_id, service_id, event_time);
|
||||
|
||||
COMMENT ON COLUMN sla_history_state.id IS 'state_history.id (may reference already deleted rows)';
|
||||
COMMENT ON COLUMN sla_history_state.environment_id IS 'environment.id';
|
||||
COMMENT ON COLUMN sla_history_state.endpoint_id IS 'endpoint.id';
|
||||
COMMENT ON COLUMN sla_history_state.host_id IS 'host.id';
|
||||
COMMENT ON COLUMN sla_history_state.service_id IS 'service.id';
|
||||
COMMENT ON COLUMN sla_history_state.event_time IS 'unix timestamp the event occurred';
|
||||
COMMENT ON COLUMN sla_history_state.hard_state IS 'hard state after this event';
|
||||
COMMENT ON COLUMN sla_history_state.previous_hard_state IS 'hard state before this event';
|
||||
|
||||
-- sla_history_downtime keeps one row per downtime of a host or service
-- (service_id is NULL for host rows), keyed by the id of the downtime, which
-- may already have been deleted from the downtime table.
CREATE TABLE sla_history_downtime (
|
||||
environment_id bytea20 NOT NULL,
|
||||
endpoint_id bytea20 DEFAULT NULL,
|
||||
object_type checkable_type NOT NULL,
|
||||
host_id bytea20 NOT NULL,
|
||||
service_id bytea20 DEFAULT NULL,
|
||||
|
||||
downtime_id bytea20 NOT NULL,
|
||||
downtime_start biguint NOT NULL,
|
||||
downtime_end biguint NOT NULL,
|
||||
|
||||
CONSTRAINT pk_sla_history_downtime PRIMARY KEY (downtime_id)
|
||||
);
|
||||
|
||||
-- PLAIN storage keeps the id columns inline and uncompressed.
ALTER TABLE sla_history_downtime ALTER COLUMN environment_id SET STORAGE PLAIN;
|
||||
ALTER TABLE sla_history_downtime ALTER COLUMN endpoint_id SET STORAGE PLAIN;
|
||||
ALTER TABLE sla_history_downtime ALTER COLUMN host_id SET STORAGE PLAIN;
|
||||
ALTER TABLE sla_history_downtime ALTER COLUMN service_id SET STORAGE PLAIN;
|
||||
ALTER TABLE sla_history_downtime ALTER COLUMN downtime_id SET STORAGE PLAIN;
|
||||
|
||||
-- Covers lookups by host_id, service_id and the downtime start/end times.
CREATE INDEX idx_sla_history_downtime_event ON sla_history_downtime(host_id, service_id, downtime_start, downtime_end);
|
||||
|
||||
COMMENT ON COLUMN sla_history_downtime.environment_id IS 'environment.id';
|
||||
COMMENT ON COLUMN sla_history_downtime.endpoint_id IS 'endpoint.id';
|
||||
COMMENT ON COLUMN sla_history_downtime.host_id IS 'host.id';
|
||||
COMMENT ON COLUMN sla_history_downtime.service_id IS 'service.id';
|
||||
COMMENT ON COLUMN sla_history_downtime.downtime_id IS 'downtime.id (may reference already deleted rows)';
|
||||
COMMENT ON COLUMN sla_history_downtime.downtime_start IS 'start time of the downtime';
|
||||
COMMENT ON COLUMN sla_history_downtime.downtime_end IS 'end time of the downtime';
|
||||
|
||||
CREATE SEQUENCE icingadb_schema_id_seq;
|
||||
|
||||
CREATE TABLE icingadb_schema (
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import (
|
|||
"github.com/jmoiron/sqlx"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
|
@ -22,7 +23,8 @@ func TestCleanupAndRetention(t *testing.T) {
|
|||
t.Cleanup(func() { _ = db.Close() })
|
||||
|
||||
reten := retention{
|
||||
Days: 7,
|
||||
HistoryDays: 7,
|
||||
SlaDays: 30,
|
||||
Options: map[string]int{
|
||||
"acknowledgement": 0, // No cleanup.
|
||||
"comment": 1,
|
||||
|
|
@ -31,6 +33,16 @@ func TestCleanupAndRetention(t *testing.T) {
|
|||
},
|
||||
}
|
||||
|
||||
daysForCategory := func(category string) int {
|
||||
if strings.HasPrefix(category, "sla_") {
|
||||
return reten.SlaDays
|
||||
} else if d, ok := reten.Options[category]; ok {
|
||||
return d
|
||||
} else {
|
||||
return reten.HistoryDays
|
||||
}
|
||||
}
|
||||
|
||||
rowsToDelete := 10000
|
||||
rowsToSpare := 1000
|
||||
|
||||
|
|
@ -38,11 +50,7 @@ func TestCleanupAndRetention(t *testing.T) {
|
|||
err := dropNotNullColumns(db, stmt)
|
||||
assert.NoError(t, err)
|
||||
|
||||
retentionDays, ok := reten.Options[category]
|
||||
if !ok {
|
||||
retentionDays = reten.Days
|
||||
}
|
||||
|
||||
retentionDays := daysForCategory(category)
|
||||
start := time.Now().AddDate(0, 0, -retentionDays).Add(-1 * time.Millisecond * time.Duration(rowsToDelete))
|
||||
startMilli := start.UnixMilli()
|
||||
|
||||
|
|
@ -75,18 +83,14 @@ func TestCleanupAndRetention(t *testing.T) {
|
|||
i.Reload()
|
||||
waitForDumpDoneSignal(t, r, 20*time.Second, 100*time.Millisecond)
|
||||
config, err := yaml.Marshal(struct {
|
||||
Retention retention `yaml:"history-retention"`
|
||||
Retention retention `yaml:"retention"`
|
||||
}{reten})
|
||||
assert.NoError(t, err)
|
||||
it.IcingaDbInstanceT(t, r, rdb, services.WithIcingaDbConfig(string(config)))
|
||||
|
||||
eventually.Assert(t, func(t require.TestingT) {
|
||||
for category, stmt := range retentionStatements {
|
||||
retentionDays, ok := reten.Options[category]
|
||||
if !ok {
|
||||
retentionDays = reten.Days
|
||||
}
|
||||
|
||||
retentionDays := daysForCategory(category)
|
||||
threshold := time.Now().AddDate(0, 0, -retentionDays)
|
||||
thresholdMilli := threshold.UnixMilli()
|
||||
|
||||
|
|
@ -106,10 +110,10 @@ func TestCleanupAndRetention(t *testing.T) {
|
|||
|
||||
if retentionDays == 0 {
|
||||
// No cleanup.
|
||||
assert.Equal(t, rowsToDelete+rowsToSpare, rowsLeft+rowsSpared, "all rows should still be there")
|
||||
assert.Equal(t, rowsToDelete+rowsToSpare, rowsLeft+rowsSpared, "all rows should still be there for %s", category)
|
||||
} else {
|
||||
assert.Equal(t, 0, rowsLeft, "rows left in retention period")
|
||||
assert.Equal(t, rowsToSpare, rowsSpared, "rows spared")
|
||||
assert.Equal(t, 0, rowsLeft, "rows left in retention period for %s", category)
|
||||
assert.Equal(t, rowsToSpare, rowsSpared, "rows spared for %s", category)
|
||||
}
|
||||
}
|
||||
}, time.Minute, time.Second)
|
||||
|
|
@ -122,8 +126,9 @@ type cleanupStmt struct {
|
|||
}
|
||||
|
||||
type retention struct {
|
||||
Days int `yaml:"days"`
|
||||
Options map[string]int `yaml:"options"`
|
||||
HistoryDays int `yaml:"history-days"`
|
||||
SlaDays int `yaml:"sla-days"`
|
||||
Options map[string]int `yaml:"options"`
|
||||
}
|
||||
|
||||
var retentionStatements = map[string]cleanupStmt{
|
||||
|
|
@ -157,6 +162,16 @@ var retentionStatements = map[string]cleanupStmt{
|
|||
PK: "id",
|
||||
Column: "event_time",
|
||||
},
|
||||
"sla_downtime": {
|
||||
Table: "sla_history_downtime",
|
||||
PK: "downtime_id",
|
||||
Column: "downtime_end",
|
||||
},
|
||||
"sla_state": {
|
||||
Table: "sla_history_state",
|
||||
PK: "id",
|
||||
Column: "event_time",
|
||||
},
|
||||
}
|
||||
|
||||
// dropNotNullColumns drops all columns with a NOT NULL constraint that are not
|
||||
|
|
|
|||
|
|
@ -5,10 +5,12 @@ go 1.16
|
|||
require (
|
||||
github.com/containerd/containerd v1.5.6 // indirect
|
||||
github.com/go-redis/redis/v8 v8.11.4
|
||||
github.com/go-sql-driver/mysql v1.6.0
|
||||
github.com/goccy/go-yaml v1.9.5
|
||||
github.com/google/uuid v1.3.0
|
||||
github.com/icinga/icinga-testing v0.0.0-20220503150619-1c215361234c
|
||||
github.com/icinga/icinga-testing v0.0.0-20220513095329-9c98d3145b01
|
||||
github.com/jmoiron/sqlx v1.3.4
|
||||
github.com/lib/pq v1.10.5
|
||||
github.com/stretchr/testify v1.7.0
|
||||
go.uber.org/zap v1.21.0
|
||||
golang.org/x/net v0.0.0-20211020060615-d418f374d309 // indirect
|
||||
|
|
|
|||
13
tests/go.sum
13
tests/go.sum
|
|
@ -37,6 +37,8 @@ github.com/Azure/go-autorest/logger v0.2.0/go.mod h1:T9E3cAhj2VqvPOtCYAvby9aBXkZ
|
|||
github.com/Azure/go-autorest/tracing v0.6.0/go.mod h1:+vhtPC754Xsa23ID7GlGsrdKBpUA79WCAKPPZVC2DeU=
|
||||
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
|
||||
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
|
||||
github.com/Icinga/go-libs v0.0.0-20220420130327-ef58ad52edd8 h1:hG4Y/LPERK9i+P8/jnYlq9PeDd9deIkwEWOIimDU3uk=
|
||||
github.com/Icinga/go-libs v0.0.0-20220420130327-ef58ad52edd8/go.mod h1:xlgU55MKs/vIg1fMlAEBSrslahYayZNwjXvf3w1dvyA=
|
||||
github.com/Microsoft/go-winio v0.4.11/go.mod h1:VhR8bwka0BXejwEJY73c50VrPtXAaKcyvVC4A4RozmA=
|
||||
github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jBhyzoq1bpyYA=
|
||||
github.com/Microsoft/go-winio v0.4.15-0.20190919025122-fc70bd9a86b5/go.mod h1:tTuCMEN+UleMWgg9dVx4Hu52b1bJo+59jBh3ajtinzw=
|
||||
|
|
@ -60,6 +62,7 @@ github.com/Microsoft/hcsshim/test v0.0.0-20201218223536-d3e5debf77da/go.mod h1:5
|
|||
github.com/Microsoft/hcsshim/test v0.0.0-20210227013316-43a75bb4edd3/go.mod h1:mw7qgWloBUl75W/gVH3cQszUg1+gUITj7D6NY7ywVnY=
|
||||
github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ=
|
||||
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
|
||||
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
|
||||
github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
|
||||
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
|
||||
github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ=
|
||||
|
|
@ -68,6 +71,7 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuy
|
|||
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
|
||||
github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
|
||||
github.com/alexflint/go-filemutex v0.0.0-20171022225611-72bdc8eae2ae/go.mod h1:CgnQgUtFrFz9mxFNtED3jI5tLDjKlOM+oUF/sTk6ps0=
|
||||
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
||||
github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
|
||||
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
|
||||
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
|
||||
|
|
@ -402,8 +406,8 @@ github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ
|
|||
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
|
||||
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
|
||||
github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
|
||||
github.com/icinga/icinga-testing v0.0.0-20220503150619-1c215361234c h1:jJVPK9dvsZ99Xl/xDOBM+DEj+7i7En51/04ckc/lTZc=
|
||||
github.com/icinga/icinga-testing v0.0.0-20220503150619-1c215361234c/go.mod h1:W9pLmq2dsgLSag568N/LDHNu4oah6qWvjT05Drz2RYw=
|
||||
github.com/icinga/icinga-testing v0.0.0-20220513095329-9c98d3145b01 h1:0dwlZFGWPnmmhvHr2P7chxMwzbW7+R3iX6SyeFBd+WM=
|
||||
github.com/icinga/icinga-testing v0.0.0-20220513095329-9c98d3145b01/go.mod h1:ZP0pyqhmrRwwQ6FpAfz7UZMgmH7i3vOjEOm9JcFwOw0=
|
||||
github.com/imdario/mergo v0.3.5/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA=
|
||||
github.com/imdario/mergo v0.3.8/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA=
|
||||
github.com/imdario/mergo v0.3.10/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA=
|
||||
|
|
@ -444,8 +448,9 @@ github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
|
|||
github.com/leodido/go-urn v1.2.0 h1:hpXL4XnriNwQ/ABnpepYM/1vCLWNDfUNts8dX3xTG6Y=
|
||||
github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII=
|
||||
github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
|
||||
github.com/lib/pq v1.10.4 h1:SO9z7FRPzA03QhHKJrH5BXA6HU1rS4V2nIVrrNC1iYk=
|
||||
github.com/lib/pq v1.10.4/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
|
||||
github.com/lib/pq v1.10.5 h1:J+gdV2cUmX7ZqL2B0lFcW0m+egaHC2V3lpO8nWxyYiQ=
|
||||
github.com/lib/pq v1.10.5/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
|
||||
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
|
||||
github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
|
||||
github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
|
||||
|
|
@ -459,6 +464,7 @@ github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHX
|
|||
github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU=
|
||||
github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
|
||||
github.com/mattn/go-shellwords v1.0.3/go.mod h1:3xCvwCdWdlDJUrvuMn7Wuy9eWs4pE8vqg+NOMyg4B2o=
|
||||
github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus=
|
||||
github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRUbg=
|
||||
github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
|
||||
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
|
||||
|
|
@ -725,6 +731,7 @@ golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzB
|
|||
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
|
|
|
|||
28
tests/internal/utils/database.go
Normal file
28
tests/internal/utils/database.go
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
package utils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/icinga/icinga-testing"
|
||||
"github.com/icinga/icinga-testing/services"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func GetDatabase(it *icingatesting.IT, t testing.TB) services.RelationalDatabase {
|
||||
k := "ICINGADB_TESTS_DATABASE_TYPE"
|
||||
v := strings.ToLower(os.Getenv(k))
|
||||
|
||||
var rdb services.RelationalDatabase
|
||||
|
||||
switch v {
|
||||
case "mysql":
|
||||
rdb = it.MysqlDatabaseT(t)
|
||||
case "pgsql":
|
||||
rdb = it.PostgresqlDatabaseT(t)
|
||||
default:
|
||||
panic(fmt.Sprintf(`unknown database in %s environment variable: %q (must be "mysql" or "pgsql")`, k, v))
|
||||
}
|
||||
|
||||
return rdb
|
||||
}
|
||||
|
|
@ -1,10 +1,9 @@
|
|||
package icingadb_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/icinga/icinga-testing"
|
||||
"github.com/icinga/icinga-testing/services"
|
||||
"os"
|
||||
"github.com/icinga/icingadb/tests/internal/utils"
|
||||
"testing"
|
||||
)
|
||||
|
||||
|
|
@ -26,19 +25,5 @@ func getDatabase(t testing.TB) services.RelationalDatabase {
|
|||
}
|
||||
|
||||
func getEmptyDatabase(t testing.TB) services.RelationalDatabase {
|
||||
k := "ICINGADB_TESTS_DATABASE_TYPE"
|
||||
v := os.Getenv(k)
|
||||
|
||||
var rdb services.RelationalDatabase
|
||||
|
||||
switch v {
|
||||
case "mysql":
|
||||
rdb = it.MysqlDatabaseT(t)
|
||||
case "pgsql":
|
||||
rdb = it.PostgresqlDatabaseT(t)
|
||||
default:
|
||||
panic(fmt.Sprintf(`unknown database in %s environment variable: %q (must be "mysql" or "pgsql")`, k, v))
|
||||
}
|
||||
|
||||
return rdb
|
||||
return utils.GetDatabase(it, t)
|
||||
}
|
||||
|
|
|
|||
385
tests/sla_test.go
Normal file
385
tests/sla_test.go
Normal file
|
|
@ -0,0 +1,385 @@
|
|||
package icingadb_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"github.com/icinga/icinga-testing/utils"
|
||||
"github.com/icinga/icinga-testing/utils/eventually"
|
||||
"github.com/jmoiron/sqlx"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/zap"
|
||||
"math"
|
||||
"net/http"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestSla(t *testing.T) {
|
||||
m := it.MysqlDatabaseT(t)
|
||||
m.ImportIcingaDbSchema()
|
||||
|
||||
r := it.RedisServerT(t)
|
||||
i := it.Icinga2NodeT(t, "master")
|
||||
i.EnableIcingaDb(r)
|
||||
err := i.Reload()
|
||||
require.NoError(t, err, "icinga2 should reload without error")
|
||||
it.IcingaDbInstanceT(t, r, m)
|
||||
|
||||
client := i.ApiClient()
|
||||
|
||||
t.Run("StateEvents", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
hostname := utils.UniqueName(t, "host")
|
||||
client.CreateHost(t, hostname, map[string]interface{}{
|
||||
"attrs": map[string]interface{}{
|
||||
"enable_active_checks": false,
|
||||
"enable_passive_checks": true,
|
||||
"check_command": "dummy",
|
||||
"max_check_attempts": 3,
|
||||
},
|
||||
})
|
||||
|
||||
type StateChange struct {
|
||||
Time float64
|
||||
State int
|
||||
}
|
||||
|
||||
var stateChanges []StateChange
|
||||
|
||||
processCheckResult := func(exitStatus int, isHard bool) *ObjectsHostsResponse {
|
||||
time.Sleep(10 * time.Millisecond) // ensure there is a bit of difference in ms resolution
|
||||
|
||||
output := utils.UniqueName(t, "output")
|
||||
data := ActionsProcessCheckResultRequest{
|
||||
Type: "Host",
|
||||
Filter: fmt.Sprintf(`host.name==%q`, hostname),
|
||||
ExitStatus: exitStatus,
|
||||
PluginOutput: output,
|
||||
}
|
||||
dataJson, err := json.Marshal(data)
|
||||
require.NoError(t, err, "marshal request")
|
||||
response, err := client.PostJson("/v1/actions/process-check-result", bytes.NewBuffer(dataJson))
|
||||
require.NoError(t, err, "process-check-result")
|
||||
require.Equal(t, 200, response.StatusCode, "process-check-result")
|
||||
|
||||
response, err = client.GetJson("/v1/objects/hosts/" + hostname)
|
||||
require.NoError(t, err, "get host: request")
|
||||
require.Equal(t, 200, response.StatusCode, "get host: request")
|
||||
|
||||
var hosts ObjectsHostsResponse
|
||||
err = json.NewDecoder(response.Body).Decode(&hosts)
|
||||
require.NoError(t, err, "get host: parse response")
|
||||
|
||||
require.Equal(t, 1, len(hosts.Results), "there must be one host in the response")
|
||||
host := hosts.Results[0]
|
||||
require.Equal(t, output, host.Attrs.LastCheckResult.Output,
|
||||
"last check result should be visible in host object")
|
||||
require.Equal(t, exitStatus, host.Attrs.State, "soft state should match check result")
|
||||
|
||||
if isHard {
|
||||
require.Equal(t, exitStatus, host.Attrs.LastHardState, "hard state should match check result")
|
||||
if len(stateChanges) > 0 {
|
||||
require.Greater(t, host.Attrs.LastHardStateChange, stateChanges[len(stateChanges)-1].Time,
|
||||
"last_hard_state_change_time of host should have changed")
|
||||
}
|
||||
stateChanges = append(stateChanges, StateChange{
|
||||
Time: host.Attrs.LastHardStateChange,
|
||||
State: exitStatus,
|
||||
})
|
||||
} else {
|
||||
require.NotEmpty(t, stateChanges, "there should be a hard state change prior to a soft one")
|
||||
require.Equal(t, stateChanges[len(stateChanges)-1].Time, host.Attrs.LastHardStateChange,
|
||||
"check result should not lead to a hard state change, i.e. last_hard_state_change should not change")
|
||||
}
|
||||
|
||||
return &hosts
|
||||
}
|
||||
|
||||
processCheckResult(0, true) // hard (UNKNOWN -> UP)
|
||||
processCheckResult(1, false) // soft
|
||||
processCheckResult(1, false) // soft
|
||||
processCheckResult(1, true) // hard (UP -> DOWN)
|
||||
processCheckResult(1, false) // hard
|
||||
processCheckResult(0, true) // hard (DOWN -> UP)
|
||||
processCheckResult(0, false) // hard
|
||||
|
||||
assert.Equal(t, 3, len(stateChanges), "there should be three hard state changes")
|
||||
|
||||
db, err := sqlx.Connect("mysql", m.DSN())
|
||||
require.NoError(t, err, "connecting to mysql")
|
||||
defer func() { _ = db.Close() }()
|
||||
|
||||
type Row struct {
|
||||
Time int64 `db:"event_time"`
|
||||
State int `db:"hard_state"`
|
||||
}
|
||||
|
||||
eventually.Assert(t, func(t require.TestingT) {
|
||||
var rows []Row
|
||||
err = db.Select(&rows, db.Rebind("SELECT s.event_time, s.hard_state FROM sla_history_state s "+
|
||||
"JOIN host ON host.id = s.host_id WHERE host.name = ? ORDER BY event_time ASC"), hostname)
|
||||
require.NoError(t, err, "select sla_history_state")
|
||||
|
||||
assert.Equal(t, len(stateChanges), len(rows), "number of sla_history_state entries")
|
||||
|
||||
for i := range rows {
|
||||
assert.WithinDuration(t, time.UnixMilli(int64(stateChanges[i].Time*1000)), time.UnixMilli(rows[i].Time),
|
||||
time.Millisecond, "event time should match state change time")
|
||||
assert.Equal(t, stateChanges[i].State, rows[i].State, "hard state should match")
|
||||
}
|
||||
}, 5*time.Second, 200*time.Millisecond)
|
||||
|
||||
redis := r.Open()
|
||||
defer func() { _ = redis.Close() }()
|
||||
|
||||
logger := it.Logger(t)
|
||||
|
||||
logger.Debug("redis state history", zap.Bool("before", true))
|
||||
eventually.Assert(t, func(t require.TestingT) {
|
||||
result, err := redis.XRange(context.Background(), "icinga:history:stream:state", "-", "+").Result()
|
||||
require.NoError(t, err, "reading state history stream should not fail")
|
||||
logger.Debug("redis state history", zap.Any("values", result))
|
||||
assert.Empty(t, result, "redis state history stream should be drained")
|
||||
}, 5*time.Second, 10*time.Millisecond)
|
||||
logger.Debug("redis state history", zap.Bool("after", true))
|
||||
})
|
||||
|
||||
t.Run("DowntimeEvents", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
type Options struct {
|
||||
Fixed bool // Whether to schedule a fixed or flexible downtime.
|
||||
Cancel bool // Whether to cancel the downtime or let it expire.
|
||||
}
|
||||
|
||||
downtimeTest := func(t *testing.T, o Options) {
|
||||
hostname := utils.UniqueName(t, "host")
|
||||
client.CreateHost(t, hostname, map[string]interface{}{
|
||||
"attrs": map[string]interface{}{
|
||||
"enable_active_checks": false,
|
||||
"enable_passive_checks": true,
|
||||
"check_command": "dummy",
|
||||
"max_check_attempts": 1,
|
||||
},
|
||||
})
|
||||
|
||||
processCheckResult := func(status int) time.Time {
|
||||
output := utils.RandomString(8)
|
||||
reqBody, err := json.Marshal(ActionsProcessCheckResultRequest{
|
||||
Type: "Host",
|
||||
Filter: fmt.Sprintf(`host.name==%q`, hostname),
|
||||
ExitStatus: status,
|
||||
PluginOutput: output,
|
||||
})
|
||||
require.NoError(t, err, "marshal request")
|
||||
response, err := client.PostJson("/v1/actions/process-check-result", bytes.NewBuffer(reqBody))
|
||||
require.NoError(t, err, "process-check-result")
|
||||
require.Equal(t, 200, response.StatusCode, "process-check-result")
|
||||
|
||||
response, err = client.GetJson("/v1/objects/hosts/" + hostname)
|
||||
require.NoError(t, err, "get host: request")
|
||||
require.Equal(t, 200, response.StatusCode, "get host: request")
|
||||
|
||||
var hosts ObjectsHostsResponse
|
||||
err = json.NewDecoder(response.Body).Decode(&hosts)
|
||||
require.NoError(t, err, "get host: parse response")
|
||||
|
||||
require.Equal(t, 1, len(hosts.Results), "there must be one host in the response")
|
||||
host := hosts.Results[0]
|
||||
require.Equal(t, output, host.Attrs.LastCheckResult.Output,
|
||||
"last check result should be visible in host object")
|
||||
require.Equal(t, 1, host.Attrs.StateType, "host should be in hard state")
|
||||
require.Equal(t, status, host.Attrs.State, "state should match check result")
|
||||
|
||||
sec, nsec := math.Modf(host.Attrs.LastCheckResult.ExecutionEnd)
|
||||
return time.Unix(int64(sec), int64(nsec*1e9))
|
||||
}
|
||||
|
||||
// Ensure that host is in UP state.
|
||||
processCheckResult(0)
|
||||
|
||||
refTime := time.Now().Truncate(time.Second)
|
||||
// Schedule the downtime start in the past so that we would notice if Icinga 2/DB would
|
||||
// use the current time somewhere where we expect the scheduled start time.
|
||||
downtimeStart := refTime.Add(-1 * time.Hour)
|
||||
var downtimeEnd time.Time
|
||||
if o.Cancel || !o.Fixed {
|
||||
// Downtimes we will cancel can expire long in the future as we don't have to wait for it.
|
||||
// Same for flexible downtimes as for these, we don't have to wait until the scheduled end but only
|
||||
// for their duration.
|
||||
downtimeEnd = refTime.Add(1 * time.Hour)
|
||||
} else {
|
||||
// Let all other downtimes expire soon (fixed downtimes where we wait for expiry).
|
||||
downtimeEnd = refTime.Add(5 * time.Second)
|
||||
}
|
||||
|
||||
var duration time.Duration
|
||||
if !o.Fixed {
|
||||
duration = 10 * time.Second
|
||||
}
|
||||
req, err := json.Marshal(ActionsScheduleDowntimeRequest{
|
||||
Type: "Host",
|
||||
Filter: fmt.Sprintf(`host.name==%q`, hostname),
|
||||
StartTime: downtimeStart.Unix(),
|
||||
EndTime: downtimeEnd.Unix(),
|
||||
Fixed: o.Fixed,
|
||||
Duration: duration.Seconds(),
|
||||
Author: utils.RandomString(8),
|
||||
Comment: utils.RandomString(8),
|
||||
})
|
||||
require.NoError(t, err, "marshal request")
|
||||
response, err := client.PostJson("/v1/actions/schedule-downtime", bytes.NewBuffer(req))
|
||||
require.NoError(t, err, "schedule-downtime")
|
||||
require.Equal(t, 200, response.StatusCode, "schedule-downtime")
|
||||
|
||||
var scheduleResponse ActionsScheduleDowntimeResponse
|
||||
err = json.NewDecoder(response.Body).Decode(&scheduleResponse)
|
||||
require.NoError(t, err, "decode schedule-downtime response")
|
||||
require.Equal(t, 1, len(scheduleResponse.Results), "schedule-downtime should return 1 result")
|
||||
require.Equal(t, http.StatusOK, scheduleResponse.Results[0].Code, "schedule-downtime should return 1 result")
|
||||
downtimeName := scheduleResponse.Results[0].Name
|
||||
|
||||
type Row struct {
|
||||
Start int64 `db:"downtime_start"`
|
||||
End int64 `db:"downtime_end"`
|
||||
}
|
||||
|
||||
db, err := sqlx.Connect("mysql", m.DSN())
|
||||
require.NoError(t, err, "connecting to mysql")
|
||||
defer func() { _ = db.Close() }()
|
||||
|
||||
if !o.Fixed {
|
||||
// Give Icinga 2 and Icinga DB some time that if they would generate an SLA history event in error,
|
||||
// they have a chance to do so before we check for its absence.
|
||||
time.Sleep(10 * time.Second)
|
||||
|
||||
var count int
|
||||
err = db.Get(&count, db.Rebind("SELECT COUNT(*) FROM sla_history_downtime s "+
|
||||
"JOIN host ON host.id = s.host_id WHERE host.name = ?"), hostname)
|
||||
require.NoError(t, err, "select sla_history_state")
|
||||
assert.Zero(t, count, "there should be no event in sla_history_downtime when scheduling a flexible downtime on an UP host")
|
||||
}
|
||||
|
||||
// Bring host into DOWN state.
|
||||
criticalTime := processCheckResult(1)
|
||||
|
||||
eventually.Assert(t, func(t require.TestingT) {
|
||||
var rows []Row
|
||||
err = db.Select(&rows, db.Rebind("SELECT s.downtime_start, s.downtime_end FROM sla_history_downtime s "+
|
||||
"JOIN host ON host.id = s.host_id WHERE host.name = ?"), hostname)
|
||||
require.NoError(t, err, "select sla_history_state")
|
||||
|
||||
require.Equal(t, 1, len(rows), "there should be exactly one sla_history_downtime row")
|
||||
if o.Fixed {
|
||||
assert.Equal(t, downtimeStart, time.UnixMilli(rows[0].Start),
|
||||
"downtime_start should match scheduled start time")
|
||||
assert.Equal(t, downtimeEnd, time.UnixMilli(rows[0].End),
|
||||
"downtime_end should match scheduled end time")
|
||||
} else {
|
||||
assert.WithinDuration(t, criticalTime, time.UnixMilli(rows[0].Start), time.Second,
|
||||
"downtime_start should match time of host state change")
|
||||
assert.Equal(t, duration, time.UnixMilli(rows[0].End).Sub(time.UnixMilli(rows[0].Start)),
|
||||
"downtime_end - downtime_start duration should match scheduled duration")
|
||||
}
|
||||
}, 5*time.Second, 200*time.Millisecond)
|
||||
|
||||
redis := r.Open()
|
||||
defer func() { _ = redis.Close() }()
|
||||
|
||||
eventually.Assert(t, func(t require.TestingT) {
|
||||
result, err := redis.XRange(context.Background(), "icinga:history:stream:downtime", "-", "+").Result()
|
||||
require.NoError(t, err, "reading downtime history stream should not fail")
|
||||
assert.Empty(t, result, "redis downtime history stream should be drained")
|
||||
}, 5*time.Second, 10*time.Millisecond)
|
||||
|
||||
if o.Cancel {
|
||||
req, err = json.Marshal(ActionsRemoveDowntimeRequest{
|
||||
Downtime: downtimeName,
|
||||
})
|
||||
require.NoError(t, err, "marshal remove-downtime request")
|
||||
response, err = client.PostJson("/v1/actions/remove-downtime", bytes.NewBuffer(req))
|
||||
require.NoError(t, err, "remove-downtime")
|
||||
require.Equal(t, 200, response.StatusCode, "remove-downtime")
|
||||
}
|
||||
|
||||
downtimeCancel := time.Now()
|
||||
|
||||
if !o.Cancel {
|
||||
// Wait for downtime to expire + a few extra seconds. The row should not be updated, give
|
||||
// enough time to have a chance catching if Icinga DB updates it nonetheless.
|
||||
if !o.Fixed {
|
||||
time.Sleep(duration + 5*time.Second)
|
||||
} else {
|
||||
d := time.Until(downtimeEnd) + 5*time.Second
|
||||
require.Less(t, d, time.Minute, "bug in tests: don't wait too long")
|
||||
time.Sleep(d)
|
||||
}
|
||||
}
|
||||
|
||||
eventually.Assert(t, func(t require.TestingT) {
|
||||
var rows []Row
|
||||
err = db.Select(&rows, db.Rebind("SELECT s.downtime_start, s.downtime_end FROM sla_history_downtime s "+
|
||||
"JOIN host ON host.id = s.host_id WHERE host.name = ?"), hostname)
|
||||
require.NoError(t, err, "select sla_history_state")
|
||||
|
||||
require.Equal(t, 1, len(rows), "there should be exactly one sla_history_downtime row")
|
||||
if o.Fixed {
|
||||
assert.Equal(t, downtimeStart, time.UnixMilli(rows[0].Start),
|
||||
"downtime_start should match scheduled start")
|
||||
} else {
|
||||
assert.WithinDuration(t, criticalTime, time.UnixMilli(rows[0].Start), time.Second,
|
||||
"downtime_start should match critical time")
|
||||
}
|
||||
if o.Cancel {
|
||||
// Allow more delta for the end time after cancel as we did not choose the exact time.
|
||||
assert.WithinDuration(t, downtimeCancel, time.UnixMilli(rows[0].End), time.Second,
|
||||
"downtime_end should match cancel time")
|
||||
} else if o.Fixed {
|
||||
assert.Equal(t, downtimeEnd, time.UnixMilli(rows[0].End),
|
||||
"downtime_start should match scheduled end")
|
||||
} else {
|
||||
assert.Equal(t, duration, time.UnixMilli(rows[0].End).Sub(time.UnixMilli(rows[0].Start)),
|
||||
"downtime_end - downtime_start duration should match scheduled duration")
|
||||
}
|
||||
}, 5*time.Second, 200*time.Millisecond)
|
||||
|
||||
eventually.Assert(t, func(t require.TestingT) {
|
||||
result, err := redis.XRange(context.Background(), "icinga:history:stream:downtime", "-", "+").Result()
|
||||
require.NoError(t, err, "reading downtime history stream should not fail")
|
||||
assert.Empty(t, result, "redis downtime history stream should be drained")
|
||||
}, 5*time.Second, 10*time.Millisecond)
|
||||
}
|
||||
|
||||
t.Run("Fixed", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("Cancel", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
downtimeTest(t, Options{Fixed: true, Cancel: true})
|
||||
})
|
||||
|
||||
t.Run("Expire", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
downtimeTest(t, Options{Fixed: true, Cancel: false})
|
||||
})
|
||||
})
|
||||
|
||||
t.Run("Flexible", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("Cancel", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
downtimeTest(t, Options{Fixed: false, Cancel: true})
|
||||
})
|
||||
|
||||
t.Run("Expire", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
downtimeTest(t, Options{Fixed: false, Cancel: false})
|
||||
})
|
||||
})
|
||||
})
|
||||
}
|
||||
23
tests/sql/main_test.go
Normal file
23
tests/sql/main_test.go
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
package sql_test
|
||||
|
||||
import (
|
||||
"github.com/icinga/icinga-testing"
|
||||
"github.com/icinga/icinga-testing/services"
|
||||
"github.com/icinga/icingadb/tests/internal/utils"
|
||||
"testing"
|
||||
)
|
||||
|
||||
var it *icingatesting.IT
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
it = icingatesting.NewIT()
|
||||
defer it.Cleanup()
|
||||
|
||||
m.Run()
|
||||
}
|
||||
|
||||
func getDatabase(t testing.TB) services.RelationalDatabase {
|
||||
rdb := utils.GetDatabase(it, t)
|
||||
rdb.ImportIcingaDbSchema()
|
||||
return rdb
|
||||
}
|
||||
406
tests/sql/sla_test.go
Normal file
406
tests/sql/sla_test.go
Normal file
|
|
@ -0,0 +1,406 @@
|
|||
package sql_test
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"database/sql/driver"
|
||||
"fmt"
|
||||
"github.com/go-sql-driver/mysql"
|
||||
"github.com/jmoiron/sqlx"
|
||||
"github.com/lib/pq"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestSla(t *testing.T) {
|
||||
rdb := getDatabase(t)
|
||||
db, err := sqlx.Open(rdb.Driver(), rdb.DSN())
|
||||
require.NoError(t, err, "connect to database")
|
||||
|
||||
type TestData struct {
|
||||
Name string
|
||||
Events []SlaHistoryEvent
|
||||
Start uint64
|
||||
End uint64
|
||||
Expected float64
|
||||
}
|
||||
|
||||
tests := []TestData{{
|
||||
Name: "EmptyHistory",
|
||||
// Empty history implies no previous problem state, therefore SLA should be 100%
|
||||
Events: nil,
|
||||
Start: 1000,
|
||||
End: 2000,
|
||||
Expected: 100.0,
|
||||
}, {
|
||||
Name: "MultipleStateChanges",
|
||||
// Some flapping, test that all changes are considered.
|
||||
Events: []SlaHistoryEvent{
|
||||
&State{Time: 1000, State: 2, PreviousState: 99}, // -10%
|
||||
&State{Time: 1100, State: 0, PreviousState: 2},
|
||||
&State{Time: 1300, State: 2, PreviousState: 0}, // -10%
|
||||
&State{Time: 1400, State: 0, PreviousState: 2},
|
||||
&State{Time: 1600, State: 2, PreviousState: 0}, // -10%
|
||||
&State{Time: 1700, State: 0, PreviousState: 2},
|
||||
&State{Time: 1900, State: 2, PreviousState: 0}, // -10%
|
||||
},
|
||||
Start: 1000,
|
||||
End: 2000,
|
||||
Expected: 60.0,
|
||||
}, {
|
||||
Name: "OverlappingDowntimesAndProblems",
|
||||
// SLA should be 90%:
|
||||
// 1000..1100: OK, no downtime
|
||||
// 1100..1200: OK, in downtime
|
||||
// 1200..1300: CRITICAL, in downtime
|
||||
// 1300..1400: CRITICAL, no downtime (only period counting for SLA, -10%)
|
||||
// 1400..1500: CRITICAL, in downtime
|
||||
// 1500..1600: OK, in downtime
|
||||
// 1600..2000: OK, no downtime
|
||||
Events: []SlaHistoryEvent{
|
||||
&Downtime{Start: 1100, End: 1300},
|
||||
&Downtime{Start: 1400, End: 1600},
|
||||
&State{Time: 1200, State: 2, PreviousState: 0},
|
||||
&State{Time: 1500, State: 0, PreviousState: 2},
|
||||
},
|
||||
Start: 1000,
|
||||
End: 2000,
|
||||
Expected: 90.0,
|
||||
}, {
|
||||
Name: "CriticalBeforeInterval",
|
||||
// If there is no event within the SLA interval, the last state from before the interval should be used.
|
||||
Events: []SlaHistoryEvent{
|
||||
&State{Time: 0, State: 2, PreviousState: 99},
|
||||
},
|
||||
Start: 1000,
|
||||
End: 2000,
|
||||
Expected: 0.0,
|
||||
}, {
|
||||
Name: "CriticalBeforeIntervalWithDowntime",
|
||||
// State change and downtime start from before the SLA interval should be considered if still relevant.
|
||||
Events: []SlaHistoryEvent{
|
||||
&State{Time: 800, State: 2, PreviousState: 99},
|
||||
&Downtime{Start: 600, End: 1800},
|
||||
},
|
||||
Start: 1000,
|
||||
End: 2000,
|
||||
Expected: 80.0,
|
||||
}, {
|
||||
Name: "CriticalBeforeIntervalWithOverlappingDowntimes",
|
||||
// Test that overlapping downtimes are properly accounted for.
|
||||
Events: []SlaHistoryEvent{
|
||||
&State{Time: 800, State: 2, PreviousState: 99},
|
||||
&Downtime{Start: 600, End: 1000},
|
||||
&Downtime{Start: 800, End: 1200},
|
||||
&Downtime{Start: 1000, End: 1400},
|
||||
// Everything except 1400-1600 is covered by downtimes, -20%
|
||||
&Downtime{Start: 1600, End: 2000},
|
||||
&Downtime{Start: 1800, End: 2200},
|
||||
},
|
||||
Start: 1000,
|
||||
End: 2000,
|
||||
Expected: 80.0,
|
||||
}, {
|
||||
Name: "FallbackToPreviousState",
|
||||
// If there is no state event from before the SLA interval, the previous hard state from the first event
|
||||
// after the beginning of the SLA interval should be used as the initial state.
|
||||
Events: []SlaHistoryEvent{
|
||||
&State{Time: 1200, State: 0, PreviousState: 2},
|
||||
},
|
||||
Start: 1000,
|
||||
End: 2000,
|
||||
Expected: 80.0,
|
||||
}, {
|
||||
Name: "FallbackToCurrentState",
|
||||
// If there are no state history events, the current state of the checkable should be used.
|
||||
Events: []SlaHistoryEvent{
|
||||
&CurrentState{State: 2},
|
||||
},
|
||||
Start: 1000,
|
||||
End: 2000,
|
||||
Expected: 0.0,
|
||||
}, {
|
||||
Name: "PreferInitialStateFromBeforeOverLaterState",
|
||||
// The previous_hard_state should only be used as a fallback when there is no event from before the
|
||||
// SLA interval. Therefore, the latter should be preferred if there is conflicting information.
|
||||
Events: []SlaHistoryEvent{
|
||||
&State{Time: 800, State: 2, PreviousState: 99},
|
||||
&State{Time: 1200, State: 0, PreviousState: 0},
|
||||
},
|
||||
Start: 1000,
|
||||
End: 2000,
|
||||
Expected: 80.0,
|
||||
}, {
|
||||
Name: "PreferInitialStateFromBeforeOverCurrentState",
|
||||
// The current state should only be used as a fallback when there is no state history event.
|
||||
// Therefore, the latter should be preferred if there is conflicting information.
|
||||
Events: []SlaHistoryEvent{
|
||||
&State{Time: 800, State: 2, PreviousState: 99},
|
||||
&CurrentState{State: 0},
|
||||
},
|
||||
Start: 1000,
|
||||
End: 2000,
|
||||
Expected: 0.0,
|
||||
}, {
|
||||
Name: "PreferLaterStateOverCurrentState",
|
||||
// The current state should only be used as a fallback when there is no state history event.
|
||||
// Therefore, the latter should be preferred if there is conflicting information.
|
||||
Events: []SlaHistoryEvent{
|
||||
&State{Time: 1200, State: 0, PreviousState: 2},
|
||||
&CurrentState{State: 2},
|
||||
},
|
||||
Start: 1000,
|
||||
End: 2000,
|
||||
Expected: 80.0,
|
||||
}, {
|
||||
Name: "InitialUnknownReducesTotalTime",
|
||||
Events: []SlaHistoryEvent{
|
||||
&State{Time: 1500, State: 2, PreviousState: 99},
|
||||
&State{Time: 1700, State: 0, PreviousState: 2},
|
||||
&CurrentState{State: 0},
|
||||
},
|
||||
Start: 1000,
|
||||
End: 2000,
|
||||
Expected: 60,
|
||||
}, {
|
||||
Name: "IntermediateUnknownReducesTotalTime",
|
||||
Events: []SlaHistoryEvent{
|
||||
&State{Time: 1000, State: 0, PreviousState: 2},
|
||||
&State{Time: 1100, State: 2, PreviousState: 0},
|
||||
&State{Time: 1600, State: 0, PreviousState: 99},
|
||||
&State{Time: 1800, State: 2, PreviousState: 0},
|
||||
&CurrentState{State: 0},
|
||||
},
|
||||
Start: 1000,
|
||||
End: 2000,
|
||||
Expected: 60,
|
||||
}}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(test.Name, func(t *testing.T) {
|
||||
testSla(t, db, test.Events, test.Start, test.End, test.Expected, "unexpected SLA value")
|
||||
})
|
||||
}
|
||||
|
||||
t.Run("Invalid", func(t *testing.T) {
|
||||
m := SlaHistoryMeta{
|
||||
EnvironmentId: make([]byte, 20),
|
||||
EndpointId: make([]byte, 20),
|
||||
ObjectType: "host",
|
||||
HostId: make([]byte, 20),
|
||||
}
|
||||
|
||||
checkErr := func(t *testing.T, err error) {
|
||||
require.Error(t, err, "SLA function should return an error")
|
||||
|
||||
switch d := db.DriverName(); d {
|
||||
case "mysql":
|
||||
var mysqlErr *mysql.MySQLError
|
||||
require.ErrorAs(t, err, &mysqlErr, "SLA function should return a MySQL error")
|
||||
// https://dev.mysql.com/doc/mysql-errors/8.0/en/server-error-reference.html#error_er_signal_exception
|
||||
assert.Equal(t, uint16(1644), mysqlErr.Number, "MySQL error should be ER_SIGNAL_EXCEPTION")
|
||||
assert.Equal(t, "end time must be greater than start time", mysqlErr.Message,
|
||||
"MySQL error should contain custom message")
|
||||
|
||||
case "postgres":
|
||||
var pqErr *pq.Error
|
||||
require.ErrorAs(t, err, &pqErr, "SLA function should return a PostgreSQL error")
|
||||
assert.Equal(t, pq.ErrorCode("P0001"), pqErr.Code, "MySQL error should be ER_SIGNAL_EXCEPTION")
|
||||
assert.Equal(t, "end time must be greater than start time", pqErr.Message,
|
||||
"PostgreSQL error should contain custom message")
|
||||
|
||||
default:
|
||||
panic(fmt.Sprintf("unknown database driver %q", d))
|
||||
}
|
||||
}
|
||||
|
||||
t.Run("ZeroDuration", func(t *testing.T) {
|
||||
_, err := execSqlSlaFunc(db, &m, 1000, 1000)
|
||||
checkErr(t, err)
|
||||
})
|
||||
|
||||
t.Run("NegativeDuration", func(t *testing.T) {
|
||||
_, err := execSqlSlaFunc(db, &m, 2000, 1000)
|
||||
checkErr(t, err)
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
func execSqlSlaFunc(db *sqlx.DB, m *SlaHistoryMeta, start uint64, end uint64) (float64, error) {
|
||||
var result float64
|
||||
err := db.Get(&result, db.Rebind("SELECT get_sla_ok_percent(?, ?, ?, ?)"),
|
||||
m.HostId, m.ServiceId, start, end)
|
||||
return result, err
|
||||
}
|
||||
|
||||
func testSla(t *testing.T, db *sqlx.DB, events []SlaHistoryEvent, start uint64, end uint64, expected float64, msg string) {
|
||||
t.Run("Host", func(t *testing.T) {
|
||||
testSlaWithObjectType(t, db, events, false, start, end, expected, msg)
|
||||
})
|
||||
t.Run("Service", func(t *testing.T) {
|
||||
testSlaWithObjectType(t, db, events, true, start, end, expected, msg)
|
||||
})
|
||||
}
|
||||
|
||||
func testSlaWithObjectType(t *testing.T, db *sqlx.DB,
|
||||
events []SlaHistoryEvent, service bool, start uint64, end uint64, expected float64, msg string,
|
||||
) {
|
||||
makeId := func() []byte {
|
||||
id := make([]byte, 20)
|
||||
_, err := rand.Read(id)
|
||||
require.NoError(t, err, "generating random id failed")
|
||||
return id
|
||||
}
|
||||
|
||||
meta := SlaHistoryMeta{
|
||||
EnvironmentId: makeId(),
|
||||
EndpointId: makeId(),
|
||||
HostId: makeId(),
|
||||
}
|
||||
if service {
|
||||
meta.ObjectType = "service"
|
||||
meta.ServiceId = makeId()
|
||||
} else {
|
||||
meta.ObjectType = "host"
|
||||
}
|
||||
|
||||
for _, event := range events {
|
||||
err := event.WriteSlaEventToDatabase(db, &meta)
|
||||
require.NoErrorf(t, err, "Inserting SLA history for %#v failed", event)
|
||||
}
|
||||
|
||||
r, err := execSqlSlaFunc(db, &meta, start, end)
|
||||
require.NoError(t, err, "SLA query should not fail")
|
||||
assert.Equal(t, expected, r, msg)
|
||||
}
|
||||
|
||||
type SlaHistoryMeta struct {
|
||||
EnvironmentId NullableBytes `db:"environment_id"`
|
||||
EndpointId NullableBytes `db:"endpoint_id"`
|
||||
ObjectType string `db:"object_type"`
|
||||
HostId NullableBytes `db:"host_id"`
|
||||
ServiceId NullableBytes `db:"service_id"`
|
||||
}
|
||||
|
||||
type SlaHistoryEvent interface {
|
||||
WriteSlaEventToDatabase(db *sqlx.DB, m *SlaHistoryMeta) error
|
||||
}
|
||||
|
||||
type State struct {
|
||||
Time uint64
|
||||
State uint8
|
||||
PreviousState uint8
|
||||
}
|
||||
|
||||
var _ SlaHistoryEvent = (*State)(nil)
|
||||
|
||||
func (s *State) WriteSlaEventToDatabase(db *sqlx.DB, m *SlaHistoryMeta) error {
|
||||
type values struct {
|
||||
*SlaHistoryMeta
|
||||
Id []byte `db:"id"`
|
||||
EventTime uint64 `db:"event_time"`
|
||||
HardState uint8 `db:"hard_state"`
|
||||
PreviousHardState uint8 `db:"previous_hard_state"`
|
||||
}
|
||||
|
||||
id := make([]byte, 20)
|
||||
_, err := rand.Read(id)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = db.NamedExec("INSERT INTO sla_history_state"+
|
||||
" (id, environment_id, endpoint_id, object_type, host_id, service_id, event_time, hard_state, previous_hard_state)"+
|
||||
" VALUES (:id, :environment_id, :endpoint_id, :object_type, :host_id, :service_id, :event_time, :hard_state, :previous_hard_state)",
|
||||
&values{
|
||||
SlaHistoryMeta: m,
|
||||
Id: id[:],
|
||||
EventTime: s.Time,
|
||||
HardState: s.State,
|
||||
PreviousHardState: s.PreviousState,
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
type CurrentState struct {
|
||||
State uint8
|
||||
}
|
||||
|
||||
func (c *CurrentState) WriteSlaEventToDatabase(db *sqlx.DB, m *SlaHistoryMeta) error {
|
||||
type values struct {
|
||||
*SlaHistoryMeta
|
||||
State uint8 `db:"state"`
|
||||
PropertiesChecksum NullableBytes `db:"properties_checksum"`
|
||||
}
|
||||
|
||||
v := values{
|
||||
SlaHistoryMeta: m,
|
||||
State: c.State,
|
||||
PropertiesChecksum: make([]byte, 20),
|
||||
}
|
||||
|
||||
if len(m.ServiceId) == 0 {
|
||||
_, err := db.NamedExec("INSERT INTO host_state"+
|
||||
" (id, host_id, environment_id, properties_checksum, soft_state, previous_soft_state,"+
|
||||
" hard_state, previous_hard_state, attempt, severity, last_state_change, next_check, next_update)"+
|
||||
" VALUES (:host_id, :host_id, :environment_id, :properties_checksum, :state, :state, :state, :state, 0, 0, 0, 0, 0)",
|
||||
&v)
|
||||
return err
|
||||
} else {
|
||||
_, err := db.NamedExec("INSERT INTO service_state"+
|
||||
" (id, host_id, service_id, environment_id, properties_checksum, soft_state, previous_soft_state,"+
|
||||
" hard_state, previous_hard_state, attempt, severity, last_state_change, next_check, next_update)"+
|
||||
" VALUES (:service_id, :host_id, :service_id, :environment_id, :properties_checksum, :state, :state, :state, :state, 0, 0, 0, 0, 0)",
|
||||
&v)
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
var _ SlaHistoryEvent = (*CurrentState)(nil)
|
||||
|
||||
type Downtime struct {
|
||||
Start uint64
|
||||
End uint64
|
||||
}
|
||||
|
||||
var _ SlaHistoryEvent = (*Downtime)(nil)
|
||||
|
||||
type slaHistoryDowntime struct {
|
||||
*SlaHistoryMeta
|
||||
DowntimeId []byte `db:"downtime_id"`
|
||||
DowntimeStart uint64 `db:"downtime_start"`
|
||||
DowntimeEnd uint64 `db:"downtime_end"`
|
||||
}
|
||||
|
||||
func (d *Downtime) WriteSlaEventToDatabase(db *sqlx.DB, m *SlaHistoryMeta) error {
|
||||
downtimeId := make([]byte, 20)
|
||||
_, err := rand.Read(downtimeId)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = db.NamedExec("INSERT INTO sla_history_downtime"+
|
||||
" (environment_id, endpoint_id, object_type, host_id, service_id, downtime_id, downtime_start, downtime_end)"+
|
||||
" VALUES (:environment_id, :endpoint_id, :object_type, :host_id,"+
|
||||
" :service_id, :downtime_id, :downtime_start, :downtime_end)",
|
||||
&slaHistoryDowntime{
|
||||
SlaHistoryMeta: m,
|
||||
DowntimeId: downtimeId[:],
|
||||
DowntimeStart: d.Start,
|
||||
DowntimeEnd: d.End,
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
// NullableBytes is a byte slice for binary database columns that may be NULL:
// a nil slice is written as NULL, every other slice as its bytes.
type NullableBytes []byte

// Value implements the database/sql/driver.Valuer interface.
func (b NullableBytes) Value() (driver.Value, error) {
	if b == nil {
		// An untyped nil is treated as NULL, in contrast to []byte(nil) which
		// would be a non-NULL byte sequence of length 0.
		return nil, nil
	}

	return []byte(b), nil
}
|
||||
Loading…
Reference in a new issue