From 8855c2e6262f3e2800fc92bcd8b413cb4b39c3aa Mon Sep 17 00:00:00 2001
From: Julien Duchesne <julienduchesne@live.com>
Date: Wed, 16 Jun 2021 05:33:02 -0400
Subject: [PATCH] Add `prometheus_tsdb_clean_start` metric (#8824)

Add cleanup of the lockfile when the db is cleanly closed

The metric describes the status of the lockfile on startup
0: Already existed
1: Did not exist
-1: Disabled

Therefore, if the min value over time of this metric is 0, that means that executions have exited uncleanly
We can then use that metric to have a much lower threshold on the crashlooping alert:

If the metric exists and it has been zero, two restarts is enough to trigger the alarm
If it does not exist (old prom version for example), the current five restarts threshold remains

Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>

* Change metric name + set unset value to -1

Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>

* Only check the last value of the clean start alert

Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>

* Fix test + nit

Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>
---
 .../prometheus-mixin/alerts.libsonnet         | 17 ++++-
 tsdb/db.go                                    | 55 +++++++++++-----
 tsdb/db_test.go                               | 64 +++++++++++++++++++
 3 files changed, 119 insertions(+), 17 deletions(-)

diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet
index 898a39c60d..baf2f2f0fd 100644
--- a/documentation/prometheus-mixin/alerts.libsonnet
+++ b/documentation/prometheus-mixin/alerts.libsonnet
@@ -386,6 +386,21 @@
           {
             alert: 'PrometheusHAGroupCrashlooping',
             expr: |||
+              (
+                  prometheus_tsdb_clean_start{%(prometheusSelector)s} == 0
+                and
+                  ( 
+                    count by (%(prometheusHAGroupLabels)s) (
+                      changes(process_start_time_seconds{%(prometheusSelector)s}[30m]) > 1
+                    ) 
+                    / 
+                    count by (%(prometheusHAGroupLabels)s) (
+                      up{%(prometheusSelector)s}
+                    )
+                  )
+                  > 0.5
+              )
+              or
               (
                 count by (%(prometheusHAGroupLabels)s) (
                   changes(process_start_time_seconds{%(prometheusSelector)s}[30m]) > 4
@@ -403,7 +418,7 @@
             },
             annotations: {
               summary: 'More than half of the Prometheus instances within the same HA group are crashlooping.',
-              description: '{{ $value | humanizePercentage }} of Prometheus instances within the %(prometheusHAGroupName)s HA group have restarted at least 5 times in the last 30m.' % $._config,
+              description: '{{ $value | humanizePercentage }} of Prometheus instances within the %(prometheusHAGroupName)s HA group have had at least 5 total restarts or 2 unclean restarts in the last 30m.' % $._config,
             },
           },
         ],
diff --git a/tsdb/db.go b/tsdb/db.go
index 1114be8243..6507c25f8e 100644
--- a/tsdb/db.go
+++ b/tsdb/db.go
@@ -58,6 +58,10 @@ const (
 	tmpForCreationBlockDirSuffix = ".tmp-for-creation"
 	// Pre-2.21 tmp dir suffix, used in clean-up functions.
 	tmpLegacy = ".tmp"
+
+	lockfileDisabled       = -1
+	lockfileReplaced       = 0
+	lockfileCreatedCleanly = 1
 )
 
 var (
@@ -153,8 +157,9 @@ type BlocksToDeleteFunc func(blocks []*Block) map[ulid.ULID]struct{}
 // DB handles reads and writes of time series falling into
 // a hashed partition of a seriedb.
 type DB struct {
-	dir   string
-	lockf fileutil.Releaser
+	dir       string
+	lockf     fileutil.Releaser
+	lockfPath string
 
 	logger         log.Logger
 	metrics        *dbMetrics
@@ -186,19 +191,20 @@ type DB struct {
 }
 
 type dbMetrics struct {
-	loadedBlocks         prometheus.GaugeFunc
-	symbolTableSize      prometheus.GaugeFunc
-	reloads              prometheus.Counter
-	reloadsFailed        prometheus.Counter
-	compactionsFailed    prometheus.Counter
-	compactionsTriggered prometheus.Counter
-	compactionsSkipped   prometheus.Counter
-	sizeRetentionCount   prometheus.Counter
-	timeRetentionCount   prometheus.Counter
-	startTime            prometheus.GaugeFunc
-	tombCleanTimer       prometheus.Histogram
-	blocksBytes          prometheus.Gauge
-	maxBytes             prometheus.Gauge
+	loadedBlocks           prometheus.GaugeFunc
+	symbolTableSize        prometheus.GaugeFunc
+	reloads                prometheus.Counter
+	reloadsFailed          prometheus.Counter
+	compactionsFailed      prometheus.Counter
+	compactionsTriggered   prometheus.Counter
+	compactionsSkipped     prometheus.Counter
+	sizeRetentionCount     prometheus.Counter
+	timeRetentionCount     prometheus.Counter
+	startTime              prometheus.GaugeFunc
+	tombCleanTimer         prometheus.Histogram
+	blocksBytes            prometheus.Gauge
+	maxBytes               prometheus.Gauge
+	lockfileCreatedCleanly prometheus.Gauge
 }
 
 func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
@@ -276,6 +282,10 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
 		Name: "prometheus_tsdb_size_retentions_total",
 		Help: "The number of times that blocks were deleted because the maximum number of bytes was exceeded.",
 	})
+	m.lockfileCreatedCleanly = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "prometheus_tsdb_clean_start",
+		Help: "-1: lockfile is disabled. 0: a lockfile from a previous execution was replaced. 1: lockfile creation was clean",
+	})
 
 	if r != nil {
 		r.MustRegister(
@@ -292,6 +302,7 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
 			m.tombCleanTimer,
 			m.blocksBytes,
 			m.maxBytes,
+			m.lockfileCreatedCleanly,
 		)
 	}
 	return m
@@ -653,12 +664,22 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 		db.blocksToDelete = DefaultBlocksToDelete(db)
 	}
 
+	lockfileCreationStatus := lockfileDisabled
 	if !opts.NoLockfile {
 		absdir, err := filepath.Abs(dir)
 		if err != nil {
 			return nil, err
 		}
-		lockf, _, err := fileutil.Flock(filepath.Join(absdir, "lock"))
+		db.lockfPath = filepath.Join(absdir, "lock")
+
+		if _, err := os.Stat(db.lockfPath); err == nil {
+			level.Warn(db.logger).Log("msg", "A TSDB lockfile from a previous execution already existed. It was replaced", "file", db.lockfPath)
+			lockfileCreationStatus = lockfileReplaced
+		} else {
+			lockfileCreationStatus = lockfileCreatedCleanly
+		}
+
+		lockf, _, err := fileutil.Flock(db.lockfPath)
 		if err != nil {
 			return nil, errors.Wrap(err, "lock DB directory")
 		}
@@ -707,6 +728,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 	if maxBytes < 0 {
 		maxBytes = 0
 	}
+	db.metrics.lockfileCreatedCleanly.Set(float64(lockfileCreationStatus))
 	db.metrics.maxBytes.Set(float64(maxBytes))
 
 	if err := db.reload(); err != nil {
@@ -1418,6 +1440,7 @@ func (db *DB) Close() error {
 	errs := tsdb_errors.NewMulti(g.Wait())
 	if db.lockf != nil {
 		errs.Add(db.lockf.Release())
+		errs.Add(os.Remove(db.lockfPath))
 	}
 	if db.head != nil {
 		errs.Add(db.head.Close())
diff --git a/tsdb/db_test.go b/tsdb/db_test.go
index 8548f68482..6d977d77dc 100644
--- a/tsdb/db_test.go
+++ b/tsdb/db_test.go
@@ -3125,6 +3125,70 @@ func TestNoPanicOnTSDBOpenError(t *testing.T) {
 	require.NoError(t, lockf.Release())
 }
 
+func TestLockfileMetric(t *testing.T) {
+	cases := []struct {
+		fileAlreadyExists bool
+		lockFileDisabled  bool
+		expectedValue     int
+	}{
+		{
+			fileAlreadyExists: false,
+			lockFileDisabled:  false,
+			expectedValue:     lockfileCreatedCleanly,
+		},
+		{
+			fileAlreadyExists: true,
+			lockFileDisabled:  false,
+			expectedValue:     lockfileReplaced,
+		},
+		{
+			fileAlreadyExists: true,
+			lockFileDisabled:  true,
+			expectedValue:     lockfileDisabled,
+		},
+		{
+			fileAlreadyExists: false,
+			lockFileDisabled:  true,
+			expectedValue:     lockfileDisabled,
+		},
+	}
+
+	for _, c := range cases {
+		t.Run(fmt.Sprintf("%+v", c), func(t *testing.T) {
+			tmpdir, err := ioutil.TempDir("", "test")
+			require.NoError(t, err)
+			t.Cleanup(func() {
+				require.NoError(t, os.RemoveAll(tmpdir))
+			})
+			absdir, err := filepath.Abs(tmpdir)
+			require.NoError(t, err)
+
+			// Test preconditions (file already exists + lockfile option)
+			lockfilePath := filepath.Join(absdir, "lock")
+			if c.fileAlreadyExists {
+				err = ioutil.WriteFile(lockfilePath, []byte{}, 0644)
+				require.NoError(t, err)
+			}
+			opts := DefaultOptions()
+			opts.NoLockfile = c.lockFileDisabled
+
+			// Create the DB, this should create a lockfile and the metrics
+			db, err := Open(tmpdir, nil, nil, opts)
+			require.NoError(t, err)
+			require.Equal(t, float64(c.expectedValue), prom_testutil.ToFloat64(db.metrics.lockfileCreatedCleanly))
+
+			// Close the DB, this should delete the lockfile
+			require.NoError(t, db.Close())
+
+			// Check that the lockfile is always deleted
+			if !c.lockFileDisabled {
+				_, err = os.Stat(lockfilePath)
+				require.Error(t, err, "lockfile was not deleted")
+			}
+		})
+	}
+}
+
 func TestQuerier_ShouldNotPanicIfHeadChunkIsTruncatedWhileReadingQueriedChunks(t *testing.T) {
 	t.Skip("TODO: investigate why process crash in CI")