fix(test): make TestRemoteWrite_ReshardingWithoutDeadlock more reliable and re-enable it (#17490)
Some checks are pending
buf.build / lint and publish (push) Waiting to run
CI / Go tests (push) Waiting to run
CI / More Go tests (push) Waiting to run
CI / Go tests with previous Go version (push) Waiting to run
CI / UI tests (push) Waiting to run
CI / Go tests on Windows (push) Waiting to run
CI / Mixins tests (push) Waiting to run
CI / Build Prometheus for common architectures (push) Waiting to run
CI / Build Prometheus for all architectures (push) Waiting to run
CI / Report status of build Prometheus for all architectures (push) Blocked by required conditions
CI / Check generated parser (push) Waiting to run
CI / golangci-lint (push) Waiting to run
CI / fuzzing (push) Waiting to run
CI / codeql (push) Waiting to run
CI / Publish main branch artifacts (push) Blocked by required conditions
CI / Publish release artefacts (push) Blocked by required conditions
CI / Publish UI on npm Registry (push) Blocked by required conditions
Scorecards supply-chain security / Scorecards analysis (push) Waiting to run

Improve test stability by waiting for the relevant metrics to appear on /metrics before the
first check on the desired shard count.

Increase the scrape interval to avoid timeouts, as 100 ms may be insufficient for Prometheus
to scrape itself in some environments (e.g., CI).

Have Prometheus scrape itself multiple times to increase the volume of data sent and help
fill the queue more quickly.

Signed-off-by: machine424 <ayoubmrini424@gmail.com>
This commit is contained in:
Ayoub Mrini 2025-11-25 22:30:42 +01:00 committed by GitHub
parent a66c696530
commit e7999528fa
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -968,8 +968,17 @@ remote_write:
// TestRemoteWrite_ReshardingWithoutDeadlock ensures that resharding (scaling up) doesn't block when the shards are full.
// See: https://github.com/prometheus/prometheus/issues/17384.
//
// The following shows key resharding metrics before and after the fix.
// In v3.7.0, the deadlock prevented the resharding logic from observing the true incoming data rate.
//
// | Metric | v3.7.0 | after the fix |
// |---------------------|---------------|---------------------|
// | dataInRate | 0.6 | 307.2 |
// | dataPendingRate | 0.2 | 306.8 |
// | dataPending | 0 | 1228.8 |
// | desiredShards | 0.6 | 369.2 |.
func TestRemoteWrite_ReshardingWithoutDeadlock(t *testing.T) {
t.Skip("flaky test, see https://github.com/prometheus/prometheus/issues/17489")
t.Parallel()
tmpDir := t.TempDir()
@ -984,7 +993,8 @@ func TestRemoteWrite_ReshardingWithoutDeadlock(t *testing.T) {
config := fmt.Sprintf(`
global:
scrape_interval: 100ms
# Using a smaller interval may cause the scrape to time out.
scrape_interval: 1s
scrape_configs:
- job_name: 'self'
static_configs:
@ -995,6 +1005,8 @@ remote_write:
queue_config:
# Speed up the queue being full.
capacity: 1
# Helps keep the time to send one sample low so it doesnt influence the resharding logic.
max_samples_per_send: 1
`, port, server.URL)
require.NoError(t, os.WriteFile(configFile, []byte(config), 0o777))
@ -1003,36 +1015,52 @@ remote_write:
configFile,
port,
fmt.Sprintf("--storage.tsdb.path=%s", tmpDir),
"--log.level=debug",
)
require.NoError(t, prom.Start())
var checkInitialDesiredShardsOnce sync.Once
require.Eventually(t, func() bool {
const desiredShardsMetric = "prometheus_remote_storage_shards_desired"
getMetrics := func() ([]byte, error) {
r, err := http.Get(fmt.Sprintf("http://127.0.0.1:%d/metrics", port))
if err != nil {
return false
return nil, err
}
defer r.Body.Close()
if r.StatusCode != http.StatusOK {
return false
return nil, fmt.Errorf("unexpected status code: %d", r.StatusCode)
}
metrics, err := io.ReadAll(r.Body)
if err != nil {
return nil, err
}
return metrics, nil
}
// Ensure the initial desired shards is 1.
require.Eventually(t, func() bool {
metrics, err := getMetrics()
if err != nil {
return false
}
initialDesiredShards, err := getMetricValue(t, bytes.NewReader(metrics), model.MetricTypeGauge, desiredShardsMetric)
if err != nil {
return false
}
return initialDesiredShards == 1.0
}, 10*time.Second, 100*time.Millisecond)
checkInitialDesiredShardsOnce.Do(func() {
s, err := getMetricValue(t, bytes.NewReader(metrics), model.MetricTypeGauge, "prometheus_remote_storage_shards_desired")
require.NoError(t, err)
require.Equal(t, 1.0, s)
})
desiredShards, err := getMetricValue(t, bytes.NewReader(metrics), model.MetricTypeGauge, "prometheus_remote_storage_shards_desired")
if err != nil || desiredShards <= 1 {
// Ensure scaling up is triggered after some time.
require.Eventually(t, func() bool {
metrics, err := getMetrics()
if err != nil {
return false
}
desiredShards, err := getMetricValue(t, bytes.NewReader(metrics), model.MetricTypeGauge, desiredShardsMetric)
if err != nil || desiredShards <= 1.0 {
return false
}
return true
// 3*shardUpdateDuration to allow for the resharding logic to run.
}, 30*time.Second, 1*time.Second)
}, 30*time.Second, time.Second)
}