mirror of
https://github.com/prometheus/prometheus.git
synced 2026-05-28 04:02:21 -04:00
fix(test): make TestRemoteWrite_ReshardingWithoutDeadlock more reliable and re-enable it (#17490)
Some checks are pending
buf.build / lint and publish (push) Waiting to run
CI / Go tests (push) Waiting to run
CI / More Go tests (push) Waiting to run
CI / Go tests with previous Go version (push) Waiting to run
CI / UI tests (push) Waiting to run
CI / Go tests on Windows (push) Waiting to run
CI / Mixins tests (push) Waiting to run
CI / Build Prometheus for common architectures (push) Waiting to run
CI / Build Prometheus for all architectures (push) Waiting to run
CI / Report status of build Prometheus for all architectures (push) Blocked by required conditions
CI / Check generated parser (push) Waiting to run
CI / golangci-lint (push) Waiting to run
CI / fuzzing (push) Waiting to run
CI / codeql (push) Waiting to run
CI / Publish main branch artifacts (push) Blocked by required conditions
CI / Publish release artefacts (push) Blocked by required conditions
CI / Publish UI on npm Registry (push) Blocked by required conditions
Scorecards supply-chain security / Scorecards analysis (push) Waiting to run
Some checks are pending
buf.build / lint and publish (push) Waiting to run
CI / Go tests (push) Waiting to run
CI / More Go tests (push) Waiting to run
CI / Go tests with previous Go version (push) Waiting to run
CI / UI tests (push) Waiting to run
CI / Go tests on Windows (push) Waiting to run
CI / Mixins tests (push) Waiting to run
CI / Build Prometheus for common architectures (push) Waiting to run
CI / Build Prometheus for all architectures (push) Waiting to run
CI / Report status of build Prometheus for all architectures (push) Blocked by required conditions
CI / Check generated parser (push) Waiting to run
CI / golangci-lint (push) Waiting to run
CI / fuzzing (push) Waiting to run
CI / codeql (push) Waiting to run
CI / Publish main branch artifacts (push) Blocked by required conditions
CI / Publish release artefacts (push) Blocked by required conditions
CI / Publish UI on npm Registry (push) Blocked by required conditions
Scorecards supply-chain security / Scorecards analysis (push) Waiting to run
Improve test stability by waiting for the relevant metrics to appear on /metrics before the first check on the desired shard count. Increase the scrape interval to avoid timeouts, as 100 ms may be insufficient for Prometheus to scrape itself in some environments (e.g., CI). Have Prometheus scrape itself multiple times to increase the volume of data sent and help fill the queue more quickly. Signed-off-by: machine424 <ayoubmrini424@gmail.com>
This commit is contained in:
parent
a66c696530
commit
e7999528fa
1 changed files with 43 additions and 15 deletions
|
|
@ -968,8 +968,17 @@ remote_write:
|
|||
|
||||
// TestRemoteWrite_ReshardingWithoutDeadlock ensures that resharding (scaling up) doesn't block when the shards are full.
|
||||
// See: https://github.com/prometheus/prometheus/issues/17384.
|
||||
//
|
||||
// The following shows key resharding metrics before and after the fix.
|
||||
// In v3.7.0, the deadlock prevented the resharding logic from observing the true incoming data rate.
|
||||
//
|
||||
// | Metric | v3.7.0 | after the fix |
|
||||
// |---------------------|---------------|---------------------|
|
||||
// | dataInRate | 0.6 | 307.2 |
|
||||
// | dataPendingRate | 0.2 | 306.8 |
|
||||
// | dataPending | 0 | 1228.8 |
|
||||
// | desiredShards | 0.6 | 369.2 |.
|
||||
func TestRemoteWrite_ReshardingWithoutDeadlock(t *testing.T) {
|
||||
t.Skip("flaky test, see https://github.com/prometheus/prometheus/issues/17489")
|
||||
t.Parallel()
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
|
|
@ -984,7 +993,8 @@ func TestRemoteWrite_ReshardingWithoutDeadlock(t *testing.T) {
|
|||
|
||||
config := fmt.Sprintf(`
|
||||
global:
|
||||
scrape_interval: 100ms
|
||||
# Using a smaller interval may cause the scrape to time out.
|
||||
scrape_interval: 1s
|
||||
scrape_configs:
|
||||
- job_name: 'self'
|
||||
static_configs:
|
||||
|
|
@ -995,6 +1005,8 @@ remote_write:
|
|||
queue_config:
|
||||
# Speed up the queue being full.
|
||||
capacity: 1
|
||||
# Helps keep the “time to send one sample” low so it doesn’t influence the resharding logic.
|
||||
max_samples_per_send: 1
|
||||
`, port, server.URL)
|
||||
require.NoError(t, os.WriteFile(configFile, []byte(config), 0o777))
|
||||
|
||||
|
|
@ -1003,36 +1015,52 @@ remote_write:
|
|||
configFile,
|
||||
port,
|
||||
fmt.Sprintf("--storage.tsdb.path=%s", tmpDir),
|
||||
"--log.level=debug",
|
||||
)
|
||||
require.NoError(t, prom.Start())
|
||||
|
||||
var checkInitialDesiredShardsOnce sync.Once
|
||||
require.Eventually(t, func() bool {
|
||||
const desiredShardsMetric = "prometheus_remote_storage_shards_desired"
|
||||
getMetrics := func() ([]byte, error) {
|
||||
r, err := http.Get(fmt.Sprintf("http://127.0.0.1:%d/metrics", port))
|
||||
if err != nil {
|
||||
return false
|
||||
return nil, err
|
||||
}
|
||||
defer r.Body.Close()
|
||||
if r.StatusCode != http.StatusOK {
|
||||
return false
|
||||
return nil, fmt.Errorf("unexpected status code: %d", r.StatusCode)
|
||||
}
|
||||
|
||||
metrics, err := io.ReadAll(r.Body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
// Ensure the initial desired shards is 1.
|
||||
require.Eventually(t, func() bool {
|
||||
metrics, err := getMetrics()
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
initialDesiredShards, err := getMetricValue(t, bytes.NewReader(metrics), model.MetricTypeGauge, desiredShardsMetric)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return initialDesiredShards == 1.0
|
||||
}, 10*time.Second, 100*time.Millisecond)
|
||||
|
||||
checkInitialDesiredShardsOnce.Do(func() {
|
||||
s, err := getMetricValue(t, bytes.NewReader(metrics), model.MetricTypeGauge, "prometheus_remote_storage_shards_desired")
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1.0, s)
|
||||
})
|
||||
|
||||
desiredShards, err := getMetricValue(t, bytes.NewReader(metrics), model.MetricTypeGauge, "prometheus_remote_storage_shards_desired")
|
||||
if err != nil || desiredShards <= 1 {
|
||||
// Ensure scaling up is triggered after some time.
|
||||
require.Eventually(t, func() bool {
|
||||
metrics, err := getMetrics()
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
desiredShards, err := getMetricValue(t, bytes.NewReader(metrics), model.MetricTypeGauge, desiredShardsMetric)
|
||||
if err != nil || desiredShards <= 1.0 {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
// 3*shardUpdateDuration to allow for the resharding logic to run.
|
||||
}, 30*time.Second, 1*time.Second)
|
||||
}, 30*time.Second, time.Second)
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue