Hopefully fix flaky Consul fencing test (#23280)

* Hopefully fix flaky fencing test when run in Enterprise

* Fix typo
This commit is contained in:
Paul Banks 2023-09-28 13:25:52 +01:00 committed by GitHub
parent 98e9d0c95e
commit 9fc67b6656
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -68,6 +68,18 @@ func TestConsulFencing_PartitionedLeaderCantWrite(t *testing.T) {
})
require.NoError(t, err)
// We need to wait for the KV mount to be ready on all servers - it runs an
// "Upgrade" process on mount and will error until that's done. In practice
// that's so fast that in CE I've never seen it fail yet, but in Enterprise
// there is the added complexity of the upgrade running on the primary while
// standby nodes wait for it to complete. So in Enterprise the first PATCH
// sent to a standby often comes in before the standby has "seen" that the
// primary has completed the upgrade and fails causing the whole test to
// error. We can prevent that by taking a short loop here to wait for a read
// to succeed on the standby/non-leader (since KV-v2 does upgrade check on
// read too).
waitForKVv2Upgrade(t, ctx, notLeader.APIClient(), "/test")
// Start two background workers that will cause writes to Consul in the
// background. KV v2 relies on a single active node for correctness.
// Specifically its patch operation does a read-modify-write under a
@ -263,6 +275,46 @@ func TestConsulFencing_PartitionedLeaderCantWrite(t *testing.T) {
require.Equal(t, expect1, sets[1], "Client 1 writes lost")
}
func waitForKVv2Upgrade(t *testing.T, ctx context.Context, client *api.Client, path string) {
t.Helper()
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
kv := client.KVv2(path)
attempts := 0
wait := 20 * time.Millisecond
for {
// Attempt to perform a write on the KVv2 mount. It will fail until the
// backend is done upgrading. If this is a performance standby in Ent then
// it will not complete until the primary is done upgrading AND the standby
// has noticed that!
_, err := kv.Put(ctx, "test-upgrade-done", map[string]interface{}{
"ok": 1,
})
if err == nil {
return
}
t.Logf("waitForKVv2Upgrade: write failed: %s", err)
select {
case <-ctx.Done():
t.Fatalf("context cancelled waiting for KVv2 (%s) upgrade to complete: %s",
path, ctx.Err())
return
case <-time.After(wait):
}
attempts++
// We don't quite want exponential backoff because it really should be fast,
// but just reduce log spam on failures if it's taking a while.
if attempts > 4 {
wait = 250 * time.Millisecond
}
if attempts > 10 {
wait = time.Second
}
}
}
func makeSet(n int) []int {
a := make([]int, n)
for i := 0; i < n; i++ {