mirror of
https://github.com/kubernetes/kubernetes.git
synced 2026-04-23 15:17:41 -04:00
DRA device taints: artificially delay pod deletion during test
We can observe the delay in the metric histogram. Because we run in a synctest bubble, the delay is 100% predictable. Unfortunately we cannot use the reactor mechanism of the fake client: that delays while holding the fake's mutex. When some other goroutine (in this case, the event recorder) calls the client, it gets blocked without being considered durably blocked by synctest, so time does not advance and the test gets stuck.
This commit is contained in:
parent
7d7b4c3dcb
commit
fb94a99d2f
1 changed file with 75 additions and 8 deletions
|
|
@ -17,6 +17,7 @@ limitations under the License.
|
|||
package devicetainteviction
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"maps"
|
||||
|
|
@ -44,8 +45,11 @@ import (
|
|||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
||||
clientfeatures "k8s.io/client-go/features"
|
||||
"k8s.io/client-go/informers"
|
||||
clientset "k8s.io/client-go/kubernetes"
|
||||
"k8s.io/client-go/kubernetes/fake"
|
||||
corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
|
||||
core "k8s.io/client-go/testing"
|
||||
"k8s.io/component-base/featuregate"
|
||||
metricstestutil "k8s.io/component-base/metrics/testutil"
|
||||
|
|
@ -68,6 +72,18 @@ func init() {
|
|||
}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
// Disabled because we not only use the fake client, we also wrap it:
|
||||
// that prevents auto-detection via IsWatchListSemanticsUnSupported.
|
||||
// This here is simpler than implementing IsWatchListSemanticsUnSupported.
|
||||
//
|
||||
// We cannot use clientfeaturestesting.SetFeatureDuringTest (needs testing.TB),
|
||||
// so here we just do it ourselves.
|
||||
if err := clientfeatures.FeatureGates().(interface {
|
||||
Set(clientfeatures.Feature, bool) error
|
||||
}).Set(clientfeatures.WatchListClient, false); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
// Reduce typing with some constructors.
|
||||
|
|
@ -2367,14 +2383,15 @@ func testEviction(tCtx ktesting.TContext) {
|
|||
func TestDeviceTaintRule(t *testing.T) { testDeviceTaintRule(ktesting.Init(t)) }
|
||||
func testDeviceTaintRule(tCtx ktesting.TContext) {
|
||||
tCtx.Parallel()
|
||||
tCtx.SyncTest("immediate", func(tCtx ktesting.TContext) { synctestDeviceTaintRule(tCtx, false) })
|
||||
tCtx.SyncTest("delayed", func(tCtx ktesting.TContext) { synctestDeviceTaintRule(tCtx, true) })
|
||||
tCtx.SyncTest("immediate", func(tCtx ktesting.TContext) { synctestDeviceTaintRule(tCtx, false, false) })
|
||||
tCtx.SyncTest("delayed", func(tCtx ktesting.TContext) { synctestDeviceTaintRule(tCtx, true, false) })
|
||||
tCtx.SyncTest("slow", func(tCtx ktesting.TContext) { synctestDeviceTaintRule(tCtx, true, true) })
|
||||
}
|
||||
func synctestDeviceTaintRule(tCtx ktesting.TContext, delayed bool) {
|
||||
func synctestDeviceTaintRule(tCtx ktesting.TContext, toleration, slowDelete bool) {
|
||||
rule := ruleNone.DeepCopy()
|
||||
claim := inUseClaim.DeepCopy()
|
||||
tolerationSeconds := int64(0)
|
||||
if delayed {
|
||||
if toleration {
|
||||
tolerationSeconds = 30
|
||||
claim.Status.Allocation.Devices.Results[0].Tolerations = []resourceapi.DeviceToleration{{
|
||||
Effect: resourceapi.DeviceTaintEffectNoExecute,
|
||||
|
|
@ -2383,7 +2400,17 @@ func synctestDeviceTaintRule(tCtx ktesting.TContext, delayed bool) {
|
|||
}}
|
||||
}
|
||||
fakeClientset := fake.NewClientset(podWithClaimName, claim, rule)
|
||||
tCtx = tCtx.WithClients(nil, nil, fakeClientset, nil, nil)
|
||||
blockDelete := make(chan struct{})
|
||||
client := clientset.Interface(fakeClientset)
|
||||
if slowDelete {
|
||||
client = &myFakeClient{client, func() {
|
||||
// This coordinates with the main goroutine to move forward after a certain delay.
|
||||
tCtx.Logf("Delaying pod deletion")
|
||||
<-blockDelete
|
||||
tCtx.Logf("Proceeding with pod deletion")
|
||||
}}
|
||||
}
|
||||
tCtx = tCtx.WithClients(nil, nil, client, nil, nil)
|
||||
controller := newTestController(tCtx)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
|
|
@ -2430,9 +2457,16 @@ func synctestDeviceTaintRule(tCtx ktesting.TContext, delayed bool) {
|
|||
}
|
||||
check(tCtx, "evict: ", l(inProgress(rule, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", &updated)), expectedPods)
|
||||
|
||||
var slowDeleteDelay time.Duration
|
||||
if tolerationSeconds > 0 {
|
||||
// Need to move forward in time past the delay.
|
||||
// Need to move forward in time past the delay(s)
|
||||
time.Sleep(time.Duration(tolerationSeconds) * time.Second)
|
||||
if slowDelete {
|
||||
// This is going to show up in the histogram.
|
||||
slowDeleteDelay = 3 * time.Second
|
||||
time.Sleep(slowDeleteDelay)
|
||||
close(blockDelete)
|
||||
}
|
||||
tCtx.Wait()
|
||||
// Now the pod is deleted. Status gets updated later.
|
||||
check(tCtx, "evict: ", l(inProgress(rule, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", &updated)), nil)
|
||||
|
|
@ -2458,8 +2492,7 @@ func synctestDeviceTaintRule(tCtx ktesting.TContext, delayed bool) {
|
|||
|
||||
tCtx.Assert(controller.workqueue.Len()).Should(gomega.Equal(0), "work queue empty")
|
||||
// There are no delays inside a synctest bubble. The pod always gets deleted "immediately".
|
||||
// TODO: simulate delays?
|
||||
tCtx.ExpectNoError(testPodDeletionsMetrics(controller, 0))
|
||||
tCtx.ExpectNoError(testPodDeletionsMetrics(controller, slowDeleteDelay))
|
||||
}
|
||||
|
||||
func check(tCtx ktesting.TContext, prefix string, expectRules []*resourcealpha.DeviceTaintRule, expectPods []*v1.Pod) {
|
||||
|
|
@ -2482,6 +2515,40 @@ func check(tCtx ktesting.TContext, prefix string, expectRules []*resourcealpha.D
|
|||
assertEqual(tCtx, expectRules, trimRules(rules.Items), prefix+"rules", opts...)
|
||||
}
|
||||
|
||||
// myFakeClient and its children have a single purpose: inject a callback into
|
||||
// CoreV1().Pods().Delete. We cannot use fakeClientset.PrependReactor for that
|
||||
// because it invokes the callback while holding the fake's mutex. If any other
|
||||
// goroutine then also makes an API call, a synctest bubble deadlocks because
|
||||
// holding the mutex is not considered "durably blocking" even though it is
|
||||
// in our case.
|
||||
type myFakeClient struct {
|
||||
clientset.Interface
|
||||
deletePodCallback func()
|
||||
}
|
||||
|
||||
func (mf *myFakeClient) CoreV1() corev1.CoreV1Interface {
|
||||
return &myFakeCoreV1{mf.Interface.CoreV1(), mf.deletePodCallback}
|
||||
}
|
||||
|
||||
type myFakeCoreV1 struct {
|
||||
corev1.CoreV1Interface
|
||||
deletePodCallback func()
|
||||
}
|
||||
|
||||
func (mf *myFakeCoreV1) Pods(namespace string) corev1.PodInterface {
|
||||
return &myFakePods{mf.CoreV1Interface.Pods(namespace), mf.deletePodCallback}
|
||||
}
|
||||
|
||||
type myFakePods struct {
|
||||
corev1.PodInterface
|
||||
deletePodCallback func()
|
||||
}
|
||||
|
||||
func (mf *myFakePods) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
|
||||
mf.deletePodCallback()
|
||||
return mf.PodInterface.Delete(ctx, name, opts)
|
||||
}
|
||||
|
||||
// TestCancelEviction deletes the pod before the controller deletes it
|
||||
// or removes the slice. Either way, eviction gets cancelled.
|
||||
func TestCancelEviction(t *testing.T) { testCancelEviction(ktesting.Init(t)) }
|
||||
|
|
|
|||
Loading…
Reference in a new issue