From 2fe49b0cd06a367b2eedc535fc51f0d8a8b1ecd6 Mon Sep 17 00:00:00 2001
From: Akhil Singh <singhakhil69@gmail.com>
Date: Wed, 3 Jun 2026 11:31:43 +0530
Subject: [PATCH] Fix job controller reporting active=0 during pod creation
 backoff

When manageJob() needs to create replacement pods but defers creation
because a pod-failure backoff is still active, it returned a hardcoded
active=0 to the caller. Because no pods were actually created or deleted,
this left Status.Active=0 while Status.Ready still reflected the running
pods. The apiserver correctly rejects such updates ("cannot set more
ready pods than active") with a 422, which blocks flushing uncounted
terminated pods, removing finalizers, and updating job status, leaving
pods stuck Terminating with stale status.

Return the real active count from both backoff early-returns instead,
since the deferral does not change the number of active pods.

Issue: https://github.com/kubernetes/kubernetes/issues/139428
---
 pkg/controller/job/job_controller.go      | 11 +++-
 pkg/controller/job/job_controller_test.go | 75 +++++++++++++++++++++++
 2 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/pkg/controller/job/job_controller.go b/pkg/controller/job/job_controller.go
index 3c332110178..db0bf7e9cbd 100644
--- a/pkg/controller/job/job_controller.go
+++ b/pkg/controller/job/job_controller.go
@@ -1822,7 +1822,12 @@ func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syn
 		}
 		if remainingTime > 0 {
 			jm.enqueueSyncJobWithDelay(logger, job, remainingTime)
-			return 0, metrics.JobSyncActionPodsCreated, nil
+			// No pods were created or deleted, so return the current active
+			// count rather than 0. Returning 0 here would cause the status
+			// update to set Active=0 while Ready still reflects the running
+			// pods, which the API server rejects ("cannot set more ready pods
+			// than active"), blocking finalizer removal and status flushing.
+			return active, metrics.JobSyncActionPodsCreated, nil
 		}
 		if diff > int32(MaxPodCreateDeletePerSync) {
 			diff = int32(MaxPodCreateDeletePerSync)
@@ -1835,7 +1840,9 @@ func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syn
 				indexesToAdd, remainingTime = jm.getPodCreationInfoForIndependentIndexes(logger, indexesToAdd, jobCtx.podsWithDelayedDeletionPerIndex)
 				if remainingTime > 0 {
 					jm.enqueueSyncJobWithDelay(logger, job, remainingTime)
-					return 0, metrics.JobSyncActionPodsCreated, nil
+					// No pods were created or deleted, so return the current
+					// active count rather than 0 (see comment above).
+					return active, metrics.JobSyncActionPodsCreated, nil
 				}
 			}
 			diff = int32(len(indexesToAdd))
diff --git a/pkg/controller/job/job_controller_test.go b/pkg/controller/job/job_controller_test.go
index b58f5fb147a..f4819f7c341 100644
--- a/pkg/controller/job/job_controller_test.go
+++ b/pkg/controller/job/job_controller_test.go
@@ -455,6 +455,37 @@ func TestControllerSyncJob(t *testing.T) {
 			expectedTerminating: ptr.To[int32](0),
 			controllerTime:      &referenceTime,
 		},
+		// Regression test for https://github.com/kubernetes/kubernetes/issues/139428.
+		// When replacement pods are needed but pod creation is deferred due to an
+		// active backoff, manageJob must report the actual active count rather than
+		// 0. Reporting 0 while Ready still reflects the running pods makes the status
+		// update fail apiserver validation ("cannot set more ready pods than active"),
+		// blocking finalizer removal and status flushing.
+		"too few active pods and active back-off with running pods": {
+			parallelism:  2,
+			completions:  2,
+			backoffLimit: 6,
+			backoffRecord: &backoffRecord{
+				failuresAfterLastSuccess: 1,
+				lastFailureTime:          &referenceTime,
+			},
+			initialStatus: &jobInitialStatus{
+				startTime: func() *time.Time {
+					now := time.Now()
+					return &now
+				}(),
+			},
+			activePods:          1,
+			readyPods:           1,
+			succeededPods:       0,
+			expectedCreations:   0,
+			expectedActive:      1,
+			expectedSucceeded:   0,
+			expectedPodPatches:  0,
+			expectedReady:       ptr.To[int32](1),
+			expectedTerminating: ptr.To[int32](0),
+			controllerTime:      &referenceTime,
+		},
 		"too few active pods and no back-offs": {
 			parallelism:  1,
 			completions:  1,
@@ -5221,6 +5252,50 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				FailedIndexes:           ptr.To(""),
 			},
 		},
+		// Regression test for https://github.com/kubernetes/kubernetes/issues/139428.
+		// One index has a running pod while another index's replacement pod
+		// creation is deferred because its per-index backoff is still active.
+		// manageJob must report the actual active count (1) rather than 0; a
+		// status with active=0 while pods are still running is rejected by the
+		// apiserver ("cannot set more ready pods than active"), blocking
+		// finalizer removal and status flushing.
+		"replacement pod creation delayed by per-index backoff while another index runs": {
+			job: batch.Job{
+				TypeMeta:   metav1.TypeMeta{Kind: "Job"},
+				ObjectMeta: validObjectMeta,
+				Spec: batch.JobSpec{
+					Selector:             validSelector,
+					Template:             validTemplate,
+					Parallelism:          ptr.To[int32](2),
+					Completions:          ptr.To[int32](2),
+					BackoffLimit:         ptr.To[int32](math.MaxInt32),
+					CompletionMode:       ptr.To(batch.IndexedCompletion),
+					BackoffLimitPerIndex: ptr.To[int32](1),
+				},
+			},
+			pods: []v1.Pod{
+				*buildPod().uid("a").index("0").phase(v1.PodRunning).indexFailureCount("0").trackingFinalizer().Pod,
+				*buildPod().uid("b").index("1").status(v1.PodStatus{
+					Phase: v1.PodFailed,
+					ContainerStatuses: []v1.ContainerStatus{
+						{
+							Name: "x",
+							State: v1.ContainerState{
+								Terminated: &v1.ContainerStateTerminated{
+									FinishedAt: metav1.NewTime(now),
+								},
+							},
+						},
+					},
+				}).indexFailureCount("0").trackingFinalizer().Pod,
+			},
+			wantStatus: batch.JobStatus{
+				Active:                  1,
+				Terminating:             ptr.To[int32](0),
+				UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
+				FailedIndexes:           new(string),
+			},
+		},
 		"single failed index due to exceeding the backoff limit per index, the job continues": {
 			job: batch.Job{
 				TypeMeta:   metav1.TypeMeta{Kind: "Job"},